From 91e02156ddea27d980fa8e5f7f3d10dda06139d2 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Mon, 19 Sep 2022 11:07:15 -0700
Subject: [PATCH] Revert "Reduce dbuf_find() lock contention"

This reverts commit 34dbc618f50cfcd392f90af80c140398c38cbcd1. While this
change resolved the lock contention observed for certain workloads, it
inadvertently reduced the maximum hash inserts/removes per second. This
appears to be due to the slightly higher acquisition cost of a rwlock vs
a mutex.

Reviewed-by: Brian Behlendorf
---
 include/sys/dbuf.h      |  7 ++++---
 module/zfs/dbuf.c       | 26 +++++++++++++-------------
 module/zfs/dbuf_stats.c |  4 ++--
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h
index 2e7385113e..b757b26641 100644
--- a/include/sys/dbuf.h
+++ b/include/sys/dbuf.h
@@ -321,12 +321,13 @@ typedef struct dmu_buf_impl {
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
-#define	DBUF_RWLOCKS 8192
-#define	DBUF_HASH_RWLOCK(h, idx) (&(h)->hash_rwlocks[(idx) & (DBUF_RWLOCKS-1)])
+/* Note: the dbuf hash table is exposed only for the mdb module */
+#define	DBUF_MUTEXES 2048
+#define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
 	dmu_buf_impl_t **hash_table;
-	krwlock_t hash_rwlocks[DBUF_RWLOCKS] ____cacheline_aligned;
+	kmutex_t hash_mutexes[DBUF_MUTEXES] ____cacheline_aligned;
 } dbuf_hash_table_t;
 
 typedef void (*dbuf_prefetch_fn)(void *, uint64_t, uint64_t, boolean_t);
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index 1a022c8b8a..7ecc2812b4 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -339,18 +339,18 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 	hv = dbuf_hash(os, obj, level, blkid);
 	idx = hv & h->hash_table_mask;
 
-	rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_READER);
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			mutex_enter(&db->db_mtx);
 			if (db->db_state != DB_EVICTING) {
-				rw_exit(DBUF_HASH_RWLOCK(h, idx));
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
 			mutex_exit(&db->db_mtx);
 		}
 	}
-	rw_exit(DBUF_HASH_RWLOCK(h, idx));
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	return (NULL);
 }
 
@@ -393,13 +393,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	hv = dbuf_hash(os, obj, level, blkid);
 	idx = hv & h->hash_table_mask;
 
-	rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
 	    dbf = dbf->db_hash_next, i++) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
 			if (dbf->db_state != DB_EVICTING) {
-				rw_exit(DBUF_HASH_RWLOCK(h, idx));
+				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
 			mutex_exit(&dbf->db_mtx);
@@ -417,7 +417,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
-	rw_exit(DBUF_HASH_RWLOCK(h, idx));
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
 	DBUF_STAT_MAX(hash_elements_max, he);
 
@@ -474,13 +474,13 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
-	 * DBUF_HASH_RWLOCK > db_mtx.
+	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(zfs_refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
-	rw_enter(DBUF_HASH_RWLOCK(h, idx), RW_WRITER);
+	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		dbp = &dbf->db_hash_next;
@@ -491,7 +491,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	if (h->hash_table[idx] &&
 	    h->hash_table[idx]->db_hash_next == NULL)
 		DBUF_STAT_BUMPDOWN(hash_chains);
-	rw_exit(DBUF_HASH_RWLOCK(h, idx));
+	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
 }
 
@@ -914,8 +914,8 @@ retry:
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
-	for (i = 0; i < DBUF_RWLOCKS; i++)
-		rw_init(&h->hash_rwlocks[i], NULL, RW_DEFAULT, NULL);
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	dbuf_stats_init(h);
 
@@ -981,8 +981,8 @@ dbuf_fini(void)
 	dbuf_stats_destroy();
 
-	for (i = 0; i < DBUF_RWLOCKS; i++)
-		rw_destroy(&h->hash_rwlocks[i]);
+	for (i = 0; i < DBUF_MUTEXES; i++)
+		mutex_destroy(&h->hash_mutexes[i]);
 
 #if defined(_KERNEL)
 	/*
 	 * Large allocations which do not require contiguous pages
diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c
index 037190a81b..12bb568a08 100644
--- a/module/zfs/dbuf_stats.c
+++ b/module/zfs/dbuf_stats.c
@@ -137,7 +137,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
 	if (size)
 		buf[0] = 0;
 
-	rw_enter(DBUF_HASH_RWLOCK(h, dsh->idx), RW_READER);
+	mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
 	for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
 		/*
 		 * Returning ENOMEM will cause the data and header functions
@@ -158,7 +158,7 @@ dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
 		mutex_exit(&db->db_mtx);
 	}
 
-	rw_exit(DBUF_HASH_RWLOCK(h, dsh->idx));
+	mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
 
 	return (error);
 }
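As context for the acquisition-cost reasoning in the commit message, the sketch
below shows the kind of single-threaded comparison involved. It is not part of
the patch: POSIX pthread locks stand in for the kernel's kmutex_t/krwlock_t
(an assumption, since the kernel primitives cannot be driven from a standalone
program), so only the relative gap between the two loops is meaningful.

/*
 * Sketch: compare the uncontended acquire/release cost of a mutex vs. a
 * rwlock taken as writer.  pthread locks are a stand-in for kmutex_t and
 * krwlock_t (an assumption); absolute numbers are not comparable to the
 * kernel, only the relative gap between the two loops is of interest.
 *
 * Build (hypothetical file name): cc -O2 lockcost.c -o lockcost -lpthread
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define	ITERATIONS	10000000UL

static double
elapsed_ns(const struct timespec *start, const struct timespec *end)
{
	return ((end->tv_sec - start->tv_sec) * 1e9 +
	    (end->tv_nsec - start->tv_nsec));
}

int
main(void)
{
	pthread_mutex_t mtx;
	pthread_rwlock_t rwl;
	struct timespec start, end;
	unsigned long i;

	pthread_mutex_init(&mtx, NULL);
	pthread_rwlock_init(&rwl, NULL);

	/* Uncontended mutex lock/unlock, as dbuf_hash_insert()/remove() now do. */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < ITERATIONS; i++) {
		pthread_mutex_lock(&mtx);
		pthread_mutex_unlock(&mtx);
	}
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("mutex:  %.1f ns per lock/unlock\n",
	    elapsed_ns(&start, &end) / ITERATIONS);

	/* Uncontended writer lock/unlock, as the reverted rwlock code did. */
	clock_gettime(CLOCK_MONOTONIC, &start);
	for (i = 0; i < ITERATIONS; i++) {
		pthread_rwlock_wrlock(&rwl);
		pthread_rwlock_unlock(&rwl);
	}
	clock_gettime(CLOCK_MONOTONIC, &end);
	printf("rwlock: %.1f ns per wrlock/unlock\n",
	    elapsed_ns(&start, &end) / ITERATIONS);

	pthread_rwlock_destroy(&rwl);
	pthread_mutex_destroy(&mtx);
	return (0);
}

dbuf_find(), dbuf_hash_insert(), and dbuf_hash_remove() hold the hash lock only
for a short bucket-chain walk, so once contention is spread across the per-bucket
locks, per-acquisition overhead of this kind is what dominates insert/remove
throughput.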