From d057807ede05ce809e9ba1e2b47b12ada0d3b2ed Mon Sep 17 00:00:00 2001
From: Alexander Motin
Date: Wed, 14 Jun 2023 11:02:27 -0400
Subject: [PATCH] Switch refcount tracking from lists to AVL-trees.

With a large number of tracked references, list searches under the lock
become too expensive, creating enormous lock contention.

In my tests with ZFS_DEBUG enabled this increases write throughput with
32KB blocks from ~1.2GB/s to ~7.5GB/s.

Reviewed-by: Brian Atkinson
Reviewed-by: Brian Behlendorf
Signed-off-by: Alexander Motin
Sponsored by: iXsystems, Inc.
Closes #14970
---
 include/sys/zfs_refcount.h |  18 ++--
 module/zfs/refcount.c      | 189 +++++++++++++++++++------------------
 2 files changed, 110 insertions(+), 97 deletions(-)

diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h
index 4efa266a53..77965a0aa5 100644
--- a/include/sys/zfs_refcount.h
+++ b/include/sys/zfs_refcount.h
@@ -27,6 +27,7 @@
 #define	_SYS_ZFS_REFCOUNT_H
 
 #include <sys/inttypes.h>
+#include <sys/avl.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
 
@@ -43,19 +44,22 @@ extern "C" {
 
 #ifdef	ZFS_DEBUG
 typedef struct reference {
-	list_node_t ref_link;
+	union {
+		avl_node_t a;
+		list_node_t l;
+	} ref_link;
 	const void *ref_holder;
 	uint64_t ref_number;
-	uint8_t *ref_removed;
+	boolean_t ref_search;
 } reference_t;
 
 typedef struct refcount {
-	kmutex_t rc_mtx;
-	boolean_t rc_tracked;
-	list_t rc_list;
-	list_t rc_removed;
 	uint64_t rc_count;
-	uint64_t rc_removed_count;
+	kmutex_t rc_mtx;
+	avl_tree_t rc_tree;
+	list_t rc_removed;
+	uint_t rc_removed_count;
+	boolean_t rc_tracked;
 } zfs_refcount_t;
 
 /*
diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c
index 601d27f8c4..718bbb34a8 100644
--- a/module/zfs/refcount.c
+++ b/module/zfs/refcount.c
@@ -36,33 +36,40 @@ int reference_tracking_enable = B_FALSE;
 static uint_t reference_history = 3;	/* tunable */
 
 static kmem_cache_t *reference_cache;
-static kmem_cache_t *reference_history_cache;
 
 void
 zfs_refcount_init(void)
 {
 	reference_cache = kmem_cache_create("reference_cache",
 	    sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-
-	reference_history_cache = kmem_cache_create("reference_history_cache",
-	    sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 }
 
 void
 zfs_refcount_fini(void)
 {
 	kmem_cache_destroy(reference_cache);
-	kmem_cache_destroy(reference_history_cache);
+}
+
+static int
+zfs_refcount_compare(const void *x1, const void *x2)
+{
+	const reference_t *r1 = (const reference_t *)x1;
+	const reference_t *r2 = (const reference_t *)x2;
+
+	int cmp1 = TREE_CMP(r1->ref_holder, r2->ref_holder);
+	int cmp2 = TREE_CMP(r1->ref_number, r2->ref_number);
+	int cmp = cmp1 ? cmp1 : cmp2;
+	return ((cmp || r1->ref_search) ? cmp : TREE_PCMP(r1, r2));
 }
 
 void
 zfs_refcount_create(zfs_refcount_t *rc)
 {
 	mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL);
-	list_create(&rc->rc_list, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	avl_create(&rc->rc_tree, zfs_refcount_compare, sizeof (reference_t),
+	    offsetof(reference_t, ref_link.a));
 	list_create(&rc->rc_removed, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	    offsetof(reference_t, ref_link.l));
 	rc->rc_count = 0;
 	rc->rc_removed_count = 0;
 	rc->rc_tracked = reference_tracking_enable;
@@ -86,16 +93,15 @@ void
 zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number)
 {
 	reference_t *ref;
+	void *cookie = NULL;
 
 	ASSERT3U(rc->rc_count, ==, number);
-	while ((ref = list_remove_head(&rc->rc_list)))
+	while ((ref = avl_destroy_nodes(&rc->rc_tree, &cookie)) != NULL)
 		kmem_cache_free(reference_cache, ref);
-	list_destroy(&rc->rc_list);
+	avl_destroy(&rc->rc_tree);
 
-	while ((ref = list_remove_head(&rc->rc_removed))) {
-		kmem_cache_free(reference_history_cache, ref->ref_removed);
+	while ((ref = list_remove_head(&rc->rc_removed)))
 		kmem_cache_free(reference_cache, ref);
-	}
 	list_destroy(&rc->rc_removed);
 	mutex_destroy(&rc->rc_mtx);
 }
@@ -121,10 +127,10 @@ zfs_refcount_count(zfs_refcount_t *rc)
 int64_t
 zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	reference_t *ref = NULL;
+	reference_t *ref;
 	int64_t count;
 
-	if (!rc->rc_tracked) {
+	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, number);
 		ASSERT3U(count, >=, number);
 		return (count);
@@ -133,8 +139,9 @@ zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder)
 	ref = kmem_cache_alloc(reference_cache, KM_SLEEP);
 	ref->ref_holder = holder;
 	ref->ref_number = number;
+	ref->ref_search = B_FALSE;
 	mutex_enter(&rc->rc_mtx);
-	list_insert_head(&rc->rc_list, ref);
+	avl_add(&rc->rc_tree, ref);
 	rc->rc_count += number;
 	count = rc->rc_count;
 	mutex_exit(&rc->rc_mtx);
@@ -151,7 +158,7 @@ zfs_refcount_add(zfs_refcount_t *rc, const void *holder)
 void
 zfs_refcount_add_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		(void) zfs_refcount_add_many(rc, number, holder);
 	else for (; number > 0; number--)
 		(void) zfs_refcount_add(rc, holder);
@@ -161,47 +168,42 @@ int64_t
 zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number,
     const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
 	int64_t count;
 
-	if (!rc->rc_tracked) {
+	if (likely(!rc->rc_tracked)) {
 		count = atomic_add_64_nv(&(rc)->rc_count, -number);
 		ASSERT3S(count, >=, 0);
 		return (count);
 	}
 
+	s.ref_holder = holder;
+	s.ref_number = number;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
 	ASSERT3U(rc->rc_count, >=, number);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder && ref->ref_number == number) {
-			list_remove(&rc->rc_list, ref);
-			if (reference_history > 0) {
-				ref->ref_removed =
-				    kmem_cache_alloc(reference_history_cache,
-				    KM_SLEEP);
-				list_insert_head(&rc->rc_removed, ref);
-				rc->rc_removed_count++;
-				if (rc->rc_removed_count > reference_history) {
-					ref = list_tail(&rc->rc_removed);
-					list_remove(&rc->rc_removed, ref);
-					kmem_cache_free(reference_history_cache,
-					    ref->ref_removed);
-					kmem_cache_free(reference_cache, ref);
-					rc->rc_removed_count--;
-				}
-			} else {
-				kmem_cache_free(reference_cache, ref);
-			}
-			rc->rc_count -= number;
-			count = rc->rc_count;
-			mutex_exit(&rc->rc_mtx);
-			return (count);
-		}
+	ref = avl_find(&rc->rc_tree, &s, NULL);
+	if (unlikely(ref == NULL)) {
+		panic("No such hold %p on refcount %llx", holder,
+		    (u_longlong_t)(uintptr_t)rc);
+		return (-1);
 	}
-	panic("No such hold %p on refcount %llx", holder,
-	    (u_longlong_t)(uintptr_t)rc);
-	return (-1);
+	avl_remove(&rc->rc_tree, ref);
+	if (reference_history > 0) {
+		list_insert_head(&rc->rc_removed, ref);
+		if (rc->rc_removed_count >= reference_history) {
+			ref = list_remove_tail(&rc->rc_removed);
+			kmem_cache_free(reference_cache, ref);
+		} else {
+			rc->rc_removed_count++;
+		}
+	} else {
+		kmem_cache_free(reference_cache, ref);
+	}
+	rc->rc_count -= number;
+	count = rc->rc_count;
+	mutex_exit(&rc->rc_mtx);
+	return (count);
 }
 
 int64_t
@@ -213,7 +215,7 @@ zfs_refcount_remove(zfs_refcount_t *rc, const void *holder)
 void
 zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 {
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		(void) zfs_refcount_remove_many(rc, number, holder);
 	else for (; number > 0; number--)
 		(void) zfs_refcount_remove(rc, holder);
@@ -222,31 +224,38 @@ zfs_refcount_remove_few(zfs_refcount_t *rc, uint64_t number, const void *holder)
 void
 zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src)
 {
-	int64_t count, removed_count;
-	list_t list, removed;
+	avl_tree_t tree;
+	list_t removed;
+	reference_t *ref;
+	void *cookie = NULL;
+	uint64_t count;
+	uint_t removed_count;
 
-	list_create(&list, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	avl_create(&tree, zfs_refcount_compare, sizeof (reference_t),
+	    offsetof(reference_t, ref_link.a));
 	list_create(&removed, sizeof (reference_t),
-	    offsetof(reference_t, ref_link));
+	    offsetof(reference_t, ref_link.l));
 
 	mutex_enter(&src->rc_mtx);
 	count = src->rc_count;
 	removed_count = src->rc_removed_count;
 	src->rc_count = 0;
 	src->rc_removed_count = 0;
-	list_move_tail(&list, &src->rc_list);
+	avl_swap(&tree, &src->rc_tree);
 	list_move_tail(&removed, &src->rc_removed);
 	mutex_exit(&src->rc_mtx);
 
 	mutex_enter(&dst->rc_mtx);
 	dst->rc_count += count;
 	dst->rc_removed_count += removed_count;
-	list_move_tail(&dst->rc_list, &list);
+	if (avl_is_empty(&dst->rc_tree))
+		avl_swap(&dst->rc_tree, &tree);
+	else while ((ref = avl_destroy_nodes(&tree, &cookie)) != NULL)
+		avl_add(&dst->rc_tree, ref);
 	list_move_tail(&dst->rc_removed, &removed);
 	mutex_exit(&dst->rc_mtx);
 
-	list_destroy(&list);
+	avl_destroy(&tree);
 	list_destroy(&removed);
 }
 
@@ -254,23 +263,19 @@ void
 zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number,
     const void *current_holder, const void *new_holder)
 {
-	reference_t *ref;
-	boolean_t found = B_FALSE;
+	reference_t *ref, s;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return;
 
+	s.ref_holder = current_holder;
+	s.ref_number = number;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == current_holder &&
-		    ref->ref_number == number) {
-			ref->ref_holder = new_holder;
-			found = B_TRUE;
-			break;
-		}
-	}
-	ASSERT(found);
+	ref = avl_find(&rc->rc_tree, &s, NULL);
+	ASSERT(ref);
+	ref->ref_holder = new_holder;
+	avl_update(&rc->rc_tree, ref);
 	mutex_exit(&rc->rc_mtx);
 }
 
@@ -290,21 +295,23 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder,
 boolean_t
 zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
+	avl_index_t idx;
+	boolean_t res;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return (zfs_refcount_count(rc) > 0);
 
+	s.ref_holder = holder;
+	s.ref_number = 0;
+	s.ref_search = B_TRUE;
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder) {
-			mutex_exit(&rc->rc_mtx);
-			return (B_TRUE);
-		}
-	}
+	ref = avl_find(&rc->rc_tree, &s, &idx);
+	if (likely(ref == NULL))
+		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+	res = ref && ref->ref_holder == holder;
 	mutex_exit(&rc->rc_mtx);
-	return (B_FALSE);
+	return (res);
 }
 
 /*
@@ -315,21 +322,23 @@ zfs_refcount_held(zfs_refcount_t *rc, const void *holder)
 boolean_t
 zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder)
 {
-	reference_t *ref;
+	reference_t *ref, s;
+	avl_index_t idx;
+	boolean_t res;
 
-	if (!rc->rc_tracked)
+	if (likely(!rc->rc_tracked))
 		return (B_TRUE);
 
 	mutex_enter(&rc->rc_mtx);
-	for (ref = list_head(&rc->rc_list); ref;
-	    ref = list_next(&rc->rc_list, ref)) {
-		if (ref->ref_holder == holder) {
-			mutex_exit(&rc->rc_mtx);
-			return (B_FALSE);
-		}
-	}
+	s.ref_holder = holder;
+	s.ref_number = 0;
+	s.ref_search = B_TRUE;
+	ref = avl_find(&rc->rc_tree, &s, &idx);
+	if (likely(ref == NULL))
+		ref = avl_nearest(&rc->rc_tree, idx, AVL_AFTER);
+	res = ref == NULL || ref->ref_holder != holder;
 	mutex_exit(&rc->rc_mtx);
-	return (B_TRUE);
+	return (res);
 }
 
 EXPORT_SYMBOL(zfs_refcount_create);