Optimize allocation throttling

Remove mc_lock use from metaslab_class_throttle_*().  The math there
is based on refcounts and so is atomic, so the only race possible there
is between zfs_refcount_count() and zfs_refcount_add().  But in most
cases metaslab_class_throttle_reserve() is called with the allocator
lock held, which covers the race.  In cases where the lock is not
held, GANG_ALLOCATION() or METASLAB_MUST_RESERVE is set, and so we
do not use zfs_refcount_count().  And even if we assume some other
non-existing scenario, the worst that may happen from this race is
that a few more I/Os get to allocation earlier, which is not a problem.

Move locks and data of different allocators into different cache
lines to avoid false sharing.  Group the spa_alloc_* arrays together
into a single array, spa_allocs, of cache-line-aligned struct
spa_alloc.  Align struct metaslab_class_allocator to a cache line
as well.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Reviewed-by: Don Brady <don.brady@delphix.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12314
This commit is contained in:
Alexander Motin 2021-07-21 08:40:36 -04:00 committed by GitHub
parent bc93935ef0
commit 1b50749ce9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 45 additions and 58 deletions

View File

@ -157,7 +157,7 @@ typedef struct metaslab_class_allocator {
*/ */
uint64_t mca_alloc_max_slots; uint64_t mca_alloc_max_slots;
zfs_refcount_t mca_alloc_slots; zfs_refcount_t mca_alloc_slots;
} metaslab_class_allocator_t; } ____cacheline_aligned metaslab_class_allocator_t;
/* /*
* A metaslab class encompasses a category of allocatable top-level vdevs. * A metaslab class encompasses a category of allocatable top-level vdevs.

View File

@ -57,6 +57,11 @@
extern "C" { extern "C" {
#endif #endif
typedef struct spa_alloc {
kmutex_t spaa_lock;
avl_tree_t spaa_tree;
} ____cacheline_aligned spa_alloc_t;
typedef struct spa_error_entry { typedef struct spa_error_entry {
zbookmark_phys_t se_bookmark; zbookmark_phys_t se_bookmark;
char *se_name; char *se_name;
@ -250,13 +255,11 @@ struct spa {
list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */ list_t spa_state_dirty_list; /* vdevs with dirty state */
/* /*
* spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are * spa_allocs is an array, whose lengths is stored in spa_alloc_count.
* stored in spa_alloc_count. There is one tree and one lock for each * There is one tree and one lock for each allocator, to help improve
* allocator, to help improve allocation performance in write-heavy * allocation performance in write-heavy workloads.
* workloads.
*/ */
kmutex_t *spa_alloc_locks; spa_alloc_t *spa_allocs;
avl_tree_t *spa_alloc_trees;
int spa_alloc_count; int spa_alloc_count;
spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_spares; /* hot spares */

View File

@ -5611,19 +5611,11 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
zio_t *zio, int flags) zio_t *zio, int flags)
{ {
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
uint64_t available_slots = 0;
boolean_t slot_reserved = B_FALSE;
uint64_t max = mca->mca_alloc_max_slots; uint64_t max = mca->mca_alloc_max_slots;
ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock); if (GANG_ALLOCATION(flags) || (flags & METASLAB_MUST_RESERVE) ||
zfs_refcount_count(&mca->mca_alloc_slots) + slots <= max) {
uint64_t reserved_slots = zfs_refcount_count(&mca->mca_alloc_slots);
if (reserved_slots < max)
available_slots = max - reserved_slots;
if (slots <= available_slots || GANG_ALLOCATION(flags) ||
flags & METASLAB_MUST_RESERVE) {
/* /*
* We reserve the slots individually so that we can unreserve * We reserve the slots individually so that we can unreserve
* them individually when an I/O completes. * them individually when an I/O completes.
@ -5631,11 +5623,9 @@ metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator,
for (int d = 0; d < slots; d++) for (int d = 0; d < slots; d++)
zfs_refcount_add(&mca->mca_alloc_slots, zio); zfs_refcount_add(&mca->mca_alloc_slots, zio);
zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
slot_reserved = B_TRUE; return (B_TRUE);
} }
return (B_FALSE);
mutex_exit(&mc->mc_lock);
return (slot_reserved);
} }
void void
@ -5645,10 +5635,8 @@ metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots,
metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator]; metaslab_class_allocator_t *mca = &mc->mc_allocator[allocator];
ASSERT(mc->mc_alloc_throttle_enabled); ASSERT(mc->mc_alloc_throttle_enabled);
mutex_enter(&mc->mc_lock);
for (int d = 0; d < slots; d++) for (int d = 0; d < slots; d++)
zfs_refcount_remove(&mca->mca_alloc_slots, zio); zfs_refcount_remove(&mca->mca_alloc_slots, zio);
mutex_exit(&mc->mc_lock);
} }
static int static int

View File

@ -9197,9 +9197,9 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_pass = 0; spa->spa_sync_pass = 0;
for (int i = 0; i < spa->spa_alloc_count; i++) { for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]); mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_alloc_locks[i]); mutex_exit(&spa->spa_allocs[i].spaa_lock);
} }
/* /*
@ -9309,9 +9309,9 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_sync_done(dp, txg); dsl_pool_sync_done(dp, txg);
for (int i = 0; i < spa->spa_alloc_count; i++) { for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_enter(&spa->spa_alloc_locks[i]); mutex_enter(&spa->spa_allocs[i].spaa_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); VERIFY0(avl_numnodes(&spa->spa_allocs[i].spaa_tree));
mutex_exit(&spa->spa_alloc_locks[i]); mutex_exit(&spa->spa_allocs[i].spaa_lock);
} }
/* /*

View File

@ -700,13 +700,12 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa->spa_root = spa_strdup(altroot); spa->spa_root = spa_strdup(altroot);
spa->spa_alloc_count = spa_allocators; spa->spa_alloc_count = spa_allocators;
spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * spa->spa_allocs = kmem_zalloc(spa->spa_alloc_count *
sizeof (kmutex_t), KM_SLEEP); sizeof (spa_alloc_t), KM_SLEEP);
spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count *
sizeof (avl_tree_t), KM_SLEEP);
for (int i = 0; i < spa->spa_alloc_count; i++) { for (int i = 0; i < spa->spa_alloc_count; i++) {
mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node)); sizeof (zio_t), offsetof(zio_t, io_alloc_node));
} }
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
@ -799,13 +798,11 @@ spa_remove(spa_t *spa)
} }
for (int i = 0; i < spa->spa_alloc_count; i++) { for (int i = 0; i < spa->spa_alloc_count; i++) {
avl_destroy(&spa->spa_alloc_trees[i]); avl_destroy(&spa->spa_allocs[i].spaa_tree);
mutex_destroy(&spa->spa_alloc_locks[i]); mutex_destroy(&spa->spa_allocs[i].spaa_lock);
} }
kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * kmem_free(spa->spa_allocs, spa->spa_alloc_count *
sizeof (kmutex_t)); sizeof (spa_alloc_t));
kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count *
sizeof (avl_tree_t));
avl_destroy(&spa->spa_metaslabs_by_flushed); avl_destroy(&spa->spa_metaslabs_by_flushed);
avl_destroy(&spa->spa_sm_logs_by_txg); avl_destroy(&spa->spa_sm_logs_by_txg);

View File

@ -877,8 +877,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_bookmark = *zb; zio->io_bookmark = *zb;
if (pio != NULL) { if (pio != NULL) {
if (zio->io_metaslab_class == NULL) zio->io_metaslab_class = pio->io_metaslab_class;
zio->io_metaslab_class = pio->io_metaslab_class;
if (zio->io_logical == NULL) if (zio->io_logical == NULL)
zio->io_logical = pio->io_logical; zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG) if (zio->io_child_type == ZIO_CHILD_GANG)
@ -3380,9 +3379,9 @@ zio_io_to_allocate(spa_t *spa, int allocator)
{ {
zio_t *zio; zio_t *zio;
ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); ASSERT(MUTEX_HELD(&spa->spa_allocs[allocator].spaa_lock));
zio = avl_first(&spa->spa_alloc_trees[allocator]); zio = avl_first(&spa->spa_allocs[allocator].spaa_tree);
if (zio == NULL) if (zio == NULL)
return (NULL); return (NULL);
@ -3394,11 +3393,11 @@ zio_io_to_allocate(spa_t *spa, int allocator)
*/ */
ASSERT3U(zio->io_allocator, ==, allocator); ASSERT3U(zio->io_allocator, ==, allocator);
if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, if (!metaslab_class_throttle_reserve(zio->io_metaslab_class,
zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { zio->io_prop.zp_copies, allocator, zio, 0)) {
return (NULL); return (NULL);
} }
avl_remove(&spa->spa_alloc_trees[allocator], zio); avl_remove(&spa->spa_allocs[allocator].spaa_tree, zio);
ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
return (zio); return (zio);
@ -3422,8 +3421,8 @@ zio_dva_throttle(zio_t *zio)
return (zio); return (zio);
} }
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG); ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
ASSERT3U(zio->io_queued_timestamp, >, 0); ASSERT3U(zio->io_queued_timestamp, >, 0);
ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
@ -3435,14 +3434,14 @@ zio_dva_throttle(zio_t *zio)
* into 2^20 block regions, and then hash based on the objset, object, * into 2^20 block regions, and then hash based on the objset, object,
* level, and region to accomplish both of these goals. * level, and region to accomplish both of these goals.
*/ */
zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, int allocator = (uint_t)cityhash4(bm->zb_objset, bm->zb_object,
bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count;
mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); zio->io_allocator = allocator;
ASSERT(zio->io_type == ZIO_TYPE_WRITE);
zio->io_metaslab_class = mc; zio->io_metaslab_class = mc;
avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
nio = zio_io_to_allocate(spa, zio->io_allocator); avl_add(&spa->spa_allocs[allocator].spaa_tree, zio);
mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); nio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
return (nio); return (nio);
} }
@ -3451,9 +3450,9 @@ zio_allocate_dispatch(spa_t *spa, int allocator)
{ {
zio_t *zio; zio_t *zio;
mutex_enter(&spa->spa_alloc_locks[allocator]); mutex_enter(&spa->spa_allocs[allocator].spaa_lock);
zio = zio_io_to_allocate(spa, allocator); zio = zio_io_to_allocate(spa, allocator);
mutex_exit(&spa->spa_alloc_locks[allocator]); mutex_exit(&spa->spa_allocs[allocator].spaa_lock);
if (zio == NULL) if (zio == NULL)
return; return;
@ -3643,8 +3642,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp,
* some parallelism. * some parallelism.
*/ */
int flags = METASLAB_FASTWRITE | METASLAB_ZIL; int flags = METASLAB_FASTWRITE | METASLAB_ZIL;
int allocator = cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % int allocator = (uint_t)cityhash4(0, 0, 0,
spa->spa_alloc_count; os->os_dsl_dataset->ds_object) % spa->spa_alloc_count;
error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1,
txg, NULL, flags, &io_alloc_list, NULL, allocator); txg, NULL, flags, &io_alloc_list, NULL, allocator);
*slog = (error == 0); *slog = (error == 0);