Use a struct to organize metaslab-group-allocator fields
Each metaslab group (of which there is one per top-level vdev) has several (4, by default) "metaslab group allocators". Each "allocator" has its own metaslab that it prefers to allocate from (the "primary" metaslab), and each can perform allocations concurrently with the other allocators.

In addition to the primary metaslab, there are several other fields that need to be tracked separately for each allocator. Before this change, these were stored as several arrays in the metaslab_group_t, each array indexed by allocator number.

This change organizes all the metaslab-group-allocator-specific fields into a new struct, metaslab_group_allocator_t. The metaslab_group_t now needs only one array indexed by the allocator number, whose elements are metaslab_group_allocator_t structs.

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #10213
This commit is contained in:
parent
a84c92f933
commit
32d805c3e2
|
@ -203,6 +203,16 @@ struct metaslab_class {
|
|||
multilist_t *mc_metaslab_txg_list;
|
||||
};
|
||||
|
||||
/*
|
||||
* Per-allocator data structure.
|
||||
*/
|
||||
typedef struct metaslab_group_allocator {
|
||||
uint64_t mga_cur_max_alloc_queue_depth;
|
||||
zfs_refcount_t mga_alloc_queue_depth;
|
||||
metaslab_t *mga_primary;
|
||||
metaslab_t *mga_secondary;
|
||||
} metaslab_group_allocator_t;
|
||||
|
||||
/*
|
||||
* Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
|
||||
* of a top-level vdev. They are linked together to form a circular linked
|
||||
|
@ -214,8 +224,6 @@ struct metaslab_class {
|
|||
*/
|
||||
struct metaslab_group {
|
||||
kmutex_t mg_lock;
|
||||
metaslab_t **mg_primaries;
|
||||
metaslab_t **mg_secondaries;
|
||||
avl_tree_t mg_metaslab_tree;
|
||||
uint64_t mg_aliquot;
|
||||
boolean_t mg_allocatable; /* can we allocate? */
|
||||
|
@ -263,9 +271,8 @@ struct metaslab_group {
|
|||
* groups are unable to handle their share of allocations.
|
||||
*/
|
||||
uint64_t mg_max_alloc_queue_depth;
|
||||
uint64_t *mg_cur_max_alloc_queue_depth;
|
||||
zfs_refcount_t *mg_alloc_queue_depth;
|
||||
int mg_allocators;
|
||||
metaslab_group_allocator_t *mg_allocator; /* array */
|
||||
/*
|
||||
* A metaslab group that can no longer allocate the minimum block
|
||||
* size will set mg_no_free_space. Once a metaslab group is out
|
||||
|
|
|
@ -814,10 +814,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
|
|||
mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL);
|
||||
cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL);
|
||||
mg->mg_primaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
|
||||
KM_SLEEP);
|
||||
mg->mg_secondaries = kmem_zalloc(allocators * sizeof (metaslab_t *),
|
||||
KM_SLEEP);
|
||||
avl_create(&mg->mg_metaslab_tree, metaslab_compare,
|
||||
sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node));
|
||||
mg->mg_vd = vd;
|
||||
|
@ -827,13 +823,11 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
|
|||
mg->mg_no_free_space = B_TRUE;
|
||||
mg->mg_allocators = allocators;
|
||||
|
||||
mg->mg_alloc_queue_depth = kmem_zalloc(allocators *
|
||||
sizeof (zfs_refcount_t), KM_SLEEP);
|
||||
mg->mg_cur_max_alloc_queue_depth = kmem_zalloc(allocators *
|
||||
sizeof (uint64_t), KM_SLEEP);
|
||||
mg->mg_allocator = kmem_zalloc(allocators *
|
||||
sizeof (metaslab_group_allocator_t), KM_SLEEP);
|
||||
for (int i = 0; i < allocators; i++) {
|
||||
zfs_refcount_create_tracked(&mg->mg_alloc_queue_depth[i]);
|
||||
mg->mg_cur_max_alloc_queue_depth[i] = 0;
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
|
||||
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
|
||||
}
|
||||
|
||||
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
|
||||
|
@ -856,21 +850,16 @@ metaslab_group_destroy(metaslab_group_t *mg)
|
|||
|
||||
taskq_destroy(mg->mg_taskq);
|
||||
avl_destroy(&mg->mg_metaslab_tree);
|
||||
kmem_free(mg->mg_primaries, mg->mg_allocators * sizeof (metaslab_t *));
|
||||
kmem_free(mg->mg_secondaries, mg->mg_allocators *
|
||||
sizeof (metaslab_t *));
|
||||
mutex_destroy(&mg->mg_lock);
|
||||
mutex_destroy(&mg->mg_ms_disabled_lock);
|
||||
cv_destroy(&mg->mg_ms_disabled_cv);
|
||||
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
zfs_refcount_destroy(&mg->mg_alloc_queue_depth[i]);
|
||||
mg->mg_cur_max_alloc_queue_depth[i] = 0;
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
|
||||
zfs_refcount_destroy(&mga->mga_alloc_queue_depth);
|
||||
}
|
||||
kmem_free(mg->mg_alloc_queue_depth, mg->mg_allocators *
|
||||
sizeof (zfs_refcount_t));
|
||||
kmem_free(mg->mg_cur_max_alloc_queue_depth, mg->mg_allocators *
|
||||
sizeof (uint64_t));
|
||||
kmem_free(mg->mg_allocator, mg->mg_allocators *
|
||||
sizeof (metaslab_group_allocator_t));
|
||||
|
||||
kmem_free(mg, sizeof (metaslab_group_t));
|
||||
}
|
||||
|
@ -951,14 +940,15 @@ metaslab_group_passivate(metaslab_group_t *mg)
|
|||
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
|
||||
metaslab_group_alloc_update(mg);
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
metaslab_t *msp = mg->mg_primaries[i];
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[i];
|
||||
metaslab_t *msp = mga->mga_primary;
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
metaslab_weight_from_range_tree(msp));
|
||||
mutex_exit(&msp->ms_lock);
|
||||
}
|
||||
msp = mg->mg_secondaries[i];
|
||||
msp = mga->mga_secondary;
|
||||
if (msp != NULL) {
|
||||
mutex_enter(&msp->ms_lock);
|
||||
metaslab_passivate(msp,
|
||||
|
@ -1218,9 +1208,9 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
|||
* regardless of the mg_allocatable or throttle settings.
|
||||
*/
|
||||
if (mg->mg_allocatable) {
|
||||
metaslab_group_t *mgp;
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
int64_t qdepth;
|
||||
uint64_t qmax = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
|
||||
|
||||
if (!mc->mc_alloc_throttle_enabled)
|
||||
return (B_TRUE);
|
||||
|
@ -1239,8 +1229,7 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
|||
*/
|
||||
qmax = qmax * (4 + d) / 4;
|
||||
|
||||
qdepth = zfs_refcount_count(
|
||||
&mg->mg_alloc_queue_depth[allocator]);
|
||||
qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
|
||||
|
||||
/*
|
||||
* If this metaslab group is below its qmax or it's
|
||||
|
@ -1258,11 +1247,14 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
|
|||
* racy since we can't hold the locks for all metaslab
|
||||
* groups at the same time when we make this check.
|
||||
*/
|
||||
for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
|
||||
qmax = mgp->mg_cur_max_alloc_queue_depth[allocator];
|
||||
for (metaslab_group_t *mgp = mg->mg_next;
|
||||
mgp != rotor; mgp = mgp->mg_next) {
|
||||
metaslab_group_allocator_t *mgap =
|
||||
&mgp->mg_allocator[allocator];
|
||||
qmax = mgap->mga_cur_max_alloc_queue_depth;
|
||||
qmax = qmax * (4 + d) / 4;
|
||||
qdepth = zfs_refcount_count(
|
||||
&mgp->mg_alloc_queue_depth[allocator]);
|
||||
qdepth =
|
||||
zfs_refcount_count(&mgap->mga_alloc_queue_depth);
|
||||
|
||||
/*
|
||||
* If there is another metaslab group that
|
||||
|
@ -3205,6 +3197,7 @@ static int
|
|||
metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
||||
int allocator, uint64_t activation_weight)
|
||||
{
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
ASSERT(MUTEX_HELD(&msp->ms_lock));
|
||||
|
||||
/*
|
||||
|
@ -3219,16 +3212,16 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
|||
return (0);
|
||||
}
|
||||
|
||||
metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
|
||||
mg->mg_primaries : mg->mg_secondaries);
|
||||
metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
|
||||
&mga->mga_primary : &mga->mga_secondary);
|
||||
|
||||
mutex_enter(&mg->mg_lock);
|
||||
if (arr[allocator] != NULL) {
|
||||
if (*mspp != NULL) {
|
||||
mutex_exit(&mg->mg_lock);
|
||||
return (EEXIST);
|
||||
}
|
||||
|
||||
arr[allocator] = msp;
|
||||
*mspp = msp;
|
||||
ASSERT3S(msp->ms_allocator, ==, -1);
|
||||
msp->ms_allocator = allocator;
|
||||
msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
|
||||
|
@ -3237,7 +3230,6 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
|||
msp->ms_activation_weight = msp->ms_weight;
|
||||
metaslab_group_sort_impl(mg, msp,
|
||||
msp->ms_weight | activation_weight);
|
||||
|
||||
mutex_exit(&mg->mg_lock);
|
||||
|
||||
return (0);
|
||||
|
@ -3337,14 +3329,15 @@ metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
|
|||
ASSERT3S(0, <=, msp->ms_allocator);
|
||||
ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
|
||||
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
|
||||
if (msp->ms_primary) {
|
||||
ASSERT3P(mg->mg_primaries[msp->ms_allocator], ==, msp);
|
||||
ASSERT3P(mga->mga_primary, ==, msp);
|
||||
ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
|
||||
mg->mg_primaries[msp->ms_allocator] = NULL;
|
||||
mga->mga_primary = NULL;
|
||||
} else {
|
||||
ASSERT3P(mg->mg_secondaries[msp->ms_allocator], ==, msp);
|
||||
ASSERT3P(mga->mga_secondary, ==, msp);
|
||||
ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
|
||||
mg->mg_secondaries[msp->ms_allocator] = NULL;
|
||||
mga->mga_secondary = NULL;
|
||||
}
|
||||
msp->ms_allocator = -1;
|
||||
metaslab_group_sort_impl(mg, msp, weight);
|
||||
|
@ -4493,22 +4486,24 @@ metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags,
|
|||
if (!mg->mg_class->mc_alloc_throttle_enabled)
|
||||
return;
|
||||
|
||||
(void) zfs_refcount_add(&mg->mg_alloc_queue_depth[allocator], tag);
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
(void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag);
|
||||
}
|
||||
|
||||
static void
|
||||
metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator)
|
||||
{
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
uint64_t max = mg->mg_max_alloc_queue_depth;
|
||||
uint64_t cur = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
|
||||
while (cur < max) {
|
||||
if (atomic_cas_64(&mg->mg_cur_max_alloc_queue_depth[allocator],
|
||||
if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth,
|
||||
cur, cur + 1) == cur) {
|
||||
atomic_inc_64(
|
||||
&mg->mg_class->mc_alloc_max_slots[allocator]);
|
||||
return;
|
||||
}
|
||||
cur = mg->mg_cur_max_alloc_queue_depth[allocator];
|
||||
cur = mga->mga_cur_max_alloc_queue_depth;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4524,7 +4519,8 @@ metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags,
|
|||
if (!mg->mg_class->mc_alloc_throttle_enabled)
|
||||
return;
|
||||
|
||||
(void) zfs_refcount_remove(&mg->mg_alloc_queue_depth[allocator], tag);
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
(void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag);
|
||||
if (io_complete)
|
||||
metaslab_group_increment_qdepth(mg, allocator);
|
||||
}
|
||||
|
@ -4540,8 +4536,8 @@ metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag,
|
|||
for (int d = 0; d < ndvas; d++) {
|
||||
uint64_t vdev = DVA_GET_VDEV(&dva[d]);
|
||||
metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
|
||||
VERIFY(zfs_refcount_not_held(
|
||||
&mg->mg_alloc_queue_depth[allocator], tag));
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag));
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -4716,6 +4712,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
|||
*/
|
||||
if (mg->mg_ms_ready < mg->mg_allocators * 3)
|
||||
allocator = 0;
|
||||
metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
|
||||
|
||||
ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2);
|
||||
|
||||
|
@ -4737,8 +4734,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
|||
mutex_enter(&mg->mg_lock);
|
||||
|
||||
if (activation_weight == METASLAB_WEIGHT_PRIMARY &&
|
||||
mg->mg_primaries[allocator] != NULL) {
|
||||
msp = mg->mg_primaries[allocator];
|
||||
mga->mga_primary != NULL) {
|
||||
msp = mga->mga_primary;
|
||||
|
||||
/*
|
||||
* Even though we don't hold the ms_lock for the
|
||||
|
@ -4753,8 +4750,8 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
|
|||
was_active = B_TRUE;
|
||||
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
|
||||
} else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
|
||||
mg->mg_secondaries[allocator] != NULL) {
|
||||
msp = mg->mg_secondaries[allocator];
|
||||
mga->mga_secondary != NULL) {
|
||||
msp = mga->mga_secondary;
|
||||
|
||||
/*
|
||||
* See comment above about the similar assertions
|
||||
|
|
|
@ -8720,13 +8720,14 @@ spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
|
|||
* allocations look at mg_max_alloc_queue_depth, and async
|
||||
* allocations all happen from spa_sync().
|
||||
*/
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++)
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
ASSERT0(zfs_refcount_count(
|
||||
&(mg->mg_alloc_queue_depth[i])));
|
||||
&(mg->mg_allocator[i].mga_alloc_queue_depth)));
|
||||
}
|
||||
mg->mg_max_alloc_queue_depth = max_queue_depth;
|
||||
|
||||
for (int i = 0; i < spa->spa_alloc_count; i++) {
|
||||
mg->mg_cur_max_alloc_queue_depth[i] =
|
||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||
mg->mg_allocator[i].mga_cur_max_alloc_queue_depth =
|
||||
zfs_vdev_def_queue_depth;
|
||||
}
|
||||
slots_per_allocator += zfs_vdev_def_queue_depth;
|
||||
|
|
Loading…
Reference in New Issue