Initialize metaslab range trees in metaslab_init
= Motivation We've noticed several zloop crashes within Delphix generated due to the following sequence of events: - A device gets expanded and new metaslabas are allocated for it. These metaslabs go through `metaslab_init()` but haven't gone through `metaslab_sync_done()` yet. This meas that the only range tree that's actually set is the `ms_allocatable`. All the others are NULL. - A vdev_initialization is issues and `vdev_initialize_thread` starts processing one of these new metaslabs of the expanded vdev. - As part of `vdev_initialize_calculate_progress()` we call into `metaslab_load()` and `metaslab_load_impl()` which in turn tries to dereference the metaslabs trees that are still NULL and therefore we crash. The same failure can come up from the `vdev_trim` code paths. = This Patch We considered the following solutions to deal with this issue: [A] Add logic to `vdev_initialize/trim` to skip those new metaslabs. We decided against this as it would be good to avoid exposing this lower-level detail to higer-level operations. [B] Have `metaslab_load_impl()` return early for new metaslabs and thus never touch those range_trees that are NULL at that time. This seemed more of a work-around for the bug and not a clear-cut solution. [C] Refactor our logic so all metaslabs have their range_trees created at the time of their creatin in `metaslab_init()`. In this patch we decided to go with [C] because: (1) It doesn't expose more metaslab details to higher level operations such as vdev initialize and trim. (2) The current behavior of creating the range trees lazily in `metaslab_sync_done()` is unnecessarily complicated. (3) Always initializing the metaslab range_trees makes other parts of the codebase cleaner. For example, we used to use `ms_freed` as the reference value for knowing whether all the range_trees have been initialized. Now we no longer need to do that check in most places (and in the few that we do we use the `ms_new` boolean field now which is more readable). = Side Changes Probably due to a mismerge we set `ms_loaded` to `B_TRUE` twice in `metasloab_load_impl()`. In this patch we remove the extraneous assignment. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Serapheim Dimitropoulos <serapheim@delphix.com> Closes #11737
This commit is contained in:
parent
ffd6978ef5
commit
793c958f6f
|
@ -2316,18 +2316,13 @@ metaslab_load_impl(metaslab_t *msp)
|
|||
range_tree_add(msp->ms_allocatable,
|
||||
msp->ms_start, msp->ms_size);
|
||||
|
||||
if (msp->ms_freed != NULL) {
|
||||
if (msp->ms_new) {
|
||||
/*
|
||||
* If the ms_sm doesn't exist, this means that this
|
||||
* metaslab hasn't gone through metaslab_sync() and
|
||||
* thus has never been dirtied. So we shouldn't
|
||||
* expect any unflushed allocs or frees from previous
|
||||
* TXGs.
|
||||
*
|
||||
* Note: ms_freed and all the other trees except for
|
||||
* the ms_allocatable, can be NULL at this point only
|
||||
* if this is a new metaslab of a vdev that just got
|
||||
* expanded.
|
||||
*/
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
|
||||
ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
|
||||
|
@ -2365,8 +2360,6 @@ metaslab_load_impl(metaslab_t *msp)
|
|||
range_tree_walk(msp->ms_unflushed_frees,
|
||||
range_tree_add, msp->ms_allocatable);
|
||||
|
||||
msp->ms_loaded = B_TRUE;
|
||||
|
||||
ASSERT3P(msp->ms_group, !=, NULL);
|
||||
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
|
||||
if (spa_syncing_log_sm(spa) != NULL) {
|
||||
|
@ -2680,19 +2673,31 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
|
|||
ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
|
||||
}
|
||||
|
||||
range_seg_type_t type;
|
||||
uint64_t shift, start;
|
||||
type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
|
||||
range_seg_type_t type =
|
||||
metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
|
||||
|
||||
/*
|
||||
* We create the ms_allocatable here, but we don't create the
|
||||
* other range trees until metaslab_sync_done(). This serves
|
||||
* two purposes: it allows metaslab_sync_done() to detect the
|
||||
* addition of new space; and for debugging, it ensures that
|
||||
* we'd data fault on any attempt to use this metaslab before
|
||||
* it's ready.
|
||||
*/
|
||||
ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
ms->ms_allocating[t] = range_tree_create(NULL, type,
|
||||
NULL, start, shift);
|
||||
}
|
||||
ms->ms_freeing = range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_freed = range_tree_create(NULL, type, NULL, start, shift);
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
ms->ms_defer[t] = range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
}
|
||||
ms->ms_checkpointing =
|
||||
range_tree_create(NULL, type, NULL, start, shift);
|
||||
ms->ms_unflushed_allocs =
|
||||
range_tree_create(NULL, type, NULL, start, shift);
|
||||
|
||||
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
|
||||
mrap->mra_bt = &ms->ms_unflushed_frees_by_size;
|
||||
mrap->mra_floor_shift = metaslab_by_size_min_shift;
|
||||
ms->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
|
||||
type, mrap, start, shift);
|
||||
|
||||
ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
|
||||
|
||||
|
@ -2765,13 +2770,13 @@ metaslab_fini(metaslab_t *msp)
|
|||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
VERIFY(msp->ms_group == NULL);
|
||||
|
||||
/*
|
||||
* If the range trees haven't been allocated, this metaslab hasn't
|
||||
* been through metaslab_sync_done() for the first time yet, so its
|
||||
* If this metaslab hasn't been through metaslab_sync_done() yet its
|
||||
* space hasn't been accounted for in its vdev and doesn't need to be
|
||||
* subtracted.
|
||||
*/
|
||||
if (msp->ms_freed != NULL) {
|
||||
if (!msp->ms_new) {
|
||||
metaslab_space_update(vd, mg->mg_class,
|
||||
-metaslab_allocated_space(msp), 0, -msp->ms_size);
|
||||
|
||||
|
@ -2782,27 +2787,24 @@ metaslab_fini(metaslab_t *msp)
|
|||
metaslab_unload(msp);
|
||||
|
||||
range_tree_destroy(msp->ms_allocatable);
|
||||
range_tree_destroy(msp->ms_freeing);
|
||||
range_tree_destroy(msp->ms_freed);
|
||||
|
||||
if (msp->ms_freed != NULL) {
|
||||
range_tree_destroy(msp->ms_freeing);
|
||||
range_tree_destroy(msp->ms_freed);
|
||||
ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
|
||||
metaslab_unflushed_changes_memused(msp));
|
||||
spa->spa_unflushed_stats.sus_memused -=
|
||||
metaslab_unflushed_changes_memused(msp);
|
||||
range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
|
||||
range_tree_destroy(msp->ms_unflushed_allocs);
|
||||
range_tree_destroy(msp->ms_checkpointing);
|
||||
range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
|
||||
range_tree_destroy(msp->ms_unflushed_frees);
|
||||
|
||||
ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
|
||||
metaslab_unflushed_changes_memused(msp));
|
||||
spa->spa_unflushed_stats.sus_memused -=
|
||||
metaslab_unflushed_changes_memused(msp);
|
||||
range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
|
||||
range_tree_destroy(msp->ms_unflushed_allocs);
|
||||
range_tree_destroy(msp->ms_checkpointing);
|
||||
range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
|
||||
range_tree_destroy(msp->ms_unflushed_frees);
|
||||
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
range_tree_destroy(msp->ms_allocating[t]);
|
||||
}
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
range_tree_destroy(msp->ms_defer[t]);
|
||||
}
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
range_tree_destroy(msp->ms_allocating[t]);
|
||||
}
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
range_tree_destroy(msp->ms_defer[t]);
|
||||
}
|
||||
ASSERT0(msp->ms_deferspace);
|
||||
|
||||
|
@ -3926,17 +3928,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
|||
/*
|
||||
* This metaslab has just been added so there's no work to do now.
|
||||
*/
|
||||
if (msp->ms_freeing == NULL) {
|
||||
ASSERT3P(alloctree, ==, NULL);
|
||||
if (msp->ms_new) {
|
||||
ASSERT0(range_tree_space(alloctree));
|
||||
ASSERT0(range_tree_space(msp->ms_freeing));
|
||||
ASSERT0(range_tree_space(msp->ms_freed));
|
||||
ASSERT0(range_tree_space(msp->ms_checkpointing));
|
||||
ASSERT0(range_tree_space(msp->ms_trim));
|
||||
return;
|
||||
}
|
||||
|
||||
ASSERT3P(alloctree, !=, NULL);
|
||||
ASSERT3P(msp->ms_freeing, !=, NULL);
|
||||
ASSERT3P(msp->ms_freed, !=, NULL);
|
||||
ASSERT3P(msp->ms_checkpointing, !=, NULL);
|
||||
ASSERT3P(msp->ms_trim, !=, NULL);
|
||||
|
||||
/*
|
||||
* Normally, we don't want to process a metaslab if there are no
|
||||
* allocations or frees to perform. However, if the metaslab is being
|
||||
|
@ -4240,54 +4240,15 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
|
|||
|
||||
mutex_enter(&msp->ms_lock);
|
||||
|
||||
/*
|
||||
* If this metaslab is just becoming available, initialize its
|
||||
* range trees and add its capacity to the vdev.
|
||||
*/
|
||||
if (msp->ms_freed == NULL) {
|
||||
range_seg_type_t type;
|
||||
uint64_t shift, start;
|
||||
type = metaslab_calculate_range_tree_type(vd, msp, &start,
|
||||
&shift);
|
||||
|
||||
for (int t = 0; t < TXG_SIZE; t++) {
|
||||
ASSERT(msp->ms_allocating[t] == NULL);
|
||||
|
||||
msp->ms_allocating[t] = range_tree_create(NULL, type,
|
||||
NULL, start, shift);
|
||||
}
|
||||
|
||||
ASSERT3P(msp->ms_freeing, ==, NULL);
|
||||
msp->ms_freeing = range_tree_create(NULL, type, NULL, start,
|
||||
shift);
|
||||
|
||||
ASSERT3P(msp->ms_freed, ==, NULL);
|
||||
msp->ms_freed = range_tree_create(NULL, type, NULL, start,
|
||||
shift);
|
||||
|
||||
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
|
||||
ASSERT3P(msp->ms_defer[t], ==, NULL);
|
||||
msp->ms_defer[t] = range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
}
|
||||
|
||||
ASSERT3P(msp->ms_checkpointing, ==, NULL);
|
||||
msp->ms_checkpointing = range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
|
||||
ASSERT3P(msp->ms_unflushed_allocs, ==, NULL);
|
||||
msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL,
|
||||
start, shift);
|
||||
|
||||
metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
|
||||
mrap->mra_bt = &msp->ms_unflushed_frees_by_size;
|
||||
mrap->mra_floor_shift = metaslab_by_size_min_shift;
|
||||
ASSERT3P(msp->ms_unflushed_frees, ==, NULL);
|
||||
msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops,
|
||||
type, mrap, start, shift);
|
||||
|
||||
if (msp->ms_new) {
|
||||
/* this is a new metaslab, add its capacity to the vdev */
|
||||
metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size);
|
||||
|
||||
/* there should be no allocations nor frees at this point */
|
||||
VERIFY0(msp->ms_allocated_this_txg);
|
||||
VERIFY0(range_tree_space(msp->ms_freed));
|
||||
}
|
||||
|
||||
ASSERT0(range_tree_space(msp->ms_freeing));
|
||||
ASSERT0(range_tree_space(msp->ms_checkpointing));
|
||||
|
||||
|
|
Loading…
Reference in New Issue