Cap metaslab memory usage

On systems with large amounts of storage and high fragmentation, a huge
amount of memory can be consumed by metaslab range trees. Since metaslabs
are only unloaded during a txg sync, and only if they have been inactive
for 8 txgs, it is possible to get into a state where all of the system's
memory is consumed by range trees and metaslabs, and txgs cannot sync.
While ZFS knows how to evict ARC data when needed, it has no such
mechanism for range tree data, which can result in boot hangs on some
system configurations.

First, we add the ability to unload metaslabs outside of syncing context.
Second, we store a multilist of all loaded metaslabs, sorted by their
selection txg, so we can quickly identify the oldest metaslabs; a
multilist is used to reduce lock contention during heavy write workloads.
Finally, when loading a new metaslab, we unload the least recently used
one if range trees are consuming more than a set fraction of available
memory.
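
As a rough illustration of the memory check (this is a standalone sketch,
not part of the change; the helper name and the numbers are made up), the
arithmetic mirrors the comparison used by the new metaslab_potentially_evict():
eviction is considered once the range-segment cache footprint exceeds
zfs_metaslab_mem_limit percent of total memory.

#include <stdint.h>
#include <stdio.h>

static int zfs_metaslab_mem_limit = 75;    /* percent of total memory */

/*
 * True once the memory pinned by cached range segments exceeds the
 * configured percentage of all system memory.
 */
static int
over_mem_limit(uint64_t allmem, uint64_t segs, uint64_t seg_size)
{
    return (allmem * zfs_metaslab_mem_limit / 100 < segs * seg_size);
}

int
main(void)
{
    uint64_t allmem = 16ULL << 30;     /* 16 GiB of system memory */
    uint64_t segs = 200ULL << 20;      /* ~210 million cached range segs */
    uint64_t seg_size = 64;            /* illustrative bytes per segment */

    /* ~12.5 GiB of range trees vs. a 12 GiB (75%) limit -> over */
    printf("over limit: %d\n", over_mem_limit(allmem, segs, seg_size));
    return (0);
}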

Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Sebastien Roy <sebastien.roy@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Paul Dagnelie <pcd@delphix.com>
Closes #9128
Author: Paul Dagnelie <pcd@delphix.com>
Date: 2019-08-16 08:08:21 -07:00
Committed by: Brian Behlendorf
parent 9323aad14d
commit f09fda5071
11 changed files with 289 additions and 58 deletions

@@ -291,6 +291,7 @@ void arc_flush(spa_t *spa, boolean_t retry);
 void arc_tempreserve_clear(uint64_t reserve);
 int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg);
+uint64_t arc_all_memory(void);
 uint64_t arc_target_bytes(void);
 void arc_init(void);
 void arc_fini(void);

@@ -57,7 +57,6 @@ int metaslab_sort_by_flushed(const void *, const void *);
 uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
 int metaslab_load(metaslab_t *);
-void metaslab_potentially_unload(metaslab_t *, uint64_t);
 void metaslab_unload(metaslab_t *);
 boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
@@ -110,7 +109,7 @@ uint64_t metaslab_class_expandable_space(metaslab_class_t *);
 boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
     zio_t *, int);
 void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
+void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
 uint64_t metaslab_class_get_alloc(metaslab_class_t *);
 uint64_t metaslab_class_get_space(metaslab_class_t *);
 uint64_t metaslab_class_get_dspace(metaslab_class_t *);
@@ -133,7 +132,8 @@ void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
 void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
 void metaslab_recalculate_weight_and_sort(metaslab_t *);
 void metaslab_disable(metaslab_t *);
-void metaslab_enable(metaslab_t *, boolean_t);
+void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
+void metaslab_set_selected_txg(metaslab_t *, uint64_t);
 extern int metaslab_debug_load;

@@ -36,6 +36,7 @@
 #include <sys/vdev.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
+#include <sys/multilist.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -194,6 +195,12 @@ struct metaslab_class {
     uint64_t        mc_space;    /* total space (alloc + free) */
     uint64_t        mc_dspace;   /* total deflated space */
     uint64_t        mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+    /*
+     * List of all loaded metaslabs in the class, sorted in order of most
+     * recent use.
+     */
+    multilist_t     *mc_metaslab_txg_list;
 };
 
 /*
@@ -378,6 +385,7 @@ struct metaslab {
     range_tree_t    *ms_allocating[TXG_SIZE];
     range_tree_t    *ms_allocatable;
     uint64_t        ms_allocated_this_txg;
+    uint64_t        ms_allocating_total;
 
     /*
      * The following range trees are accessed only from syncing context.
@@ -508,6 +516,10 @@ struct metaslab {
     avl_node_t      ms_group_node;   /* node in metaslab group tree */
     txg_node_t      ms_txg_node;     /* per-txg dirty metaslab links */
     avl_node_t      ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+    /*
+     * Node in metaslab class's selected txg list
+     */
+    multilist_node_t    ms_class_txg_node;
 
     /*
      * Allocs and frees that are committed to the vdev log spacemap but

@@ -386,6 +386,21 @@ considering only the histogram instead.
 Default value: \fB3600 seconds\fR (one hour)
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_mem_limit\fR (int)
+.ad
+.RS 12n
+When we are loading a new metaslab, we check the amount of memory being used
+to store metaslab range trees. If it is over a threshold, we attempt to unload
+the least recently used metaslab to prevent the system from clogging all of
+its memory with range trees. This tunable sets the percentage of total system
+memory that is the threshold.
+.sp
+Default value: \fB75 percent\fR
+.RE
+
 .sp
 .ne 2
 .na
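To put the default in perspective: on a system with 32 GiB of RAM and
zfs_metaslab_mem_limit left at 75, loaded metaslab range trees can grow to
roughly 24 GiB (75% of 32 GiB) before loading another metaslab starts
evicting the least recently used one.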

@@ -1110,7 +1110,6 @@ static boolean_t arc_is_overflowing(void);
 static void arc_buf_watch(arc_buf_t *);
 static void arc_tuning_update(void);
 static void arc_prune_async(int64_t);
-static uint64_t arc_all_memory(void);
 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
@@ -4828,7 +4827,7 @@ arc_reduce_target_size(int64_t to_free)
  * Return maximum amount of memory that we could possibly use. Reduced
  * to half of all memory in user space which is primarily used for testing.
  */
-static uint64_t
+uint64_t
 arc_all_memory(void)
 {
 #ifdef _KERNEL

@@ -278,6 +278,13 @@ int max_disabled_ms = 3;
  */
 unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
 
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 75;
+
 static uint64_t metaslab_weight(metaslab_t *);
 static void metaslab_set_fragmentation(metaslab_t *);
 static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
@@ -286,6 +293,8 @@ static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
 static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
 static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
 static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
 
 #ifdef _METASLAB_TRACING
 kmem_cache_t *metaslab_alloc_trace_cache;
 #endif
@@ -306,6 +315,8 @@ metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
     mc->mc_rotor = NULL;
     mc->mc_ops = ops;
     mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+    mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+        offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
     mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
         sizeof (zfs_refcount_t), KM_SLEEP);
     mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
@@ -332,6 +343,7 @@ metaslab_class_destroy(metaslab_class_t *mc)
     kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
         sizeof (uint64_t));
     mutex_destroy(&mc->mc_lock);
+    multilist_destroy(mc->mc_metaslab_txg_list);
     kmem_free(mc, sizeof (metaslab_class_t));
 }
@@ -517,6 +529,47 @@ metaslab_class_expandable_space(metaslab_class_t *mc)
     return (space);
 }
 
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+    multilist_t *ml = mc->mc_metaslab_txg_list;
+    for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+        multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+        metaslab_t *msp = multilist_sublist_head(mls);
+        multilist_sublist_unlock(mls);
+        while (msp != NULL) {
+            mutex_enter(&msp->ms_lock);
+            /*
+             * Once we've hit a metaslab selected too recently to
+             * evict, we're done evicting for now.
+             */
+            if (msp->ms_selected_txg + metaslab_unload_delay >=
+                txg) {
+                mutex_exit(&msp->ms_lock);
+                break;
+            }
+
+            /*
+             * If the metaslab has been removed from the list
+             * (which could happen if we were at the memory limit
+             * and it was evicted during this loop), then we can't
+             * proceed and we should restart the sublist.
+             */
+            if (!multilist_link_active(&msp->ms_class_txg_node)) {
+                mutex_exit(&msp->ms_lock);
+                i--;
+                break;
+            }
+            mls = multilist_sublist_lock(ml, i);
+            metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+            multilist_sublist_unlock(mls);
+            metaslab_evict(msp, txg);
+            mutex_exit(&msp->ms_lock);
+            msp = next_msp;
+        }
+    }
+}
+
 static int
 metaslab_compare(const void *x1, const void *x2)
 {
@@ -960,6 +1013,14 @@ metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
     mutex_enter(&mg->mg_lock);
     ASSERT(msp->ms_group == mg);
     avl_remove(&mg->mg_metaslab_tree, msp);
+
+    metaslab_class_t *mc = msp->ms_group->mg_class;
+    multilist_sublist_t *mls =
+        multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+    if (multilist_link_active(&msp->ms_class_txg_node))
+        multilist_sublist_remove(mls, msp);
+    multilist_sublist_unlock(mls);
+
     msp->ms_group = NULL;
     mutex_exit(&mg->mg_lock);
 }
@@ -1519,6 +1580,13 @@ metaslab_flush_wait(metaslab_t *msp)
         cv_wait(&msp->ms_flush_cv, &msp->ms_lock);
 }
 
+static unsigned int
+metaslab_idx_func(multilist_t *ml, void *arg)
+{
+    metaslab_t *msp = arg;
+    return (msp->ms_id % multilist_get_num_sublists(ml));
+}
+
 uint64_t
 metaslab_allocated_space(metaslab_t *msp)
 {
@@ -1577,6 +1645,8 @@ metaslab_verify_space(metaslab_t *msp, uint64_t txg)
         allocating +=
             range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]);
     }
+    ASSERT3U(allocating + msp->ms_allocated_this_txg, ==,
+        msp->ms_allocating_total);
 
     ASSERT3U(msp->ms_deferspace, ==,
         range_tree_space(msp->ms_defer[0]) +
@@ -1792,6 +1862,86 @@ metaslab_verify_weight_and_frag(metaslab_t *msp)
     VERIFY3U(msp->ms_weight, ==, weight);
 }
 
+/*
+ * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from
+ * this class that was used longest ago, and attempt to unload it. We don't
+ * want to spend too much time in this loop to prevent performance
+ * degradation, and we expect that most of the time this operation will
+ * succeed. Between that and the normal unloading processing during txg sync,
+ * we expect this to keep the metaslab memory usage under control.
+ */
+static void
+metaslab_potentially_evict(metaslab_class_t *mc)
+{
+#ifdef _KERNEL
+    uint64_t allmem = arc_all_memory();
+    extern kmem_cache_t *range_seg_cache;
+    uint64_t inuse = range_seg_cache->skc_obj_total;
+    uint64_t size = range_seg_cache->skc_obj_size;
+    int tries = 0;
+    for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size &&
+        tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2;
+        tries++) {
+        unsigned int idx = multilist_get_random_index(
+            mc->mc_metaslab_txg_list);
+        multilist_sublist_t *mls =
+            multilist_sublist_lock(mc->mc_metaslab_txg_list, idx);
+        metaslab_t *msp = multilist_sublist_head(mls);
+        multilist_sublist_unlock(mls);
+        while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
+            inuse * size) {
+            VERIFY3P(mls, ==, multilist_sublist_lock(
+                mc->mc_metaslab_txg_list, idx));
+            ASSERT3U(idx, ==,
+                metaslab_idx_func(mc->mc_metaslab_txg_list, msp));
+
+            if (!multilist_link_active(&msp->ms_class_txg_node)) {
+                multilist_sublist_unlock(mls);
+                break;
+            }
+            metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+            multilist_sublist_unlock(mls);
+            /*
+             * If the metaslab is currently loading there are two
+             * cases. If it's the metaslab we're evicting, we
+             * can't continue on or we'll panic when we attempt to
+             * recursively lock the mutex. If it's another
+             * metaslab that's loading, it can be safely skipped,
+             * since we know it's very new and therefore not a
+             * good eviction candidate. We check later once the
+             * lock is held that the metaslab is fully loaded
+             * before actually unloading it.
+             */
+            if (msp->ms_loading) {
+                msp = next_msp;
+                inuse = range_seg_cache->skc_obj_total;
+                continue;
+            }
+            /*
+             * We can't unload metaslabs with no spacemap because
+             * they're not ready to be unloaded yet. We can't
+             * unload metaslabs with outstanding allocations
+             * because doing so could cause the metaslab's weight
+             * to decrease while it's unloaded, which violates an
+             * invariant that we use to prevent unnecessary
+             * loading. We also don't unload metaslabs that are
+             * currently active because they are high-weight
+             * metaslabs that are likely to be used in the near
+             * future.
+             */
+            mutex_enter(&msp->ms_lock);
+            if (msp->ms_allocator == -1 && msp->ms_sm != NULL &&
+                msp->ms_allocating_total == 0) {
+                metaslab_unload(msp);
+            }
+            mutex_exit(&msp->ms_lock);
+            msp = next_msp;
+            inuse = range_seg_cache->skc_obj_total;
+        }
+    }
+#endif
+}
+
 static int
 metaslab_load_impl(metaslab_t *msp)
 {
@@ -2024,6 +2174,16 @@ metaslab_load(metaslab_t *msp)
      */
     ASSERT(!msp->ms_loaded);
 
+    /*
+     * If we're loading a metaslab in the normal class, consider evicting
+     * another one to keep our memory usage under the limit defined by the
+     * zfs_metaslab_mem_limit tunable.
+     */
+    if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+        msp->ms_group->mg_class) {
+        metaslab_potentially_evict(msp->ms_group->mg_class);
+    }
+
     int error = metaslab_load_impl(msp);
 
     ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -2038,7 +2198,13 @@ metaslab_unload(metaslab_t *msp)
 {
     ASSERT(MUTEX_HELD(&msp->ms_lock));
 
-    metaslab_verify_weight_and_frag(msp);
+    /*
+     * This can happen if a metaslab is selected for eviction (in
+     * metaslab_potentially_evict) and then unloaded during spa_sync (via
+     * metaslab_class_evict_old).
+     */
+    if (!msp->ms_loaded)
+        return;
 
     range_tree_vacate(msp->ms_allocatable, NULL, NULL);
     msp->ms_loaded = B_FALSE;
@@ -2047,6 +2213,15 @@ metaslab_unload(metaslab_t *msp)
     msp->ms_activation_weight = 0;
     msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
 
+    if (msp->ms_group != NULL) {
+        metaslab_class_t *mc = msp->ms_group->mg_class;
+        multilist_sublist_t *mls =
+            multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+        if (multilist_link_active(&msp->ms_class_txg_node))
+            multilist_sublist_remove(mls, msp);
+        multilist_sublist_unlock(mls);
+    }
+
     /*
      * We explicitly recalculate the metaslab's weight based on its space
      * map (as it is now not loaded). We want unload metaslabs to always
@@ -2063,6 +2238,20 @@ metaslab_unload(metaslab_t *msp)
     metaslab_recalculate_weight_and_sort(msp);
 }
 
+void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+    ASSERT(MUTEX_HELD(&msp->ms_lock));
+    metaslab_class_t *mc = msp->ms_group->mg_class;
+    multilist_sublist_t *mls =
+        multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+    if (multilist_link_active(&msp->ms_class_txg_node))
+        multilist_sublist_remove(mls, msp);
+    msp->ms_selected_txg = txg;
+    multilist_sublist_insert_tail(mls, msp);
+    multilist_sublist_unlock(mls);
+}
+
 void
 metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
     int64_t defer_delta, int64_t space_delta)
@@ -2091,6 +2280,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
     mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
     cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
     cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+    multilist_link_init(&ms->ms_class_txg_node);
 
     ms->ms_id = id;
     ms->ms_start = id << vd->vdev_ms_shift;
@@ -2703,8 +2893,13 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
      * If we're activating for the claim code, we don't want to actually
      * set the metaslab up for a specific allocator.
      */
-    if (activation_weight == METASLAB_WEIGHT_CLAIM)
+    if (activation_weight == METASLAB_WEIGHT_CLAIM) {
+        ASSERT0(msp->ms_activation_weight);
+        msp->ms_activation_weight = msp->ms_weight;
+        metaslab_group_sort(mg, msp, msp->ms_weight |
+            activation_weight);
         return (0);
+    }
 
     metaslab_t **arr = (activation_weight == METASLAB_WEIGHT_PRIMARY ?
         mg->mg_primaries : mg->mg_secondaries);
@@ -2719,6 +2914,12 @@ metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp,
     ASSERT3S(msp->ms_allocator, ==, -1);
     msp->ms_allocator = allocator;
     msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY);
+
+    ASSERT0(msp->ms_activation_weight);
+    msp->ms_activation_weight = msp->ms_weight;
+    metaslab_group_sort_impl(mg, msp,
+        msp->ms_weight | activation_weight);
     mutex_exit(&mg->mg_lock);
 
     return (0);
@@ -2795,11 +2996,6 @@ metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight)
         return (error);
     }
 
-    ASSERT0(msp->ms_activation_weight);
-    msp->ms_activation_weight = msp->ms_weight;
-    metaslab_group_sort(msp->ms_group, msp,
-        msp->ms_weight | activation_weight);
-
     ASSERT(msp->ms_loaded);
     ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
@@ -2894,14 +3090,15 @@ static void
 metaslab_preload(void *arg)
 {
     metaslab_t *msp = arg;
-    spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+    metaslab_class_t *mc = msp->ms_group->mg_class;
+    spa_t *spa = mc->mc_spa;
     fstrans_cookie_t cookie = spl_fstrans_mark();
 
     ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
     mutex_enter(&msp->ms_lock);
     (void) metaslab_load(msp);
-    msp->ms_selected_txg = spa_syncing_txg(spa);
+    metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
     mutex_exit(&msp->ms_lock);
     spl_fstrans_unmark(cookie);
 }
@@ -3613,28 +3810,21 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
     dmu_tx_commit(tx);
 }
 
-void
-metaslab_potentially_unload(metaslab_t *msp, uint64_t txg)
+static void
+metaslab_evict(metaslab_t *msp, uint64_t txg)
 {
-    /*
-     * If the metaslab is loaded and we've not tried to load or allocate
-     * from it in 'metaslab_unload_delay' txgs, then unload it.
-     */
-    if (msp->ms_loaded &&
-        msp->ms_disabled == 0 &&
-        msp->ms_selected_txg + metaslab_unload_delay < txg) {
-        for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
-            VERIFY0(range_tree_space(
-                msp->ms_allocating[(txg + t) & TXG_MASK]));
-        }
-        if (msp->ms_allocator != -1) {
-            metaslab_passivate(msp, msp->ms_weight &
-                ~METASLAB_ACTIVE_MASK);
-        }
-
-        if (!metaslab_debug_unload)
-            metaslab_unload(msp);
-    }
+    if (!msp->ms_loaded || msp->ms_disabled != 0)
+        return;
+
+    for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+        VERIFY0(range_tree_space(
+            msp->ms_allocating[(txg + t) & TXG_MASK]));
+    }
+    if (msp->ms_allocator != -1)
+        metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK);
+
+    if (!metaslab_debug_unload)
+        metaslab_unload(msp);
 }
 
 /*
@@ -3791,7 +3981,7 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
     ASSERT0(range_tree_space(msp->ms_freeing));
     ASSERT0(range_tree_space(msp->ms_freed));
     ASSERT0(range_tree_space(msp->ms_checkpointing));
-
+    msp->ms_allocating_total -= msp->ms_allocated_this_txg;
     msp->ms_allocated_this_txg = 0;
     mutex_exit(&msp->ms_lock);
 }
@@ -4072,6 +4262,7 @@ metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg)
             vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
 
         range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size);
+        msp->ms_allocating_total += size;
 
         /* Track the last successful allocation */
         msp->ms_alloc_txg = txg;
@@ -4250,6 +4441,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
             ASSERT(msp->ms_loaded);
             was_active = B_TRUE;
+            ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
         } else if (activation_weight == METASLAB_WEIGHT_SECONDARY &&
             mg->mg_secondaries[allocator] != NULL) {
             msp = mg->mg_secondaries[allocator];
@@ -4263,6 +4455,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
             ASSERT(msp->ms_loaded);
             was_active = B_TRUE;
+            ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
         } else {
             msp = find_valid_metaslab(mg, activation_weight, dva, d,
                 want_unique, asize, allocator, try_hard, zal,
@@ -4293,7 +4486,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
          * capable of handling our request. It's possible that
         * another thread may have changed the weight while we
          * were blocked on the metaslab lock. We check the
-         * active status first to see if we need to reselect
+         * active status first to see if we need to set_selected_txg
          * a new metaslab.
          */
         if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) {
@@ -4336,7 +4529,7 @@ metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal,
             continue;
         }
 
-        msp->ms_selected_txg = txg;
+        metaslab_set_selected_txg(msp, txg);
 
         int activation_error =
             metaslab_activate(msp, allocator, activation_weight);
@@ -5027,6 +5220,7 @@ metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
     mutex_enter(&msp->ms_lock);
     range_tree_remove(msp->ms_allocating[txg & TXG_MASK],
         offset, size);
+    msp->ms_allocating_total -= size;
 
     VERIFY(!msp->ms_condensing);
     VERIFY3U(offset, >=, msp->ms_start);
@@ -5158,10 +5352,20 @@ metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size,
     range_tree_clear(msp->ms_trim, offset, size);
 
     if (spa_writeable(spa)) {       /* don't dirty if we're zdb(1M) */
+        metaslab_class_t *mc = msp->ms_group->mg_class;
+        multilist_sublist_t *mls =
+            multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+        if (!multilist_link_active(&msp->ms_class_txg_node)) {
+            msp->ms_selected_txg = txg;
+            multilist_sublist_insert_head(mls, msp);
+        }
+        multilist_sublist_unlock(mls);
+
         if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK]))
             vdev_dirty(vd, VDD_METASLAB, msp, txg);
         range_tree_add(msp->ms_allocating[txg & TXG_MASK],
             offset, size);
+        msp->ms_allocating_total += size;
     }
 
     mutex_exit(&msp->ms_lock);
@@ -5571,7 +5775,7 @@ metaslab_disable(metaslab_t *msp)
 }
 
 void
-metaslab_enable(metaslab_t *msp, boolean_t sync)
+metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload)
 {
     metaslab_group_t *mg = msp->ms_group;
     spa_t *spa = mg->mg_vd->vdev_spa;
@@ -5589,6 +5793,8 @@ metaslab_enable(metaslab_t *msp, boolean_t sync)
     if (--msp->ms_disabled == 0) {
         mg->mg_ms_disabled--;
         cv_broadcast(&mg->mg_ms_disabled_cv);
+        if (unload)
+            metaslab_unload(msp);
     }
     mutex_exit(&msp->ms_lock);
     mutex_exit(&mg->mg_ms_disabled_lock);
@@ -5710,6 +5916,10 @@ MODULE_PARM_DESC(metaslab_df_use_largest_segment,
 module_param(zfs_metaslab_max_size_cache_sec, ulong, 0644);
 MODULE_PARM_DESC(zfs_metaslab_max_size_cache_sec,
     "how long to trust the cached max chunk size of a metaslab");
 
+module_param(zfs_metaslab_mem_limit, int, 0644);
+MODULE_PARM_DESC(zfs_metaslab_mem_limit,
+    "percentage of memory that can be used to store metaslab range trees");
+
 /* END CSTYLED */
 
 #endif
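Because the new tunable is registered via module_param() with mode 0644, it
can typically be inspected and adjusted at runtime on Linux through
/sys/module/zfs/parameters/zfs_metaslab_mem_limit, in addition to being set
when the module is loaded.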

@@ -9013,6 +9013,10 @@ spa_sync(spa_t *spa, uint64_t txg)
     while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
         != NULL)
         vdev_sync_done(vd, txg);
 
+    metaslab_class_evict_old(spa->spa_normal_class, txg);
+    metaslab_class_evict_old(spa->spa_log_class, txg);
+
     spa_sync_close_syncing_log_sm(spa);
 
     spa_update_dspace(spa);

@@ -1189,6 +1189,7 @@ out:
         if (metaslab_debug_load && m->ms_sm != NULL) {
             VERIFY0(metaslab_load(m));
+            metaslab_set_selected_txg(m, 0);
         }
         mutex_exit(&m->ms_lock);
     }

@@ -3262,20 +3262,6 @@ vdev_sync_done(vdev_t *vd, uint64_t txg)
         != NULL)
         metaslab_sync_done(msp, txg);
 
-    /*
-     * Because this function is only called on dirty vdevs, it's possible
-     * we won't consider all metaslabs for unloading on every
-     * txg. However, unless the system is largely idle it is likely that
-     * we will dirty all vdevs within a few txgs.
-     */
-    for (int i = 0; i < vd->vdev_ms_count; i++) {
-        msp = vd->vdev_ms[i];
-        mutex_enter(&msp->ms_lock);
-        if (msp->ms_sm != NULL)
-            metaslab_potentially_unload(msp, txg);
-        mutex_exit(&msp->ms_lock);
-    }
-
     if (reassess)
         metaslab_sync_reassess(vd->vdev_mg);
 }

@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
@@ -483,6 +483,7 @@ vdev_initialize_thread(void *arg)
     for (uint64_t i = 0; !vd->vdev_detached &&
         i < vd->vdev_top->vdev_ms_count; i++) {
         metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+        boolean_t unload_when_done = B_FALSE;
 
         /*
          * If we've expanded the top-level vdev or it's our
@@ -496,6 +497,8 @@ vdev_initialize_thread(void *arg)
         spa_config_exit(spa, SCL_CONFIG, FTAG);
         metaslab_disable(msp);
         mutex_enter(&msp->ms_lock);
+        if (!msp->ms_loaded && !msp->ms_loading)
+            unload_when_done = B_TRUE;
         VERIFY0(metaslab_load(msp));
 
         range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
@@ -503,7 +506,7 @@ vdev_initialize_thread(void *arg)
         mutex_exit(&msp->ms_lock);
         error = vdev_initialize_ranges(vd, deadbeef);
 
-        metaslab_enable(msp, B_TRUE);
+        metaslab_enable(msp, B_TRUE, unload_when_done);
         spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
         range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL);

@@ -837,7 +837,7 @@ vdev_trim_thread(void *arg)
         */
        if (msp->ms_sm == NULL && vd->vdev_trim_partial) {
            mutex_exit(&msp->ms_lock);
-           metaslab_enable(msp, B_FALSE);
+           metaslab_enable(msp, B_FALSE, B_FALSE);
            spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
            vdev_trim_calculate_progress(vd);
            continue;
@@ -849,7 +849,7 @@ vdev_trim_thread(void *arg)
        mutex_exit(&msp->ms_lock);
        error = vdev_trim_ranges(&ta);
-       metaslab_enable(msp, B_TRUE);
+       metaslab_enable(msp, B_TRUE, B_FALSE);
        spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
        range_tree_vacate(ta.trim_tree, NULL, NULL);
@@ -1154,7 +1154,7 @@ vdev_autotrim_thread(void *arg)
            if (msp->ms_sm == NULL ||
                range_tree_is_empty(msp->ms_trim)) {
                mutex_exit(&msp->ms_lock);
-               metaslab_enable(msp, B_FALSE);
+               metaslab_enable(msp, B_FALSE, B_FALSE);
                continue;
            }
@@ -1170,7 +1170,7 @@ vdev_autotrim_thread(void *arg)
             */
            if (msp->ms_disabled > 1) {
                mutex_exit(&msp->ms_lock);
-               metaslab_enable(msp, B_FALSE);
+               metaslab_enable(msp, B_FALSE, B_FALSE);
                continue;
            }
@@ -1288,7 +1288,7 @@ vdev_autotrim_thread(void *arg)
            range_tree_vacate(trim_tree, NULL, NULL);
            range_tree_destroy(trim_tree);
 
-           metaslab_enable(msp, issued_trim);
+           metaslab_enable(msp, issued_trim, B_FALSE);
            spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
            for (uint64_t c = 0; c < children; c++) {