Reduce number of metaslab preload taskq threads.
Before this change, ZFS created threads for 50% of CPUs for each top-level vdev. In addition, it created the same number of threads for embedded log groups (which have only one metaslab and don't need any preload). As a result, on a system with 80 CPUs and a pool of 60 vdevs, this produced 4800 metaslab preload threads, which is absolutely insane. This patch changes the preload threads to 50% of CPUs in one taskq per pool, so on the mentioned system there will be only 40 threads. Among other things, this fixes zdb on the mentioned system and pool on FreeBSD, which failed to create so many threads in one process. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Alexander Motin <mav@FreeBSD.org> Sponsored by: iXsystems, Inc. Closes #15319
This commit is contained in:
parent
75a2eb7fac
commit
342357cd9e
|
@ -250,7 +250,6 @@ struct metaslab_group {
|
||||||
int64_t mg_activation_count;
|
int64_t mg_activation_count;
|
||||||
metaslab_class_t *mg_class;
|
metaslab_class_t *mg_class;
|
||||||
vdev_t *mg_vd;
|
vdev_t *mg_vd;
|
||||||
taskq_t *mg_taskq;
|
|
||||||
metaslab_group_t *mg_prev;
|
metaslab_group_t *mg_prev;
|
||||||
metaslab_group_t *mg_next;
|
metaslab_group_t *mg_next;
|
||||||
|
|
||||||
|
|
|
@ -424,7 +424,9 @@ struct spa {
|
||||||
|
|
||||||
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
|
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
|
||||||
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
|
taskq_t *spa_zvol_taskq; /* Taskq for minor management */
|
||||||
|
taskq_t *spa_metaslab_taskq; /* Taskq for metaslab preload */
|
||||||
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
|
taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */
|
||||||
|
taskq_t *spa_upgrade_taskq; /* Taskq for upgrade jobs */
|
||||||
uint64_t spa_multihost; /* multihost aware (mmp) */
|
uint64_t spa_multihost; /* multihost aware (mmp) */
|
||||||
mmp_thread_t spa_mmp; /* multihost mmp thread */
|
mmp_thread_t spa_mmp; /* multihost mmp thread */
|
||||||
list_t spa_leaf_list; /* list of leaf vdevs */
|
list_t spa_leaf_list; /* list of leaf vdevs */
|
||||||
|
@ -448,8 +450,6 @@ struct spa {
|
||||||
*/
|
*/
|
||||||
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
|
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
|
||||||
zfs_refcount_t spa_refcount; /* number of opens */
|
zfs_refcount_t spa_refcount; /* number of opens */
|
||||||
|
|
||||||
taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */
|
|
||||||
};
|
};
|
||||||
|
|
||||||
extern char *spa_config_path;
|
extern char *spa_config_path;
|
||||||
|
|
|
@ -402,6 +402,12 @@ Practical upper limit of total metaslabs per top-level vdev.
|
||||||
.It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
.It Sy metaslab_preload_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
||||||
Enable metaslab group preloading.
|
Enable metaslab group preloading.
|
||||||
.
|
.
|
||||||
|
.It Sy metaslab_preload_limit Ns = Ns Sy 10 Pq uint
|
||||||
|
Maximum number of metaslabs per group to preload
|
||||||
|
.
|
||||||
|
.It Sy metaslab_preload_pct Ns = Ns Sy 50 Pq uint
|
||||||
|
Percentage of CPUs to run a metaslab preload taskq
|
||||||
|
.
|
||||||
.It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
.It Sy metaslab_lba_weighting_enabled Ns = Ns Sy 1 Ns | Ns 0 Pq int
|
||||||
Give more weight to metaslabs with lower LBAs,
|
Give more weight to metaslabs with lower LBAs,
|
||||||
assuming they have greater bandwidth,
|
assuming they have greater bandwidth,
|
||||||
|
|
|
@ -614,28 +614,6 @@ SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct,
|
||||||
" space map to continue allocations in a first-fit fashion");
|
" space map to continue allocations in a first-fit fashion");
|
||||||
/* END CSTYLED */
|
/* END CSTYLED */
|
||||||
|
|
||||||
/*
|
|
||||||
* Percentage of all cpus that can be used by the metaslab taskq.
|
|
||||||
*/
|
|
||||||
extern int metaslab_load_pct;
|
|
||||||
|
|
||||||
/* BEGIN CSTYLED */
|
|
||||||
SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct,
|
|
||||||
CTLFLAG_RWTUN, &metaslab_load_pct, 0,
|
|
||||||
"Percentage of cpus that can be used by the metaslab taskq");
|
|
||||||
/* END CSTYLED */
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Max number of metaslabs per group to preload.
|
|
||||||
*/
|
|
||||||
extern uint_t metaslab_preload_limit;
|
|
||||||
|
|
||||||
/* BEGIN CSTYLED */
|
|
||||||
SYSCTL_UINT(_vfs_zfs_metaslab, OID_AUTO, preload_limit,
|
|
||||||
CTLFLAG_RWTUN, &metaslab_preload_limit, 0,
|
|
||||||
"Max number of metaslabs per group to preload");
|
|
||||||
/* END CSTYLED */
|
|
||||||
|
|
||||||
/* mmp.c */
|
/* mmp.c */
|
||||||
|
|
||||||
int
|
int
|
||||||
|
|
|
@ -205,11 +205,6 @@ static const uint32_t metaslab_min_search_count = 100;
|
||||||
*/
|
*/
|
||||||
static int metaslab_df_use_largest_segment = B_FALSE;
|
static int metaslab_df_use_largest_segment = B_FALSE;
|
||||||
|
|
||||||
/*
|
|
||||||
* Percentage of all cpus that can be used by the metaslab taskq.
|
|
||||||
*/
|
|
||||||
int metaslab_load_pct = 50;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* These tunables control how long a metaslab will remain loaded after the
|
* These tunables control how long a metaslab will remain loaded after the
|
||||||
* last allocation from it. A metaslab can't be unloaded until at least
|
* last allocation from it. A metaslab can't be unloaded until at least
|
||||||
|
@ -854,9 +849,6 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators)
|
||||||
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
|
zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth);
|
||||||
}
|
}
|
||||||
|
|
||||||
mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
|
|
||||||
maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC);
|
|
||||||
|
|
||||||
return (mg);
|
return (mg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -872,7 +864,6 @@ metaslab_group_destroy(metaslab_group_t *mg)
|
||||||
*/
|
*/
|
||||||
ASSERT(mg->mg_activation_count <= 0);
|
ASSERT(mg->mg_activation_count <= 0);
|
||||||
|
|
||||||
taskq_destroy(mg->mg_taskq);
|
|
||||||
avl_destroy(&mg->mg_metaslab_tree);
|
avl_destroy(&mg->mg_metaslab_tree);
|
||||||
mutex_destroy(&mg->mg_lock);
|
mutex_destroy(&mg->mg_lock);
|
||||||
mutex_destroy(&mg->mg_ms_disabled_lock);
|
mutex_destroy(&mg->mg_ms_disabled_lock);
|
||||||
|
@ -963,7 +954,7 @@ metaslab_group_passivate(metaslab_group_t *mg)
|
||||||
* allocations from taking place and any changes to the vdev tree.
|
* allocations from taking place and any changes to the vdev tree.
|
||||||
*/
|
*/
|
||||||
spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
|
spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa);
|
||||||
taskq_wait_outstanding(mg->mg_taskq, 0);
|
taskq_wait_outstanding(spa->spa_metaslab_taskq, 0);
|
||||||
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
|
spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER);
|
||||||
metaslab_group_alloc_update(mg);
|
metaslab_group_alloc_update(mg);
|
||||||
for (int i = 0; i < mg->mg_allocators; i++) {
|
for (int i = 0; i < mg->mg_allocators; i++) {
|
||||||
|
@ -3571,10 +3562,8 @@ metaslab_group_preload(metaslab_group_t *mg)
|
||||||
avl_tree_t *t = &mg->mg_metaslab_tree;
|
avl_tree_t *t = &mg->mg_metaslab_tree;
|
||||||
int m = 0;
|
int m = 0;
|
||||||
|
|
||||||
if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
|
if (spa_shutting_down(spa) || !metaslab_preload_enabled)
|
||||||
taskq_wait_outstanding(mg->mg_taskq, 0);
|
|
||||||
return;
|
return;
|
||||||
}
|
|
||||||
|
|
||||||
mutex_enter(&mg->mg_lock);
|
mutex_enter(&mg->mg_lock);
|
||||||
|
|
||||||
|
@ -3594,8 +3583,9 @@ metaslab_group_preload(metaslab_group_t *mg)
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
|
VERIFY(taskq_dispatch(spa->spa_metaslab_taskq, metaslab_preload,
|
||||||
msp, TQ_SLEEP) != TASKQID_INVALID);
|
msp, TQ_SLEEP | (m <= mg->mg_allocators ? TQ_FRONT : 0))
|
||||||
|
!= TASKQID_INVALID);
|
||||||
}
|
}
|
||||||
mutex_exit(&mg->mg_lock);
|
mutex_exit(&mg->mg_lock);
|
||||||
}
|
}
|
||||||
|
@ -6224,6 +6214,9 @@ ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW,
|
||||||
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW,
|
||||||
"Preload potential metaslabs during reassessment");
|
"Preload potential metaslabs during reassessment");
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_limit, UINT, ZMOD_RW,
|
||||||
|
"Max number of metaslabs per group to preload");
|
||||||
|
|
||||||
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, UINT, ZMOD_RW,
|
||||||
"Delay in txgs after metaslab was last used before unloading");
|
"Delay in txgs after metaslab was last used before unloading");
|
||||||
|
|
||||||
|
|
|
@ -169,6 +169,11 @@ static int spa_load_impl(spa_t *spa, spa_import_type_t type,
|
||||||
const char **ereport);
|
const char **ereport);
|
||||||
static void spa_vdev_resilver_done(spa_t *spa);
|
static void spa_vdev_resilver_done(spa_t *spa);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Percentage of all CPUs that can be used by the metaslab preload taskq.
|
||||||
|
*/
|
||||||
|
static uint_t metaslab_preload_pct = 50;
|
||||||
|
|
||||||
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
|
static uint_t zio_taskq_batch_pct = 80; /* 1 thread per cpu in pset */
|
||||||
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
|
static uint_t zio_taskq_batch_tpq; /* threads per taskq */
|
||||||
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
|
static const boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
|
||||||
|
@ -1399,6 +1404,13 @@ spa_activate(spa_t *spa, spa_mode_t mode)
|
||||||
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
|
spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
|
||||||
1, INT_MAX, 0);
|
1, INT_MAX, 0);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* The taskq to preload metaslabs.
|
||||||
|
*/
|
||||||
|
spa->spa_metaslab_taskq = taskq_create("z_metaslab",
|
||||||
|
metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
|
||||||
|
TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Taskq dedicated to prefetcher threads: this is used to prevent the
|
* Taskq dedicated to prefetcher threads: this is used to prevent the
|
||||||
* pool traverse code from monopolizing the global (and limited)
|
* pool traverse code from monopolizing the global (and limited)
|
||||||
|
@ -1434,6 +1446,11 @@ spa_deactivate(spa_t *spa)
|
||||||
spa->spa_zvol_taskq = NULL;
|
spa->spa_zvol_taskq = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (spa->spa_metaslab_taskq) {
|
||||||
|
taskq_destroy(spa->spa_metaslab_taskq);
|
||||||
|
spa->spa_metaslab_taskq = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
if (spa->spa_prefetch_taskq) {
|
if (spa->spa_prefetch_taskq) {
|
||||||
taskq_destroy(spa->spa_prefetch_taskq);
|
taskq_destroy(spa->spa_prefetch_taskq);
|
||||||
spa->spa_prefetch_taskq = NULL;
|
spa->spa_prefetch_taskq = NULL;
|
||||||
|
@ -1706,13 +1723,7 @@ spa_unload(spa_t *spa)
|
||||||
* This ensures that there is no async metaslab prefetching
|
* This ensures that there is no async metaslab prefetching
|
||||||
* while we attempt to unload the spa.
|
* while we attempt to unload the spa.
|
||||||
*/
|
*/
|
||||||
if (spa->spa_root_vdev != NULL) {
|
taskq_wait(spa->spa_metaslab_taskq);
|
||||||
for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
|
|
||||||
vdev_t *vc = spa->spa_root_vdev->vdev_child[c];
|
|
||||||
if (vc->vdev_mg != NULL)
|
|
||||||
taskq_wait(vc->vdev_mg->mg_taskq);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (spa->spa_mmp.mmp_thread)
|
if (spa->spa_mmp.mmp_thread)
|
||||||
mmp_thread_stop(spa);
|
mmp_thread_stop(spa);
|
||||||
|
@ -10134,6 +10145,9 @@ EXPORT_SYMBOL(spa_prop_clear_bootfs);
|
||||||
/* asynchronous event notification */
|
/* asynchronous event notification */
|
||||||
EXPORT_SYMBOL(spa_event_notify);
|
EXPORT_SYMBOL(spa_event_notify);
|
||||||
|
|
||||||
|
ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
|
||||||
|
"Percentage of CPUs to run a metaslab preload taskq");
|
||||||
|
|
||||||
/* BEGIN CSTYLED */
|
/* BEGIN CSTYLED */
|
||||||
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
|
ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
|
||||||
"log2 fraction of arc that can be used by inflight I/Os when "
|
"log2 fraction of arc that can be used by inflight I/Os when "
|
||||||
|
|
Loading…
Reference in New Issue