Improve scrub maxinflight_bytes math.

Previously, ZFS scaled maxinflight_bytes based on total number of
disks in the pool.  A 3-wide mirror was receiving a queue depth of 3
disks, which it should not, since it reads from all the disks inside.
For wide raidz the situation was slightly better, but still a 3-wide
raidz1 received a depth of 3 disks instead of 2.

The new code counts only unique data disks, i.e. 1 disk for mirrors
and non-parity disks for raidz/draid.  For draid the math is still
imperfect, since vdev_get_nparity() returns the number of parity disks
per group, not per vdev, but it is still somewhat better than it was.

This should slightly reduce the scrub's impact on the user payload for
some pool topologies by avoiding excessive queuing.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored-By:	iXsystems, Inc.
Closes #12046
This commit is contained in:
Alexander Motin 2021-05-27 12:11:39 -04:00 committed by GitHub
parent ba646e3e89
commit 2041d6eecd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 26 deletions

View File

@ -3326,7 +3326,7 @@ Default value: \fB0\fR.
Maximum amount of data that can be concurrently issued at once for scrubs and Maximum amount of data that can be concurrently issued at once for scrubs and
resilvers per leaf device, given in bytes. resilvers per leaf device, given in bytes.
.sp .sp
Default value: \fB41943040\fR. Default value: \fB4194304\fR.
.RE .RE
.sp .sp

View File

@ -126,7 +126,7 @@ static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
static uint64_t dsl_scan_count_leaves(vdev_t *vd); static uint64_t dsl_scan_count_data_disks(vdev_t *vd);
extern int zfs_vdev_async_write_active_min_dirty_percent; extern int zfs_vdev_async_write_active_min_dirty_percent;
@ -451,7 +451,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
* phase are done per top-level vdev and are handled separately. * phase are done per top-level vdev and are handled separately.
*/ */
scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
offsetof(scan_ds_t, sds_node)); offsetof(scan_ds_t, sds_node));
@ -2759,22 +2759,16 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
} }
static uint64_t static uint64_t
dsl_scan_count_leaves(vdev_t *vd) dsl_scan_count_data_disks(vdev_t *rvd)
{ {
uint64_t i, leaves = 0; uint64_t i, leaves = 0;
/* we only count leaves that belong to the main pool and are readable */ for (i = 0; i < rvd->vdev_children; i++) {
if (vd->vdev_islog || vd->vdev_isspare || vdev_t *vd = rvd->vdev_child[i];
vd->vdev_isl2cache || !vdev_readable(vd)) if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
return (0); continue;
leaves += vdev_get_ndisks(vd) - vdev_get_nparity(vd);
if (vd->vdev_ops->vdev_op_leaf)
return (1);
for (i = 0; i < vd->vdev_children; i++) {
leaves += dsl_scan_count_leaves(vd->vdev_child[i]);
} }
return (leaves); return (leaves);
} }
@ -3017,8 +3011,6 @@ scan_io_queues_run_one(void *arg)
range_seg_t *rs = NULL; range_seg_t *rs = NULL;
scan_io_t *sio = NULL; scan_io_t *sio = NULL;
list_t sio_list; list_t sio_list;
uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
ASSERT(queue->q_scn->scn_is_sorted); ASSERT(queue->q_scn->scn_is_sorted);
@ -3026,9 +3018,9 @@ scan_io_queues_run_one(void *arg)
offsetof(scan_io_t, sio_nodes.sio_list_node)); offsetof(scan_io_t, sio_nodes.sio_list_node));
mutex_enter(q_lock); mutex_enter(q_lock);
/* calculate maximum in-flight bytes for this txg (min 1MB) */ /* Calculate maximum in-flight bytes for this vdev. */
queue->q_maxinflight_bytes = queue->q_maxinflight_bytes = MAX(1, zfs_scan_vdev_limit *
MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); (vdev_get_ndisks(queue->q_vd) - vdev_get_nparity(queue->q_vd)));
/* reset per-queue scan statistics for this txg */ /* reset per-queue scan statistics for this txg */
queue->q_total_seg_size_this_txg = 0; queue->q_total_seg_size_this_txg = 0;
@ -3665,16 +3657,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
/* Need to scan metadata for more blocks to scrub */ /* Need to scan metadata for more blocks to scrub */
dsl_scan_phys_t *scnp = &scn->scn_phys; dsl_scan_phys_t *scnp = &scn->scn_phys;
taskqid_t prefetch_tqid; taskqid_t prefetch_tqid;
uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
/* /*
* Recalculate the max number of in-flight bytes for pool-wide * Recalculate the max number of in-flight bytes for pool-wide
* scanning operations (minimum 1MB). Limits for the issuing * scanning operations (minimum 1MB). Limits for the issuing
* phase are done per top-level vdev and are handled separately. * phase are done per top-level vdev and are handled separately.
*/ */
scn->scn_maxinflight_bytes = scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); dsl_scan_count_data_disks(spa->spa_root_vdev), 1ULL << 20);
if (scnp->scn_ddt_bookmark.ddb_class <= if (scnp->scn_ddt_bookmark.ddb_class <=
scnp->scn_ddt_class_max) { scnp->scn_ddt_class_max) {
@ -4050,9 +4040,8 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
size_t size = BP_GET_PSIZE(bp); size_t size = BP_GET_PSIZE(bp);
abd_t *data = abd_alloc_for_io(size, B_FALSE); abd_t *data = abd_alloc_for_io(size, B_FALSE);
ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
if (queue == NULL) { if (queue == NULL) {
ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
mutex_enter(&spa->spa_scrub_lock); mutex_enter(&spa->spa_scrub_lock);
while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
@ -4061,6 +4050,7 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
} else { } else {
kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
ASSERT3U(queue->q_maxinflight_bytes, >, 0);
mutex_enter(q_lock); mutex_enter(q_lock);
while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
cv_wait(&queue->q_zio_cv, q_lock); cv_wait(&queue->q_zio_cv, q_lock);