Several sorted scrub optimizations

- Reduce the size and comparison complexity of the q_exts_by_size B-tree.
The previous code used two 64-bit divisions and many other operations to
compare two B-tree elements, which created enormous overhead.  This
implementation moves the math up a level and stores the pre-computed score
in the B-tree elements themselves.  Since all that needs to be stored there
is the extent score and offset, they fit into a single 8-byte value instead
of the 24 bytes of a q_exts_by_addr element and can be compared with a
single operation (a sketch follows this list).
- Better decouple the secondary-tree logic from the main range_tree by
moving rt_btree_ops and related functions into dsl_scan.c as ext_size_ops.
The functions are too small to worry about code duplication, and range_tree
no longer needs to know details such as rt_btree_compare.
- Instead of accounting pending bytes per pool, which requires an atomic
operation on a global variable for every block, account the number of
non-empty per-vdev queues, which changes much more rarely (a second sketch
below illustrates the transition-only accounting).
- When an extent scan is interrupted by the end of a TXG, continue it in
the next TXG instead of selecting the next-best extent.  This avoids
leaving behind one truncated (and therefore likely no longer the best)
extent every TXG.
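
In essence, each q_exts_by_size element becomes one packed integer: the
extent score, compressed to 6 bits with a binary logarithm, occupies the
high bits left unused by the ashift-scaled offset.  A minimal standalone
sketch of that packing, where log2_64(), ext_size_key() and the explicit
fill_weight parameter are illustrative stand-ins for the kernel's
highbit64(), ext_size_value() and module tunables:

#include <stdint.h>

/* Stand-in for highbit64(): position of the highest set bit, 1-based. */
static int
log2_64(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}

/*
 * Pack one extent into a single 64-bit B-tree key.  "shifted_start" is the
 * extent start already divided by the vdev's 1 << ashift, so its top bits
 * are free to hold the 6-bit score.
 */
static uint64_t
ext_size_key(uint64_t shifted_start, uint64_t fill, uint64_t size,
    uint64_t fill_weight)
{
	/* Same fill-vs-size weighting as the scrub code (<<7 and >>7 ~ *128, /128). */
	uint64_t score = fill + ((((fill << 7) / size) *
	    fill_weight * fill) >> 7);

	/* A higher score gives a smaller key, so the best extent sorts first. */
	return (((uint64_t)(64 - log2_64(score)) << 56) | shifted_start);
}

/* The whole comparator is now a single integer comparison. */
static int
ext_size_key_compare(const void *x, const void *y)
{
	const uint64_t *a = x, *b = y;

	return (*a < *b ? -1 : (*a > *b ? 1 : 0));
}

Because the best extent has the smallest key, taking the first element of
the size tree selects it, and shifting the key left by ashift drops the
score bits again and recovers the extent's starting address.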

On top of some other optimizations this saves about 1.5 minutes out of 10
when scrubbing a pool of 12 SSDs storing 1.5TB of 4KB zvol blocks.
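
The per-vdev queue accounting from the third item works because the
pool-wide counter only has to move when a queue transitions between empty
and non-empty.  A condensed, hypothetical illustration of that pattern (the
scan_queue_t type and the GCC atomic built-ins stand in for the real
avl_is_empty()/atomic_add_64() calls made under the per-vdev queue lock):

#include <stdint.h>

typedef struct scan_queue {
	uint64_t sq_nsios;		/* per-queue count, under the queue lock */
	uint64_t *sq_queues_pending;	/* pool-wide counter shared by all vdevs */
} scan_queue_t;

static void
scan_queue_insert(scan_queue_t *q)
{
	/* Only the empty -> non-empty transition touches the global. */
	if (q->sq_nsios++ == 0)
		__atomic_add_fetch(q->sq_queues_pending, 1, __ATOMIC_SEQ_CST);
}

static void
scan_queue_remove(scan_queue_t *q)
{
	/* Caller guarantees the queue is non-empty. */
	if (--q->sq_nsios == 0)
		__atomic_sub_fetch(q->sq_queues_pending, 1, __ATOMIC_SEQ_CST);
}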

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tom Caputi <caputit1@tcnj.edu>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes 
Alexander Motin 2022-06-24 12:50:37 -04:00 committed by Brian Behlendorf
parent 881249de6f
commit a861aa2b9e
4 changed files with 160 additions and 207 deletions

include/sys/dsl_scan.h

@@ -155,7 +155,7 @@ typedef struct dsl_scan {
 	dsl_scan_phys_t scn_phys; /* on disk representation of scan */
 	dsl_scan_phys_t scn_phys_cached;
 	avl_tree_t scn_queue; /* queue of datasets to scan */
-	uint64_t scn_bytes_pending; /* outstanding data to issue */
+	uint64_t scn_queues_pending; /* outstanding data to issue */
 } dsl_scan_t;
 typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;

include/sys/range_tree.h

@@ -63,12 +63,8 @@ typedef struct range_tree {
 	 */
 	uint8_t rt_shift;
 	uint64_t rt_start;
-	range_tree_ops_t *rt_ops;
-	/* rt_btree_compare should only be set if rt_arg is a b-tree */
+	const range_tree_ops_t *rt_ops;
 	void *rt_arg;
-	int (*rt_btree_compare)(const void *, const void *);
 	uint64_t rt_gap; /* allowable inter-segment gap */
 	/*
@@ -278,11 +274,11 @@ rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill)
 typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
-range_tree_t *range_tree_create_impl(range_tree_ops_t *ops,
-    range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
-    int (*zfs_btree_compare) (const void *, const void *), uint64_t gap);
-range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
-    void *arg, uint64_t start, uint64_t shift);
+range_tree_t *range_tree_create_gap(const range_tree_ops_t *ops,
+    range_seg_type_t type, void *arg, uint64_t start, uint64_t shift,
+    uint64_t gap);
+range_tree_t *range_tree_create(const range_tree_ops_t *ops,
+    range_seg_type_t type, void *arg, uint64_t start, uint64_t shift);
 void range_tree_destroy(range_tree_t *rt);
 boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
 range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
@@ -316,13 +312,6 @@ void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end,
 void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom,
     range_tree_t *addto);
-void rt_btree_create(range_tree_t *rt, void *arg);
-void rt_btree_destroy(range_tree_t *rt, void *arg);
-void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
-void rt_btree_vacate(range_tree_t *rt, void *arg);
-extern range_tree_ops_t rt_btree_ops;
 #ifdef __cplusplus
 }
 #endif

module/zfs/dsl_scan.c

@@ -219,9 +219,9 @@ typedef struct {
 /*
  * This controls what conditions are placed on dsl_scan_sync_state():
- * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0
- * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0.
- * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ * SYNC_OPTIONAL) write out scn_phys iff scn_queues_pending == 0
+ * SYNC_MANDATORY) write out scn_phys always. scn_queues_pending must be 0.
+ * SYNC_CACHED) if scn_queues_pending == 0, write out scn_phys. Otherwise
  * write out the scn_phys_cached version.
  * See dsl_scan_sync_state for details.
  */
@@ -283,9 +283,10 @@ struct dsl_scan_io_queue {
 	/* trees used for sorting I/Os and extents of I/Os */
 	range_tree_t *q_exts_by_addr;
 	zfs_btree_t q_exts_by_size;
 	avl_tree_t q_sios_by_addr;
 	uint64_t q_sio_memused;
+	uint64_t q_last_ext_addr;
 	/* members for zio rate limiting */
 	uint64_t q_maxinflight_bytes;
@@ -639,7 +640,7 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 * Because we can be running in the block sorting algorithm, we do not always
 * want to write out the record, only when it is "safe" to do so. This safety
 * condition is achieved by making sure that the sorting queues are empty
- * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * (scn_queues_pending == 0). When this condition is not true, the sync'd state
 * is inconsistent with how much actual scanning progress has been made. The
 * kind of sync to be performed is specified by the sync_type argument. If the
 * sync is optional, we only sync if the queues are empty. If the sync is
@@ -662,8 +663,8 @@ dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
 	int i;
 	spa_t *spa = scn->scn_dp->dp_spa;
-	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
-	if (scn->scn_bytes_pending == 0) {
+	ASSERT(sync_type != SYNC_MANDATORY || scn->scn_queues_pending == 0);
+	if (scn->scn_queues_pending == 0) {
 		for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
 			vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
 			dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
@@ -1198,7 +1199,7 @@ scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
 	dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ?
 	    DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER;
-	ASSERT0(scn->scn_bytes_pending);
+	ASSERT0(scn->scn_queues_pending);
 	ASSERT(scn->scn_phys.scn_queue_obj != 0);
 	VERIFY0(dmu_object_free(dp->dp_meta_objset,
@@ -1270,11 +1271,12 @@ dsl_scan_should_clear(dsl_scan_t *scn)
 		queue = tvd->vdev_scan_io_queue;
 		if (queue != NULL) {
 			/*
-			 * # of extents in exts_by_size = # in exts_by_addr.
+			 * # of extents in exts_by_addr = # in exts_by_size.
 			 * B-tree efficiency is ~75%, but can be as low as 50%.
 			 */
 			mused += zfs_btree_numnodes(&queue->q_exts_by_size) *
-			    3 * sizeof (range_seg_gap_t) + queue->q_sio_memused;
+			    ((sizeof (range_seg_gap_t) + sizeof (uint64_t)) *
+			    3 / 2) + queue->q_sio_memused;
 		}
 		mutex_exit(&tvd->vdev_scan_io_queue_lock);
 	}
@@ -1282,7 +1284,7 @@ dsl_scan_should_clear(dsl_scan_t *scn)
 	dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
 	if (mused == 0)
-		ASSERT0(scn->scn_bytes_pending);
+		ASSERT0(scn->scn_queues_pending);
 	/*
 	 * If we are above our hard limit, we need to clear out memory.
@@ -2840,7 +2842,6 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
 {
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
-	int64_t bytes_issued = 0;
 	boolean_t suspended = B_FALSE;
 	while ((sio = list_head(io_list)) != NULL) {
@@ -2852,16 +2853,12 @@ scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
 		}
 		sio2bp(sio, &bp);
-		bytes_issued += SIO_GET_ASIZE(sio);
 		scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
 		    &sio->sio_zb, queue);
 		(void) list_remove_head(io_list);
 		scan_io_queues_update_zio_stats(queue, &bp);
 		sio_free(sio);
 	}
-	atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
 	return (suspended);
 }
@@ -2906,6 +2903,8 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
 		next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
 		avl_remove(&queue->q_sios_by_addr, sio);
+		if (avl_is_empty(&queue->q_sios_by_addr))
+			atomic_add_64(&queue->q_scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 		bytes_issued += SIO_GET_ASIZE(sio);
@@ -2927,12 +2926,13 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
 		range_tree_resize_segment(queue->q_exts_by_addr, rs,
 		    SIO_GET_OFFSET(sio), rs_get_end(rs,
 		    queue->q_exts_by_addr) - SIO_GET_OFFSET(sio));
+		queue->q_last_ext_addr = SIO_GET_OFFSET(sio);
 		return (B_TRUE);
 	} else {
 		uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr);
 		uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr);
 		range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart);
+		queue->q_last_ext_addr = -1;
 		return (B_FALSE);
 	}
 }
@@ -2957,31 +2957,8 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 	ASSERT(scn->scn_is_sorted);
-	/* handle tunable overrides */
-	if (scn->scn_checkpointing || scn->scn_clearing) {
-		if (zfs_scan_issue_strategy == 1) {
-			return (range_tree_first(rt));
-		} else if (zfs_scan_issue_strategy == 2) {
-			/*
-			 * We need to get the original entry in the by_addr
-			 * tree so we can modify it.
-			 */
-			range_seg_t *size_rs =
-			    zfs_btree_first(&queue->q_exts_by_size, NULL);
-			if (size_rs == NULL)
-				return (NULL);
-			uint64_t start = rs_get_start(size_rs, rt);
-			uint64_t size = rs_get_end(size_rs, rt) - start;
-			range_seg_t *addr_rs = range_tree_find(rt, start,
-			    size);
-			ASSERT3P(addr_rs, !=, NULL);
-			ASSERT3U(rs_get_start(size_rs, rt), ==,
-			    rs_get_start(addr_rs, rt));
-			ASSERT3U(rs_get_end(size_rs, rt), ==,
-			    rs_get_end(addr_rs, rt));
-			return (addr_rs);
-		}
-	}
+	if (!scn->scn_checkpointing && !scn->scn_clearing)
+		return (NULL);
 	/*
 	 * During normal clearing, we want to issue our largest segments
@@ -2992,28 +2969,42 @@ scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
 	 * so the way we are sorted now is as good as it will ever get.
 	 * In this case, we instead switch to issuing extents in LBA order.
 	 */
-	if (scn->scn_checkpointing) {
-		return (range_tree_first(rt));
-	} else if (scn->scn_clearing) {
-		/*
-		 * We need to get the original entry in the by_addr
-		 * tree so we can modify it.
-		 */
-		range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size,
-		    NULL);
-		if (size_rs == NULL)
-			return (NULL);
-		uint64_t start = rs_get_start(size_rs, rt);
-		uint64_t size = rs_get_end(size_rs, rt) - start;
-		range_seg_t *addr_rs = range_tree_find(rt, start, size);
-		ASSERT3P(addr_rs, !=, NULL);
-		ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs,
-		    rt));
-		ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt));
-		return (addr_rs);
-	} else {
-		return (NULL);
-	}
+	if ((zfs_scan_issue_strategy < 1 && scn->scn_checkpointing) ||
+	    zfs_scan_issue_strategy == 1)
+		return (range_tree_first(rt));
+	/*
+	 * Try to continue previous extent if it is not completed yet. After
+	 * shrink in scan_io_queue_gather() it may no longer be the best, but
+	 * otherwise we leave shorter remnant every txg.
+	 */
+	uint64_t start;
+	uint64_t size = 1 << rt->rt_shift;
+	range_seg_t *addr_rs;
+	if (queue->q_last_ext_addr != -1) {
+		start = queue->q_last_ext_addr;
+		addr_rs = range_tree_find(rt, start, size);
+		if (addr_rs != NULL)
+			return (addr_rs);
+	}
+	/*
+	 * Nothing to continue, so find new best extent.
+	 */
+	uint64_t *v = zfs_btree_first(&queue->q_exts_by_size, NULL);
+	if (v == NULL)
+		return (NULL);
+	queue->q_last_ext_addr = start = *v << rt->rt_shift;
+	/*
+	 * We need to get the original entry in the by_addr tree so we can
+	 * modify it.
+	 */
+	addr_rs = range_tree_find(rt, start, size);
+	ASSERT3P(addr_rs, !=, NULL);
+	ASSERT3U(rs_get_start(addr_rs, rt), ==, start);
+	ASSERT3U(rs_get_end(addr_rs, rt), >, start);
+	return (addr_rs);
 }
 
 static void
@@ -3049,12 +3040,12 @@ scan_io_queues_run_one(void *arg)
 	/* loop until we run out of time or sios */
 	while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
 		uint64_t seg_start = 0, seg_end = 0;
-		boolean_t more_left = B_TRUE;
+		boolean_t more_left;
 		ASSERT(list_is_empty(&sio_list));
 		/* loop while we still have sios left to process in this rs */
-		while (more_left) {
+		do {
 			scan_io_t *first_sio, *last_sio;
 			/*
@@ -3083,7 +3074,7 @@ scan_io_queues_run_one(void *arg)
 			if (suspended)
 				break;
-		}
+		} while (more_left);
 		/* update statistics for debugging purposes */
 		scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
@@ -3124,7 +3115,7 @@ scan_io_queues_run(dsl_scan_t *scn)
 	ASSERT(scn->scn_is_sorted);
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
-	if (scn->scn_bytes_pending == 0)
+	if (scn->scn_queues_pending == 0)
 		return;
 	if (scn->scn_taskq == NULL) {
@@ -3749,7 +3740,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 			zfs_dbgmsg("scan complete txg %llu",
 			    (longlong_t)tx->tx_txg);
 		}
-	} else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+	} else if (scn->scn_is_sorted && scn->scn_queues_pending != 0) {
 		ASSERT(scn->scn_clearing);
 		/* need to issue scrubbing IOs from per-vdev queues */
@@ -3777,7 +3768,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 		    (longlong_t)tx->tx_txg);
 		ASSERT3U(scn->scn_done_txg, !=, 0);
 		ASSERT0(spa->spa_scrub_inflight);
-		ASSERT0(scn->scn_bytes_pending);
+		ASSERT0(scn->scn_queues_pending);
 		dsl_scan_done(scn, B_TRUE, tx);
 		sync_type = SYNC_MANDATORY;
 	}
@@ -3868,20 +3859,21 @@ static void
 scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
 	avl_index_t idx;
-	int64_t asize = SIO_GET_ASIZE(sio);
 	dsl_scan_t *scn = queue->q_scn;
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+	if (unlikely(avl_is_empty(&queue->q_sios_by_addr)))
+		atomic_add_64(&scn->scn_queues_pending, 1);
 	if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
 		/* block is already scheduled for reading */
-		atomic_add_64(&scn->scn_bytes_pending, -asize);
 		sio_free(sio);
 		return;
 	}
 	avl_insert(&queue->q_sios_by_addr, sio, idx);
 	queue->q_sio_memused += SIO_GET_MUSED(sio);
-	range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
+	range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio),
+	    SIO_GET_ASIZE(sio));
 }
 /*
@@ -3894,7 +3886,6 @@ static void
 scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
     int zio_flags, const zbookmark_phys_t *zb)
 {
-	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
 	ASSERT0(BP_IS_GANG(bp));
@@ -3904,13 +3895,7 @@ scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
 	sio->sio_flags = zio_flags;
 	sio->sio_zb = *zb;
-	/*
-	 * Increment the bytes pending counter now so that we can't
-	 * get an integer underflow in case the worker processes the
-	 * zio before we get to incrementing this counter.
-	 */
-	atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
+	queue->q_last_ext_addr = -1;
 	scan_io_queue_insert_impl(queue, sio);
 }
@@ -3996,8 +3981,9 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 	 * Keep track of how much data we've examined so that
 	 * zpool(8) status can make useful progress reports.
 	 */
-	scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva);
-	spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva);
+	uint64_t asize = DVA_GET_ASIZE(dva);
+	scn->scn_phys.scn_examined += asize;
+	spa->spa_scan_pass_exam += asize;
 	/* if it's a resilver, this may not be in the target range */
 	if (!needs_io)
@@ -4118,33 +4104,88 @@ scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
 * extents that are more completely filled (in a 3:2 ratio) vs just larger.
 * Note that as an optimization, we replace multiplication and division by
 * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ *
+ * Since we do not care if one extent is only few percent better than another,
+ * compress the score into 6 bits via binary logarithm AKA highbit64() and
+ * put into otherwise unused due to ashift high bits of offset. This allows
+ * to reduce q_exts_by_size B-tree elements to only 64 bits and compare them
+ * with single operation. Plus it makes scrubs more sequential and reduces
+ * chances that minor extent change move it within the B-tree.
 */
 static int
 ext_size_compare(const void *x, const void *y)
 {
-	const range_seg_gap_t *rsa = x, *rsb = y;
-	uint64_t sa = rsa->rs_end - rsa->rs_start;
-	uint64_t sb = rsb->rs_end - rsb->rs_start;
-	uint64_t score_a, score_b;
-	score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
-	    fill_weight * rsa->rs_fill) >> 7);
-	score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
-	    fill_weight * rsb->rs_fill) >> 7);
-	if (score_a > score_b)
-		return (-1);
-	if (score_a == score_b) {
-		if (rsa->rs_start < rsb->rs_start)
-			return (-1);
-		if (rsa->rs_start == rsb->rs_start)
-			return (0);
-		return (1);
-	}
-	return (1);
+	const uint64_t *a = x, *b = y;
+	return (TREE_CMP(*a, *b));
 }
+
+static void
+ext_size_create(range_tree_t *rt, void *arg)
+{
+	(void) rt;
+	zfs_btree_t *size_tree = arg;
+	zfs_btree_create(size_tree, ext_size_compare, sizeof (uint64_t));
+}
+
+static void
+ext_size_destroy(range_tree_t *rt, void *arg)
+{
+	(void) rt;
+	zfs_btree_t *size_tree = arg;
+	ASSERT0(zfs_btree_numnodes(size_tree));
+	zfs_btree_destroy(size_tree);
+}
+
+static uint64_t
+ext_size_value(range_tree_t *rt, range_seg_gap_t *rsg)
+{
+	(void) rt;
+	uint64_t size = rsg->rs_end - rsg->rs_start;
+	uint64_t score = rsg->rs_fill + ((((rsg->rs_fill << 7) / size) *
+	    fill_weight * rsg->rs_fill) >> 7);
+	ASSERT3U(rt->rt_shift, >=, 8);
+	return (((uint64_t)(64 - highbit64(score)) << 56) | rsg->rs_start);
+}
+
+static void
+ext_size_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	zfs_btree_t *size_tree = arg;
+	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
+	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+	zfs_btree_add(size_tree, &v);
+}
+
+static void
+ext_size_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+	zfs_btree_t *size_tree = arg;
+	ASSERT3U(rt->rt_type, ==, RANGE_SEG_GAP);
+	uint64_t v = ext_size_value(rt, (range_seg_gap_t *)rs);
+	zfs_btree_remove(size_tree, &v);
+}
+
+static void
+ext_size_vacate(range_tree_t *rt, void *arg)
+{
+	zfs_btree_t *size_tree = arg;
+	zfs_btree_clear(size_tree);
+	zfs_btree_destroy(size_tree);
+	ext_size_create(rt, arg);
+}
+
+static const range_tree_ops_t ext_size_ops = {
+	.rtop_create = ext_size_create,
+	.rtop_destroy = ext_size_destroy,
+	.rtop_add = ext_size_add,
+	.rtop_remove = ext_size_remove,
+	.rtop_vacate = ext_size_vacate
+};
 /*
 * Comparator for the q_sios_by_addr tree. Sorting is simply performed
 * based on LBA-order (from lowest to highest).
@@ -4167,9 +4208,10 @@ scan_io_queue_create(vdev_t *vd)
 	q->q_scn = scn;
 	q->q_vd = vd;
 	q->q_sio_memused = 0;
+	q->q_last_ext_addr = -1;
 	cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
-	q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP,
-	    &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap);
+	q->q_exts_by_addr = range_tree_create_gap(&ext_size_ops, RANGE_SEG_GAP,
+	    &q->q_exts_by_size, 0, vd->vdev_ashift, zfs_scan_max_ext_gap);
 	avl_create(&q->q_sios_by_addr, sio_addr_compare,
 	    sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
@@ -4187,21 +4229,20 @@ dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 	dsl_scan_t *scn = queue->q_scn;
 	scan_io_t *sio;
 	void *cookie = NULL;
-	int64_t bytes_dequeued = 0;
 	ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+	if (!avl_is_empty(&queue->q_sios_by_addr))
+		atomic_add_64(&scn->scn_queues_pending, -1);
 	while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
 	    NULL) {
 		ASSERT(range_tree_contains(queue->q_exts_by_addr,
 		    SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
-		bytes_dequeued += SIO_GET_ASIZE(sio);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 		sio_free(sio);
 	}
 	ASSERT0(queue->q_sio_memused);
-	atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
 	range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
 	range_tree_destroy(queue->q_exts_by_addr);
 	avl_destroy(&queue->q_sios_by_addr);
@@ -4297,25 +4338,19 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 	sio_free(srch_sio);
 	if (sio != NULL) {
-		int64_t asize = SIO_GET_ASIZE(sio);
 		blkptr_t tmpbp;
 		/* Got it while it was cold in the queue */
 		ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
-		ASSERT3U(size, ==, asize);
+		ASSERT3U(size, ==, SIO_GET_ASIZE(sio));
 		avl_remove(&queue->q_sios_by_addr, sio);
+		if (avl_is_empty(&queue->q_sios_by_addr))
+			atomic_add_64(&scn->scn_queues_pending, -1);
 		queue->q_sio_memused -= SIO_GET_MUSED(sio);
 		ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
 		range_tree_remove_fill(queue->q_exts_by_addr, start, size);
-		/*
-		 * We only update scn_bytes_pending in the cold path,
-		 * otherwise it will already have been accounted for as
-		 * part of the zio's execution.
-		 */
-		atomic_add_64(&scn->scn_bytes_pending, -asize);
 		/* count the block as though we issued it */
 		sio2bp(sio, &tmpbp);
 		count_block(scn, dp->dp_blkstats, &tmpbp);

module/zfs/range_tree.c

@@ -188,10 +188,8 @@ range_tree_seg_gap_compare(const void *x1, const void *x2)
 }
 range_tree_t *
-range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
-    uint64_t start, uint64_t shift,
-    int (*zfs_btree_compare) (const void *, const void *),
-    uint64_t gap)
+range_tree_create_gap(const range_tree_ops_t *ops, range_seg_type_t type,
+    void *arg, uint64_t start, uint64_t shift, uint64_t gap)
 {
 	range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
@@ -223,7 +221,6 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
 	rt->rt_type = type;
 	rt->rt_start = start;
 	rt->rt_shift = shift;
-	rt->rt_btree_compare = zfs_btree_compare;
 	if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
 		rt->rt_ops->rtop_create(rt, rt->rt_arg);
@@ -232,10 +229,10 @@ range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg,
 }
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, range_seg_type_t type,
+range_tree_create(const range_tree_ops_t *ops, range_seg_type_t type,
     void *arg, uint64_t start, uint64_t shift)
 {
-	return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0));
+	return (range_tree_create_gap(ops, type, arg, start, shift, 0));
 }
 void
@@ -741,74 +738,6 @@ range_tree_is_empty(range_tree_t *rt)
 	return (range_tree_space(rt) == 0);
 }
-void
-rt_btree_create(range_tree_t *rt, void *arg)
-{
-	zfs_btree_t *size_tree = arg;
-	size_t size;
-
-	switch (rt->rt_type) {
-	case RANGE_SEG32:
-		size = sizeof (range_seg32_t);
-		break;
-	case RANGE_SEG64:
-		size = sizeof (range_seg64_t);
-		break;
-	case RANGE_SEG_GAP:
-		size = sizeof (range_seg_gap_t);
-		break;
-	default:
-		panic("Invalid range seg type %d", rt->rt_type);
-	}
-	zfs_btree_create(size_tree, rt->rt_btree_compare, size);
-}
-
-void
-rt_btree_destroy(range_tree_t *rt, void *arg)
-{
-	(void) rt;
-	zfs_btree_t *size_tree = arg;
-	ASSERT0(zfs_btree_numnodes(size_tree));
-	zfs_btree_destroy(size_tree);
-}
-
-void
-rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	(void) rt;
-	zfs_btree_t *size_tree = arg;
-	zfs_btree_add(size_tree, rs);
-}
-
-void
-rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-	(void) rt;
-	zfs_btree_t *size_tree = arg;
-	zfs_btree_remove(size_tree, rs);
-}
-
-void
-rt_btree_vacate(range_tree_t *rt, void *arg)
-{
-	zfs_btree_t *size_tree = arg;
-	zfs_btree_clear(size_tree);
-	zfs_btree_destroy(size_tree);
-	rt_btree_create(rt, arg);
-}
-
-range_tree_ops_t rt_btree_ops = {
-	.rtop_create = rt_btree_create,
-	.rtop_destroy = rt_btree_destroy,
-	.rtop_add = rt_btree_add,
-	.rtop_remove = rt_btree_remove,
-	.rtop_vacate = rt_btree_vacate
-};
-
 /*
 * Remove any overlapping ranges between the given segment [start, end)
 * from removefrom. Add non-overlapping leftovers to addto.