Another set of vdev queue optimizations.

Switch the FIFO queues (SYNC/TRIM) and the active queue of the vdev
queue from time-sorted AVL trees to simple lists.  AVL trees are too
expensive for such a simple task.  To allow changing I/O priority
without searching through the trees, add an io_queue_state field to
struct zio.
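
A condensed sketch of the idea (mirroring vdev_queue_change_io_priority()
in the diff below; vq_lock handling omitted): the state field says
directly which container, if any, currently holds the zio, so no lookup
is needed before moving it.

	if (zio->io_queue_state == ZIO_QS_QUEUED) {
		/* Still queued: move it between per-class list/tree. */
		vdev_queue_class_remove(vq, zio);
		zio->io_priority = priority;
		vdev_queue_class_add(vq, zio);
	} else if (zio->io_queue_state == ZIO_QS_NONE) {
		/* Not queued anywhere: just retag it. */
		zio->io_priority = priority;
	}
	/* ZIO_QS_ACTIVE: already issued, leave the priority alone. */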

To avoid checking the number of queued I/Os for each priority, add a
vq_cqueued bitmap to struct vdev_queue and update it when adding or
removing I/Os.  Make vq_cactive a separate array instead of a struct
vdev_queue_class member.  Together these changes avoid many cache
misses when looking for work in vdev_queue_class_to_issue().
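
In outline (condensed from vdev_queue_class_add()/_remove() and
vdev_queue_class_to_issue() in the diff below), bit p of vq_cqueued means
"class p has something queued", so the selection loop only touches the
bitmap and the small vq_cactive[] array:

	vq->vq_cqueued |= 1U << p;		/* on enqueue */
	vq->vq_cqueued &= ~(empty << p);	/* on dequeue; empty is 1 if the
						   class just became empty */

	if ((vq->vq_cqueued & (1U << p)) != 0 &&
	    vq->vq_cactive[p] < vdev_queue_class_min_active(vq, p))
		return (p);			/* class p is eligible */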

Introduce a deadline of ~0.5 s for LBA-sorted queues.  Before this
change I saw some I/Os waiting in a queue for up to 8 seconds, and
possibly longer, due to starvation.  With this change I no longer see
it.  The comparison function had to become slightly more complicated,
but since it touches the same cache lines the difference is minimal.
For sequential I/Os the new code in vdev_queue_io_to_issue() often
uses the simpler avl_first(), falling back to avl_find() and
avl_nearest() only when needed.
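
The ~0.5 s figure comes from bucketing the submit timestamp: hrtime_t is
in nanoseconds and VDQ_T_SHIFT is 29, so I/Os fall into 2^29 ns ~= 0.54 s
buckets.  Within a bucket the tree is still LBA-ordered, but an older
bucket always sorts first, which bounds how long a low-LBA I/O can be
starved.  Condensed from vdev_queue_to_compare() below:

	#define	VDQ_T_SHIFT	29

	int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
	    z2->io_timestamp >> VDQ_T_SHIFT);
	int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
	int cmp = tcmp ? tcmp : ocmp;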

Arrange members in struct zio so that only one cache line is accessed
when searching through the vdev queues.  While there, remove
io_alloc_node and reuse io_queue_node instead; the two are never used
at the same time.
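
For reference, the new layout (from the zio.h hunk below) packs the queue
linkage and the sort keys together; on LP64, assuming the usual 24-byte
avl_node_t and 16-byte list_node_t, the aligned union (24) plus
io_offset_node (24), io_offset (8) and io_timestamp (8) fill roughly one
64-byte cache line:

	union {
		list_node_t	l;	/* FIFO classes and vq_active_list */
		avl_node_t	a;	/* LBA classes and allocator tree */
	} io_queue_node ____cacheline_aligned;
	avl_node_t	io_offset_node;	/* per-type offset-sorted tree */
	uint64_t	io_offset;
	hrtime_t	io_timestamp;	/* submitted at */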

Remove the zfs_vdev_aggregate_trim parameter.  It has been disabled
for the 4 years since it was implemented, while we still wasted time
maintaining the offset-sorted tree of TRIM requests.  Just remove the
tree.

Remove locking from txg_all_lists_empty().  It is racy by design, and
the two pairs of lock/unlock operations take noticeable time under the
vdev queue lock.
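
The replacement (see the txg.c hunk below) is just an unlocked scan of
the per-txg list heads; since the check is racy by design, a momentarily
stale read is acceptable:

	boolean_t res = B_TRUE;
	for (int i = 0; i < TXG_SIZE; i++)
		res &= (tl->tl_head[i] == NULL);
	return (res);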

With these changes, in my tests with volblocksize=4KB, I measure a 50%
reduction in vdev queue lock spin time on reads and a 75% reduction on
writes.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by:	Alexander Motin <mav@FreeBSD.org>
Sponsored by:	iXsystems, Inc.
Closes #14925

include/sys/vdev.h

@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);

include/sys/vdev_impl.h

@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
 /*
  * Virtual device properties
  */
-typedef struct vdev_queue_class {
-	uint32_t	vqc_active;
-
-	/*
-	 * Sorted by offset or timestamp, depending on if the queue is
-	 * LBA-ordered vs FIFO.
-	 */
-	avl_tree_t	vqc_queued_tree;
+typedef union vdev_queue_class {
+	list_t		vqc_list;
+	avl_tree_t	vqc_tree;
 } vdev_queue_class_t;
 
 struct vdev_queue {
 	vdev_t		*vq_vdev;
 	vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
-	avl_tree_t	vq_active_tree;
 	avl_tree_t	vq_read_offset_tree;
 	avl_tree_t	vq_write_offset_tree;
-	avl_tree_t	vq_trim_offset_tree;
 	uint64_t	vq_last_offset;
 	zio_priority_t	vq_last_prio;	/* Last sent I/O priority. */
+	uint32_t	vq_cqueued;	/* Classes with queued I/Os. */
+	uint32_t	vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+	uint32_t	vq_active;	/* Number of active I/Os. */
 	uint32_t	vq_ia_active;	/* Active interactive I/Os. */
 	uint32_t	vq_nia_credit;	/* Non-interactive I/Os credit. */
+	list_t		vq_active_list;	/* List of active I/Os. */
 	hrtime_t	vq_io_complete_ts; /* time last i/o completed */
 	hrtime_t	vq_io_delta_ts;
 	zio_t		vq_io_search; /* used as local for stack reduction */

include/sys/zio.h

@@ -436,6 +436,12 @@ typedef struct zio_link {
 	list_node_t	zl_child_node;
 } zio_link_t;
 
+enum zio_qstate {
+	ZIO_QS_NONE = 0,
+	ZIO_QS_QUEUED,
+	ZIO_QS_ACTIVE,
+};
+
 struct zio {
 	/* Core information about this I/O */
 	zbookmark_phys_t	io_bookmark;
@@ -479,6 +485,12 @@ struct zio {
 	const zio_vsd_ops_t *io_vsd_ops;
 	metaslab_class_t *io_metaslab_class;	/* dva throttle class */
 
+	enum zio_qstate	io_queue_state;	/* vdev queue state */
+	union {
+		list_node_t	l;
+		avl_node_t	a;
+	} io_queue_node ____cacheline_aligned;	/* allocator and vdev queues */
+	avl_node_t	io_offset_node;	/* vdev offset queues */
 	uint64_t	io_offset;
 	hrtime_t	io_timestamp;	/* submitted at */
 	hrtime_t	io_queued_timestamp;
@@ -486,9 +498,6 @@ struct zio {
 	hrtime_t	io_delta;	/* vdev queue service delta */
 	hrtime_t	io_delay;	/* Device access time (disk or */
 					/* file). */
-	avl_node_t	io_queue_node;
-	avl_node_t	io_offset_node;
-	avl_node_t	io_alloc_node;
 	zio_alloc_list_t	io_alloc_list;
 
 	/* Internal pipeline state */

man/man4/zfs.4

@@ -2016,12 +2016,6 @@ Historical statistics for this many latest TXGs will be available in
 Flush dirty data to disk at least every this many seconds (maximum TXG
 duration).
 .
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-Allow TRIM I/O operations to be aggregated.
-This is normally not helpful because the extents to be trimmed
-will have been already been aggregated by the metaslab.
-This option is provided for debugging and performance analysis.
-.
 .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 Max vdev I/O aggregation size.
 .

module/zfs/spa_misc.c

@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 		mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 		    NULL);
 		avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
-		    sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+		    sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 	}
 	avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 	    sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));

module/zfs/txg.c

@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
 boolean_t
 txg_all_lists_empty(txg_list_t *tl)
 {
-	mutex_enter(&tl->tl_lock);
-	for (int i = 0; i < TXG_SIZE; i++) {
-		if (!txg_list_empty_impl(tl, i)) {
-			mutex_exit(&tl->tl_lock);
-			return (B_FALSE);
-		}
-	}
-	mutex_exit(&tl->tl_lock);
-	return (B_TRUE);
+	boolean_t res = B_TRUE;
+	for (int i = 0; i < TXG_SIZE; i++)
+		res &= (tl->tl_head[i] == NULL);
+	return (res);
 }
 
 /*

module/zfs/vdev.c

@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 		memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
-		for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
-			vsx->vsx_active_queue[t] =
-			    vd->vdev_queue.vq_class[t].vqc_active;
-			vsx->vsx_pend_queue[t] = avl_numnodes(
-			    &vd->vdev_queue.vq_class[t].vqc_queued_tree);
+		for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+			vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+			vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 		}
 	}
 }
@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
 		vdev_queue_t *vq = &vd->vdev_queue;
 
 		mutex_enter(&vq->vq_lock);
-		if (avl_numnodes(&vq->vq_active_tree) > 0) {
+		if (vq->vq_active > 0) {
 			spa_t *spa = vd->vdev_spa;
 			zio_t *fio;
 			uint64_t delta;
 
-			zfs_dbgmsg("slow vdev: %s has %lu active IOs",
-			    vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+			zfs_dbgmsg("slow vdev: %s has %u active IOs",
+			    vd->vdev_path, vq->vq_active);
 
 			/*
 			 * Look at the head of all the pending queues,
 			 * if any I/O has been outstanding for longer than
 			 * the spa_deadman_synctime invoke the deadman logic.
 			 */
-			fio = avl_first(&vq->vq_active_tree);
+			fio = list_head(&vq->vq_active_list);
 			delta = gethrtime() - fio->io_timestamp;
 			if (delta > spa_deadman_synctime(spa))
 				zio_deadman(fio, tag);

module/zfs/vdev_queue.c

@@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300;
  */
 uint_t zfs_vdev_def_queue_depth = 32;
 
-/*
- * Allow TRIM I/Os to be aggregated.  This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-static uint_t zfs_vdev_aggregate_trim = 0;
-
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
 	return (TREE_PCMP(z1, z2));
 }
 
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
-	return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
-	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
-	if (t == ZIO_TYPE_READ)
-		return (&vq->vq_read_offset_tree);
-	else if (t == ZIO_TYPE_WRITE)
-		return (&vq->vq_write_offset_tree);
-	else
-		return (&vq->vq_trim_offset_tree);
-}
+#define	VDQ_T_SHIFT	29
 
 static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
 {
 	const zio_t *z1 = (const zio_t *)x1;
 	const zio_t *z2 = (const zio_t *)x2;
 
-	int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+	int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+	    z2->io_timestamp >> VDQ_T_SHIFT);
+	int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+	int cmp = tcmp ? tcmp : ocmp;
 
-	if (likely(cmp))
+	if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
 		return (cmp);
 
 	return (TREE_PCMP(z1, z2));
 }
 
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+	return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+	    p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+	zio_priority_t p = zio->io_priority;
+	vq->vq_cqueued |= 1U << p;
+	if (vdev_queue_class_fifo(p))
+		list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+	else
+		avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+	zio_priority_t p = zio->io_priority;
+	uint32_t empty;
+
+	if (vdev_queue_class_fifo(p)) {
+		list_t *list = &vq->vq_class[p].vqc_list;
+		list_remove(list, zio);
+		empty = list_is_empty(list);
+	} else {
+		avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+		avl_remove(tree, zio);
+		empty = avl_is_empty(tree);
+	}
+	vq->vq_cqueued &= ~(empty << p);
+}
+
 static uint_t
 vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 {
@@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static uint_t
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
 {
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 	case ZIO_PRIORITY_ASYNC_READ:
 		return (zfs_vdev_async_read_max_active);
 	case ZIO_PRIORITY_ASYNC_WRITE:
-		return (vdev_queue_max_async_writes(spa));
+		return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
 	case ZIO_PRIORITY_SCRUB:
 		if (vq->vq_ia_active > 0) {
 			return (MIN(vq->vq_nia_credit,
@@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
-	spa_t *spa = vq->vq_vdev->vdev_spa;
-	zio_priority_t p, n;
+	uint32_t cq = vq->vq_cqueued;
+	zio_priority_t p, p1;
 
-	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+	if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/*
@@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	 * Do round-robin to reduce starvation due to zfs_vdev_max_active
 	 * and vq_nia_credit limits.
 	 */
-	for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-		p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
-		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_min_active(vq, p)) {
-			vq->vq_last_prio = p;
-			return (p);
-		}
+	p1 = vq->vq_last_prio + 1;
+	if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+		p1 = 0;
+	for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_min_active(vq, p))
+			goto found;
+	}
+	for (p = 0; p < p1; p++) {
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_min_active(vq, p))
+			goto found;
 	}
 
 	/*
@@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 	 * maximum # outstanding i/os.
 	 */
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-		    vq->vq_class[p].vqc_active <
-		    vdev_queue_class_max_active(spa, vq, p)) {
-			vq->vq_last_prio = p;
-			return (p);
-		}
+		if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+		    vdev_queue_class_max_active(vq, p))
+			break;
 	}
 
-	/* No eligible queued i/os */
-	return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+	vq->vq_last_prio = p;
+	return (p);
 }
 
 void
@@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd)
 	vdev_queue_t *vq = &vd->vdev_queue;
 	zio_priority_t p;
 
-	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 	vq->vq_vdev = vd;
-	taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
-
-	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
-	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
-	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
-	    vdev_queue_offset_compare, sizeof (zio_t),
-	    offsetof(struct zio, io_offset_node));
 
 	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-		int (*compfn) (const void *, const void *);
-
-		/*
-		 * The synchronous/trim i/o queues are dispatched in FIFO rather
-		 * than LBA order.  This provides more consistent latency for
-		 * these i/os.
-		 */
-		if (p == ZIO_PRIORITY_SYNC_READ ||
-		    p == ZIO_PRIORITY_SYNC_WRITE ||
-		    p == ZIO_PRIORITY_TRIM) {
-			compfn = vdev_queue_timestamp_compare;
+		if (vdev_queue_class_fifo(p)) {
+			list_create(&vq->vq_class[p].vqc_list,
+			    sizeof (zio_t),
+			    offsetof(struct zio, io_queue_node.l));
 		} else {
-			compfn = vdev_queue_offset_compare;
+			avl_create(&vq->vq_class[p].vqc_tree,
+			    vdev_queue_to_compare, sizeof (zio_t),
+			    offsetof(struct zio, io_queue_node.a));
 		}
-		avl_create(vdev_queue_class_tree(vq, p), compfn,
-		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
+	avl_create(&vq->vq_read_offset_tree,
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
+	avl_create(&vq->vq_write_offset_tree,
+	    vdev_queue_offset_compare, sizeof (zio_t),
+	    offsetof(struct zio, io_offset_node));
 
 	vq->vq_last_offset = 0;
+	list_create(&vq->vq_active_list, sizeof (struct zio),
+	    offsetof(struct zio, io_queue_node.l));
+	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 }
 
 void
@@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
-	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-		avl_destroy(vdev_queue_class_tree(vq, p));
-	avl_destroy(&vq->vq_active_tree);
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
-	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+		if (vdev_queue_class_fifo(p))
+			list_destroy(&vq->vq_class[p].vqc_list);
+		else
+			avl_destroy(&vq->vq_class[p].vqc_tree);
+	}
+	avl_destroy(&vq->vq_read_offset_tree);
+	avl_destroy(&vq->vq_write_offset_tree);
+	list_destroy(&vq->vq_active_list);
 
 	mutex_destroy(&vq->vq_lock);
 }
 
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
-	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+	zio->io_queue_state = ZIO_QS_QUEUED;
+	vdev_queue_class_add(vq, zio);
+	if (zio->io_type == ZIO_TYPE_READ)
+		avl_add(&vq->vq_read_offset_tree, zio);
+	else if (zio->io_type == ZIO_TYPE_WRITE)
+		avl_add(&vq->vq_write_offset_tree, zio);
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+	vdev_queue_class_remove(vq, zio);
+	if (zio->io_type == ZIO_TYPE_READ)
+		avl_remove(&vq->vq_read_offset_tree, zio);
+	else if (zio->io_type == ZIO_TYPE_WRITE)
+		avl_remove(&vq->vq_write_offset_tree, zio);
+	zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static boolean_t
@@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_class[zio->io_priority].vqc_active++;
+	vq->vq_cactive[zio->io_priority]++;
+	vq->vq_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		vq->vq_nia_credit--;
 	}
-	avl_add(&vq->vq_active_tree, zio);
+	zio->io_queue_state = ZIO_QS_ACTIVE;
+	list_insert_tail(&vq->vq_active_list, zio);
 }
 
 static void
@@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-	vq->vq_class[zio->io_priority].vqc_active--;
+	vq->vq_cactive[zio->io_priority]--;
+	vq->vq_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
@@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		vq->vq_nia_credit++;
-	avl_remove(&vq->vq_active_tree, zio);
+	list_remove(&vq->vq_active_list, zio);
+	zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static void
@@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	uint64_t maxgap = 0;
 	uint64_t size;
 	uint64_t limit;
-	int maxblocksize;
 	boolean_t stretch = B_FALSE;
-	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
-	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	uint64_t next_offset;
 	abd_t *abd;
+	avl_tree_t *t;
+
+	/*
+	 * TRIM aggregation should not be needed since code in zfs_trim.c can
+	 * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+	 */
+	if (zio->io_type == ZIO_TYPE_TRIM)
+		return (NULL);
 
-	maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
+	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+		return (NULL);
+
 	if (vq->vq_vdev->vdev_nonrot)
 		limit = zfs_vdev_aggregation_limit_non_rotating;
 	else
 		limit = zfs_vdev_aggregation_limit;
-	limit = MIN(limit, maxblocksize);
-	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
+	if (limit == 0)
 		return (NULL);
-
-	/*
-	 * While TRIM commands could be aggregated based on offset this
-	 * behavior is disabled until it's determined to be beneficial.
-	 */
-	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
-		return (NULL);
+	limit = MIN(limit, SPA_MAXBLOCKSIZE);
 
 	/*
 	 * I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 
 	first = last = zio;
 
-	if (zio->io_type == ZIO_TYPE_READ)
+	if (zio->io_type == ZIO_TYPE_READ) {
 		maxgap = zfs_vdev_read_gap_limit;
+		t = &vq->vq_read_offset_tree;
+	} else {
+		ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+		t = &vq->vq_write_offset_tree;
+	}
 
 	/*
 	 * We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-optional I/O.
 	 */
+	zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= limit &&
@@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    (IO_SPAN(first, dio) <= limit ||
 	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
-	    IO_SPAN(first, dio) <= maxblocksize &&
+	    IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
 	    IO_GAP(last, dio) <= maxgap &&
 	    dio->io_type == zio->io_type) {
 		last = dio;
@@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 		return (NULL);
 
 	size = IO_SPAN(first, last);
-	ASSERT3U(size, <=, maxblocksize);
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 
 	abd = abd_alloc_gang();
 	if (abd == NULL)
@@ -824,19 +847,30 @@ again:
 		return (NULL);
 	}
 
-	/*
-	 * For LBA-ordered queues (async / scrub / initializing), issue the
-	 * i/o which follows the most recently issued i/o in LBA (offset) order.
-	 *
-	 * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
-	 */
-	tree = vdev_queue_class_tree(vq, p);
-	vq->vq_io_search.io_timestamp = 0;
-	vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
-	VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
-	zio = avl_nearest(tree, idx, AVL_AFTER);
-	if (zio == NULL)
-		zio = avl_first(tree);
+	if (vdev_queue_class_fifo(p)) {
+		zio = list_head(&vq->vq_class[p].vqc_list);
+	} else {
+		/*
+		 * For LBA-ordered queues (async / scrub / initializing),
+		 * issue the I/O which follows the most recently issued I/O
+		 * in LBA (offset) order, but to avoid starvation only within
+		 * the same 0.5 second interval as the first I/O.
+		 */
+		tree = &vq->vq_class[p].vqc_tree;
+		zio = aio = avl_first(tree);
+		if (zio->io_offset < vq->vq_last_offset) {
+			vq->vq_io_search.io_timestamp = zio->io_timestamp;
+			vq->vq_io_search.io_offset = vq->vq_last_offset;
+			zio = avl_find(tree, &vq->vq_io_search, &idx);
+			if (zio == NULL) {
+				zio = avl_nearest(tree, idx, AVL_AFTER);
+				if (zio == NULL ||
+				    (zio->io_timestamp >> VDQ_T_SHIFT) !=
+				    (aio->io_timestamp >> VDQ_T_SHIFT))
+					zio = aio;
+			}
+		}
+	}
 
 	ASSERT3U(zio->io_priority, ==, p);
 	aio = vdev_queue_aggregate(vq, zio);
@@ -967,7 +1001,6 @@ void
 vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
-	avl_tree_t *tree;
 
 	/*
	 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 	 * Otherwise, the zio is currently active and we cannot change its
 	 * priority.
 	 */
-	tree = vdev_queue_class_tree(vq, zio->io_priority);
-	if (avl_find(tree, zio, NULL) == zio) {
-		avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+	if (zio->io_queue_state == ZIO_QS_QUEUED) {
+		vdev_queue_class_remove(vq, zio);
 		zio->io_priority = priority;
-		avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+		vdev_queue_class_add(vq, zio);
+	} else if (zio->io_queue_state == ZIO_QS_NONE) {
 		zio->io_priority = priority;
 	}
@@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
  * vq_lock mutex use here, instead we prefer to keep it lock free for
  * performance.
  */
-int
+uint32_t
 vdev_queue_length(vdev_t *vd)
 {
-	return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+	return (vd->vdev_queue.vq_active);
 }
 
 uint64_t
@@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd)
 	return (vd->vdev_queue.vq_last_offset);
 }
 
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+	vdev_queue_t *vq = &vd->vdev_queue;
+	if (vdev_queue_class_fifo(p))
+		return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+	else
+		return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
 	"Max vdev I/O aggregation size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
 	ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
-	"Allow TRIM I/O to be aggregated");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
 	"Aggregate read I/O over gap");