diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h index e39cf6a8ff..a923449d96 100644 --- a/include/sys/arc_impl.h +++ b/include/sys/arc_impl.h @@ -98,6 +98,7 @@ struct arc_callback { boolean_t acb_noauth; uint64_t acb_dsobj; zio_t *acb_zio_dummy; + zio_t *acb_zio_head; arc_callback_t *acb_next; }; diff --git a/include/sys/trace_arc.h b/include/sys/trace_arc.h index 74a76520df..c40b58e32d 100644 --- a/include/sys/trace_arc.h +++ b/include/sys/trace_arc.h @@ -108,7 +108,7 @@ DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); -DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__sync__wait__for__async); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 473d2691c9..bc2f4f0ead 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -123,6 +123,7 @@ extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); +extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern int vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); diff --git a/include/sys/zio.h b/include/sys/zio.h index 1f9f482684..8beea7adf1 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -586,6 +586,8 @@ extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_change_priority(zio_t *pio, zio_priority_t priority); + extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); diff --git a/io_spa- b/io_spa- new file mode 100644 index 0000000000..e69de29bb2 diff --git a/module/zfs/arc.c b/module/zfs/arc.c index 10b1c60d53..476351eb42 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -663,7 +663,7 @@ typedef struct arc_stats { kstat_named_t arcstat_dnode_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_async_upgrade_sync; kstat_named_t arcstat_demand_hit_predictive_prefetch; kstat_named_t arcstat_demand_hit_prescient_prefetch; kstat_named_t arcstat_need_free; @@ -763,7 +763,7 @@ static arc_stats_t arc_stats = { { "arc_dnode_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, { "arc_need_free", KSTAT_DATA_UINT64 }, @@ -5911,32 +5911,20 @@ top: *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* - * This sync read must wait for an - * in-progress async read (e.g. a predictive - * prefetch). Async reads are queued - * separately at the vdev_queue layer, so - * this is a form of priority inversion. - * Ideally, we would "inherit" the demand - * i/o's priority by moving the i/o from - * the async queue to the synchronous queue, - * but there is currently no mechanism to do - * so. Track this so that we can evaluate - * the magnitude of this potential performance - * problem. - * - * Note that if the prefetch i/o is already - * active (has been issued to the device), - * the prefetch improved performance, because - * we issued it sooner than we would have - * without the prefetch. + * This is a sync read that needs to wait for + * an in-flight async read. Request that the + * zio have its priority upgraded. */ - DTRACE_PROBE1(arc__sync__wait__for__async, + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_sync_wait_for_async); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, @@ -5966,6 +5954,7 @@ top: spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); @@ -6182,14 +6171,17 @@ top: vd = NULL; } - if (priority == ZIO_PRIORITY_ASYNC_READ) + /* + * We count both async reads and scrub IOs as asynchronous so + * that both can be upgraded in the event of a cache hit while + * the read IO is still in-flight. + */ + if (priority == ZIO_PRIORITY_ASYNC_READ || + priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - if (hash_lock != NULL) - mutex_exit(hash_lock); - /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. @@ -6260,6 +6252,10 @@ top: ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); @@ -6276,6 +6272,8 @@ top: goto out; /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); @@ -6296,6 +6294,10 @@ top: rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) { rc = zio_wait(rzio); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 52c700f118..de579d1c7e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -1529,7 +1529,7 @@ dsl_scan_prefetch_thread(void *arg) /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); } @@ -1611,7 +1611,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -1639,7 +1639,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -1658,7 +1658,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 792642952e..f121eddc7a 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -803,6 +803,48 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } +void +vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + avl_tree_t *tree; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (zio->io_type == ZIO_TYPE_READ) { + if (priority != ZIO_PRIORITY_SYNC_READ && + priority != ZIO_PRIORITY_ASYNC_READ && + priority != ZIO_PRIORITY_SCRUB) + priority = ZIO_PRIORITY_ASYNC_READ; + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + if (priority != ZIO_PRIORITY_SYNC_WRITE && + priority != ZIO_PRIORITY_ASYNC_WRITE) + priority = ZIO_PRIORITY_ASYNC_WRITE; + } + + mutex_enter(&vq->vq_lock); + + /* + * If the zio is in none of the queues we can simply change + * the priority. If the zio is waiting to be submitted we must + * remove it from the queue and re-insert it with the new priority. + * Otherwise, the zio is currently active and we cannot change its + * priority. + */ + tree = vdev_queue_class_tree(vq, zio->io_priority); + if (avl_find(tree, zio, NULL) == zio) { + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + zio->io_priority = priority; + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + zio->io_priority = priority; + } + + mutex_exit(&vq->vq_lock); +} + /* * As these two methods are only used for load calculations we're not * concerned if we get an incorrect value on 32bit platforms due to lack of diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 92e5a8dd83..c3b571e9a7 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -539,6 +539,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; + ASSERT(MUTEX_HELD(&pio->io_lock)); + *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); @@ -573,8 +575,8 @@ zio_add_child(zio_t *pio, zio_t *cio) zl->zl_parent = pio; zl->zl_child = cio; - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); @@ -587,8 +589,8 @@ zio_add_child(zio_t *pio, zio_t *cio) pio->io_child_count++; cio->io_parent_count++; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); } static void @@ -597,8 +599,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); @@ -606,8 +608,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) pio->io_child_count--; cio->io_parent_count--; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } @@ -1963,14 +1965,16 @@ zio_reexecute(zio_t *pio) * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; + mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); + mutex_enter(&pio->io_lock); } + mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. @@ -3474,6 +3478,35 @@ zio_vdev_io_done(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +/* + * This function is used to change the priority of an existing zio that is + * currently in-flight. This is used by the arc to upgrade priority in the + * event that a demand read is made for a block that is currently queued + * as a scrub or async read IO. Otherwise, the high priority read request + * would end up having to wait for the lower priority IO. + */ +void +zio_change_priority(zio_t *pio, zio_priority_t priority) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { + vdev_queue_change_io_priority(pio, priority); + } else { + pio->io_priority = priority; + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_change_priority(cio, priority); + } + mutex_exit(&pio->io_lock); +} + /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. diff --git a/spa_stats.io_history b/spa_stats.io_history new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/zfs-tests/tests/perf/scripts/prefetch_io.sh b/tests/zfs-tests/tests/perf/scripts/prefetch_io.sh index 85e7d940a9..75bf08f4b6 100755 --- a/tests/zfs-tests/tests/perf/scripts/prefetch_io.sh +++ b/tests/zfs-tests/tests/perf/scripts/prefetch_io.sh @@ -41,9 +41,9 @@ function get_prefetched_demand_reads echo $demand_reads } -function get_sync_wait_for_async +function get_async_upgrade_sync { - typeset -l sync_wait=`awk '$1 == "sync_wait_for_async" \ + typeset -l sync_wait=`awk '$1 == "async_upgrade_sync" \ { print $3 }' $zfs_kstats/arcstats` echo $sync_wait @@ -59,7 +59,7 @@ poolname=$1 interval=$2 prefetch_ios=$(get_prefetch_ios) prefetched_demand_reads=$(get_prefetched_demand_reads) -sync_wait_for_async=$(get_sync_wait_for_async) +async_upgrade_sync=$(get_async_upgrade_sync) while true do @@ -73,10 +73,10 @@ do $(( $new_prefetched_demand_reads - $prefetched_demand_reads )) prefetched_demand_reads=$new_prefetched_demand_reads - new_sync_wait_for_async=$(get_sync_wait_for_async) - printf "%-24s\t%u\n" "sync_wait_for_async" \ - $(( $new_sync_wait_for_async - $sync_wait_for_async )) - sync_wait_for_async=$new_sync_wait_for_async + new_async_upgrade_sync=$(get_async_upgrade_sync) + printf "%-24s\t%u\n" "async_upgrade_sync" \ + $(( $new_async_upgrade_sync - $async_upgrade_sync )) + async_upgrade_sync=$new_async_upgrade_sync sleep $interval done