diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h
index 3d4b1920d5..6ffa57c864 100644
--- a/include/os/linux/zfs/sys/trace_common.h
+++ b/include/os/linux/zfs/sys/trace_common.h
@@ -31,7 +31,6 @@
 /* ZIO macros */
 #define ZIO_TP_STRUCT_ENTRY \
 	__field(zio_type_t, zio_type) \
-	__field(int, zio_cmd) \
 	__field(zio_priority_t, zio_priority) \
 	__field(uint64_t, zio_size) \
 	__field(uint64_t, zio_orig_size) \
@@ -61,7 +60,6 @@
 #define ZIO_TP_FAST_ASSIGN \
 	__entry->zio_type = zio->io_type; \
-	__entry->zio_cmd = zio->io_cmd; \
 	__entry->zio_priority = zio->io_priority; \
 	__entry->zio_size = zio->io_size; \
 	__entry->zio_orig_size = zio->io_orig_size; \
@@ -90,7 +88,7 @@
 	__entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify;
 
 #define ZIO_TP_PRINTK_FMT \
-	"zio { type %u cmd %i prio %u size %llu orig_size %llu " \
+	"zio { type %u prio %u size %llu orig_size %llu " \
 	"offset %llu timestamp %llu delta %llu delay %llu " \
 	"flags 0x%llx stage 0x%x pipeline 0x%x orig_flags 0x%llx " \
 	"orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \
@@ -98,7 +96,7 @@
 	"type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }"
 
 #define ZIO_TP_PRINTK_ARGS \
-	__entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \
+	__entry->zio_type, __entry->zio_priority, \
 	__entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \
 	__entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \
 	__entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \
diff --git a/include/sys/zio.h b/include/sys/zio.h
index 5dcd7fe073..545b9cf0c3 100644
--- a/include/sys/zio.h
+++ b/include/sys/zio.h
@@ -451,7 +451,6 @@ struct zio {
 	zio_type_t	io_type;
 	enum zio_child	io_child_type;
 	enum trim_flag	io_trim_flags;
-	int		io_cmd;
 	zio_priority_t	io_priority;
 	uint8_t		io_reexecute;
 	uint8_t		io_state[ZIO_WAIT_TYPES];
diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c
index a65dfec86c..888c8e7f88 100644
--- a/module/os/freebsd/zfs/vdev_file.c
+++ b/module/os/freebsd/zfs/vdev_file.c
@@ -255,14 +255,7 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC|O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
-		}
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC|O_DSYNC);
 
 		zio_execute(zio);
 		return;
diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c
index 196d67b4b5..264dfa5c92 100644
--- a/module/os/freebsd/zfs/vdev_geom.c
+++ b/module/os/freebsd/zfs/vdev_geom.c
@@ -1153,42 +1153,31 @@ vdev_geom_io_start(zio_t *zio)
 
 	vd = zio->io_vd;
 
-	switch (zio->io_type) {
-	case ZIO_TYPE_IOCTL:
+	if (zio->io_type == ZIO_TYPE_IOCTL) {
 		/* XXPOLICY */
 		if (!vdev_readable(vd)) {
 			zio->io_error = SET_ERROR(ENXIO);
 			zio_interrupt(zio);
 			return;
-		} else {
-			switch (zio->io_cmd) {
-			case DKIOCFLUSHWRITECACHE:
-				if (zfs_nocacheflush ||
-				    vdev_geom_bio_flush_disable)
-					break;
-				if (vd->vdev_nowritecache) {
-					zio->io_error = SET_ERROR(ENOTSUP);
-					break;
-				}
-				goto sendreq;
-			default:
-				zio->io_error = SET_ERROR(ENOTSUP);
-			}
 		}
-		zio_execute(zio);
-		return;
-	case ZIO_TYPE_TRIM:
-		if (!vdev_geom_bio_delete_disable) {
-			goto sendreq;
+
+		if (zfs_nocacheflush || vdev_geom_bio_flush_disable) {
+			zio_execute(zio);
+			return;
+		}
+
+		if (vd->vdev_nowritecache) {
+			zio->io_error = SET_ERROR(ENOTSUP);
+			zio_execute(zio);
+			return;
+		}
+	} else if (zio->io_type == ZIO_TYPE_TRIM) {
+		if (vdev_geom_bio_delete_disable) {
+			zio_execute(zio);
+			return;
 		}
-		zio_execute(zio);
-		return;
-	default:
-		;
-		/* PASSTHROUGH --- placate compiler */
 	}
-sendreq:
+
 	ASSERT(zio->io_type == ZIO_TYPE_READ ||
 	    zio->io_type == ZIO_TYPE_WRITE ||
 	    zio->io_type == ZIO_TYPE_TRIM ||
diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c
index f3f0c08752..554ed22b9d 100644
--- a/module/os/linux/zfs/vdev_disk.c
+++ b/module/os/linux/zfs/vdev_disk.c
@@ -1403,38 +1403,29 @@ vdev_disk_io_start(zio_t *zio)
 
 	case ZIO_TYPE_IOCTL:
 		if (!vdev_readable(v)) {
-			rw_exit(&vd->vd_lock);
-			zio->io_error = SET_ERROR(ENXIO);
-			zio_interrupt(zio);
-			return;
-		}
-
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			if (v->vdev_nowritecache) {
-				zio->io_error = SET_ERROR(ENOTSUP);
-				break;
-			}
-
+			/* Drive not there, can't flush */
+			error = SET_ERROR(ENXIO);
+		} else if (zfs_nocacheflush) {
+			/* Flushing disabled by operator, declare success */
+			error = 0;
+		} else if (v->vdev_nowritecache) {
+			/* This vdev not capable of flushing */
+			error = SET_ERROR(ENOTSUP);
+		} else {
+			/*
+			 * Issue the flush. If successful, the response will
+			 * be handled in the completion callback, so we're done.
+			 */
 			error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio);
 			if (error == 0) {
 				rw_exit(&vd->vd_lock);
 				return;
 			}
-
-			zio->io_error = error;
-
-			break;
-
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
 		}
 
+		/* Couldn't issue the flush, so set the error and return it */
 		rw_exit(&vd->vd_lock);
+		zio->io_error = error;
 		zio_execute(zio);
 		return;
diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c
index 5abc0426d1..2b483c9a9f 100644
--- a/module/os/linux/zfs/vdev_file.c
+++ b/module/os/linux/zfs/vdev_file.c
@@ -250,33 +250,27 @@ vdev_file_io_start(zio_t *zio)
 			return;
 		}
 
-		switch (zio->io_cmd) {
-		case DKIOCFLUSHWRITECACHE:
-
-			if (zfs_nocacheflush)
-				break;
-
-			/*
-			 * We cannot safely call vfs_fsync() when PF_FSTRANS
-			 * is set in the current context. Filesystems like
-			 * XFS include sanity checks to verify it is not
-			 * already set, see xfs_vm_writepage(). Therefore
-			 * the sync must be dispatched to a different context.
-			 */
-			if (__spl_pf_fstrans_check()) {
-				VERIFY3U(taskq_dispatch(vdev_file_taskq,
-				    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
-				    TASKQID_INVALID);
-				return;
-			}
-
-			zio->io_error = zfs_file_fsync(vf->vf_file,
-			    O_SYNC | O_DSYNC);
-			break;
-		default:
-			zio->io_error = SET_ERROR(ENOTSUP);
+		if (zfs_nocacheflush) {
+			zio_execute(zio);
+			return;
 		}
 
+		/*
+		 * We cannot safely call vfs_fsync() when PF_FSTRANS
+		 * is set in the current context. Filesystems like
+		 * XFS include sanity checks to verify it is not
+		 * already set, see xfs_vm_writepage(). Therefore
+		 * the sync must be dispatched to a different context.
+		 */
+		if (__spl_pf_fstrans_check()) {
+			VERIFY3U(taskq_dispatch(vdev_file_taskq,
+			    vdev_file_io_fsync, zio, TQ_SLEEP), !=,
+			    TASKQID_INVALID);
+			return;
+		}
+
+		zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC);
+
+		zio_execute(zio);
 		return;
 	} else if (zio->io_type == ZIO_TYPE_TRIM) {
diff --git a/module/zfs/vdev_draid.c b/module/zfs/vdev_draid.c
index ec961255fd..7769ed6a37 100644
--- a/module/zfs/vdev_draid.c
+++ b/module/zfs/vdev_draid.c
@@ -2557,15 +2557,11 @@ vdev_draid_spare_ioctl(zio_t *zio)
 	vdev_t *vd = zio->io_vd;
 	int error = 0;
 
-	if (zio->io_cmd == DKIOCFLUSHWRITECACHE) {
-		for (int c = 0; c < vd->vdev_children; c++) {
-			zio_nowait(zio_vdev_child_io(zio, NULL,
-			    vd->vdev_child[c], zio->io_offset, zio->io_abd,
-			    zio->io_size, zio->io_type, zio->io_priority, 0,
-			    vdev_draid_spare_child_done, zio));
-		}
-	} else {
-		error = SET_ERROR(ENOTSUP);
+	for (int c = 0; c < vd->vdev_children; c++) {
+		zio_nowait(zio_vdev_child_io(zio, NULL,
+		    vd->vdev_child[c], zio->io_offset, zio->io_abd,
+		    zio->io_size, zio->io_type, zio->io_priority, 0,
+		    vdev_draid_spare_child_done, zio));
 	}
 
 	return (error);
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 481af2ba82..2f43c4aa41 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1096,10 +1096,7 @@ zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 		return (B_FALSE);
 
 	if (zio != NULL) {
-		/*
-		 * If this is not a read or write zio, ignore the error. This
-		 * can occur if the DKIOCFLUSHWRITECACHE ioctl fails.
-		 */
+		/* If this is not a read or write zio, ignore the error */
 		if (zio->io_type != ZIO_TYPE_READ &&
 		    zio->io_type != ZIO_TYPE_WRITE)
 			return (B_FALSE);
diff --git a/module/zfs/zil.c b/module/zfs/zil.c
index 1af357c580..34be54b337 100644
--- a/module/zfs/zil.c
+++ b/module/zfs/zil.c
@@ -125,10 +125,9 @@ static kstat_t *zil_kstats_global;
 int zil_replay_disable = 0;
 
 /*
- * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to
- * the disk(s) by the ZIL after an LWB write has completed. Setting this
- * will cause ZIL corruption on power loss if a volatile out-of-order
- * write cache is enabled.
+ * Disable the flush commands that are normally sent to the disk(s) by the ZIL
+ * after an LWB write has completed. Setting this will cause ZIL corruption on
+ * power loss if a volatile out-of-order write cache is enabled.
  */
 static int zil_nocacheflush = 0;
 
@@ -1406,19 +1405,17 @@ zil_lwb_add_txg(lwb_t *lwb, uint64_t txg)
 }
 
 /*
- * This function is a called after all vdevs associated with a given lwb
- * write have completed their DKIOCFLUSHWRITECACHE command; or as soon
- * as the lwb write completes, if "zil_nocacheflush" is set. Further,
- * all "previous" lwb's will have completed before this function is
- * called; i.e. this function is called for all previous lwbs before
- * it's called for "this" lwb (enforced via zio the dependencies
- * configured in zil_lwb_set_zio_dependency()).
+ * This function is called after all vdevs associated with a given lwb write
+ * have completed their flush command; or as soon as the lwb write completes,
+ * if "zil_nocacheflush" is set. Further, all "previous" lwb's will have
+ * completed before this function is called; i.e. this function is called for
+ * all previous lwbs before it's called for "this" lwb (enforced via the zio
+ * dependencies configured in zil_lwb_set_zio_dependency()).
  *
- * The intention is for this function to be called as soon as the
- * contents of an lwb are considered "stable" on disk, and will survive
- * any sudden loss of power. At this point, any threads waiting for the
- * lwb to reach this state are signalled, and the "waiter" structures
- * are marked "done".
+ * The intention is for this function to be called as soon as the contents of
+ * an lwb are considered "stable" on disk, and will survive any sudden loss of
+ * power. At this point, any threads waiting for the lwb to reach this state
+ * are signalled, and the "waiter" structures are marked "done".
  */
 static void
 zil_lwb_flush_vdevs_done(zio_t *zio)
@@ -1532,17 +1529,16 @@ zil_lwb_flush_wait_all(zilog_t *zilog, uint64_t txg)
 }
 
 /*
- * This is called when an lwb's write zio completes. The callback's
- * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs
- * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved
- * in writing out this specific lwb's data, and in the case that cache
- * flushes have been deferred, vdevs involved in writing the data for
- * previous lwbs. The writes corresponding to all the vdevs in the
- * lwb_vdev_tree will have completed by the time this is called, due to
- * the zio dependencies configured in zil_lwb_set_zio_dependency(),
- * which takes deferred flushes into account. The lwb will be "done"
- * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio
- * completion callback for the lwb's root zio.
+ * This is called when an lwb's write zio completes. The callback's purpose is
+ * to issue the flush commands for the vdevs in the lwb's lwb_vdev_tree. The
+ * tree will contain the vdevs involved in writing out this specific lwb's
+ * data, and in the case that cache flushes have been deferred, vdevs involved
+ * in writing the data for previous lwbs. The writes corresponding to all the
+ * vdevs in the lwb_vdev_tree will have completed by the time this is called,
+ * due to the zio dependencies configured in zil_lwb_set_zio_dependency(),
+ * which takes deferred flushes into account. The lwb will be "done" once
+ * zil_lwb_flush_vdevs_done() is called, which occurs in the zio completion
+ * callback for the lwb's root zio.
  */
 static void
 zil_lwb_write_done(zio_t *zio)
@@ -1601,19 +1597,18 @@ zil_lwb_write_done(zio_t *zio)
 	}
 
 	/*
-	 * If this lwb does not have any threads waiting for it to
-	 * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE
-	 * command to the vdevs written to by "this" lwb, and instead
-	 * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE
-	 * command for those vdevs. Thus, we merge the vdev tree of
-	 * "this" lwb with the vdev tree of the "next" lwb in the list,
-	 * and assume the "next" lwb will handle flushing the vdevs (or
-	 * deferring the flush(s) again).
+	 * If this lwb does not have any threads waiting for it to complete, we
+	 * want to defer issuing the flush command to the vdevs written to by
+	 * "this" lwb, and instead rely on the "next" lwb to handle the flush
+	 * command for those vdevs. Thus, we merge the vdev tree of "this" lwb
+	 * with the vdev tree of the "next" lwb in the list, and assume the
+	 * "next" lwb will handle flushing the vdevs (or deferring the flush(s)
+	 * again).
 	 *
-	 * This is a useful performance optimization, especially for
-	 * workloads with lots of async write activity and few sync
-	 * write and/or fsync activity, as it has the potential to
-	 * coalesce multiple flush commands to a vdev into one.
+	 * This is a useful performance optimization, especially for workloads
+	 * with lots of async write activity and few sync write and/or fsync
+	 * activity, as it has the potential to coalesce multiple flush
+	 * commands to a vdev into one.
 	 */
 	if (list_is_empty(&lwb->lwb_waiters) && nlwb != NULL) {
 		zil_lwb_flush_defer(lwb, nlwb);
@@ -1663,16 +1658,16 @@ zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb)
 	 * If the previous lwb's write hasn't already completed, we also want
 	 * to order the completion of the lwb write zios (above, we only order
 	 * the completion of the lwb root zios). This is required because of
-	 * how we can defer the DKIOCFLUSHWRITECACHE commands for each lwb.
+	 * how we can defer the flush commands for each lwb.
 	 *
-	 * When the DKIOCFLUSHWRITECACHE commands are deferred, the previous
-	 * lwb will rely on this lwb to flush the vdevs written to by that
-	 * previous lwb. Thus, we need to ensure this lwb doesn't issue the
-	 * flush until after the previous lwb's write completes. We ensure
-	 * this ordering by setting the zio parent/child relationship here.
+	 * When the flush commands are deferred, the previous lwb will rely on
+	 * this lwb to flush the vdevs written to by that previous lwb. Thus,
+	 * we need to ensure this lwb doesn't issue the flush until after the
+	 * previous lwb's write completes. We ensure this ordering by setting
+	 * the zio parent/child relationship here.
 	 *
-	 * Without this relationship on the lwb's write zio, it's possible
-	 * for this lwb's write to complete prior to the previous lwb's write
+	 * Without this relationship on the lwb's write zio, it's possible for
+	 * this lwb's write to complete prior to the previous lwb's write
 	 * completing; and thus, the vdevs for the previous lwb would be
 	 * flushed prior to that lwb's data being written to those vdevs (the
 	 * vdevs are flushed in the lwb write zio's completion handler,
@@ -3499,8 +3494,8 @@ zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
 	 *   callback of the lwb's zio[*].
 	 *
 	 *   * Actually, the waiters are signaled in the zio completion
-	 *     callback of the root zio for the DKIOCFLUSHWRITECACHE commands
-	 *     that are sent to the vdevs upon completion of the lwb zio.
+	 *     callback of the root zio for the flush commands that are sent to
+	 *     the vdevs upon completion of the lwb zio.
 	 *
 	 * 2. When the itxs are inserted into the ZIL's queue of uncommitted
 	 *    itxs, the order in which they are inserted is preserved[*]; as
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
index 4aa08f3b30..031fc3d513 100644
--- a/module/zfs/zio.c
+++ b/module/zfs/zio.c
@@ -1631,11 +1631,9 @@ zio_flush(zio_t *pio, vdev_t *vd)
 		return;
 
 	if (vd->vdev_children == 0) {
-		zio_t *zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
+		zio_nowait(zio_create(pio, vd->vdev_spa, 0, NULL, NULL, 0, 0,
 		    NULL, NULL, ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0,
-		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
-		zio->io_cmd = DKIOCFLUSHWRITECACHE;
-		zio_nowait(zio);
+		    NULL, ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE));
 	} else {
 		for (uint64_t c = 0; c < vd->vdev_children; c++)
 			zio_flush(pio, vd->vdev_child[c]);
@@ -4241,8 +4239,7 @@ zio_vdev_io_assess(zio_t *zio)
 	 * boolean flag so that we don't bother with it in the future.
 	 */
 	if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) &&
-	    zio->io_type == ZIO_TYPE_IOCTL &&
-	    zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL)
+	    zio->io_type == ZIO_TYPE_IOCTL && vd != NULL)
 		vd->vdev_nowritecache = B_TRUE;
 
 	if (zio->io_error)