diff --git a/META b/META
index e4b476aff1..5f834d5cc7 100644
--- a/META
+++ b/META
@@ -1,8 +1,8 @@
 Meta:          1
 Name:          zfs
 Branch:        1.0
-Version:       2.1.99
-Release:       1
+Version:       2.2.0
+Release:       rc1
 Release-Tags:  relext
 License:       CDDL
 Author:        OpenZFS
diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c
index 04a10c4eed..9568d2bbfe 100644
--- a/cmd/zdb/zdb.c
+++ b/cmd/zdb/zdb.c
@@ -794,7 +794,7 @@ usage(void)
 	    "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 	    "\t\t[-K ] / []\n"
 	    "\t%s [-v] <bookmark>\n"
-	    "\t%s -C [-A] [-U <cache>]\n"
+	    "\t%s -C [-A] [-U <cache>] [<poolname>]\n"
 	    "\t%s -l [-Aqu] <device>\n"
 	    "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 	    "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c
index e28f1d04f3..5ed25d1ea7 100644
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -6057,8 +6057,8 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 		if (p != NULL)
 			rid = p->pw_uid;
 		else if (*endch != '\0') {
-			(void) snprintf(errbuf, 256, gettext(
-			    "invalid user %s\n"), curr);
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    gettext("invalid user %s\n"), curr);
 			allow_usage(un, B_TRUE, errbuf);
 		}
 	} else if (opts->group) {
@@ -6071,8 +6071,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 		if (g != NULL)
 			rid = g->gr_gid;
 		else if (*endch != '\0') {
-			(void) snprintf(errbuf, 256, gettext(
-			    "invalid group %s\n"), curr);
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    gettext("invalid group %s\n"),
+			    curr);
 			allow_usage(un, B_TRUE, errbuf);
 		}
 	} else {
@@ -6097,8 +6098,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 			who_type = ZFS_DELEG_GROUP;
 			rid = g->gr_gid;
 		} else {
-			(void) snprintf(errbuf, 256, gettext(
-			    "invalid user/group %s\n"), curr);
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    gettext("invalid user/group %s\n"),
+			    curr);
 			allow_usage(un, B_TRUE, errbuf);
 		}
 	}
diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c
index 013dd4a233..10a3b5b14f 100644
--- a/cmd/zpool/zpool_main.c
+++ b/cmd/zpool/zpool_main.c
@@ -7662,11 +7662,11 @@ static void
 print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 {
 	time_t start, end, pause;
-	uint64_t pass_scanned, scanned, pass_issued, issued, total;
+	uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i;
 	uint64_t elapsed, scan_rate, issue_rate;
 	double fraction_done;
-	char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
-	char srate_buf[7], irate_buf[7], time_buf[32];
+	char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7];
+	char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32];

 	printf("  ");
 	printf_color(ANSI_BOLD, gettext("scan:"));
@@ -7738,10 +7738,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	pass_scanned = ps->pss_pass_exam;
 	issued = ps->pss_issued;
 	pass_issued = ps->pss_pass_issued;
-	total = ps->pss_to_examine;
+	total_s = ps->pss_to_examine;
+	total_i = ps->pss_to_examine - ps->pss_skipped;

 	/* we are only done with a block once we have issued the IO for it */
-	fraction_done = (double)issued / total;
+	fraction_done = (double)issued / total_i;

 	/* elapsed time for this pass, rounding up to 1 if it's 0 */
 	elapsed = time(NULL) - ps->pss_pass_start;
@@ -7750,26 +7751,25 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	scan_rate = pass_scanned / elapsed;
 	issue_rate = pass_issued / elapsed;
-	uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
-	    ((total - issued) / issue_rate) : UINT64_MAX;
-	secs_to_dhms(total_secs_left, time_buf);

 	/* format all of the numbers we will be reporting */
 	zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
 	zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
-	zfs_nicebytes(total, total_buf, sizeof (total_buf));
-	zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
-	zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+	zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf));
+	zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf));

 	/* do not print estimated time if we have a paused scrub */
-	if (pause == 0) {
-		(void) printf(gettext("\t%s scanned at %s/s, "
-		    "%s issued at %s/s, %s total\n"),
-		    scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
-	} else {
-		(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
-		    scanned_buf, issued_buf, total_buf);
+	(void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf);
+	if (pause == 0 && scan_rate > 0) {
+		zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+		(void) printf(gettext(" at %s/s"), srate_buf);
 	}
+	(void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf);
+	if (pause == 0 && issue_rate > 0) {
+		zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+		(void) printf(gettext(" at %s/s"), irate_buf);
+	}
+	(void) printf(gettext("\n"));

 	if (is_resilver) {
 		(void) printf(gettext("\t%s resilvered, %.2f%% done"),
@@ -7782,16 +7782,16 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 	if (pause == 0) {
 		/*
 		 * Only provide an estimate iff:
-		 * 1) the time remaining is valid, and
+		 * 1) we haven't yet issued all we expected, and
 		 * 2) the issue rate exceeds 10 MB/s, and
 		 * 3) it's either:
 		 *    a) a resilver which has started repairs, or
 		 *    b) a scrub which has entered the issue phase.
*/ - if (total_secs_left != UINT64_MAX && - issue_rate >= 10 * 1024 * 1024 && + if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 && ((is_resilver && ps->pss_processed > 0) || (is_scrub && issued > 0))) { + secs_to_dhms((total_i - issued) / issue_rate, time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7803,7 +7803,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps) } static void -print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name) { if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) return; @@ -7815,17 +7815,20 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) uint64_t bytes_scanned = vrs->vrs_bytes_scanned; uint64_t bytes_issued = vrs->vrs_bytes_issued; uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; - uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t bytes_est_s = vrs->vrs_bytes_est; + uint64_t bytes_est_i = vrs->vrs_bytes_est; + if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8) + bytes_est_i -= vrs->vrs_pass_bytes_skipped; uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / (vrs->vrs_pass_time_ms + 1)) * 1000; uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / (vrs->vrs_pass_time_ms + 1)) * 1000; double scan_pct = MIN((double)bytes_scanned * 100 / - (bytes_est + 1), 100); + (bytes_est_s + 1), 100); /* Format all of the numbers we will be reporting */ char bytes_scanned_buf[7], bytes_issued_buf[7]; - char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7]; char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; zfs_nicebytes(bytes_scanned, bytes_scanned_buf, sizeof (bytes_scanned_buf)); @@ -7833,9 +7836,8 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) sizeof (bytes_issued_buf)); zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, sizeof (bytes_rebuilt_buf)); - zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); - zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); - zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf)); + zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf)); time_t start = vrs->vrs_start_time; time_t end = vrs->vrs_end_time; @@ -7858,17 +7860,29 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); - secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / - MAX(scan_rate, 1), time_buf); + (void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf, + bytes_est_s_buf); + if (scan_rate > 0) { + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + (void) printf(gettext(" at %s/s"), scan_rate_buf); + } + (void) printf(gettext(", %s / %s issued"), bytes_issued_buf, + bytes_est_i_buf); + if (issue_rate > 0) { + zfs_nicebytes(issue_rate, issue_rate_buf, + sizeof (issue_rate_buf)); + (void) printf(gettext(" at %s/s"), issue_rate_buf); + } + (void) printf(gettext("\n")); - (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " - "%s total\n"), bytes_scanned_buf, scan_rate_buf, - bytes_issued_buf, issue_rate_buf, bytes_est_buf); (void) printf(gettext("\t%s resilvered, %.2f%% done"), bytes_rebuilt_buf, scan_pct); if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { - if (scan_rate >= 10 * 1024 * 1024) { + if (bytes_est_s >= bytes_scanned && + scan_rate >= 10 * 1024 * 
1024) { + secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate, + time_buf); (void) printf(gettext(", %s to go\n"), time_buf); } else { (void) printf(gettext(", no estimated " @@ -7900,7 +7914,7 @@ print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { char *name = zpool_vdev_name(g_zfs, zhp, child[c], VDEV_NAME_TYPE_ID); - print_rebuild_status_impl(vrs, name); + print_rebuild_status_impl(vrs, i, name); free(name); } } @@ -8005,13 +8019,15 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) active_resilver = (ps->pss_state == DSS_SCANNING); } - have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); scrub_start = ps->pss_start_time; - have_errorscrub = (ps->pss_error_scrub_func == - POOL_SCAN_ERRORSCRUB); - errorscrub_start = ps->pss_error_scrub_start; + if (c > offsetof(pool_scan_stat_t, + pss_pass_error_scrub_pause) / 8) { + have_errorscrub = (ps->pss_error_scrub_func == + POOL_SCAN_ERRORSCRUB); + errorscrub_start = ps->pss_error_scrub_start; + } } boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); diff --git a/cmd/zpool_influxdb/zpool_influxdb.c b/cmd/zpool_influxdb/zpool_influxdb.c index 80d0848589..520e569269 100644 --- a/cmd/zpool_influxdb/zpool_influxdb.c +++ b/cmd/zpool_influxdb/zpool_influxdb.c @@ -238,6 +238,7 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv("end_ts", ps->pss_end_time); print_kv(",errors", ps->pss_errors); print_kv(",examined", examined); + print_kv(",skipped", ps->pss_skipped); print_kv(",issued", ps->pss_issued); print_kv(",pass_examined", pass_exam); print_kv(",pass_issued", ps->pss_pass_issued); @@ -249,7 +250,6 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name) print_kv(",remaining_t", remaining_time); print_kv(",start_ts", ps->pss_start_time); print_kv(",to_examine", ps->pss_to_examine); - print_kv(",to_process", ps->pss_to_process); printf(" %llu\n", (u_longlong_t)timestamp); return (0); } diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in index e55cb60e16..acad468edf 100755 --- a/contrib/dracut/90zfs/module-setup.sh.in +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -36,7 +36,7 @@ install() { { dfatal "Failed to install essential binaries"; exit 1; } # Adapted from https://github.com/zbm-dev/zfsbootmenu - if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then + if ! 
ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then # On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s if command -v gcc-config >/dev/null; then inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" || diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h index 2a6cd8283d..6c1b4377a9 100644 --- a/include/os/linux/spl/sys/taskq.h +++ b/include/os/linux/spl/sys/taskq.h @@ -104,6 +104,7 @@ typedef struct taskq { /* list node for the cpu hotplug callback */ struct hlist_node tq_hp_cb_node; boolean_t tq_hp_support; + unsigned long lastshouldstop; /* when to purge dynamic */ } taskq_t; typedef struct taskq_ent { diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h index 6753b4a8f3..2e3452e5eb 100644 --- a/include/sys/dsl_scan.h +++ b/include/sys/dsl_scan.h @@ -61,7 +61,7 @@ typedef struct dsl_scan_phys { uint64_t scn_end_time; uint64_t scn_to_examine; /* total bytes to be scanned */ uint64_t scn_examined; /* bytes scanned so far */ - uint64_t scn_to_process; + uint64_t scn_skipped; /* bytes skipped by scanner */ uint64_t scn_processed; uint64_t scn_errors; /* scan I/O error count */ uint64_t scn_ddt_class_max; diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 93193fa142..bc940e8a79 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -1088,7 +1088,7 @@ typedef struct pool_scan_stat { uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ uint64_t pss_examined; /* total bytes located by scanner */ - uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_skipped; /* total bytes skipped by scanner */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ @@ -1152,6 +1152,7 @@ typedef struct vdev_rebuild_stat { uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ + uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */ } vdev_rebuild_stat_t; /* diff --git a/include/sys/vdev.h b/include/sys/vdev.h index 26c834ff57..03e1f438aa 100644 --- a/include/sys/vdev.h +++ b/include/sys/vdev.h @@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); -extern int vdev_queue_length(vdev_t *vd); +extern uint32_t vdev_queue_length(vdev_t *vd); extern uint64_t vdev_queue_last_offset(vdev_t *vd); +extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index 74b3737d8e..2b22b973ba 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -130,27 +130,24 @@ typedef const struct vdev_ops { /* * Virtual device properties */ -typedef struct vdev_queue_class { - uint32_t vqc_active; - - /* - * Sorted by offset or timestamp, depending on if the queue is - * LBA-ordered vs FIFO. 
- */ - avl_tree_t vqc_queued_tree; +typedef union vdev_queue_class { + list_t vqc_list; + avl_tree_t vqc_tree; } vdev_queue_class_t; struct vdev_queue { vdev_t *vq_vdev; vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; - avl_tree_t vq_active_tree; avl_tree_t vq_read_offset_tree; avl_tree_t vq_write_offset_tree; - avl_tree_t vq_trim_offset_tree; uint64_t vq_last_offset; zio_priority_t vq_last_prio; /* Last sent I/O priority. */ + uint32_t vq_cqueued; /* Classes with queued I/Os. */ + uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE]; + uint32_t vq_active; /* Number of active I/Os. */ uint32_t vq_ia_active; /* Active interactive I/Os. */ uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */ + list_t vq_active_list; /* List of active I/Os. */ hrtime_t vq_io_complete_ts; /* time last i/o completed */ hrtime_t vq_io_delta_ts; zio_t vq_io_search; /* used as local for stack reduction */ diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h index c4cfe0c567..55ec6c5703 100644 --- a/include/sys/vdev_rebuild.h +++ b/include/sys/vdev_rebuild.h @@ -79,6 +79,7 @@ typedef struct vdev_rebuild { uint64_t vr_pass_start_time; uint64_t vr_pass_bytes_scanned; uint64_t vr_pass_bytes_issued; + uint64_t vr_pass_bytes_skipped; /* On-disk state updated by vdev_rebuild_zap_update_sync() */ vdev_rebuild_phys_t vr_rebuild_phys; diff --git a/include/sys/zio.h b/include/sys/zio.h index ec32211f69..f4da80783e 100644 --- a/include/sys/zio.h +++ b/include/sys/zio.h @@ -341,9 +341,9 @@ typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; uint8_t zp_complevel; - dmu_object_type_t zp_type; uint8_t zp_level; uint8_t zp_copies; + dmu_object_type_t zp_type; boolean_t zp_dedup; boolean_t zp_dedup_verify; boolean_t zp_nopwrite; @@ -436,6 +436,12 @@ typedef struct zio_link { list_node_t zl_child_node; } zio_link_t; +enum zio_qstate { + ZIO_QS_NONE = 0, + ZIO_QS_QUEUED, + ZIO_QS_ACTIVE, +}; + struct zio { /* Core information about this I/O */ zbookmark_phys_t io_bookmark; @@ -479,6 +485,12 @@ struct zio { const zio_vsd_ops_t *io_vsd_ops; metaslab_class_t *io_metaslab_class; /* dva throttle class */ + enum zio_qstate io_queue_state; /* vdev queue state */ + union { + list_node_t l; + avl_node_t a; + } io_queue_node ____cacheline_aligned; /* allocator and vdev queues */ + avl_node_t io_offset_node; /* vdev offset queues */ uint64_t io_offset; hrtime_t io_timestamp; /* submitted at */ hrtime_t io_queued_timestamp; @@ -486,9 +498,6 @@ struct zio { hrtime_t io_delta; /* vdev queue service delta */ hrtime_t io_delay; /* Device access time (disk or */ /* file). 
*/ - avl_node_t io_queue_node; - avl_node_t io_offset_node; - avl_node_t io_alloc_node; zio_alloc_list_t io_alloc_list; /* Internal pipeline state */ @@ -602,6 +611,7 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); extern zio_t *zio_unique_parent(zio_t *cio); extern void zio_add_child(zio_t *pio, zio_t *cio); +extern void zio_add_child_first(zio_t *pio, zio_t *cio); extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c index fe9f3268d3..11d3eb6a3c 100644 --- a/lib/libzfs/libzfs_dataset.c +++ b/lib/libzfs/libzfs_dataset.c @@ -1789,7 +1789,8 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) nvlist_t *nvl; int nvl_len = 0; int added_resv = 0; - zfs_prop_t prop = 0; + zfs_prop_t prop; + boolean_t nsprop = B_FALSE; nvpair_t *elem; (void) snprintf(errbuf, sizeof (errbuf), @@ -1836,6 +1837,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) elem = nvlist_next_nvpair(nvl, elem)) { prop = zfs_name_to_prop(nvpair_name(elem)); + nsprop |= zfs_is_namespace_prop(prop); assert(cl_idx < nvl_len); /* @@ -1934,8 +1936,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) * if one of the options handled by the generic * Linux namespace layer has been modified. */ - if (zfs_is_namespace_prop(prop) && - zfs_is_mounted(zhp, NULL)) + if (nsprop && zfs_is_mounted(zhp, NULL)) ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); } } diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 02efaf16dc..82455fb532 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -193,4 +193,19 @@ The proc file will walk the lists with lock held, reading it could cause a lock-up if the list grow too large without limiting the output. "(truncated)" will be shown if the list is larger than the limit. +. +.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint +(Linux-only) +How long a taskq has to have had no work before we tear it down. +Previously, we would tear down a dynamic taskq worker as soon +as we noticed it had no work, but it was observed that this led +to a lot of churn in tearing down things we then immediately +spawned anew. +In practice, it seems any nonzero value will remove the vast +majority of this churn, while the nontrivially larger value +was chosen to help filter out the little remaining churn on +a mostly idle system. +Setting this value to +.Sy 0 +will revert to the previous behavior. .El diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 5fbd9d7db9..271b02b6ee 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -239,6 +239,16 @@ relative to the pool. Make some blocks above a certain size be gang blocks. This option is used by the test suite to facilitate testing. . +.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP data block size as a power of 2. Note that changing this after +creating a DDT on the pool will not affect existing DDTs, only newly created +ones. +. +.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int +Default DDT ZAP indirect block size as a power of 2. Note that changing this +after creating a DDT on the pool will not affect existing DDTs, only newly +created ones. +. .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int Default dnode block size as a power of 2. . @@ -2016,12 +2026,6 @@ Historical statistics for this many latest TXGs will be available in Flush dirty data to disk at least every this many seconds (maximum TXG duration). . 
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint -Allow TRIM I/O operations to be aggregated. -This is normally not helpful because the extents to be trimmed -will have been already been aggregated by the metaslab. -This option is provided for debugging and performance analysis. -. .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint Max vdev I/O aggregation size. . diff --git a/man/man8/zdb.8 b/man/man8/zdb.8 index 031953c543..52c8e452fa 100644 --- a/man/man8/zdb.8 +++ b/man/man8/zdb.8 @@ -14,7 +14,7 @@ .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC. .\" Copyright (c) 2017 Intel Corporation. .\" -.Dd June 4, 2023 +.Dd June 27, 2023 .Dt ZDB 8 .Os . @@ -51,6 +51,7 @@ .Fl C .Op Fl A .Op Fl U Ar cache +.Op Ar poolname .Nm .Fl E .Op Fl A diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8 index da1a79c72c..8449520944 100644 --- a/man/man8/zpool-create.8 +++ b/man/man8/zpool-create.8 @@ -87,13 +87,13 @@ currently in use by another subsystem. However this check is not robust enough to detect simultaneous attempts to use a new device in different pools, even if .Sy multihost Ns = Sy enabled . -The administrator must ensure, that simultaneous invocations of any combination +The administrator must ensure that simultaneous invocations of any combination of .Nm zpool Cm replace , .Nm zpool Cm create , .Nm zpool Cm add , or -.Nm zpool Cm labelclear , +.Nm zpool Cm labelclear do not refer to the same device. Using the same device in two pools will result in pool corruption. .Pp diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8 index 138226e456..03f3ad4991 100644 --- a/man/man8/zpool-scrub.8 +++ b/man/man8/zpool-scrub.8 @@ -26,7 +26,7 @@ .\" Copyright 2017 Nexenta Systems, Inc. .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. .\" -.Dd July 25, 2021 +.Dd June 22, 2023 .Dt ZPOOL-SCRUB 8 .Os . @@ -123,7 +123,7 @@ Status of pool with ongoing scrub: .No # Nm zpool Cm status ... scan: scrub in progress since Sun Jul 25 16:07:49 2021 - 403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total + 403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s 0B repaired, 16.91% done, 00:00:04 to go ... .Ed diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c index 84497359ce..d18f935b16 100644 --- a/module/os/linux/spl/spl-taskq.c +++ b/module/os/linux/spl/spl-taskq.c @@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0; module_param(spl_taskq_thread_bind, int, 0644); MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); +static uint_t spl_taskq_thread_timeout_ms = 10000; +/* BEGIN CSTYLED */ +module_param(spl_taskq_thread_timeout_ms, uint, 0644); +/* END CSTYLED */ +MODULE_PARM_DESC(spl_taskq_thread_timeout_ms, + "Time to require a dynamic thread be idle before it gets cleaned up"); static int spl_taskq_thread_dynamic = 1; module_param(spl_taskq_thread_dynamic, int, 0444); @@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) tqt_thread_list) == tqt) return (0); - return + int no_work = ((tq->tq_nspawn == 0) && /* No threads are being spawned */ (tq->tq_nactive == 0) && /* No threads are handling tasks */ (tq->tq_nthreads > 1) && /* More than 1 thread is running */ (!taskq_next_ent(tq)) && /* There are no pending tasks */ (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ + + /* + * If we would have said stop before, let's instead wait a bit, maybe + * we'll see more work come our way soon... 
+ */ + if (no_work) { + /* if it's 0, we want the old behavior. */ + /* if the taskq is being torn down, we also want to go away. */ + if (spl_taskq_thread_timeout_ms == 0 || + !(tq->tq_flags & TASKQ_ACTIVE)) + return (1); + unsigned long lasttime = tq->lastshouldstop; + if (lasttime > 0) { + if (time_after(jiffies, lasttime + + msecs_to_jiffies(spl_taskq_thread_timeout_ms))) + return (1); + else + return (0); + } else { + tq->lastshouldstop = jiffies; + } + } else { + tq->lastshouldstop = 0; + } + return (0); } static int @@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri, tq->tq_flags = (flags | TASKQ_ACTIVE); tq->tq_next_id = TASKQID_INITIAL; tq->tq_lowest_id = TASKQID_INITIAL; + tq->lastshouldstop = 0; INIT_LIST_HEAD(&tq->tq_free_list); INIT_LIST_HEAD(&tq->tq_pend_list); INIT_LIST_HEAD(&tq->tq_prio_list); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index cdf32c78b4..38bc8e2c4e 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -54,7 +54,7 @@ static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS -static const unsigned int zvol_open_timeout_ms = 1000; +static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; @@ -1612,4 +1612,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif +#ifndef HAVE_BLKDEV_GET_ERESTARTSYS +module_param(zvol_open_timeout_ms, uint, 0644); +MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); +#endif + /* END CSTYLED */ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 33fea0ba3d..1fb1982199 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -1209,10 +1209,19 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ASSERT3S(dde->dde_class, <, DDT_CLASSES); ddp = &dde->dde_phys[BP_GET_NDVAS(bp)]; - if (ddp->ddp_refcnt == 0) { - /* This should never happen? */ - ddt_phys_fill(ddp, bp); - } + + /* + * This entry already existed (dde_type is real), so it must + * have refcnt >0 at the start of this txg. We are called from + * brt_pending_apply(), before frees are issued, so the refcnt + * can't be lowered yet. Therefore, it must be >0. We assert + * this because if the order of BRT and DDT interactions were + * ever to change and the refcnt was ever zero here, then + * likely further action is required to fill out the DDT entry, + * and this is a place that is likely to be missed in testing. + */ + ASSERT3U(ddp->ddp_refcnt, >, 0); + ddt_phys_addref(ddp); result = B_TRUE; } else { diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 27dbbc55f1..8f6397a6d1 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -31,8 +31,8 @@ #include #include -static const int ddt_zap_leaf_blockshift = 12; -static const int ddt_zap_indirect_blockshift = 12; +static unsigned int ddt_zap_default_bs = 15; +static unsigned int ddt_zap_default_ibs = 15; static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) @@ -43,7 +43,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, - ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); return (*objectp == 0 ? 
SET_ERROR(ENOTSUP) : 0); @@ -166,3 +166,10 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_walk, ddt_zap_count, }; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW, + "DDT ZAP leaf blockshift"); +ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW, + "DDT ZAP indirect blockshift"); +/* END CSTYLED */ diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index 1dd44171c1..50428bff3e 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -573,7 +573,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) * counter to how far we've scanned. We know we're consistent * up to here. */ - scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + scn->scn_issued_before_pass = scn->scn_phys.scn_examined - + scn->scn_phys.scn_skipped; if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { @@ -4362,7 +4363,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) * Disabled by default, set zfs_scan_report_txgs to report * average performance over the last zfs_scan_report_txgs TXGs. */ - if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 && + if (zfs_scan_report_txgs != 0 && tx->tx_txg % zfs_scan_report_txgs == 0) { scn->scn_issued_before_pass += spa->spa_scan_pass_issued; spa_scan_stat_init(spa); @@ -4564,6 +4565,15 @@ count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all) all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); } +static void +count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all) +{ + if (BP_IS_EMBEDDED(bp)) + return; + atomic_add_64(&scn->scn_phys.scn_skipped, + all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0])); +} + static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { @@ -4709,7 +4719,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, count_block(dp->dp_blkstats, bp); if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); return (0); } @@ -4750,7 +4760,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (needs_io && !zfs_no_scrub_io) { dsl_scan_enqueue(dp, bp, zio_flags, zb); } else { - count_block_issued(spa, bp, B_TRUE); + count_block_skipped(scn, bp, B_TRUE); } /* do not relocate this block */ @@ -5119,9 +5129,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); range_tree_remove_fill(queue->q_exts_by_addr, start, size); - /* count the block as though we issued it */ + /* count the block as though we skipped it */ sio2bp(sio, &tmpbp); - count_block_issued(spa, &tmpbp, B_FALSE); + count_block_skipped(scn, &tmpbp, B_FALSE); sio_free(sio); } diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 9ef948e9e4..06f6407690 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare, - sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + sizeof (zio_t), offsetof(zio_t, io_queue_node.a)); } avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); @@ -2611,7 +2611,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; - 
ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_skipped = scn->scn_phys.scn_skipped; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; diff --git a/module/zfs/txg.c b/module/zfs/txg.c index ec61cabcaa..a67c043446 100644 --- a/module/zfs/txg.c +++ b/module/zfs/txg.c @@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl) boolean_t txg_all_lists_empty(txg_list_t *tl) { - mutex_enter(&tl->tl_lock); - for (int i = 0; i < TXG_SIZE; i++) { - if (!txg_list_empty_impl(tl, i)) { - mutex_exit(&tl->tl_lock); - return (B_FALSE); - } - } - mutex_exit(&tl->tl_lock); - return (B_TRUE); + boolean_t res = B_TRUE; + for (int i = 0; i < TXG_SIZE; i++) + res &= (tl->tl_head[i] == NULL); + return (res); } /* diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index 612e66c3a8..30551feb63 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); - for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { - vsx->vsx_active_queue[t] = - vd->vdev_queue.vq_class[t].vqc_active; - vsx->vsx_pend_queue[t] = avl_numnodes( - &vd->vdev_queue.vq_class[t].vqc_queued_tree); + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t]; + vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t); } } } @@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag) vdev_queue_t *vq = &vd->vdev_queue; mutex_enter(&vq->vq_lock); - if (avl_numnodes(&vq->vq_active_tree) > 0) { + if (vq->vq_active > 0) { spa_t *spa = vd->vdev_spa; zio_t *fio; uint64_t delta; - zfs_dbgmsg("slow vdev: %s has %lu active IOs", - vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + zfs_dbgmsg("slow vdev: %s has %u active IOs", + vd->vdev_path, vq->vq_active); /* * Look at the head of all the pending queues, * if any I/O has been outstanding for longer than * the spa_deadman_synctime invoke the deadman logic. */ - fio = avl_first(&vq->vq_active_tree); + fio = list_head(&vq->vq_active_list); delta = gethrtime() - fio->io_timestamp; if (delta > spa_deadman_synctime(spa)) zio_deadman(fio, tag); diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index abb7d0662b..08d918467d 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300; */ uint_t zfs_vdev_def_queue_depth = 32; -/* - * Allow TRIM I/Os to be aggregated. This should normally not be needed since - * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted - * by the TRIM code in zfs_trim.c. 
- */ -static uint_t zfs_vdev_aggregate_trim = 0; - static int vdev_queue_offset_compare(const void *x1, const void *x2) { @@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2) return (TREE_PCMP(z1, z2)); } -static inline avl_tree_t * -vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) -{ - return (&vq->vq_class[p].vqc_queued_tree); -} - -static inline avl_tree_t * -vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) -{ - ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); - if (t == ZIO_TYPE_READ) - return (&vq->vq_read_offset_tree); - else if (t == ZIO_TYPE_WRITE) - return (&vq->vq_write_offset_tree); - else - return (&vq->vq_trim_offset_tree); -} +#define VDQ_T_SHIFT 29 static int -vdev_queue_timestamp_compare(const void *x1, const void *x2) +vdev_queue_to_compare(const void *x1, const void *x2) { const zio_t *z1 = (const zio_t *)x1; const zio_t *z2 = (const zio_t *)x2; - int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, + z2->io_timestamp >> VDQ_T_SHIFT); + int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); + int cmp = tcmp ? tcmp : ocmp; - if (likely(cmp)) + if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) return (cmp); return (TREE_PCMP(z1, z2)); } +static inline boolean_t +vdev_queue_class_fifo(zio_priority_t p) +{ + return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM); +} + +static void +vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + vq->vq_cqueued |= 1U << p; + if (vdev_queue_class_fifo(p)) + list_insert_tail(&vq->vq_class[p].vqc_list, zio); + else + avl_add(&vq->vq_class[p].vqc_tree, zio); +} + +static void +vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio) +{ + zio_priority_t p = zio->io_priority; + uint32_t empty; + if (vdev_queue_class_fifo(p)) { + list_t *list = &vq->vq_class[p].vqc_list; + list_remove(list, zio); + empty = list_is_empty(list); + } else { + avl_tree_t *tree = &vq->vq_class[p].vqc_tree; + avl_remove(tree, zio); + empty = avl_is_empty(tree); + } + vq->vq_cqueued &= ~(empty << p); +} + static uint_t vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p) { @@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa) } static uint_t -vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) +vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p) { switch (p) { case ZIO_PRIORITY_SYNC_READ: @@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) case ZIO_PRIORITY_ASYNC_READ: return (zfs_vdev_async_read_max_active); case ZIO_PRIORITY_ASYNC_WRITE: - return (vdev_queue_max_async_writes(spa)); + return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); case ZIO_PRIORITY_SCRUB: if (vq->vq_ia_active > 0) { return (MIN(vq->vq_nia_credit, @@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p) static zio_priority_t vdev_queue_class_to_issue(vdev_queue_t *vq) { - spa_t *spa = vq->vq_vdev->vdev_spa; - zio_priority_t p, n; + uint32_t cq = vq->vq_cqueued; + zio_priority_t p, p1; - if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) return (ZIO_PRIORITY_NUM_QUEUEABLE); /* @@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * Do round-robin to reduce starvation due to zfs_vdev_max_active * and vq_nia_credit limits. 
*/ - for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) { - p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE; - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_min_active(vq, p)) { - vq->vq_last_prio = p; - return (p); - } + p1 = vq->vq_last_prio + 1; + if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE) + p1 = 0; + for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; + } + for (p = 0; p < p1; p++) { + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_min_active(vq, p)) + goto found; } /* @@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq) * maximum # outstanding i/os. */ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && - vq->vq_class[p].vqc_active < - vdev_queue_class_max_active(spa, vq, p)) { - vq->vq_last_prio = p; - return (p); - } + if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < + vdev_queue_class_max_active(vq, p)) + break; } - /* No eligible queued i/os */ - return (ZIO_PRIORITY_NUM_QUEUEABLE); +found: + vq->vq_last_prio = p; + return (p); } void @@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd) vdev_queue_t *vq = &vd->vdev_queue; zio_priority_t p; - mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); vq->vq_vdev = vd; - taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); - - avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); - avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), - vdev_queue_offset_compare, sizeof (zio_t), - offsetof(struct zio, io_offset_node)); for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { - int (*compfn) (const void *, const void *); - - /* - * The synchronous/trim i/o queues are dispatched in FIFO rather - * than LBA order. This provides more consistent latency for - * these i/os. 
- */ - if (p == ZIO_PRIORITY_SYNC_READ || - p == ZIO_PRIORITY_SYNC_WRITE || - p == ZIO_PRIORITY_TRIM) { - compfn = vdev_queue_timestamp_compare; + if (vdev_queue_class_fifo(p)) { + list_create(&vq->vq_class[p].vqc_list, + sizeof (zio_t), + offsetof(struct zio, io_queue_node.l)); } else { - compfn = vdev_queue_offset_compare; + avl_create(&vq->vq_class[p].vqc_tree, + vdev_queue_to_compare, sizeof (zio_t), + offsetof(struct zio, io_queue_node.a)); } - avl_create(vdev_queue_class_tree(vq, p), compfn, - sizeof (zio_t), offsetof(struct zio, io_queue_node)); } + avl_create(&vq->vq_read_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(&vq->vq_write_offset_tree, + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); vq->vq_last_offset = 0; + list_create(&vq->vq_active_list, sizeof (struct zio), + offsetof(struct zio, io_queue_node.l)); + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); } void @@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; - for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) - avl_destroy(vdev_queue_class_tree(vq, p)); - avl_destroy(&vq->vq_active_tree); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); - avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (vdev_queue_class_fifo(p)) + list_destroy(&vq->vq_class[p].vqc_list); + else + avl_destroy(&vq->vq_class[p].vqc_tree); + } + avl_destroy(&vq->vq_read_offset_tree); + avl_destroy(&vq->vq_write_offset_tree); + list_destroy(&vq->vq_active_list); mutex_destroy(&vq->vq_lock); } static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + zio->io_queue_state = ZIO_QS_QUEUED; + vdev_queue_class_add(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_add(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_add(&vq->vq_write_offset_tree, zio); } static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { - ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); - avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + vdev_queue_class_remove(vq, zio); + if (zio->io_type == ZIO_TYPE_READ) + avl_remove(&vq->vq_read_offset_tree, zio); + else if (zio->io_type == ZIO_TYPE_WRITE) + avl_remove(&vq->vq_write_offset_tree, zio); + zio->io_queue_state = ZIO_QS_NONE; } static boolean_t @@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active++; + vq->vq_cactive[zio->io_priority]++; + vq->vq_active++; if (vdev_queue_is_interactive(zio->io_priority)) { if (++vq->vq_ia_active == 1) vq->vq_nia_credit = 1; } else if (vq->vq_ia_active > 0) { vq->vq_nia_credit--; } - avl_add(&vq->vq_active_tree, zio); + zio->io_queue_state = ZIO_QS_ACTIVE; + list_insert_tail(&vq->vq_active_list, zio); } static void @@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); - vq->vq_class[zio->io_priority].vqc_active--; + 
vq->vq_cactive[zio->io_priority]--; + vq->vq_active--; if (vdev_queue_is_interactive(zio->io_priority)) { if (--vq->vq_ia_active == 0) vq->vq_nia_credit = 0; @@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) vq->vq_nia_credit = zfs_vdev_nia_credit; } else if (vq->vq_ia_active == 0) vq->vq_nia_credit++; - avl_remove(&vq->vq_active_tree, zio); + list_remove(&vq->vq_active_list, zio); + zio->io_queue_state = ZIO_QS_NONE; } static void @@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) uint64_t maxgap = 0; uint64_t size; uint64_t limit; - int maxblocksize; boolean_t stretch = B_FALSE; - avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); - zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; uint64_t next_offset; abd_t *abd; + avl_tree_t *t; + + /* + * TRIM aggregation should not be needed since code in zfs_trim.c can + * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M). + */ + if (zio->io_type == ZIO_TYPE_TRIM) + return (NULL); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) + return (NULL); - maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); if (vq->vq_vdev->vdev_nonrot) limit = zfs_vdev_aggregation_limit_non_rotating; else limit = zfs_vdev_aggregation_limit; - limit = MIN(limit, maxblocksize); - - if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) - return (NULL); - - /* - * While TRIM commands could be aggregated based on offset this - * behavior is disabled until it's determined to be beneficial. - */ - if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + if (limit == 0) return (NULL); + limit = MIN(limit, SPA_MAXBLOCKSIZE); /* * I/Os to distributed spares are directly dispatched to the dRAID @@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) first = last = zio; - if (zio->io_type == ZIO_TYPE_READ) + if (zio->io_type == ZIO_TYPE_READ) { maxgap = zfs_vdev_read_gap_limit; + t = &vq->vq_read_offset_tree; + } else { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + t = &vq->vq_write_offset_tree; + } /* * We can aggregate I/Os that are sufficiently adjacent and of @@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) * Walk backwards through sufficiently contiguous I/Os * recording the last non-optional I/O. */ + zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; while ((dio = AVL_PREV(t, first)) != NULL && (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && IO_SPAN(dio, last) <= limit && @@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && (IO_SPAN(first, dio) <= limit || (dio->io_flags & ZIO_FLAG_OPTIONAL)) && - IO_SPAN(first, dio) <= maxblocksize && + IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE && IO_GAP(last, dio) <= maxgap && dio->io_type == zio->io_type) { last = dio; @@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) return (NULL); size = IO_SPAN(first, last); - ASSERT3U(size, <=, maxblocksize); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); abd = abd_alloc_gang(); if (abd == NULL) @@ -824,19 +847,30 @@ again: return (NULL); } - /* - * For LBA-ordered queues (async / scrub / initializing), issue the - * i/o which follows the most recently issued i/o in LBA (offset) order. - * - * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. 
- */ - tree = vdev_queue_class_tree(vq, p); - vq->vq_io_search.io_timestamp = 0; - vq->vq_io_search.io_offset = vq->vq_last_offset - 1; - VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); - zio = avl_nearest(tree, idx, AVL_AFTER); - if (zio == NULL) - zio = avl_first(tree); + if (vdev_queue_class_fifo(p)) { + zio = list_head(&vq->vq_class[p].vqc_list); + } else { + /* + * For LBA-ordered queues (async / scrub / initializing), + * issue the I/O which follows the most recently issued I/O + * in LBA (offset) order, but to avoid starvation only within + * the same 0.5 second interval as the first I/O. + */ + tree = &vq->vq_class[p].vqc_tree; + zio = aio = avl_first(tree); + if (zio->io_offset < vq->vq_last_offset) { + vq->vq_io_search.io_timestamp = zio->io_timestamp; + vq->vq_io_search.io_offset = vq->vq_last_offset; + zio = avl_find(tree, &vq->vq_io_search, &idx); + if (zio == NULL) { + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL || + (zio->io_timestamp >> VDQ_T_SHIFT) != + (aio->io_timestamp >> VDQ_T_SHIFT)) + zio = aio; + } + } + } ASSERT3U(zio->io_priority, ==, p); aio = vdev_queue_aggregate(vq, zio); @@ -967,7 +1001,6 @@ void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) { vdev_queue_t *vq = &zio->io_vd->vdev_queue; - avl_tree_t *tree; /* * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio @@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * Otherwise, the zio is currently active and we cannot change its * priority. */ - tree = vdev_queue_class_tree(vq, zio->io_priority); - if (avl_find(tree, zio, NULL) == zio) { - avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + if (zio->io_queue_state == ZIO_QS_QUEUED) { + vdev_queue_class_remove(vq, zio); zio->io_priority = priority; - avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); - } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + vdev_queue_class_add(vq, zio); + } else if (zio->io_queue_state == ZIO_QS_NONE) { zio->io_priority = priority; } @@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) * vq_lock mutex use here, instead we prefer to keep it lock free for * performance. 
*/ -int +uint32_t vdev_queue_length(vdev_t *vd) { - return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); + return (vd->vdev_queue.vq_active); } uint64_t @@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd) return (vd->vdev_queue.vq_last_offset); } +uint64_t +vdev_queue_class_length(vdev_t *vd, zio_priority_t p) +{ + vdev_queue_t *vq = &vd->vdev_queue; + if (vdev_queue_class_fifo(p)) + return (list_is_empty(&vq->vq_class[p].vqc_list) == 0); + else + return (avl_numnodes(&vq->vq_class[p].vqc_tree)); +} + ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW, "Max vdev I/O aggregation size"); ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT, ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media"); -ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW, - "Allow TRIM I/O to be aggregated"); - ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW, "Aggregate read I/O over gap"); diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c index 62aa61b3b9..75c3900cbb 100644 --- a/module/zfs/vdev_rebuild.c +++ b/module/zfs/vdev_rebuild.c @@ -571,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) vdev_rebuild_blkptr_init(&blk, vd, start, size); uint64_t psize = BP_GET_PSIZE(&blk); - if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) + if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { + vr->vr_pass_bytes_skipped += size; return (0); + } mutex_enter(&vr->vr_io_lock); @@ -786,6 +788,7 @@ vdev_rebuild_thread(void *arg) vr->vr_pass_start_time = gethrtime(); vr->vr_pass_bytes_scanned = 0; vr->vr_pass_bytes_issued = 0; + vr->vr_pass_bytes_skipped = 0; uint64_t update_est_time = gethrtime(); vdev_rebuild_update_bytes_est(vd, 0); @@ -1153,6 +1156,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) vr->vr_pass_start_time); vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; mutex_exit(&tvd->vdev_rebuild_lock); } diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 86706469ac..7bdcc16393 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) return (SET_ERROR(EINVAL)); } - const uint64_t max_blksz = zfsvfs->z_max_blksz; - /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. - * Skip this if uio contains loaned arc_buf. */ - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { + ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1); + if (zfs_uio_prefaultpages(pfbytes, uio)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EFAULT)); } @@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; } + uint64_t blksz; + if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) { + if (zp->z_blksz > zfsvfs->z_max_blksz && + !ISP2(zp->z_blksz)) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
+ */ + blksz = 1 << highbit64(zp->z_blksz); + } else { + blksz = zfsvfs->z_max_blksz; + } + blksz = MIN(blksz, P2ROUNDUP(end_size, + SPA_MINBLOCKSIZE)); + blksz = MAX(blksz, zp->z_blksz); + } else { + blksz = zp->z_blksz; + } + arc_buf_t *abuf = NULL; - if (n >= max_blksz && woff >= zp->z_size && - P2PHASE(woff, max_blksz) == 0 && - zp->z_blksz == max_blksz) { + ssize_t nbytes = n; + if (n >= blksz && woff >= zp->z_size && + P2PHASE(woff, blksz) == 0 && + (blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter @@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ - size_t cbytes; - abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), - max_blksz); + blksz); ASSERT(abuf != NULL); - ASSERT(arc_buf_size(abuf) == max_blksz); - if ((error = zfs_uiocopy(abuf->b_data, max_blksz, - UIO_WRITE, uio, &cbytes))) { + ASSERT(arc_buf_size(abuf) == blksz); + if ((error = zfs_uiocopy(abuf->b_data, blksz, + UIO_WRITE, uio, &nbytes))) { dmu_return_arcbuf(abuf); break; } - ASSERT3S(cbytes, ==, max_blksz); + ASSERT3S(nbytes, ==, blksz); + } else { + nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) - + P2PHASE(woff, blksz)); + if (pfbytes < nbytes) { + if (zfs_uio_prefaultpages(nbytes, uio)) { + error = SET_ERROR(EFAULT); + break; + } + pfbytes = nbytes; + } } /* @@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); - dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, - MIN(n, max_blksz)); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); @@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * shrink down lr_length to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { - uint64_t new_blksz; - - if (zp->z_blksz > max_blksz) { - /* - * File's blocksize is already larger than the - * "recordsize" property. Only let it grow to - * the next power of 2. - */ - ASSERT(!ISP2(zp->z_blksz)); - new_blksz = MIN(end_size, - 1 << highbit64(zp->z_blksz)); - } else { - new_blksz = MIN(end_size, max_blksz); - } - zfs_grow_blocksize(zp, new_blksz, tx); + zfs_grow_blocksize(zp, blksz, tx); zfs_rangelock_reduce(lr, woff, n); } - /* - * XXX - should we really limit each write to z_max_blksz? - * Perhaps we should use SPA_MAXBLOCKSIZE chunks? - */ - const ssize_t nbytes = - MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - ssize_t tx_bytes; if (abuf == NULL) { tx_bytes = zfs_uio_resid(uio); @@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) * zfs_uio_prefaultpages, or prefaultpages may * error, and we may break the loop early. 
*/ - if (tx_bytes != zfs_uio_resid(uio)) - n -= tx_bytes - zfs_uio_resid(uio); - if (zfs_uio_prefaultpages(MIN(n, max_blksz), - uio)) { - break; - } + n -= tx_bytes - zfs_uio_resid(uio); + pfbytes -= tx_bytes - zfs_uio_resid(uio); continue; } #endif @@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) } tx_bytes -= zfs_uio_resid(uio); } else { - /* Implied by abuf != NULL: */ - ASSERT3S(n, >=, max_blksz); - ASSERT0(P2PHASE(woff, max_blksz)); - /* - * We can simplify nbytes to MIN(n, max_blksz) since - * P2PHASE(woff, max_blksz) is 0, and knowing - * n >= max_blksz lets us simplify further: - */ - ASSERT3S(nbytes, ==, max_blksz); /* * Thus, we're writing a full block at a block-aligned * offset and extending the file past EOF. @@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr) break; ASSERT3S(tx_bytes, ==, nbytes); n -= nbytes; - - if (n > 0) { - if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) { - error = SET_ERROR(EFAULT); - break; - } - } + pfbytes -= nbytes; } zfs_znode_update_vfs(zp); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index ee8dcce3b3..00d66a2481 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -798,8 +798,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb) { ASSERT(MUTEX_HELD(&zilog->zl_lock)); ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); - ASSERT(list_is_empty(&lwb->lwb_waiters)); - ASSERT(list_is_empty(&lwb->lwb_itxs)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_itxs)); ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); ASSERT3P(lwb->lwb_write_zio, ==, NULL); ASSERT3P(lwb->lwb_root_zio, ==, NULL); @@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_move_tail(&itxs, &lwb->lwb_itxs); list_move_tail(&waiters, &lwb->lwb_waiters); + txg = lwb->lwb_issued_txg; ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); lwb->lwb_state = LWB_STATE_FLUSH_DONE; @@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio) list_destroy(&waiters); mutex_enter(&zilog->zl_lwb_io_lock); - txg = lwb->lwb_issued_txg; ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0); zilog->zl_lwb_inflight[txg & TXG_MASK]--; if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0) @@ -2525,10 +2525,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg) * This function will traverse the queue of itxs that need to be * committed, and move them onto the ZIL's zl_itx_commit_list. */ -static void +static uint64_t zil_get_commit_list(zilog_t *zilog) { - uint64_t otxg, txg; + uint64_t otxg, txg, wtxg = 0; list_t *commit_list = &zilog->zl_itx_commit_list; ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); @@ -2562,10 +2562,22 @@ zil_get_commit_list(zilog_t *zilog) */ ASSERT(zilog_is_dirty_in_txg(zilog, txg) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); - list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); + list_t *sync_list = &itxg->itxg_itxs->i_sync_list; + if (unlikely(zilog->zl_suspend > 0)) { + /* + * ZIL was just suspended, but we lost the race. + * Allow all earlier itxs to be committed, but ask + * caller to do txg_wait_synced(txg) for any new. + */ + if (!list_is_empty(sync_list)) + wtxg = MAX(wtxg, txg); + } else { + list_move_tail(commit_list, sync_list); + } mutex_exit(&itxg->itxg_lock); } + return (wtxg); } /* @@ -2953,11 +2965,12 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs) * not issued, we rely on future calls to zil_commit_writer() to issue * the lwb, or the timeout mechanism found in zil_commit_waiter(). 
*/ -static void +static uint64_t zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) { list_t ilwbs; lwb_t *lwb; + uint64_t wtxg = 0; ASSERT(!MUTEX_HELD(&zilog->zl_lock)); ASSERT(spa_writeable(zilog->zl_spa)); @@ -2987,7 +3000,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw) ZIL_STAT_BUMP(zilog, zil_commit_writer_count); - zil_get_commit_list(zilog); + wtxg = zil_get_commit_list(zilog); zil_prune_commit_list(zilog); zil_process_commit_list(zilog, zcw, &ilwbs); @@ -2996,6 +3009,7 @@ out: while ((lwb = list_remove_head(&ilwbs)) != NULL) zil_lwb_write_issue(zilog, lwb); list_destroy(&ilwbs); + return (wtxg); } static void @@ -3511,7 +3525,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) zil_commit_waiter_t *zcw = zil_alloc_commit_waiter(); zil_commit_itx_assign(zilog, zcw); - zil_commit_writer(zilog, zcw); + uint64_t wtxg = zil_commit_writer(zilog, zcw); zil_commit_waiter(zilog, zcw); if (zcw->zcw_zio_error != 0) { @@ -3526,6 +3540,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid) DTRACE_PROBE2(zil__commit__io__error, zilog_t *, zilog, zil_commit_waiter_t *, zcw); txg_wait_synced(zilog->zl_dmu_pool, 0); + } else if (wtxg != 0) { + txg_wait_synced(zilog->zl_dmu_pool, wtxg); } zil_free_commit_waiter(zcw); @@ -3905,11 +3921,13 @@ zil_suspend(const char *osname, void **cookiep) return (error); zilog = dmu_objset_zil(os); + mutex_enter(&zilog->zl_issuer_lock); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } @@ -3923,6 +3941,7 @@ zil_suspend(const char *osname, void **cookiep) if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); + mutex_exit(&zilog->zl_issuer_lock); dmu_objset_rele(os, suspend_tag); return (0); } @@ -3931,6 +3950,7 @@ zil_suspend(const char *osname, void **cookiep) dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; + mutex_exit(&zilog->zl_issuer_lock); if (zilog->zl_suspend > 1) { /* diff --git a/module/zfs/zio.c b/module/zfs/zio.c index fb8164f0ae..10279fde89 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -626,8 +626,6 @@ zio_unique_parent(zio_t *cio) void zio_add_child(zio_t *pio, zio_t *cio) { - zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); - /* * Logical I/Os can have logical, gang, or vdev children. * Gang I/Os can have gang or vdev children. @@ -636,6 +634,7 @@ zio_add_child(zio_t *pio, zio_t *cio) */ ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); zl->zl_parent = pio; zl->zl_child = cio; @@ -644,8 +643,9 @@ zio_add_child(zio_t *pio, zio_t *cio) ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + uint64_t *countp = pio->io_children[cio->io_child_type]; for (int w = 0; w < ZIO_WAIT_TYPES; w++) - pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + countp[w] += !cio->io_state[w]; list_insert_head(&pio->io_child_list, zl); list_insert_head(&cio->io_parent_list, zl); @@ -654,6 +654,37 @@ zio_add_child(zio_t *pio, zio_t *cio) mutex_exit(&pio->io_lock); } +void +zio_add_child_first(zio_t *pio, zio_t *cio) +{ + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
+ */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + zl->zl_parent = pio; + zl->zl_child = cio; + + ASSERT(list_is_empty(&cio->io_parent_list)); + list_insert_head(&cio->io_parent_list, zl); + + mutex_enter(&pio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + uint64_t *countp = pio->io_children[cio->io_child_type]; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + countp[w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + + mutex_exit(&pio->io_lock); +} + static void zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { @@ -840,12 +871,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = (blkptr_t *)bp; - zio->io_bp_copy = *bp; - zio->io_bp_orig = *bp; if (type != ZIO_TYPE_WRITE || - zio->io_child_type == ZIO_CHILD_DDT) + zio->io_child_type == ZIO_CHILD_DDT) { + zio->io_bp_copy = *bp; zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + } else { + zio->io_bp = (blkptr_t *)bp; + } + zio->io_bp_orig = *bp; if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) @@ -880,7 +913,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, zio->io_logical = pio->io_logical; if (zio->io_child_type == ZIO_CHILD_GANG) zio->io_gang_leader = pio->io_gang_leader; - zio_add_child(pio, zio); + zio_add_child_first(pio, zio); } taskq_init_ent(&zio->io_tqent); @@ -1601,7 +1634,6 @@ zio_read_bp_init(zio_t *zio) abd_return_buf_copy(zio->io_abd, data, psize); } else { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); } if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) @@ -4442,8 +4474,10 @@ zio_ready(zio_t *zio) zio->io_ready(zio); } +#ifdef ZFS_DEBUG if (bp != NULL && bp != &zio->io_bp_copy) zio->io_bp_copy = *bp; +#endif if (zio->io_error != 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;