Merge pull request #143 from truenas/truenas/zfs-2.2-release-test

Merge openzfs/zfs-2.2-release to truenas/zfs-2.2-release
Alexander Motin 2023-07-04 15:07:33 -04:00 committed by GitHub
commit 586731645f
34 changed files with 529 additions and 328 deletions

META
View File

@ -1,8 +1,8 @@
Meta: 1
Name: zfs
Branch: 1.0
Version: 2.1.99
Release: 1
Version: 2.2.0
Release: rc1
Release-Tags: relext
License: CDDL
Author: OpenZFS

View File

@ -794,7 +794,7 @@ usage(void)
"\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
"\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
"\t%s [-v] <bookmark>\n"
"\t%s -C [-A] [-U <cache>]\n"
"\t%s -C [-A] [-U <cache>] [<poolname>]\n"
"\t%s -l [-Aqu] <device>\n"
"\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
"[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"

View File

@ -6057,8 +6057,8 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
if (p != NULL)
rid = p->pw_uid;
else if (*endch != '\0') {
(void) snprintf(errbuf, 256, gettext(
"invalid user %s\n"), curr);
(void) snprintf(errbuf, sizeof (errbuf),
gettext("invalid user %s\n"), curr);
allow_usage(un, B_TRUE, errbuf);
}
} else if (opts->group) {
@ -6071,8 +6071,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
if (g != NULL)
rid = g->gr_gid;
else if (*endch != '\0') {
(void) snprintf(errbuf, 256, gettext(
"invalid group %s\n"), curr);
(void) snprintf(errbuf, sizeof (errbuf),
gettext("invalid group %s\n"),
curr);
allow_usage(un, B_TRUE, errbuf);
}
} else {
@ -6097,8 +6098,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
who_type = ZFS_DELEG_GROUP;
rid = g->gr_gid;
} else {
(void) snprintf(errbuf, 256, gettext(
"invalid user/group %s\n"), curr);
(void) snprintf(errbuf, sizeof (errbuf),
gettext("invalid user/group %s\n"),
curr);
allow_usage(un, B_TRUE, errbuf);
}
}

View File

@ -7662,11 +7662,11 @@ static void
print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
uint64_t pass_scanned, scanned, pass_issued, issued, total;
uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i;
uint64_t elapsed, scan_rate, issue_rate;
double fraction_done;
char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
char srate_buf[7], irate_buf[7], time_buf[32];
char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7];
char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32];
printf(" ");
printf_color(ANSI_BOLD, gettext("scan:"));
@ -7738,10 +7738,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
pass_scanned = ps->pss_pass_exam;
issued = ps->pss_issued;
pass_issued = ps->pss_pass_issued;
total = ps->pss_to_examine;
total_s = ps->pss_to_examine;
total_i = ps->pss_to_examine - ps->pss_skipped;
/* we are only done with a block once we have issued the IO for it */
fraction_done = (double)issued / total;
fraction_done = (double)issued / total_i;
/* elapsed time for this pass, rounding up to 1 if it's 0 */
elapsed = time(NULL) - ps->pss_pass_start;
@ -7750,26 +7751,25 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
scan_rate = pass_scanned / elapsed;
issue_rate = pass_issued / elapsed;
uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
((total - issued) / issue_rate) : UINT64_MAX;
secs_to_dhms(total_secs_left, time_buf);
/* format all of the numbers we will be reporting */
zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
zfs_nicebytes(total, total_buf, sizeof (total_buf));
zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf));
zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf));
/* do not print estimated time if we have a paused scrub */
if (pause == 0) {
(void) printf(gettext("\t%s scanned at %s/s, "
"%s issued at %s/s, %s total\n"),
scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
} else {
(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
scanned_buf, issued_buf, total_buf);
(void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf);
if (pause == 0 && scan_rate > 0) {
zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
(void) printf(gettext(" at %s/s"), srate_buf);
}
(void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf);
if (pause == 0 && issue_rate > 0) {
zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
(void) printf(gettext(" at %s/s"), irate_buf);
}
(void) printf(gettext("\n"));
if (is_resilver) {
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
@ -7782,16 +7782,16 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
if (pause == 0) {
/*
* Only provide an estimate iff:
* 1) the time remaining is valid, and
* 1) we haven't yet issued all we expected, and
* 2) the issue rate exceeds 10 MB/s, and
* 3) it's either:
* a) a resilver which has started repairs, or
* b) a scrub which has entered the issue phase.
*/
if (total_secs_left != UINT64_MAX &&
issue_rate >= 10 * 1024 * 1024 &&
if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 &&
((is_resilver && ps->pss_processed > 0) ||
(is_scrub && issued > 0))) {
secs_to_dhms((total_i - issued) / issue_rate, time_buf);
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
@ -7803,7 +7803,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
}
static void
print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name)
{
if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
return;
@ -7815,17 +7815,20 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
uint64_t bytes_issued = vrs->vrs_bytes_issued;
uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
uint64_t bytes_est = vrs->vrs_bytes_est;
uint64_t bytes_est_s = vrs->vrs_bytes_est;
uint64_t bytes_est_i = vrs->vrs_bytes_est;
if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8)
bytes_est_i -= vrs->vrs_pass_bytes_skipped;
uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
(vrs->vrs_pass_time_ms + 1)) * 1000;
uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
(vrs->vrs_pass_time_ms + 1)) * 1000;
double scan_pct = MIN((double)bytes_scanned * 100 /
(bytes_est + 1), 100);
(bytes_est_s + 1), 100);
/* Format all of the numbers we will be reporting */
char bytes_scanned_buf[7], bytes_issued_buf[7];
char bytes_rebuilt_buf[7], bytes_est_buf[7];
char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7];
char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
sizeof (bytes_scanned_buf));
@ -7833,9 +7836,8 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
sizeof (bytes_issued_buf));
zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
sizeof (bytes_rebuilt_buf));
zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf));
zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf));
time_t start = vrs->vrs_start_time;
time_t end = vrs->vrs_end_time;
@ -7858,17 +7860,29 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
MAX(scan_rate, 1), time_buf);
(void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf,
bytes_est_s_buf);
if (scan_rate > 0) {
zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
(void) printf(gettext(" at %s/s"), scan_rate_buf);
}
(void) printf(gettext(", %s / %s issued"), bytes_issued_buf,
bytes_est_i_buf);
if (issue_rate > 0) {
zfs_nicebytes(issue_rate, issue_rate_buf,
sizeof (issue_rate_buf));
(void) printf(gettext(" at %s/s"), issue_rate_buf);
}
(void) printf(gettext("\n"));
(void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
"%s total\n"), bytes_scanned_buf, scan_rate_buf,
bytes_issued_buf, issue_rate_buf, bytes_est_buf);
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
bytes_rebuilt_buf, scan_pct);
if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
if (scan_rate >= 10 * 1024 * 1024) {
if (bytes_est_s >= bytes_scanned &&
scan_rate >= 10 * 1024 * 1024) {
secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate,
time_buf);
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
@ -7900,7 +7914,7 @@ print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
char *name = zpool_vdev_name(g_zfs, zhp,
child[c], VDEV_NAME_TYPE_ID);
print_rebuild_status_impl(vrs, name);
print_rebuild_status_impl(vrs, i, name);
free(name);
}
}
@ -8005,13 +8019,15 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
active_resilver = (ps->pss_state == DSS_SCANNING);
}
have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
scrub_start = ps->pss_start_time;
have_errorscrub = (ps->pss_error_scrub_func ==
POOL_SCAN_ERRORSCRUB);
errorscrub_start = ps->pss_error_scrub_start;
if (c > offsetof(pool_scan_stat_t,
pss_pass_error_scrub_pause) / 8) {
have_errorscrub = (ps->pss_error_scrub_func ==
POOL_SCAN_ERRORSCRUB);
errorscrub_start = ps->pss_error_scrub_start;
}
}
boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);

View File

@ -238,6 +238,7 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name)
print_kv("end_ts", ps->pss_end_time);
print_kv(",errors", ps->pss_errors);
print_kv(",examined", examined);
print_kv(",skipped", ps->pss_skipped);
print_kv(",issued", ps->pss_issued);
print_kv(",pass_examined", pass_exam);
print_kv(",pass_issued", ps->pss_pass_issued);
@ -249,7 +250,6 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name)
print_kv(",remaining_t", remaining_time);
print_kv(",start_ts", ps->pss_start_time);
print_kv(",to_examine", ps->pss_to_examine);
print_kv(",to_process", ps->pss_to_process);
printf(" %llu\n", (u_longlong_t)timestamp);
return (0);
}

View File

@ -1,3 +1,9 @@
openzfs-linux (2.2.0-0) unstable; urgency=medium
* Merge tag zfs-2.2.0
-- Ameer Hamza <ahamza@ixsystems.com> Mon, 03 Jul 2023 10:00:00 -0500
openzfs-linux (2.1.99-1) unstable; urgency=medium
* Merge tag zfs-2.1.99.

View File

@ -36,7 +36,7 @@ install() {
{ dfatal "Failed to install essential binaries"; exit 1; }
# Adapted from https://github.com/zbm-dev/zfsbootmenu
if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then
if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then
# On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s
if command -v gcc-config >/dev/null; then
inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" ||

View File

@ -1,3 +1,9 @@
openzfs (2.2.0-0) unstable; urgency=medium
* Merge tag zfs-2.2.0
-- Ameer Hamza <ahamza@ixsystems.com> Mon, 03 Jul 2023 10:00:00 -0500
openzfs (2.1.99-1) unstable; urgency=medium
* Merge tag zfs-2.1.99.

View File

@ -104,6 +104,7 @@ typedef struct taskq {
/* list node for the cpu hotplug callback */
struct hlist_node tq_hp_cb_node;
boolean_t tq_hp_support;
unsigned long lastshouldstop; /* when to purge dynamic */
} taskq_t;
typedef struct taskq_ent {

View File

@ -61,7 +61,7 @@ typedef struct dsl_scan_phys {
uint64_t scn_end_time;
uint64_t scn_to_examine; /* total bytes to be scanned */
uint64_t scn_examined; /* bytes scanned so far */
uint64_t scn_to_process;
uint64_t scn_skipped; /* bytes skipped by scanner */
uint64_t scn_processed;
uint64_t scn_errors; /* scan I/O error count */
uint64_t scn_ddt_class_max;

View File

@ -1088,7 +1088,7 @@ typedef struct pool_scan_stat {
uint64_t pss_end_time; /* scan end time */
uint64_t pss_to_examine; /* total bytes to scan */
uint64_t pss_examined; /* total bytes located by scanner */
uint64_t pss_to_process; /* total bytes to process */
uint64_t pss_skipped; /* total bytes skipped by scanner */
uint64_t pss_processed; /* total processed bytes */
uint64_t pss_errors; /* scan errors */
@ -1152,6 +1152,7 @@ typedef struct vdev_rebuild_stat {
uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */
uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */
} vdev_rebuild_stat_t;
/*

View File

@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
extern int vdev_queue_length(vdev_t *vd);
extern uint32_t vdev_queue_length(vdev_t *vd);
extern uint64_t vdev_queue_last_offset(vdev_t *vd);
extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);

View File

@ -130,27 +130,24 @@ typedef const struct vdev_ops {
/*
* Virtual device properties
*/
typedef struct vdev_queue_class {
uint32_t vqc_active;
/*
* Sorted by offset or timestamp, depending on if the queue is
* LBA-ordered vs FIFO.
*/
avl_tree_t vqc_queued_tree;
typedef union vdev_queue_class {
list_t vqc_list;
avl_tree_t vqc_tree;
} vdev_queue_class_t;
struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
avl_tree_t vq_trim_offset_tree;
uint64_t vq_last_offset;
zio_priority_t vq_last_prio; /* Last sent I/O priority. */
uint32_t vq_cqueued; /* Classes with queued I/Os. */
uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
uint32_t vq_active; /* Number of active I/Os. */
uint32_t vq_ia_active; /* Active interactive I/Os. */
uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
list_t vq_active_list; /* List of active I/Os. */
hrtime_t vq_io_complete_ts; /* time last i/o completed */
hrtime_t vq_io_delta_ts;
zio_t vq_io_search; /* used as local for stack reduction */

View File

@ -79,6 +79,7 @@ typedef struct vdev_rebuild {
uint64_t vr_pass_start_time;
uint64_t vr_pass_bytes_scanned;
uint64_t vr_pass_bytes_issued;
uint64_t vr_pass_bytes_skipped;
/* On-disk state updated by vdev_rebuild_zap_update_sync() */
vdev_rebuild_phys_t vr_rebuild_phys;

View File

@ -341,9 +341,9 @@ typedef struct zio_prop {
enum zio_checksum zp_checksum;
enum zio_compress zp_compress;
uint8_t zp_complevel;
dmu_object_type_t zp_type;
uint8_t zp_level;
uint8_t zp_copies;
dmu_object_type_t zp_type;
boolean_t zp_dedup;
boolean_t zp_dedup_verify;
boolean_t zp_nopwrite;
@ -436,6 +436,12 @@ typedef struct zio_link {
list_node_t zl_child_node;
} zio_link_t;
enum zio_qstate {
ZIO_QS_NONE = 0,
ZIO_QS_QUEUED,
ZIO_QS_ACTIVE,
};
struct zio {
/* Core information about this I/O */
zbookmark_phys_t io_bookmark;
@ -479,6 +485,12 @@ struct zio {
const zio_vsd_ops_t *io_vsd_ops;
metaslab_class_t *io_metaslab_class; /* dva throttle class */
enum zio_qstate io_queue_state; /* vdev queue state */
union {
list_node_t l;
avl_node_t a;
} io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
avl_node_t io_offset_node; /* vdev offset queues */
uint64_t io_offset;
hrtime_t io_timestamp; /* submitted at */
hrtime_t io_queued_timestamp;
@ -486,9 +498,6 @@ struct zio {
hrtime_t io_delta; /* vdev queue service delta */
hrtime_t io_delay; /* Device access time (disk or */
/* file). */
avl_node_t io_queue_node;
avl_node_t io_offset_node;
avl_node_t io_alloc_node;
zio_alloc_list_t io_alloc_list;
/* Internal pipeline state */
@ -602,6 +611,7 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
extern void zio_add_child_first(zio_t *pio, zio_t *cio);
extern void *zio_buf_alloc(size_t size);
extern void zio_buf_free(void *buf, size_t size);

View File

@ -1789,7 +1789,8 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
nvlist_t *nvl;
int nvl_len = 0;
int added_resv = 0;
zfs_prop_t prop = 0;
zfs_prop_t prop;
boolean_t nsprop = B_FALSE;
nvpair_t *elem;
(void) snprintf(errbuf, sizeof (errbuf),
@ -1836,6 +1837,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
elem = nvlist_next_nvpair(nvl, elem)) {
prop = zfs_name_to_prop(nvpair_name(elem));
nsprop |= zfs_is_namespace_prop(prop);
assert(cl_idx < nvl_len);
/*
@ -1934,8 +1936,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
* if one of the options handled by the generic
* Linux namespace layer has been modified.
*/
if (zfs_is_namespace_prop(prop) &&
zfs_is_mounted(zhp, NULL))
if (nsprop && zfs_is_mounted(zhp, NULL))
ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0);
}
}

View File

@ -193,4 +193,19 @@ The proc file will walk the lists with lock held,
so reading it could cause a lock-up if the list grows too large
without limiting the output.
"(truncated)" will be shown if the list is larger than the limit.
.
.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
(Linux-only)
How long a taskq has to have had no work before we tear it down.
Previously, we would tear down a dynamic taskq worker as soon
as we noticed it had no work, but it was observed that this led
to a lot of churn in tearing down things we then immediately
spawned anew.
In practice, it seems any nonzero value will remove the vast
majority of this churn, while the nontrivially larger value
was chosen to help filter out the little remaining churn on
a mostly idle system.
Setting this value to
.Sy 0
will revert to the previous behavior.
.El
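
The tunable above is an ordinary writable module parameter (declared with mode 0644 later in this diff), so it can be inspected and adjusted at runtime. A minimal sketch, assuming a Linux system with the spl module loaded and the usual /sys/module parameter layout:

# cat /sys/module/spl/parameters/spl_taskq_thread_timeout_ms
10000
# echo 0 > /sys/module/spl/parameters/spl_taskq_thread_timeout_ms    # restore the old immediate-teardown behavior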

View File

@ -239,6 +239,16 @@ relative to the pool.
Make some blocks above a certain size be gang blocks.
This option is used by the test suite to facilitate testing.
.
.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP data block size as a power of 2. Note that changing this after
creating a DDT on the pool will not affect existing DDTs, only newly created
ones.
.
.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
Default DDT ZAP indirect block size as a power of 2. Note that changing this
after creating a DDT on the pool will not affect existing DDTs, only newly
created ones.
.
.It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int
Default dnode block size as a power of 2.
.
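
The two DDT ZAP tunables added above are block shifts, not byte counts: the default of 15 means 2^15 = 32 KiB blocks, matching the "32 KiB" annotation. A minimal sketch of raising the data block size for DDTs created after the change, assuming the parameter is exposed on Linux under the usual module-parameter paths with the name documented above:

# echo 'options zfs zfs_ddt_zap_default_bs=17' >> /etc/modprobe.d/zfs.conf    # 2^17 = 128 KiB at next module load
# echo 17 > /sys/module/zfs/parameters/zfs_ddt_zap_default_bs                 # or at runtime (the parameter is ZMOD_RW)

As both entries note, existing DDT objects keep the block sizes they were created with.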
@ -2016,12 +2026,6 @@ Historical statistics for this many latest TXGs will be available in
Flush dirty data to disk at least every this many seconds (maximum TXG
duration).
.
.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
Allow TRIM I/O operations to be aggregated.
This is normally not helpful because the extents to be trimmed
will have already been aggregated by the metaslab.
This option is provided for debugging and performance analysis.
.
.It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
Max vdev I/O aggregation size.
.

View File

@ -14,7 +14,7 @@
.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
.\" Copyright (c) 2017 Intel Corporation.
.\"
.Dd June 4, 2023
.Dd June 27, 2023
.Dt ZDB 8
.Os
.
@ -51,6 +51,7 @@
.Fl C
.Op Fl A
.Op Fl U Ar cache
.Op Ar poolname
.Nm
.Fl E
.Op Fl A

View File

@ -87,13 +87,13 @@ currently in use by another subsystem.
However this check is not robust enough
to detect simultaneous attempts to use a new device in different pools, even if
.Sy multihost Ns = Sy enabled .
The administrator must ensure, that simultaneous invocations of any combination
The administrator must ensure that simultaneous invocations of any combination
of
.Nm zpool Cm replace ,
.Nm zpool Cm create ,
.Nm zpool Cm add ,
or
.Nm zpool Cm labelclear ,
.Nm zpool Cm labelclear
do not refer to the same device.
Using the same device in two pools will result in pool corruption.
.Pp

View File

@ -26,7 +26,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd July 25, 2021
.Dd June 22, 2023
.Dt ZPOOL-SCRUB 8
.Os
.
@ -123,7 +123,7 @@ Status of pool with ongoing scrub:
.No # Nm zpool Cm status
...
scan: scrub in progress since Sun Jul 25 16:07:49 2021
403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total
403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s
0B repaired, 16.91% done, 00:00:04 to go
...
.Ed
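
The updated example reflects the new progress accounting in print_scan_scrub_resilver_status() earlier in this diff: scanned and issued bytes are each reported against their own totals, and the completion estimate is driven by the issue side, roughly:

    total_i       = pss_to_examine - pss_skipped
    fraction_done = issued / total_i        (68.4M / 405M ≈ 16.9% in the example above)

so blocks the scanner skipped no longer inflate the denominator.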

View File

@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0;
module_param(spl_taskq_thread_bind, int, 0644);
MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
static uint_t spl_taskq_thread_timeout_ms = 10000;
/* BEGIN CSTYLED */
module_param(spl_taskq_thread_timeout_ms, uint, 0644);
/* END CSTYLED */
MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
"Time to require a dynamic thread be idle before it gets cleaned up");
static int spl_taskq_thread_dynamic = 1;
module_param(spl_taskq_thread_dynamic, int, 0444);
@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
tqt_thread_list) == tqt)
return (0);
return
int no_work =
((tq->tq_nspawn == 0) && /* No threads are being spawned */
(tq->tq_nactive == 0) && /* No threads are handling tasks */
(tq->tq_nthreads > 1) && /* More than 1 thread is running */
(!taskq_next_ent(tq)) && /* There are no pending tasks */
(spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
/*
* If we would have said stop before, let's instead wait a bit, maybe
* we'll see more work come our way soon...
*/
if (no_work) {
/* if it's 0, we want the old behavior. */
/* if the taskq is being torn down, we also want to go away. */
if (spl_taskq_thread_timeout_ms == 0 ||
!(tq->tq_flags & TASKQ_ACTIVE))
return (1);
unsigned long lasttime = tq->lastshouldstop;
if (lasttime > 0) {
if (time_after(jiffies, lasttime +
msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
return (1);
else
return (0);
} else {
tq->lastshouldstop = jiffies;
}
} else {
tq->lastshouldstop = 0;
}
return (0);
}
static int
@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
tq->tq_flags = (flags | TASKQ_ACTIVE);
tq->tq_next_id = TASKQID_INITIAL;
tq->tq_lowest_id = TASKQID_INITIAL;
tq->lastshouldstop = 0;
INIT_LIST_HEAD(&tq->tq_free_list);
INIT_LIST_HEAD(&tq->tq_pend_list);
INIT_LIST_HEAD(&tq->tq_prio_list);

View File

@ -54,7 +54,7 @@ static unsigned int zvol_prefetch_bytes = (128 * 1024);
static unsigned long zvol_max_discard_blocks = 16384;
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
static const unsigned int zvol_open_timeout_ms = 1000;
static unsigned int zvol_open_timeout_ms = 1000;
#endif
static unsigned int zvol_threads = 0;
@ -1612,4 +1612,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
"Process volblocksize blocks per thread");
#endif
#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
module_param(zvol_open_timeout_ms, uint, 0644);
MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
#endif
/* END CSTYLED */

View File

@ -1209,10 +1209,19 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
ASSERT3S(dde->dde_class, <, DDT_CLASSES);
ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
if (ddp->ddp_refcnt == 0) {
/* This should never happen? */
ddt_phys_fill(ddp, bp);
}
/*
* This entry already existed (dde_type is real), so it must
* have refcnt >0 at the start of this txg. We are called from
* brt_pending_apply(), before frees are issued, so the refcnt
* can't be lowered yet. Therefore, it must be >0. We assert
* this because if the order of BRT and DDT interactions were
* ever to change and the refcnt was ever zero here, then
* likely further action is required to fill out the DDT entry,
* and this is a place that is likely to be missed in testing.
*/
ASSERT3U(ddp->ddp_refcnt, >, 0);
ddt_phys_addref(ddp);
result = B_TRUE;
} else {

View File

@ -31,8 +31,8 @@
#include <sys/zap.h>
#include <sys/dmu_tx.h>
static const int ddt_zap_leaf_blockshift = 12;
static const int ddt_zap_indirect_blockshift = 12;
static unsigned int ddt_zap_default_bs = 15;
static unsigned int ddt_zap_default_ibs = 15;
static int
ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
@ -43,7 +43,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
flags |= ZAP_FLAG_PRE_HASHED_KEY;
*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
ddt_zap_default_bs, ddt_zap_default_ibs,
DMU_OT_NONE, 0, tx);
return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
@ -166,3 +166,10 @@ const ddt_ops_t ddt_zap_ops = {
ddt_zap_walk,
ddt_zap_count,
};
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW,
"DDT ZAP leaf blockshift");
ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW,
"DDT ZAP indirect blockshift");
/* END CSTYLED */

View File

@ -573,7 +573,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
* counter to how far we've scanned. We know we're consistent
* up to here.
*/
scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
scn->scn_phys.scn_skipped;
if (dsl_scan_is_running(scn) &&
spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
@ -4362,7 +4363,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
* Disabled by default, set zfs_scan_report_txgs to report
* average performance over the last zfs_scan_report_txgs TXGs.
*/
if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
if (zfs_scan_report_txgs != 0 &&
tx->tx_txg % zfs_scan_report_txgs == 0) {
scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
spa_scan_stat_init(spa);
@ -4564,6 +4565,15 @@ count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
}
static void
count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
{
if (BP_IS_EMBEDDED(bp))
return;
atomic_add_64(&scn->scn_phys.scn_skipped,
all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
}
static void
count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
{
@ -4709,7 +4719,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
count_block(dp->dp_blkstats, bp);
if (phys_birth <= scn->scn_phys.scn_min_txg ||
phys_birth >= scn->scn_phys.scn_max_txg) {
count_block_issued(spa, bp, B_TRUE);
count_block_skipped(scn, bp, B_TRUE);
return (0);
}
@ -4750,7 +4760,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
if (needs_io && !zfs_no_scrub_io) {
dsl_scan_enqueue(dp, bp, zio_flags, zb);
} else {
count_block_issued(spa, bp, B_TRUE);
count_block_skipped(scn, bp, B_TRUE);
}
/* do not relocate this block */
@ -5119,9 +5129,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
/* count the block as though we issued it */
/* count the block as though we skipped it */
sio2bp(sio, &tmpbp);
count_block_issued(spa, &tmpbp, B_FALSE);
count_block_skipped(scn, &tmpbp, B_FALSE);
sio_free(sio);
}

View File

@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
NULL);
avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));
sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
}
avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@ -2611,7 +2611,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
ps->pss_end_time = scn->scn_phys.scn_end_time;
ps->pss_to_examine = scn->scn_phys.scn_to_examine;
ps->pss_examined = scn->scn_phys.scn_examined;
ps->pss_to_process = scn->scn_phys.scn_to_process;
ps->pss_skipped = scn->scn_phys.scn_skipped;
ps->pss_processed = scn->scn_phys.scn_processed;
ps->pss_errors = scn->scn_phys.scn_errors;

View File

@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
boolean_t
txg_all_lists_empty(txg_list_t *tl)
{
mutex_enter(&tl->tl_lock);
for (int i = 0; i < TXG_SIZE; i++) {
if (!txg_list_empty_impl(tl, i)) {
mutex_exit(&tl->tl_lock);
return (B_FALSE);
}
}
mutex_exit(&tl->tl_lock);
return (B_TRUE);
boolean_t res = B_TRUE;
for (int i = 0; i < TXG_SIZE; i++)
res &= (tl->tl_head[i] == NULL);
return (res);
}
/*

View File

@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
vsx->vsx_active_queue[t] =
vd->vdev_queue.vq_class[t].vqc_active;
vsx->vsx_pend_queue[t] = avl_numnodes(
&vd->vdev_queue.vq_class[t].vqc_queued_tree);
for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
}
}
}
@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
if (avl_numnodes(&vq->vq_active_tree) > 0) {
if (vq->vq_active > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
zfs_dbgmsg("slow vdev: %s has %lu active IOs",
vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
zfs_dbgmsg("slow vdev: %s has %u active IOs",
vd->vdev_path, vq->vq_active);
/*
* Look at the head of all the pending queues,
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime invoke the deadman logic.
*/
fio = avl_first(&vq->vq_active_tree);
fio = list_head(&vq->vq_active_list);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa))
zio_deadman(fio, tag);

View File

@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300;
*/
uint_t zfs_vdev_def_queue_depth = 32;
/*
* Allow TRIM I/Os to be aggregated. This should normally not be needed since
* TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
* by the TRIM code in zfs_trim.c.
*/
static uint_t zfs_vdev_aggregate_trim = 0;
static int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (TREE_PCMP(z1, z2));
}
static inline avl_tree_t *
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
{
return (&vq->vq_class[p].vqc_queued_tree);
}
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
if (t == ZIO_TYPE_READ)
return (&vq->vq_read_offset_tree);
else if (t == ZIO_TYPE_WRITE)
return (&vq->vq_write_offset_tree);
else
return (&vq->vq_trim_offset_tree);
}
#define VDQ_T_SHIFT 29
static int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
vdev_queue_to_compare(const void *x1, const void *x2)
{
const zio_t *z1 = (const zio_t *)x1;
const zio_t *z2 = (const zio_t *)x2;
int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
z2->io_timestamp >> VDQ_T_SHIFT);
int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
int cmp = tcmp ? tcmp : ocmp;
if (likely(cmp))
if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
return (cmp);
return (TREE_PCMP(z1, z2));
}
static inline boolean_t
vdev_queue_class_fifo(zio_priority_t p)
{
return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
p == ZIO_PRIORITY_TRIM);
}
static void
vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
{
zio_priority_t p = zio->io_priority;
vq->vq_cqueued |= 1U << p;
if (vdev_queue_class_fifo(p))
list_insert_tail(&vq->vq_class[p].vqc_list, zio);
else
avl_add(&vq->vq_class[p].vqc_tree, zio);
}
static void
vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
{
zio_priority_t p = zio->io_priority;
uint32_t empty;
if (vdev_queue_class_fifo(p)) {
list_t *list = &vq->vq_class[p].vqc_list;
list_remove(list, zio);
empty = list_is_empty(list);
} else {
avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
avl_remove(tree, zio);
empty = avl_is_empty(tree);
}
vq->vq_cqueued &= ~(empty << p);
}
static uint_t
vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
{
@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa)
}
static uint_t
vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
{
switch (p) {
case ZIO_PRIORITY_SYNC_READ:
@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
case ZIO_PRIORITY_ASYNC_READ:
return (zfs_vdev_async_read_max_active);
case ZIO_PRIORITY_ASYNC_WRITE:
return (vdev_queue_max_async_writes(spa));
return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
case ZIO_PRIORITY_SCRUB:
if (vq->vq_ia_active > 0) {
return (MIN(vq->vq_nia_credit,
@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
spa_t *spa = vq->vq_vdev->vdev_spa;
zio_priority_t p, n;
uint32_t cq = vq->vq_cqueued;
zio_priority_t p, p1;
if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
return (ZIO_PRIORITY_NUM_QUEUEABLE);
/*
@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* Do round-robin to reduce starvation due to zfs_vdev_max_active
* and vq_nia_credit limits.
*/
for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_min_active(vq, p)) {
vq->vq_last_prio = p;
return (p);
}
p1 = vq->vq_last_prio + 1;
if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
p1 = 0;
for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
vdev_queue_class_min_active(vq, p))
goto found;
}
for (p = 0; p < p1; p++) {
if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
vdev_queue_class_min_active(vq, p))
goto found;
}
/*
@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_max_active(spa, vq, p)) {
vq->vq_last_prio = p;
return (p);
}
if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
vdev_queue_class_max_active(vq, p))
break;
}
/* No eligible queued i/os */
return (ZIO_PRIORITY_NUM_QUEUEABLE);
found:
vq->vq_last_prio = p;
return (p);
}
void
@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;
zio_priority_t p;
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
vq->vq_vdev = vd;
taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);
/*
* The synchronous/trim i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
if (p == ZIO_PRIORITY_SYNC_READ ||
p == ZIO_PRIORITY_SYNC_WRITE ||
p == ZIO_PRIORITY_TRIM) {
compfn = vdev_queue_timestamp_compare;
if (vdev_queue_class_fifo(p)) {
list_create(&vq->vq_class[p].vqc_list,
sizeof (zio_t),
offsetof(struct zio, io_queue_node.l));
} else {
compfn = vdev_queue_offset_compare;
avl_create(&vq->vq_class[p].vqc_tree,
vdev_queue_to_compare, sizeof (zio_t),
offsetof(struct zio, io_queue_node.a));
}
avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
avl_create(&vq->vq_read_offset_tree,
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(&vq->vq_write_offset_tree,
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
vq->vq_last_offset = 0;
list_create(&vq->vq_active_list, sizeof (struct zio),
offsetof(struct zio, io_queue_node.l));
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
}
void
@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
avl_destroy(vdev_queue_class_tree(vq, p));
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (vdev_queue_class_fifo(p))
list_destroy(&vq->vq_class[p].vqc_list);
else
avl_destroy(&vq->vq_class[p].vqc_tree);
}
avl_destroy(&vq->vq_read_offset_tree);
avl_destroy(&vq->vq_write_offset_tree);
list_destroy(&vq->vq_active_list);
mutex_destroy(&vq->vq_lock);
}
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
zio->io_queue_state = ZIO_QS_QUEUED;
vdev_queue_class_add(vq, zio);
if (zio->io_type == ZIO_TYPE_READ)
avl_add(&vq->vq_read_offset_tree, zio);
else if (zio->io_type == ZIO_TYPE_WRITE)
avl_add(&vq->vq_write_offset_tree, zio);
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
vdev_queue_class_remove(vq, zio);
if (zio->io_type == ZIO_TYPE_READ)
avl_remove(&vq->vq_read_offset_tree, zio);
else if (zio->io_type == ZIO_TYPE_WRITE)
avl_remove(&vq->vq_write_offset_tree, zio);
zio->io_queue_state = ZIO_QS_NONE;
}
static boolean_t
@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active++;
vq->vq_cactive[zio->io_priority]++;
vq->vq_active++;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (++vq->vq_ia_active == 1)
vq->vq_nia_credit = 1;
} else if (vq->vq_ia_active > 0) {
vq->vq_nia_credit--;
}
avl_add(&vq->vq_active_tree, zio);
zio->io_queue_state = ZIO_QS_ACTIVE;
list_insert_tail(&vq->vq_active_list, zio);
}
static void
@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
ASSERT(MUTEX_HELD(&vq->vq_lock));
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
vq->vq_class[zio->io_priority].vqc_active--;
vq->vq_cactive[zio->io_priority]--;
vq->vq_active--;
if (vdev_queue_is_interactive(zio->io_priority)) {
if (--vq->vq_ia_active == 0)
vq->vq_nia_credit = 0;
@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
vq->vq_nia_credit = zfs_vdev_nia_credit;
} else if (vq->vq_ia_active == 0)
vq->vq_nia_credit++;
avl_remove(&vq->vq_active_tree, zio);
list_remove(&vq->vq_active_list, zio);
zio->io_queue_state = ZIO_QS_NONE;
}
static void
@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
uint64_t limit;
int maxblocksize;
boolean_t stretch = B_FALSE;
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
uint64_t next_offset;
abd_t *abd;
avl_tree_t *t;
/*
* TRIM aggregation should not be needed since code in zfs_trim.c can
* submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
*/
if (zio->io_type == ZIO_TYPE_TRIM)
return (NULL);
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
if (vq->vq_vdev->vdev_nonrot)
limit = zfs_vdev_aggregation_limit_non_rotating;
else
limit = zfs_vdev_aggregation_limit;
limit = MIN(limit, maxblocksize);
if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
return (NULL);
/*
* While TRIM commands could be aggregated based on offset this
* behavior is disabled until it's determined to be beneficial.
*/
if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
if (limit == 0)
return (NULL);
limit = MIN(limit, SPA_MAXBLOCKSIZE);
/*
* I/Os to distributed spares are directly dispatched to the dRAID
@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
first = last = zio;
if (zio->io_type == ZIO_TYPE_READ)
if (zio->io_type == ZIO_TYPE_READ) {
maxgap = zfs_vdev_read_gap_limit;
t = &vq->vq_read_offset_tree;
} else {
ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
t = &vq->vq_write_offset_tree;
}
/*
* We can aggregate I/Os that are sufficiently adjacent and of
@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
* Walk backwards through sufficiently contiguous I/Os
* recording the last non-optional I/O.
*/
zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
while ((dio = AVL_PREV(t, first)) != NULL &&
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
IO_SPAN(dio, last) <= limit &&
@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
(dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
(IO_SPAN(first, dio) <= limit ||
(dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
IO_SPAN(first, dio) <= maxblocksize &&
IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
IO_GAP(last, dio) <= maxgap &&
dio->io_type == zio->io_type) {
last = dio;
@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
return (NULL);
size = IO_SPAN(first, last);
ASSERT3U(size, <=, maxblocksize);
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
abd = abd_alloc_gang();
if (abd == NULL)
@ -824,19 +847,30 @@ again:
return (NULL);
}
/*
* For LBA-ordered queues (async / scrub / initializing), issue the
* i/o which follows the most recently issued i/o in LBA (offset) order.
*
* For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
*/
tree = vdev_queue_class_tree(vq, p);
vq->vq_io_search.io_timestamp = 0;
vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
zio = avl_first(tree);
if (vdev_queue_class_fifo(p)) {
zio = list_head(&vq->vq_class[p].vqc_list);
} else {
/*
* For LBA-ordered queues (async / scrub / initializing),
* issue the I/O which follows the most recently issued I/O
* in LBA (offset) order, but to avoid starvation only within
* the same 0.5 second interval as the first I/O.
*/
tree = &vq->vq_class[p].vqc_tree;
zio = aio = avl_first(tree);
if (zio->io_offset < vq->vq_last_offset) {
vq->vq_io_search.io_timestamp = zio->io_timestamp;
vq->vq_io_search.io_offset = vq->vq_last_offset;
zio = avl_find(tree, &vq->vq_io_search, &idx);
if (zio == NULL) {
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL ||
(zio->io_timestamp >> VDQ_T_SHIFT) !=
(aio->io_timestamp >> VDQ_T_SHIFT))
zio = aio;
}
}
}
ASSERT3U(zio->io_priority, ==, p);
aio = vdev_queue_aggregate(vq, zio);
@ -967,7 +1001,6 @@ void
vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
avl_tree_t *tree;
/*
* ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* Otherwise, the zio is currently active and we cannot change its
* priority.
*/
tree = vdev_queue_class_tree(vq, zio->io_priority);
if (avl_find(tree, zio, NULL) == zio) {
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
if (zio->io_queue_state == ZIO_QS_QUEUED) {
vdev_queue_class_remove(vq, zio);
zio->io_priority = priority;
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
vdev_queue_class_add(vq, zio);
} else if (zio->io_queue_state == ZIO_QS_NONE) {
zio->io_priority = priority;
}
@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
* vq_lock mutex use here, instead we prefer to keep it lock free for
* performance.
*/
int
uint32_t
vdev_queue_length(vdev_t *vd)
{
return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
return (vd->vdev_queue.vq_active);
}
uint64_t
@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd)
return (vd->vdev_queue.vq_last_offset);
}
uint64_t
vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
{
vdev_queue_t *vq = &vd->vdev_queue;
if (vdev_queue_class_fifo(p))
return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
else
return (avl_numnodes(&vq->vq_class[p].vqc_tree));
}
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
"Max vdev I/O aggregation size");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
"Allow TRIM I/O to be aggregated");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
"Aggregate read I/O over gap");

View File

@ -571,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
vdev_rebuild_blkptr_init(&blk, vd, start, size);
uint64_t psize = BP_GET_PSIZE(&blk);
if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
vr->vr_pass_bytes_skipped += size;
return (0);
}
mutex_enter(&vr->vr_io_lock);
@ -786,6 +788,7 @@ vdev_rebuild_thread(void *arg)
vr->vr_pass_start_time = gethrtime();
vr->vr_pass_bytes_scanned = 0;
vr->vr_pass_bytes_issued = 0;
vr->vr_pass_bytes_skipped = 0;
uint64_t update_est_time = gethrtime();
vdev_rebuild_update_bytes_est(vd, 0);
@ -1153,6 +1156,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
vr->vr_pass_start_time);
vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
mutex_exit(&tvd->vdev_rebuild_lock);
}

View File

@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
return (SET_ERROR(EINVAL));
}
const uint64_t max_blksz = zfsvfs->z_max_blksz;
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
* Skip this if uio contains loaned arc_buf.
*/
if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
if (zfs_uio_prefaultpages(pfbytes, uio)) {
zfs_exit(zfsvfs, FTAG);
return (SET_ERROR(EFAULT));
}
@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break;
}
uint64_t blksz;
if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
if (zp->z_blksz > zfsvfs->z_max_blksz &&
!ISP2(zp->z_blksz)) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
blksz = 1 << highbit64(zp->z_blksz);
} else {
blksz = zfsvfs->z_max_blksz;
}
blksz = MIN(blksz, P2ROUNDUP(end_size,
SPA_MINBLOCKSIZE));
blksz = MAX(blksz, zp->z_blksz);
} else {
blksz = zp->z_blksz;
}
arc_buf_t *abuf = NULL;
if (n >= max_blksz && woff >= zp->z_size &&
P2PHASE(woff, max_blksz) == 0 &&
zp->z_blksz == max_blksz) {
ssize_t nbytes = n;
if (n >= blksz && woff >= zp->z_size &&
P2PHASE(woff, blksz) == 0 &&
(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
/*
* This write covers a full block. "Borrow" a buffer
* from the dmu so that we can fill it before we enter
@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* holding up the transaction if the data copy hangs
* up on a pagefault (e.g., from an NFS server mapping).
*/
size_t cbytes;
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
max_blksz);
blksz);
ASSERT(abuf != NULL);
ASSERT(arc_buf_size(abuf) == max_blksz);
if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
UIO_WRITE, uio, &cbytes))) {
ASSERT(arc_buf_size(abuf) == blksz);
if ((error = zfs_uiocopy(abuf->b_data, blksz,
UIO_WRITE, uio, &nbytes))) {
dmu_return_arcbuf(abuf);
break;
}
ASSERT3S(cbytes, ==, max_blksz);
ASSERT3S(nbytes, ==, blksz);
} else {
nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
P2PHASE(woff, blksz));
if (pfbytes < nbytes) {
if (zfs_uio_prefaultpages(nbytes, uio)) {
error = SET_ERROR(EFAULT);
break;
}
pfbytes = nbytes;
}
}
/*
@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
DB_DNODE_ENTER(db);
dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
MIN(n, max_blksz));
dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
DB_DNODE_EXIT(db);
zfs_sa_upgrade_txholds(tx, zp);
error = dmu_tx_assign(tx, TXG_WAIT);
@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* shrink down lr_length to the appropriate size.
*/
if (lr->lr_length == UINT64_MAX) {
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
/*
* File's blocksize is already larger than the
* "recordsize" property. Only let it grow to
* the next power of 2.
*/
ASSERT(!ISP2(zp->z_blksz));
new_blksz = MIN(end_size,
1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
zfs_grow_blocksize(zp, new_blksz, tx);
zfs_grow_blocksize(zp, blksz, tx);
zfs_rangelock_reduce(lr, woff, n);
}
/*
* XXX - should we really limit each write to z_max_blksz?
* Perhaps we should use SPA_MAXBLOCKSIZE chunks?
*/
const ssize_t nbytes =
MIN(n, max_blksz - P2PHASE(woff, max_blksz));
ssize_t tx_bytes;
if (abuf == NULL) {
tx_bytes = zfs_uio_resid(uio);
@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
* zfs_uio_prefaultpages, or prefaultpages may
* error, and we may break the loop early.
*/
if (tx_bytes != zfs_uio_resid(uio))
n -= tx_bytes - zfs_uio_resid(uio);
if (zfs_uio_prefaultpages(MIN(n, max_blksz),
uio)) {
break;
}
n -= tx_bytes - zfs_uio_resid(uio);
pfbytes -= tx_bytes - zfs_uio_resid(uio);
continue;
}
#endif
@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
}
tx_bytes -= zfs_uio_resid(uio);
} else {
/* Implied by abuf != NULL: */
ASSERT3S(n, >=, max_blksz);
ASSERT0(P2PHASE(woff, max_blksz));
/*
* We can simplify nbytes to MIN(n, max_blksz) since
* P2PHASE(woff, max_blksz) is 0, and knowing
* n >= max_blksz lets us simplify further:
*/
ASSERT3S(nbytes, ==, max_blksz);
/*
* Thus, we're writing a full block at a block-aligned
* offset and extending the file past EOF.
@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
break;
ASSERT3S(tx_bytes, ==, nbytes);
n -= nbytes;
if (n > 0) {
if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
error = SET_ERROR(EFAULT);
break;
}
}
pfbytes -= nbytes;
}
zfs_znode_update_vfs(zp);

View File

@ -798,8 +798,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
ASSERT(MUTEX_HELD(&zilog->zl_lock));
ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
ASSERT(list_is_empty(&lwb->lwb_waiters));
ASSERT(list_is_empty(&lwb->lwb_itxs));
VERIFY(list_is_empty(&lwb->lwb_waiters));
VERIFY(list_is_empty(&lwb->lwb_itxs));
ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
ASSERT3P(lwb->lwb_write_zio, ==, NULL);
ASSERT3P(lwb->lwb_root_zio, ==, NULL);
@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
list_move_tail(&itxs, &lwb->lwb_itxs);
list_move_tail(&waiters, &lwb->lwb_waiters);
txg = lwb->lwb_issued_txg;
ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
lwb->lwb_state = LWB_STATE_FLUSH_DONE;
@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
list_destroy(&waiters);
mutex_enter(&zilog->zl_lwb_io_lock);
txg = lwb->lwb_issued_txg;
ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
zilog->zl_lwb_inflight[txg & TXG_MASK]--;
if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)
@ -2525,10 +2525,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
* This function will traverse the queue of itxs that need to be
* committed, and move them onto the ZIL's zl_itx_commit_list.
*/
static void
static uint64_t
zil_get_commit_list(zilog_t *zilog)
{
uint64_t otxg, txg;
uint64_t otxg, txg, wtxg = 0;
list_t *commit_list = &zilog->zl_itx_commit_list;
ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
@ -2562,10 +2562,22 @@ zil_get_commit_list(zilog_t *zilog)
*/
ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
if (unlikely(zilog->zl_suspend > 0)) {
/*
* ZIL was just suspended, but we lost the race.
* Allow all earlier itxs to be committed, but ask
* caller to do txg_wait_synced(txg) for any new.
*/
if (!list_is_empty(sync_list))
wtxg = MAX(wtxg, txg);
} else {
list_move_tail(commit_list, sync_list);
}
mutex_exit(&itxg->itxg_lock);
}
return (wtxg);
}
/*
@ -2953,11 +2965,12 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
* not issued, we rely on future calls to zil_commit_writer() to issue
* the lwb, or the timeout mechanism found in zil_commit_waiter().
*/
static void
static uint64_t
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
list_t ilwbs;
lwb_t *lwb;
uint64_t wtxg = 0;
ASSERT(!MUTEX_HELD(&zilog->zl_lock));
ASSERT(spa_writeable(zilog->zl_spa));
@ -2987,7 +3000,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
ZIL_STAT_BUMP(zilog, zil_commit_writer_count);
zil_get_commit_list(zilog);
wtxg = zil_get_commit_list(zilog);
zil_prune_commit_list(zilog);
zil_process_commit_list(zilog, zcw, &ilwbs);
@ -2996,6 +3009,7 @@ out:
while ((lwb = list_remove_head(&ilwbs)) != NULL)
zil_lwb_write_issue(zilog, lwb);
list_destroy(&ilwbs);
return (wtxg);
}
static void
@ -3511,7 +3525,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
zil_commit_itx_assign(zilog, zcw);
zil_commit_writer(zilog, zcw);
uint64_t wtxg = zil_commit_writer(zilog, zcw);
zil_commit_waiter(zilog, zcw);
if (zcw->zcw_zio_error != 0) {
@ -3526,6 +3540,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
DTRACE_PROBE2(zil__commit__io__error,
zilog_t *, zilog, zil_commit_waiter_t *, zcw);
txg_wait_synced(zilog->zl_dmu_pool, 0);
} else if (wtxg != 0) {
txg_wait_synced(zilog->zl_dmu_pool, wtxg);
}
zil_free_commit_waiter(zcw);
@ -3905,11 +3921,13 @@ zil_suspend(const char *osname, void **cookiep)
return (error);
zilog = dmu_objset_zil(os);
mutex_enter(&zilog->zl_issuer_lock);
mutex_enter(&zilog->zl_lock);
zh = zilog->zl_header;
if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */
mutex_exit(&zilog->zl_lock);
mutex_exit(&zilog->zl_issuer_lock);
dmu_objset_rele(os, suspend_tag);
return (SET_ERROR(EBUSY));
}
@ -3923,6 +3941,7 @@ zil_suspend(const char *osname, void **cookiep)
if (cookiep == NULL && !zilog->zl_suspending &&
(zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
mutex_exit(&zilog->zl_lock);
mutex_exit(&zilog->zl_issuer_lock);
dmu_objset_rele(os, suspend_tag);
return (0);
}
@ -3931,6 +3950,7 @@ zil_suspend(const char *osname, void **cookiep)
dsl_pool_rele(dmu_objset_pool(os), suspend_tag);
zilog->zl_suspend++;
mutex_exit(&zilog->zl_issuer_lock);
if (zilog->zl_suspend > 1) {
/*

View File

@ -626,8 +626,6 @@ zio_unique_parent(zio_t *cio)
void
zio_add_child(zio_t *pio, zio_t *cio)
{
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
@ -636,6 +634,7 @@ zio_add_child(zio_t *pio, zio_t *cio)
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
@ -644,8 +643,9 @@ zio_add_child(zio_t *pio, zio_t *cio)
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
uint64_t *countp = pio->io_children[cio->io_child_type];
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
countp[w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
list_insert_head(&cio->io_parent_list, zl);
@ -654,6 +654,37 @@ zio_add_child(zio_t *pio, zio_t *cio)
mutex_exit(&pio->io_lock);
}
void
zio_add_child_first(zio_t *pio, zio_t *cio)
{
/*
* Logical I/Os can have logical, gang, or vdev children.
* Gang I/Os can have gang or vdev children.
* Vdev I/Os can only have vdev children.
* The following ASSERT captures all of these constraints.
*/
ASSERT3S(cio->io_child_type, <=, pio->io_child_type);
zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
zl->zl_parent = pio;
zl->zl_child = cio;
ASSERT(list_is_empty(&cio->io_parent_list));
list_insert_head(&cio->io_parent_list, zl);
mutex_enter(&pio->io_lock);
ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
uint64_t *countp = pio->io_children[cio->io_child_type];
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
countp[w] += !cio->io_state[w];
list_insert_head(&pio->io_child_list, zl);
mutex_exit(&pio->io_lock);
}
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
@ -840,12 +871,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_child_type = ZIO_CHILD_LOGICAL;
if (bp != NULL) {
zio->io_bp = (blkptr_t *)bp;
zio->io_bp_copy = *bp;
zio->io_bp_orig = *bp;
if (type != ZIO_TYPE_WRITE ||
zio->io_child_type == ZIO_CHILD_DDT)
zio->io_child_type == ZIO_CHILD_DDT) {
zio->io_bp_copy = *bp;
zio->io_bp = &zio->io_bp_copy; /* so caller can free */
} else {
zio->io_bp = (blkptr_t *)bp;
}
zio->io_bp_orig = *bp;
if (zio->io_child_type == ZIO_CHILD_LOGICAL)
zio->io_logical = zio;
if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
@ -880,7 +913,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
zio->io_logical = pio->io_logical;
if (zio->io_child_type == ZIO_CHILD_GANG)
zio->io_gang_leader = pio->io_gang_leader;
zio_add_child(pio, zio);
zio_add_child_first(pio, zio);
}
taskq_init_ent(&zio->io_tqent);
@ -1601,7 +1634,6 @@ zio_read_bp_init(zio_t *zio)
abd_return_buf_copy(zio->io_abd, data, psize);
} else {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
}
if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
@ -4442,8 +4474,10 @@ zio_ready(zio_t *zio)
zio->io_ready(zio);
}
#ifdef ZFS_DEBUG
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
#endif
if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;