Merge pull request #143 from truenas/truenas/zfs-2.2-release-test
Merge openzfs/zfs-2.2-release to truenas/zfs-2.2-release

commit 586731645f

@@ -1,8 +1,8 @@
 Meta: 1
 Name: zfs
 Branch: 1.0
-Version: 2.1.99
-Release: 1
+Version: 2.2.0
+Release: rc1
 Release-Tags: relext
 License: CDDL
 Author: OpenZFS

@@ -794,7 +794,7 @@ usage(void)
 "\t\t[-o <var>=<value>]... [-t <txg>] [-U <cache>] [-x <dumpdir>]\n"
 "\t\t[-K <key>] <poolname>/<objset id> [<backupflags>]\n"
 "\t%s [-v] <bookmark>\n"
-"\t%s -C [-A] [-U <cache>]\n"
+"\t%s -C [-A] [-U <cache>] [<poolname>]\n"
 "\t%s -l [-Aqu] <device>\n"
 "\t%s -m [-AFLPX] [-e [-V] [-p <path> ...]] [-t <txg>] "
 "[-U <cache>]\n\t\t<poolname> [<vdev> [<metaslab> ...]]\n"

@@ -6057,8 +6057,8 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 if (p != NULL)
 rid = p->pw_uid;
 else if (*endch != '\0') {
-(void) snprintf(errbuf, 256, gettext(
-"invalid user %s\n"), curr);
+(void) snprintf(errbuf, sizeof (errbuf),
+gettext("invalid user %s\n"), curr);
 allow_usage(un, B_TRUE, errbuf);
 }
 } else if (opts->group) {
@@ -6071,8 +6071,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 if (g != NULL)
 rid = g->gr_gid;
 else if (*endch != '\0') {
-(void) snprintf(errbuf, 256, gettext(
-"invalid group %s\n"), curr);
+(void) snprintf(errbuf, sizeof (errbuf),
+gettext("invalid group %s\n"),
+curr);
 allow_usage(un, B_TRUE, errbuf);
 }
 } else {
@@ -6097,8 +6098,9 @@ construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp)
 who_type = ZFS_DELEG_GROUP;
 rid = g->gr_gid;
 } else {
-(void) snprintf(errbuf, 256, gettext(
-"invalid user/group %s\n"), curr);
+(void) snprintf(errbuf, sizeof (errbuf),
+gettext("invalid user/group %s\n"),
+curr);
 allow_usage(un, B_TRUE, errbuf);
 }
 }

@@ -7662,11 +7662,11 @@ static void
 print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 {
 time_t start, end, pause;
-uint64_t pass_scanned, scanned, pass_issued, issued, total;
+uint64_t pass_scanned, scanned, pass_issued, issued, total_s, total_i;
 uint64_t elapsed, scan_rate, issue_rate;
 double fraction_done;
-char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
-char srate_buf[7], irate_buf[7], time_buf[32];
+char processed_buf[7], scanned_buf[7], issued_buf[7], total_s_buf[7];
+char total_i_buf[7], srate_buf[7], irate_buf[7], time_buf[32];
 
 printf(" ");
 printf_color(ANSI_BOLD, gettext("scan:"));
@@ -7738,10 +7738,11 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 pass_scanned = ps->pss_pass_exam;
 issued = ps->pss_issued;
 pass_issued = ps->pss_pass_issued;
-total = ps->pss_to_examine;
+total_s = ps->pss_to_examine;
+total_i = ps->pss_to_examine - ps->pss_skipped;
 
 /* we are only done with a block once we have issued the IO for it */
-fraction_done = (double)issued / total;
+fraction_done = (double)issued / total_i;
 
 /* elapsed time for this pass, rounding up to 1 if it's 0 */
 elapsed = time(NULL) - ps->pss_pass_start;
@@ -7750,26 +7751,25 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 
 scan_rate = pass_scanned / elapsed;
 issue_rate = pass_issued / elapsed;
-uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
-((total - issued) / issue_rate) : UINT64_MAX;
-secs_to_dhms(total_secs_left, time_buf);
 
 /* format all of the numbers we will be reporting */
 zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
 zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
-zfs_nicebytes(total, total_buf, sizeof (total_buf));
-zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
-zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+zfs_nicebytes(total_s, total_s_buf, sizeof (total_s_buf));
+zfs_nicebytes(total_i, total_i_buf, sizeof (total_i_buf));
 
 /* do not print estimated time if we have a paused scrub */
-if (pause == 0) {
-(void) printf(gettext("\t%s scanned at %s/s, "
-"%s issued at %s/s, %s total\n"),
-scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
-} else {
-(void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
-scanned_buf, issued_buf, total_buf);
+(void) printf(gettext("\t%s / %s scanned"), scanned_buf, total_s_buf);
+if (pause == 0 && scan_rate > 0) {
+zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+(void) printf(gettext(" at %s/s"), srate_buf);
 }
+(void) printf(gettext(", %s / %s issued"), issued_buf, total_i_buf);
+if (pause == 0 && issue_rate > 0) {
+zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
+(void) printf(gettext(" at %s/s"), irate_buf);
+}
+(void) printf(gettext("\n"));
 
 if (is_resilver) {
 (void) printf(gettext("\t%s resilvered, %.2f%% done"),
@@ -7782,16 +7782,16 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 if (pause == 0) {
 /*
 * Only provide an estimate iff:
-* 1) the time remaining is valid, and
+* 1) we haven't yet issued all we expected, and
 * 2) the issue rate exceeds 10 MB/s, and
 * 3) it's either:
 * a) a resilver which has started repairs, or
 * b) a scrub which has entered the issue phase.
 */
-if (total_secs_left != UINT64_MAX &&
-issue_rate >= 10 * 1024 * 1024 &&
+if (total_i >= issued && issue_rate >= 10 * 1024 * 1024 &&
 ((is_resilver && ps->pss_processed > 0) ||
 (is_scrub && issued > 0))) {
+secs_to_dhms((total_i - issued) / issue_rate, time_buf);
 (void) printf(gettext(", %s to go\n"), time_buf);
 } else {
 (void) printf(gettext(", no estimated "
@@ -7803,7 +7803,7 @@ print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 }
 
 static void
-print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, uint_t c, char *vdev_name)
 {
 if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
 return;
@@ -7815,17 +7815,20 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
 uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
 uint64_t bytes_issued = vrs->vrs_bytes_issued;
 uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
-uint64_t bytes_est = vrs->vrs_bytes_est;
+uint64_t bytes_est_s = vrs->vrs_bytes_est;
+uint64_t bytes_est_i = vrs->vrs_bytes_est;
+if (c > offsetof(vdev_rebuild_stat_t, vrs_pass_bytes_skipped) / 8)
+bytes_est_i -= vrs->vrs_pass_bytes_skipped;
 uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
 (vrs->vrs_pass_time_ms + 1)) * 1000;
 uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
 (vrs->vrs_pass_time_ms + 1)) * 1000;
 double scan_pct = MIN((double)bytes_scanned * 100 /
-(bytes_est + 1), 100);
+(bytes_est_s + 1), 100);
 
 /* Format all of the numbers we will be reporting */
 char bytes_scanned_buf[7], bytes_issued_buf[7];
-char bytes_rebuilt_buf[7], bytes_est_buf[7];
+char bytes_rebuilt_buf[7], bytes_est_s_buf[7], bytes_est_i_buf[7];
 char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
 zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
 sizeof (bytes_scanned_buf));
@@ -7833,9 +7836,8 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
 sizeof (bytes_issued_buf));
 zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
 sizeof (bytes_rebuilt_buf));
-zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
-zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
-zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
+zfs_nicebytes(bytes_est_s, bytes_est_s_buf, sizeof (bytes_est_s_buf));
+zfs_nicebytes(bytes_est_i, bytes_est_i_buf, sizeof (bytes_est_i_buf));
 
 time_t start = vrs->vrs_start_time;
 time_t end = vrs->vrs_end_time;
@@ -7858,17 +7860,29 @@ print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
 
 assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
 
-secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
-MAX(scan_rate, 1), time_buf);
+(void) printf(gettext("\t%s / %s scanned"), bytes_scanned_buf,
+bytes_est_s_buf);
+if (scan_rate > 0) {
+zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+(void) printf(gettext(" at %s/s"), scan_rate_buf);
+}
+(void) printf(gettext(", %s / %s issued"), bytes_issued_buf,
+bytes_est_i_buf);
+if (issue_rate > 0) {
+zfs_nicebytes(issue_rate, issue_rate_buf,
+sizeof (issue_rate_buf));
+(void) printf(gettext(" at %s/s"), issue_rate_buf);
+}
+(void) printf(gettext("\n"));
 
-(void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
-"%s total\n"), bytes_scanned_buf, scan_rate_buf,
-bytes_issued_buf, issue_rate_buf, bytes_est_buf);
 (void) printf(gettext("\t%s resilvered, %.2f%% done"),
 bytes_rebuilt_buf, scan_pct);
 
 if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
-if (scan_rate >= 10 * 1024 * 1024) {
+if (bytes_est_s >= bytes_scanned &&
+scan_rate >= 10 * 1024 * 1024) {
+secs_to_dhms((bytes_est_s - bytes_scanned) / scan_rate,
+time_buf);
 (void) printf(gettext(", %s to go\n"), time_buf);
 } else {
 (void) printf(gettext(", no estimated "
@@ -7900,7 +7914,7 @@ print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
 ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
 char *name = zpool_vdev_name(g_zfs, zhp,
 child[c], VDEV_NAME_TYPE_ID);
-print_rebuild_status_impl(vrs, name);
+print_rebuild_status_impl(vrs, i, name);
 free(name);
 }
 }
@@ -8005,13 +8019,15 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
 active_resilver = (ps->pss_state == DSS_SCANNING);
 }
 
 have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
 have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
-have_errorscrub = (ps->pss_error_scrub_func ==
-POOL_SCAN_ERRORSCRUB);
-errorscrub_start = ps->pss_error_scrub_start;
+scrub_start = ps->pss_start_time;
+if (c > offsetof(pool_scan_stat_t,
+pss_pass_error_scrub_pause) / 8) {
+have_errorscrub = (ps->pss_error_scrub_func ==
+POOL_SCAN_ERRORSCRUB);
+errorscrub_start = ps->pss_error_scrub_start;
+}
 }
 
 boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);

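The rewritten status output above tracks two targets: a scanned total (total_s, everything to examine) and an issued total (total_i, which excludes bytes the scanner skipped), and the ETA is computed against the issued total. A minimal standalone sketch of that arithmetic, with hypothetical sample numbers (not ZFS code):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	/* Hypothetical pool_scan_stat values, in bytes. */
	uint64_t to_examine = 405ULL << 20;	/* pss_to_examine */
	uint64_t skipped = 100ULL << 20;	/* pss_skipped */
	uint64_t issued = 68ULL << 20;		/* pss_issued */
	uint64_t issue_rate = 10ULL << 20;	/* bytes per second */

	uint64_t total_s = to_examine;			/* scan target */
	uint64_t total_i = to_examine - skipped;	/* issue target */
	double fraction_done = (double)issued / total_i;

	/* As above, an ETA is only offered at >= 10 MiB/s issue rate. */
	if (total_i >= issued && issue_rate >= 10ULL << 20) {
		printf("%.2f%% done, %llu s to go (of %llu scanned)\n",
		    100.0 * fraction_done,
		    (unsigned long long)((total_i - issued) / issue_rate),
		    (unsigned long long)total_s);
	}
	return (0);
}
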
@@ -238,6 +238,7 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name)
 print_kv("end_ts", ps->pss_end_time);
 print_kv(",errors", ps->pss_errors);
 print_kv(",examined", examined);
+print_kv(",skipped", ps->pss_skipped);
 print_kv(",issued", ps->pss_issued);
 print_kv(",pass_examined", pass_exam);
 print_kv(",pass_issued", ps->pss_pass_issued);
@@ -249,7 +250,6 @@ print_scan_status(nvlist_t *nvroot, const char *pool_name)
 print_kv(",remaining_t", remaining_time);
 print_kv(",start_ts", ps->pss_start_time);
 print_kv(",to_examine", ps->pss_to_examine);
-print_kv(",to_process", ps->pss_to_process);
 printf(" %llu\n", (u_longlong_t)timestamp);
 return (0);
 }

@@ -1,3 +1,9 @@
+openzfs-linux (2.2.0-0) unstable; urgency=medium
+
+  * Merge tag zfs-2.2.0
+
+ -- Ameer Hamza <ahamza@ixsystems.com>  Mon, 03 Jul 2023 10:00:00 -0500
+
 openzfs-linux (2.1.99-1) unstable; urgency=medium
 
   * Merge tag zfs-2.1.99.

@@ -36,7 +36,7 @@ install() {
 { dfatal "Failed to install essential binaries"; exit 1; }
 
 # Adapted from https://github.com/zbm-dev/zfsbootmenu
-if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so'; then
+if ! ldd "$(command -v zpool)" | grep -qF 'libgcc_s.so' && ldconfig -p 2> /dev/null | grep -qF 'libc.so.6' ; then
 # On systems with gcc-config (Gentoo, Funtoo, etc.), use it to find libgcc_s
 if command -v gcc-config >/dev/null; then
 inst_simple "/usr/lib/gcc/$(s=$(gcc-config -c); echo "${s%-*}/${s##*-}")/libgcc_s.so.1" ||

@@ -1,3 +1,9 @@
+openzfs (2.2.0-0) unstable; urgency=medium
+
+  * Merge tag zfs-2.2.0
+
+ -- Ameer Hamza <ahamza@ixsystems.com>  Mon, 03 Jul 2023 10:00:00 -0500
+
 openzfs (2.1.99-1) unstable; urgency=medium
 
   * Merge tag zfs-2.1.99.

@@ -104,6 +104,7 @@ typedef struct taskq {
 /* list node for the cpu hotplug callback */
 struct hlist_node tq_hp_cb_node;
 boolean_t tq_hp_support;
+unsigned long lastshouldstop; /* when to purge dynamic */
 } taskq_t;
 
 typedef struct taskq_ent {

@@ -61,7 +61,7 @@ typedef struct dsl_scan_phys {
 uint64_t scn_end_time;
 uint64_t scn_to_examine; /* total bytes to be scanned */
 uint64_t scn_examined; /* bytes scanned so far */
-uint64_t scn_to_process;
+uint64_t scn_skipped; /* bytes skipped by scanner */
 uint64_t scn_processed;
 uint64_t scn_errors; /* scan I/O error count */
 uint64_t scn_ddt_class_max;

@@ -1088,7 +1088,7 @@ typedef struct pool_scan_stat {
 uint64_t pss_end_time; /* scan end time */
 uint64_t pss_to_examine; /* total bytes to scan */
 uint64_t pss_examined; /* total bytes located by scanner */
-uint64_t pss_to_process; /* total bytes to process */
+uint64_t pss_skipped; /* total bytes skipped by scanner */
 uint64_t pss_processed; /* total processed bytes */
 uint64_t pss_errors; /* scan errors */
 
@@ -1152,6 +1152,7 @@ typedef struct vdev_rebuild_stat {
 uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
 uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
 uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */
+uint64_t vrs_pass_bytes_skipped; /* bytes skipped since start/resume */
 } vdev_rebuild_stat_t;
 
 /*

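Both stat structs above travel to userland as bare uint64_t arrays whose element count grows as fields are appended, which is why the zpool code earlier gates on `c > offsetof(...) / 8`. A sketch of that compatibility guard, using a trimmed-down stand-in struct (not the real definition):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Trimmed-down stand-in; the real pool_scan_stat_t has many more fields. */
typedef struct pool_scan_stat {
	uint64_t pss_func;
	uint64_t pss_to_examine;
	uint64_t pss_skipped;
} pool_scan_stat_t;

int
main(void)
{
	unsigned int c = 3;	/* hypothetical uint64 count from the nvlist */

	/* The field is present only if the sender's array reaches past it. */
	if (c > offsetof(pool_scan_stat_t, pss_skipped) / 8)
		printf("sender provided pss_skipped\n");
	return (0);
}
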
@@ -164,8 +164,9 @@ extern zio_t *vdev_queue_io(zio_t *zio);
 extern void vdev_queue_io_done(zio_t *zio);
 extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
 
-extern int vdev_queue_length(vdev_t *vd);
+extern uint32_t vdev_queue_length(vdev_t *vd);
 extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+extern uint64_t vdev_queue_class_length(vdev_t *vq, zio_priority_t p);
 
 extern void vdev_config_dirty(vdev_t *vd);
 extern void vdev_config_clean(vdev_t *vd);

@@ -130,27 +130,24 @@ typedef const struct vdev_ops {
 /*
 * Virtual device properties
 */
-typedef struct vdev_queue_class {
-uint32_t vqc_active;
-
-/*
- * Sorted by offset or timestamp, depending on if the queue is
- * LBA-ordered vs FIFO.
- */
-avl_tree_t vqc_queued_tree;
+typedef union vdev_queue_class {
+list_t vqc_list;
+avl_tree_t vqc_tree;
 } vdev_queue_class_t;
 
 struct vdev_queue {
 vdev_t *vq_vdev;
 vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
-avl_tree_t vq_active_tree;
 avl_tree_t vq_read_offset_tree;
 avl_tree_t vq_write_offset_tree;
-avl_tree_t vq_trim_offset_tree;
 uint64_t vq_last_offset;
 zio_priority_t vq_last_prio; /* Last sent I/O priority. */
+uint32_t vq_cqueued; /* Classes with queued I/Os. */
+uint32_t vq_cactive[ZIO_PRIORITY_NUM_QUEUEABLE];
+uint32_t vq_active; /* Number of active I/Os. */
 uint32_t vq_ia_active; /* Active interactive I/Os. */
 uint32_t vq_nia_credit; /* Non-interactive I/Os credit. */
+list_t vq_active_list; /* List of active I/Os. */
 hrtime_t vq_io_complete_ts; /* time last i/o completed */
 hrtime_t vq_io_delta_ts;
 zio_t vq_io_search; /* used as local for stack reduction */

@@ -79,6 +79,7 @@ typedef struct vdev_rebuild {
 uint64_t vr_pass_start_time;
 uint64_t vr_pass_bytes_scanned;
 uint64_t vr_pass_bytes_issued;
+uint64_t vr_pass_bytes_skipped;
 
 /* On-disk state updated by vdev_rebuild_zap_update_sync() */
 vdev_rebuild_phys_t vr_rebuild_phys;

@@ -341,9 +341,9 @@ typedef struct zio_prop {
 enum zio_checksum zp_checksum;
 enum zio_compress zp_compress;
 uint8_t zp_complevel;
-dmu_object_type_t zp_type;
 uint8_t zp_level;
 uint8_t zp_copies;
+dmu_object_type_t zp_type;
 boolean_t zp_dedup;
 boolean_t zp_dedup_verify;
 boolean_t zp_nopwrite;
@@ -436,6 +436,12 @@ typedef struct zio_link {
 list_node_t zl_child_node;
 } zio_link_t;
 
+enum zio_qstate {
+ZIO_QS_NONE = 0,
+ZIO_QS_QUEUED,
+ZIO_QS_ACTIVE,
+};
+
 struct zio {
 /* Core information about this I/O */
 zbookmark_phys_t io_bookmark;
@@ -479,6 +485,12 @@ struct zio {
 const zio_vsd_ops_t *io_vsd_ops;
 metaslab_class_t *io_metaslab_class; /* dva throttle class */
 
+enum zio_qstate io_queue_state; /* vdev queue state */
+union {
+list_node_t l;
+avl_node_t a;
+} io_queue_node ____cacheline_aligned; /* allocator and vdev queues */
+avl_node_t io_offset_node; /* vdev offset queues */
 uint64_t io_offset;
 hrtime_t io_timestamp; /* submitted at */
 hrtime_t io_queued_timestamp;
@@ -486,9 +498,6 @@ struct zio {
 hrtime_t io_delta; /* vdev queue service delta */
 hrtime_t io_delay; /* Device access time (disk or */
 /* file). */
-avl_node_t io_queue_node;
-avl_node_t io_offset_node;
-avl_node_t io_alloc_node;
 zio_alloc_list_t io_alloc_list;
 
 /* Internal pipeline state */
@@ -602,6 +611,7 @@ extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
 extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
 extern zio_t *zio_unique_parent(zio_t *cio);
 extern void zio_add_child(zio_t *pio, zio_t *cio);
+extern void zio_add_child_first(zio_t *pio, zio_t *cio);
 
 extern void *zio_buf_alloc(size_t size);
 extern void zio_buf_free(void *buf, size_t size);

@@ -1789,7 +1789,8 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
 nvlist_t *nvl;
 int nvl_len = 0;
 int added_resv = 0;
-zfs_prop_t prop = 0;
+zfs_prop_t prop;
+boolean_t nsprop = B_FALSE;
 nvpair_t *elem;
 
 (void) snprintf(errbuf, sizeof (errbuf),
@@ -1836,6 +1837,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
 elem = nvlist_next_nvpair(nvl, elem)) {
 
 prop = zfs_name_to_prop(nvpair_name(elem));
+nsprop |= zfs_is_namespace_prop(prop);
 
 assert(cl_idx < nvl_len);
 /*
@@ -1934,8 +1936,7 @@ zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props)
 * if one of the options handled by the generic
 * Linux namespace layer has been modified.
 */
-if (zfs_is_namespace_prop(prop) &&
-zfs_is_mounted(zhp, NULL))
+if (nsprop && zfs_is_mounted(zhp, NULL))
 ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0);
 }
 }

@@ -193,4 +193,19 @@ The proc file will walk the lists with lock held,
 reading it could cause a lock-up if the list grow too large
 without limiting the output.
 "(truncated)" will be shown if the list is larger than the limit.
+.
+.It Sy spl_taskq_thread_timeout_ms Ns = Ns Sy 10000 Pq uint
+(Linux-only)
+How long a taskq has to have had no work before we tear it down.
+Previously, we would tear down a dynamic taskq worker as soon
+as we noticed it had no work, but it was observed that this led
+to a lot of churn in tearing down things we then immediately
+spawned anew.
+In practice, it seems any nonzero value will remove the vast
+majority of this churn, while the nontrivially larger value
+was chosen to help filter out the little remaining churn on
+a mostly idle system.
+Setting this value to
+.Sy 0
+will revert to the previous behavior.
 .El

@@ -239,6 +239,16 @@ relative to the pool.
 Make some blocks above a certain size be gang blocks.
 This option is used by the test suite to facilitate testing.
 .
+.It Sy zfs_ddt_zap_default_bs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
+Default DDT ZAP data block size as a power of 2. Note that changing this after
+creating a DDT on the pool will not affect existing DDTs, only newly created
+ones.
+.
+.It Sy zfs_ddt_zap_default_ibs Ns = Ns Sy 15 Po 32 KiB Pc Pq int
+Default DDT ZAP indirect block size as a power of 2. Note that changing this
+after creating a DDT on the pool will not affect existing DDTs, only newly
+created ones.
+.
 .It Sy zfs_default_bs Ns = Ns Sy 9 Po 512 B Pc Pq int
 Default dnode block size as a power of 2.
 .

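Both new tunables are block shifts rather than byte counts; a two-line sketch of the mapping, assuming the documented defaults of 15:

#include <stdio.h>

int
main(void)
{
	unsigned int ddt_zap_default_bs = 15, ddt_zap_default_ibs = 15;

	/* A shift of 15 means 1 << 15 = 32768 bytes = 32 KiB blocks. */
	printf("data block: %u bytes\n", 1U << ddt_zap_default_bs);
	printf("indirect block: %u bytes\n", 1U << ddt_zap_default_ibs);
	return (0);
}
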
@@ -2016,12 +2026,6 @@ Historical statistics for this many latest TXGs will be available in
 Flush dirty data to disk at least every this many seconds (maximum TXG
 duration).
 .
-.It Sy zfs_vdev_aggregate_trim Ns = Ns Sy 0 Ns | Ns 1 Pq uint
-Allow TRIM I/O operations to be aggregated.
-This is normally not helpful because the extents to be trimmed
-will have been already been aggregated by the metaslab.
-This option is provided for debugging and performance analysis.
-.
 .It Sy zfs_vdev_aggregation_limit Ns = Ns Sy 1048576 Ns B Po 1 MiB Pc Pq uint
 Max vdev I/O aggregation size.
 .

@@ -14,7 +14,7 @@
 .\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
 .\" Copyright (c) 2017 Intel Corporation.
 .\"
-.Dd June 4, 2023
+.Dd June 27, 2023
 .Dt ZDB 8
 .Os
 .
@@ -51,6 +51,7 @@
 .Fl C
 .Op Fl A
 .Op Fl U Ar cache
+.Op Ar poolname
 .Nm
 .Fl E
 .Op Fl A

@@ -87,13 +87,13 @@ currently in use by another subsystem.
 However this check is not robust enough
 to detect simultaneous attempts to use a new device in different pools, even if
 .Sy multihost Ns = Sy enabled .
-The administrator must ensure, that simultaneous invocations of any combination
+The administrator must ensure that simultaneous invocations of any combination
 of
 .Nm zpool Cm replace ,
 .Nm zpool Cm create ,
 .Nm zpool Cm add ,
 or
-.Nm zpool Cm labelclear ,
+.Nm zpool Cm labelclear
 do not refer to the same device.
 Using the same device in two pools will result in pool corruption.
 .Pp

@@ -26,7 +26,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd July 25, 2021
+.Dd June 22, 2023
 .Dt ZPOOL-SCRUB 8
 .Os
 .
@@ -123,7 +123,7 @@ Status of pool with ongoing scrub:
 .No # Nm zpool Cm status
 ...
 scan: scrub in progress since Sun Jul 25 16:07:49 2021
-403M scanned at 100M/s, 68.4M issued at 10.0M/s, 405M total
+403M / 405M scanned at 100M/s, 68.4M / 405M issued at 10.0M/s
 0B repaired, 16.91% done, 00:00:04 to go
 ...
 .Ed

@@ -36,6 +36,12 @@ static int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
+static uint_t spl_taskq_thread_timeout_ms = 10000;
+/* BEGIN CSTYLED */
+module_param(spl_taskq_thread_timeout_ms, uint, 0644);
+/* END CSTYLED */
+MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
+"Time to require a dynamic thread be idle before it gets cleaned up");
+
 static int spl_taskq_thread_dynamic = 1;
 module_param(spl_taskq_thread_dynamic, int, 0444);
@@ -848,12 +854,37 @@ taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
 tqt_thread_list) == tqt)
 return (0);
 
-return
+int no_work =
 ((tq->tq_nspawn == 0) && /* No threads are being spawned */
 (tq->tq_nactive == 0) && /* No threads are handling tasks */
 (tq->tq_nthreads > 1) && /* More than 1 thread is running */
 (!taskq_next_ent(tq)) && /* There are no pending tasks */
 (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */
+
+/*
+ * If we would have said stop before, let's instead wait a bit, maybe
+ * we'll see more work come our way soon...
+ */
+if (no_work) {
+/* if it's 0, we want the old behavior. */
+/* if the taskq is being torn down, we also want to go away. */
+if (spl_taskq_thread_timeout_ms == 0 ||
+!(tq->tq_flags & TASKQ_ACTIVE))
+return (1);
+unsigned long lasttime = tq->lastshouldstop;
+if (lasttime > 0) {
+if (time_after(jiffies, lasttime +
+msecs_to_jiffies(spl_taskq_thread_timeout_ms)))
+return (1);
+else
+return (0);
+} else {
+tq->lastshouldstop = jiffies;
+}
+} else {
+tq->lastshouldstop = 0;
+}
+return (0);
 }
 
 static int
@@ -1091,6 +1122,7 @@ taskq_create(const char *name, int threads_arg, pri_t pri,
 tq->tq_flags = (flags | TASKQ_ACTIVE);
 tq->tq_next_id = TASKQID_INITIAL;
 tq->tq_lowest_id = TASKQID_INITIAL;
+tq->lastshouldstop = 0;
 INIT_LIST_HEAD(&tq->tq_free_list);
 INIT_LIST_HEAD(&tq->tq_pend_list);
 INIT_LIST_HEAD(&tq->tq_prio_list);

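Distilled, the reworked taskq_thread_should_stop() above is a simple idle-window state machine: remember when a worker first looked idle and only let it exit once the timeout has elapsed since then. A userspace sketch of that logic, assuming millisecond timestamps in place of jiffies (not the kernel code):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tq {
	uint64_t lastshouldstop;	/* 0 = not currently idle */
};

static bool
should_stop(struct tq *tq, bool no_work, uint64_t now_ms, uint64_t timeout_ms)
{
	if (!no_work) {
		tq->lastshouldstop = 0;		/* busy again: reset window */
		return (false);
	}
	if (timeout_ms == 0)
		return (true);			/* old behavior: stop at once */
	if (tq->lastshouldstop == 0) {
		tq->lastshouldstop = now_ms;	/* idle window opens now */
		return (false);
	}
	return (now_ms - tq->lastshouldstop >= timeout_ms);
}

int
main(void)
{
	struct tq tq = { 0 };

	printf("%d", should_stop(&tq, true, 1000, 10000));	/* 0: opens */
	printf("%d", should_stop(&tq, true, 5000, 10000));	/* 0: waiting */
	printf("%d\n", should_stop(&tq, true, 11001, 10000));	/* 1: expired */
	return (0);
}
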
@@ -54,7 +54,7 @@ static unsigned int zvol_prefetch_bytes = (128 * 1024);
 static unsigned long zvol_max_discard_blocks = 16384;
 
 #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
-static const unsigned int zvol_open_timeout_ms = 1000;
+static unsigned int zvol_open_timeout_ms = 1000;
 #endif
 
 static unsigned int zvol_threads = 0;
@@ -1612,4 +1612,9 @@ MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread,
 "Process volblocksize blocks per thread");
 #endif
 
+#ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+module_param(zvol_open_timeout_ms, uint, 0644);
+MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries");
+#endif
+
 /* END CSTYLED */

@@ -1209,10 +1209,19 @@ ddt_addref(spa_t *spa, const blkptr_t *bp)
 ASSERT3S(dde->dde_class, <, DDT_CLASSES);
 
 ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
-if (ddp->ddp_refcnt == 0) {
-/* This should never happen? */
-ddt_phys_fill(ddp, bp);
-}
+
+/*
+ * This entry already existed (dde_type is real), so it must
+ * have refcnt >0 at the start of this txg. We are called from
+ * brt_pending_apply(), before frees are issued, so the refcnt
+ * can't be lowered yet. Therefore, it must be >0. We assert
+ * this because if the order of BRT and DDT interactions were
+ * ever to change and the refcnt was ever zero here, then
+ * likely further action is required to fill out the DDT entry,
+ * and this is a place that is likely to be missed in testing.
+ */
+ASSERT3U(ddp->ddp_refcnt, >, 0);
+
 ddt_phys_addref(ddp);
 result = B_TRUE;
 } else {

@@ -31,8 +31,8 @@
 #include <sys/zap.h>
 #include <sys/dmu_tx.h>
 
-static const int ddt_zap_leaf_blockshift = 12;
-static const int ddt_zap_indirect_blockshift = 12;
+static unsigned int ddt_zap_default_bs = 15;
+static unsigned int ddt_zap_default_ibs = 15;
 
 static int
 ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
@@ -43,7 +43,7 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
 flags |= ZAP_FLAG_PRE_HASHED_KEY;
 
 *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
-ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+ddt_zap_default_bs, ddt_zap_default_ibs,
 DMU_OT_NONE, 0, tx);
 
 return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
@@ -166,3 +166,10 @@ const ddt_ops_t ddt_zap_ops = {
 ddt_zap_walk,
 ddt_zap_count,
 };
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_bs, UINT, ZMOD_RW,
+"DDT ZAP leaf blockshift");
+ZFS_MODULE_PARAM(zfs_dedup, , ddt_zap_default_ibs, UINT, ZMOD_RW,
+"DDT ZAP indirect blockshift");
+/* END CSTYLED */

@@ -573,7 +573,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 * counter to how far we've scanned. We know we're consistent
 * up to here.
 */
-scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+scn->scn_issued_before_pass = scn->scn_phys.scn_examined -
+scn->scn_phys.scn_skipped;
 
 if (dsl_scan_is_running(scn) &&
 spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
@@ -4362,7 +4363,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 * Disabled by default, set zfs_scan_report_txgs to report
 * average performance over the last zfs_scan_report_txgs TXGs.
 */
-if (!dsl_scan_is_paused_scrub(scn) && zfs_scan_report_txgs != 0 &&
+if (zfs_scan_report_txgs != 0 &&
 tx->tx_txg % zfs_scan_report_txgs == 0) {
 scn->scn_issued_before_pass += spa->spa_scan_pass_issued;
 spa_scan_stat_init(spa);
@@ -4564,6 +4565,15 @@ count_block_issued(spa_t *spa, const blkptr_t *bp, boolean_t all)
 all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
 }
 
+static void
+count_block_skipped(dsl_scan_t *scn, const blkptr_t *bp, boolean_t all)
+{
+if (BP_IS_EMBEDDED(bp))
+return;
+atomic_add_64(&scn->scn_phys.scn_skipped,
+all ? BP_GET_ASIZE(bp) : DVA_GET_ASIZE(&bp->blk_dva[0]));
+}
+
 static void
 count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
@@ -4709,7 +4719,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 count_block(dp->dp_blkstats, bp);
 if (phys_birth <= scn->scn_phys.scn_min_txg ||
 phys_birth >= scn->scn_phys.scn_max_txg) {
-count_block_issued(spa, bp, B_TRUE);
+count_block_skipped(scn, bp, B_TRUE);
 return (0);
 }
 
@@ -4750,7 +4760,7 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
 if (needs_io && !zfs_no_scrub_io) {
 dsl_scan_enqueue(dp, bp, zio_flags, zb);
 } else {
-count_block_issued(spa, bp, B_TRUE);
+count_block_skipped(scn, bp, B_TRUE);
 }
 
 /* do not relocate this block */
@@ -5119,9 +5129,9 @@ dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
 ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
 range_tree_remove_fill(queue->q_exts_by_addr, start, size);
 
-/* count the block as though we issued it */
+/* count the block as though we skipped it */
 sio2bp(sio, &tmpbp);
-count_block_issued(spa, &tmpbp, B_FALSE);
+count_block_skipped(scn, &tmpbp, B_FALSE);
 
 sio_free(sio);
 }

@@ -730,7 +730,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 mutex_init(&spa->spa_allocs[i].spaa_lock, NULL, MUTEX_DEFAULT,
 NULL);
 avl_create(&spa->spa_allocs[i].spaa_tree, zio_bookmark_compare,
-sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+sizeof (zio_t), offsetof(zio_t, io_queue_node.a));
 }
 avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed,
 sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node));
@@ -2611,7 +2611,7 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 ps->pss_end_time = scn->scn_phys.scn_end_time;
 ps->pss_to_examine = scn->scn_phys.scn_to_examine;
 ps->pss_examined = scn->scn_phys.scn_examined;
-ps->pss_to_process = scn->scn_phys.scn_to_process;
+ps->pss_skipped = scn->scn_phys.scn_skipped;
 ps->pss_processed = scn->scn_phys.scn_processed;
 ps->pss_errors = scn->scn_phys.scn_errors;
 

@@ -895,15 +895,10 @@ txg_list_destroy(txg_list_t *tl)
 boolean_t
 txg_all_lists_empty(txg_list_t *tl)
 {
-mutex_enter(&tl->tl_lock);
-for (int i = 0; i < TXG_SIZE; i++) {
-if (!txg_list_empty_impl(tl, i)) {
-mutex_exit(&tl->tl_lock);
-return (B_FALSE);
-}
-}
-mutex_exit(&tl->tl_lock);
-return (B_TRUE);
+boolean_t res = B_TRUE;
+for (int i = 0; i < TXG_SIZE; i++)
+res &= (tl->tl_head[i] == NULL);
+return (res);
 }
 
 /*

@@ -4608,11 +4608,9 @@ vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
 
 memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex));
 
-for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) {
-vsx->vsx_active_queue[t] =
-vd->vdev_queue.vq_class[t].vqc_active;
-vsx->vsx_pend_queue[t] = avl_numnodes(
-&vd->vdev_queue.vq_class[t].vqc_queued_tree);
+for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) {
+vsx->vsx_active_queue[t] = vd->vdev_queue.vq_cactive[t];
+vsx->vsx_pend_queue[t] = vdev_queue_class_length(vd, t);
 }
 }
 }
@@ -5470,20 +5468,20 @@ vdev_deadman(vdev_t *vd, const char *tag)
 vdev_queue_t *vq = &vd->vdev_queue;
 
 mutex_enter(&vq->vq_lock);
-if (avl_numnodes(&vq->vq_active_tree) > 0) {
+if (vq->vq_active > 0) {
 spa_t *spa = vd->vdev_spa;
 zio_t *fio;
 uint64_t delta;
 
-zfs_dbgmsg("slow vdev: %s has %lu active IOs",
-vd->vdev_path, avl_numnodes(&vq->vq_active_tree));
+zfs_dbgmsg("slow vdev: %s has %u active IOs",
+vd->vdev_path, vq->vq_active);
 
 /*
 * Look at the head of all the pending queues,
 * if any I/O has been outstanding for longer than
 * the spa_deadman_synctime invoke the deadman logic.
 */
-fio = avl_first(&vq->vq_active_tree);
+fio = list_head(&vq->vq_active_list);
 delta = gethrtime() - fio->io_timestamp;
 if (delta > spa_deadman_synctime(spa))
 zio_deadman(fio, tag);

@@ -228,13 +228,6 @@ uint_t zfs_vdev_queue_depth_pct = 300;
 */
 uint_t zfs_vdev_def_queue_depth = 32;
 
-/*
- * Allow TRIM I/Os to be aggregated. This should normally not be needed since
- * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
- * by the TRIM code in zfs_trim.c.
- */
-static uint_t zfs_vdev_aggregate_trim = 0;
-
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
@@ -249,38 +242,60 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
 return (TREE_PCMP(z1, z2));
 }
 
-static inline avl_tree_t *
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
-{
-return (&vq->vq_class[p].vqc_queued_tree);
-}
-
-static inline avl_tree_t *
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
-{
-ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
-if (t == ZIO_TYPE_READ)
-return (&vq->vq_read_offset_tree);
-else if (t == ZIO_TYPE_WRITE)
-return (&vq->vq_write_offset_tree);
-else
-return (&vq->vq_trim_offset_tree);
-}
+#define VDQ_T_SHIFT 29
 
 static int
-vdev_queue_timestamp_compare(const void *x1, const void *x2)
+vdev_queue_to_compare(const void *x1, const void *x2)
 {
 const zio_t *z1 = (const zio_t *)x1;
 const zio_t *z2 = (const zio_t *)x2;
 
-int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp);
+int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT,
+z2->io_timestamp >> VDQ_T_SHIFT);
+int ocmp = TREE_CMP(z1->io_offset, z2->io_offset);
+int cmp = tcmp ? tcmp : ocmp;
 
-if (likely(cmp))
+if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE)))
 return (cmp);
 
 return (TREE_PCMP(z1, z2));
 }
 
+static inline boolean_t
+vdev_queue_class_fifo(zio_priority_t p)
+{
+return (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE ||
+p == ZIO_PRIORITY_TRIM);
+}
+
+static void
+vdev_queue_class_add(vdev_queue_t *vq, zio_t *zio)
+{
+zio_priority_t p = zio->io_priority;
+vq->vq_cqueued |= 1U << p;
+if (vdev_queue_class_fifo(p))
+list_insert_tail(&vq->vq_class[p].vqc_list, zio);
+else
+avl_add(&vq->vq_class[p].vqc_tree, zio);
+}
+
+static void
+vdev_queue_class_remove(vdev_queue_t *vq, zio_t *zio)
+{
+zio_priority_t p = zio->io_priority;
+uint32_t empty;
+if (vdev_queue_class_fifo(p)) {
+list_t *list = &vq->vq_class[p].vqc_list;
+list_remove(list, zio);
+empty = list_is_empty(list);
+} else {
+avl_tree_t *tree = &vq->vq_class[p].vqc_tree;
+avl_remove(tree, zio);
+empty = avl_is_empty(tree);
+}
+vq->vq_cqueued &= ~(empty << p);
+}
+
 static uint_t
 vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 {

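The renamed comparator above sorts queued I/Os first by a coarse timestamp bucket and only then by offset: shifting a nanosecond hrtime right by VDQ_T_SHIFT (29) yields roughly half-second buckets, so offset-ordered batching cannot starve older I/Os indefinitely. A quick sketch of the bucket math, with hypothetical timestamps (not ZFS code):

#include <stdint.h>
#include <stdio.h>

#define	VDQ_T_SHIFT	29

int
main(void)
{
	/* 2^29 ns = 536870912 ns, about 0.54 s per bucket. */
	printf("bucket size: %llu ns\n",
	    (unsigned long long)(1ULL << VDQ_T_SHIFT));

	uint64_t t1 = 1000000000ULL;	/* 1.00 s */
	uint64_t t2 = 1050000000ULL;	/* 1.05 s: same bucket as t1 */
	uint64_t t3 = 2200000000ULL;	/* 2.20 s: a later bucket */

	printf("t1,t2 same bucket: %d\n",
	    (t1 >> VDQ_T_SHIFT) == (t2 >> VDQ_T_SHIFT));
	printf("t2,t3 same bucket: %d\n",
	    (t2 >> VDQ_T_SHIFT) == (t3 >> VDQ_T_SHIFT));
	return (0);
}
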
@@ -360,7 +375,7 @@ vdev_queue_max_async_writes(spa_t *spa)
 }
 
 static uint_t
-vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
+vdev_queue_class_max_active(vdev_queue_t *vq, zio_priority_t p)
 {
 switch (p) {
 case ZIO_PRIORITY_SYNC_READ:
@@ -370,7 +385,7 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 case ZIO_PRIORITY_ASYNC_READ:
 return (zfs_vdev_async_read_max_active);
 case ZIO_PRIORITY_ASYNC_WRITE:
-return (vdev_queue_max_async_writes(spa));
+return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa));
 case ZIO_PRIORITY_SCRUB:
 if (vq->vq_ia_active > 0) {
 return (MIN(vq->vq_nia_credit,
@@ -414,10 +429,10 @@ vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
-spa_t *spa = vq->vq_vdev->vdev_spa;
-zio_priority_t p, n;
+uint32_t cq = vq->vq_cqueued;
+zio_priority_t p, p1;
 
-if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+if (cq == 0 || vq->vq_active >= zfs_vdev_max_active)
 return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
 /*
@@ -425,14 +440,18 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 * Do round-robin to reduce starvation due to zfs_vdev_max_active
 * and vq_nia_credit limits.
 */
-for (n = 0; n < ZIO_PRIORITY_NUM_QUEUEABLE; n++) {
-p = (vq->vq_last_prio + n + 1) % ZIO_PRIORITY_NUM_QUEUEABLE;
-if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-vq->vq_class[p].vqc_active <
-vdev_queue_class_min_active(vq, p)) {
-vq->vq_last_prio = p;
-return (p);
-}
+p1 = vq->vq_last_prio + 1;
+if (p1 >= ZIO_PRIORITY_NUM_QUEUEABLE)
+p1 = 0;
+for (p = p1; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+vdev_queue_class_min_active(vq, p))
+goto found;
+}
+for (p = 0; p < p1; p++) {
+if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+vdev_queue_class_min_active(vq, p))
+goto found;
 }
 
 /*
@@ -440,16 +459,14 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
 * maximum # outstanding i/os.
 */
 for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
-vq->vq_class[p].vqc_active <
-vdev_queue_class_max_active(spa, vq, p)) {
-vq->vq_last_prio = p;
-return (p);
-}
+if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] <
+vdev_queue_class_max_active(vq, p))
+break;
 }
 
 /* No eligible queued i/os */
 return (ZIO_PRIORITY_NUM_QUEUEABLE);
+found:
+vq->vq_last_prio = p;
+return (p);
 }
 
 void

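The selection loop above now consults a per-queue bitmask (vq_cqueued) with one bit per priority class, set while that class has queued I/Os, so scanning for work is a couple of bit tests instead of AVL-tree size checks. A compact sketch of the bookkeeping, with a stand-in class count (not ZFS code):

#include <stdint.h>
#include <stdio.h>

#define	NCLASSES	8	/* stand-in for ZIO_PRIORITY_NUM_QUEUEABLE */

int
main(void)
{
	uint32_t cq = 0;	/* vq_cqueued: classes with queued I/Os */

	cq |= 1U << 3;		/* class 3 queues an I/O */
	cq |= 1U << 5;		/* class 5 queues an I/O */

	uint32_t empty = 1;	/* class 3 went empty on remove */
	cq &= ~(empty << 3);	/* clear the bit only when it drained */

	for (uint32_t p = 0; p < NCLASSES; p++) {
		if ((cq & (1U << p)) != 0)
			printf("class %u has queued I/O\n", p);
	}
	return (0);
}
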
@@ -458,42 +475,30 @@ vdev_queue_init(vdev_t *vd)
 vdev_queue_t *vq = &vd->vdev_queue;
 zio_priority_t p;
 
-mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 vq->vq_vdev = vd;
 taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent);
 
-avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
-sizeof (zio_t), offsetof(struct zio, io_queue_node));
-avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
-vdev_queue_offset_compare, sizeof (zio_t),
-offsetof(struct zio, io_offset_node));
-avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
-vdev_queue_offset_compare, sizeof (zio_t),
-offsetof(struct zio, io_offset_node));
-avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
-vdev_queue_offset_compare, sizeof (zio_t),
-offsetof(struct zio, io_offset_node));
-
 for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
-int (*compfn) (const void *, const void *);
-
-/*
- * The synchronous/trim i/o queues are dispatched in FIFO rather
- * than LBA order. This provides more consistent latency for
- * these i/os.
- */
-if (p == ZIO_PRIORITY_SYNC_READ ||
-p == ZIO_PRIORITY_SYNC_WRITE ||
-p == ZIO_PRIORITY_TRIM) {
-compfn = vdev_queue_timestamp_compare;
+if (vdev_queue_class_fifo(p)) {
+list_create(&vq->vq_class[p].vqc_list,
+sizeof (zio_t),
+offsetof(struct zio, io_queue_node.l));
 } else {
-compfn = vdev_queue_offset_compare;
+avl_create(&vq->vq_class[p].vqc_tree,
+vdev_queue_to_compare, sizeof (zio_t),
+offsetof(struct zio, io_queue_node.a));
 }
-avl_create(vdev_queue_class_tree(vq, p), compfn,
-sizeof (zio_t), offsetof(struct zio, io_queue_node));
 }
+avl_create(&vq->vq_read_offset_tree,
+vdev_queue_offset_compare, sizeof (zio_t),
+offsetof(struct zio, io_offset_node));
+avl_create(&vq->vq_write_offset_tree,
+vdev_queue_offset_compare, sizeof (zio_t),
+offsetof(struct zio, io_offset_node));
 
 vq->vq_last_offset = 0;
+list_create(&vq->vq_active_list, sizeof (struct zio),
+offsetof(struct zio, io_queue_node.l));
+mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 }
@@ -501,30 +506,39 @@ vdev_queue_fini(vdev_t *vd)
 {
 vdev_queue_t *vq = &vd->vdev_queue;
 
-for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
-avl_destroy(vdev_queue_class_tree(vq, p));
-avl_destroy(&vq->vq_active_tree);
-avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
-avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
-avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
+for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+if (vdev_queue_class_fifo(p))
+list_destroy(&vq->vq_class[p].vqc_list);
+else
+avl_destroy(&vq->vq_class[p].vqc_tree);
+}
+avl_destroy(&vq->vq_read_offset_tree);
+avl_destroy(&vq->vq_write_offset_tree);
 
+list_destroy(&vq->vq_active_list);
 mutex_destroy(&vq->vq_lock);
 }
 
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
+zio->io_queue_state = ZIO_QS_QUEUED;
+vdev_queue_class_add(vq, zio);
+if (zio->io_type == ZIO_TYPE_READ)
+avl_add(&vq->vq_read_offset_tree, zio);
+else if (zio->io_type == ZIO_TYPE_WRITE)
+avl_add(&vq->vq_write_offset_tree, zio);
 }
 
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
-avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
+vdev_queue_class_remove(vq, zio);
+if (zio->io_type == ZIO_TYPE_READ)
+avl_remove(&vq->vq_read_offset_tree, zio);
+else if (zio->io_type == ZIO_TYPE_WRITE)
+avl_remove(&vq->vq_write_offset_tree, zio);
+zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static boolean_t
@@ -546,14 +560,16 @@ vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
 ASSERT(MUTEX_HELD(&vq->vq_lock));
 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-vq->vq_class[zio->io_priority].vqc_active++;
+vq->vq_cactive[zio->io_priority]++;
+vq->vq_active++;
 if (vdev_queue_is_interactive(zio->io_priority)) {
 if (++vq->vq_ia_active == 1)
 vq->vq_nia_credit = 1;
 } else if (vq->vq_ia_active > 0) {
 vq->vq_nia_credit--;
 }
-avl_add(&vq->vq_active_tree, zio);
+zio->io_queue_state = ZIO_QS_ACTIVE;
+list_insert_tail(&vq->vq_active_list, zio);
 }
 
 static void
@@ -561,7 +577,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
 ASSERT(MUTEX_HELD(&vq->vq_lock));
 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
-vq->vq_class[zio->io_priority].vqc_active--;
+vq->vq_cactive[zio->io_priority]--;
+vq->vq_active--;
 if (vdev_queue_is_interactive(zio->io_priority)) {
 if (--vq->vq_ia_active == 0)
 vq->vq_nia_credit = 0;
@@ -569,7 +586,8 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 vq->vq_nia_credit = zfs_vdev_nia_credit;
 } else if (vq->vq_ia_active == 0)
 vq->vq_nia_credit++;
-avl_remove(&vq->vq_active_tree, zio);
+list_remove(&vq->vq_active_list, zio);
+zio->io_queue_state = ZIO_QS_NONE;
 }
 
 static void
@@ -602,29 +620,28 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 uint64_t maxgap = 0;
 uint64_t size;
 uint64_t limit;
-int maxblocksize;
 boolean_t stretch = B_FALSE;
-avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
-zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 uint64_t next_offset;
 abd_t *abd;
+avl_tree_t *t;
 
+/*
+ * TRIM aggregation should not be needed since code in zfs_trim.c can
+ * submit TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M).
+ */
+if (zio->io_type == ZIO_TYPE_TRIM)
+return (NULL);
+
+if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
+return (NULL);
+
-maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
 if (vq->vq_vdev->vdev_nonrot)
 limit = zfs_vdev_aggregation_limit_non_rotating;
 else
 limit = zfs_vdev_aggregation_limit;
-limit = MIN(limit, maxblocksize);
-
-if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
-return (NULL);
-
-/*
- * While TRIM commands could be aggregated based on offset this
- * behavior is disabled until it's determined to be beneficial.
- */
-if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
+if (limit == 0)
 return (NULL);
+limit = MIN(limit, SPA_MAXBLOCKSIZE);
 
 /*
 * I/Os to distributed spares are directly dispatched to the dRAID
@@ -635,8 +652,13 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 
 first = last = zio;
 
-if (zio->io_type == ZIO_TYPE_READ)
+if (zio->io_type == ZIO_TYPE_READ) {
 maxgap = zfs_vdev_read_gap_limit;
+t = &vq->vq_read_offset_tree;
+} else {
+ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+t = &vq->vq_write_offset_tree;
+}
 
 /*
 * We can aggregate I/Os that are sufficiently adjacent and of
@@ -657,6 +679,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 * Walk backwards through sufficiently contiguous I/Os
 * recording the last non-optional I/O.
 */
+zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 while ((dio = AVL_PREV(t, first)) != NULL &&
 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 IO_SPAN(dio, last) <= limit &&
@@ -686,7 +709,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 (IO_SPAN(first, dio) <= limit ||
 (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
-IO_SPAN(first, dio) <= maxblocksize &&
+IO_SPAN(first, dio) <= SPA_MAXBLOCKSIZE &&
 IO_GAP(last, dio) <= maxgap &&
 dio->io_type == zio->io_type) {
 last = dio;
@@ -740,7 +763,7 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 return (NULL);
 
 size = IO_SPAN(first, last);
-ASSERT3U(size, <=, maxblocksize);
+ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 
 abd = abd_alloc_gang();
 if (abd == NULL)
@@ -824,19 +847,30 @@ again:
 return (NULL);
 }
 
-/*
- * For LBA-ordered queues (async / scrub / initializing), issue the
- * i/o which follows the most recently issued i/o in LBA (offset) order.
- *
- * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp.
- */
-tree = vdev_queue_class_tree(vq, p);
-vq->vq_io_search.io_timestamp = 0;
-vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
-VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
-zio = avl_nearest(tree, idx, AVL_AFTER);
-if (zio == NULL)
-zio = avl_first(tree);
+if (vdev_queue_class_fifo(p)) {
+zio = list_head(&vq->vq_class[p].vqc_list);
+} else {
+/*
+ * For LBA-ordered queues (async / scrub / initializing),
+ * issue the I/O which follows the most recently issued I/O
+ * in LBA (offset) order, but to avoid starvation only within
+ * the same 0.5 second interval as the first I/O.
+ */
+tree = &vq->vq_class[p].vqc_tree;
+zio = aio = avl_first(tree);
+if (zio->io_offset < vq->vq_last_offset) {
+vq->vq_io_search.io_timestamp = zio->io_timestamp;
+vq->vq_io_search.io_offset = vq->vq_last_offset;
+zio = avl_find(tree, &vq->vq_io_search, &idx);
+if (zio == NULL) {
+zio = avl_nearest(tree, idx, AVL_AFTER);
+if (zio == NULL ||
+(zio->io_timestamp >> VDQ_T_SHIFT) !=
+(aio->io_timestamp >> VDQ_T_SHIFT))
+zio = aio;
+}
+}
+}
 ASSERT3U(zio->io_priority, ==, p);
 
 aio = vdev_queue_aggregate(vq, zio);
@@ -967,7 +1001,6 @@ void
 vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 {
 vdev_queue_t *vq = &zio->io_vd->vdev_queue;
-avl_tree_t *tree;
 
 /*
 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio
@@ -1002,12 +1035,11 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 * Otherwise, the zio is currently active and we cannot change its
 * priority.
 */
-tree = vdev_queue_class_tree(vq, zio->io_priority);
-if (avl_find(tree, zio, NULL) == zio) {
-avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+if (zio->io_queue_state == ZIO_QS_QUEUED) {
+vdev_queue_class_remove(vq, zio);
 zio->io_priority = priority;
-avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
-} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
+vdev_queue_class_add(vq, zio);
+} else if (zio->io_queue_state == ZIO_QS_NONE) {
 zio->io_priority = priority;
 }
 
@@ -1020,10 +1052,10 @@ vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 * vq_lock mutex use here, instead we prefer to keep it lock free for
 * performance.
 */
-int
+uint32_t
 vdev_queue_length(vdev_t *vd)
 {
-return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+return (vd->vdev_queue.vq_active);
 }
 
 uint64_t
@@ -1032,15 +1064,22 @@ vdev_queue_last_offset(vdev_t *vd)
 return (vd->vdev_queue.vq_last_offset);
 }
 
+uint64_t
+vdev_queue_class_length(vdev_t *vd, zio_priority_t p)
+{
+vdev_queue_t *vq = &vd->vdev_queue;
+if (vdev_queue_class_fifo(p))
+return (list_is_empty(&vq->vq_class[p].vqc_list) == 0);
+else
+return (avl_numnodes(&vq->vq_class[p].vqc_tree));
+}
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, UINT, ZMOD_RW,
 "Max vdev I/O aggregation size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, UINT,
 ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
 
-ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, UINT, ZMOD_RW,
-"Allow TRIM I/O to be aggregated");
-
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, UINT, ZMOD_RW,
 "Aggregate read I/O over gap");
 

@@ -571,8 +571,10 @@ vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
 vdev_rebuild_blkptr_init(&blk, vd, start, size);
 uint64_t psize = BP_GET_PSIZE(&blk);
 
-if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
+if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) {
+vr->vr_pass_bytes_skipped += size;
 return (0);
+}
 
 mutex_enter(&vr->vr_io_lock);
 
@@ -786,6 +788,7 @@ vdev_rebuild_thread(void *arg)
 vr->vr_pass_start_time = gethrtime();
 vr->vr_pass_bytes_scanned = 0;
 vr->vr_pass_bytes_issued = 0;
+vr->vr_pass_bytes_skipped = 0;
 
 uint64_t update_est_time = gethrtime();
 vdev_rebuild_update_bytes_est(vd, 0);
@@ -1153,6 +1156,7 @@ vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
 vr->vr_pass_start_time);
 vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
 vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped;
 mutex_exit(&tvd->vdev_rebuild_lock);
 }
 

@@ -462,14 +462,12 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 return (SET_ERROR(EINVAL));
 }
 
-const uint64_t max_blksz = zfsvfs->z_max_blksz;
-
 /*
 * Pre-fault the pages to ensure slow (eg NFS) pages
 * don't hold up txg.
 * Skip this if uio contains loaned arc_buf.
 */
-if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
+ssize_t pfbytes = MIN(n, DMU_MAX_ACCESS >> 1);
+if (zfs_uio_prefaultpages(pfbytes, uio)) {
 zfs_exit(zfsvfs, FTAG);
 return (SET_ERROR(EFAULT));
 }
@@ -544,10 +542,31 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
 break;
 }
 
+uint64_t blksz;
+if (lr->lr_length == UINT64_MAX && zp->z_size <= zp->z_blksz) {
+if (zp->z_blksz > zfsvfs->z_max_blksz &&
+!ISP2(zp->z_blksz)) {
+/*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
+blksz = 1 << highbit64(zp->z_blksz);
+} else {
+blksz = zfsvfs->z_max_blksz;
+}
+blksz = MIN(blksz, P2ROUNDUP(end_size,
+SPA_MINBLOCKSIZE));
+blksz = MAX(blksz, zp->z_blksz);
+} else {
+blksz = zp->z_blksz;
+}
+
 arc_buf_t *abuf = NULL;
-if (n >= max_blksz && woff >= zp->z_size &&
-P2PHASE(woff, max_blksz) == 0 &&
-zp->z_blksz == max_blksz) {
+ssize_t nbytes = n;
+if (n >= blksz && woff >= zp->z_size &&
+P2PHASE(woff, blksz) == 0 &&
+(blksz >= SPA_OLD_MAXBLOCKSIZE || n < 4 * blksz)) {
 /*
 * This write covers a full block. "Borrow" a buffer
 * from the dmu so that we can fill it before we enter

@ -555,18 +574,26 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
|
|||
* holding up the transaction if the data copy hangs
|
||||
* up on a pagefault (e.g., from an NFS server mapping).
|
||||
*/
|
||||
size_t cbytes;
|
||||
|
||||
abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
|
||||
max_blksz);
|
||||
blksz);
|
||||
ASSERT(abuf != NULL);
|
||||
ASSERT(arc_buf_size(abuf) == max_blksz);
|
||||
if ((error = zfs_uiocopy(abuf->b_data, max_blksz,
|
||||
UIO_WRITE, uio, &cbytes))) {
|
||||
ASSERT(arc_buf_size(abuf) == blksz);
|
||||
if ((error = zfs_uiocopy(abuf->b_data, blksz,
|
||||
UIO_WRITE, uio, &nbytes))) {
|
||||
dmu_return_arcbuf(abuf);
|
||||
break;
|
||||
}
|
||||
ASSERT3S(cbytes, ==, max_blksz);
|
||||
ASSERT3S(nbytes, ==, blksz);
|
||||
} else {
|
||||
nbytes = MIN(n, (DMU_MAX_ACCESS >> 1) -
|
||||
P2PHASE(woff, blksz));
|
||||
if (pfbytes < nbytes) {
|
||||
if (zfs_uio_prefaultpages(nbytes, uio)) {
|
||||
error = SET_ERROR(EFAULT);
|
||||
break;
|
||||
}
|
||||
pfbytes = nbytes;
|
||||
}
|
||||
}
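
The block-size selection above leans on two helpers. As a rough sketch of their semantics only (not the OpenZFS implementations): 1 << highbit64(x) yields the smallest power of two strictly greater than x, and P2ROUNDUP(x, align) rounds x up to a power-of-two alignment:

#include <stdio.h>
#include <stdint.h>

/* Smallest power of two strictly greater than x; models what
 * 1 << highbit64(x) computes for x >= 1. */
static uint64_t
next_pow2(uint64_t x)
{
	uint64_t p = 1;
	while (p <= x)
		p <<= 1;
	return (p);
}

/* Round x up to a multiple of align (align must be a power of 2),
 * the same contract as the P2ROUNDUP() macro. */
static uint64_t
p2roundup(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	printf("%llu\n", (unsigned long long)next_pow2(130048));   /* 131072 */
	printf("%llu\n", (unsigned long long)p2roundup(1000, 512)); /* 1024 */
	return (0);
}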

		/*

@ -576,8 +603,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
		DB_DNODE_ENTER(db);
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
		    MIN(n, max_blksz));
		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, nbytes);
		DB_DNODE_EXIT(db);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);

@ -600,31 +626,10 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
		 * shrink down lr_length to the appropriate size.
		 */
		if (lr->lr_length == UINT64_MAX) {
			uint64_t new_blksz;

			if (zp->z_blksz > max_blksz) {
				/*
				 * File's blocksize is already larger than the
				 * "recordsize" property. Only let it grow to
				 * the next power of 2.
				 */
				ASSERT(!ISP2(zp->z_blksz));
				new_blksz = MIN(end_size,
				    1 << highbit64(zp->z_blksz));
			} else {
				new_blksz = MIN(end_size, max_blksz);
			}
			zfs_grow_blocksize(zp, new_blksz, tx);
			zfs_grow_blocksize(zp, blksz, tx);
			zfs_rangelock_reduce(lr, woff, n);
		}

		/*
		 * XXX - should we really limit each write to z_max_blksz?
		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
		 */
		const ssize_t nbytes =
		    MIN(n, max_blksz - P2PHASE(woff, max_blksz));

		ssize_t tx_bytes;
		if (abuf == NULL) {
			tx_bytes = zfs_uio_resid(uio);

@ -644,12 +649,8 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
				 * zfs_uio_prefaultpages, or prefaultpages may
				 * error, and we may break the loop early.
				 */
				if (tx_bytes != zfs_uio_resid(uio))
					n -= tx_bytes - zfs_uio_resid(uio);
				if (zfs_uio_prefaultpages(MIN(n, max_blksz),
				    uio)) {
					break;
				}
				n -= tx_bytes - zfs_uio_resid(uio);
				pfbytes -= tx_bytes - zfs_uio_resid(uio);
				continue;
			}
#endif

@ -665,15 +666,6 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
			}
			tx_bytes -= zfs_uio_resid(uio);
		} else {
			/* Implied by abuf != NULL: */
			ASSERT3S(n, >=, max_blksz);
			ASSERT0(P2PHASE(woff, max_blksz));
			/*
			 * We can simplify nbytes to MIN(n, max_blksz) since
			 * P2PHASE(woff, max_blksz) is 0, and knowing
			 * n >= max_blksz lets us simplify further:
			 */
			ASSERT3S(nbytes, ==, max_blksz);
			/*
			 * Thus, we're writing a full block at a block-aligned
			 * offset and extending the file past EOF.

@ -758,13 +750,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
			break;
		ASSERT3S(tx_bytes, ==, nbytes);
		n -= nbytes;

		if (n > 0) {
			if (zfs_uio_prefaultpages(MIN(n, max_blksz), uio)) {
				error = SET_ERROR(EFAULT);
				break;
			}
		}
		pfbytes -= nbytes;
	}

	zfs_znode_update_vfs(zp);
@ -798,8 +798,8 @@ zil_free_lwb(zilog_t *zilog, lwb_t *lwb)
{
	ASSERT(MUTEX_HELD(&zilog->zl_lock));
	ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock));
	ASSERT(list_is_empty(&lwb->lwb_waiters));
	ASSERT(list_is_empty(&lwb->lwb_itxs));
	VERIFY(list_is_empty(&lwb->lwb_waiters));
	VERIFY(list_is_empty(&lwb->lwb_itxs));
	ASSERT(avl_is_empty(&lwb->lwb_vdev_tree));
	ASSERT3P(lwb->lwb_write_zio, ==, NULL);
	ASSERT3P(lwb->lwb_root_zio, ==, NULL);

@ -1425,6 +1425,7 @@ zil_lwb_flush_vdevs_done(zio_t *zio)

	list_move_tail(&itxs, &lwb->lwb_itxs);
	list_move_tail(&waiters, &lwb->lwb_waiters);
	txg = lwb->lwb_issued_txg;

	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE);
	lwb->lwb_state = LWB_STATE_FLUSH_DONE;

@ -1465,7 +1466,6 @@ zil_lwb_flush_vdevs_done(zio_t *zio)
	list_destroy(&waiters);

	mutex_enter(&zilog->zl_lwb_io_lock);
	txg = lwb->lwb_issued_txg;
	ASSERT3U(zilog->zl_lwb_inflight[txg & TXG_MASK], >, 0);
	zilog->zl_lwb_inflight[txg & TXG_MASK]--;
	if (zilog->zl_lwb_inflight[txg & TXG_MASK] == 0)

@ -2525,10 +2525,10 @@ zil_clean(zilog_t *zilog, uint64_t synced_txg)
 * This function will traverse the queue of itxs that need to be
 * committed, and move them onto the ZIL's zl_itx_commit_list.
 */
static void
static uint64_t
zil_get_commit_list(zilog_t *zilog)
{
	uint64_t otxg, txg;
	uint64_t otxg, txg, wtxg = 0;
	list_t *commit_list = &zilog->zl_itx_commit_list;

	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));

@ -2562,10 +2562,22 @@ zil_get_commit_list(zilog_t *zilog)
		 */
		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
		list_t *sync_list = &itxg->itxg_itxs->i_sync_list;
		if (unlikely(zilog->zl_suspend > 0)) {
			/*
			 * ZIL was just suspended, but we lost the race.
			 * Allow all earlier itxs to be committed, but ask
			 * caller to do txg_wait_synced(txg) for any new.
			 */
			if (!list_is_empty(sync_list))
				wtxg = MAX(wtxg, txg);
		} else {
			list_move_tail(commit_list, sync_list);
		}

		mutex_exit(&itxg->itxg_lock);
	}
	return (wtxg);
}
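
The wtxg return value above implements a fallback: itxs that lose the race with ZIL suspension are left uncommitted, and the highest affected txg is reported back so the caller can txg_wait_synced() it instead, as the later hunks show. A toy model of that handshake (illustrative names only, not the zil code):

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)	((a) > (b) ? (a) : (b))

/* Skip txgs that lost the race with suspension and report the
 * highest one, so the committer can fall back to waiting on it. */
static uint64_t
collect(int suspended, const int txg_has_itxs[], uint64_t ntxgs)
{
	uint64_t wtxg = 0;
	for (uint64_t txg = 1; txg <= ntxgs; txg++) {
		if (suspended && txg_has_itxs[txg - 1])
			wtxg = MAX(wtxg, txg);	/* don't commit; wait */
		/* else: move this txg's itxs to the commit list */
	}
	return (wtxg);
}

int
main(void)
{
	const int has[] = { 1, 0, 1 };
	printf("wait for txg %llu\n",
	    (unsigned long long)collect(1, has, 3));	/* -> 3 */
	return (0);
}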

/*

@ -2953,11 +2965,12 @@ zil_process_commit_list(zilog_t *zilog, zil_commit_waiter_t *zcw, list_t *ilwbs)
 * not issued, we rely on future calls to zil_commit_writer() to issue
 * the lwb, or the timeout mechanism found in zil_commit_waiter().
 */
static void
static uint64_t
zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
{
	list_t ilwbs;
	lwb_t *lwb;
	uint64_t wtxg = 0;

	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
	ASSERT(spa_writeable(zilog->zl_spa));

@ -2987,7 +3000,7 @@ zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)

	ZIL_STAT_BUMP(zilog, zil_commit_writer_count);

	zil_get_commit_list(zilog);
	wtxg = zil_get_commit_list(zilog);
	zil_prune_commit_list(zilog);
	zil_process_commit_list(zilog, zcw, &ilwbs);

@ -2996,6 +3009,7 @@ out:
	while ((lwb = list_remove_head(&ilwbs)) != NULL)
		zil_lwb_write_issue(zilog, lwb);
	list_destroy(&ilwbs);
	return (wtxg);
}

static void

@ -3511,7 +3525,7 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
	zil_commit_itx_assign(zilog, zcw);

	zil_commit_writer(zilog, zcw);
	uint64_t wtxg = zil_commit_writer(zilog, zcw);
	zil_commit_waiter(zilog, zcw);

	if (zcw->zcw_zio_error != 0) {

@ -3526,6 +3540,8 @@ zil_commit_impl(zilog_t *zilog, uint64_t foid)
		DTRACE_PROBE2(zil__commit__io__error,
		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
		txg_wait_synced(zilog->zl_dmu_pool, 0);
	} else if (wtxg != 0) {
		txg_wait_synced(zilog->zl_dmu_pool, wtxg);
	}

	zil_free_commit_waiter(zcw);

@ -3905,11 +3921,13 @@ zil_suspend(const char *osname, void **cookiep)
		return (error);
	zilog = dmu_objset_zil(os);

	mutex_enter(&zilog->zl_issuer_lock);
	mutex_enter(&zilog->zl_lock);
	zh = zilog->zl_header;

	if (zh->zh_flags & ZIL_REPLAY_NEEDED) {		/* unplayed log */
		mutex_exit(&zilog->zl_lock);
		mutex_exit(&zilog->zl_issuer_lock);
		dmu_objset_rele(os, suspend_tag);
		return (SET_ERROR(EBUSY));
	}

@ -3923,6 +3941,7 @@ zil_suspend(const char *osname, void **cookiep)
	if (cookiep == NULL && !zilog->zl_suspending &&
	    (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) {
		mutex_exit(&zilog->zl_lock);
		mutex_exit(&zilog->zl_issuer_lock);
		dmu_objset_rele(os, suspend_tag);
		return (0);
	}

@ -3931,6 +3950,7 @@ zil_suspend(const char *osname, void **cookiep)
	dsl_pool_rele(dmu_objset_pool(os), suspend_tag);

	zilog->zl_suspend++;
	mutex_exit(&zilog->zl_issuer_lock);

	if (zilog->zl_suspend > 1) {
		/*
@ -626,8 +626,6 @@ zio_unique_parent(zio_t *cio)
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.

@ -636,6 +634,7 @@ zio_add_child(zio_t *pio, zio_t *cio)
	 */
	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
	zl->zl_parent = pio;
	zl->zl_child = cio;

@ -644,8 +643,9 @@ zio_add_child(zio_t *pio, zio_t *cio)

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	uint64_t *countp = pio->io_children[cio->io_child_type];
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
		countp[w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

@ -654,6 +654,37 @@ zio_add_child(zio_t *pio, zio_t *cio)
	mutex_exit(&pio->io_lock);
}

void
zio_add_child_first(zio_t *pio, zio_t *cio)
{
	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT3S(cio->io_child_type, <=, pio->io_child_type);

	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
	zl->zl_parent = pio;
	zl->zl_child = cio;

	ASSERT(list_is_empty(&cio->io_parent_list));
	list_insert_head(&cio->io_parent_list, zl);

	mutex_enter(&pio->io_lock);

	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	uint64_t *countp = pio->io_children[cio->io_child_type];
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		countp[w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);

	mutex_exit(&pio->io_lock);
}
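
Why zio_add_child_first() can link the child's side without a lock: a freshly created zio has no parents yet and is not visible to any other thread, so only the shared parent needs its mutex. A small pthread model of the same publish-after-init pattern (illustrative types, not the zio structures):

#include <assert.h>
#include <pthread.h>
#include <stddef.h>

struct node {
	struct node	*parent;
	pthread_mutex_t	lock;
};

static void
add_child_first(struct node *pio, struct node *cio)
{
	assert(cio->parent == NULL);	/* brand new, unpublished */
	cio->parent = pio;		/* child side: no lock needed */

	pthread_mutex_lock(&pio->lock);	/* parent is shared state */
	/* ...link cio into pio's child list, update counters... */
	pthread_mutex_unlock(&pio->lock);
}

int
main(void)
{
	struct node p = { NULL, PTHREAD_MUTEX_INITIALIZER };
	struct node c = { NULL, PTHREAD_MUTEX_INITIALIZER };
	add_child_first(&p, &c);
	return (0);
}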

static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{

@ -840,12 +871,14 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
	zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
		    zio->io_child_type == ZIO_CHILD_DDT) {
			zio->io_bp_copy = *bp;
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		} else {
			zio->io_bp = (blkptr_t *)bp;
		}
		zio->io_bp_orig = *bp;
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))

@ -880,7 +913,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
		zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
		zio_add_child_first(pio, zio);
	}

	taskq_init_ent(&zio->io_tqent);

@ -1601,7 +1634,6 @@ zio_read_bp_init(zio_t *zio)
		abd_return_buf_copy(zio->io_abd, data, psize);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy);
	}

	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)

@ -4442,8 +4474,10 @@ zio_ready(zio_t *zio)
		zio->io_ready(zio);
	}

#ifdef ZFS_DEBUG
	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;
#endif

	if (zio->io_error != 0) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;