Add device rebuild feature

The device_rebuild feature enables sequential reconstruction when
resilvering.  Mirror vdevs can be rebuilt in LBA order which may
more quickly restore redundancy depending on the pool's average block
size, overall fragmentation and the performance characteristics
of the devices.  However, block checksums cannot be verified
as part of the rebuild, thus a scrub is automatically started after
the sequential resilver completes.

A new '-s' option has been added to the `zpool attach` and
`zpool replace` commands to request sequential reconstruction
instead of healing reconstruction when resilvering.

    zpool attach -s <pool> <existing vdev> <new vdev>
    zpool replace -s <pool> <old vdev> <new vdev>

The `zpool status` output has been updated to report the progress
of sequential resilvering in the same way as healing resilvering.
The one notable difference is that multiple sequential resilvers
may be in progress as long as they're operating on different
top-level vdevs.

The `zpool wait -t resilver` command was extended to wait on
sequential resilvers.  From this perspective they are no different
than healing resilvers.

Sequential resilvers cannot be supported for RAIDZ, but are
compatible with the dRAID feature being developed.

As part of this change the resilver_restart_* tests were moved
into the functional/replacement directory.  Additionally, the
replacement tests were renamed and extended to verify both
resilvering and rebuilding.

Original-patch-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: John Poduska <jpoduska@datto.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10349
This commit is contained in:
Brian Behlendorf 2020-07-03 11:05:50 -07:00 committed by GitHub
parent 7ddb753d17
commit 9a49d3f3d3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
65 changed files with 3281 additions and 362 deletions

View File

@ -437,7 +437,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
return;
}
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :

View File

@ -237,7 +237,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
dev_name, basename(spare_name));
if (zpool_vdev_attach(zhp, dev_name, spare_name,
replacement, B_TRUE) == 0) {
replacement, B_TRUE, B_FALSE) == 0) {
free(dev_name);
nvlist_free(replacement);
return (B_TRUE);
@ -319,12 +319,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
/*
* If this is a resource notifying us of device removal then simply
* check for an available spare and continue unless the device is a
* l2arc vdev, in which case we just offline it.
*/
if (strcmp(class, "resource.fs.zfs.removed") == 0) {
if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
(strcmp(class, "resource.fs.zfs.statechange") == 0 &&
state == VDEV_STATE_REMOVED)) {
char *devtype;
char *devname;
@ -365,8 +369,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
* healthy ones so we need to confirm the actual state value.
*/
if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
&state) == 0 && state == VDEV_STATE_HEALTHY) {
state == VDEV_STATE_HEALTHY) {
zfs_vdev_repair(hdl, nvl);
return;
}

View File

@ -5,10 +5,12 @@
# Exit codes:
# 1: Internal error
# 2: Script wasn't enabled in zed.rc
# 3: Scrubs are automatically started for sequential resilvers
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"
[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
[ -n "${ZEVENT_POOL}" ] || exit 1
[ -n "${ZEVENT_SUBCLASS}" ] || exit 1
zed_check_cmd "${ZPOOL}" || exit 1

View File

@ -337,7 +337,7 @@ get_usage(zpool_help_t idx)
return (gettext("\tadd [-fgLnP] [-o property=value] "
"<pool> <vdev> ...\n"));
case HELP_ATTACH:
return (gettext("\tattach [-fw] [-o property=value] "
return (gettext("\tattach [-fsw] [-o property=value] "
"<pool> <device> <new-device>\n"));
case HELP_CLEAR:
return (gettext("\tclear [-nF] <pool> [device]\n"));
@ -380,7 +380,7 @@ get_usage(zpool_help_t idx)
case HELP_ONLINE:
return (gettext("\tonline [-e] <pool> <device> ...\n"));
case HELP_REPLACE:
return (gettext("\treplace [-fw] [-o property=value] "
return (gettext("\treplace [-fsw] [-o property=value] "
"<pool> <device> [new-device]\n"));
case HELP_REMOVE:
return (gettext("\tremove [-npsw] <pool> <device> ...\n"));
@ -2077,10 +2077,10 @@ health_str_to_color(const char *health)
*/
static void
print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
nvlist_t *nv, int depth, boolean_t isspare)
nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
{
nvlist_t **child, *root;
uint_t c, children;
uint_t c, i, children;
pool_scan_stat_t *ps = NULL;
vdev_stat_t *vs;
char rbuf[6], wbuf[6], cbuf[6];
@ -2266,6 +2266,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
}
}
/* The top-level vdevs have the rebuild stats */
if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE &&
children == 0) {
if (vs->vs_rebuild_processed != 0) {
(void) printf(gettext(" (resilvering)"));
}
}
if (cb->vcdl != NULL) {
if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
printf(" ");
@ -2295,11 +2303,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
continue;
/* Provide vdev_rebuild_stats to children if available */
if (vrs == NULL) {
(void) nvlist_lookup_uint64_array(nv,
ZPOOL_CONFIG_REBUILD_STATS,
(uint64_t **)&vrs, &i);
}
vname = zpool_vdev_name(g_zfs, zhp, child[c],
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
print_status_config(zhp, cb, vname, child[c], depth + 2,
isspare);
isspare, vrs);
free(vname);
}
}
@ -2468,7 +2482,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
cb->cb_name_flags | VDEV_NAME_TYPE_ID);
if (cb->cb_print_status)
print_status_config(zhp, cb, name, child[c], 2,
B_FALSE);
B_FALSE, NULL);
else
print_import_config(cb, name, child[c], 2);
free(name);
@ -2622,6 +2636,7 @@ show_import(nvlist_t *config)
break;
case ZPOOL_STATUS_RESILVERING:
case ZPOOL_STATUS_REBUILDING:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices were "
"being resilvered.\n"));
@ -6118,6 +6133,7 @@ static int
zpool_do_attach_or_replace(int argc, char **argv, int replacing)
{
boolean_t force = B_FALSE;
boolean_t rebuild = B_FALSE;
boolean_t wait = B_FALSE;
int c;
nvlist_t *nvroot;
@ -6128,7 +6144,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
int ret;
/* check options */
while ((c = getopt(argc, argv, "fo:w")) != -1) {
while ((c = getopt(argc, argv, "fo:sw")) != -1) {
switch (c) {
case 'f':
force = B_TRUE;
@ -6146,6 +6162,9 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
(add_prop_list(optarg, propval, &props, B_TRUE)))
usage(B_FALSE);
break;
case 's':
rebuild = B_TRUE;
break;
case 'w':
wait = B_TRUE;
break;
@ -6230,7 +6249,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
return (1);
}
ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
rebuild);
if (ret == 0 && wait)
ret = zpool_wait(zhp,
@ -6244,9 +6264,10 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
}
/*
* zpool replace [-fw] [-o property=value] <pool> <device> <new_device>
* zpool replace [-fsw] [-o property=value] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
* -s Use sequential instead of healing reconstruction for resilver.
* -o Set property=value.
* -w Wait for replacing to complete before returning
*
@ -6260,9 +6281,10 @@ zpool_do_replace(int argc, char **argv)
}
/*
* zpool attach [-fw] [-o property=value] <pool> <device> <new_device>
* zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
*
* -f Force attach, even if <new_device> appears to be in use.
* -s Use sequential instead of healing reconstruction for resilver.
* -o Set property=value.
* -w Wait for resilvering to complete before returning
*
@ -7131,20 +7153,41 @@ zpool_do_trim(int argc, char **argv)
return (error);
}
/*
 * Render a duration, given in whole seconds, as human readable text.
 *
 * Durations shorter than one day are written as "HH:MM:SS"; longer ones
 * are written as "<N> days HH:MM:SS".  The result is stored in 'buf',
 * which is supplied by the caller; no length is checked here (the callers
 * visible in this file pass a 32-byte array).
 */
static void
secs_to_dhms(uint64_t total, char *buf)
{
	/* Peel off one unit at a time: seconds, then minutes, then hours. */
	uint64_t secs = total % 60;
	uint64_t whole_mins = total / 60;
	uint64_t mins = whole_mins % 60;
	uint64_t whole_hours = whole_mins / 60;
	uint64_t hours = whole_hours % 24;
	uint64_t days = whole_hours / 24;

	if (days == 0) {
		/* Less than a day: omit the "days" prefix entirely. */
		(void) sprintf(buf, "%02llu:%02llu:%02llu",
		    (u_longlong_t)hours, (u_longlong_t)mins,
		    (u_longlong_t)secs);
		return;
	}

	(void) sprintf(buf, "%llu days %02llu:%02llu:%02llu",
	    (u_longlong_t)days, (u_longlong_t)hours,
	    (u_longlong_t)mins, (u_longlong_t)secs);
}
/*
* Print out detailed scrub status.
*/
static void
print_scan_status(pool_scan_stat_t *ps)
print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
{
time_t start, end, pause;
uint64_t total_secs_left;
uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
uint64_t pass_scanned, scanned, pass_issued, issued, total;
uint64_t scan_rate, issue_rate;
uint64_t elapsed, scan_rate, issue_rate;
double fraction_done;
char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
char srate_buf[7], irate_buf[7];
char srate_buf[7], irate_buf[7], time_buf[32];
printf(" ");
printf_color(ANSI_BOLD, gettext("scan:"));
@ -7168,26 +7211,18 @@ print_scan_status(pool_scan_stat_t *ps)
/* Scan is finished or canceled. */
if (ps->pss_state == DSS_FINISHED) {
total_secs_left = end - start;
days_left = total_secs_left / 60 / 60 / 24;
hours_left = (total_secs_left / 60 / 60) % 24;
mins_left = (total_secs_left / 60) % 60;
secs_left = (total_secs_left % 60);
secs_to_dhms(end - start, time_buf);
if (ps->pss_func == POOL_SCAN_SCRUB) {
(void) printf(gettext("scrub repaired %s "
"in %llu days %02llu:%02llu:%02llu "
"with %llu errors on %s"), processed_buf,
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
(u_longlong_t)ps->pss_errors, ctime(&end));
"in %s with %llu errors on %s"), processed_buf,
time_buf, (u_longlong_t)ps->pss_errors,
ctime(&end));
} else if (ps->pss_func == POOL_SCAN_RESILVER) {
(void) printf(gettext("resilvered %s "
"in %llu days %02llu:%02llu:%02llu "
"with %llu errors on %s"), processed_buf,
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left,
(u_longlong_t)ps->pss_errors, ctime(&end));
"in %s with %llu errors on %s"), processed_buf,
time_buf, (u_longlong_t)ps->pss_errors,
ctime(&end));
}
return;
} else if (ps->pss_state == DSS_CANCELED) {
@ -7235,13 +7270,9 @@ print_scan_status(pool_scan_stat_t *ps)
scan_rate = pass_scanned / elapsed;
issue_rate = pass_issued / elapsed;
total_secs_left = (issue_rate != 0 && total >= issued) ?
uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
((total - issued) / issue_rate) : UINT64_MAX;
days_left = total_secs_left / 60 / 60 / 24;
hours_left = (total_secs_left / 60 / 60) % 24;
mins_left = (total_secs_left / 60) % 60;
secs_left = (total_secs_left % 60);
secs_to_dhms(total_secs_left, time_buf);
/* format all of the numbers we will be reporting */
zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
@ -7271,10 +7302,84 @@ print_scan_status(pool_scan_stat_t *ps)
if (pause == 0) {
if (total_secs_left != UINT64_MAX &&
issue_rate >= 10 * 1024 * 1024) {
(void) printf(gettext(", %llu days "
"%02llu:%02llu:%02llu to go\n"),
(u_longlong_t)days_left, (u_longlong_t)hours_left,
(u_longlong_t)mins_left, (u_longlong_t)secs_left);
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
"completion time\n"));
}
} else {
(void) printf(gettext("\n"));
}
}
static void
print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
{
if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
return;
printf(" ");
printf_color(ANSI_BOLD, gettext("scan:"));
printf(" ");
uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
uint64_t bytes_issued = vrs->vrs_bytes_issued;
uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
uint64_t bytes_est = vrs->vrs_bytes_est;
uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
(vrs->vrs_pass_time_ms + 1)) * 1000;
uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
(vrs->vrs_pass_time_ms + 1)) * 1000;
double scan_pct = MIN((double)bytes_scanned * 100 /
(bytes_est + 1), 100);
/* Format all of the numbers we will be reporting */
char bytes_scanned_buf[7], bytes_issued_buf[7];
char bytes_rebuilt_buf[7], bytes_est_buf[7];
char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
sizeof (bytes_scanned_buf));
zfs_nicebytes(bytes_issued, bytes_issued_buf,
sizeof (bytes_issued_buf));
zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
sizeof (bytes_rebuilt_buf));
zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
time_t start = vrs->vrs_start_time;
time_t end = vrs->vrs_end_time;
/* Rebuild is finished or canceled. */
if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) {
secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf);
(void) printf(gettext("resilvered (%s) %s in %s "
"with %llu errors on %s"), vdev_name, bytes_rebuilt_buf,
time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end));
return;
} else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) {
(void) printf(gettext("resilver (%s) canceled on %s"),
vdev_name, ctime(&end));
return;
} else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
(void) printf(gettext("resilver (%s) in progress since %s"),
vdev_name, ctime(&start));
}
assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
MAX(scan_rate, 1), time_buf);
(void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
"%s total\n"), bytes_scanned_buf, scan_rate_buf,
bytes_issued_buf, issue_rate_buf, bytes_est_buf);
(void) printf(gettext("\t%s resilvered, %.2f%% done"),
bytes_rebuilt_buf, scan_pct);
if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
if (scan_rate >= 10 * 1024 * 1024) {
(void) printf(gettext(", %s to go\n"), time_buf);
} else {
(void) printf(gettext(", no estimated "
"completion time\n"));
@ -7285,9 +7390,38 @@ print_scan_status(pool_scan_stat_t *ps)
}
/*
* As we don't scrub checkpointed blocks, we want to warn the
* user that we skipped scanning some blocks if a checkpoint exists
* or existed at any time during the scan.
* Print rebuild status for top-level vdevs.
*/
static void
print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
{
	nvlist_t **tops;
	uint_t ntops;

	/* Without a child array there are no top-level vdevs to report. */
	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &tops, &ntops) != 0)
		return;

	for (uint_t t = 0; t < ntops; t++) {
		vdev_rebuild_stat_t *stats;
		uint_t nelem;

		/* Only children that carry rebuild stats are reported. */
		if (nvlist_lookup_uint64_array(tops[t],
		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&stats,
		    &nelem) != 0)
			continue;

		/* zpool_vdev_name() hands back a string we must free. */
		char *label = zpool_vdev_name(g_zfs, zhp, tops[t],
		    VDEV_NAME_TYPE_ID);
		print_rebuild_status_impl(stats, label);
		free(label);
	}
}
/*
* As we don't scrub checkpointed blocks, we want to warn the user that we
* skipped scanning some blocks if a checkpoint exists or existed at any
* time during the scan. If a sequential instead of healing reconstruction
* was performed then the blocks were reconstructed. However, their checksums
* have not been verified so we still print the warning.
*/
static void
print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
@ -7318,6 +7452,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
}
}
/*
 * Report whether any child of 'nvroot' has a rebuild in the
 * VDEV_REBUILD_ACTIVE state.
 *
 * Returns B_TRUE as soon as an active rebuild is found; in that case
 * '*rebuild_end_time' is set to 0.  Otherwise B_FALSE is returned and
 * '*rebuild_end_time' is set to the most recent vrs_end_time found among
 * the children that carry rebuild stats (0 when none do).
 * 'rebuild_end_time' may be NULL when the caller only needs the boolean.
 */
static boolean_t
check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time)
{
	nvlist_t **tops;
	uint_t ntops;
	uint64_t latest_end = 0;
	boolean_t active = B_FALSE;

	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
	    &tops, &ntops) != 0)
		ntops = 0;

	/* Stop scanning at the first active rebuild. */
	for (uint_t t = 0; t < ntops && !active; t++) {
		vdev_rebuild_stat_t *stats;
		uint_t nelem;

		if (nvlist_lookup_uint64_array(tops[t],
		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&stats,
		    &nelem) != 0)
			continue;

		if (stats->vrs_state == VDEV_REBUILD_ACTIVE) {
			/* An end time is meaningless while rebuilding. */
			active = B_TRUE;
			latest_end = 0;
		} else if (stats->vrs_end_time > latest_end) {
			latest_end = stats->vrs_end_time;
		}
	}

	if (rebuild_end_time != NULL)
		*rebuild_end_time = latest_end;

	return (active);
}
/*
 * Print the scan status for a pool.
 *
 * The scrub status is printed whenever the pool scan stats describe a
 * scrub.  For resilvering, the healing resilver (pool scan stats) and the
 * sequential resilver (per top-level vdev rebuild stats) compete for a
 * single status slot: an active one wins, otherwise whichever finished
 * most recently is shown.  The checkpoint scan warning is evaluated last
 * with whatever scan and checkpoint stats were found (either may be NULL).
 */
static void
print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
{
	pool_scan_stat_t *scan = NULL;
	pool_checkpoint_stat_t *ckpt = NULL;
	uint64_t resilver_end = 0, rebuild_end = 0;
	boolean_t scrub_seen = B_FALSE, resilver_seen = B_FALSE;
	boolean_t resilver_running = B_FALSE;
	uint_t nelem;

	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
	    (uint64_t **)&scan, &nelem) == 0) {
		scrub_seen = (scan->pss_func == POOL_SCAN_SCRUB);
		resilver_seen = (scan->pss_func == POOL_SCAN_RESILVER);

		/* End time and state only count for a healing resilver. */
		if (resilver_seen) {
			resilver_end = scan->pss_end_time;
			resilver_running = (scan->pss_state == DSS_SCANNING);
		}
	}

	boolean_t rebuild_running = check_rebuilding(nvroot, &rebuild_end);
	boolean_t rebuild_seen = (rebuild_running || rebuild_end > 0);

	/* Always print the scrub status when available. */
	if (scrub_seen)
		print_scan_scrub_resilver_status(scan);

	/* Which of the two finished operations, if any, ended last? */
	boolean_t resilver_is_latest = (!rebuild_running && resilver_seen &&
	    resilver_end != 0 && resilver_end > rebuild_end);
	boolean_t rebuild_is_latest = (!resilver_running && rebuild_seen &&
	    rebuild_end != 0 && rebuild_end > resilver_end);

	/*
	 * An active resilver or rebuild takes priority; failing that the
	 * most recently finished one is reported.
	 */
	if (resilver_running || resilver_is_latest)
		print_scan_scrub_resilver_status(scan);
	else if (rebuild_running || rebuild_is_latest)
		print_rebuild_status(zhp, nvroot);

	(void) nvlist_lookup_uint64_array(nvroot,
	    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&ckpt, &nelem);
	print_checkpoint_scan_warning(scan, ckpt);
}
/*
* Print out detailed removal status.
*/
@ -7504,7 +7727,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
for (i = 0; i < nspares; i++) {
name = zpool_vdev_name(g_zfs, zhp, spares[i],
cb->cb_name_flags);
print_status_config(zhp, cb, name, spares[i], 2, B_TRUE);
print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL);
free(name);
}
}
@ -7524,7 +7747,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
for (i = 0; i < nl2cache; i++) {
name = zpool_vdev_name(g_zfs, zhp, l2cache[i],
cb->cb_name_flags);
print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE);
print_status_config(zhp, cb, name, l2cache[i], 2,
B_FALSE, NULL);
free(name);
}
}
@ -7718,6 +7942,7 @@ status_callback(zpool_handle_t *zhp, void *data)
break;
case ZPOOL_STATUS_RESILVERING:
case ZPOOL_STATUS_REBUILDING:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices is "
"currently being resilvered. The pool will\n\tcontinue "
@ -7727,6 +7952,16 @@ status_callback(zpool_handle_t *zhp, void *data)
"complete.\n"));
break;
case ZPOOL_STATUS_REBUILD_SCRUB:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices have "
"been sequentially resilvered, scrubbing\n\tthe pool "
"is recommended.\n"));
printf_color(ANSI_BOLD, gettext("action: "));
printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
"verify all data checksums.\n"));
break;
case ZPOOL_STATUS_CORRUPT_DATA:
printf_color(ANSI_BOLD, gettext("status: "));
printf_color(ANSI_YELLOW, gettext("One or more devices has "
@ -7951,18 +8186,16 @@ status_callback(zpool_handle_t *zhp, void *data)
nvlist_t **spares, **l2cache;
uint_t nspares, nl2cache;
pool_checkpoint_stat_t *pcs = NULL;
pool_scan_stat_t *ps = NULL;
pool_removal_stat_t *prs = NULL;
print_scan_status(zhp, nvroot);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
print_removal_status(zhp, prs);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
print_scan_status(ps);
print_checkpoint_scan_warning(ps, pcs);
print_removal_status(zhp, prs);
print_checkpoint_status(pcs);
cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
@ -7987,7 +8220,7 @@ status_callback(zpool_handle_t *zhp, void *data)
printf("\n");
print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
B_FALSE);
B_FALSE, NULL);
print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
@ -9543,6 +9776,36 @@ vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
return (bytes_remaining);
}
/*
 * Add up the total number of bytes left to rebuild across top-level vdevs.
 *
 * Only children of 'nv' whose rebuild stats are in the VDEV_REBUILD_ACTIVE
 * state contribute.  vrs_bytes_est is an estimate, so a vdev whose rebuilt
 * byte count has already reached or passed it contributes zero rather than
 * letting the unsigned subtraction wrap around to an enormous value.
 *
 * Returns the summed remaining bytes, or 0 when 'nv' has no children.
 */
static uint64_t
vdev_activity_top_remaining(nvlist_t *nv)
{
	uint64_t bytes_remaining = 0;
	nvlist_t **child;
	uint_t children;

	if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
	    &child, &children) != 0)
		children = 0;

	for (uint_t c = 0; c < children; c++) {
		vdev_rebuild_stat_t *vrs;
		uint_t i;

		if (nvlist_lookup_uint64_array(child[c],
		    ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) != 0)
			continue;

		if (vrs->vrs_state != VDEV_REBUILD_ACTIVE)
			continue;

		/* Clamp at zero: both counters are unsigned. */
		if (vrs->vrs_bytes_est > vrs->vrs_bytes_rebuilt) {
			bytes_remaining += (vrs->vrs_bytes_est -
			    vrs->vrs_bytes_rebuilt);
		}
	}

	return (bytes_remaining);
}
/* Whether any vdevs are 'spare' or 'replacing' vdevs */
static boolean_t
vdev_any_spare_replacing(nvlist_t *nv)
@ -9652,6 +9915,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
bytes_rem[ZPOOL_WAIT_SCRUB] = rem;
else
bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
} else if (check_rebuilding(nvroot, NULL)) {
bytes_rem[ZPOOL_WAIT_RESILVER] =
vdev_activity_top_remaining(nvroot);
}
bytes_rem[ZPOOL_WAIT_INITIALIZE] =

View File

@ -3507,7 +3507,16 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
ashift, NULL, 0, 0, 1);
error = spa_vdev_attach(spa, oldguid, root, replacing);
/*
* When supported select either a healing or sequential resilver.
*/
boolean_t rebuilding = B_FALSE;
if (pvd->vdev_ops == &vdev_mirror_ops ||
pvd->vdev_ops == &vdev_root_ops) {
rebuilding = !!ztest_random(2);
}
error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);
nvlist_free(root);
@ -3527,10 +3536,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
expected_error = error;
if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
error == ZFS_ERR_DISCARDING_CHECKPOINT)
error == ZFS_ERR_DISCARDING_CHECKPOINT ||
error == ZFS_ERR_RESILVER_IN_PROGRESS ||
error == ZFS_ERR_REBUILD_IN_PROGRESS)
expected_error = error;
/* XXX workaround 6690467 */
if (error != expected_error && expected_error != EBUSY) {
fatal(0, "attach (%s %llu, %s %llu, %d) "
"returned %d, expected %d",

View File

@ -368,7 +368,6 @@ AC_CONFIG_FILES([
tests/zfs-tests/tests/functional/rename_dirs/Makefile
tests/zfs-tests/tests/functional/replacement/Makefile
tests/zfs-tests/tests/functional/reservation/Makefile
tests/zfs-tests/tests/functional/resilver/Makefile
tests/zfs-tests/tests/functional/rootpool/Makefile
tests/zfs-tests/tests/functional/rsend/Makefile
tests/zfs-tests/tests/functional/scrub_mirror/Makefile

View File

@ -95,6 +95,8 @@ zfs_errno = enum_with_offset(1024, [
'ZFS_ERR_EXPORT_IN_PROGRESS',
'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR',
'ZFS_ERR_STREAM_TRUNCATED',
'ZFS_ERR_RESILVER_IN_PROGRESS',
'ZFS_ERR_REBUILD_IN_PROGRESS',
],
{}
)

View File

@ -79,7 +79,7 @@ typedef enum zfs_error {
EZFS_NODEVICE, /* no such device in pool */
EZFS_BADDEV, /* invalid device to add */
EZFS_NOREPLICAS, /* no valid replicas */
EZFS_RESILVERING, /* currently resilvering */
EZFS_RESILVERING, /* resilvering (healing reconstruction) */
EZFS_BADVERSION, /* unsupported version */
EZFS_POOLUNAVAIL, /* pool is currently unavailable */
EZFS_DEVOVERFLOW, /* too many devices in one vdev */
@ -148,6 +148,7 @@ typedef enum zfs_error {
EZFS_TRIM_NOTSUP, /* device does not support trim */
EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */
EZFS_EXPORT_IN_PROGRESS, /* currently exporting the pool */
EZFS_REBUILDING, /* resilvering (sequential reconstruction) */
EZFS_UNKNOWN
} zfs_error_t;
@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
vdev_state_t *);
extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
extern int zpool_vdev_attach(zpool_handle_t *, const char *,
const char *, nvlist_t *, int);
const char *, nvlist_t *, int, boolean_t);
extern int zpool_vdev_detach(zpool_handle_t *, const char *);
extern int zpool_vdev_remove(zpool_handle_t *, const char *);
extern int zpool_vdev_remove_cancel(zpool_handle_t *);
@ -387,6 +388,8 @@ typedef enum {
ZPOOL_STATUS_RESILVERING, /* device being resilvered */
ZPOOL_STATUS_OFFLINE_DEV, /* device offline */
ZPOOL_STATUS_REMOVED_DEV, /* removed device */
ZPOOL_STATUS_REBUILDING, /* device being rebuilt */
ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */
/*
* Finally, the following indicates a healthy pool.

View File

@ -89,6 +89,7 @@ COMMON_H = \
vdev_initialize.h \
vdev_raidz.h \
vdev_raidz_impl.h \
vdev_rebuild.h \
vdev_removal.h \
vdev_trim.h \
xvattr.h \

View File

@ -42,6 +42,8 @@ struct dsl_dataset;
struct dsl_pool;
struct dmu_tx;
extern int zfs_scan_suspend_progress;
/*
* All members of this structure must be uint64_t, for byteswap
* purposes.

View File

@ -704,6 +704,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
#define ZPOOL_CONFIG_REMOVING "removing"
#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg"
#define ZPOOL_CONFIG_COMMENT "comment"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */
@ -730,6 +731,7 @@ typedef struct zpool_load_policy {
#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */
#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */
#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */
#define ZPOOL_CONFIG_REBUILD_STATS "org.openzfs:rebuild_stats"
/*
* The persistent vdev state is stored as separate values rather than a single
@ -778,6 +780,9 @@ typedef struct zpool_load_policy {
#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \
"com.delphix:ms_unflushed_phys_txgs"
#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \
"org.openzfs:vdev_rebuild"
#define VDEV_TOP_ZAP_ALLOCATION_BIAS \
"org.zfsonlinux:allocation_bias"
@ -991,6 +996,21 @@ typedef enum dsl_scan_state {
DSS_NUM_STATES
} dsl_scan_state_t;
/*
 * Progress statistics for the sequential resilver (rebuild) of a
 * top-level vdev.  Userland obtains these by looking up the
 * ZPOOL_CONFIG_REBUILD_STATS uint64_t array in a vdev's config and casting
 * the array to this structure, so every member is a uint64_t.
 */
typedef struct vdev_rebuild_stat {
uint64_t vrs_state; /* vdev_rebuild_state_t */
uint64_t vrs_start_time; /* time_t */
uint64_t vrs_end_time; /* time_t */
uint64_t vrs_scan_time_ms; /* total run time (millisecs) */
uint64_t vrs_bytes_scanned; /* allocated bytes scanned */
uint64_t vrs_bytes_issued; /* read bytes issued */
uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */
uint64_t vrs_bytes_est; /* total bytes to scan */
uint64_t vrs_errors; /* scanning errors */
uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */
uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
uint64_t vrs_pass_bytes_issued; /* bytes issued since start/resume */
} vdev_rebuild_stat_t;
/*
* Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering
* of this enum must be maintained to ensure the errata identifiers map to
@ -1047,6 +1067,7 @@ typedef struct vdev_stat {
uint64_t vs_trim_bytes_est; /* total bytes to trim */
uint64_t vs_trim_state; /* vdev_trim_state_t */
uint64_t vs_trim_action_time; /* time_t */
uint64_t vs_rebuild_processed; /* bytes rebuilt */
} vdev_stat_t;
/*
@ -1178,6 +1199,13 @@ typedef enum {
VDEV_TRIM_COMPLETE,
} vdev_trim_state_t;
/*
 * State of a top-level vdev's sequential resilver (rebuild), as reported
 * in vdev_rebuild_stat_t's vrs_state.
 */
typedef enum {
	VDEV_REBUILD_NONE = 0,		/* no rebuild status to report */
	VDEV_REBUILD_ACTIVE = 1,	/* rebuild in progress */
	VDEV_REBUILD_CANCELED = 2,	/* rebuild was canceled */
	VDEV_REBUILD_COMPLETE = 3,	/* rebuild finished */
} vdev_rebuild_state_t;
/*
* nvlist name constants. Facilitate restricting snapshot iteration range for
* the "list next snapshot" ioctl
@ -1337,6 +1365,8 @@ typedef enum {
ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR,
ZFS_ERR_STREAM_TRUNCATED,
ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH,
ZFS_ERR_RESILVER_IN_PROGRESS,
ZFS_ERR_REBUILD_IN_PROGRESS,
} zfs_errno_t;
/*
@ -1478,7 +1508,12 @@ typedef enum {
* given payloads:
*
* ESC_ZFS_RESILVER_START
* ESC_ZFS_RESILVER_END
* ESC_ZFS_RESILVER_FINISH
*
* ZFS_EV_POOL_NAME DATA_TYPE_STRING
* ZFS_EV_POOL_GUID DATA_TYPE_UINT64
* ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING
*
* ESC_ZFS_POOL_DESTROY
* ESC_ZFS_POOL_REGUID
*
@ -1532,6 +1567,7 @@ typedef enum {
#define ZFS_EV_HIST_IOCTL "history_ioctl"
#define ZFS_EV_HIST_DSNAME "history_dsname"
#define ZFS_EV_HIST_DSID "history_dsid"
#define ZFS_EV_RESILVER_TYPE "resilver_type"
#ifdef __cplusplus
}

View File

@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
#define SPA_ASYNC_AUTOTRIM_RESTART 0x400
#define SPA_ASYNC_L2CACHE_REBUILD 0x800
#define SPA_ASYNC_L2CACHE_TRIM 0x1000
/*
* Controls the behavior of spa_vdev_remove().
*/
#define SPA_REMOVE_UNSPARE 0x01
#define SPA_REMOVE_DONE 0x02
#define SPA_ASYNC_REBUILD_DONE 0x2000
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing);
int replacing, int rebuild);
extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
int replace_done);
extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
/* Pool vdev add/remove lock */
extern uint64_t spa_vdev_enter(spa_t *spa);
extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
extern uint64_t spa_vdev_config_enter(spa_t *spa);
extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
int error, char *tag);

View File

@ -36,6 +36,7 @@
#include <sys/spa_checkpoint.h>
#include <sys/spa_log_spacemap.h>
#include <sys/vdev.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_removal.h>
#include <sys/metaslab.h>
#include <sys/dmu.h>

View File

@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
int scrub_done);
boolean_t scrub_done, boolean_t rebuild_done);
extern boolean_t vdev_dtl_required(vdev_t *vd);
extern boolean_t vdev_resilver_needed(vdev_t *vd,
uint64_t *minp, uint64_t *maxp);

View File

@ -38,6 +38,7 @@
#include <sys/uberblock_impl.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_removal.h>
#include <sys/zfs_ratelimit.h>
@ -295,13 +296,26 @@ struct vdev {
uint64_t vdev_trim_secure; /* requested secure TRIM */
uint64_t vdev_trim_action_time; /* start and end time */
/* for limiting outstanding I/Os (initialize and TRIM) */
/* Rebuild related */
boolean_t vdev_rebuilding;
boolean_t vdev_rebuild_exit_wanted;
boolean_t vdev_rebuild_cancel_wanted;
boolean_t vdev_rebuild_reset_wanted;
kmutex_t vdev_rebuild_lock;
kcondvar_t vdev_rebuild_cv;
kthread_t *vdev_rebuild_thread;
vdev_rebuild_t vdev_rebuild_config;
/* For limiting outstanding I/Os (initialize, TRIM, rebuild) */
kmutex_t vdev_initialize_io_lock;
kcondvar_t vdev_initialize_io_cv;
uint64_t vdev_initialize_inflight;
kmutex_t vdev_trim_io_lock;
kcondvar_t vdev_trim_io_cv;
uint64_t vdev_trim_inflight[3];
kmutex_t vdev_rebuild_io_lock;
kcondvar_t vdev_rebuild_io_cv;
uint64_t vdev_rebuild_inflight;
/*
* Values stored in the config for an indirect or removing vdev.
@ -358,6 +372,7 @@ struct vdev {
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_rebuild_txg; /* persistent rebuilding state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */

View File

@ -0,0 +1,97 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 2018, Intel Corporation.
* Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
*/
#ifndef _SYS_VDEV_REBUILD_H
#define _SYS_VDEV_REBUILD_H
#include <sys/spa.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
* Number of entries in the physical vdev_rebuild_phys structure. This
* state is stored per top-level vdev as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS.
*/
#define REBUILD_PHYS_ENTRIES 12
/*
* On-disk rebuild configuration and state. When adding new fields they
* must be added to the end of the structure.
*
* Every member is a uint64_t and there are currently 12 of them, which
* matches REBUILD_PHYS_ENTRIES; keep the two in sync when appending a field.
*/
typedef struct vdev_rebuild_phys {
uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */
uint64_t vrp_last_offset; /* last rebuilt offset */
uint64_t vrp_min_txg; /* minimum missing txg */
uint64_t vrp_max_txg; /* maximum missing txg */
uint64_t vrp_start_time; /* start time */
uint64_t vrp_end_time; /* end time */
uint64_t vrp_scan_time_ms; /* total run time in ms */
uint64_t vrp_bytes_scanned; /* alloc bytes scanned */
uint64_t vrp_bytes_issued; /* read bytes rebuilt */
uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */
uint64_t vrp_bytes_est; /* total bytes to scan */
uint64_t vrp_errors; /* errors during rebuild */
} vdev_rebuild_phys_t;
/*
* The vdev_rebuild_t describes the current state and how a top-level vdev
* should be rebuilt. The core elements are the top-vdev, the metaslab being
* rebuilt, range tree containing the allocated extents and the on-disk state.
*/
typedef struct vdev_rebuild {
vdev_t *vr_top_vdev; /* top-level vdev to rebuild */
metaslab_t *vr_scan_msp; /* scanning disabled metaslab */
range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */
/* In-core state and progress */
/* NOTE(review): TXG_SIZE slots, presumably indexed by txg -- confirm */
uint64_t vr_scan_offset[TXG_SIZE];
uint64_t vr_prev_scan_time_ms; /* any previous scan time */
/* Per-rebuild pass statistics for calculating bandwidth */
/* NOTE(review): time units for vr_pass_start_time are not visible here */
uint64_t vr_pass_start_time;
uint64_t vr_pass_bytes_scanned;
uint64_t vr_pass_bytes_issued;
/* On-disk state updated by vdev_rebuild_zap_update_sync() */
vdev_rebuild_phys_t vr_rebuild_phys;
} vdev_rebuild_t;
/*
* Rebuild interface. Parameter names are omitted from these prototypes.
* NOTE(review): the implementations are presumably in vdev_rebuild.c, which
* is not visible here -- confirm the exact semantics there.
*/
boolean_t vdev_rebuild_active(vdev_t *);
int vdev_rebuild_load(vdev_t *);
void vdev_rebuild(vdev_t *);
void vdev_rebuild_stop_wait(vdev_t *);
void vdev_rebuild_stop_all(spa_t *);
void vdev_rebuild_restart(spa_t *);
void vdev_rebuild_clear_sync(void *, dmu_tx_t *);
int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *);
#ifdef __cplusplus
}
#endif
#endif /* _SYS_VDEV_REBUILD_H */

View File

@ -31,6 +31,7 @@ typedef enum zio_priority {
ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */
ZIO_PRIORITY_INITIALIZING, /* initializing I/O */
ZIO_PRIORITY_TRIM, /* trim I/O (discard) */
ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */
ZIO_PRIORITY_NUM_QUEUEABLE,
ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */
} zio_priority_t;

View File

@ -74,6 +74,7 @@ typedef enum spa_feature {
SPA_FEATURE_BOOKMARK_WRITTEN,
SPA_FEATURE_LOG_SPACEMAP,
SPA_FEATURE_LIVELIST,
SPA_FEATURE_DEVICE_REBUILD,
SPA_FEATURES
} spa_feature_t;

View File

@ -2446,7 +2446,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
(void) nvlist_lookup_uint64_array(nvroot,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
if (ps && ps->pss_func == POOL_SCAN_SCRUB) {
if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
ps->pss_state == DSS_SCANNING) {
if (cmd == POOL_SCRUB_PAUSE)
return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
else
@ -3128,8 +3129,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
* If 'replacing' is specified, the new disk will replace the old one.
*/
int
zpool_vdev_attach(zpool_handle_t *zhp,
const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild)
{
zfs_cmd_t zc = {"\0"};
char msg[1024];
@ -3164,6 +3165,14 @@ zpool_vdev_attach(zpool_handle_t *zhp,
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
zc.zc_cookie = replacing;
zc.zc_simple = rebuild;
if (rebuild &&
zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"the loaded zfs module doesn't support device rebuilds"));
return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
}
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) != 0 || children != 1) {
@ -3224,16 +3233,21 @@ zpool_vdev_attach(zpool_handle_t *zhp,
uint64_t version = zpool_get_prop_int(zhp,
ZPOOL_PROP_VERSION, NULL);
if (islog)
if (islog) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a log with a spare"));
else if (version >= SPA_VERSION_MULTI_REPLACE)
} else if (rebuild) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"only mirror vdevs support sequential "
"reconstruction"));
} else if (version >= SPA_VERSION_MULTI_REPLACE) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"already in replacing/spare config; wait "
"for completion or use 'zpool detach'"));
else
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"cannot replace a replacing device"));
}
} else {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only attach to mirrors and top-level "

View File

@ -84,6 +84,8 @@ static char *zfs_msgid_table[] = {
* ZPOOL_STATUS_RESILVERING
* ZPOOL_STATUS_OFFLINE_DEV
* ZPOOL_STATUS_REMOVED_DEV
* ZPOOL_STATUS_REBUILDING
* ZPOOL_STATUS_REBUILD_SCRUB
* ZPOOL_STATUS_OK
*/
};
@ -195,7 +197,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
* - Check for any data errors
* - Check for any faulted or missing devices in a replicated config
* - Look for any devices showing errors
* - Check for any resilvering devices
* - Check for any resilvering or rebuilding devices
*
* There can obviously be multiple errors within a single pool, so this routine
* only picks the most damaging of all the current errors to report.
@ -233,6 +235,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
ps->pss_state == DSS_SCANNING)
return (ZPOOL_STATUS_RESILVERING);
/*
* Currently rebuilding a vdev, check top-level vdevs.
*/
vdev_rebuild_stat_t *vrs = NULL;
nvlist_t **child;
uint_t c, i, children;
uint64_t rebuild_end_time = 0;
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if ((nvlist_lookup_uint64_array(child[c],
ZPOOL_CONFIG_REBUILD_STATS,
(uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) {
uint64_t state = vrs->vrs_state;
if (state == VDEV_REBUILD_ACTIVE) {
return (ZPOOL_STATUS_REBUILDING);
} else if (state == VDEV_REBUILD_COMPLETE &&
vrs->vrs_end_time > rebuild_end_time) {
rebuild_end_time = vrs->vrs_end_time;
}
}
}
/*
* If we can determine when the last scrub was run, and it
* was before the last rebuild completed, then recommend
* that the pool be scrubbed to verify all checksums. When
* ps is NULL we can infer the pool has never been scrubbed.
*/
if (rebuild_end_time > 0) {
if (ps != NULL) {
if ((ps->pss_state == DSS_FINISHED &&
ps->pss_func == POOL_SCAN_SCRUB &&
rebuild_end_time > ps->pss_end_time) ||
ps->pss_state == DSS_NONE)
return (ZPOOL_STATUS_REBUILD_SCRUB);
} else {
return (ZPOOL_STATUS_REBUILD_SCRUB);
}
}
}
/*
* The multihost property is set and the pool may be active.
*/

View File

@ -286,6 +286,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
"resilver_defer feature"));
case EZFS_EXPORT_IN_PROGRESS:
return (dgettext(TEXT_DOMAIN, "pool export in progress"));
case EZFS_REBUILDING:
return (dgettext(TEXT_DOMAIN, "currently sequentially "
"resilvering"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
@ -693,6 +696,12 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
case ZFS_ERR_EXPORT_IN_PROGRESS:
zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap);
break;
case ZFS_ERR_RESILVER_IN_PROGRESS:
zfs_verror(hdl, EZFS_RESILVERING, fmt, ap);
break;
case ZFS_ERR_REBUILD_IN_PROGRESS:
zfs_verror(hdl, EZFS_REBUILDING, fmt, ap);
break;
case ZFS_ERR_IOC_CMD_UNAVAIL:
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
"module does not support this operation. A reboot may "

View File

@ -132,6 +132,7 @@ KERNEL_C = \
vdev_raidz_math_sse2.c \
vdev_raidz_math_ssse3.c \
vdev_raidz_math_powerpc_altivec.c \
vdev_rebuild.c \
vdev_removal.c \
vdev_root.c \
vdev_trim.c \

View File

@ -1862,6 +1862,30 @@ queue's min_active. See the section "ZFS I/O SCHEDULER".
Default value: \fB1,000\fR.
.RE
.sp
.ne 2
.na
\fBzfs_vdev_rebuild_max_active\fR (int)
.ad
.RS 12n
Maximum sequential resilver I/Os active to each device.
See the section "ZFS I/O SCHEDULER".
.sp
Default value: \fB3\fR.
.RE
.sp
.ne 2
.na
\fBzfs_vdev_rebuild_min_active\fR (int)
.ad
.RS 12n
Minimum sequential resilver I/Os active to each device.
See the section "ZFS I/O SCHEDULER".
.sp
Default value: \fB1\fR.
.RE
.sp
.ne 2
.na
@ -2707,6 +2731,18 @@ Include cache hits in read history
Use \fB1\fR for yes and \fB0\fR for no (default).
.RE
.sp
.ne 2
.na
\fBzfs_rebuild_max_segment\fR (ulong)
.ad
.RS 12n
Maximum read segment size to issue when sequentially resilvering a
top-level vdev.
.sp
Default value: \fB1,048,576\fR.
.RE
.sp
.ne 2
.na

View File

@ -255,6 +255,35 @@ This feature becomes \fBactive\fR when a bookmark is created and will be
returned to the \fBenabled\fR state when all bookmarks with these fields are destroyed.
.RE
.sp
.ne 2
.na
\fBdevice_rebuild\fR
.ad
.RS 4n
.TS
l l .
GUID org.openzfs:device_rebuild
READ\-ONLY COMPATIBLE yes
DEPENDENCIES none
.TE
This feature enables the ability for the \fBzpool attach\fR and \fBzpool
replace\fR subcommands to perform sequential reconstruction (instead of
healing reconstruction) when resilvering.
Sequential reconstruction resilvers a device in LBA order without immediately
verifying the checksums. Once complete a scrub is started which then verifies
the checksums. This approach allows full redundancy to be restored to the pool
in the minimum amount of time. This two phase approach will take longer than a
healing resilver when the time to verify the checksums is included. However,
unless there is additional pool damage no checksum errors should be reported
by the scrub. This feature is incompatible with raidz configurations.
This feature becomes \fBactive\fR while a sequential resilver is in progress,
and returns to \fBenabled\fR when the resilver completes.
.RE
.sp
.ne 2
.na

View File

@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd August 9, 2019
.Dd May 15, 2020
.Dt ZPOOL-ATTACH 8
.Os Linux
.Sh NAME
@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm
.Cm attach
.Op Fl fw
.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool device new_device
.Sh DESCRIPTION
@ -44,7 +44,7 @@
.It Xo
.Nm
.Cm attach
.Op Fl fw
.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool device new_device
.Xc
@ -68,22 +68,29 @@ is part of a two-way mirror, attaching
creates a three-way mirror, and so on.
In either case,
.Ar new_device
begins to resilver immediately.
begins to resilver immediately and any running scrub is cancelled.
.Bl -tag -width Ds
.It Fl f
Forces use of
.Ar new_device ,
even if it appears to be in use.
Not all devices can be overridden in this manner.
.It Fl w
Waits until
.Ar new_device
has finished resilvering before returning.
.It Fl o Ar property Ns = Ns Ar value
Sets the given pool properties. See the
.Xr zpoolprops 8
manual page for a list of valid properties that can be set. The only property
supported at the moment is ashift.
.It Fl s
The
.Ar new_device
is reconstructed sequentially to restore redundancy as quickly as possible.
Checksums are not verified during sequential reconstruction so a scrub is
started when the resilver completes.
Sequential reconstruction is not supported for raidz configurations.
.It Fl w
Waits until
.Ar new_device
has finished resilvering before returning.
.El
.El
.Sh SEE ALSO

View File

@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd August 9, 2019
.Dd May 15, 2020
.Dt ZPOOL-REPLACE 8
.Os Linux
.Sh NAME
@ -36,7 +36,7 @@
.Sh SYNOPSIS
.Nm
.Cm replace
.Op Fl fw
.Op Fl fsw
.Oo Fl o Ar property Ns = Ns Ar value Oc
.Ar pool Ar device Op Ar new_device
.Sh DESCRIPTION
@ -44,7 +44,7 @@
.It Xo
.Nm
.Cm replace
.Op Fl fw
.Op Fl fsw
.Op Fl o Ar property Ns = Ns Ar value
.Ar pool Ar device Op Ar new_device
.Xc
@ -56,6 +56,7 @@ This is equivalent to attaching
.Ar new_device ,
waiting for it to resilver, and then detaching
.Ar old_device .
Any in progress scrub will be cancelled.
.Pp
The size of
.Ar new_device
@ -86,6 +87,13 @@ Sets the given pool properties. See the
manual page for a list of valid properties that can be set.
The only property supported at the moment is
.Sy ashift .
.It Fl s
The
.Ar new_device
is reconstructed sequentially to restore redundancy as quickly as possible.
Checksums are not verified during sequential reconstruction so a scrub is
started when the resilver completes.
Sequential reconstruction is not supported for raidz configurations.
.It Fl w
Waits until the replacement has completed before returning.
.El

View File

@ -27,7 +27,7 @@
.\" Copyright 2017 Nexenta Systems, Inc.
.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
.\"
.Dd August 9, 2019
.Dd May 15, 2020
.Dt ZPOOL-STATUS 8
.Os Linux
.Sh NAME

View File

@ -251,6 +251,7 @@ SRCS+= abd.c \
vdev_raidz.c \
vdev_raidz_math.c \
vdev_raidz_math_scalar.c \
vdev_rebuild.c \
vdev_raidz_math_avx2.c \
vdev_raidz_math_avx512bw.c \
vdev_raidz_math_avx512f.c \

View File

@ -570,6 +570,11 @@ zpool_feature_init(void)
"com.datto:resilver_defer", "resilver_defer",
"Support for deferring new resilvers when one is already running.",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
"org.openzfs:device_rebuild", "device_rebuild",
"Support for sequential device rebuilds",
ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
}
#if defined(_KERNEL)

View File

@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o
$(MODULE)-objs += vdev_raidz.o
$(MODULE)-objs += vdev_raidz_math.o
$(MODULE)-objs += vdev_raidz_math_scalar.o
$(MODULE)-objs += vdev_rebuild.o
$(MODULE)-objs += vdev_removal.o
$(MODULE)-objs += vdev_root.o
$(MODULE)-objs += vdev_trim.o

View File

@ -704,8 +704,9 @@ static int
dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
{
dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
if (dsl_scan_is_running(scn))
if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
return (SET_ERROR(EBUSY));
return (0);
@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (vdev_resilver_needed(spa->spa_root_vdev,
&scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
spa_event_notify(spa, NULL, NULL,
nvlist_t *aux = fnvlist_alloc();
fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
"healing");
spa_event_notify(spa, NULL, aux,
ESC_ZFS_RESILVER_START);
nvlist_free(aux);
} else {
spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
}
@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
/*
* When starting a resilver clear any existing rebuild state.
* This is required to prevent stale rebuild status from
* being reported when a rebuild is run, then a resilver and
* finally a scrub. In which case only the scrub status
* should be reported by 'zpool status'.
*/
if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
vdev_t *rvd = spa->spa_root_vdev;
for (uint64_t i = 0; i < rvd->vdev_children; i++) {
vdev_t *vd = rvd->vdev_child[i];
vdev_rebuild_clear_sync(
(void *)(uintptr_t)vd->vdev_id, tx);
}
}
}
/* back to the generic stuff */
@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
if (complete &&
!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
scn->scn_phys.scn_max_txg, B_TRUE);
scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
if (scn->scn_phys.scn_min_txg) {
nvlist_t *aux = fnvlist_alloc();
fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
"healing");
spa_event_notify(spa, NULL, aux,
ESC_ZFS_RESILVER_FINISH);
nvlist_free(aux);
} else {
spa_event_notify(spa, NULL, NULL,
scn->scn_phys.scn_min_txg ?
ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
ESC_ZFS_SCRUB_FINISH);
}
} else {
vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
0, B_TRUE);
0, B_TRUE, B_FALSE);
}
spa_errlog_rotate(spa);

View File

@ -57,6 +57,7 @@
#include <sys/vdev_indirect_mapping.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_initialize.h>
#include <sys/vdev_rebuild.h>
#include <sys/vdev_trim.h>
#include <sys/vdev_disk.h>
#include <sys/metaslab.h>
@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa)
vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
}
/*
@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
* Propagate the leaf DTLs we just loaded all the way up the vdev tree.
*/
spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
spa_config_exit(spa, SCL_ALL, FTAG);
return (0);
@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
update_config_cache);
/*
* Check all DTLs to see if anything needs resilvering.
* Check if a rebuild was in progress and if so resume it.
* Then check all DTLs to see if anything needs resilvering.
* The resilver will be deferred if a rebuild was started.
*/
if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
if (vdev_rebuild_active(spa->spa_root_vdev)) {
vdev_rebuild_restart(spa);
} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
spa_async_request(spa, SPA_ASYNC_RESILVER);
}
/*
* Log the fact that we booted up (so that we can detect if
@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
vdev_autotrim_stop_all(spa);
vdev_rebuild_stop_all(spa);
}
/*
@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* extra rules: you can't attach to it after it's been created, and upon
* completion of resilvering, the first disk (the one being replaced)
* is automatically detached.
*
* If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
* should be performed instead of traditional healing reconstruction. From
* an administrator's perspective these are both resilver operations.
*/
int
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
int rebuild)
{
uint64_t txg, dtl_max_txg;
vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
vdev_ops_t *pvops;
char *oldvdpath, *newvdpath;
@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
return (spa_vdev_exit(spa, NULL, txg, error));
}
if (rebuild) {
if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
if (dsl_scan_resilvering(spa_get_dsl(spa)))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_RESILVER_IN_PROGRESS));
} else {
if (vdev_rebuild_active(rvd))
return (spa_vdev_exit(spa, NULL, txg,
ZFS_ERR_REBUILD_IN_PROGRESS));
}
if (spa->spa_vdev_removal != NULL)
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
if (rebuild) {
/*
* For rebuilds, the parent vdev must support reconstruction
* using only space maps. This means the only allowable
* parents are the root vdev or a mirror vdev.
*/
if (pvd->vdev_ops != &vdev_mirror_ops &&
pvd->vdev_ops != &vdev_root_ops) {
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
}
}
if (!replacing) {
/*
* For attach, the only allowable parent is a mirror or the root
@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
* than the top-level vdev.
*/
if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
/*
* If this is an in-place replacement, update oldvd's path and devid
@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
}
}
/* mark the device being resilvered */
newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
* mirror/replacing/spare vdev above oldvd.
@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
*/
dtl_max_txg = txg + TXG_CONCURRENT_STATES;
vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
dtl_max_txg - TXG_INITIAL);
vdev_dtl_dirty(newvd, DTL_MISSING,
TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
if (newvd->vdev_isspare) {
spa_spare_activate(newvd);
@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
vdev_dirty(tvd, VDD_DTL, newvd, txg);
/*
* Schedule the resilver to restart in the future. We do this to
* ensure that dmu_sync-ed blocks have been stitched into the
* respective datasets. We do not do this if resilvers have been
* deferred.
* Schedule the resilver or rebuild to restart in the future. We do
* this to ensure that dmu_sync-ed blocks have been stitched into the
* respective datasets.
*/
if (rebuild) {
newvd->vdev_rebuild_txg = txg;
vdev_rebuild(tvd);
} else {
newvd->vdev_resilver_txg = txg;
if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
vdev_defer_resilver(newvd);
else
dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg);
} else {
dsl_scan_restart_resilver(spa->spa_dsl_pool,
dtl_max_txg);
}
}
if (spa->spa_bootfs)
spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
ASSERT(spa_writeable(spa));
txg = spa_vdev_enter(spa);
txg = spa_vdev_detach_enter(spa, guid);
vd = spa_lookup_by_guid(spa, guid, B_FALSE);
@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa)
}
spa_config_exit(spa, SCL_ALL, FTAG);
/*
* If a detach was not performed above replace waiters will not have
* been notified. In which case we must do so now.
*/
spa_notify_waiters(spa);
}
/*
@ -7970,10 +8020,22 @@ spa_async_thread(void *arg)
if (tasks & SPA_ASYNC_RESILVER_DONE)
spa_vdev_resilver_done(spa);
/*
* If any devices are done replacing, detach them. Then if no
* top-level vdevs are rebuilding attempt to kick off a scrub.
*/
if (tasks & SPA_ASYNC_REBUILD_DONE) {
spa_vdev_resilver_done(spa);
if (!vdev_rebuild_active(spa->spa_root_vdev))
(void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
}
/*
* Kick off a resilver.
*/
if (tasks & SPA_ASYNC_RESILVER &&
!vdev_rebuild_active(spa->spa_root_vdev) &&
(!dsl_scan_resilvering(dp) ||
!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
dsl_scan_restart_resilver(dp, 0);
@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
DSS_SCANNING);
break;
case ZPOOL_WAIT_RESILVER:
if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
break;
/* fall through */
case ZPOOL_WAIT_SCRUB:
{
boolean_t scanning, paused, is_scrub;

View File

@ -1165,6 +1165,30 @@ spa_vdev_enter(spa_t *spa)
return (spa_vdev_config_enter(spa));
}
/*
* The same as spa_vdev_enter() above but additionally takes the guid of
* the vdev being detached. When there is a rebuild in process it will be
* suspended while the vdev tree is modified then resumed by spa_vdev_exit().
* The rebuild is canceled if only a single child remains after the detach.
*/
uint64_t
spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
{
	vdev_t *detach_vd = NULL;

	/* Take the vdev top lock first, then the namespace lock. */
	mutex_enter(&spa->spa_vdev_top_lock);
	mutex_enter(&spa_namespace_lock);

	vdev_autotrim_stop_all(spa);

	/*
	 * A zero guid means there is nothing to look up.  Otherwise, when
	 * the vdev is found, stop the rebuild on its top-level vdev and
	 * wait for it before entering the config.
	 */
	if (guid != 0)
		detach_vd = spa_lookup_by_guid(spa, guid, B_FALSE);
	if (detach_vd != NULL)
		vdev_rebuild_stop_wait(detach_vd->vdev_top);

	return (spa_vdev_config_enter(spa));
}
/*
* Internal implementation for spa_vdev_enter(). Used when a vdev
* operation requires multiple syncs (i.e. removing a device) while
@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
/*
* Reassess the DTLs.
*/
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
config_changed = B_TRUE;
@ -1271,6 +1295,7 @@ int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
vdev_autotrim_restart(spa);
vdev_rebuild_restart(spa);
spa_vdev_config_exit(spa, vd, txg, error, FTAG);
mutex_exit(&spa_namespace_lock);
@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
}
if (vd != NULL || error == 0)
vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE);
vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
if (vd != NULL) {
if (vd != spa->spa_root_vdev)

View File

@ -39,6 +39,7 @@
#include <sys/dmu_tx.h>
#include <sys/dsl_dir.h>
#include <sys/vdev_impl.h>
#include <sys/vdev_rebuild.h>
#include <sys/uberblock_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
0);
}
txg_list_create(&vd->vdev_ms_list, spa,
offsetof(struct metaslab, ms_txg_node));
txg_list_create(&vd->vdev_dtl_list, spa,
@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
&vd->vdev_resilver_txg);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
&vd->vdev_rebuild_txg);
if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
vdev_defer_resilver(vd);
@ -890,6 +902,7 @@ vdev_free(vdev_t *vd)
ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
ASSERT3P(vd->vdev_trim_thread, ==, NULL);
ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
/*
* Scan queues are normally destroyed at the end of a scan. If the
@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd)
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
mutex_destroy(&vd->vdev_scan_io_queue_lock);
mutex_destroy(&vd->vdev_initialize_lock);
mutex_destroy(&vd->vdev_initialize_io_lock);
cv_destroy(&vd->vdev_initialize_io_cv);
cv_destroy(&vd->vdev_initialize_cv);
mutex_destroy(&vd->vdev_trim_lock);
mutex_destroy(&vd->vdev_autotrim_lock);
mutex_destroy(&vd->vdev_trim_io_lock);
@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd)
cv_destroy(&vd->vdev_autotrim_cv);
cv_destroy(&vd->vdev_trim_io_cv);
mutex_destroy(&vd->vdev_rebuild_lock);
mutex_destroy(&vd->vdev_rebuild_io_lock);
cv_destroy(&vd->vdev_rebuild_cv);
cv_destroy(&vd->vdev_rebuild_io_cv);
zfs_ratelimit_fini(&vd->vdev_delay_rl);
zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
ASSERT0(tvd->vdev_removing);
ASSERT0(tvd->vdev_rebuilding);
tvd->vdev_removing = svd->vdev_removing;
tvd->vdev_rebuilding = svd->vdev_rebuilding;
tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
tvd->vdev_indirect_config = svd->vdev_indirect_config;
tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
tvd->vdev_indirect_births = svd->vdev_indirect_births;
@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
svd->vdev_indirect_births = NULL;
svd->vdev_obsolete_sm = NULL;
svd->vdev_removing = 0;
svd->vdev_rebuilding = 0;
for (t = 0; t < TXG_SIZE; t++) {
while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd)
* excise the DTLs.
*/
static boolean_t
vdev_dtl_should_excise(vdev_t *vd)
vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
{
spa_t *spa = vd->vdev_spa;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
ASSERT0(vd->vdev_children);
if (vd->vdev_state < VDEV_STATE_DEGRADED)
@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd)
if (vd->vdev_resilver_deferred)
return (B_FALSE);
if (vd->vdev_resilver_txg == 0 ||
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
return (B_TRUE);
if (rebuild_done) {
vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
/* Rebuild not initiated by attach */
if (vd->vdev_rebuild_txg == 0)
return (B_TRUE);
/*
* When a resilver is initiated the scan will assign the scn_max_txg
* value to the highest txg value that exists in all DTLs. If this
* device's max DTL is not part of this scan (i.e. it is not in
* the range (scn_min_txg, scn_max_txg] then it is not eligible
* for excision.
* When a rebuild completes without error then all missing data
* up to the rebuild max txg has been reconstructed and the DTL
* is eligible for excision.
*/
if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
return (B_TRUE);
}
} else {
dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
/* Resilver not initiated by attach */
if (vd->vdev_resilver_txg == 0)
return (B_TRUE);
/*
* When a resilver is initiated the scan will assign the
* scn_max_txg value to the highest txg value that exists
* in all DTLs. If this device's max DTL is not part of this
* scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
* then it is not eligible for excision.
*/
if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
return (B_TRUE);
}
}
return (B_FALSE);
}
@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd)
* write operations will be issued to the pool.
*/
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
boolean_t scrub_done, boolean_t rebuild_done)
{
spa_t *spa = vd->vdev_spa;
avl_tree_t reftree;
@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
for (int c = 0; c < vd->vdev_children; c++)
vdev_dtl_reassess(vd->vdev_child[c], txg,
scrub_txg, scrub_done);
scrub_txg, scrub_done, rebuild_done);
if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
return;
if (vd->vdev_ops->vdev_op_leaf) {
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
boolean_t check_excise = B_FALSE;
boolean_t wasempty = B_TRUE;
mutex_enter(&vd->vdev_dtl_lock);
/*
* If requested, pretend the scan completed cleanly.
* If requested, pretend the scan or rebuild completed cleanly.
*/
if (zfs_scan_ignore_errors && scn)
if (zfs_scan_ignore_errors) {
if (scn != NULL)
scn->scn_phys.scn_errors = 0;
if (vr != NULL)
vr->vr_rebuild_phys.vrp_errors = 0;
}
if (scrub_txg != 0 &&
!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
}
/*
* If we've completed a scan cleanly then determine
* if this vdev should remove any DTLs. We only want to
* excise regions on vdevs that were available during
* the entire duration of this scan.
* If we've completed a scrub/resilver or a rebuild cleanly
* then determine if this vdev should remove any DTLs. We
* only want to excise regions on vdevs that were available
* during the entire duration of this scan.
*/
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
(scn != NULL && scn->scn_phys.scn_errors == 0)) &&
vdev_dtl_should_excise(vd)) {
if (rebuild_done &&
vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
check_excise = B_TRUE;
} else {
if (spa->spa_scrub_started ||
(scn != NULL && scn->scn_phys.scn_errors == 0)) {
check_excise = B_TRUE;
}
}
if (scrub_txg && check_excise &&
vdev_dtl_should_excise(vd, rebuild_done)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
* will be valid, so excise the old region and
* fold in the scrub dtl. Otherwise, leave the
* dtl as-is if there was an error.
* We completed a scrub, resilver or rebuild up to
* scrub_txg. If we did it without rebooting, then
* the scrub dtl will be valid, so excise the old
* region and fold in the scrub dtl. Otherwise,
* leave the dtl as-is if there was an error.
*
* There's little trick here: to excise the beginning
* of the DTL_MISSING map, we put it into a reference
@ -2711,16 +2776,21 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
/*
* If the vdev was resilvering and no longer has any
* DTLs then reset its resilvering flag and dirty
* If the vdev was resilvering or rebuilding and no longer
* has any DTLs then reset the appropriate flag and dirty
* the top level so that we persist the change.
*/
if (txg != 0 && vd->vdev_resilver_txg != 0 &&
if (txg != 0 &&
range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
if (vd->vdev_rebuild_txg != 0) {
vd->vdev_rebuild_txg = 0;
vdev_config_dirty(vd->vdev_top);
} else if (vd->vdev_resilver_txg != 0) {
vd->vdev_resilver_txg = 0;
vdev_config_dirty(vd->vdev_top);
}
}
mutex_exit(&vd->vdev_dtl_lock);
@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd)
* If not, we can safely offline/detach/remove the device.
*/
vd->vdev_cant_read = B_TRUE;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
vd->vdev_cant_read = cant_read;
vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
if (!required && zio_injection_enabled) {
required = !!zio_handle_device_injection(vd, NULL,
@ -3065,6 +3135,20 @@ vdev_load(vdev_t *vd)
}
}
/*
* Load any rebuild state from the top-level vdev zap.
*/
if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
error = vdev_rebuild_load(vd);
if (error && error != ENOTSUP) {
vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
"failed [error=%d]", error);
return (error);
}
}
/*
* If this is a top-level vdev, initialize its metaslabs.
*/
@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
vs->vs_state = vd->vdev_state;
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf) {
vs->vs_rsize += VDEV_LABEL_START_SIZE +
VDEV_LABEL_END_SIZE;
@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
vs->vs_trim_state = vd->vdev_trim_state;
vs->vs_trim_action_time = vd->vdev_trim_action_time;
/* Set when there is a deferred resilver. */
vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
/*
* Report expandable space on top-level, non-auxiliary devices
* only. The expandable space is reported in terms of metaslab
@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
vd->vdev_max_asize - vd->vdev_asize,
1ULL << tvd->vdev_ms_shift);
}
/*
* Report fragmentation and rebuild progress for top-level,
* non-auxiliary, concrete devices.
*/
if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
vdev_is_concrete(vd)) {
vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
vd->vdev_mg->mg_fragmentation : 0;
}
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
}
vdev_get_stats_ex_impl(vd, vs, vsx);
@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
mutex_enter(&vd->vdev_stat_lock);
if (flags & ZIO_FLAG_IO_REPAIR) {
/*
* Repair is the result of a resilver issued by the
* scan thread (spa_sync).
*/
if (flags & ZIO_FLAG_SCAN_THREAD) {
dsl_scan_phys_t *scn_phys =
&spa->spa_dsl_pool->dp_scan->scn_phys;
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dsl_scan_phys_t *scn_phys = &scn->scn_phys;
uint64_t *processed = &scn_phys->scn_processed;
/* XXX cleanup? */
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(processed, psize);
vs->vs_scan_processed += psize;
}
/*
* Repair is the result of a rebuild issued by the
* rebuild thread (vdev_rebuild_thread).
*/
if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
vdev_t *tvd = vd->vdev_top;
vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
if (vd->vdev_ops->vdev_op_leaf)
atomic_add_64(rebuilt, psize);
vs->vs_rebuild_processed += psize;
}
if (flags & ZIO_FLAG_SELF_HEAL)
vs->vs_self_healed += psize;
}
@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (vd->vdev_ops->vdev_op_leaf &&
(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
zio_type_t vs_type = type;
zio_priority_t priority = zio->io_priority;
/*
* TRIM ops and bytes are reported to user space as
@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
if (type == ZIO_TYPE_TRIM)
vs_type = ZIO_TYPE_IOCTL;
/*
* Solely for the purposes of 'zpool iostat -lqrw'
* reporting use the priority to categorize the IO.
* Only the following are reported to user space:
*
* ZIO_PRIORITY_SYNC_READ,
* ZIO_PRIORITY_SYNC_WRITE,
* ZIO_PRIORITY_ASYNC_READ,
* ZIO_PRIORITY_ASYNC_WRITE,
* ZIO_PRIORITY_SCRUB,
* ZIO_PRIORITY_TRIM.
*/
if (priority == ZIO_PRIORITY_REBUILD) {
priority = ((type == ZIO_TYPE_WRITE) ?
ZIO_PRIORITY_ASYNC_WRITE :
ZIO_PRIORITY_SCRUB);
} else if (priority == ZIO_PRIORITY_INITIALIZING) {
ASSERT3U(type, ==, ZIO_TYPE_WRITE);
priority = ZIO_PRIORITY_ASYNC_WRITE;
} else if (priority == ZIO_PRIORITY_REMOVAL) {
priority = ((type == ZIO_TYPE_WRITE) ?
ZIO_PRIORITY_ASYNC_WRITE :
ZIO_PRIORITY_ASYNC_READ);
}
vs->vs_ops[vs_type]++;
vs->vs_bytes[vs_type] += psize;
if (flags & ZIO_FLAG_DELEGATED) {
vsx->vsx_agg_histo[zio->io_priority]
vsx->vsx_agg_histo[priority]
[RQ_HISTO(zio->io_size)]++;
} else {
vsx->vsx_ind_histo[zio->io_priority]
vsx->vsx_ind_histo[priority]
[RQ_HISTO(zio->io_size)]++;
}
if (zio->io_delta && zio->io_delay) {
vsx->vsx_queue_histo[zio->io_priority]
vsx->vsx_queue_histo[priority]
[L_HISTO(zio->io_delta - zio->io_delay)]++;
vsx->vsx_disk_histo[type]
[L_HISTO(zio->io_delay)]++;

View File

@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
}
}
/*
 * Add rebuild statistics to the config nvlist of a top-level vdev.
 *
 * Nothing is added unless vd is its own top-level vdev and
 * vdev_rebuild_get_stats() returns 0.  On success the stats structure is
 * stored under ZPOOL_CONFIG_REBUILD_STATS as an array of uint64_t values.
 */
static void
top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
{
	vdev_rebuild_stat_t vrs;

	if (vd != vd->vdev_top)
		return;

	if (vdev_rebuild_get_stats(vd, &vrs) != 0)
		return;

	/* The element count is the struct size in units of uint64_t. */
	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_REBUILD_STATS,
	    (uint64_t *)&vrs, sizeof (vrs) / sizeof (uint64_t));
}
/*
* Generate the nvlist representing this vdev's config.
*/
@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
vdev_config_generate_stats(vd, nv);
root_vdev_actions_getprogress(vd, nv);
top_vdev_actions_getprogress(vd, nv);
/*
* Note: this can be called from open context
@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_resilver_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
vd->vdev_resilver_txg);
if (vd->vdev_rebuild_txg != 0)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
vd->vdev_rebuild_txg);
if (vd->vdev_faulted)
fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)

View File

@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio)
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_abd, zio->io_size,
ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
zio->io_priority == ZIO_PRIORITY_REBUILD ?
ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}

View File

@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1;
uint32_t zfs_vdev_initializing_max_active = 1;
uint32_t zfs_vdev_trim_min_active = 1;
uint32_t zfs_vdev_trim_max_active = 2;
uint32_t zfs_vdev_rebuild_min_active = 1;
uint32_t zfs_vdev_rebuild_max_active = 3;
/*
* When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p)
return (zfs_vdev_initializing_min_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_min_active);
case ZIO_PRIORITY_REBUILD:
return (zfs_vdev_rebuild_min_active);
default:
panic("invalid priority %u", p);
return (0);
@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
return (zfs_vdev_initializing_max_active);
case ZIO_PRIORITY_TRIM:
return (zfs_vdev_trim_max_active);
case ZIO_PRIORITY_REBUILD:
return (zfs_vdev_rebuild_max_active);
default:
panic("invalid priority %u", p);
return (0);
@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio)
zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
zio->io_priority != ZIO_PRIORITY_SCRUB &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
}
} else if (zio->io_type == ZIO_TYPE_WRITE) {
@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio)
if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
zio->io_priority != ZIO_PRIORITY_REMOVAL &&
zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
zio->io_priority != ZIO_PRIORITY_REBUILD) {
zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
}
} else {
@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
"Min active trim/discard I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
"Max active rebuild I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
"Min active rebuild I/Os per vdev");
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
"Queue depth percentage for each top-level vdev");
/* END CSTYLED */

1106
module/zfs/vdev_rebuild.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1938,8 +1938,9 @@ static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
int replacing = zc->zc_cookie;
nvlist_t *config;
int replacing = zc->zc_cookie;
int rebuild = zc->zc_simple;
int error;
if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
zc->zc_iflags, &config)) == 0) {
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
rebuild);
nvlist_free(config);
}

View File

@ -487,7 +487,8 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing',
tags = ['functional', 'cli_root', 'zpool_wait']
[tests/functional/cli_root/zpool_wait/scan]
tests = ['zpool_wait_replace_cancel', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
tags = ['functional', 'cli_root', 'zpool_wait']
@ -748,7 +749,11 @@ tests = ['rename_dirs_001_pos']
tags = ['functional', 'rename_dirs']
[tests/functional/replacement]
tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos']
tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
'attach_resilver', 'detach', 'rebuild_disabled_feature',
'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
'scrub_cancel']
tags = ['functional', 'replacement']
[tests/functional/reservation]
@ -762,10 +767,6 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
'reservation_022_pos']
tags = ['functional', 'reservation']
[tests/functional/resilver]
tests = ['resilver_restart_001', 'resilver_restart_002']
tags = ['functional', 'resilver']
[tests/functional/rootpool]
tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
tags = ['functional', 'rootpool']

View File

@ -2222,26 +2222,27 @@ function check_pool_status # pool token keyword <verbose>
if [[ $verbose == true ]]; then
log_note $scan
fi
echo $scan | grep -i "$keyword" > /dev/null 2>&1
echo $scan | egrep -i "$keyword" > /dev/null 2>&1
return $?
}
#
# The following functions are instance of check_pool_status()
# is_pool_resilvering - to check if the pool is resilver in progress
# is_pool_resilvered - to check if the pool is resilver completed
# is_pool_scrubbing - to check if the pool is scrub in progress
# is_pool_scrubbed - to check if the pool is scrub completed
# is_pool_scrub_stopped - to check if the pool is scrub stopped
# is_pool_scrub_paused - to check if the pool has scrub paused
# is_pool_removing - to check if the pool is removing a vdev
# is_pool_removed - to check if the pool is remove completed
# is_pool_discarding - to check if the pool has checkpoint being discarded
# is_pool_resilvering - to check if the pool resilver is in progress
# is_pool_resilvered - to check if the pool resilver is completed
# is_pool_scrubbing - to check if the pool scrub is in progress
# is_pool_scrubbed - to check if the pool scrub is completed
# is_pool_scrub_stopped - to check if the pool scrub is stopped
# is_pool_scrub_paused - to check if the pool scrub has paused
# is_pool_removing - to check if the pool is removing a vdev
# is_pool_removed - to check if the pool remove is completed
# is_pool_discarding - to check if the pool checkpoint is being discarded
#
function is_pool_resilvering #pool <verbose>
{
check_pool_status "$1" "scan" "resilver in progress since " $2
check_pool_status "$1" "scan" \
"resilver[ ()0-9A-Za-z_-]* in progress since" $2
return $?
}
@ -3487,7 +3488,7 @@ function wait_scrubbed
typeset pool=${1:-$TESTPOOL}
while true ; do
is_pool_scrubbed $pool && break
log_must sleep 1
sleep 1
done
}

View File

@ -65,7 +65,6 @@ SUBDIRS = \
rename_dirs \
replacement \
reservation \
resilver \
rootpool \
rsend \
scrub_mirror \

View File

@ -79,6 +79,7 @@ typeset -a properties=(
"feature@redacted_datasets"
"feature@bookmark_written"
"feature@log_spacemap"
"feature@device_rebuild"
)
if is_linux || is_freebsd; then

View File

@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \
cleanup.ksh \
zpool_wait_replace.ksh \
zpool_wait_replace_cancel.ksh \
zpool_wait_rebuild.ksh \
zpool_wait_resilver.ksh \
zpool_wait_scrub_basic.ksh \
zpool_wait_scrub_cancel.ksh \

View File

@ -0,0 +1,64 @@
#!/bin/ksh -p
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2018 by Delphix. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib
#
# DESCRIPTION:
# 'zpool wait' works when waiting for sequential resilvering to complete.
#
# STRATEGY:
# 1. Attach a device to the pool so that sequential resilvering starts.
# 2. Start 'zpool wait'.
# 3. Monitor the waiting process to make sure it returns neither too soon nor
# too late.
# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using
# 'zpool wait'.
#
#
# Exit handler for this test: calls remove_io_delay, calls kill_if_running
# on the background process recorded in $pid, and detaches $DISK2 from
# $TESTPOOL when it appears in the pool's disk list.
#
function cleanup
{
remove_io_delay
kill_if_running $pid
# Only detach when grep finds $DISK2 in the get_disklist output.
get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \
log_must zpool detach $TESTPOOL $DISK2
}
# Command string handed to check_while_waiting for both test cases below.
typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL"
# Pid of the most recent background command; also read by cleanup.
typeset pid
log_onexit cleanup
add_io_delay $TESTPOOL
# Test 'zpool wait -t resilver'
log_must zpool attach -s $TESTPOOL $DISK1 $DISK2
log_bkgrnd zpool wait -t resilver $TESTPOOL
pid=$!
check_while_waiting $pid "$IN_PROGRESS_CHECK"
# Detach again so the same disk can be attached in the second case.
log_must zpool detach $TESTPOOL $DISK2
# Test 'zpool attach -w'
log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2
pid=$!
# Poll until is_pool_resilvering succeeds or the background process is gone.
while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do
log_must sleep .5
done
check_while_waiting $pid "$IN_PROGRESS_CHECK"
log_pass "'zpool wait -t resilver' and 'zpool attach -w' work."

View File

@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
replacement_001_pos.ksh \
replacement_002_pos.ksh \
replacement_003_pos.ksh
attach_import.ksh \
attach_multiple.ksh \
attach_rebuild.ksh \
attach_resilver.ksh \
detach.ksh \
rebuild_disabled_feature.ksh \
rebuild_multiple.ksh \
rebuild_raidz.ksh \
replace_import.ksh \
replace_rebuild.ksh \
replace_resilver.ksh \
resilver_restart_001.ksh \
resilver_restart_002.ksh \
scrub_cancel.ksh
dist_pkgdata_DATA = \
replacement.cfg

View File

@ -0,0 +1,67 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# Description:
# Verify that on import an in progress attach operation is resumed.
#
# Strategy:
# 1. For both healing and sequential resilvering.
# a. Create a pool
# b. Add a vdev with 'zpool attach' and resilver (-s) it.
# c. Export the pool
# d. Import the pool
# e. Verify the 'zpool attach' resumed resilvering
# f. Destroy the pool
#
#
# Exit handler for this test: restores the SCAN_SUSPEND_PROGRESS tunable to
# the value saved in ORIG_SCAN_SUSPEND_PROGRESS, destroys $TESTPOOL1 and
# removes the files listed in VDEV_FILES.
#
function cleanup
{
log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
$ORIG_SCAN_SUSPEND_PROGRESS
destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]}
}
log_assert "Verify attach is resumed on import"
# Save the current tunable value; cleanup and the loop below restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
log_onexit cleanup
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}
# Verify healing and sequential resilver resume on import.
# An empty arg runs a plain 'zpool attach'; "-s" requests sequential
# resilvering.
for arg in "" "-s"; do
log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
# NOTE(review): presumably setting this tunable keeps the resilver from
# finishing so it is still in progress across the export/import - confirm.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]}
log_must is_pool_resilvering $TESTPOOL1
log_must zpool export $TESTPOOL1
log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
# The resilver must be reported as in progress again after the import.
log_must is_pool_resilvering $TESTPOOL1
log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
log_must zpool wait -t resilver $TESTPOOL1
log_must is_pool_resilvered $TESTPOOL1
destroy_pool $TESTPOOL1
done
log_pass "Verify attach is resumed on import"

View File

@ -0,0 +1,111 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# Description:
# Verify that attach/detach work while resilvering and attaching
# multiple vdevs.
#
# Strategy:
# 1. Create a single vdev pool
# 2. While healing or sequential resilvering:
# a. Attach a vdev to convert the pool to a mirror.
# b. Attach a vdev to convert the pool to a 3-way mirror.
# c. Verify the original vdev cannot be removed (no redundant copies)
# d. Detach a vdev. Healing and sequential resilver remain running.
# e. Detach a vdev. Healing resilver remains running, sequential
# resilver is canceled.
# f. Wait for resilver to complete.
#
#
# Exit handler for this test: restores the SCAN_SUSPEND_PROGRESS tunable to
# the value saved in ORIG_SCAN_SUSPEND_PROGRESS, destroys $TESTPOOL1 and
# removes the files listed in VDEV_FILES.
#
function cleanup
{
log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
$ORIG_SCAN_SUSPEND_PROGRESS
destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]}
}
log_assert "Verify attach/detach with multiple vdevs"

# Save the current tunable value so cleanup can restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)

log_onexit cleanup

log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}

# Create the single vdev pool that the vdevs are attached to below.
log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
for replace_mode in "healing" "sequential"; do
	#
	# Resilvers abort the dsl_scan and reconfigure it for resilvering.
	# Rebuilds cancel the dsl_scan and start the vdev_rebuild thread.
	#
	if [[ "$replace_mode" = "healing" ]]; then
		flags=""
	else
		flags="-s"
	fi

	log_mustnot is_pool_resilvering $TESTPOOL1
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1

	# Attach first vdev (stripe -> mirror)
	log_must zpool attach $flags $TESTPOOL1 \
	    ${VDEV_FILES[0]} ${VDEV_FILES[1]}
	log_must is_pool_resilvering $TESTPOOL1

	# Attach second vdev (2-way -> 3-way mirror)
	log_must zpool attach $flags $TESTPOOL1 \
	    ${VDEV_FILES[1]} ${VDEV_FILES[2]}
	log_must is_pool_resilvering $TESTPOOL1

	# Original vdev cannot be detached until there is sufficient
	# redundancy.
	log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]}

	# Detach first vdev (resilver keeps running)
	log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]}
	log_must is_pool_resilvering $TESTPOOL1

	#
	# Detach second vdev.  There's a difference in behavior between
	# healing and sequential resilvers.  A healing resilver will not be
	# cancelled even though there's nothing on the original vdev which
	# needs to be rebuilt.  A sequential resilver on the other hand is
	# canceled when returning to a non-redundant striped layout.  At
	# some point the healing resilver behavior should be updated to match
	# the sequential resilver behavior.
	#
	log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]}

	if [[ "$replace_mode" = "healing" ]]; then
		log_must is_pool_resilvering $TESTPOOL1
	else
		log_mustnot is_pool_resilvering $TESTPOOL1
	fi

	# Restore the tunable before waiting on the pool.
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
	    $ORIG_SCAN_SUSPEND_PROGRESS
	log_must zpool wait $TESTPOOL1
done

log_pass "Verify attach/detach with multiple vdevs"

View File

@ -0,0 +1,173 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Attaching disks during I/O should pass for supported pools.
#
# STRATEGY:
# 1. Create multidisk pools (stripe/mirror/raidz) and
# start some random I/O
# 2. Attach a disk to the pool.
# 3. Verify the integrity of the file system and the resilvering.
#
# NOTE: Raidz does not support the sequential resilver (-s) option.
#
verify_runnable "global"
#
# Exit handler for this test: kills every pid still listed in child_pids,
# destroys $TESTPOOL1 if it exists, and empties $TESTDIR if it exists.
#
function cleanup
{
# child_pids is filled by attach_test and cleared once it has killed them.
if [[ -n "$child_pids" ]]; then
for wait_pid in $child_pids; do
kill $wait_pid
done
fi
if poolexists $TESTPOOL1; then
destroy_pool $TESTPOOL1
fi
[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
log_assert "Attaching a disk during I/O completes."

# Arguments for the background file_trunc writers started by attach_test.
options=""
options_display="default options"

log_onexit cleanup

# Each HOLES_* variable that is set adds the matching file_trunc option.
[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "

[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "

[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "

[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "

[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "

# -r is always appended, so options is never empty past this point.
options="$options -r "

[[ -n "$options" ]] && options_display=$options

# Pids of running file_trunc writers; read by cleanup and attach_test.
child_pids=""
#
# Attach a device to $TESTPOOL1 with a sequential resilver while background
# file_trunc writers are running, then verify the pool.
#
# $1 - extra option passed to 'zpool attach' (may be empty)
# $2 - existing device to attach to
# $3 - new device to attach
#
function attach_test
{
	typeset -i iters=2
	typeset opt=$1
	typeset disk1=$2
	typeset disk2=$3
	typeset i=0

	# Start the background writers and record their pids in child_pids.
	while [[ $i -lt $iters ]]; do
		log_note "Invoking file_trunc with: $options_display"
		file_trunc $options $TESTDIR/$TESTFILE.$i &
		typeset pid=$!

		sleep 1

		child_pids="$child_pids $pid"
		((i = i + 1))
	done

	# -s requests a sequential resilver; -w waits for it to complete.
	log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2

	for wait_pid in $child_pids; do
		kill $wait_pid
	done
	child_pids=""

	# Export and re-import, then run zdb on the unmounted dataset.
	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1
	verify_pool $TESTPOOL1
}
# Create three files of $MINVDEVSIZE with truncate and collect their paths
# in specials_list, which is passed to create_pool below.
specials_list=""
i=0
while [[ $i != 3 ]]; do
truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
done
#
# Create a replacement disk special file.
#
truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
# Run the attach against a mirror pool, once without and once with -f.
for op in "" "-f"; do
	create_pool $TESTPOOL1 mirror $specials_list
	log_must zfs create $TESTPOOL1/$TESTFS1
	log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

	# Pass the loop variable so the -f iteration really uses -f.
	attach_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE

	# The newly attached file must be listed in the pool.
	zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
	if [[ $? -ne 0 ]]; then
		log_fail "$REPLACEFILE is not present."
	fi

	destroy_pool $TESTPOOL1
done
# For each non-mirror layout, expect 'zpool attach -s' to fail and the
# replacement file to be absent from the pool afterwards.
log_note "Verify 'zpool attach' fails with non-mirrors."
for type in "" "raidz" "raidz1"; do
for op in "" "-f"; do
create_pool $TESTPOOL1 $type $specials_list
log_must zfs create $TESTPOOL1/$TESTFS1
log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
# NOTE(review): the loop variable is "op" but "$opt" is expanded here, so
# the -f case appears never to be exercised. In addition no pool name is
# passed to 'zpool attach', so the command is expected to fail whatever
# the vdev type is. Confirm the intended failure is what is being tested
# before adding $TESTPOOL1, since that may change which cases fail.
log_mustnot zpool attach -s "$opt" $TESTDIR/$TESTFILE1.1 \
$TESTDIR/$REPLACEFILE
zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -eq 0 ]]; then
log_fail "$REPLACEFILE should not be present."
fi
destroy_pool $TESTPOOL1
done
done
log_pass

View File

@ -104,9 +104,7 @@ function attach_test
((i = i + 1))
done
log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2
sleep 10
log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2
for wait_pid in $child_pids
do
@ -119,13 +117,13 @@ function attach_test
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
verify_pool $TESTPOOL1
}
specials_list=""
i=0
while [[ $i != 2 ]]; do
mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
while [[ $i != 3 ]]; do
truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
@ -134,7 +132,7 @@ done
#
# Create a replacement disk special file.
#
mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE
truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
for op in "" "-f"; do
create_pool $TESTPOOL1 mirror $specials_list
@ -143,7 +141,7 @@ for op in "" "-f"; do
attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -ne 0 ]]; then
log_fail "$REPLACEFILE is not present."
fi
@ -162,7 +160,7 @@ for type in "" "raidz" "raidz1"; do
log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \
$TESTDIR/$REPLACEFILE
zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -eq 0 ]]; then
log_fail "$REPLACEFILE should not be present."
fi

View File

@ -121,8 +121,8 @@ function detach_test
specials_list=""
i=0
while [[ $i != 2 ]]; do
mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
while [[ $i != 3 ]]; do
truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
((i = i + 1))
@ -134,7 +134,7 @@ log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
detach_test $TESTDIR/$TESTFILE1.1
zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
if [[ $? -eq 0 ]]; then
log_fail "$TESTFILE1.1 should no longer be present."
fi
@ -150,7 +150,7 @@ for type in "" "raidz" "raidz1" ; do
log_mustnot zpool detach $TESTDIR/$TESTFILE1.1
zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
if [[ $? -ne 0 ]]; then
log_fail "$TESTFILE1.1 is not present."
fi

View File

@ -0,0 +1,78 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# Description:
# Verify device_rebuild feature flags.
#
# Strategy:
# 1. Create a pool with all features disabled.
# 2. Verify 'zpool replace -s' fails and the feature is disabled.
# 3. Enable the device_rebuild feature.
# 4. Verify 'zpool replace -s' works and the feature is active.
# 5. Wait for the feature to return to enabled.
#
#
# Exit handler: put SCAN_SUSPEND_PROGRESS back to the value recorded in
# ORIG_SCAN_SUSPEND_PROGRESS, destroy the test pool and delete the file
# vdevs.
#
function cleanup
{
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS

	destroy_pool $TESTPOOL1
	rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
#
# Assert that a pool property has the expected value.
#
# $1 - property name, e.g. "feature@device_rebuild"
# $2 - pool name
# $3 - expected value
#
# The property is queried by name so only that exact property is
# considered (no regular expression matching against the full property
# list).  Logs a note on success and calls log_fail on a mismatch.
#
function check_feature_flag
{
	typeset feature=$1
	typeset pool=$2
	typeset expected_value=$3
	typeset value

	value="$(zpool get -H -o value "$feature" "$pool")"
	if [ "$value" = "$expected_value" ]; then
		log_note "$feature verified to be $value"
	else
		log_fail "$feature should be $expected_value but is $value"
	fi
}
log_assert "Verify device_rebuild feature flags."
# Remember the current tunable value so cleanup can restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
log_onexit cleanup
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
# Steps 1-2: pool created with -d (features disabled); a sequential
# replace must be refused and the feature must still read "disabled".
log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]}
log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled"
# Steps 3-4: suspend scan progress, enable the feature and start a
# sequential replace; the feature is expected to read "active" while the
# replace is held in progress.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
log_must zpool set feature@device_rebuild=enabled $TESTPOOL1
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active"
# Step 5: resume progress, wait for the resilver to finish and verify the
# feature has returned to "enabled".
log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
log_must zpool wait -t resilver $TESTPOOL1
check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled"
log_pass "Verify device_rebuild feature flags."

View File

@ -0,0 +1,126 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
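# Sequential reconstruction (unlike healing reconstruction) operates on the
# top-level vdev.  This means that a sequential resilver operation can be
# started/stopped on one top-level vdev without impacting sequential
# resilvers running on other top-level vdevs.
#
# STRATEGY:
# 1. Create a pool with two mirrored top-level vdevs.
# 2. With scan progress suspended, replace one device in each mirror using
#    'zpool replace -s' and verify that two rebuilds were started.
# 3. Resume progress and verify that both rebuilds complete.
# 4. Recreate the pool, start both rebuilds again, detach one of the new
#    devices and verify that only that rebuild is canceled while the other
#    one completes.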
#
#
# Exit handler: put SCAN_SUSPEND_PROGRESS back to the value recorded in
# ORIG_SCAN_SUSPEND_PROGRESS, destroy the test pool and delete the file
# vdevs, including both spare files.
#
function cleanup
{
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS

	destroy_pool $TESTPOOL1
	rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2
}
#
# Count the 'zpool history -i' lines which mention "rebuild" and contain
# the given message, and fail the test when that count differs from the
# expected one.
#
# $1 - pool name
# $2 - message to look for
# $3 - expected number of matching lines
#
function check_history
{
	pool=$1
	msg=$2
	exp=$3

	count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg")

	if [[ "$count" -eq "$exp" ]]; then
		log_note "Found $count/$exp rebuild '$msg' messages"
	else
		log_fail "Expected $exp rebuild '$msg' messages, found $count"
	fi
}
log_assert "Rebuilds operate on the top-level vdevs"
# Remember the current tunable value so cleanup can restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
log_onexit cleanup
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \
$SPARE_VDEV_FILE $SPARE_VDEV_FILE2
# Verify two sequential resilvers can run concurrently.
log_must zpool create -f $TESTPOOL1 \
mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
# Write 32M of random data so there is something to rebuild.
log_must zfs create $TESTPOOL1/$TESTFS
mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
log_must zpool sync $TESTPOOL1
# Suspend progress, then start one sequential replace in each mirror.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
# Both rebuilds must have started and none reset, completed or canceled.
check_history $TESTPOOL1 "started" 2
check_history $TESTPOOL1 "reset" 0
check_history $TESTPOOL1 "complete" 0
check_history $TESTPOOL1 "canceled" 0
# Resume progress and expect both rebuilds to complete.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
log_must zpool wait -t resilver $TESTPOOL1
check_history $TESTPOOL1 "complete" 2
destroy_pool $TESTPOOL1
# Verify canceling one resilver (zpool detach) does not impact others.
log_must zpool create -f $TESTPOOL1 \
mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
log_must zfs create $TESTPOOL1/$TESTFS
mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
log_must zpool sync $TESTPOOL1
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
check_history $TESTPOOL1 "started" 2
check_history $TESTPOOL1 "reset" 0
check_history $TESTPOOL1 "complete" 0
check_history $TESTPOOL1 "canceled" 0
# Detach the second replacement device while its rebuild is suspended;
# exactly one "canceled" entry is expected and still no "complete".
log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2
check_history $TESTPOOL1 "complete" 0
check_history $TESTPOOL1 "canceled" 1
# Resume progress; the remaining rebuild is expected to complete.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
log_must zpool wait -t resilver $TESTPOOL1
check_history $TESTPOOL1 "complete" 1
check_history $TESTPOOL1 "canceled" 1
destroy_pool $TESTPOOL1
log_pass "Rebuilds operate on the top-level vdevs"

View File

@ -0,0 +1,70 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Executing 'zpool replace -s' for raidz vdevs must fail.  Sequential
# resilvers are only allowed for stripe/mirror pools.
#
# STRATEGY:
# 1. Create a raidz pool, verify 'zpool replace -s' fails
# 2. Create a stripe/mirror pool, verify 'zpool replace -s' passes
#
#
# Exit handler: put SCAN_SUSPEND_PROGRESS back to the value recorded in
# ORIG_SCAN_SUSPEND_PROGRESS, destroy the test pool and delete the file
# vdevs.
#
function cleanup
{
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS

	destroy_pool $TESTPOOL1
	rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
log_assert "Sequential resilver is not allowed for raidz vdevs"
# Remember the current tunable value so cleanup can restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
log_onexit cleanup
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
# raidz[1-3]: a sequential replace is expected to be rejected.
for vdev_type in "raidz" "raidz2" "raidz3"; do
log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]}
log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \
$SPARE_VDEV_FILE
destroy_pool $TESTPOOL1
done
# stripe: a sequential replace is expected to succeed.
log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]}
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
destroy_pool $TESTPOOL1
# mirror: a sequential replace is expected to succeed.
log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]}
log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
destroy_pool $TESTPOOL1
log_pass "Sequential resilver is not allowed for raidz vdevs"

View File

@ -0,0 +1,67 @@
#!/bin/ksh
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# Description:
# Verify that on import an in progress replace operation is resumed.
#
# Strategy:
# 1. For both healing and sequential resilvering replace:
# a. Create a pool
# b. Replace a vdev with 'zpool replace' (optionally with -s) to resilver it.
# c. Export the pool
# d. Import the pool
# e. Verify the 'zpool replace' resumed resilvering.
# f. Destroy the pool
#
#
# Exit handler: put SCAN_SUSPEND_PROGRESS back to the value recorded in
# ORIG_SCAN_SUSPEND_PROGRESS, destroy the test pool and delete the file
# vdevs.
#
function cleanup
{
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS

	destroy_pool $TESTPOOL1
	rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
log_assert "Verify replace is resumed on import"

# Remember the current tunable value so cleanup can restore it.
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)

log_onexit cleanup

log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE

#
# Verify healing and sequential resilver resume on import.  An empty $arg
# requests a healing resilver, "-s" a sequential one; $arg is expanded
# unquoted on purpose so the empty value adds no argument.
#
for arg in "" "-s"; do
	log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}

	# Hold the resilver so it is still in progress across export/import.
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
	log_must zpool replace $arg $TESTPOOL1 ${VDEV_FILES[0]} \
	    $SPARE_VDEV_FILE
	log_must is_pool_resilvering $TESTPOOL1

	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
	log_must is_pool_resilvering $TESTPOOL1

	# Release the resilver and wait for it to finish.
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
	log_must zpool wait -t resilver $TESTPOOL1
	log_must is_pool_resilvered $TESTPOOL1

	destroy_pool $TESTPOOL1
done

log_pass "Verify replace is resumed on import"

View File

@ -0,0 +1,158 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
#
# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Replacing disks during I/O should pass for supported pools.
#
# STRATEGY:
# 1. Create multidisk pools (stripe/mirror) and
# start some random I/O
# 2. Replace a disk in the pool with another disk.
# 3. Verify the integrity of the file system and the rebuilding.
#
# NOTE: Raidz does not support the sequential resilver (-s) option.
#
verify_runnable "global"
#
# Exit handler: kill any background writers still recorded in
# $child_pids, destroy the test pool if it exists and empty $TESTDIR.
#
function cleanup
{
	# With an empty $child_pids this loop body never runs.
	for wait_pid in $child_pids; do
		kill $wait_pid
	done

	poolexists $TESTPOOL1 && destroy_pool $TESTPOOL1

	[[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
}
# The option exercised by this test is 'zpool replace -s'.
log_assert "Replacing a disk with -s during I/O completes."

options=""
options_display="default options"

log_onexit cleanup

# Build the file_trunc argument list from the optional HOLES_* settings.
[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "

[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "

[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "

[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "

[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "

# file_trunc is always invoked with -r.
options="$options -r "

[[ -n "$options" ]] && options_display=$options

# PIDs of the background file_trunc processes started by replace_test.
child_pids=""
#
# Replace a disk in $TESTPOOL1 while background I/O is running, then
# check the pool and file system.
#
# $1 - additional option(s) handed to 'zpool replace' (may be empty)
# $2 - disk to be replaced
# $3 - replacement disk
#
# Starts file_trunc writers in the background, runs 'zpool replace -sw',
# kills the writers, cycles the pool through export/import and finally
# runs zdb and verify_pool.
#
function replace_test
{
	typeset opt=$1
	typeset disk1=$2
	typeset disk2=$3

	typeset -i iters=2
	typeset -i index=0
	typeset i=0

	# Launch the background writers, one second apart.
	while (( i < iters )); do
		log_note "Invoking file_trunc with: $options_display"
		file_trunc $options $TESTDIR/$TESTFILE.$i &
		typeset pid=$!

		sleep 1

		child_pids="$child_pids $pid"
		(( i += 1 ))
	done

	log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2

	# Stop the writers and forget their PIDs.
	for wait_pid in $child_pids; do
		kill $wait_pid
	done
	child_pids=""

	log_must zpool export $TESTPOOL1
	log_must zpool import -d $TESTDIR $TESTPOOL1

	# zdb is run against the unmounted file system.
	log_must zfs umount $TESTPOOL1/$TESTFS1
	log_must zdb -cdui $TESTPOOL1/$TESTFS1
	log_must zfs mount $TESTPOOL1/$TESTFS1

	verify_pool $TESTPOOL1
}
# Create the file vdevs backing the test pool.
specials_list=""
i=0
while [[ $i != 3 ]]; do
	log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
	specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"

	((i = i + 1))
done

#
# Create a replacement disk special file.
#
log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE

#
# For stripe ("") and mirror pools, replace a disk without and with the
# force flag.  The loop variable is forwarded to replace_test so that the
# "-f" iteration really exercises a forced replace.
#
for type in "" "mirror"; do
	for op in "" "-f"; do
		create_pool $TESTPOOL1 $type $specials_list
		log_must zfs create $TESTPOOL1/$TESTFS1
		log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1

		replace_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE

		# The replacement must be listed as a member of the pool.
		zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
		if [[ $? -ne 0 ]]; then
			log_fail "$REPLACEFILE is not present."
		fi

		destroy_pool $TESTPOOL1
		log_must rm -rf /$TESTPOOL1
	done
done

log_pass

View File

@ -104,9 +104,7 @@ function replace_test
((i = i + 1))
done
log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2
sleep 10
log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2
for wait_pid in $child_pids
do
@ -119,11 +117,12 @@ function replace_test
log_must zfs umount $TESTPOOL1/$TESTFS1
log_must zdb -cdui $TESTPOOL1/$TESTFS1
log_must zfs mount $TESTPOOL1/$TESTFS1
verify_pool $TESTPOOL1
}
specials_list=""
i=0
while [[ $i != 2 ]]; do
while [[ $i != 3 ]]; do
log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
@ -143,7 +142,7 @@ for type in "" "raidz" "mirror"; do
replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
if [[ $? -ne 0 ]]; then
log_fail "$REPLACEFILE is not present."
fi

View File

@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""}
export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""}
export HOLES_COUNT=${HOLES_COUNT-"16384"} # FILESIZE/BLKSIZE/8
export REPLACEFILE="sparedisk"
set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))
export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2

View File

@ -20,7 +20,7 @@
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/resilver/resilver.cfg
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
@ -50,7 +50,7 @@ function cleanup
$ORIG_SCAN_SUSPEND_PROGRESS
log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX
log_must zinject -c all
destroy_pool $TESTPOOL
destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
@ -70,7 +70,7 @@ function verify_restarts # <msg> <cnt> <defer>
[[ -z "$defer" ]] && return
# use zdb to find which vdevs have the resilver defer flag
VDEV_DEFERS=$(zdb -C $TESTPOOL | awk '
VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk '
/children/ { gsub(/[^0-9]/, ""); child = $0 }
/com\.datto:resilver_defer$/ { print child }
')
@ -106,17 +106,17 @@ log_must set_tunable32 ZEVENT_LEN_MAX 512
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \
log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \
raidz ${VDEV_FILES[@]}
# create 4 filesystems
for fs in fs{0..3}
do
log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs
log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs
done
# simultaneously write 16M to each of them
set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0
set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0
log_note "Writing data files"
for path in ${DATAPATHS[@]}
do
@ -131,7 +131,7 @@ do
if [[ $test == "with" ]]
then
log_must zpool set feature@resilver_defer=enabled $TESTPOOL
log_must zpool set feature@resilver_defer=enabled $TESTPOOL1
RESTARTS=( "${DEFER_RESTARTS[@]}" )
VDEVS=( "${DEFER_VDEVS[@]}" )
VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}"
@ -144,7 +144,7 @@ do
log_must set_tunable32 RESILVER_MIN_TIME_MS 50
# initiate a resilver and suspend the scan as soon as possible
log_must zpool replace $TESTPOOL $VDEV_REPLACE
log_must zpool replace $TESTPOOL1 $VDEV_REPLACE
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
# there should only be 1 resilver start
@ -152,16 +152,16 @@ do
# offline then online a vdev to introduce a new DTL range after current
# scan, which should restart (or defer) the resilver
log_must zpool offline $TESTPOOL ${VDEV_FILES[2]}
log_must zpool sync $TESTPOOL
log_must zpool online $TESTPOOL ${VDEV_FILES[2]}
log_must zpool sync $TESTPOOL
log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]}
log_must zpool sync $TESTPOOL1
log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]}
log_must zpool sync $TESTPOOL1
# there should now be 2 resilver starts w/o defer, 1 with defer
verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}"
# inject read io errors on vdev and verify resilver does not restart
log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL
log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1
log_must cat ${DATAPATHS[1]} > /dev/null
log_must zinject -c all
@ -173,17 +173,12 @@ do
log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
# wait for resilver to finish
for iter in {0..59}
do
is_pool_resilvered $TESTPOOL && break
sleep 1
done
is_pool_resilvered $TESTPOOL ||
log_fail "resilver timed out"
log_must zpool wait -t resilver $TESTPOOL1
log_must is_pool_resilvered $TESTPOOL1
# wait for a few txg's to see if a resilver happens
log_must zpool sync $TESTPOOL
log_must zpool sync $TESTPOOL
log_must zpool sync $TESTPOOL1
log_must zpool sync $TESTPOOL1
# there should now be 2 resilver starts
verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}"

View File

@ -20,7 +20,7 @@
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/resilver/resilver.cfg
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
@ -40,7 +40,7 @@
function cleanup
{
log_must zinject -c all
destroy_pool $TESTPOOL
destroy_pool $TESTPOOL1
rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY
}
@ -56,25 +56,25 @@ log_must set_tunable32 SCAN_LEGACY 1
# create the pool and a 32M file (32k blocks)
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE
log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]}
log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1
log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]}
log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1
# determine objset/object
objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
object=$(ls -i /$TESTPOOL/file | awk '{print $1}')
objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
object=$(ls -i /$TESTPOOL1/file | awk '{print $1}')
# inject event to cause error during resilver
log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL
log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1
# clear events and start resilver
log_must zpool events -c
log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE
log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE
log_note "waiting for read errors to start showing up"
for iter in {0..59}
do
zpool sync $TESTPOOL
err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}')
zpool sync $TESTPOOL1
err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}')
(( $err > 0 )) && break
sleep 1
done
@ -92,8 +92,8 @@ done
(( $finish == 0 )) && log_fail "resilver took too long to finish"
# wait a few syncs to ensure that zfs does not restart the resilver
log_must zpool sync $TESTPOOL
log_must zpool sync $TESTPOOL
log_must zpool sync $TESTPOOL1
log_must zpool sync $TESTPOOL1
# check if resilver was restarted
start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l)

View File

@ -0,0 +1,112 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# This file and its contents are supplied under the terms of the
# Common Development and Distribution License ("CDDL"), version 1.0.
# You may only use this file in accordance with the terms of version
# 1.0 of the CDDL.
#
# A full copy of the text of the CDDL should have accompanied this
# source. A copy of the CDDL is also available via the Internet at
# http://www.illumos.org/license/CDDL.
#
# CDDL HEADER END
#
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/replacement/replacement.cfg
#
# DESCRIPTION:
# Verify scrub behaves as intended when contending with a healing or
# sequential resilver.
#
# STRATEGY:
# 1. Create a pool
# 2. Add a modest amount of data to the pool.
# 3. For healing and sequential resilver:
# a. Start scrubbing.
# b. Verify a resilver can be started and it cancels the scrub.
# c. Verify a scrub cannot be started when resilvering
#
#
# Exit handler: put RESILVER_MIN_TIME_MS and SCAN_SUSPEND_PROGRESS back to
# the values recorded in the ORIG_* variables, destroy the test pool and
# delete the file vdevs.
#
function cleanup
{
	log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME
	log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS

	destroy_pool $TESTPOOL1
	rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
}
log_assert "Scrub was cancelled by resilver"
# Remember the current tunable values so cleanup can restore them.
ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS)
ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
log_onexit cleanup
log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
# Create the pool and write 64M of random data to it.
log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}
log_must zfs create $TESTPOOL1/$TESTFS
mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64
log_must zpool sync $TESTPOOL1
# Request a healing or sequential resilver
for replace_mode in "healing" "sequential"; do
#
# Healing resilvers abort the dsl_scan and reconfigure it for
# resilvering. Sequential resilvers cancel the dsl_scan and start
# the vdev_rebuild thread.
#
# NOTE(review): history_msg is assigned here but never read anywhere in
# this script; presumably a pool history check was intended -- confirm.
#
if [[ "$replace_mode" = "healing" ]]; then
history_msg="scan aborted, restarting"
flags=""
else
history_msg="scan cancelled"
flags="-s"
fi
# Limit scanning time and suspend the scan as soon as possible.
log_must set_tunable32 RESILVER_MIN_TIME_MS 50
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
# Initiate a scrub.
log_must zpool scrub $TESTPOOL1
# Initiate a resilver to cancel the scrub.
log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \
$SPARE_VDEV_FILE
# Verify the scrub was canceled, it may take a few seconds to exit.
# NOTE(review): this loop has no upper bound on the number of retries.
while is_pool_scrubbing $TESTPOOL1; do
sleep 1
done
log_mustnot is_pool_scrubbing $TESTPOOL1
# Verify a scrub cannot be started while resilvering.
log_must is_pool_resilvering $TESTPOOL1
log_mustnot zpool scrub $TESTPOOL1
# Unsuspend resilver.
# The tunables are set to the literals 0 and 3000 here, not to the saved
# ORIG_* values; cleanup restores the saved values on exit.
log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
# Wait for resilver to finish then put the original back.
log_must zpool wait $TESTPOOL1
log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \
${VDEV_FILES[1]}
done
log_pass "Scrub was cancelled by resilver"

View File

@ -1,9 +0,0 @@
pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver
dist_pkgdata_SCRIPTS = \
setup.ksh \
cleanup.ksh \
resilver_restart_001.ksh \
resilver_restart_002.ksh
dist_pkgdata_DATA = \
resilver.cfg

View File

@ -1,31 +0,0 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/resilver/resilver.cfg
verify_runnable "global"
log_pass

View File

@ -1,32 +0,0 @@
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
verify_runnable "global"
set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))

View File

@ -1,31 +0,0 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
# Copyright (c) 2019, Datto Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/resilver/resilver.cfg
verify_runnable "global"
log_pass