/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2018, Intel Corporation.
 * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
 */

#ifndef _SYS_VDEV_REBUILD_H
#define	_SYS_VDEV_REBUILD_H

#include <sys/spa.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Number of entries in the physical vdev_rebuild_phys structure.  This
 * state is stored per top-level vdev as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS.
 */
#define	REBUILD_PHYS_ENTRIES	12
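
/*
 * Illustrative sketch (an addition, not upstream code): the rebuild state is
 * persisted in the top-level vdev's ZAP object as an array of
 * REBUILD_PHYS_ENTRIES uint64_t values.  Assuming the generic zap_lookup()
 * interface, and that vd->vdev_top_zap and the key macro named in the comment
 * above identify the entry, loading it might look like:
 *
 *	uint64_t vrp[REBUILD_PHYS_ENTRIES];
 *	int err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
 *	    VDEV_ZAP_TOP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
 *	    REBUILD_PHYS_ENTRIES, vrp);
 */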

/*
 * On-disk rebuild configuration and state.  When adding new fields they
 * must be added to the end of the structure.
 */
typedef struct vdev_rebuild_phys {
	uint64_t	vrp_rebuild_state;	/* vdev_rebuild_state_t */
	uint64_t	vrp_last_offset;	/* last rebuilt offset */
	uint64_t	vrp_min_txg;		/* minimum missing txg */
	uint64_t	vrp_max_txg;		/* maximum missing txg */
	uint64_t	vrp_start_time;		/* start time */
	uint64_t	vrp_end_time;		/* end time */
	uint64_t	vrp_scan_time_ms;	/* total run time in ms */
	uint64_t	vrp_bytes_scanned;	/* alloc bytes scanned */
	uint64_t	vrp_bytes_issued;	/* read bytes rebuilt */
	uint64_t	vrp_bytes_rebuilt;	/* rebuilt bytes */
	uint64_t	vrp_bytes_est;		/* total bytes to scan */
	uint64_t	vrp_errors;		/* errors during rebuild */
} vdev_rebuild_phys_t;
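
/*
 * Illustrative sketch (an addition, not upstream code): REBUILD_PHYS_ENTRIES
 * above must track the number of uint64_t fields in vdev_rebuild_phys_t.
 * Assuming a C11 toolchain, the relationship could be verified at compile
 * time as follows:
 *
 *	_Static_assert(sizeof (vdev_rebuild_phys_t) ==
 *	    REBUILD_PHYS_ENTRIES * sizeof (uint64_t),
 *	    "REBUILD_PHYS_ENTRIES out of sync with vdev_rebuild_phys_t");
 */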

/*
 * The vdev_rebuild_t describes the current state and how a top-level vdev
 * should be rebuilt.  The core elements are the top-level vdev, the metaslab
 * being rebuilt, the range tree containing the allocated extents, and the
 * on-disk state.
 */
typedef struct vdev_rebuild {
	vdev_t		*vr_top_vdev;		/* top-level vdev to rebuild */
	metaslab_t	*vr_scan_msp;		/* scanning disabled metaslab */
	range_tree_t	*vr_scan_tree;		/* scan ranges (in metaslab) */
	kmutex_t	vr_io_lock;		/* inflight IO lock */
	kcondvar_t	vr_io_cv;		/* inflight IO cv */

	/* In-core state and progress */
	uint64_t	vr_scan_offset[TXG_SIZE];
	uint64_t	vr_prev_scan_time_ms;	/* any previous scan time */
	uint64_t	vr_bytes_inflight_max;	/* maximum bytes inflight */
	uint64_t	vr_bytes_inflight;	/* current bytes inflight */

	/* Per-rebuild pass statistics for calculating bandwidth */
	uint64_t	vr_pass_start_time;
	uint64_t	vr_pass_bytes_scanned;
	uint64_t	vr_pass_bytes_issued;
	uint64_t	vr_pass_bytes_skipped;

	/* On-disk state updated by vdev_rebuild_zap_update_sync() */
	vdev_rebuild_phys_t	vr_rebuild_phys;
} vdev_rebuild_t;
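
/*
 * Illustrative sketch (an addition, not upstream code): the per-pass counters
 * above allow the current rebuild bandwidth to be derived, e.g. by dividing
 * the bytes issued during this pass by the elapsed pass time.  Assuming the
 * caller's "now" and vr_pass_start_time are expressed in the same time unit:
 *
 *	static inline uint64_t
 *	vr_pass_rate(const vdev_rebuild_t *vr, uint64_t now)
 *	{
 *		uint64_t elapsed = now - vr->vr_pass_start_time;
 *		return (elapsed != 0 ? vr->vr_pass_bytes_issued / elapsed : 0);
 *	}
 */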

boolean_t vdev_rebuild_active(vdev_t *);

int vdev_rebuild_load(vdev_t *);
void vdev_rebuild(vdev_t *);
void vdev_rebuild_stop_wait(vdev_t *);
void vdev_rebuild_stop_all(spa_t *);
void vdev_rebuild_restart(spa_t *);
void vdev_rebuild_clear_sync(void *, dmu_tx_t *);
int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *);
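
/*
 * Illustrative usage sketch (an assumption, not upstream documentation): a
 * caller reporting rebuild progress for a top-level vdev "tvd" might combine
 * the activity check with the stats query along these lines (report_progress
 * is a hypothetical consumer):
 *
 *	vdev_rebuild_stat_t vrs;
 *
 *	if (vdev_rebuild_active(tvd) &&
 *	    vdev_rebuild_get_stats(tvd, &vrs) == 0)
 *		report_progress(&vrs);
 */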

#ifdef __cplusplus
}
#endif

#endif /* _SYS_VDEV_REBUILD_H */