OpenZFS 9486 - reduce memory used by device removal on fragmented pools
Device removal allocates a new location for each allocated segment on the disk that's being removed. Each allocation results in one entry in the mapping table, which maps from old location + length to new location. When a fragmented disk is removed, this can result in a large number of mapping entries, and thus a large amount of memory consumed by the mapping table. In the worst real-world cases, we've seen around 1GB of RAM per 1TB of storage removed. We can improve on this situation by allocating larger segments, which span across both allocated and free regions of the device being removed. By including free regions in the allocation (and thus mapping), we reduce the number of mapping entries. For example, if we have a 4K allocation followed by 1K free and then 4K allocated, we would allocate 4+1+4 = 9K, and then move the entire region (including allocated and free parts). In this case we used one mapping where previously we would have used two, but often the ratio is much higher (up to 20:1 in real-world use). We then need to mark the regions that were free on the removing device as free in the new locations, and also as obsolete in the mapping entry. This method preserves the fragmentation of the removing device, rather than consolidating its allocated space into a small number of chunks where possible. But it results in a drastic reduction of memory used by the mapping table - around 20x in the most-fragmented cases. In the most-fragmented real-world cases, this reduces memory used by the mapping from ~1GB to ~50MB of RAM per 1TB of storage removed. Less fragmented cases will typically also see around 50-100MB of RAM per 1TB of storage.
Porting notes: * Add the following as module parameters: * zfs_condense_indirect_vdevs_enable * zfs_condense_max_obsolete_bytes * Document the following module parameters: * zfs_condense_indirect_vdevs_enable * zfs_condense_max_obsolete_bytes * zfs_condense_min_mapping_bytes Authored by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Ported-by: Tim Chase <tim@chase2k.com> Signed-off-by: Tim Chase <tim@chase2k.com> OpenZFS-issue: https://illumos.org/issues/9486 OpenZFS-commit: https://github.com/ahrens/illumos/commit/07152e142e44c External-issue: DLPX-57962 Closes #7536
This commit is contained in:
parent
ba863d0be4
commit
0dc2f70c5c
|
@ -93,9 +93,13 @@ range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
|
||||||
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
|
void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
|
||||||
uint64_t newstart, uint64_t newsize);
|
uint64_t newstart, uint64_t newsize);
|
||||||
uint64_t range_tree_space(range_tree_t *rt);
|
uint64_t range_tree_space(range_tree_t *rt);
|
||||||
|
boolean_t range_tree_is_empty(range_tree_t *rt);
|
||||||
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
|
void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
|
||||||
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
|
void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
|
||||||
void range_tree_stat_verify(range_tree_t *rt);
|
void range_tree_stat_verify(range_tree_t *rt);
|
||||||
|
uint64_t range_tree_min(range_tree_t *rt);
|
||||||
|
uint64_t range_tree_max(range_tree_t *rt);
|
||||||
|
uint64_t range_tree_span(range_tree_t *rt);
|
||||||
|
|
||||||
void range_tree_add(void *arg, uint64_t start, uint64_t size);
|
void range_tree_add(void *arg, uint64_t start, uint64_t size);
|
||||||
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
|
void range_tree_remove(void *arg, uint64_t start, uint64_t size);
|
||||||
|
|
|
@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
|
||||||
extern int spa_vdev_remove_cancel(spa_t *);
|
extern int spa_vdev_remove_cancel(spa_t *);
|
||||||
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
|
extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
|
||||||
|
|
||||||
|
extern int vdev_removal_max_span;
|
||||||
|
extern int zfs_remove_max_segment;
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -425,6 +425,24 @@ create) will return ENOSPC.
|
||||||
Default value: \fB5\fR.
|
Default value: \fB5\fR.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBvdev_removal_max_span\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
During top-level vdev removal, chunks of data are copied from the vdev
|
||||||
|
and those chunks may include free space in order to trade bandwidth for IOPS.
|
||||||
|
This parameter determines the maximum span of free space (in bytes)
|
||||||
|
which will be included as "unnecessary" data in a chunk of copied data.
|
||||||
|
|
||||||
|
The default value here was chosen to align with
|
||||||
|
\fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing
|
||||||
|
regular reads (but there's no reason it has to be the same).
|
||||||
|
.sp
|
||||||
|
Default value: \fB32,768\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
|
@ -868,6 +886,47 @@ transaction record (itx).
|
||||||
Default value: \fB5\fR%.
|
Default value: \fB5\fR%.
|
||||||
.RE
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_condense_indirect_vdevs_enable\fR (int)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
Enable condensing indirect vdev mappings. When set to a non-zero value,
|
||||||
|
attempt to condense indirect vdev mappings if the mapping uses more than
|
||||||
|
\fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete
|
||||||
|
space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR
|
||||||
|
bytes on-disk. The condensing process is an attempt to save memory by
|
||||||
|
removing obsolete mappings.
|
||||||
|
.sp
|
||||||
|
Default value: \fB1\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_condense_max_obsolete_bytes\fR (ulong)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
Only attempt to condense indirect vdev mappings if the on-disk size
|
||||||
|
of the obsolete space map object is greater than this number of bytes
|
||||||
|
(see \fBzfs_condense_indirect_vdevs_enable\fR).
|
||||||
|
.sp
|
||||||
|
Default value: \fB1,073,741,824\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
|
.sp
|
||||||
|
.ne 2
|
||||||
|
.na
|
||||||
|
\fBzfs_condense_min_mapping_bytes\fR (ulong)
|
||||||
|
.ad
|
||||||
|
.RS 12n
|
||||||
|
Minimum size vdev mapping to attempt to condense (see
|
||||||
|
\fBzfs_condense_indirect_vdevs_enable\fR).
|
||||||
|
.sp
|
||||||
|
Default value: \fB131,072\fR.
|
||||||
|
.RE
|
||||||
|
|
||||||
.sp
|
.sp
|
||||||
.ne 2
|
.ne 2
|
||||||
.na
|
.na
|
||||||
|
|
|
@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
|
||||||
static range_seg_t *
|
static range_seg_t *
|
||||||
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||||
{
|
{
|
||||||
avl_index_t where;
|
|
||||||
range_seg_t rsearch;
|
range_seg_t rsearch;
|
||||||
uint64_t end = start + size;
|
uint64_t end = start + size;
|
||||||
|
|
||||||
|
@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
|
||||||
|
|
||||||
rsearch.rs_start = start;
|
rsearch.rs_start = start;
|
||||||
rsearch.rs_end = end;
|
rsearch.rs_end = end;
|
||||||
return (avl_find(&rt->rt_root, &rsearch, &where));
|
return (avl_find(&rt->rt_root, &rsearch, NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
range_seg_t *
|
range_seg_t *
|
||||||
|
@ -599,6 +598,13 @@ range_tree_space(range_tree_t *rt)
|
||||||
return (rt->rt_space);
|
return (rt->rt_space);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
boolean_t
|
||||||
|
range_tree_is_empty(range_tree_t *rt)
|
||||||
|
{
|
||||||
|
ASSERT(rt != NULL);
|
||||||
|
return (range_tree_space(rt) == 0);
|
||||||
|
}
|
||||||
|
|
||||||
/* Generic range tree functions for maintaining segments in an AVL tree. */
|
/* Generic range tree functions for maintaining segments in an AVL tree. */
|
||||||
void
|
void
|
||||||
rt_avl_create(range_tree_t *rt, void *arg)
|
rt_avl_create(range_tree_t *rt, void *arg)
|
||||||
|
@ -643,3 +649,23 @@ rt_avl_vacate(range_tree_t *rt, void *arg)
|
||||||
*/
|
*/
|
||||||
rt_avl_create(rt, arg);
|
rt_avl_create(rt, arg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
uint64_t
|
||||||
|
range_tree_min(range_tree_t *rt)
|
||||||
|
{
|
||||||
|
range_seg_t *rs = avl_first(&rt->rt_root);
|
||||||
|
return (rs != NULL ? rs->rs_start : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t
|
||||||
|
range_tree_max(range_tree_t *rt)
|
||||||
|
{
|
||||||
|
range_seg_t *rs = avl_last(&rt->rt_root);
|
||||||
|
return (rs != NULL ? rs->rs_end : 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t
|
||||||
|
range_tree_span(range_tree_t *rt)
|
||||||
|
{
|
||||||
|
return (range_tree_max(rt) - range_tree_min(rt));
|
||||||
|
}
|
||||||
|
|
|
@ -171,7 +171,7 @@
|
||||||
* object.
|
* object.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
boolean_t zfs_condense_indirect_vdevs_enable = B_TRUE;
|
int zfs_condense_indirect_vdevs_enable = B_TRUE;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Condense if at least this percent of the bytes in the mapping is
|
* Condense if at least this percent of the bytes in the mapping is
|
||||||
|
@ -188,7 +188,7 @@ int zfs_indirect_condense_obsolete_pct = 25;
|
||||||
* consumed by the obsolete space map; the default of 1GB is small enough
|
* consumed by the obsolete space map; the default of 1GB is small enough
|
||||||
* that we typically don't mind "wasting" it.
|
* that we typically don't mind "wasting" it.
|
||||||
*/
|
*/
|
||||||
uint64_t zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
|
unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Don't bother condensing if the mapping uses less than this amount of
|
* Don't bother condensing if the mapping uses less than this amount of
|
||||||
|
@ -1700,11 +1700,20 @@ EXPORT_SYMBOL(vdev_indirect_sync_obsolete);
|
||||||
EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
|
EXPORT_SYMBOL(vdev_obsolete_counts_are_precise);
|
||||||
EXPORT_SYMBOL(vdev_obsolete_sm_object);
|
EXPORT_SYMBOL(vdev_obsolete_sm_object);
|
||||||
|
|
||||||
|
module_param(zfs_condense_indirect_vdevs_enable, int, 0644);
|
||||||
|
MODULE_PARM_DESC(zfs_condense_indirect_vdevs_enable,
|
||||||
|
"Whether to attempt condensing indirect vdev mappings");
|
||||||
|
|
||||||
/* CSTYLED */
|
/* CSTYLED */
|
||||||
module_param(zfs_condense_min_mapping_bytes, ulong, 0644);
|
module_param(zfs_condense_min_mapping_bytes, ulong, 0644);
|
||||||
MODULE_PARM_DESC(zfs_condense_min_mapping_bytes,
|
MODULE_PARM_DESC(zfs_condense_min_mapping_bytes,
|
||||||
"Minimum size of vdev mapping to condense");
|
"Minimum size of vdev mapping to condense");
|
||||||
|
|
||||||
|
/* CSTYLED */
|
||||||
|
module_param(zfs_condense_max_obsolete_bytes, ulong, 0644);
|
||||||
|
MODULE_PARM_DESC(zfs_condense_max_obsolete_bytes,
|
||||||
|
"Minimum size obsolete spacemap to attempt condensing");
|
||||||
|
|
||||||
module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
|
module_param(zfs_condense_indirect_commit_entry_delay_ms, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
|
MODULE_PARM_DESC(zfs_condense_indirect_commit_entry_delay_ms,
|
||||||
"Delay while condensing vdev mapping");
|
"Delay while condensing vdev mapping");
|
||||||
|
|
|
@ -33,15 +33,15 @@
|
||||||
* 1. Uniquely identify this device as part of a ZFS pool and confirm its
|
* 1. Uniquely identify this device as part of a ZFS pool and confirm its
|
||||||
* identity within the pool.
|
* identity within the pool.
|
||||||
*
|
*
|
||||||
* 2. Verify that all the devices given in a configuration are present
|
* 2. Verify that all the devices given in a configuration are present
|
||||||
* within the pool.
|
* within the pool.
|
||||||
*
|
*
|
||||||
* 3. Determine the uberblock for the pool.
|
* 3. Determine the uberblock for the pool.
|
||||||
*
|
*
|
||||||
* 4. In case of an import operation, determine the configuration of the
|
* 4. In case of an import operation, determine the configuration of the
|
||||||
* toplevel vdev of which it is a part.
|
* toplevel vdev of which it is a part.
|
||||||
*
|
*
|
||||||
* 5. If an import operation cannot find all the devices in the pool,
|
* 5. If an import operation cannot find all the devices in the pool,
|
||||||
* provide enough information to the administrator to determine which
|
* provide enough information to the administrator to determine which
|
||||||
* devices are missing.
|
* devices are missing.
|
||||||
*
|
*
|
||||||
|
@ -77,9 +77,9 @@
|
||||||
* In order to identify which labels are valid, the labels are written in the
|
* In order to identify which labels are valid, the labels are written in the
|
||||||
* following manner:
|
* following manner:
|
||||||
*
|
*
|
||||||
* 1. For each vdev, update 'L1' to the new label
|
* 1. For each vdev, update 'L1' to the new label
|
||||||
* 2. Update the uberblock
|
* 2. Update the uberblock
|
||||||
* 3. For each vdev, update 'L2' to the new label
|
* 3. For each vdev, update 'L2' to the new label
|
||||||
*
|
*
|
||||||
* Given arbitrary failure, we can determine the correct label to use based on
|
* Given arbitrary failure, we can determine the correct label to use based on
|
||||||
* the transaction group. If we fail after updating L1 but before updating the
|
* the transaction group. If we fail after updating L1 but before updating the
|
||||||
|
@ -117,19 +117,19 @@
|
||||||
*
|
*
|
||||||
* The nvlist describing the pool and vdev contains the following elements:
|
* The nvlist describing the pool and vdev contains the following elements:
|
||||||
*
|
*
|
||||||
* version ZFS on-disk version
|
* version ZFS on-disk version
|
||||||
* name Pool name
|
* name Pool name
|
||||||
* state Pool state
|
* state Pool state
|
||||||
* txg Transaction group in which this label was written
|
* txg Transaction group in which this label was written
|
||||||
* pool_guid Unique identifier for this pool
|
* pool_guid Unique identifier for this pool
|
||||||
* vdev_tree An nvlist describing vdev tree.
|
* vdev_tree An nvlist describing vdev tree.
|
||||||
* features_for_read
|
* features_for_read
|
||||||
* An nvlist of the features necessary for reading the MOS.
|
* An nvlist of the features necessary for reading the MOS.
|
||||||
*
|
*
|
||||||
* Each leaf device label also contains the following:
|
* Each leaf device label also contains the following:
|
||||||
*
|
*
|
||||||
* top_guid Unique ID for top-level vdev in which this is contained
|
* top_guid Unique ID for top-level vdev in which this is contained
|
||||||
* guid Unique ID for the leaf vdev
|
* guid Unique ID for the leaf vdev
|
||||||
*
|
*
|
||||||
* The 'vs' configuration follows the format described in 'spa_config.c'.
|
* The 'vs' configuration follows the format described in 'spa_config.c'.
|
||||||
*/
|
*/
|
||||||
|
@ -515,22 +515,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
|
||||||
* histograms.
|
* histograms.
|
||||||
*/
|
*/
|
||||||
uint64_t seg_count = 0;
|
uint64_t seg_count = 0;
|
||||||
|
uint64_t to_alloc = vd->vdev_stat.vs_alloc;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* There are the same number of allocated segments
|
* There are the same number of allocated segments
|
||||||
* as free segments, so we will have at least one
|
* as free segments, so we will have at least one
|
||||||
* entry per free segment.
|
* entry per free segment. However, small free
|
||||||
|
* segments (smaller than vdev_removal_max_span)
|
||||||
|
* will be combined with adjacent allocated segments
|
||||||
|
* as a single mapping.
|
||||||
*/
|
*/
|
||||||
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
|
for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
|
||||||
seg_count += vd->vdev_mg->mg_histogram[i];
|
if (1ULL << (i + 1) < vdev_removal_max_span) {
|
||||||
|
to_alloc +=
|
||||||
|
vd->vdev_mg->mg_histogram[i] <<
|
||||||
|
(i + 1);
|
||||||
|
} else {
|
||||||
|
seg_count +=
|
||||||
|
vd->vdev_mg->mg_histogram[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* The maximum length of a mapping is SPA_MAXBLOCKSIZE,
|
* The maximum length of a mapping is
|
||||||
* so we need at least one entry per SPA_MAXBLOCKSIZE
|
* zfs_remove_max_segment, so we need at least one entry
|
||||||
* of allocated data.
|
* per zfs_remove_max_segment of allocated data.
|
||||||
*/
|
*/
|
||||||
seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
|
seg_count += to_alloc / zfs_remove_max_segment;
|
||||||
|
|
||||||
fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
|
fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
|
||||||
seg_count *
|
seg_count *
|
||||||
|
|
|
@ -99,6 +99,24 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
|
||||||
*/
|
*/
|
||||||
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
|
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Allow a remap segment to span free chunks of at most this size. The main
|
||||||
|
* impact of a larger span is that we will read and write larger, more
|
||||||
|
* contiguous chunks, with more "unnecessary" data -- trading off bandwidth
|
||||||
|
* for iops. The value here was chosen to align with
|
||||||
|
* zfs_vdev_read_gap_limit, which is a similar concept when doing regular
|
||||||
|
* reads (but there's no reason it has to be the same).
|
||||||
|
*
|
||||||
|
* Additionally, a higher span will have the following relatively minor
|
||||||
|
* effects:
|
||||||
|
* - the mapping will be smaller, since one entry can cover more allocated
|
||||||
|
* segments
|
||||||
|
* - more of the fragmentation in the removing device will be preserved
|
||||||
|
* - we'll do larger allocations, which may fail and fall back on smaller
|
||||||
|
* allocations
|
||||||
|
*/
|
||||||
|
int vdev_removal_max_span = 32 * 1024;
|
||||||
|
|
||||||
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
|
#define VDEV_REMOVAL_ZAP_OBJS "lzap"
|
||||||
|
|
||||||
static void spa_vdev_remove_thread(void *arg);
|
static void spa_vdev_remove_thread(void *arg);
|
||||||
|
@ -710,13 +728,52 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
|
||||||
spa_sync_removing_state(spa, tx);
|
spa_sync_removing_state(spa, tx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
typedef struct vdev_copy_segment_arg {
|
||||||
|
spa_t *vcsa_spa;
|
||||||
|
dva_t *vcsa_dest_dva;
|
||||||
|
uint64_t vcsa_txg;
|
||||||
|
range_tree_t *vcsa_obsolete_segs;
|
||||||
|
} vdev_copy_segment_arg_t;
|
||||||
|
|
||||||
|
static void
|
||||||
|
unalloc_seg(void *arg, uint64_t start, uint64_t size)
|
||||||
|
{
|
||||||
|
vdev_copy_segment_arg_t *vcsa = arg;
|
||||||
|
spa_t *spa = vcsa->vcsa_spa;
|
||||||
|
blkptr_t bp = { { { {0} } } };
|
||||||
|
|
||||||
|
BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
|
||||||
|
BP_SET_LSIZE(&bp, size);
|
||||||
|
BP_SET_PSIZE(&bp, size);
|
||||||
|
BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
|
||||||
|
BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
|
||||||
|
BP_SET_TYPE(&bp, DMU_OT_NONE);
|
||||||
|
BP_SET_LEVEL(&bp, 0);
|
||||||
|
BP_SET_DEDUP(&bp, 0);
|
||||||
|
BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
|
||||||
|
|
||||||
|
DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
|
||||||
|
DVA_SET_OFFSET(&bp.blk_dva[0],
|
||||||
|
DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
|
||||||
|
DVA_SET_ASIZE(&bp.blk_dva[0], size);
|
||||||
|
|
||||||
|
zio_free(spa, vcsa->vcsa_txg, &bp);
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* All reads and writes associated with a call to spa_vdev_copy_segment()
|
* All reads and writes associated with a call to spa_vdev_copy_segment()
|
||||||
* are done.
|
* are done.
|
||||||
*/
|
*/
|
||||||
static void
|
static void
|
||||||
spa_vdev_copy_nullzio_done(zio_t *zio)
|
spa_vdev_copy_segment_done(zio_t *zio)
|
||||||
{
|
{
|
||||||
|
vdev_copy_segment_arg_t *vcsa = zio->io_private;
|
||||||
|
|
||||||
|
range_tree_vacate(vcsa->vcsa_obsolete_segs,
|
||||||
|
unalloc_seg, vcsa);
|
||||||
|
range_tree_destroy(vcsa->vcsa_obsolete_segs);
|
||||||
|
kmem_free(vcsa, sizeof (*vcsa));
|
||||||
|
|
||||||
spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
|
spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -833,7 +890,8 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
|
||||||
* read from the old location and write to the new location.
|
* read from the old location and write to the new location.
|
||||||
*/
|
*/
|
||||||
static int
|
static int
|
||||||
spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
|
spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
|
||||||
|
uint64_t maxalloc, uint64_t txg,
|
||||||
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
|
vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
|
||||||
{
|
{
|
||||||
metaslab_group_t *mg = vd->vdev_mg;
|
metaslab_group_t *mg = vd->vdev_mg;
|
||||||
|
@ -841,14 +899,70 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
|
||||||
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
|
spa_vdev_removal_t *svr = spa->spa_vdev_removal;
|
||||||
vdev_indirect_mapping_entry_t *entry;
|
vdev_indirect_mapping_entry_t *entry;
|
||||||
dva_t dst = {{ 0 }};
|
dva_t dst = {{ 0 }};
|
||||||
|
uint64_t start = range_tree_min(segs);
|
||||||
|
|
||||||
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
|
ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
|
||||||
|
|
||||||
|
uint64_t size = range_tree_span(segs);
|
||||||
|
if (range_tree_span(segs) > maxalloc) {
|
||||||
|
/*
|
||||||
|
* We can't allocate all the segments. Prefer to end
|
||||||
|
* the allocation at the end of a segment, thus avoiding
|
||||||
|
* additional split blocks.
|
||||||
|
*/
|
||||||
|
range_seg_t search;
|
||||||
|
avl_index_t where;
|
||||||
|
search.rs_start = start + maxalloc;
|
||||||
|
search.rs_end = search.rs_start;
|
||||||
|
range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
|
||||||
|
if (rs == NULL) {
|
||||||
|
rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
|
||||||
|
} else {
|
||||||
|
rs = AVL_PREV(&segs->rt_root, rs);
|
||||||
|
}
|
||||||
|
if (rs != NULL) {
|
||||||
|
size = rs->rs_end - start;
|
||||||
|
} else {
|
||||||
|
/*
|
||||||
|
* There are no segments that end before maxalloc.
|
||||||
|
* I.e. the first segment is larger than maxalloc,
|
||||||
|
* so we must split it.
|
||||||
|
*/
|
||||||
|
size = maxalloc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ASSERT3U(size, <=, maxalloc);
|
||||||
|
|
||||||
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
|
int error = metaslab_alloc_dva(spa, mg->mg_class, size,
|
||||||
&dst, 0, NULL, txg, 0, zal);
|
&dst, 0, NULL, txg, 0, zal);
|
||||||
if (error != 0)
|
if (error != 0)
|
||||||
return (error);
|
return (error);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Determine the ranges that are not actually needed. Offsets are
|
||||||
|
* relative to the start of the range to be copied (i.e. relative to the
|
||||||
|
* local variable "start").
|
||||||
|
*/
|
||||||
|
range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
|
||||||
|
|
||||||
|
range_seg_t *rs = avl_first(&segs->rt_root);
|
||||||
|
ASSERT3U(rs->rs_start, ==, start);
|
||||||
|
uint64_t prev_seg_end = rs->rs_end;
|
||||||
|
while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
|
||||||
|
if (rs->rs_start >= start + size) {
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
range_tree_add(obsolete_segs,
|
||||||
|
prev_seg_end - start,
|
||||||
|
rs->rs_start - prev_seg_end);
|
||||||
|
}
|
||||||
|
prev_seg_end = rs->rs_end;
|
||||||
|
}
|
||||||
|
/* We don't end in the middle of an obsolete range */
|
||||||
|
ASSERT3U(start + size, <=, prev_seg_end);
|
||||||
|
|
||||||
|
range_tree_clear(segs, start, size);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We can't have any padding of the allocated size, otherwise we will
|
* We can't have any padding of the allocated size, otherwise we will
|
||||||
* misunderstand what's allocated, and the size of the mapping.
|
* misunderstand what's allocated, and the size of the mapping.
|
||||||
|
@ -860,13 +974,22 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
|
||||||
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
|
entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
|
||||||
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
|
DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
|
||||||
entry->vime_mapping.vimep_dst = dst;
|
entry->vime_mapping.vimep_dst = dst;
|
||||||
|
if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
|
||||||
|
entry->vime_obsolete_count = range_tree_space(obsolete_segs);
|
||||||
|
}
|
||||||
|
|
||||||
|
vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
|
||||||
|
vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
|
||||||
|
vcsa->vcsa_obsolete_segs = obsolete_segs;
|
||||||
|
vcsa->vcsa_spa = spa;
|
||||||
|
vcsa->vcsa_txg = txg;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* See comment before spa_vdev_copy_one_child().
|
* See comment before spa_vdev_copy_one_child().
|
||||||
*/
|
*/
|
||||||
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
|
spa_config_enter(spa, SCL_STATE, spa, RW_READER);
|
||||||
zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
|
zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
|
||||||
spa_vdev_copy_nullzio_done, NULL, 0);
|
spa_vdev_copy_segment_done, vcsa, 0);
|
||||||
vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
|
vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
|
||||||
if (dest_vd->vdev_ops == &vdev_mirror_ops) {
|
if (dest_vd->vdev_ops == &vdev_mirror_ops) {
|
||||||
for (int i = 0; i < dest_vd->vdev_children; i++) {
|
for (int i = 0; i < dest_vd->vdev_children; i++) {
|
||||||
|
@ -1069,39 +1192,79 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
|
||||||
|
|
||||||
mutex_enter(&svr->svr_lock);
|
mutex_enter(&svr->svr_lock);
|
||||||
|
|
||||||
range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
|
/*
|
||||||
if (rs == NULL) {
|
* Determine how big of a chunk to copy. We can allocate up
|
||||||
|
* to max_alloc bytes, and we can span up to vdev_removal_max_span
|
||||||
|
* bytes of unallocated space at a time. "segs" will track the
|
||||||
|
* allocated segments that we are copying. We may also be copying
|
||||||
|
* free segments (of up to vdev_removal_max_span bytes).
|
||||||
|
*/
|
||||||
|
range_tree_t *segs = range_tree_create(NULL, NULL);
|
||||||
|
for (;;) {
|
||||||
|
range_seg_t *rs = range_tree_first(svr->svr_allocd_segs);
|
||||||
|
|
||||||
|
if (rs == NULL)
|
||||||
|
break;
|
||||||
|
|
||||||
|
uint64_t seg_length;
|
||||||
|
|
||||||
|
if (range_tree_is_empty(segs)) {
|
||||||
|
/* need to truncate the first seg based on max_alloc */
|
||||||
|
seg_length =
|
||||||
|
MIN(rs->rs_end - rs->rs_start, *max_alloc);
|
||||||
|
} else {
|
||||||
|
if (rs->rs_start - range_tree_max(segs) >
|
||||||
|
vdev_removal_max_span) {
|
||||||
|
/*
|
||||||
|
* Including this segment would cause us to
|
||||||
|
* copy a larger unneeded chunk than is allowed.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
} else if (rs->rs_end - range_tree_min(segs) >
|
||||||
|
*max_alloc) {
|
||||||
|
/*
|
||||||
|
* This additional segment would extend past
|
||||||
|
* max_alloc. Rather than splitting this
|
||||||
|
* segment, leave it for the next mapping.
|
||||||
|
*/
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
seg_length = rs->rs_end - rs->rs_start;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
range_tree_add(segs, rs->rs_start, seg_length);
|
||||||
|
range_tree_remove(svr->svr_allocd_segs,
|
||||||
|
rs->rs_start, seg_length);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (range_tree_is_empty(segs)) {
|
||||||
mutex_exit(&svr->svr_lock);
|
mutex_exit(&svr->svr_lock);
|
||||||
|
range_tree_destroy(segs);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
uint64_t offset = rs->rs_start;
|
|
||||||
uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
|
|
||||||
|
|
||||||
range_tree_remove(svr->svr_allocd_segs, offset, length);
|
|
||||||
|
|
||||||
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
|
if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
|
||||||
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
|
dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
|
||||||
svr, 0, ZFS_SPACE_CHECK_NONE, tx);
|
svr, 0, ZFS_SPACE_CHECK_NONE, tx);
|
||||||
}
|
}
|
||||||
|
|
||||||
svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
|
svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note: this is the amount of *allocated* space
|
* Note: this is the amount of *allocated* space
|
||||||
* that we are taking care of each txg.
|
* that we are taking care of each txg.
|
||||||
*/
|
*/
|
||||||
svr->svr_bytes_done[txg & TXG_MASK] += length;
|
svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
|
||||||
|
|
||||||
mutex_exit(&svr->svr_lock);
|
mutex_exit(&svr->svr_lock);
|
||||||
|
|
||||||
zio_alloc_list_t zal;
|
zio_alloc_list_t zal;
|
||||||
metaslab_trace_init(&zal);
|
metaslab_trace_init(&zal);
|
||||||
uint64_t thismax = *max_alloc;
|
uint64_t thismax = SPA_MAXBLOCKSIZE;
|
||||||
while (length > 0) {
|
while (!range_tree_is_empty(segs)) {
|
||||||
uint64_t mylen = MIN(length, thismax);
|
|
||||||
|
|
||||||
int error = spa_vdev_copy_segment(vd,
|
int error = spa_vdev_copy_segment(vd,
|
||||||
offset, mylen, txg, vca, &zal);
|
segs, thismax, txg, vca, &zal);
|
||||||
|
|
||||||
if (error == ENOSPC) {
|
if (error == ENOSPC) {
|
||||||
/*
|
/*
|
||||||
|
@ -1115,18 +1278,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
|
||||||
*/
|
*/
|
||||||
ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
|
ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
|
||||||
ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
|
ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
|
||||||
thismax = P2ROUNDUP(mylen / 2,
|
uint64_t attempted =
|
||||||
|
MIN(range_tree_span(segs), thismax);
|
||||||
|
thismax = P2ROUNDUP(attempted / 2,
|
||||||
1 << spa->spa_max_ashift);
|
1 << spa->spa_max_ashift);
|
||||||
ASSERT3U(thismax, <, mylen);
|
|
||||||
/*
|
/*
|
||||||
* The minimum-size allocation can not fail.
|
* The minimum-size allocation can not fail.
|
||||||
*/
|
*/
|
||||||
ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
|
ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
|
||||||
*max_alloc = mylen - (1 << spa->spa_max_ashift);
|
*max_alloc = attempted - (1 << spa->spa_max_ashift);
|
||||||
} else {
|
} else {
|
||||||
ASSERT0(error);
|
ASSERT0(error);
|
||||||
length -= mylen;
|
|
||||||
offset += mylen;
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* We've performed an allocation, so reset the
|
* We've performed an allocation, so reset the
|
||||||
|
@ -1137,6 +1299,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
metaslab_trace_fini(&zal);
|
metaslab_trace_fini(&zal);
|
||||||
|
range_tree_destroy(segs);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1944,6 +2107,10 @@ module_param(zfs_remove_max_segment, int, 0644);
|
||||||
MODULE_PARM_DESC(zfs_remove_max_segment,
|
MODULE_PARM_DESC(zfs_remove_max_segment,
|
||||||
"Largest contiguous segment to allocate when removing device");
|
"Largest contiguous segment to allocate when removing device");
|
||||||
|
|
||||||
|
module_param(vdev_removal_max_span, int, 0644);
|
||||||
|
MODULE_PARM_DESC(vdev_removal_max_span,
|
||||||
|
"Largest span of free chunks a remap segment can span");
|
||||||
|
|
||||||
EXPORT_SYMBOL(free_from_removing_vdev);
|
EXPORT_SYMBOL(free_from_removing_vdev);
|
||||||
EXPORT_SYMBOL(spa_removal_get_stats);
|
EXPORT_SYMBOL(spa_removal_get_stats);
|
||||||
EXPORT_SYMBOL(spa_remove_init);
|
EXPORT_SYMBOL(spa_remove_init);
|
||||||
|
|
Loading…
Reference in New Issue