Illumos 5164-5165 - space map fixes
5164 space_map_max_blksz causes panic, does not work
5165 zdb fails assertion when run on pool with recently-enabled
spacemap_histogram feature
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com>
Approved by: Dan McDonald <danmcd@omniti.com>
References:
https://www.illumos.org/issues/5164
https://www.illumos.org/issues/5165
https://github.com/illumos/illumos-gate/commit/b1be289
Porting Notes:
The metaslab_fragmentation() hunk was dropped from this patch
because it was already resolved by commit 8b0a084.
The comment modified in metaslab.c was updated to use the correct
variable name, space_map_blksz. The upstream commit incorrectly
used space_map_blksize.
Ported by: Turbo Fredriksson <turbo@bayour.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #2697
This commit is contained in:
parent
b02fe35d37
commit
9635861742
|
@ -1025,7 +1025,7 @@ ztest_random_spa_version(uint64_t initial_version)
|
||||||
* Find the largest ashift used
|
* Find the largest ashift used
|
||||||
*/
|
*/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
ztest_spa_get_ashift() {
|
ztest_spa_get_ashift(void) {
|
||||||
uint64_t i;
|
uint64_t i;
|
||||||
uint64_t ashift = SPA_MINBLOCKSHIFT;
|
uint64_t ashift = SPA_MINBLOCKSHIFT;
|
||||||
vdev_t *rvd = ztest_spa->spa_root_vdev;
|
vdev_t *rvd = ztest_spa->spa_root_vdev;
|
||||||
|
|
|
@ -133,17 +133,6 @@ typedef enum {
|
||||||
SM_FREE
|
SM_FREE
|
||||||
} maptype_t;
|
} maptype_t;
|
||||||
|
|
||||||
/*
|
|
||||||
* The data for a given space map can be kept on blocks of any size.
|
|
||||||
* Larger blocks entail fewer i/o operations, but they also cause the
|
|
||||||
* DMU to keep more data in-core, and also to waste more i/o bandwidth
|
|
||||||
* when only a few blocks have changed since the last transaction group.
|
|
||||||
* Rather than having a fixed block size for all space maps the block size
|
|
||||||
* can adjust as needed (see space_map_max_blksz). Set the initial block
|
|
||||||
* size for the space map to 4k.
|
|
||||||
*/
|
|
||||||
#define SPACE_MAP_INITIAL_BLOCKSIZE (1ULL << 12)
|
|
||||||
|
|
||||||
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
|
int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
|
||||||
|
|
||||||
void space_map_histogram_clear(space_map_t *sm);
|
void space_map_histogram_clear(space_map_t *sm);
|
||||||
|
|
|
@ -67,7 +67,7 @@ int zfs_condense_pct = 200;
|
||||||
/*
|
/*
|
||||||
* Condensing a metaslab is not guaranteed to actually reduce the amount of
|
* Condensing a metaslab is not guaranteed to actually reduce the amount of
|
||||||
* space used on disk. In particular, a space map uses data in increments of
|
* space used on disk. In particular, a space map uses data in increments of
|
||||||
* MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
|
* MAX(1 << ashift, space_map_blksz), so a metaslab might use the
|
||||||
* same number of blocks after condensing. Since the goal of condensing is to
|
* same number of blocks after condensing. Since the goal of condensing is to
|
||||||
* reduce the number of IOPs required to read the space map, we only want to
|
* reduce the number of IOPs required to read the space map, we only want to
|
||||||
* condense when we can be sure we will reduce the number of blocks used by the
|
* condense when we can be sure we will reduce the number of blocks used by the
|
||||||
|
@ -1864,6 +1864,15 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||||
|
|
||||||
mutex_enter(&msp->ms_lock);
|
mutex_enter(&msp->ms_lock);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Note: metaslab_condense() clears the space_map's histogram.
|
||||||
|
* Therefore we must verify and remove this histogram before
|
||||||
|
* condensing.
|
||||||
|
*/
|
||||||
|
metaslab_group_histogram_verify(mg);
|
||||||
|
metaslab_class_histogram_verify(mg->mg_class);
|
||||||
|
metaslab_group_histogram_remove(mg, msp);
|
||||||
|
|
||||||
if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
|
if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
|
||||||
metaslab_should_condense(msp)) {
|
metaslab_should_condense(msp)) {
|
||||||
metaslab_condense(msp, txg, tx);
|
metaslab_condense(msp, txg, tx);
|
||||||
|
@ -1872,9 +1881,6 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
|
||||||
space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
|
space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
|
||||||
}
|
}
|
||||||
|
|
||||||
metaslab_group_histogram_verify(mg);
|
|
||||||
metaslab_class_histogram_verify(mg->mg_class);
|
|
||||||
metaslab_group_histogram_remove(mg, msp);
|
|
||||||
if (msp->ms_loaded) {
|
if (msp->ms_loaded) {
|
||||||
/*
|
/*
|
||||||
* When the space map is loaded, we have an accurate
|
* When the space map is loaded, we have an accurate
|
||||||
|
|
|
@ -38,15 +38,12 @@
|
||||||
#include <sys/zfeature.h>
|
#include <sys/zfeature.h>
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* This value controls how the space map's block size is allowed to grow.
|
* The data for a given space map can be kept on blocks of any size.
|
||||||
* If the value is set to the same size as SPACE_MAP_INITIAL_BLOCKSIZE then
|
* Larger blocks entail fewer i/o operations, but they also cause the
|
||||||
* the space map block size will remain fixed. Setting this value to something
|
* DMU to keep more data in-core, and also to waste more i/o bandwidth
|
||||||
* greater than SPACE_MAP_INITIAL_BLOCKSIZE will allow the space map to
|
* when only a few blocks have changed since the last transaction group.
|
||||||
* increase its block size as needed. To maintain backwards compatibility the
|
|
||||||
* space map's block size must be a power of 2 and SPACE_MAP_INITIAL_BLOCKSIZE
|
|
||||||
* or larger.
|
|
||||||
*/
|
*/
|
||||||
int space_map_max_blksz = (1 << 12);
|
int space_map_blksz = (1 << 12);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Load the space map disk into the specified range tree. Segments of maptype
|
* Load the space map disk into the specified range tree. Segments of maptype
|
||||||
|
@ -236,58 +233,6 @@ space_map_entries(space_map_t *sm, range_tree_t *rt)
|
||||||
return (entries);
|
return (entries);
|
||||||
}
|
}
|
||||||
|
|
||||||
void
|
|
||||||
space_map_set_blocksize(space_map_t *sm, uint64_t size, dmu_tx_t *tx)
|
|
||||||
{
|
|
||||||
uint32_t blksz;
|
|
||||||
u_longlong_t blocks;
|
|
||||||
|
|
||||||
ASSERT3U(sm->sm_blksz, !=, 0);
|
|
||||||
ASSERT3U(space_map_object(sm), !=, 0);
|
|
||||||
ASSERT(sm->sm_dbuf != NULL);
|
|
||||||
VERIFY(ISP2(space_map_max_blksz));
|
|
||||||
|
|
||||||
if (sm->sm_blksz >= space_map_max_blksz)
|
|
||||||
return;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* The object contains more than one block so we can't adjust
|
|
||||||
* its size.
|
|
||||||
*/
|
|
||||||
if (sm->sm_phys->smp_objsize > sm->sm_blksz)
|
|
||||||
return;
|
|
||||||
|
|
||||||
if (size > sm->sm_blksz) {
|
|
||||||
uint64_t newsz;
|
|
||||||
|
|
||||||
/*
|
|
||||||
* Older software versions treat space map blocks as fixed
|
|
||||||
* entities. The DMU is capable of handling different block
|
|
||||||
* sizes making it possible for us to increase the
|
|
||||||
* block size and maintain backwards compatibility. The
|
|
||||||
* caveat is that the new block sizes must be a
|
|
||||||
* power of 2 so that old software can append to the file,
|
|
||||||
* adding more blocks. The block size can grow until it
|
|
||||||
* reaches space_map_max_blksz.
|
|
||||||
*/
|
|
||||||
newsz = ISP2(size) ? size : 1ULL << highbit64(size);
|
|
||||||
if (newsz > space_map_max_blksz)
|
|
||||||
newsz = space_map_max_blksz;
|
|
||||||
|
|
||||||
VERIFY0(dmu_object_set_blocksize(sm->sm_os,
|
|
||||||
space_map_object(sm), newsz, 0, tx));
|
|
||||||
dmu_object_size_from_db(sm->sm_dbuf, &blksz, &blocks);
|
|
||||||
|
|
||||||
zfs_dbgmsg("txg %llu, spa %s, increasing blksz from %d to %d",
|
|
||||||
dmu_tx_get_txg(tx), spa_name(dmu_objset_spa(sm->sm_os)),
|
|
||||||
sm->sm_blksz, blksz);
|
|
||||||
|
|
||||||
VERIFY3U(newsz, ==, blksz);
|
|
||||||
VERIFY3U(sm->sm_blksz, <, blksz);
|
|
||||||
sm->sm_blksz = blksz;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Note: space_map_write() will drop sm_lock across dmu_write() calls.
|
* Note: space_map_write() will drop sm_lock across dmu_write() calls.
|
||||||
*/
|
*/
|
||||||
|
@ -301,7 +246,7 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
|
||||||
range_seg_t *rs;
|
range_seg_t *rs;
|
||||||
uint64_t size, total, rt_space, nodes;
|
uint64_t size, total, rt_space, nodes;
|
||||||
uint64_t *entry, *entry_map, *entry_map_end;
|
uint64_t *entry, *entry_map, *entry_map_end;
|
||||||
uint64_t newsz, expected_entries, actual_entries = 1;
|
uint64_t expected_entries, actual_entries = 1;
|
||||||
|
|
||||||
ASSERT(MUTEX_HELD(rt->rt_lock));
|
ASSERT(MUTEX_HELD(rt->rt_lock));
|
||||||
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
||||||
|
@ -327,13 +272,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
|
||||||
|
|
||||||
expected_entries = space_map_entries(sm, rt);
|
expected_entries = space_map_entries(sm, rt);
|
||||||
|
|
||||||
/*
|
|
||||||
* Calculate the new size for the space map on-disk and see if
|
|
||||||
* we can grow the block size to accommodate the new size.
|
|
||||||
*/
|
|
||||||
newsz = sm->sm_phys->smp_objsize + expected_entries * sizeof (uint64_t);
|
|
||||||
space_map_set_blocksize(sm, newsz, tx);
|
|
||||||
|
|
||||||
entry_map = zio_buf_alloc(sm->sm_blksz);
|
entry_map = zio_buf_alloc(sm->sm_blksz);
|
||||||
entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
|
entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
|
||||||
entry = entry_map;
|
entry = entry_map;
|
||||||
|
@ -465,46 +403,48 @@ space_map_close(space_map_t *sm)
|
||||||
kmem_free(sm, sizeof (*sm));
|
kmem_free(sm, sizeof (*sm));
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
|
||||||
space_map_reallocate(space_map_t *sm, dmu_tx_t *tx)
|
|
||||||
{
|
|
||||||
ASSERT(dmu_tx_is_syncing(tx));
|
|
||||||
|
|
||||||
space_map_free(sm, tx);
|
|
||||||
dmu_buf_rele(sm->sm_dbuf, sm);
|
|
||||||
|
|
||||||
sm->sm_object = space_map_alloc(sm->sm_os, tx);
|
|
||||||
VERIFY0(space_map_open_impl(sm));
|
|
||||||
}
|
|
||||||
|
|
||||||
void
|
void
|
||||||
space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
|
space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
|
||||||
{
|
{
|
||||||
objset_t *os = sm->sm_os;
|
objset_t *os = sm->sm_os;
|
||||||
spa_t *spa = dmu_objset_spa(os);
|
spa_t *spa = dmu_objset_spa(os);
|
||||||
dmu_object_info_t doi;
|
dmu_object_info_t doi;
|
||||||
int bonuslen;
|
|
||||||
|
|
||||||
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
|
||||||
ASSERT(dmu_tx_is_syncing(tx));
|
ASSERT(dmu_tx_is_syncing(tx));
|
||||||
|
|
||||||
VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
|
|
||||||
dmu_object_info_from_db(sm->sm_dbuf, &doi);
|
dmu_object_info_from_db(sm->sm_dbuf, &doi);
|
||||||
|
|
||||||
if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
|
/*
|
||||||
bonuslen = sizeof (space_map_phys_t);
|
* If the space map has the wrong bonus size (because
|
||||||
ASSERT3U(bonuslen, <=, dmu_bonus_max());
|
* SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
|
||||||
} else {
|
* the wrong block size (because space_map_blksz has changed),
|
||||||
bonuslen = SPACE_MAP_SIZE_V0;
|
* free and re-allocate its object with the updated sizes.
|
||||||
}
|
*
|
||||||
|
* Otherwise, just truncate the current object.
|
||||||
if (bonuslen != doi.doi_bonus_size ||
|
*/
|
||||||
doi.doi_data_block_size != SPACE_MAP_INITIAL_BLOCKSIZE) {
|
if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
|
||||||
|
doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
|
||||||
|
doi.doi_data_block_size != space_map_blksz) {
|
||||||
zfs_dbgmsg("txg %llu, spa %s, reallocating: "
|
zfs_dbgmsg("txg %llu, spa %s, reallocating: "
|
||||||
"old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
|
"old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
|
||||||
spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
|
spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
|
||||||
space_map_reallocate(sm, tx);
|
|
||||||
VERIFY3U(sm->sm_blksz, ==, SPACE_MAP_INITIAL_BLOCKSIZE);
|
space_map_free(sm, tx);
|
||||||
|
dmu_buf_rele(sm->sm_dbuf, sm);
|
||||||
|
|
||||||
|
sm->sm_object = space_map_alloc(sm->sm_os, tx);
|
||||||
|
VERIFY0(space_map_open_impl(sm));
|
||||||
|
} else {
|
||||||
|
VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If the spacemap is reallocated, its histogram
|
||||||
|
* will be reset. Do the same in the common case so that
|
||||||
|
* bugs related to the uncommon case do not go unnoticed.
|
||||||
|
*/
|
||||||
|
bzero(sm->sm_phys->smp_histogram,
|
||||||
|
sizeof (sm->sm_phys->smp_histogram));
|
||||||
}
|
}
|
||||||
|
|
||||||
dmu_buf_will_dirty(sm->sm_dbuf, tx);
|
dmu_buf_will_dirty(sm->sm_dbuf, tx);
|
||||||
|
@ -543,7 +483,7 @@ space_map_alloc(objset_t *os, dmu_tx_t *tx)
|
||||||
}
|
}
|
||||||
|
|
||||||
object = dmu_object_alloc(os,
|
object = dmu_object_alloc(os,
|
||||||
DMU_OT_SPACE_MAP, SPACE_MAP_INITIAL_BLOCKSIZE,
|
DMU_OT_SPACE_MAP, space_map_blksz,
|
||||||
DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
|
DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
|
||||||
|
|
||||||
return (object);
|
return (object);
|
||||||
|
|
Loading…
Reference in New Issue