diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index d1e77cce7f..0d2f3623b6 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -776,7 +776,6 @@ verify_spacemap_refcounts(spa_t *spa) static void dump_spacemap(objset_t *os, space_map_t *sm) { - uint64_t alloc, offset, entry; const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", "INVALID", "INVALID", "INVALID", "INVALID" }; @@ -793,41 +792,73 @@ dump_spacemap(objset_t *os, space_map_t *sm) /* * Print out the freelist entries in both encoded and decoded form. */ - alloc = 0; - for (offset = 0; offset < space_map_length(sm); - offset += sizeof (entry)) { - uint8_t mapshift = sm->sm_shift; + uint8_t mapshift = sm->sm_shift; + int64_t alloc = 0; + uint64_t word; + for (uint64_t offset = 0; offset < space_map_length(sm); + offset += sizeof (word)) { VERIFY0(dmu_read(os, space_map_object(sm), offset, - sizeof (entry), &entry, DMU_READ_PREFETCH)); - if (SM_DEBUG_DECODE(entry)) { + sizeof (word), &word, DMU_READ_PREFETCH)); + if (sm_entry_is_debug(word)) { (void) printf("\t [%6llu] %s: txg %llu, pass %llu\n", - (u_longlong_t)(offset / sizeof (entry)), - ddata[SM_DEBUG_ACTION_DECODE(entry)], - (u_longlong_t)SM_DEBUG_TXG_DECODE(entry), - (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry)); - } else { - (void) printf("\t [%6llu] %c range:" - " %010llx-%010llx size: %06llx\n", - (u_longlong_t)(offset / sizeof (entry)), - SM_TYPE_DECODE(entry) == SM_ALLOC ? 
'A' : 'F', - (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + sm->sm_start), - (u_longlong_t)((SM_OFFSET_DECODE(entry) << - mapshift) + sm->sm_start + - (SM_RUN_DECODE(entry) << mapshift)), - (u_longlong_t)(SM_RUN_DECODE(entry) << mapshift)); - if (SM_TYPE_DECODE(entry) == SM_ALLOC) - alloc += SM_RUN_DECODE(entry) << mapshift; - else - alloc -= SM_RUN_DECODE(entry) << mapshift; + (u_longlong_t)(offset / sizeof (word)), + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)SM_DEBUG_TXG_DECODE(word), + (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(word)); + continue; } + + uint8_t words; + char entry_type; + uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; + + if (sm_entry_is_single_word(word)) { + entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? + 'A' : 'F'; + entry_off = (SM_OFFSET_DECODE(word) << mapshift) + + sm->sm_start; + entry_run = SM_RUN_DECODE(word) << mapshift; + words = 1; + } else { + /* it is a two-word entry so we read another word */ + ASSERT(sm_entry_is_double_word(word)); + + uint64_t extra_word; + offset += sizeof (extra_word); + /* the second word must lie within the map */ + ASSERT3U(offset, <, space_map_length(sm)); + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (extra_word), &extra_word, + DMU_READ_PREFETCH)); + + entry_run = SM2_RUN_DECODE(word) << mapshift; + entry_vdev = SM2_VDEV_DECODE(word); + entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? 
+ 'A' : 'F'; + entry_off = (SM2_OFFSET_DECODE(extra_word) << + mapshift) + sm->sm_start; + words = 2; + } + + (void) printf("\t [%6llu] %c range:" + " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", + (u_longlong_t)(offset / sizeof (word)), + entry_type, (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run), + (u_longlong_t)entry_run, + (u_longlong_t)entry_vdev, words); + + if (entry_type == 'A') + alloc += entry_run; + else + alloc -= entry_run; } - if (alloc != space_map_allocated(sm)) { - (void) printf("space_map_object alloc (%llu) INCONSISTENT " - "with space map summary (%llu)\n", - (u_longlong_t)space_map_allocated(sm), (u_longlong_t)alloc); + if ((uint64_t)alloc != space_map_allocated(sm)) { + (void) printf("space_map_object alloc (%lld) INCONSISTENT " + "with space map summary (%lld)\n", + (longlong_t)space_map_allocated(sm), (longlong_t)alloc); } } @@ -1158,7 +1189,7 @@ dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) dump_dde(ddt, &dde, walk); - ASSERT(error == ENOENT); + ASSERT3U(error, ==, ENOENT); (void) printf("\n"); } @@ -3579,15 +3610,14 @@ typedef struct checkpoint_sm_exclude_entry_arg { } checkpoint_sm_exclude_entry_arg_t; static int -checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) { checkpoint_sm_exclude_entry_arg_t *cseea = arg; vdev_t *vd = cseea->cseea_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; - ASSERT(type == SM_FREE); + ASSERT(sme->sme_type == SM_FREE); /* * Since the vdev_checkpoint_sm exists in the vdev level @@ -3605,7 +3635,7 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size, * metaslab boundaries. 
So if needed we could add code * that handles metaslab-crossing segments in the future. */ - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -3613,10 +3643,10 @@ checkpoint_sm_exclude_entry_cb(maptype_t type, uint64_t offset, uint64_t size, * also verify that the entry is there to begin with. */ mutex_enter(&ms->ms_lock); - range_tree_remove(ms->ms_allocatable, offset, size); + range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); - cseea->cseea_checkpoint_size += size; + cseea->cseea_checkpoint_size += sme->sme_run; return (0); } @@ -4606,15 +4636,14 @@ typedef struct verify_checkpoint_sm_entry_cb_arg { #define ENTRIES_PER_PROGRESS_UPDATE 10000 static int -verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg) { verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg; vdev_t *vd = vcsec->vcsec_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; - ASSERT(type == SM_FREE); + ASSERT(sme->sme_type == SM_FREE); if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) { (void) fprintf(stderr, @@ -4628,7 +4657,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size, /* * See comment in checkpoint_sm_exclude_entry_cb() */ - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -4637,7 +4666,7 @@ verify_checkpoint_sm_entry_cb(maptype_t type, uint64_t offset, uint64_t size, * their respective ms_allocateable trees should not contain them. 
*/ mutex_enter(&ms->ms_lock); - range_tree_verify(ms->ms_allocatable, offset, size); + range_tree_verify(ms->ms_allocatable, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); return (0); @@ -4883,7 +4912,7 @@ verify_checkpoint(spa_t *spa) DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); - if (error == ENOENT) { + if (error == ENOENT && !dump_opt['L']) { /* * If the feature is active but the uberblock is missing * then we must be in the middle of discarding the @@ -4906,7 +4935,7 @@ verify_checkpoint(spa_t *spa) error = 3; } - if (error == 0) + if (error == 0 && !dump_opt['L']) verify_checkpoint_blocks(spa); return (error); @@ -5015,7 +5044,7 @@ dump_zpool(spa_t *spa) if (dump_opt['h']) dump_history(spa); - if (rc == 0 && !dump_opt['L']) + if (rc == 0) rc = verify_checkpoint(spa); if (rc != 0) { diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c index 78ad7e8de0..5347a0abef 100644 --- a/cmd/ztest/ztest.c +++ b/cmd/ztest/ztest.c @@ -202,6 +202,7 @@ extern int metaslab_preload_limit; extern boolean_t zfs_compressed_arc_enabled; extern int zfs_abd_scatter_enabled; extern int dmu_object_alloc_chunk_shift; +extern boolean_t zfs_force_some_double_word_sm_entries; static ztest_shared_opts_t *ztest_shared_opts; static ztest_shared_opts_t ztest_opts; @@ -7349,6 +7350,12 @@ main(int argc, char **argv) dprintf_setup(&argc, argv); zfs_deadman_synctime_ms = 300000; + /* + * As two-word space map entries may not come up often (especially + * if pool and vdev sizes are small) we want to force at least some + * of them so the feature gets tested. 
+ */ + zfs_force_some_double_word_sm_entries = B_TRUE; action.sa_handler = sig_handler; sigemptyset(&action.sa_mask); diff --git a/include/sys/spa.h b/include/sys/spa.h index b6483e11b2..4a3fc71f74 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -153,6 +153,7 @@ _NOTE(CONSTCOND) } while (0) #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ #define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). @@ -177,15 +178,15 @@ typedef struct zio_cksum_salt { * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 0 | vdev1 | GRID | ASIZE | + * 0 | pad | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 2 | vdev2 | GRID | ASIZE | + * 2 | pad | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 4 | vdev3 | GRID | ASIZE | + * 4 | pad | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -443,8 +444,9 @@ typedef struct blkptr { #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) -#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) -#define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) diff --git a/include/sys/space_map.h b/include/sys/space_map.h index 98b87269cb..64c97bb4dd 100644 --- a/include/sys/space_map.h +++ 
b/include/sys/space_map.h @@ -93,50 +93,100 @@ typedef struct space_map { /* * debug entry * - * 1 3 10 50 - * ,---+--------+------------+---------------------------------. - * | 1 | action | syncpass | txg (lower bits) | - * `---+--------+------------+---------------------------------' - * 63 62 60 59 50 49 0 + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 * * - * non-debug entry + * one-word entry * * 1 47 1 15 - * ,-----------------------------------------------------------. + * +-----------------------------------------------------------+ * | 0 | offset (sm_shift units) | type | run | - * `-----------------------------------------------------------' - * 63 62 17 16 15 0 + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0). 
*/ -/* All this stuff takes and returns bytes */ -#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, 15) + 1) -#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, 15) -#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) -#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) -#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, 47) -#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, 47) -#define SM_DEBUG_DECODE(x) BF64_DECODE(x, 63, 1) -#define SM_DEBUG_ENCODE(x) BF64_ENCODE(x, 63, 1) - -#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 3) -#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 3) - -#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) -#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) - -#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) -#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) - -#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) - typedef enum { SM_ALLOC, SM_FREE } maptype_t; -typedef int (*sm_cb_t)(maptype_t type, uint64_t offset, uint64_t size, - void *arg); +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) + +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 + +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_OFFSET_DECODE(x) 
BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg); @@ -154,7 +204,9 @@ uint64_t space_map_allocated(space_map_t *sm); uint64_t space_map_length(space_map_t *sm); void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx); + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); void space_map_free(space_map_t *sm, dmu_tx_t *tx); diff --git a/include/zfeature_common.h b/include/zfeature_common.h index 
c59b800d3c..c5aabce0e3 100644 --- a/include/zfeature_common.h +++ b/include/zfeature_common.h @@ -62,6 +62,7 @@ typedef enum spa_feature { SPA_FEATURE_DEVICE_REMOVAL, SPA_FEATURE_OBSOLETE_COUNTS, SPA_FEATURE_POOL_CHECKPOINT, + SPA_FEATURE_SPACEMAP_V2, SPA_FEATURES } spa_feature_t; diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 index e93943ec26..8d5f468217 100644 --- a/man/man5/zpool-features.5 +++ b/man/man5/zpool-features.5 @@ -503,6 +503,29 @@ is used to checkpoint the pool. The feature will only return back to being \fBenabled\fR when the pool is rewound or the checkpoint has been discarded. +.RE +.sp +.ne 2 +.na +\fB\fBspacemap_v2\fR\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:spacemap_v2 +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables the use of the new space map encoding which +consists of two words (instead of one) whenever it is advantageous. +The new encoding allows space maps to represent large regions of +space more efficiently on-disk while also increasing their maximum +addressable offset. + +This feature becomes \fBactive\fR once it is \fBenabled\fR, and never +returns back to being \fBenabled\fR. 
+ .RE .sp .ne 2 diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c index b010c88434..f5c98933cc 100644 --- a/module/zcommon/zfeature_common.c +++ b/module/zcommon/zfeature_common.c @@ -215,6 +215,12 @@ zpool_feature_init(void) "Pool state can be checkpointed, allowing rewind later.", ZFEATURE_FLAG_READONLY_COMPAT, NULL); + zfeature_register(SPA_FEATURE_SPACEMAP_V2, + "com.delphix:spacemap_v2", "spacemap_v2", + "Space maps representing large segments are more efficient.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + NULL); + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, "com.delphix:extensible_dataset", "extensible_dataset", "Enhanced dataset functionality, used by other features.", diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 76fa99e8b1..879238e7d8 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -2025,17 +2025,6 @@ metaslab_group_preload(metaslab_group_t *mg) * * 3. The on-disk size of the space map should actually decrease. * - * Checking the first condition is tricky since we don't want to walk - * the entire AVL tree calculating the estimated on-disk size. Instead we - * use the size-ordered range tree in the metaslab and calculate the - * size required to write out the largest segment in our free tree. If the - * size required to represent that segment on disk is larger than the space - * map object then we avoid condensing this map. - * - * To determine the second criterion we use a best-case estimate and assume - * each segment can be represented on-disk as a single 64-bit entry. We refer - * to this best-case estimate as the space map's minimal form. - * * Unfortunately, we cannot compute the on-disk size of the space map in this * context because we cannot accurately compute the effects of compression, etc. 
* Instead, we apply the heuristic described in the block comment for @@ -2046,9 +2035,6 @@ static boolean_t metaslab_should_condense(metaslab_t *msp) { space_map_t *sm = msp->ms_sm; - range_seg_t *rs; - uint64_t size, entries, segsz, object_size, optimal_size, record_size; - dmu_object_info_t doi; vdev_t *vd = msp->ms_group->mg_vd; uint64_t vdev_blocksize = 1 << vd->vdev_ashift; uint64_t current_txg = spa_syncing_txg(vd->vdev_spa); @@ -2074,34 +2060,22 @@ metaslab_should_condense(metaslab_t *msp) msp->ms_condense_checked_txg = current_txg; /* - * Use the ms_allocatable_by_size range tree, which is ordered by - * size, to obtain the largest segment in the free tree. We always - * condense metaslabs that are empty and metaslabs for which a - * condense request has been made. + * We always condense metaslabs that are empty and metaslabs for + * which a condense request has been made. */ - rs = avl_last(&msp->ms_allocatable_by_size); - if (rs == NULL || msp->ms_condense_wanted) + if (avl_is_empty(&msp->ms_allocatable_by_size) || + msp->ms_condense_wanted) return (B_TRUE); - /* - * Calculate the number of 64-bit entries this segment would - * require when written to disk. If this single segment would be - * larger on-disk than the entire current on-disk structure, then - * clearly condensing will increase the on-disk structure size. 
- */ - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries = size / (MIN(size, SM_RUN_MAX)); - segsz = entries * sizeof (uint64_t); - - optimal_size = - sizeof (uint64_t) * avl_numnodes(&msp->ms_allocatable->rt_root); - object_size = space_map_length(msp->ms_sm); + uint64_t object_size = space_map_length(msp->ms_sm); + uint64_t optimal_size = space_map_estimate_optimal_size(sm, + msp->ms_allocatable, SM_NO_VDEVID); + dmu_object_info_t doi; dmu_object_info_from_db(sm->sm_dbuf, &doi); - record_size = MAX(doi.doi_data_block_size, vdev_blocksize); + uint64_t record_size = MAX(doi.doi_data_block_size, vdev_blocksize); - return (segsz <= object_size && - object_size >= (optimal_size * zfs_condense_pct / 100) && + return (object_size >= (optimal_size * zfs_condense_pct / 100) && object_size > zfs_metaslab_condense_block_threshold * record_size); } @@ -2177,11 +2151,11 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) * optimal, this is typically close to optimal, and much cheaper to * compute. 
*/ - space_map_write(sm, condense_tree, SM_ALLOC, tx); + space_map_write(sm, condense_tree, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(condense_tree, NULL, NULL); range_tree_destroy(condense_tree); - space_map_write(sm, msp->ms_allocatable, SM_FREE, tx); + space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); msp->ms_condensing = B_FALSE; } @@ -2293,8 +2267,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) metaslab_condense(msp, txg, tx); } else { mutex_exit(&msp->ms_lock); - space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx); - space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, tx); + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); } @@ -2309,7 +2285,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ mutex_exit(&msp->ms_lock); space_map_write(vd->vdev_checkpoint_sm, - msp->ms_checkpointing, SM_FREE, tx); + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); mutex_enter(&msp->ms_lock); space_map_update(vd->vdev_checkpoint_sm); diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c index 544658821d..6f7e9ab839 100644 --- a/module/zfs/spa_checkpoint.c +++ b/module/zfs/spa_checkpoint.c @@ -203,13 +203,12 @@ typedef struct spa_checkpoint_discard_sync_callback_arg { } spa_checkpoint_discard_sync_callback_arg_t; static int -spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, - uint64_t size, void *arg) +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) { spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; vdev_t *vd = sdc->sdc_vd; - metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; - uint64_t end = offset + size; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; if (sdc->sdc_entry_limit == 0) return (EINTR); @@ -224,8 +223,8 @@ 
spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, * metaslab boundaries. So if needed we could add code * that handles metaslab-crossing segments in the future. */ - VERIFY3U(type, ==, SM_FREE); - VERIFY3U(offset, >=, ms->ms_start); + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); VERIFY3U(end, <=, ms->ms_start + ms->ms_size); /* @@ -237,14 +236,15 @@ spa_checkpoint_discard_sync_callback(maptype_t type, uint64_t offset, mutex_enter(&ms->ms_lock); if (range_tree_is_empty(ms->ms_freeing)) vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); - range_tree_add(ms->ms_freeing, offset, size); + range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); mutex_exit(&ms->ms_lock); - ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, size); - ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, size); + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, + sme->sme_run); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); - vd->vdev_spa->spa_checkpoint_info.sci_dspace -= size; - vd->vdev_stat.vs_checkpoint_space -= size; + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; + vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; sdc->sdc_entry_limit--; return (0); @@ -291,12 +291,13 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) * Thus, we set the maximum entries that the space map callback * will be applied to be half the entries that could fit in the * imposed memory limit. + * + * Note that since this is a conservative estimate we also + * assume the worst case scenario in our computation where each + * entry is two-word. 
*/ uint64_t max_entry_limit = - (zfs_spa_discard_memory_limit / sizeof (uint64_t)) >> 1; - - uint64_t entries_in_sm = - space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; /* * Iterate from the end of the space map towards the beginning, @@ -320,14 +321,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) spa_checkpoint_discard_sync_callback_arg_t sdc; sdc.sdc_vd = vd; sdc.sdc_txg = tx->tx_txg; - sdc.sdc_entry_limit = MIN(entries_in_sm, max_entry_limit); + sdc.sdc_entry_limit = max_entry_limit; - uint64_t entries_before = entries_in_sm; + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, spa_checkpoint_discard_sync_callback, &sdc, tx); - uint64_t entries_after = + uint64_t words_after = space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); #ifdef ZFS_DEBUG @@ -335,9 +337,9 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) #endif zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " - "deleted %llu entries - %llu entries are left", - tx->tx_txg, vd->vdev_id, (entries_before - entries_after), - entries_after); + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); if (error != EINTR) { if (error != 0) { @@ -346,15 +348,15 @@ spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) "space map of vdev %llu\n", error, vd->vdev_id); } - ASSERT0(entries_after); + ASSERT0(words_after); ASSERT0(vd->vdev_checkpoint_sm->sm_alloc); - ASSERT0(vd->vdev_checkpoint_sm->sm_length); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); space_map_free(vd->vdev_checkpoint_sm, tx); space_map_close(vd->vdev_checkpoint_sm); vd->vdev_checkpoint_sm = NULL; - VERIFY0(zap_remove(vd->vdev_spa->spa_meta_objset, + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); 
} } diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c index 0e5a4b9765..5f67a79872 100644 --- a/module/zfs/space_map.c +++ b/module/zfs/space_map.c @@ -41,11 +41,36 @@ * Note on space map block size: * * The data for a given space map can be kept on blocks of any size. - * Larger blocks entail fewer i/o operations, but they also cause the - * DMU to keep more data in-core, and also to waste more i/o bandwidth + * Larger blocks entail fewer I/O operations, but they also cause the + * DMU to keep more data in-core, and also to waste more I/O bandwidth * when only a few blocks have changed since the last transaction group. */ +/* + * Enabled whenever we want to stress test the use of double-word + * space map entries. + */ +boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; + +boolean_t +sm_entry_is_debug(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); +} + +boolean_t +sm_entry_is_single_word(uint64_t e) +{ + uint8_t prefix = SM_PREFIX_DECODE(e); + return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); +} + +boolean_t +sm_entry_is_double_word(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM2_PREFIX); +} + /* * Iterate through the space map, invoking the callback on each (non-debug) * space map entry. 
@@ -53,56 +78,157 @@ int space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg) { - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t bufsize, size, offset, end; + uint64_t sm_len = space_map_length(sm); + ASSERT3U(sm->sm_blksz, !=, 0); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len, + ZIO_PRIORITY_SYNC_READ); + + uint64_t blksz = sm->sm_blksz; int error = 0; - - end = space_map_length(sm); - - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = vmem_alloc(bufsize, KM_SLEEP); - - if (end > bufsize) { - dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize, - end - bufsize, ZIO_PRIORITY_SYNC_READ); - } - - for (offset = 0; offset < end && error == 0; offset += bufsize) { - size = MIN(end - offset, bufsize); - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY(size != 0); - ASSERT3U(sm->sm_blksz, !=, 0); - - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); - - error = dmu_read(sm->sm_os, space_map_object(sm), offset, size, - entry_map, DMU_READ_PREFETCH); + for (uint64_t block_base = 0; block_base < sm_len && error == 0; + block_base += blksz) { + dmu_buf_t *db; + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), + block_base, FTAG, &db, DMU_READ_PREFETCH); if (error != 0) - break; + return (error); - entry_map_end = entry_map + (size / sizeof (uint64_t)); - for (entry = entry_map; entry < entry_map_end && error == 0; - entry++) { - uint64_t e = *entry; - uint64_t offset, size; + uint64_t *block_start = db->db_data; + uint64_t block_length = MIN(sm_len - block_base, blksz); + uint64_t *block_end = block_start + + (block_length / sizeof (uint64_t)); - if (SM_DEBUG_DECODE(e)) /* Skip debug entries */ + VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); + VERIFY3U(block_length, !=, 0); + ASSERT3U(blksz, ==, db->db_size); + + for (uint64_t *block_cursor = block_start; + block_cursor < block_end && error == 0; block_cursor++) { + uint64_t e = *block_cursor; + + if 
(sm_entry_is_debug(e)) /* Skip debug entries */ continue; - offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - size = SM_RUN_DECODE(e) << sm->sm_shift; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + /* it is a two-word entry */ + ASSERT(sm_entry_is_double_word(e)); + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); - VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift)); - VERIFY3U(offset, >=, sm->sm_start); - VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size); - error = callback(SM_TYPE_DECODE(e), offset, size, arg); + /* move on to the second word */ + block_cursor++; + e = *block_cursor; + VERIFY3P(block_cursor, <=, block_end); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = (raw_offset << sm->sm_shift) + + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + ASSERT3U(entry_offset, >=, sm->sm_start); + ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); + ASSERT3U(entry_run, <=, sm->sm_size); + ASSERT3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); + + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + } + dmu_buf_rele(db, FTAG); + } + return (error); +} + +/* + * Reads the entries from the last block of the space map into + * buf in reverse order. Populates nwords with number of words + * in the last block. + * + * Refer to block comment within space_map_incremental_destroy() + * to understand why this function is needed. 
+ */ +static int +space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, + uint64_t bufsz, uint64_t *nwords) +{ + int error = 0; + dmu_buf_t *db; + + /* + * Find the offset of the last word in the space map and use + * that to read the last block of the space map with + * dmu_buf_hold(). + */ + uint64_t last_word_offset = + sm->sm_phys->smp_objsize - sizeof (uint64_t); + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (error != 0) + return (error); + + ASSERT3U(sm->sm_object, ==, db->db_object); + ASSERT3U(sm->sm_blksz, ==, db->db_size); + ASSERT3U(bufsz, >=, db->db_size); + ASSERT(nwords != NULL); + + uint64_t *words = db->db_data; + *nwords = + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); + + uint64_t n = *nwords; + uint64_t j = n - 1; + for (uint64_t i = 0; i < n; i++) { + uint64_t entry = words[i]; + if (sm_entry_is_double_word(entry)) { + /* + * Since we are populating the buffer backwards + * we have to be extra careful and add the two + * words of the double-word entry in the right + * order. + */ + ASSERT3U(j, >, 0); + buf[j - 1] = entry; + + i++; + ASSERT3U(i, <, n); + entry = words[i]; + buf[j] = entry; + j -= 2; + } else { + ASSERT(sm_entry_is_debug(entry) || + sm_entry_is_single_word(entry)); + buf[j] = entry; + j--; } } - vmem_free(entry_map, bufsize); + /* + * Assert that we wrote backwards all the + * way to the beginning of the buffer. 
+ */ + ASSERT3S(j, ==, -1); + + dmu_buf_rele(db, FTAG); return (error); } @@ -116,124 +242,122 @@ int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, dmu_tx_t *tx) { - uint64_t bufsize, len; - uint64_t *entry_map; - int error = 0; - - len = space_map_length(sm); - bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); - entry_map = zio_buf_alloc(bufsize); + uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + uint64_t *buf = zio_buf_alloc(bufsz); dmu_buf_will_dirty(sm->sm_dbuf, tx); /* - * Since we can't move the starting offset of the space map - * (e.g there are reference on-disk pointing to it), we destroy - * its entries incrementally starting from the end. + * Ideally we would want to iterate from the beginning of the + * space map to the end in incremental steps. The issue with this + * approach is that we don't have any field on-disk that points + * us where to start between each step. We could try zeroing out + * entries that we've destroyed, but this doesn't work either as + * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). * - * The logic that follows is basically the same as the one used - * in space_map_iterate() but it traverses the space map - * backwards: + * As a result, we destroy its entries incrementally starting from + * the end after applying the callback to each of them. * - * 1] We figure out the size of the buffer that we want to use - * to read the on-disk space map entries. - * 2] We figure out the offset at the end of the space map where - * we will start reading entries into our buffer. - * 3] We read the on-disk entries into the buffer. - * 4] We iterate over the entries from end to beginning calling - * the callback function on each one. As we move from entry - * to entry we decrease the size of the space map, deleting - * effectively each entry. - * 5] If there are no more entries in the space map or the - * callback returns a value other than 0, we stop iterating - * over the space map. 
If there are entries remaining and - * the callback returned zero we go back to step [1]. + * The problem with this approach is that we cannot literally + * iterate through the words in the space map backwards as we + * can't distinguish two-word space map entries from their second + * word. Thus we do the following: + * + * 1] We get all the entries from the last block of the space map + * and put them into a buffer in reverse order. This way the + * last entry comes first in the buffer, the second to last is + * second, etc. + * 2] We iterate through the entries in the buffer and we apply + * the callback to each one. As we move from entry to entry + * we decrease the size of the space map, effectively deleting + * each entry. + * 3] If there are no more entries in the space map or the callback + * returns a value other than 0, we stop iterating over the + * space map. If there are entries remaining and the callback + * returned 0, we go back to step [1]. */ - uint64_t offset = 0, size = 0; - while (len > 0 && error == 0) { - size = MIN(bufsize, len); - - VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0); - VERIFY3U(size, >, 0); - ASSERT3U(sm->sm_blksz, !=, 0); - - offset = len - size; - - IMPLY(bufsize > len, offset == 0); - IMPLY(bufsize == len, offset == 0); - IMPLY(bufsize < len, offset > 0); - - - EQUIV(size == len, offset == 0); - IMPLY(size < len, bufsize < len); - - dprintf("object=%llu offset=%llx size=%llx\n", - space_map_object(sm), offset, size); - - error = dmu_read(sm->sm_os, space_map_object(sm), - offset, size, entry_map, DMU_READ_PREFETCH); + int error = 0; + while (space_map_length(sm) > 0 && error == 0) { + uint64_t nwords = 0; + error = space_map_reversed_last_block_entries(sm, buf, bufsz, + &nwords); if (error != 0) break; - uint64_t num_entries = size / sizeof (uint64_t); + ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); - ASSERT3U(num_entries, >, 0); + for (uint64_t i = 0; i < nwords; i++) { + uint64_t e = buf[i]; - while (num_entries > 0) 
{ - uint64_t e, entry_offset, entry_size; - maptype_t type; - - e = entry_map[num_entries - 1]; - - ASSERT3U(num_entries, >, 0); - ASSERT0(error); - - if (SM_DEBUG_DECODE(e)) { + if (sm_entry_is_debug(e)) { sm->sm_phys->smp_objsize -= sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; continue; } - type = SM_TYPE_DECODE(e); - entry_offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) + - sm->sm_start; - entry_size = SM_RUN_DECODE(e) << sm->sm_shift; + int words = 1; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + ASSERT(sm_entry_is_double_word(e)); + words = 2; + + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move to the second word */ + i++; + e = buf[i]; + + ASSERT3P(i, <=, nwords); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = + (raw_offset << sm->sm_shift) + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); - VERIFY0(P2PHASE(entry_size, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); VERIFY3U(entry_offset, >=, sm->sm_start); - VERIFY3U(entry_offset + entry_size, <=, + VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); + VERIFY3U(entry_run, <=, sm->sm_size); + VERIFY3U(entry_offset + entry_run, <=, sm->sm_start + sm->sm_size); - error = callback(type, entry_offset, entry_size, arg); + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); if (error != 0) break; if (type == SM_ALLOC) - sm->sm_phys->smp_alloc -= entry_size; + sm->sm_phys->smp_alloc -= entry_run; else - sm->sm_phys->smp_alloc += entry_size; - - sm->sm_phys->smp_objsize -= sizeof (uint64_t); + sm->sm_phys->smp_alloc += 
entry_run; + sm->sm_phys->smp_objsize -= words * sizeof (uint64_t); space_map_update(sm); - len -= sizeof (uint64_t); - num_entries--; } - IMPLY(error == 0, num_entries == 0); - EQUIV(offset == 0 && error == 0, len == 0 && num_entries == 0); } - if (len == 0) { + if (space_map_length(sm) == 0) { ASSERT0(error); - ASSERT0(offset); - ASSERT0(sm->sm_length); ASSERT0(sm->sm_phys->smp_objsize); ASSERT0(sm->sm_alloc); } - zio_buf_free(entry_map, bufsize); + zio_buf_free(buf, bufsz); return (error); } @@ -244,16 +368,15 @@ typedef struct space_map_load_arg { } space_map_load_arg_t; static int -space_map_load_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +space_map_load_callback(space_map_entry_t *sme, void *arg) { space_map_load_arg_t *smla = arg; - if (type == smla->smla_type) { - VERIFY3U(range_tree_space(smla->smla_rt) + size, <=, + if (sme->sme_type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, smla->smla_sm->sm_size); - range_tree_add(smla->smla_rt, offset, size); + range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); } else { - range_tree_remove(smla->smla_rt, offset, size); + range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); } return (0); @@ -365,43 +488,237 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) } } -uint64_t -space_map_entries(space_map_t *sm, range_tree_t *rt) +static void +space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) { - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, entries; + dmu_buf_will_dirty(sm->sm_dbuf, tx); - /* - * All space_maps always have a debug entry so account for it here. 
- */ - entries = 1; + uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - /* - * Traverse the range tree and calculate the number of space map - * entries that would be required to write out the range tree. - */ - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - entries += howmany(size, SM_RUN_MAX); - } - return (entries); + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize, + sizeof (dentry), &dentry, tx); + + sm->sm_phys->smp_objsize += sizeof (dentry); } +/* + * Writes one or more entries given a segment. + * + * Note: The function may release the dbuf from the pointer initially + * passed to it, and return a different dbuf. Also, the space map's + * dbuf must be dirty for the changes in sm_phys to take effect. + */ +static void +space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype, + uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx) +{ + ASSERT3U(words, !=, 0); + ASSERT3U(words, <=, 2); + + /* ensure the vdev_id can be represented by the space map */ + ASSERT3U(vdev_id, <=, SM_NO_VDEVID); + + /* + * if this is a single word entry, ensure that no vdev was + * specified. + */ + IMPLY(words == 1, vdev_id == SM_NO_VDEVID); + + dmu_buf_t *db = *dbp; + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + uint64_t *block_base = db->db_data; + uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); + uint64_t *block_cursor = block_base + + (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t); + + ASSERT3P(block_cursor, <=, block_end); + + uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t run_max = (words == 2) ? 
SM2_RUN_MAX : SM_RUN_MAX; + + ASSERT3U(rs->rs_start, >=, sm->sm_start); + ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size); + ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size); + ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size); + + while (size != 0) { + ASSERT3P(block_cursor, <=, block_end); + + /* + * If we are at the end of this block, flush it and start + * writing again from the beginning. + */ + if (block_cursor == block_end) { + dmu_buf_rele(db, tag); + + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, + space_map_object(sm), next_word_offset, + tag, &db, DMU_READ_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + /* update caller's dbuf */ + *dbp = db; + + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + block_base = db->db_data; + block_cursor = block_base; + block_end = block_base + + (db->db_size / sizeof (uint64_t)); + } + + /* + * If we are writing a two-word entry and we only have one + * word left on this block, just pad it with an empty debug + * entry and write the two-word entry in the next block. 
+ */ + uint64_t *next_entry = block_cursor + 1; + if (next_entry == block_end && words > 1) { + ASSERT3U(words, ==, 2); + *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(0) | + SM_DEBUG_SYNCPASS_ENCODE(0) | + SM_DEBUG_TXG_ENCODE(0); + block_cursor++; + sm->sm_phys->smp_objsize += sizeof (uint64_t); + ASSERT3P(block_cursor, ==, block_end); + continue; + } + + uint64_t run_len = MIN(size, run_max); + switch (words) { + case 1: + *block_cursor = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + block_cursor++; + break; + case 2: + /* write the first word of the entry */ + *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(run_len) | + SM2_VDEV_ENCODE(vdev_id); + block_cursor++; + + /* move on to the second word of the entry */ + ASSERT3P(block_cursor, <, block_end); + *block_cursor = SM2_TYPE_ENCODE(maptype) | + SM2_OFFSET_ENCODE(start); + block_cursor++; + break; + default: + panic("%d-word space map entries are not supported", + words); + break; + } + sm->sm_phys->smp_objsize += words * sizeof (uint64_t); + + start += run_len; + size -= run_len; + } + ASSERT0(size); + +} + +/* + * Note: The space map's dbuf must be dirty for the changes in sm_phys to + * take effect. + */ +static void +space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dmu_buf_t *db; + + space_map_write_intro_debug(sm, maptype, tx); + +#ifdef DEBUG + /* + * We do this right after we write the intro debug entry + * because the estimate does not take it into account. 
+ */ + uint64_t initial_objsize = sm->sm_phys->smp_objsize; + uint64_t estimated_growth = + space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); + uint64_t estimated_final_objsize = initial_objsize + estimated_growth; +#endif + + /* + * Find the offset right after the last word in the space map + * and use that to get a hold of the last block, so we can + * start appending to it. + */ + uint64_t next_word_offset = sm->sm_phys->smp_objsize; + VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), + next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + dmu_buf_will_dirty(db, tx); + + avl_tree_t *t = &rt->rt_root; + for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { + uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift; + uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift; + uint8_t words = 1; + + /* + * We only write two-word entries when both of the following + * are true: + * + * [1] The feature is enabled. + * [2] The offset or run is too big for a single-word entry, + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). + * + * Note that for purposes of testing we've added the case that + * we write two-word entries occasionally when the feature is + * enabled and zfs_force_some_double_word_sm_entries has been + * set. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && + (offset >= (1ULL << SM_OFFSET_BITS) || + length > SM_RUN_MAX || + vdev_id != SM_NO_VDEVID || + (zfs_force_some_double_word_sm_entries && + spa_get_random(100) == 0))) + words = 2; + + space_map_write_seg(sm, rs, maptype, vdev_id, words, + &db, FTAG, tx); + } + + dmu_buf_rele(db, FTAG); + +#ifdef DEBUG + /* + * We expect our estimation to be based on the worst case + * scenario [see comment in space_map_estimate_optimal_size()]. + * Therefore we expect the actual objsize to be equal or less + * than whatever we estimated it to be. 
+ */ + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize); +#endif +} + +/* + * Note: This function manipulates the state of the given space map but + * does not hold any locks implicitly. Thus the caller is responsible + * for synchronizing writes to the space map. + */ void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, - dmu_tx_t *tx) + uint64_t vdev_id, dmu_tx_t *tx) { - objset_t *os = sm->sm_os; - spa_t *spa = dmu_objset_spa(os); - avl_tree_t *t = &rt->rt_root; - range_seg_t *rs; - uint64_t size, total, rt_space, nodes; - uint64_t *entry, *entry_map, *entry_map_end; - uint64_t expected_entries, actual_entries = 1; - - ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os))); VERIFY3U(space_map_object(sm), !=, 0); + dmu_buf_will_dirty(sm->sm_dbuf, tx); /* @@ -421,58 +738,10 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, else sm->sm_phys->smp_alloc -= range_tree_space(rt); - expected_entries = space_map_entries(sm, rt); + uint64_t nodes = avl_numnodes(&rt->rt_root); + uint64_t rt_space = range_tree_space(rt); - entry_map = vmem_alloc(sm->sm_blksz, KM_SLEEP); - entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t)); - entry = entry_map; - - *entry++ = SM_DEBUG_ENCODE(1) | - SM_DEBUG_ACTION_ENCODE(maptype) | - SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) | - SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); - - total = 0; - nodes = avl_numnodes(&rt->rt_root); - rt_space = range_tree_space(rt); - for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) { - uint64_t start; - - size = (rs->rs_end - rs->rs_start) >> sm->sm_shift; - start = (rs->rs_start - sm->sm_start) >> sm->sm_shift; - - total += size << sm->sm_shift; - - while (size != 0) { - uint64_t run_len; - - run_len = MIN(size, SM_RUN_MAX); - - if (entry == entry_map_end) { - dmu_write(os, space_map_object(sm), - sm->sm_phys->smp_objsize, sm->sm_blksz, - entry_map, tx); - 
sm->sm_phys->smp_objsize += sm->sm_blksz; - entry = entry_map; - } - - *entry++ = SM_OFFSET_ENCODE(start) | - SM_TYPE_ENCODE(maptype) | - SM_RUN_ENCODE(run_len); - - start += run_len; - size -= run_len; - actual_entries++; - } - } - - if (entry != entry_map) { - size = (entry - entry_map) * sizeof (uint64_t); - dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize, - size, entry_map, tx); - sm->sm_phys->smp_objsize += size; - } - ASSERT3U(expected_entries, ==, actual_entries); + space_map_write_impl(sm, rt, maptype, vdev_id, tx); /* * Ensure that the space_map's accounting wasn't changed @@ -480,9 +749,6 @@ space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, */ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root)); VERIFY3U(range_tree_space(rt), ==, rt_space); - VERIFY3U(range_tree_space(rt), ==, total); - - vmem_free(entry_map, sm->sm_blksz); } static int @@ -529,7 +795,6 @@ space_map_open(space_map_t **smp, objset_t *os, uint64_t object, space_map_close(sm); return (error); } - *smp = sm; return (0); @@ -661,6 +926,133 @@ space_map_free(space_map_t *sm, dmu_tx_t *tx) sm->sm_object = 0; } +/* + * Given a range tree, it makes a worst-case estimate of how much + * space would the tree's segments take if they were written to + * the given space map. + */ +uint64_t +space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id) +{ + spa_t *spa = dmu_objset_spa(sm->sm_os); + uint64_t shift = sm->sm_shift; + uint64_t *histogram = rt->rt_histogram; + uint64_t entries_for_seg = 0; + + /* + * In order to get a quick estimate of the optimal size that this + * range tree would have on-disk as a space map, we iterate through + * its histogram buckets instead of iterating through its nodes. 
+ * + * Note that this is a highest-bound/worst-case estimate for the + * following reasons: + * + * 1] We assume that we always add a debug padding for each block + * we write and we also assume that we start at the last word + * of a block attempting to write a two-word entry. + * 2] Rounding up errors due to the way segments are distributed + * in the buckets of the range tree's histogram. + * 3] The activation of zfs_force_some_double_word_sm_entries + * (tunable) when testing. + * + * = Math and Rounding Errors = + * + * rt_histogram[i] bucket of a range tree represents the number + * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given + * that, we want to divide the buckets into groups: Buckets that + * can be represented using a single-word entry, ones that can + * be represented with a double-word entry, and ones that can + * only be represented with multiple two-word entries. + * + * [Note that if the new encoding feature is not enabled there + * are only two groups: single-word entry buckets and multiple + * single-word entry buckets. The information below assumes + * two-word entries enabled, but it can easily be applied when + * the feature is not enabled] + * + * To find the highest bucket that can be represented with a + * single-word entry we look at the maximum run that such an entry + * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that + * the run of a space map entry is shifted by sm_shift, thus we + * add it to the exponent]. This way, excluding the value of the + * maximum run that can be represented by a single-word entry, + * all runs that are smaller exist in buckets 0 to + * SM_RUN_BITS + shift - 1. + * + * To find the highest bucket that can be represented with a + * double-word entry, we follow the same approach. Finally, any + * buckets higher than that are represented with multiple two-word + * entries. 
To be more specific, if the highest bucket whose + * segments can be represented with a single two-word entry is X, + * then bucket X+1 will need 2 two-word entries for each of its + * segments, X+2 will need 4, X+3 will need 8, ...etc. + * + * With all of the above we make our estimation based on bucket + * groups. There is a rounding error though. As we mentioned in + * the example with the one-word entry, the maximum run that can + * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is + * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of + * that length fall into the next bucket (and bucket group) where + * we start counting two-word entries and this is one more reason + * why the estimated size may end up being bigger than the actual + * size written. + */ + uint64_t size = 0; + uint64_t idx = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || + (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { + + /* + * If we are trying to force some double word entries just + * assume the worst-case of every single word entry being + * written as a double word entry. + */ + uint64_t entry_size = + (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && + zfs_force_some_double_word_sm_entries) ? 
+ (2 * sizeof (uint64_t)) : sizeof (uint64_t); + + uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; + for (; idx <= single_entry_max_bucket; idx++) + size += histogram[idx] * entry_size; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, single_entry_max_bucket); + entries_for_seg = + 1ULL << (idx - single_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * entry_size; + } + return (size); + } + } + + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); + + uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; + for (; idx <= double_entry_max_bucket; idx++) + size += histogram[idx] * 2 * sizeof (uint64_t); + + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, double_entry_max_bucket); + entries_for_seg = 1ULL << (idx - double_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * 2 * sizeof (uint64_t); + } + + /* + * Assume the worst case where we start with the padding at the end + * of the current block and we add an extra padding entry at the end + * of all subsequent blocks. 
+ */ + size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); + + return (size); +} + uint64_t space_map_object(space_map_t *sm) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index cf1bf2837f..a2f1f0658e 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2634,7 +2634,7 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) mutex_exit(&vd->vdev_dtl_lock); space_map_truncate(vd->vdev_dtl_sm, vdev_dtl_sm_blksz, tx); - space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); range_tree_vacate(rtsync, NULL, NULL); range_tree_destroy(rtsync); diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c index b14b153b2a..f56d024ca2 100644 --- a/module/zfs/vdev_indirect.c +++ b/module/zfs/vdev_indirect.c @@ -825,7 +825,7 @@ vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) space_map_object(vd->vdev_obsolete_sm)); space_map_write(vd->vdev_obsolete_sm, - vd->vdev_obsolete_segments, SM_ALLOC, tx); + vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); space_map_update(vd->vdev_obsolete_sm); range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); } diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c index d91f233836..a2766bd0d0 100644 --- a/module/zfs/vdev_indirect_mapping.c +++ b/module/zfs/vdev_indirect_mapping.c @@ -14,7 +14,7 @@ */ /* - * Copyright (c) 2015 by Delphix. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
*/ #include @@ -539,14 +539,13 @@ typedef struct load_obsolete_space_map_arg { } load_obsolete_space_map_arg_t; static int -load_obsolete_sm_callback(maptype_t type, uint64_t offset, uint64_t size, - void *arg) +load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) { load_obsolete_space_map_arg_t *losma = arg; - ASSERT3S(type, ==, SM_ALLOC); + ASSERT3S(sme->sme_type, ==, SM_ALLOC); vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, - offset, size, losma->losma_counts); + sme->sme_offset, sme->sme_run, losma->losma_counts); return (0); } diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg index fb389cb102..8994332b5d 100644 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg @@ -74,6 +74,7 @@ typeset -a properties=( "feature@device_removal" "feature@obsolete_counts" "feature@zpool_checkpoint" + "feature@spacemap_v2" ) # Additional properties added for Linux. diff --git a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh index 54dcd59c31..f1abad063d 100755 --- a/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh +++ b/tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh @@ -19,7 +19,7 @@ # # DESCRIPTION: -# Discard checkpoint on a stressed pool. Ensure that we can +# Discard checkpoint on a stressed pool. Ensure that we can # export and import the pool while discarding but not run any # operations that have to do with the checkpoint or change the # pool's config. @@ -63,6 +63,10 @@ log_onexit test_cleanup # the current setup the checkpoint space maps should # have tens of thousands of entries. # +# Note: If two-word entries are used in the space +# map, we should have even more time to +# verify this. 
+# set_tunable64 zfs_spa_discard_memory_limit 128 log_must zpool checkpoint $NESTEDPOOL