From 7d75cde9405d771ac1b1fc43844b62c42ffca309 Mon Sep 17 00:00:00 2001 From: Rich Ercolani Date: Wed, 13 Mar 2024 01:58:35 -0400 Subject: [PATCH] Special vdevs weren't special enough for embedded_logs People keep wanting to use part of their special device as a slog, but manually partitioning off some space for it seems unnecessarily complex. So let's just redo the embedded_log dance, but for special vdevs, and make the allocator prefer those if they exist. Also plumbs in a vdev property for turning off this behavior on devices in case this is not desirable. Signed-off-by: Rich Ercolani --- cmd/zdb/zdb.c | 22 ++++++++++++++++++++-- include/sys/fs/zfs.h | 1 + include/sys/spa.h | 1 + include/sys/spa_impl.h | 1 + include/sys/vdev_impl.h | 1 + module/zcommon/zpool_prop.c | 3 +++ module/zfs/metaslab.c | 22 ++++++++++++++++++++++ module/zfs/spa.c | 12 ++++++++++++ module/zfs/spa_misc.c | 24 ++++++++++++++++++------ module/zfs/vdev.c | 35 +++++++++++++++++++++++++++++++---- module/zfs/zio.c | 5 +++++ 11 files changed, 115 insertions(+), 12 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 4880c80487..ae0fce3fdc 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -6427,6 +6427,7 @@ zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; spa->spa_log_class->mc_ops = &zdb_metaslab_ops; spa->spa_embedded_log_class->mc_ops = &zdb_metaslab_ops; + spa->spa_special_embedded_log_class->mc_ops = &zdb_metaslab_ops; zcb->zcb_vd_obsolete_counts = umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), @@ -6567,8 +6568,11 @@ zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb) for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; - ASSERT3P(msp->ms_group, ==, (msp->ms_group->mg_class == - spa_embedded_log_class(spa)) ? + ASSERT3P(msp->ms_group, ==, ( + (msp->ms_group->mg_class == + spa_embedded_log_class(spa)) || + (msp->ms_group->mg_class == + spa_special_embedded_log_class(spa))) ? vd->vdev_log_mg : vd->vdev_mg); /* @@ -6799,6 +6803,8 @@ dump_block_stats(spa_t *spa) zcb->zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); zcb->zcb_totalasize += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + zcb->zcb_totalasize += + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)); zcb->zcb_start = zcb->zcb_lastprint = gethrtime(); err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, zcb); @@ -6848,6 +6854,7 @@ dump_block_stats(spa_t *spa) metaslab_class_get_alloc(spa_log_class(spa)) + metaslab_class_get_alloc(spa_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_special_embedded_log_class(spa)) + metaslab_class_get_alloc(spa_dedup_class(spa)) + get_unflushed_alloc_space(spa); total_found = @@ -6930,6 +6937,17 @@ dump_block_stats(spa_t *spa) "Embedded log class", (u_longlong_t)alloc, 100.0 * alloc / space); } + if (spa_special_embedded_log_class(spa)->mc_allocator[0].mca_rotor + != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_embedded_log_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special embedded log class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { if (zcb->zcb_embedded_blocks[i] == 0) diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h index 025567e218..8e4ec9bc83 100644 --- a/include/sys/fs/zfs.h +++ b/include/sys/fs/zfs.h @@ -368,6 +368,7 @@ typedef enum { VDEV_PROP_RAIDZ_EXPANDING, VDEV_PROP_SLOW_IO_N, VDEV_PROP_SLOW_IO_T, + VDEV_PROP_ELOG, VDEV_NUM_PROPS } vdev_prop_t; diff --git a/include/sys/spa.h b/include/sys/spa.h index cada3c8410..54ab9c8a91 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -1042,6 +1042,7 @@ extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); extern metaslab_class_t *spa_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_special_embedded_log_class(spa_t *spa); extern metaslab_class_t *spa_dedup_class(spa_t *spa); extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index 0cd0c4720f..5a6d1ff1c9 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -240,6 +240,7 @@ struct spa { metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ metaslab_class_t *spa_embedded_log_class; /* log on normal vdevs */ + metaslab_class_t *spa_special_embedded_log_class; /* "" special */ metaslab_class_t *spa_special_class; /* special allocation class */ metaslab_class_t *spa_dedup_class; /* dedup allocation class */ uint64_t spa_first_txg; /* first txg after spa_open() */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h index f39ebf031c..4068299b4e 100644 --- a/include/sys/vdev_impl.h +++ b/include/sys/vdev_impl.h @@ -269,6 +269,7 @@ struct vdev { uint64_t vdev_ms_count; /* number of metaslabs */ metaslab_group_t *vdev_mg; /* metaslab group */ metaslab_group_t *vdev_log_mg; /* embedded slog metaslab group */ + boolean_t use_embedded_log; /* use embedded slog mg */ metaslab_t **vdev_ms; /* metaslab array */ txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c index e2e3bf5be6..e17c490e23 100644 --- a/module/zcommon/zpool_prop.c +++ b/module/zcommon/zpool_prop.c @@ -448,6 +448,9 @@ vdev_prop_init(void) zprop_register_index(VDEV_PROP_RAIDZ_EXPANDING, "raidz_expanding", 0, PROP_READONLY, ZFS_TYPE_VDEV, "on | off", "RAIDZ_EXPANDING", boolean_table, sfeatures); + zprop_register_index(VDEV_PROP_ELOG, "embedded_log_target", 1, + PROP_DEFAULT, ZFS_TYPE_VDEV, "on | off", "ELOG", + boolean_table, sfeatures); /* default index properties */ zprop_register_index(VDEV_PROP_FAILFAST, "failfast", B_TRUE, diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 7237fa8eeb..764b9d9169 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -1222,6 +1222,15 @@ metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor, spa_t *spa = mg->mg_vd->vdev_spa; metaslab_class_t *mc = mg->mg_class; + /* + * If we're attempting to allocate from an embedded_log class, + * and we have it set to not use that on this vdev, don't. + */ + if ((mc == spa_special_embedded_log_class(spa) || + mc == spa_embedded_log_class(spa)) && + mg->mg_vd->use_embedded_log == B_FALSE) { + return (B_FALSE); + } /* * We can only consider skipping this metaslab group if it's * in the normal metaslab class and there are other metaslab @@ -5226,6 +5235,7 @@ top: ASSERT(mg->mg_activation_count == 1); vd = mg->mg_vd; + /* * Don't allocate from faulted devices. */ @@ -5237,6 +5247,18 @@ top: allocatable = vdev_allocatable(vd); } + /* + * If we're trying a log allocation from an + * embedded_log allocation class, and we + * have turned off allocating those from this vdev, + * don't. + */ + if ((mc == spa_special_embedded_log_class(spa) || + mc == spa_embedded_log_class(spa)) && + ((flags & METASLAB_ZIL) != 0) && + vd->use_embedded_log == B_FALSE) + allocatable = B_FALSE; + /* * Determine if the selected metaslab group is eligible * for allocations. If we're ganging then don't allow diff --git a/module/zfs/spa.c b/module/zfs/spa.c index b144d06529..495dc999e8 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -367,11 +367,15 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) alloc += metaslab_class_get_alloc(spa_special_class(spa)); alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); alloc += metaslab_class_get_alloc(spa_embedded_log_class(spa)); + alloc += metaslab_class_get_alloc( + spa_special_embedded_log_class(spa)); size = metaslab_class_get_space(mc); size += metaslab_class_get_space(spa_special_class(spa)); size += metaslab_class_get_space(spa_dedup_class(spa)); size += metaslab_class_get_space(spa_embedded_log_class(spa)); + size += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); @@ -1634,6 +1638,7 @@ spa_activate(spa_t *spa, spa_mode_t mode) spa->spa_log_class = metaslab_class_create(spa, msp); spa->spa_embedded_log_class = metaslab_class_create(spa, msp); spa->spa_special_class = metaslab_class_create(spa, msp); + spa->spa_special_embedded_log_class = metaslab_class_create(spa, msp); spa->spa_dedup_class = metaslab_class_create(spa, msp); /* Try to create a covering process */ @@ -1807,6 +1812,9 @@ spa_deactivate(spa_t *spa) metaslab_class_destroy(spa->spa_special_class); spa->spa_special_class = NULL; + metaslab_class_destroy(spa->spa_special_embedded_log_class); + spa->spa_special_embedded_log_class = NULL; + metaslab_class_destroy(spa->spa_dedup_class); spa->spa_dedup_class = NULL; @@ -8792,6 +8800,8 @@ spa_async_thread(void *arg) old_space += metaslab_class_get_space(spa_dedup_class(spa)); old_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + old_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); @@ -8800,6 +8810,8 @@ spa_async_thread(void *arg) new_space += metaslab_class_get_space(spa_dedup_class(spa)); new_space += metaslab_class_get_space( spa_embedded_log_class(spa)); + new_space += metaslab_class_get_space( + spa_special_embedded_log_class(spa)); mutex_exit(&spa_namespace_lock); /* diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 68b9076141..4261f111ac 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1262,7 +1262,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, int config_changed = B_FALSE; - ASSERT(txg > spa_last_synced_txg(spa)); + ASSERT3U(txg, >, spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; @@ -1279,11 +1279,13 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, /* * Verify the metaslab classes. */ - ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_embedded_log_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0); - ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0); + ASSERT3U(metaslab_class_validate(spa_normal_class(spa)), ==, 0); + ASSERT3U(metaslab_class_validate(spa_log_class(spa)), ==, 0); + ASSERT3U(metaslab_class_validate(spa_embedded_log_class(spa)), ==, 0); + ASSERT3U(metaslab_class_validate(spa_special_class(spa)), ==, 0); + ASSERT3U(metaslab_class_validate( + spa_special_embedded_log_class(spa)), ==, 0); + ASSERT3U(metaslab_class_validate(spa_dedup_class(spa)), ==, 0); spa_config_exit(spa, SCL_ALL, spa); @@ -1851,6 +1853,10 @@ spa_get_slop_space(spa_t *spa) metaslab_class_get_dspace(spa_embedded_log_class(spa)); slop -= MIN(embedded_log, slop >> 1); + uint64_t s_embedded_log = + metaslab_class_get_dspace(spa_special_embedded_log_class(spa)); + slop -= MIN(s_embedded_log, slop >> 1); + /* * Slop space should be at least spa_min_slop, but no more than half * the entire pool. @@ -1952,6 +1958,12 @@ spa_special_class(spa_t *spa) return (spa->spa_special_class); } +metaslab_class_t * +spa_special_embedded_log_class(spa_t *spa) +{ + return (spa->spa_special_embedded_log_class); +} + metaslab_class_t * spa_dedup_class(spa_t *spa) { diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index ebba453e2b..34c27a529d 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -265,8 +265,9 @@ vdev_getops(const char *type) metaslab_group_t * vdev_get_mg(vdev_t *vd, metaslab_class_t *mc) { - if (mc == spa_embedded_log_class(vd->vdev_spa) && - vd->vdev_log_mg != NULL) + if ((mc == spa_embedded_log_class(vd->vdev_spa) || + mc == spa_special_embedded_log_class(vd->vdev_spa)) && + vd->vdev_log_mg != NULL && vd->use_embedded_log == B_TRUE) return (vd->vdev_log_mg); else return (vd->vdev_mg); @@ -1470,6 +1471,13 @@ vdev_metaslab_group_create(vdev_t *vd) if (!vd->vdev_islog) { vd->vdev_log_mg = metaslab_group_create( spa_embedded_log_class(spa), vd, 1); + vd->use_embedded_log = B_TRUE; + } + + if (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL) { + vd->vdev_log_mg = metaslab_group_create( + spa_special_embedded_log_class(spa), vd, 1); + vd->use_embedded_log = B_TRUE; } /* @@ -1555,7 +1563,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) * embedded slog by moving it from the regular to the log metaslab * group. */ - if (vd->vdev_mg->mg_class == spa_normal_class(spa) && + if ((vd->vdev_mg->mg_class == spa_normal_class(spa) || + vd->vdev_mg->mg_class == spa_special_class(spa)) && vd->vdev_ms_count > zfs_embedded_slog_min_ms && avl_is_empty(&vd->vdev_log_mg->mg_metaslab_tree)) { uint64_t slog_msid = 0; @@ -5998,11 +6007,18 @@ vdev_prop_set(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) } vd->vdev_slow_io_t = intval; break; + case VDEV_PROP_ELOG: + if (nvpair_value_uint64(elem, &intval) != 0) { + error = EINVAL; + break; + } + vd->use_embedded_log = intval; + break; default: /* Most processing is done in vdev_props_set_sync */ break; } -end: + end: if (error != 0) { intval = error; vdev_prop_add_list(outnvl, propname, strval, intval, 0); @@ -6318,6 +6334,17 @@ vdev_prop_get(vdev_t *vd, nvlist_t *innvl, nvlist_t *outnvl) vdev_prop_add_list(outnvl, propname, NULL, intval, src); break; + case VDEV_PROP_ELOG: + intval = vd->use_embedded_log; + + if (intval == vdev_prop_default_numeric(prop)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + vdev_prop_add_list(outnvl, propname, NULL, + intval, src); + break; + case VDEV_PROP_FAILFAST: src = ZPROP_SRC_LOCAL; strval = NULL; diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 213fe5c483..584fdabdb3 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -3858,6 +3858,11 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0); + if (error != 0) { + error = metaslab_alloc(spa, spa_special_embedded_log_class(spa), + size, new_bp, 1, txg, NULL, flags, + &io_alloc_list, NULL, allocator); + } if (error != 0) { error = metaslab_alloc(spa, spa_embedded_log_class(spa), size, new_bp, 1, txg, NULL, flags,