From 81591ace4a5f1aa9cedb00d2ccb25eb103358f3a Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 27 Aug 2024 15:52:33 +0800 Subject: [PATCH 1/3] dmu_objset: replace dnode_hash impl with cityhash4 As mentioned in PR #16131, replacing CRC-based hash with cityhash4 could slightly improve the performance by eliminating memory access. Replacing algorightm is safe since the hash result is not persisted. See: openzfs/zfs#16131 Signed-off-by: Shengqi Chen --- module/zfs/dmu_objset.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 8f4fefa4f4..32a492cb4c 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -66,6 +66,7 @@ #include "zfs_namecheck.h" #include #include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -390,27 +391,13 @@ dmu_objset_byteswap(void *buf, size_t size) } /* - * The hash is a CRC-based hash of the objset_t pointer and the object number. + * Runs cityhash4 on the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - /* - * The lower 11 bits of the pointer don't have much entropy, because - * the objset_t is more than 1KB long and so likely aligned to 2KB. - */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>24); - - return (crc); + return (cityhash4((uint64_t)osv, obj, 0, 0)); } static unsigned int From 7b580dcdf292b0648e9422ccd8bc4fc817792611 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 7 Sep 2024 21:55:03 +0800 Subject: [PATCH 2/3] zcommon: add specialized versions of cityhash4 Specializing cityhash4 on 32-bit architectures can reduce the size of stack frames as well as instruction count. This is a tiny but useful optimization, since some callers invoke it frequently. When specializing into 1/2/3/4-arg versions, the stack usage (in bytes) on some 32-bit arches are listed as follows: - x86: 32, 32, 32, 40 - arm-v7a: 20, 20, 28, 36 - riscv: 0, 0, 0, 16 - power: 16, 16, 16, 32 - mipsel: 8, 8, 8, 24 And each actual argument (even if passing 0) contributes evenly to the number of multiplication instructions generated: - x86: 9, 12, 15 ,18 - arm-v7a: 6, 8, 10, 12 - riscv / power: 12, 18, 20, 24 - mipsel: 9, 12, 15, 19 On 64-bit architectures, the tendencies are similar. But both stack sizes and instruction counts are significantly smaller thus negligible. See more discussion at https://github.com/openzfs/zfs/pull/16483. Acked-by: Alexander Motin Signed-off-by: Shengqi Chen --- include/cityhash.h | 7 +++++++ lib/libzfs/libzfs.abi | 18 ++++++++++++++++++ module/zcommon/cityhash.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/include/cityhash.h b/include/cityhash.h index 3b2d1e84b5..346fb673a0 100644 --- a/include/cityhash.h +++ b/include/cityhash.h @@ -32,6 +32,13 @@ extern "C" { #endif +/* + * Define 1/2/3-argument specialized versions of cityhash4, which can reduce + * instruction count (especially multiplication) on some 32-bit arches. + */ +_SYS_CITYHASH_H uint64_t cityhash1(uint64_t); +_SYS_CITYHASH_H uint64_t cityhash2(uint64_t, uint64_t); +_SYS_CITYHASH_H uint64_t cityhash3(uint64_t, uint64_t, uint64_t); _SYS_CITYHASH_H uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); #ifdef __cplusplus diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 51b29643ee..fab97ada68 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -153,6 +153,9 @@ + + + @@ -9179,6 +9182,21 @@ + + + + + + + + + + + + + + + diff --git a/module/zcommon/cityhash.c b/module/zcommon/cityhash.c index 413a96df2c..c758ec47d1 100644 --- a/module/zcommon/cityhash.c +++ b/module/zcommon/cityhash.c @@ -49,8 +49,8 @@ cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) return (b); } -uint64_t -cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +static inline uint64_t +cityhash_impl(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) { uint64_t mul = HASH_K2 + 64; uint64_t a = w1 * HASH_K1; @@ -59,9 +59,38 @@ cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) uint64_t d = w3 * HASH_K2; return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, a + rotate(b + HASH_K2, 18) + c, mul)); +} +/* + * Passing w as the 2nd argument could save one 64-bit multiplication. + */ +uint64_t +cityhash1(uint64_t w) +{ + return (cityhash_impl(0, w, 0, 0)); +} + +uint64_t +cityhash2(uint64_t w1, uint64_t w2) +{ + return (cityhash_impl(w1, w2, 0, 0)); +} + +uint64_t +cityhash3(uint64_t w1, uint64_t w2, uint64_t w3) +{ + return (cityhash_impl(w1, w2, w3, 0)); +} + +uint64_t +cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +{ + return (cityhash_impl(w1, w2, w3, w4)); } #if defined(_KERNEL) +EXPORT_SYMBOL(cityhash1); +EXPORT_SYMBOL(cityhash2); +EXPORT_SYMBOL(cityhash3); EXPORT_SYMBOL(cityhash4); #endif From af3228c69b863919f4d4f2240c71be752226274f Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 7 Sep 2024 22:07:14 +0800 Subject: [PATCH 3/3] cityhash: replace invocations with specialized versions when possible So that we can get actual benefit from last commit. See more discussion at https://github.com/openzfs/zfs/pull/16483. Acked-by: Alexander Motin Signed-off-by: Shengqi Chen --- cmd/zstream/zstream_redup.c | 4 ++-- module/os/linux/zfs/zvol_os.c | 4 ++-- module/zfs/dmu_objset.c | 4 ++-- module/zfs/zio.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index dccd325d4c..51266b0b66 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -132,7 +132,7 @@ static void rdt_insert(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) { - uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t ch = cityhash3(guid, object, offset); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); redup_entry_t **rdepp; @@ -152,7 +152,7 @@ rdt_lookup(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t *stream_offsetp) { - uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t ch = cityhash3(guid, object, offset); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index d1e3061b50..b9885d4ddf 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -566,8 +566,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; #endif #endif - taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, - blk_mq_hw_queue, 0); + taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 32a492cb4c..dfe471b644 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -391,13 +391,13 @@ dmu_objset_byteswap(void *buf, size_t size) } /* - * Runs cityhash4 on the objset_t pointer and the object number. + * Runs cityhash on the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; - return (cityhash4((uint64_t)osv, obj, 0, 0)); + return (cityhash2((uint64_t)osv, obj)); } static unsigned int diff --git a/module/zfs/zio.c b/module/zfs/zio.c index e4ccd144f0..865896d3fe 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4157,8 +4157,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * some parallelism. */ int flags = METASLAB_ZIL; - int allocator = (uint_t)cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; + int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) + % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0);