From fb823061b037070f709a8ff0d9aabfbb702892f5 Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 30 Aug 2021 17:13:46 -0400 Subject: [PATCH] Fix cross-endian interoperability of zstd It turns out that layouts of union bitfields are a pain, and the current code results in an inconsistent layout between BE and LE systems, leading to zstd-active datasets on one erroring out on the other. Switch everyone over to the LE layout, and add compatibility code to read both. Reviewed-by: Brian Behlendorf Reviewed-by: Matthew Ahrens Signed-off-by: Rich Ercolani Closes #12008 Closes #12022 --- cmd/zdb/zdb.c | 6 +- include/sys/zstd/zstd.h | 148 ++++++++++++++++++++++++++--- module/zstd/Makefile.in | 1 + module/zstd/include/sparc_compat.h | 4 + module/zstd/zfs_zstd.c | 14 +-- module/zstd/zstd_sparc.c | 11 +++ 6 files changed, 165 insertions(+), 19 deletions(-) create mode 100644 module/zstd/include/sparc_compat.h create mode 100644 module/zstd/zstd_sparc.c diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 8228e98a1d..cd14ddfcf6 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2211,7 +2211,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", - zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), + zfs_get_hdrlevel(&zstd_hdr)); return; } @@ -2235,7 +2236,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, (void) snprintf(blkbuf + strlen(blkbuf), buflen - strlen(blkbuf), " ZSTD:size=%u:version=%u:level=%u:NORMAL", - zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr), + zfs_get_hdrlevel(&zstd_hdr)); abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); } diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h index 8fe4a24828..e87dda1b18 100644 --- a/include/sys/zstd/zstd.h +++ b/include/sys/zstd/zstd.h @@ -56,21 +56,24 @@ typedef struct zfs_zstd_header { /* * Version and compression level - * We use a union to be able to big endian encode a single 32 bit - * unsigned integer, but still access the individual bitmasked - * components easily. + * We used to use a union to reference compression level + * and version easily, but as it turns out, relying on the + * ordering of bitfields is not remotely portable. + * So now we have get/set functions in zfs_zstd.c for + * manipulating this in just the right way forever. */ - union { - uint32_t raw_version_level; - struct { - uint32_t version : 24; - uint8_t level; - }; - }; - + uint32_t raw_version_level; char data[]; } zfs_zstdhdr_t; +/* + * Simple struct to pass the data from raw_version_level around. + */ +typedef struct zfs_zstd_meta { + uint8_t level; + uint32_t version; +} zfs_zstdmeta_t; + /* * kstat helper macros */ @@ -94,6 +97,129 @@ int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n); void zfs_zstd_cache_reap_now(void); +/* + * So, the reason we have all these complicated set/get functions is that + * originally, in the zstd "header" we wrote out to disk, we used a 32-bit + * bitfield to store the "level" (8 bits) and "version" (24 bits). + * + * Unfortunately, bitfields make few promises about how they're arranged in + * memory... + * + * By way of example, if we were using version 1.4.5 and level 3, it'd be + * level = 0x03, version = 10405/0x0028A5, which gets broken into Vhigh = 0x00, + * Vmid = 0x28, Vlow = 0xA5. We include these positions below to help follow + * which data winds up where. + * + * As a consequence, we wound up with little endian platforms with a layout + * like this in memory: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | Vlow | Vmid | Vhigh | level | + * +-------+-------+-------+-------+ + * =A5 =28 =00 =03 + * + * ...and then, after being run through BE_32(), serializing this out to + * disk: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | level | Vhigh | Vmid | Vlow | + * +-------+-------+-------+-------+ + * =03 =00 =28 =A5 + * + * while on big-endian systems, since BE_32() is a noop there, both in + * memory and on disk, we wind up with: + * + * 0 8 16 24 32 + * +-------+-------+-------+-------+ + * | Vhigh | Vmid | Vlow | level | + * +-------+-------+-------+-------+ + * =00 =28 =A5 =03 + * + * (Vhigh is always 0 until version exceeds 6.55.35. Vmid and Vlow are the + * other two bytes of the "version" data.) + * + * So now we use the BF32_SET macros to get consistent behavior (the + * ondisk LE encoding, since x86 currently rules the world) across + * platforms, but the "get" behavior requires that we check each of the + * bytes in the aforementioned former-bitfield for 0x00, and from there, + * we can know which possible layout we're dealing with. (Only the two + * that have been observed in the wild are illustrated above, but handlers + * for all 4 positions of 0x00 are implemented. + */ + +static inline void +zfs_get_hdrmeta(const zfs_zstdhdr_t *blob, zfs_zstdmeta_t *res) +{ + uint32_t raw = blob->raw_version_level; + uint8_t findme = 0xff; + int shift; + for (shift = 0; shift < 4; shift++) { + findme = BF32_GET(raw, 8*shift, 8); + if (findme == 0) + break; + } + switch (shift) { + case 0: + res->level = BF32_GET(raw, 24, 8); + res->version = BSWAP_32(raw); + res->version = BF32_GET(res->version, 8, 24); + break; + case 1: + res->level = BF32_GET(raw, 0, 8); + res->version = BSWAP_32(raw); + res->version = BF32_GET(res->version, 0, 24); + break; + case 2: + res->level = BF32_GET(raw, 24, 8); + res->version = BF32_GET(raw, 0, 24); + break; + case 3: + res->level = BF32_GET(raw, 0, 8); + res->version = BF32_GET(raw, 8, 24); + break; + default: + res->level = 0; + res->version = 0; + break; + } +} + +static inline uint8_t +zfs_get_hdrlevel(const zfs_zstdhdr_t *blob) +{ + uint8_t level = 0; + zfs_zstdmeta_t res; + zfs_get_hdrmeta(blob, &res); + level = res.level; + return (level); +} + +static inline uint32_t +zfs_get_hdrversion(const zfs_zstdhdr_t *blob) +{ + uint32_t version = 0; + zfs_zstdmeta_t res; + zfs_get_hdrmeta(blob, &res); + version = res.version; + return (version); + +} + +static inline void +zfs_set_hdrversion(zfs_zstdhdr_t *blob, uint32_t version) +{ + BF32_SET(blob->raw_version_level, 0, 24, version); +} + +static inline void +zfs_set_hdrlevel(zfs_zstdhdr_t *blob, uint8_t level) +{ + BF32_SET(blob->raw_version_level, 24, 8, level); +} + + #ifdef __cplusplus } #endif diff --git a/module/zstd/Makefile.in b/module/zstd/Makefile.in index f67db710f0..091f7cea36 100644 --- a/module/zstd/Makefile.in +++ b/module/zstd/Makefile.in @@ -33,6 +33,7 @@ $(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h $(MODULE)-objs += zfs_zstd.o $(MODULE)-objs += lib/zstd.o +$(MODULE)-objs += zstd_sparc.o all: mkdir -p lib diff --git a/module/zstd/include/sparc_compat.h b/module/zstd/include/sparc_compat.h new file mode 100644 index 0000000000..14c1bdde91 --- /dev/null +++ b/module/zstd/include/sparc_compat.h @@ -0,0 +1,4 @@ +#if defined(__sparc) +uint64_t __bswapdi2(uint64_t in); +uint32_t __bswapsi2(uint32_t in); +#endif diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c index dfcd938aea..5b993cc7cf 100644 --- a/module/zstd/zfs_zstd.c +++ b/module/zstd/zfs_zstd.c @@ -361,6 +361,7 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level) return (1); } + /* Compress block using zstd */ size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, @@ -458,8 +459,8 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, * As soon as such incompatibility occurs, handling code needs to be * added, differentiating between the versions. */ - hdr->version = ZSTD_VERSION_NUMBER; - hdr->level = level; + zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER); + zfs_set_hdrlevel(hdr, level); hdr->raw_version_level = BE_32(hdr->raw_version_level); return (c_len + sizeof (*hdr)); @@ -485,6 +486,7 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, * not modify the original data that may be used again later. */ hdr_copy.raw_version_level = BE_32(hdr->raw_version_level); + uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy); /* * NOTE: We ignore the ZSTD version for now. As soon as any @@ -497,13 +499,13 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, * An invalid level is a strong indicator for data corruption! In such * case return an error so the upper layers can try to fix it. */ - if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) { + if (zstd_enum_to_level(curlevel, &zstd_level)) { ZSTDSTAT_BUMP(zstd_stat_dec_inval); return (1); } ASSERT3U(d_len, >=, s_len); - ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT); + ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT); /* Invalid compressed buffer size encoded at start */ if (c_len + sizeof (*hdr) > s_len) { @@ -534,7 +536,7 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, } if (level) { - *level = hdr_copy.level; + *level = curlevel; } return (0); @@ -764,7 +766,7 @@ module_exit(zstd_fini); ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS"); ZFS_MODULE_LICENSE("Dual BSD/GPL"); -ZFS_MODULE_VERSION(ZSTD_VERSION_STRING); +ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a"); EXPORT_SYMBOL(zfs_zstd_compress); EXPORT_SYMBOL(zfs_zstd_decompress_level); diff --git a/module/zstd/zstd_sparc.c b/module/zstd/zstd_sparc.c new file mode 100644 index 0000000000..463df99bd7 --- /dev/null +++ b/module/zstd/zstd_sparc.c @@ -0,0 +1,11 @@ +#ifdef __sparc__ +#include +#include +#include "include/sparc_compat.h" +uint64_t __bswapdi2(uint64_t in) { + return (BSWAP_64(in)); +} +uint32_t __bswapsi2(uint32_t in) { + return (BSWAP_32(in)); +} +#endif