Fix cross-endian interoperability of zstd
It turns out that layouts of union bitfields are a pain, and the current code results in an inconsistent layout between BE and LE systems, leading to zstd-active datasets on one erroring out on the other. Switch everyone over to the LE layout, and add compatibility code to read both. Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Matthew Ahrens <mahrens@delphix.com> Signed-off-by: Rich Ercolani <rincebrain@gmail.com> Closes #12008 Closes #12022
This commit is contained in:
parent
4e8a639d5f
commit
fb823061b0
|
@ -2211,7 +2211,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
|
||||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||||
buflen - strlen(blkbuf),
|
buflen - strlen(blkbuf),
|
||||||
" ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
|
" ZSTD:size=%u:version=%u:level=%u:EMBEDDED",
|
||||||
zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
|
zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
|
||||||
|
zfs_get_hdrlevel(&zstd_hdr));
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -2235,7 +2236,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen,
|
||||||
(void) snprintf(blkbuf + strlen(blkbuf),
|
(void) snprintf(blkbuf + strlen(blkbuf),
|
||||||
buflen - strlen(blkbuf),
|
buflen - strlen(blkbuf),
|
||||||
" ZSTD:size=%u:version=%u:level=%u:NORMAL",
|
" ZSTD:size=%u:version=%u:level=%u:NORMAL",
|
||||||
zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level);
|
zstd_hdr.c_len, zfs_get_hdrversion(&zstd_hdr),
|
||||||
|
zfs_get_hdrlevel(&zstd_hdr));
|
||||||
|
|
||||||
abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
|
abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp));
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,21 +56,24 @@ typedef struct zfs_zstd_header {
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Version and compression level
|
* Version and compression level
|
||||||
* We use a union to be able to big endian encode a single 32 bit
|
* We used to use a union to reference compression level
|
||||||
* unsigned integer, but still access the individual bitmasked
|
* and version easily, but as it turns out, relying on the
|
||||||
* components easily.
|
* ordering of bitfields is not remotely portable.
|
||||||
|
* So now we have get/set functions in zfs_zstd.c for
|
||||||
|
* manipulating this in just the right way forever.
|
||||||
*/
|
*/
|
||||||
union {
|
uint32_t raw_version_level;
|
||||||
uint32_t raw_version_level;
|
|
||||||
struct {
|
|
||||||
uint32_t version : 24;
|
|
||||||
uint8_t level;
|
|
||||||
};
|
|
||||||
};
|
|
||||||
|
|
||||||
char data[];
|
char data[];
|
||||||
} zfs_zstdhdr_t;
|
} zfs_zstdhdr_t;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Simple struct to pass the data from raw_version_level around.
|
||||||
|
*/
|
||||||
|
typedef struct zfs_zstd_meta {
|
||||||
|
uint8_t level;
|
||||||
|
uint32_t version;
|
||||||
|
} zfs_zstdmeta_t;
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* kstat helper macros
|
* kstat helper macros
|
||||||
*/
|
*/
|
||||||
|
@ -94,6 +97,129 @@ int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len,
|
||||||
size_t d_len, int n);
|
size_t d_len, int n);
|
||||||
void zfs_zstd_cache_reap_now(void);
|
void zfs_zstd_cache_reap_now(void);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* So, the reason we have all these complicated set/get functions is that
|
||||||
|
* originally, in the zstd "header" we wrote out to disk, we used a 32-bit
|
||||||
|
* bitfield to store the "level" (8 bits) and "version" (24 bits).
|
||||||
|
*
|
||||||
|
* Unfortunately, bitfields make few promises about how they're arranged in
|
||||||
|
* memory...
|
||||||
|
*
|
||||||
|
* By way of example, if we were using version 1.4.5 and level 3, it'd be
|
||||||
|
* level = 0x03, version = 10405/0x0028A5, which gets broken into Vhigh = 0x00,
|
||||||
|
* Vmid = 0x28, Vlow = 0xA5. We include these positions below to help follow
|
||||||
|
* which data winds up where.
|
||||||
|
*
|
||||||
|
* As a consequence, we wound up with little endian platforms with a layout
|
||||||
|
* like this in memory:
|
||||||
|
*
|
||||||
|
* 0 8 16 24 32
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* | Vlow | Vmid | Vhigh | level |
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* =A5 =28 =00 =03
|
||||||
|
*
|
||||||
|
* ...and then, after being run through BE_32(), serializing this out to
|
||||||
|
* disk:
|
||||||
|
*
|
||||||
|
* 0 8 16 24 32
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* | level | Vhigh | Vmid | Vlow |
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* =03 =00 =28 =A5
|
||||||
|
*
|
||||||
|
* while on big-endian systems, since BE_32() is a noop there, both in
|
||||||
|
* memory and on disk, we wind up with:
|
||||||
|
*
|
||||||
|
* 0 8 16 24 32
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* | Vhigh | Vmid | Vlow | level |
|
||||||
|
* +-------+-------+-------+-------+
|
||||||
|
* =00 =28 =A5 =03
|
||||||
|
*
|
||||||
|
* (Vhigh is always 0 until version exceeds 6.55.35. Vmid and Vlow are the
|
||||||
|
* other two bytes of the "version" data.)
|
||||||
|
*
|
||||||
|
* So now we use the BF32_SET macros to get consistent behavior (the
|
||||||
|
* ondisk LE encoding, since x86 currently rules the world) across
|
||||||
|
* platforms, but the "get" behavior requires that we check each of the
|
||||||
|
* bytes in the aforementioned former-bitfield for 0x00, and from there,
|
||||||
|
* we can know which possible layout we're dealing with. (Only the two
|
||||||
|
* that have been observed in the wild are illustrated above, but handlers
|
||||||
|
* for all 4 positions of 0x00 are implemented.
|
||||||
|
*/
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
zfs_get_hdrmeta(const zfs_zstdhdr_t *blob, zfs_zstdmeta_t *res)
|
||||||
|
{
|
||||||
|
uint32_t raw = blob->raw_version_level;
|
||||||
|
uint8_t findme = 0xff;
|
||||||
|
int shift;
|
||||||
|
for (shift = 0; shift < 4; shift++) {
|
||||||
|
findme = BF32_GET(raw, 8*shift, 8);
|
||||||
|
if (findme == 0)
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
switch (shift) {
|
||||||
|
case 0:
|
||||||
|
res->level = BF32_GET(raw, 24, 8);
|
||||||
|
res->version = BSWAP_32(raw);
|
||||||
|
res->version = BF32_GET(res->version, 8, 24);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
res->level = BF32_GET(raw, 0, 8);
|
||||||
|
res->version = BSWAP_32(raw);
|
||||||
|
res->version = BF32_GET(res->version, 0, 24);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
res->level = BF32_GET(raw, 24, 8);
|
||||||
|
res->version = BF32_GET(raw, 0, 24);
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
res->level = BF32_GET(raw, 0, 8);
|
||||||
|
res->version = BF32_GET(raw, 8, 24);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
res->level = 0;
|
||||||
|
res->version = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint8_t
|
||||||
|
zfs_get_hdrlevel(const zfs_zstdhdr_t *blob)
|
||||||
|
{
|
||||||
|
uint8_t level = 0;
|
||||||
|
zfs_zstdmeta_t res;
|
||||||
|
zfs_get_hdrmeta(blob, &res);
|
||||||
|
level = res.level;
|
||||||
|
return (level);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline uint32_t
|
||||||
|
zfs_get_hdrversion(const zfs_zstdhdr_t *blob)
|
||||||
|
{
|
||||||
|
uint32_t version = 0;
|
||||||
|
zfs_zstdmeta_t res;
|
||||||
|
zfs_get_hdrmeta(blob, &res);
|
||||||
|
version = res.version;
|
||||||
|
return (version);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
zfs_set_hdrversion(zfs_zstdhdr_t *blob, uint32_t version)
|
||||||
|
{
|
||||||
|
BF32_SET(blob->raw_version_level, 0, 24, version);
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline void
|
||||||
|
zfs_set_hdrlevel(zfs_zstdhdr_t *blob, uint8_t level)
|
||||||
|
{
|
||||||
|
BF32_SET(blob->raw_version_level, 24, 8, level);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -33,6 +33,7 @@ $(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h
|
||||||
|
|
||||||
$(MODULE)-objs += zfs_zstd.o
|
$(MODULE)-objs += zfs_zstd.o
|
||||||
$(MODULE)-objs += lib/zstd.o
|
$(MODULE)-objs += lib/zstd.o
|
||||||
|
$(MODULE)-objs += zstd_sparc.o
|
||||||
|
|
||||||
all:
|
all:
|
||||||
mkdir -p lib
|
mkdir -p lib
|
||||||
|
|
|
@ -0,0 +1,4 @@
|
||||||
|
#if defined(__sparc)
|
||||||
|
uint64_t __bswapdi2(uint64_t in);
|
||||||
|
uint32_t __bswapsi2(uint32_t in);
|
||||||
|
#endif
|
|
@ -361,6 +361,7 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
|
||||||
return (1);
|
return (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Compress block using zstd */
|
/* Compress block using zstd */
|
||||||
size_t
|
size_t
|
||||||
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
|
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
|
||||||
|
@ -458,8 +459,8 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
|
||||||
* As soon as such incompatibility occurs, handling code needs to be
|
* As soon as such incompatibility occurs, handling code needs to be
|
||||||
* added, differentiating between the versions.
|
* added, differentiating between the versions.
|
||||||
*/
|
*/
|
||||||
hdr->version = ZSTD_VERSION_NUMBER;
|
zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
|
||||||
hdr->level = level;
|
zfs_set_hdrlevel(hdr, level);
|
||||||
hdr->raw_version_level = BE_32(hdr->raw_version_level);
|
hdr->raw_version_level = BE_32(hdr->raw_version_level);
|
||||||
|
|
||||||
return (c_len + sizeof (*hdr));
|
return (c_len + sizeof (*hdr));
|
||||||
|
@ -485,6 +486,7 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
|
||||||
* not modify the original data that may be used again later.
|
* not modify the original data that may be used again later.
|
||||||
*/
|
*/
|
||||||
hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
|
hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
|
||||||
|
uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* NOTE: We ignore the ZSTD version for now. As soon as any
|
* NOTE: We ignore the ZSTD version for now. As soon as any
|
||||||
|
@ -497,13 +499,13 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
|
||||||
* An invalid level is a strong indicator for data corruption! In such
|
* An invalid level is a strong indicator for data corruption! In such
|
||||||
* case return an error so the upper layers can try to fix it.
|
* case return an error so the upper layers can try to fix it.
|
||||||
*/
|
*/
|
||||||
if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
|
if (zstd_enum_to_level(curlevel, &zstd_level)) {
|
||||||
ZSTDSTAT_BUMP(zstd_stat_dec_inval);
|
ZSTDSTAT_BUMP(zstd_stat_dec_inval);
|
||||||
return (1);
|
return (1);
|
||||||
}
|
}
|
||||||
|
|
||||||
ASSERT3U(d_len, >=, s_len);
|
ASSERT3U(d_len, >=, s_len);
|
||||||
ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
|
ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
|
||||||
|
|
||||||
/* Invalid compressed buffer size encoded at start */
|
/* Invalid compressed buffer size encoded at start */
|
||||||
if (c_len + sizeof (*hdr) > s_len) {
|
if (c_len + sizeof (*hdr) > s_len) {
|
||||||
|
@ -534,7 +536,7 @@ zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (level) {
|
if (level) {
|
||||||
*level = hdr_copy.level;
|
*level = curlevel;
|
||||||
}
|
}
|
||||||
|
|
||||||
return (0);
|
return (0);
|
||||||
|
@ -764,7 +766,7 @@ module_exit(zstd_fini);
|
||||||
|
|
||||||
ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
|
ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS");
|
||||||
ZFS_MODULE_LICENSE("Dual BSD/GPL");
|
ZFS_MODULE_LICENSE("Dual BSD/GPL");
|
||||||
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING);
|
ZFS_MODULE_VERSION(ZSTD_VERSION_STRING "a");
|
||||||
|
|
||||||
EXPORT_SYMBOL(zfs_zstd_compress);
|
EXPORT_SYMBOL(zfs_zstd_compress);
|
||||||
EXPORT_SYMBOL(zfs_zstd_decompress_level);
|
EXPORT_SYMBOL(zfs_zstd_decompress_level);
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
#ifdef __sparc__
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <sys/byteorder.h>
|
||||||
|
#include "include/sparc_compat.h"
|
||||||
|
uint64_t __bswapdi2(uint64_t in) {
|
||||||
|
return (BSWAP_64(in));
|
||||||
|
}
|
||||||
|
uint32_t __bswapsi2(uint32_t in) {
|
||||||
|
return (BSWAP_32(in));
|
||||||
|
}
|
||||||
|
#endif
|
Loading…
Reference in New Issue