ICP: gcm: Allocate hash subkey table separately
While evaluating other assembler implementations, it turned out that the
precomputed hash subkey tables vary in size, from 8*16 bytes (avx2/avx512)
up to 48*16 bytes (avx512-vaes), depending on the implementation. To be
able to handle the size differences later, allocate `gcm_Htable`
dynamically rather than using a fixed-size array, and adapt the consumers.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Closes #11102
parent d9655c5b37
commit e8beeaa111
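For orientation before the diff, here is the Htable lifecycle the commit introduces, condensed into a minimal C sketch. The helper names `gcm_alloc_htab`/`gcm_free_htab` are hypothetical; the bodies mirror the `gcm_init_ctx()` and `crypto_free_mode_ctx()` hunks below, and assume the ICP facilities the diff itself uses (`kmem_alloc`, `kmem_free`, `bzero`, the `CRYPTO_*` return codes).

```c
/*
 * Condensed sketch of the new Htable lifecycle (not verbatim ICP code).
 */
static int
gcm_alloc_htab(gcm_ctx_t *gcm_ctx)
{
	/* 2*6*2*sizeof (uint64_t) == 192 bytes for the existing avx impl. */
	size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);

	if (htab_len == 0)
		return (CRYPTO_MECHANISM_PARAM_INVALID);

	/* Remember the length; kmem_free() needs it at teardown. */
	gcm_ctx->gcm_htab_len = htab_len;
	gcm_ctx->gcm_Htable =
	    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);

	return (gcm_ctx->gcm_Htable == NULL ?
	    CRYPTO_HOST_MEMORY : CRYPTO_SUCCESS);
}

static void
gcm_free_htab(gcm_ctx_t *gcm_ctx)
{
	if (gcm_ctx->gcm_Htable != NULL) {
		/* The subkeys are key material, so clear before freeing. */
		bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
		kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
	}
}
```

Note that `gcm_clear_ctx()` no longer zeroes the table: the buffer is now cleared and freed wherever the context is torn down, as the modes.c and aes.c hunks show.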
module/icp/algs/modes/gcm.c
@@ -59,10 +59,12 @@ boolean_t gcm_avx_can_use_movbe = B_FALSE;
 static boolean_t gcm_use_avx = B_FALSE;
 #define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx)
 
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+
 static inline boolean_t gcm_avx_will_work(void);
 static inline void gcm_set_avx(boolean_t);
 static inline boolean_t gcm_toggle_avx(void);
-extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
+static inline size_t gcm_simd_get_htab_size(boolean_t);
 
 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
     crypto_data_t *, size_t);
@@ -629,6 +631,21 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
 		    (volatile boolean_t *)&gcm_avx_can_use_movbe);
 		}
 	}
+	/* Allocate Htab memory as needed. */
+	if (gcm_ctx->gcm_use_avx == B_TRUE) {
+		size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+		if (htab_len == 0) {
+			return (CRYPTO_MECHANISM_PARAM_INVALID);
+		}
+		gcm_ctx->gcm_htab_len = htab_len;
+		gcm_ctx->gcm_Htable =
+		    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+		if (gcm_ctx->gcm_Htable == NULL) {
+			return (CRYPTO_HOST_MEMORY);
+		}
+	}
 	/* Avx and non avx context initialization differs from here on. */
 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
 #endif /* ifdef CAN_USE_GCM_ASM */
@@ -689,6 +706,22 @@ gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
 		if (ks->ops->needs_byteswap == B_TRUE) {
 			gcm_ctx->gcm_use_avx = B_FALSE;
 		}
+		/* Allocate Htab memory as needed. */
+		if (gcm_ctx->gcm_use_avx == B_TRUE) {
+			size_t htab_len = gcm_simd_get_htab_size(gcm_ctx->gcm_use_avx);
+
+			if (htab_len == 0) {
+				return (CRYPTO_MECHANISM_PARAM_INVALID);
+			}
+			gcm_ctx->gcm_htab_len = htab_len;
+			gcm_ctx->gcm_Htable =
+			    (uint64_t *)kmem_alloc(htab_len, gcm_ctx->gcm_kmflag);
+
+			if (gcm_ctx->gcm_Htable == NULL) {
+				return (CRYPTO_HOST_MEMORY);
+			}
+		}
+
 		/* Avx and non avx context initialization differs from here on. */
 		if (gcm_ctx->gcm_use_avx == B_FALSE) {
 #endif /* ifdef CAN_USE_GCM_ASM */
@@ -1018,7 +1051,7 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
 /* Clear the FPU registers since they hold sensitive internal state. */
 #define clear_fpu_regs() clear_fpu_regs_avx()
 #define GHASH_AVX(ctx, in, len) \
-    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \
+    gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t *)(ctx)->gcm_Htable, \
     in, len)
 
 #define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1)
@@ -1036,8 +1069,8 @@ extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 extern void aes_encrypt_intel(const uint32_t rk[], int nr,
     const uint32_t pt[4], uint32_t ct[4]);
 
-extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]);
-extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2],
+extern void gcm_init_htab_avx(uint64_t *Htable, const uint64_t H[2]);
+extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t *Htable,
     const uint8_t *in, size_t len);
 
 extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t,
@@ -1073,6 +1106,18 @@ gcm_toggle_avx(void)
 	}
 }
 
+static inline size_t
+gcm_simd_get_htab_size(boolean_t simd_mode)
+{
+	switch (simd_mode) {
+	case B_TRUE:
+		return (2 * 6 * 2 * sizeof (uint64_t));
+
+	default:
+		return (0);
+	}
+}
+
 /*
  * Clear sensitive data in the context.
 *
@@ -1088,7 +1133,6 @@ gcm_clear_ctx(gcm_ctx_t *ctx)
 {
 	bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder));
 	bzero(ctx->gcm_H, sizeof (ctx->gcm_H));
-	bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable));
 	bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0));
 	bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp));
 }
module/icp/algs/modes/modes.c
@@ -152,6 +152,14 @@ crypto_free_mode_ctx(void *ctx)
 			vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf,
 			    ((gcm_ctx_t *)ctx)->gcm_pt_buf_len);
 
+#ifdef CAN_USE_GCM_ASM
+		if (((gcm_ctx_t *)ctx)->gcm_Htable != NULL) {
+			gcm_ctx_t *gcm_ctx = (gcm_ctx_t *)ctx;
+			bzero(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+			kmem_free(gcm_ctx->gcm_Htable, gcm_ctx->gcm_htab_len);
+		}
+#endif
+
 		kmem_free(ctx, sizeof (gcm_ctx_t));
 	}
 }
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S
@@ -718,6 +718,8 @@ aesni_gcm_decrypt:
 .cfi_offset %r14,-48
 	pushq	%r15
 .cfi_offset %r15,-56
+	pushq	%r9
+.cfi_offset %r9,-64
 	vzeroupper
 
 	vmovdqu	(%r8),%xmm1
@@ -730,7 +732,8 @@ aesni_gcm_decrypt:
 	andq	$-128,%rsp
 	vmovdqu	(%r11),%xmm0
 	leaq	128(%rcx),%rcx
-	leaq	32+32(%r9),%r9
+	movq	32(%r9),%r9
+	leaq	32(%r9),%r9
 	movl	504-128(%rcx),%ebp	// ICP has a larger offset for rounds.
 	vpshufb	%xmm0,%xmm8,%xmm8
 
@@ -786,7 +789,9 @@ aesni_gcm_decrypt:
 	vmovups	%xmm14,-16(%rsi)
 
 	vpshufb	(%r11),%xmm8,%xmm8
-	vmovdqu	%xmm8,-64(%r9)
+	movq	-56(%rax),%r9
+.cfi_restore %r9
+	vmovdqu	%xmm8,(%r9)
 
 	vzeroupper
 	movq	-48(%rax),%r15
@@ -924,6 +929,8 @@ aesni_gcm_encrypt:
 .cfi_offset %r14,-48
 	pushq	%r15
 .cfi_offset %r15,-56
+	pushq	%r9
+.cfi_offset %r9,-64
 	vzeroupper
 
 	vmovdqu	(%r8),%xmm1
@@ -966,7 +973,8 @@ aesni_gcm_encrypt:
 	call	_aesni_ctr32_6x
 
 	vmovdqu	(%r9),%xmm8
-	leaq	32+32(%r9),%r9
+	movq	32(%r9),%r9
+	leaq	32(%r9),%r9
 	subq	$12,%rdx
 	movq	$192,%r10
 	vpshufb	%xmm0,%xmm8,%xmm8
@@ -1157,7 +1165,9 @@ aesni_gcm_encrypt:
 	vpxor	%xmm7,%xmm2,%xmm2
 	vpxor	%xmm2,%xmm8,%xmm8
 	vpshufb	(%r11),%xmm8,%xmm8
-	vmovdqu	%xmm8,-64(%r9)
+	movq	-56(%rax),%r9
+.cfi_restore %r9
+	vmovdqu	%xmm8,(%r9)
 
 	vzeroupper
 	movq	-48(%rax),%r15
module/icp/include/modes/modes.h
@@ -219,14 +219,14 @@ typedef struct gcm_ctx {
 	size_t gcm_pt_buf_len;
 	uint32_t gcm_tmp[4];
 	/*
-	 * The relative positions of gcm_ghash, gcm_H and pre-computed
-	 * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S,
-	 * so please don't change (or adjust accordingly).
+	 * The offset of gcm_Htable relative to gcm_ghash, (32), is hard coded
+	 * in aesni-gcm-x86_64.S, so please don't change (or adjust there).
 	 */
 	uint64_t gcm_ghash[2];
 	uint64_t gcm_H[2];
 #ifdef CAN_USE_GCM_ASM
-	uint64_t gcm_Htable[12][2];
+	uint64_t *gcm_Htable;
+	size_t gcm_htab_len;
 #endif
 	uint64_t gcm_J0[2];
 	uint64_t gcm_len_a_len_c[2];
module/icp/io/aes.c
@@ -1051,6 +1051,16 @@ out:
 		bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
 		kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len);
 	}
+#ifdef CAN_USE_GCM_ASM
+	if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE) &&
+	    ((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+
+		gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+		bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+		kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+	}
+#endif
 
 	return (ret);
 }
@@ -1209,6 +1219,14 @@ out:
 		vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf,
 		    ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len);
 	}
+#ifdef CAN_USE_GCM_ASM
+	if (((gcm_ctx_t *)&aes_ctx)->gcm_Htable != NULL) {
+		gcm_ctx_t *ctx = (gcm_ctx_t *)&aes_ctx;
+
+		bzero(ctx->gcm_Htable, ctx->gcm_htab_len);
+		kmem_free(ctx->gcm_Htable, ctx->gcm_htab_len);
+	}
+#endif
 	}
 
 	return (ret);
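One detail worth calling out: the new modes.h comment pins `gcm_Htable` at 32 bytes past `gcm_ghash`, because aesni-gcm-x86_64.S now fetches the table pointer with `movq 32(%r9),%r9` (%r9 appears to carry the `gcm_ghash` address). A compile-time check along these lines could make that contract explicit; this is a hedged sketch assuming the illumos/OpenZFS `CTASSERT` and `offsetof` macros, and it is not part of the commit:

```c
/*
 * Hypothetical guard, not part of this commit: the assembly loads the
 * gcm_Htable pointer from gcm_ghash + 32, so gcm_ghash[2] and gcm_H[2]
 * (16 bytes each) must sit directly in front of gcm_Htable.
 */
#ifdef CAN_USE_GCM_ASM
CTASSERT(offsetof(gcm_ctx_t, gcm_Htable) -
    offsetof(gcm_ctx_t, gcm_ghash) == 32);
#endif
```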