ICP: gcm-avx: Support architectures lacking the MOVBE instruction

There are a couple of x86_64 microarchitectures that support every
feature the accelerated GCM implementation needs except the MOVBE
instruction: mainly Intel Sandy Bridge and Ivy Bridge, and AMD
Bulldozer, Piledriver, and Steamroller. By using MOVBE only if it is
available, and substituting a MOV followed by a BSWAP otherwise, these
architectures now benefit from the new GCM routines too, with
performance considerably better than the original implementation's.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam D. Moss <c@yotes.com>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Followup #9749
Closes #10029
parent a57d3d45d6
commit 5b3b79559c
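
The substitution itself is mechanical: MOVBE loads a value and
byte-swaps it in a single instruction, while the fallback needs a plain
MOV followed by an explicit BSWAP. A minimal sketch in the AT&T syntax
of the diff below; the operands mirror one of the loads in the 6x loop,
and the MOVBE form is assumed to match what the existing
_aesni_ctr32_ghash_6x routine uses:

	// With MOVBE: load eight bytes and byte-swap them in one instruction.
	movbe	88(%r14),%r13

	// Without MOVBE (_aesni_ctr32_ghash_no_movbe_6x): plain load,
	// then swap the register in place.
	movq	88(%r14),%r13
	bswapq	%r13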
module/icp/algs/modes/gcm.c:

@@ -50,6 +50,8 @@ static uint32_t icp_gcm_impl = IMPL_FASTEST;
 static uint32_t user_sel_impl = IMPL_FASTEST;
 
 #ifdef CAN_USE_GCM_ASM
+/* Does the architecture we run on support the MOVBE instruction? */
+boolean_t gcm_avx_can_use_movbe = B_FALSE;
 /*
  * Whether to use the optimized openssl gcm and ghash implementations.
  * Set to true if module parameter icp_gcm_impl == "avx".
@@ -60,6 +62,7 @@ static boolean_t gcm_use_avx = B_FALSE;
 static inline boolean_t gcm_avx_will_work(void);
 static inline void gcm_set_avx(boolean_t);
 static inline boolean_t gcm_toggle_avx(void);
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 
 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
     crypto_data_t *, size_t);
@@ -622,20 +625,29 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
 	}
 
 #ifdef CAN_USE_GCM_ASM
-	/*
-	 * Handle the "cycle" implementation by creating avx and non avx
-	 * contexts alternately.
-	 */
 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
 	} else {
+		/*
+		 * Handle the "cycle" implementation by creating avx and
+		 * non-avx contexts alternately.
+		 */
 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
-	}
-	/* We don't handle byte swapped key schedules in the avx code path. */
-	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
-	if (ks->ops->needs_byteswap == B_TRUE) {
-		gcm_ctx->gcm_use_avx = B_FALSE;
-	}
+		/*
+		 * We don't handle byte swapped key schedules in the avx
+		 * code path.
+		 */
+		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+		if (ks->ops->needs_byteswap == B_TRUE) {
+			gcm_ctx->gcm_use_avx = B_FALSE;
+		}
+		/* Use the MOVBE and the BSWAP variants alternately. */
+		if (gcm_ctx->gcm_use_avx == B_TRUE &&
+		    zfs_movbe_available() == B_TRUE) {
+			(void) atomic_toggle_boolean_nv(
+			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
+		}
+	}
 	/* Avx and non avx context initialization differs from here on. */
 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
 #endif /* ifdef CAN_USE_GCM_ASM */
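
The cycle mode above relies on atomic_toggle_boolean_nv(), declared
extern earlier in this file, to flip gcm_avx_can_use_movbe safely when
contexts are initialized concurrently. Purely as an illustrative sketch
(not the ICP implementation), an equivalent toggle can be built in C on
top of the existing atomic_cas_32() primitive:

	extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t,
	    uint32_t);

	/*
	 * Sketch only: flip *target and return the new value, retrying
	 * until the compare-and-swap sees the value the flip was based on.
	 */
	boolean_t
	toggle_boolean_nv_sketch(volatile boolean_t *target)
	{
		uint32_t old, new;

		do {
			old = *(volatile uint32_t *)target;
			new = !old;
		} while (atomic_cas_32((volatile uint32_t *)target,
		    old, new) != old);

		return ((boolean_t)new);
	}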
@@ -856,10 +868,16 @@ gcm_impl_init(void)
 	 * Use the avx implementation if it's available and the implementation
 	 * hasn't changed from its default value of fastest on module load.
 	 */
-	if (gcm_avx_will_work() &&
-	    GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
-		gcm_set_avx(B_TRUE);
+	if (gcm_avx_will_work()) {
+#ifdef HAVE_MOVBE
+		if (zfs_movbe_available() == B_TRUE) {
+			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
+		}
+#endif
+		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
+			gcm_set_avx(B_TRUE);
+		}
 	}
 #endif
 	/* Finish initialization */
 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
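
Note the division of labor: gcm_impl_init() latches gcm_avx_can_use_movbe
once at module load to the capability reported by zfs_movbe_available(),
and only the "cycle" implementation toggles it afterwards, so ordinary
configurations keep running one fixed variant.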
@@ -1032,7 +1050,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
 static uint32_t gcm_avx_chunk_size =
 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
 
-extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 extern void clear_fpu_regs_avx(void);
 extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 extern void aes_encrypt_intel(const uint32_t rk[], int nr,
@@ -1053,8 +1070,8 @@ gcm_avx_will_work(void)
 {
 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
 	return (kfpu_allowed() &&
-	    zfs_avx_available() && zfs_movbe_available() &&
-	    zfs_aes_available() && zfs_pclmulqdq_available());
+	    zfs_avx_available() && zfs_aes_available() &&
+	    zfs_pclmulqdq_available());
 }
 
 static inline void
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S:

@@ -45,10 +45,13 @@
 # upstream merges.
 
 #if defined(__x86_64__) && defined(HAVE_AVX) && \
-    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE)
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
 
 .text
 
+#ifdef HAVE_MOVBE
 .type	_aesni_ctr32_ghash_6x,@function
 .align	32
 _aesni_ctr32_ghash_6x:
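
The compile-time gate changes accordingly: HAVE_MOVBE is dropped from
the file-wide #if, so the AVX GCM routines are built whenever AVX,
AES-NI and PCLMULQDQ support is compiled in, and HAVE_MOVBE now wraps
only the MOVBE-using _aesni_ctr32_ghash_6x. Choosing between the two 6x
routines becomes a runtime decision driven by gcm_avx_can_use_movbe.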
@@ -361,6 +364,333 @@ _aesni_ctr32_ghash_6x:
 
 	.byte	0xf3,0xc3
 .size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+.type	_aesni_ctr32_ghash_no_movbe_6x,@function
+.align	32
+_aesni_ctr32_ghash_no_movbe_6x:
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x_nmb
+
+.align	32
+.Loop6x_nmb:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_nmb
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	88(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	80(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	72(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	64(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	56(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	48(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	40(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	32(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movq	24(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	16(%r14),%r12
+	bswapq	%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movq	8(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movq	0(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	cmpl	$14,%ebp	// ICP does not zero key schedule.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail_nmb
+
+.align	32
+.Lhandle_ctr32_nmb:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32_nmb
+
+.align	32
+.Lenc_tail_nmb:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	.L6x_done_nmb
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x_nmb
+
+.L6x_done_nmb:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3
+.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
 .globl	aesni_gcm_decrypt
 .type	aesni_gcm_decrypt,@function
 .align	32
@@ -431,8 +761,19 @@ aesni_gcm_decrypt:
 	vmovdqu	%xmm2,96(%rsp)
 	vmovdqu	%xmm3,112(%rsp)
 
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+	jz	1f
 	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
 	vmovups	%xmm9,-96(%rsi)
 	vmovups	%xmm10,-80(%rsi)
 	vmovups	%xmm11,-64(%rsi)
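
Both entry points now dispatch at run time: a nonzero
gcm_avx_can_use_movbe selects the MOVBE routine, otherwise the BSWAP
fallback is called. The kernel build reads the flag with a direct
RIP-relative access, while the position-independent userspace build
reaches it through the GOT.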
@@ -624,7 +965,19 @@ aesni_gcm_encrypt:
 	movq	$192,%r10
 	vpshufb	%xmm0,%xmm8,%xmm8
 
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+	jz	1f
 	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
 	vmovdqu	32(%rsp),%xmm7
 	vmovdqu	(%r11),%xmm0
 	vmovdqu	0-32(%r9),%xmm3
module/icp/include/modes/modes.h:

@@ -40,8 +40,9 @@ extern "C" {
  * anyhow.
  */
 #if defined(__x86_64__) && defined(HAVE_AVX) && \
-    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE)
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
 #define	CAN_USE_GCM_ASM
+extern boolean_t gcm_avx_can_use_movbe;
 #endif
 
 #define	ECB_MODE			0x00000002