ICP: gcm-avx: Support architectures lacking the MOVBE instruction

There are a couple of x86_64 microarchitectures that support every
feature the accelerated GCM implementation needs except the MOVBE
instruction: mainly Intel Sandy Bridge and Ivy Bridge, and AMD
Bulldozer, Piledriver, and Steamroller. By using MOVBE only if it is
available, and substituting a MOV followed by a BSWAP otherwise, these
architectures now benefit from the new GCM routines too, with
performance considerably better than the original implementation's.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Adam D. Moss <c@yotes.com>
Signed-off-by: Attila Fülöp <attila@fueloep.org>
Followup #9749
Closes #10029
parent a57d3d45d6
commit 5b3b79559c
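
The substitution itself is mechanical: MOVBE loads a value and
byte-swaps it in a single instruction, while the fallback needs a plain
MOV followed by an explicit BSWAP. A minimal sketch in the AT&T syntax
of the diff below; the operands mirror one of the loads in the 6x loop,
and the MOVBE form is assumed to match what the existing
_aesni_ctr32_ghash_6x routine uses:

	// With MOVBE: load eight bytes and byte-swap them in one instruction.
	movbe	88(%r14),%r13

	// Without MOVBE (_aesni_ctr32_ghash_no_movbe_6x): plain load,
	// then swap the register in place.
	movq	88(%r14),%r13
	bswapq	%r13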
module/icp/algs/modes/gcm.c:

@@ -50,6 +50,8 @@ static uint32_t icp_gcm_impl = IMPL_FASTEST;
 static uint32_t user_sel_impl = IMPL_FASTEST;
 
 #ifdef CAN_USE_GCM_ASM
+/* Does the architecture we run on support the MOVBE instruction? */
+boolean_t gcm_avx_can_use_movbe = B_FALSE;
 /*
  * Whether to use the optimized openssl gcm and ghash implementations.
  * Set to true if module parameter icp_gcm_impl == "avx".
@@ -60,6 +62,7 @@ static boolean_t gcm_use_avx = B_FALSE;
 static inline boolean_t gcm_avx_will_work(void);
 static inline void gcm_set_avx(boolean_t);
 static inline boolean_t gcm_toggle_avx(void);
+extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 
 static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t,
     crypto_data_t *, size_t);
@@ -622,20 +625,29 @@ gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size,
 	}
 
 #ifdef CAN_USE_GCM_ASM
-	/*
-	 * Handle the "cycle" implementation by creating avx and non avx
-	 * contexts alternately.
-	 */
 	if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) {
 		gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX;
 	} else {
+		/*
+		 * Handle the "cycle" implementation by creating avx and
+		 * non-avx contexts alternately.
+		 */
 		gcm_ctx->gcm_use_avx = gcm_toggle_avx();
-	}
-	/* We don't handle byte swapped key schedules in the avx code path. */
-	aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
-	if (ks->ops->needs_byteswap == B_TRUE) {
-		gcm_ctx->gcm_use_avx = B_FALSE;
-	}
+		/*
+		 * We don't handle byte swapped key schedules in the avx
+		 * code path.
+		 */
+		aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched;
+		if (ks->ops->needs_byteswap == B_TRUE) {
+			gcm_ctx->gcm_use_avx = B_FALSE;
+		}
+		/* Use the MOVBE and the BSWAP variants alternately. */
+		if (gcm_ctx->gcm_use_avx == B_TRUE &&
+		    zfs_movbe_available() == B_TRUE) {
+			(void) atomic_toggle_boolean_nv(
+			    (volatile boolean_t *)&gcm_avx_can_use_movbe);
+		}
+	}
 	/* Avx and non avx context initialization differs from here on. */
 	if (gcm_ctx->gcm_use_avx == B_FALSE) {
 #endif /* ifdef CAN_USE_GCM_ASM */
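
The cycle mode above relies on atomic_toggle_boolean_nv(), declared
extern earlier in this file, to flip gcm_avx_can_use_movbe safely when
contexts are initialized concurrently. Purely as an illustrative sketch
(not the ICP implementation), an equivalent toggle can be built in C on
top of the existing atomic_cas_32() primitive:

	extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t,
	    uint32_t);

	/*
	 * Sketch only: flip *target and return the new value, retrying
	 * until the compare-and-swap sees the value the flip was based on.
	 */
	boolean_t
	toggle_boolean_nv_sketch(volatile boolean_t *target)
	{
		uint32_t old, new;

		do {
			old = *(volatile uint32_t *)target;
			new = !old;
		} while (atomic_cas_32((volatile uint32_t *)target,
		    old, new) != old);

		return ((boolean_t)new);
	}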
@@ -856,10 +868,16 @@ gcm_impl_init(void)
 	 * Use the avx implementation if it's available and the implementation
 	 * hasn't changed from its default value of fastest on module load.
 	 */
-	if (gcm_avx_will_work() &&
-	    GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
-		gcm_set_avx(B_TRUE);
+	if (gcm_avx_will_work()) {
+#ifdef HAVE_MOVBE
+		if (zfs_movbe_available() == B_TRUE) {
+			atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE);
+		}
+#endif
+		if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) {
+			gcm_set_avx(B_TRUE);
+		}
 	}
 #endif
 	/* Finish initialization */
 	atomic_swap_32(&icp_gcm_impl, user_sel_impl);
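
Note the division of labor: gcm_impl_init() latches gcm_avx_can_use_movbe
once at module load to the capability reported by zfs_movbe_available(),
and only the "cycle" implementation toggles it afterwards, so ordinary
configurations keep running one fixed variant.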
@@ -1032,7 +1050,6 @@ MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation.");
 static uint32_t gcm_avx_chunk_size =
 	((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES;
 
-extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *);
 extern void clear_fpu_regs_avx(void);
 extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst);
 extern void aes_encrypt_intel(const uint32_t rk[], int nr,
@@ -1053,8 +1070,8 @@ gcm_avx_will_work(void)
 {
 	/* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */
 	return (kfpu_allowed() &&
-	    zfs_avx_available() && zfs_movbe_available() &&
-	    zfs_aes_available() && zfs_pclmulqdq_available());
+	    zfs_avx_available() && zfs_aes_available() &&
+	    zfs_pclmulqdq_available());
 }
 
 static inline void
module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S:

@@ -45,10 +45,13 @@
 # upstream merges.
 
 #if defined(__x86_64__) && defined(HAVE_AVX) && \
-    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE)
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
+
+.extern gcm_avx_can_use_movbe
 
 .text
 
+#ifdef HAVE_MOVBE
 .type	_aesni_ctr32_ghash_6x,@function
 .align	32
 _aesni_ctr32_ghash_6x:
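
The compile-time gate changes accordingly: HAVE_MOVBE is dropped from
the file-wide #if, so the AVX GCM routines are built whenever AVX,
AES-NI and PCLMULQDQ support is compiled in, and HAVE_MOVBE now wraps
only the MOVBE-using _aesni_ctr32_ghash_6x. Choosing between the two 6x
routines becomes a runtime decision driven by gcm_avx_can_use_movbe.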
@@ -361,6 +364,333 @@ _aesni_ctr32_ghash_6x:
 
 	.byte	0xf3,0xc3
 .size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
+#endif /* ifdef HAVE_MOVBE */
+
+.type	_aesni_ctr32_ghash_no_movbe_6x,@function
+.align	32
+_aesni_ctr32_ghash_no_movbe_6x:
+	vmovdqu	32(%r11),%xmm2
+	subq	$6,%rdx
+	vpxor	%xmm4,%xmm4,%xmm4
+	vmovdqu	0-128(%rcx),%xmm15
+	vpaddb	%xmm2,%xmm1,%xmm10
+	vpaddb	%xmm2,%xmm10,%xmm11
+	vpaddb	%xmm2,%xmm11,%xmm12
+	vpaddb	%xmm2,%xmm12,%xmm13
+	vpaddb	%xmm2,%xmm13,%xmm14
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovdqu	%xmm4,16+8(%rsp)
+	jmp	.Loop6x_nmb
+
+.align	32
+.Loop6x_nmb:
+	addl	$100663296,%ebx
+	jc	.Lhandle_ctr32_nmb
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddb	%xmm2,%xmm14,%xmm1
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm11,%xmm11
+
+.Lresume_ctr32_nmb:
+	vmovdqu	%xmm1,(%r8)
+	vpclmulqdq	$0x10,%xmm3,%xmm7,%xmm5
+	vpxor	%xmm15,%xmm12,%xmm12
+	vmovups	16-128(%rcx),%xmm2
+	vpclmulqdq	$0x01,%xmm3,%xmm7,%xmm6
+	xorq	%r12,%r12
+	cmpq	%r14,%r15
+
+	vaesenc	%xmm2,%xmm9,%xmm9
+	vmovdqu	48+8(%rsp),%xmm0
+	vpxor	%xmm15,%xmm13,%xmm13
+	vpclmulqdq	$0x00,%xmm3,%xmm7,%xmm1
+	vaesenc	%xmm2,%xmm10,%xmm10
+	vpxor	%xmm15,%xmm14,%xmm14
+	setnc	%r12b
+	vpclmulqdq	$0x11,%xmm3,%xmm7,%xmm7
+	vaesenc	%xmm2,%xmm11,%xmm11
+	vmovdqu	16-32(%r9),%xmm3
+	negq	%r12
+	vaesenc	%xmm2,%xmm12,%xmm12
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm3,%xmm0,%xmm5
+	vpxor	%xmm4,%xmm8,%xmm8
+	vaesenc	%xmm2,%xmm13,%xmm13
+	vpxor	%xmm5,%xmm1,%xmm4
+	andq	$0x60,%r12
+	vmovups	32-128(%rcx),%xmm15
+	vpclmulqdq	$0x10,%xmm3,%xmm0,%xmm1
+	vaesenc	%xmm2,%xmm14,%xmm14
+
+	vpclmulqdq	$0x01,%xmm3,%xmm0,%xmm2
+	leaq	(%r14,%r12,1),%r14
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpclmulqdq	$0x11,%xmm3,%xmm0,%xmm3
+	vmovdqu	64+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	88(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	80(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,32+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,40+8(%rsp)
+	vmovdqu	48-32(%r9),%xmm5
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	48-128(%rcx),%xmm15
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm5,%xmm0,%xmm1
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm5,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm3,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm5,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpclmulqdq	$0x11,%xmm5,%xmm0,%xmm5
+	vmovdqu	80+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	%xmm1,%xmm4,%xmm4
+	vmovdqu	64-32(%r9),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	64-128(%rcx),%xmm15
+	vpxor	%xmm2,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm1,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm1,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	72(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm5,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm1,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	64(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm1,%xmm0,%xmm1
+	vmovdqu	96+8(%rsp),%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,48+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,56+8(%rsp)
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	96-32(%r9),%xmm2
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	80-128(%rcx),%xmm15
+	vpxor	%xmm3,%xmm6,%xmm6
+	vpclmulqdq	$0x00,%xmm2,%xmm0,%xmm3
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm2,%xmm0,%xmm5
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	56(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm1,%xmm7,%xmm7
+	vpclmulqdq	$0x01,%xmm2,%xmm0,%xmm1
+	vpxor	112+8(%rsp),%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	48(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm2,%xmm0,%xmm2
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,64+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,72+8(%rsp)
+	vpxor	%xmm3,%xmm4,%xmm4
+	vmovdqu	112-32(%r9),%xmm3
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vmovups	96-128(%rcx),%xmm15
+	vpxor	%xmm5,%xmm6,%xmm6
+	vpclmulqdq	$0x10,%xmm3,%xmm8,%xmm5
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm1,%xmm6,%xmm6
+	vpclmulqdq	$0x01,%xmm3,%xmm8,%xmm1
+	vaesenc	%xmm15,%xmm10,%xmm10
+	movq	40(%r14),%r13
+	bswapq	%r13
+	vpxor	%xmm2,%xmm7,%xmm7
+	vpclmulqdq	$0x00,%xmm3,%xmm8,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	32(%r14),%r12
+	bswapq	%r12
+	vpclmulqdq	$0x11,%xmm3,%xmm8,%xmm8
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r13,80+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	movq	%r12,88+8(%rsp)
+	vpxor	%xmm5,%xmm6,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	%xmm1,%xmm6,%xmm6
+
+	vmovups	112-128(%rcx),%xmm15
+	vpslldq	$8,%xmm6,%xmm5
+	vpxor	%xmm2,%xmm4,%xmm4
+	vmovdqu	16(%r11),%xmm3
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vpxor	%xmm8,%xmm7,%xmm7
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpxor	%xmm5,%xmm4,%xmm4
+	movq	24(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm15,%xmm11,%xmm11
+	movq	16(%r14),%r12
+	bswapq	%r12
+	vpalignr	$8,%xmm4,%xmm4,%xmm0
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	movq	%r13,96+8(%rsp)
+	vaesenc	%xmm15,%xmm12,%xmm12
+	movq	%r12,104+8(%rsp)
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vmovups	128-128(%rcx),%xmm1
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vmovups	144-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vpsrldq	$8,%xmm6,%xmm6
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vpxor	%xmm6,%xmm7,%xmm7
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vpxor	%xmm0,%xmm4,%xmm4
+	movq	8(%r14),%r13
+	bswapq	%r13
+	vaesenc	%xmm1,%xmm13,%xmm13
+	movq	0(%r14),%r12
+	bswapq	%r12
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	160-128(%rcx),%xmm1
+	cmpl	$12,%ebp	// ICP uses 10,12,14 not 9,11,13 for rounds.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	176-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	192-128(%rcx),%xmm1
+	cmpl	$14,%ebp	// ICP does not zero key schedule.
+	jb	.Lenc_tail_nmb
+
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vaesenc	%xmm15,%xmm14,%xmm14
+
+	vaesenc	%xmm1,%xmm9,%xmm9
+	vaesenc	%xmm1,%xmm10,%xmm10
+	vaesenc	%xmm1,%xmm11,%xmm11
+	vaesenc	%xmm1,%xmm12,%xmm12
+	vaesenc	%xmm1,%xmm13,%xmm13
+	vmovups	208-128(%rcx),%xmm15
+	vaesenc	%xmm1,%xmm14,%xmm14
+	vmovups	224-128(%rcx),%xmm1
+	jmp	.Lenc_tail_nmb
+
+.align	32
+.Lhandle_ctr32_nmb:
+	vmovdqu	(%r11),%xmm0
+	vpshufb	%xmm0,%xmm1,%xmm6
+	vmovdqu	48(%r11),%xmm5
+	vpaddd	64(%r11),%xmm6,%xmm10
+	vpaddd	%xmm5,%xmm6,%xmm11
+	vmovdqu	0-32(%r9),%xmm3
+	vpaddd	%xmm5,%xmm10,%xmm12
+	vpshufb	%xmm0,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm11,%xmm13
+	vpshufb	%xmm0,%xmm11,%xmm11
+	vpxor	%xmm15,%xmm10,%xmm10
+	vpaddd	%xmm5,%xmm12,%xmm14
+	vpshufb	%xmm0,%xmm12,%xmm12
+	vpxor	%xmm15,%xmm11,%xmm11
+	vpaddd	%xmm5,%xmm13,%xmm1
+	vpshufb	%xmm0,%xmm13,%xmm13
+	vpshufb	%xmm0,%xmm14,%xmm14
+	vpshufb	%xmm0,%xmm1,%xmm1
+	jmp	.Lresume_ctr32_nmb
+
+.align	32
+.Lenc_tail_nmb:
+	vaesenc	%xmm15,%xmm9,%xmm9
+	vmovdqu	%xmm7,16+8(%rsp)
+	vpalignr	$8,%xmm4,%xmm4,%xmm8
+	vaesenc	%xmm15,%xmm10,%xmm10
+	vpclmulqdq	$0x10,%xmm3,%xmm4,%xmm4
+	vpxor	0(%rdi),%xmm1,%xmm2
+	vaesenc	%xmm15,%xmm11,%xmm11
+	vpxor	16(%rdi),%xmm1,%xmm0
+	vaesenc	%xmm15,%xmm12,%xmm12
+	vpxor	32(%rdi),%xmm1,%xmm5
+	vaesenc	%xmm15,%xmm13,%xmm13
+	vpxor	48(%rdi),%xmm1,%xmm6
+	vaesenc	%xmm15,%xmm14,%xmm14
+	vpxor	64(%rdi),%xmm1,%xmm7
+	vpxor	80(%rdi),%xmm1,%xmm3
+	vmovdqu	(%r8),%xmm1
+
+	vaesenclast	%xmm2,%xmm9,%xmm9
+	vmovdqu	32(%r11),%xmm2
+	vaesenclast	%xmm0,%xmm10,%xmm10
+	vpaddb	%xmm2,%xmm1,%xmm0
+	movq	%r13,112+8(%rsp)
+	leaq	96(%rdi),%rdi
+	vaesenclast	%xmm5,%xmm11,%xmm11
+	vpaddb	%xmm2,%xmm0,%xmm5
+	movq	%r12,120+8(%rsp)
+	leaq	96(%rsi),%rsi
+	vmovdqu	0-128(%rcx),%xmm15
+	vaesenclast	%xmm6,%xmm12,%xmm12
+	vpaddb	%xmm2,%xmm5,%xmm6
+	vaesenclast	%xmm7,%xmm13,%xmm13
+	vpaddb	%xmm2,%xmm6,%xmm7
+	vaesenclast	%xmm3,%xmm14,%xmm14
+	vpaddb	%xmm2,%xmm7,%xmm3
+
+	addq	$0x60,%r10
+	subq	$0x6,%rdx
+	jc	.L6x_done_nmb
+
+	vmovups	%xmm9,-96(%rsi)
+	vpxor	%xmm15,%xmm1,%xmm9
+	vmovups	%xmm10,-80(%rsi)
+	vmovdqa	%xmm0,%xmm10
+	vmovups	%xmm11,-64(%rsi)
+	vmovdqa	%xmm5,%xmm11
+	vmovups	%xmm12,-48(%rsi)
+	vmovdqa	%xmm6,%xmm12
+	vmovups	%xmm13,-32(%rsi)
+	vmovdqa	%xmm7,%xmm13
+	vmovups	%xmm14,-16(%rsi)
+	vmovdqa	%xmm3,%xmm14
+	vmovdqu	32+8(%rsp),%xmm7
+	jmp	.Loop6x_nmb
+
+.L6x_done_nmb:
+	vpxor	16+8(%rsp),%xmm8,%xmm8
+	vpxor	%xmm4,%xmm8,%xmm8
+
+	.byte	0xf3,0xc3
+.size	_aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x
+
 .globl	aesni_gcm_decrypt
 .type	aesni_gcm_decrypt,@function
 .align	32
@@ -431,8 +761,19 @@ aesni_gcm_decrypt:
 	vmovdqu	%xmm2,96(%rsp)
 	vmovdqu	%xmm3,112(%rsp)
 
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+	jz	1f
 	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
 	vmovups	%xmm9,-96(%rsi)
 	vmovups	%xmm10,-80(%rsi)
 	vmovups	%xmm11,-64(%rsi)
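
Both entry points now dispatch at run time: a nonzero
gcm_avx_can_use_movbe selects the MOVBE routine, otherwise the BSWAP
fallback is called. The kernel build reads the flag with a direct
RIP-relative access, while the position-independent userspace build
reaches it through the GOT.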
@@ -624,7 +965,19 @@ aesni_gcm_encrypt:
 	movq	$192,%r10
 	vpshufb	%xmm0,%xmm8,%xmm8
 
+#ifdef HAVE_MOVBE
+#ifdef _KERNEL
+	testl	$1,gcm_avx_can_use_movbe(%rip)
+#else
+	testl	$1,gcm_avx_can_use_movbe@GOTPCREL(%rip)
+#endif
+	jz	1f
 	call	_aesni_ctr32_ghash_6x
+	jmp	2f
+1:
+#endif
+	call	_aesni_ctr32_ghash_no_movbe_6x
+2:
 	vmovdqu	32(%rsp),%xmm7
 	vmovdqu	(%r11),%xmm0
 	vmovdqu	0-32(%r9),%xmm3
module/icp/include/modes/modes.h:

@@ -40,8 +40,9 @@ extern "C" {
  * anyhow.
  */
 #if defined(__x86_64__) && defined(HAVE_AVX) && \
-    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) && defined(HAVE_MOVBE)
+    defined(HAVE_AES) && defined(HAVE_PCLMULQDQ)
 #define	CAN_USE_GCM_ASM
+extern boolean_t gcm_avx_can_use_movbe;
 #endif
 
 #define	ECB_MODE			0x00000002