zfs/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S

/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009 Intel Corporation
 * All Rights Reserved.
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
 * instructions.  This file contains an accelerated
 * Galois Field Multiplication implementation.
 *
 * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,
 * carry-less multiplication. More information about PCLMULQDQ can be
 * found at:
 * http://software.intel.com/en-us/articles/
 * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
 *
 */

/*
 * ====================================================================
 * OpenSolaris OS modifications
 *
 * This source originates as file galois_hash_asm.c from
 * Intel Corporation dated September 21, 2009.
 *
 * This OpenSolaris version has these major changes from the original source:
 *
 * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
 * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function
 * definition for lint.
 *
 * 2. Formatted code, added comments, and added #includes and #defines.
 *
 * 3. If bit CR0.TS is set, clear and set the TS bit, after and before
 * calling kpreempt_disable() and kpreempt_enable().
 * If the TS bit is not set, save and restore %xmm registers at the beginning
 * and end of function calls (%xmm* registers are not saved and restored
 * during kernel thread preemption).
 *
 * 4. Removed code to perform hashing.  This is already done with C macro
 * GHASH in gcm.c.  For better performance, this removed code should be
 * reintegrated in the future to replace the C GHASH macro.
 *
 * 5. Added code to byte swap 16-byte input and output.
 *
 * 6. Folded in comments from the original C source with embedded assembly
 * (SB_w_shift_xor.c)
 *
 * 7. Renamed function and reordered parameters to match OpenSolaris:
 * Intel interface:
 *	void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *		unsigned char *d, int length)
 * OpenSolaris OS interface:
 *	void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 * ====================================================================
 */


#if defined(lint) || defined(__lint)	/* lint */

#include <sys/types.h>

/* ARGSUSED */
void
gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res)
{
	/*
	 * Lint-only placeholder: it gives lint a C prototype to check
	 * callers against.  The body is intentionally empty; the working
	 * implementation is the assembly routine of the same name that is
	 * built when HAVE_PCLMULQDQ is defined.
	 */
}

#elif defined(HAVE_PCLMULQDQ)	/* guard by instruction set */

#define _ASM
#include <sys/asm_linkage.h>

/*
 * Shuffle-control mask for the pshufb instruction.  Destination byte i is
 * taken from source byte (15 - i), so applying it reverses the byte order
 * of a 16-byte value.
 *
 * The table is only ever read, so on ELF targets it is placed in the
 * read-only data section instead of writable .data.  Other object formats
 * keep the previous .data placement.
 */

// static const uint8_t byte_swap16_mask[] = {
//	 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
#ifdef __ELF__
.section .rodata
#else
.data
#endif
.align XMM_ALIGN	// XMM_ALIGN is defined outside this file
.Lbyte_swap16_mask:
	.byte	15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0


/*
 * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res);
 *
 * Perform a carry-less multiplication (that is, use XOR instead of the
 * multiply operator) on P1 and P2 and place the result in P3.
 *
 * Byte swap the input and the output.
 *
 * Note: x_in, y, and res all point to a block of 16-byte numbers
 * (an array of two 64-bit integers).
 *
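 * Note2: For kernel code, caller is responsible for ensuring
 * kpreempt_disable() has been called.  This is because %xmm registers are
 * not saved/restored.  The OpenSolaris version cleared and set the CR0.TS
 * bit on entry and exit, respectively, if TS was set on entry and otherwise
 * saved and restored %xmm registers on the stack.  The code below does
 * neither: it overwrites %xmm0-%xmm10 and %rax without preserving them,
 * so the caller must protect any live FPU/SIMD state around the call.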
 *
 * Note3: Original Intel definition:
 * void galois_hash_asm(unsigned char *hk, unsigned char *s,
 *	unsigned char *d, int length)
 *
 * Note4: Register/parameter mapping:
 * Intel:
 *	Parameter 1: %rcx (copied to %xmm0)	hk or x_in
 *	Parameter 2: %rdx (copied to %xmm1)	s or y
 *	Parameter 3: %rdi (result)		d or res
 * OpenSolaris:
 *	Parameter 1: %rdi (copied to %xmm0)	x_in
 *	Parameter 2: %rsi (copied to %xmm1)	y
 *	Parameter 3: %rdx (result)		res
 */

ENTRY_NP(gcm_mul_pclmulqdq)
	//
	// In:      %rdi = x_in, %rsi = y, %rdx = res (16 bytes each)
	// Out:     16-byte product stored at (%rdx)
	// Scratch: %rax and %xmm0-%xmm10 are overwritten and not restored.
	// No stack memory is used and no other function is called.
	//

	//
	// Copy Parameters (unaligned 16-byte loads)
	//
	movdqu	(%rdi), %xmm0	// P1
	movdqu	(%rsi), %xmm1	// P2

	//
	// Byte swap 16-byte input
	//
	// %xmm10 keeps the byte-reversal mask for the whole function; it is
	// used again to swap the result before it is stored.
	lea	.Lbyte_swap16_mask(%rip), %rax
	movups	(%rax), %xmm10
	pshufb	%xmm10, %xmm0
	pshufb	%xmm10, %xmm1


	//
	// Multiply with the hash key
	//
	// Four 64x64-bit carry-less products.  With a = xmm0 and b = xmm1,
	// bit 0 of the immediate picks the qword of a and bit 4 picks the
	// qword of b (0 = low qword, 1 = high qword).
	movdqu	%xmm0, %xmm3
	pclmulqdq $0, %xmm1, %xmm3	// xmm3 holds a0*b0

	movdqu	%xmm0, %xmm4
	pclmulqdq $16, %xmm1, %xmm4	// xmm4 holds a0*b1

	movdqu	%xmm0, %xmm5
	pclmulqdq $1, %xmm1, %xmm5	// xmm5 holds a1*b0
	movdqu	%xmm0, %xmm6
	pclmulqdq $17, %xmm1, %xmm6	// xmm6 holds a1*b1

	pxor	%xmm5, %xmm4	// xmm4 holds a0*b1 + a1*b0

	// Split the middle term across the two halves of the 256-bit product.
	movdqu	%xmm4, %xmm5	// move the contents of xmm4 to xmm5
	psrldq	$8, %xmm4	// shift xmm4 right by 64 bits (8 bytes)
	pslldq	$8, %xmm5	// shift xmm5 left by 64 bits (8 bytes)
	pxor	%xmm5, %xmm3
	pxor	%xmm4, %xmm6	// Register pair <xmm6:xmm3> holds the result
				// of the carry-less multiplication of
				// xmm0 by xmm1.

	// We shift the result of the multiplication by one bit position
	// to the left to cope for the fact that the bits are reversed.
	//
	// pslld/psrld work on each 32-bit lane separately, so the bit that
	// falls out of the top of every lane is collected with psrld $31 and
	// moved into the next higher lane with a 4-byte byte shift.  xmm9
	// carries the top bit of xmm3 over into the lowest lane of xmm6.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm6, %xmm8
	pslld	$1, %xmm3	// each lane << 1
	pslld	$1, %xmm6	// each lane << 1
	psrld	$31, %xmm7	// top bit of each lane of the low half
	psrld	$31, %xmm8	// top bit of each lane of the high half
	movdqu	%xmm7, %xmm9	// keep low-half carries for the high half
	pslldq	$4, %xmm8	// move carries up one lane
	pslldq	$4, %xmm7	// move carries up one lane
	psrldq	$12, %xmm9	// top-lane carry of xmm3 -> lane 0
	por	%xmm7, %xmm3
	por	%xmm8, %xmm6
	por	%xmm9, %xmm6

	//
	// First phase of the reduction
	//
	// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts
	// independently.
	//
	// NOTE(review): the shift counts 31, 30 and 25 here (and 1, 2 and 7
	// in the second phase) presumably correspond to the x, x^2 and x^7
	// terms of the GHASH polynomial described in the Intel paper cited
	// in the file header -- confirm against that paper.
	movdqu	%xmm3, %xmm7
	movdqu	%xmm3, %xmm8
	movdqu	%xmm3, %xmm9
	pslld	$31, %xmm7	// packed left shift, each lane << 31
	pslld	$30, %xmm8	// packed left shift, each lane << 30
	pslld	$25, %xmm9	// packed left shift, each lane << 25
	pxor	%xmm8, %xmm7	// xor the shifted versions
	pxor	%xmm9, %xmm7
	movdqu	%xmm7, %xmm8	// copy is consumed by the second phase
	pslldq	$12, %xmm7	// lane 0 -> lane 3, other lanes dropped
	psrldq	$4, %xmm8	// lanes 1..3 -> lanes 0..2
	pxor	%xmm7, %xmm3	// first phase of the reduction complete

	//
	// Second phase of the reduction
	//
	// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these
	// shift operations.
	movdqu	%xmm3, %xmm2
	movdqu	%xmm3, %xmm4
	movdqu	%xmm3, %xmm5
	psrld	$1, %xmm2	// packed right shift, each lane >> 1
	psrld	$2, %xmm4	// packed right shift, each lane >> 2
	psrld	$7, %xmm5	// packed right shift, each lane >> 7
	pxor	%xmm4, %xmm2	// xor the shifted versions
	pxor	%xmm5, %xmm2
	pxor	%xmm8, %xmm2	// fold in the remainder saved in phase one
	pxor	%xmm2, %xmm3
	pxor	%xmm3, %xmm6	// the result is in xmm6

	//
	// Byte swap 16-byte result
	//
	pshufb	%xmm10, %xmm6	// %xmm10 has the swap mask

	//
	// Store the result (unaligned 16-byte store)
	//
	movdqu	%xmm6, (%rdx)	// P3


	//
	// Return
	//
	// RET and SET_SIZE are macros defined outside this file.
	RET
	SET_SIZE(gcm_mul_pclmulqdq)

#endif	/* lint || __lint */

/*
 * On ELF targets, emit an empty .note.GNU-stack section.  By GNU toolchain
 * convention this marks the object as not requiring an executable stack.
 */
#ifdef __ELF__
.section .note.GNU-stack,"",%progbits
#endif
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00			`/*`
			`* CDDL HEADER START`
			`*`
			`* The contents of this file are subject to the terms of the`
			`* Common Development and Distribution License (the "License").`
			`* You may not use this file except in compliance with the License.`
			`*`
			`* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE`
			`* or http://www.opensolaris.org/os/licensing.`
			`* See the License for the specific language governing permissions`
			`* and limitations under the License.`
			`*`
			`* When distributing Covered Code, include this CDDL HEADER in each`
			`* file and include the License file at usr/src/OPENSOLARIS.LICENSE.`
			`* If applicable, add the following below this CDDL HEADER, with the`
			`* fields enclosed by brackets "[]" replaced with your own identifying`
			`* information: Portions Copyright [yyyy] [name of copyright owner]`
			`*`
			`* CDDL HEADER END`
			`*/`

			`/*`
			`* Copyright (c) 2009 Intel Corporation`
			`* All Rights Reserved.`
			`*/`
			`/*`
			`* Copyright 2009 Sun Microsystems, Inc. All rights reserved.`
			`* Use is subject to license terms.`
			`*/`

			`/*`
			`* Accelerated GHASH implementation with Intel PCLMULQDQ-NI`
			`* instructions. This file contains an accelerated`
			`* Galois Field Multiplication implementation.`
			`*`
			`* PCLMULQDQ is used to accelerate the most time-consuming part of GHASH,`
			`* carry-less multiplication. More information about PCLMULQDQ can be`
			`* found at:`
			`* http://software.intel.com/en-us/articles/`
			`* carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/`
			`*`
			`*/`

			`/*`
			`* ====================================================================`
			`* OpenSolaris OS modifications`
			`*`
			`* This source originates as file galois_hash_asm.c from`
			`* Intel Corporation dated September 21, 2009.`
			`*`
			`* This OpenSolaris version has these major changes from the original source:`
			`*`
			`* 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from`
			`* /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function`
			`* definition for lint.`
			`*`
			`* 2. Formatted code, added comments, and added #includes and #defines.`
			`*`
			`* 3. If bit CR0.TS is set, clear and set the TS bit, after and before`
			`* calling kpreempt_disable() and kpreempt_enable().`
			`* If the TS bit is not set, Save and restore %xmm registers at the beginning`
			`* and end of function calls (%xmm* registers are not saved and restored by`
			`* during kernel thread preemption).`
			`*`
			`* 4. Removed code to perform hashing. This is already done with C macro`
			`* GHASH in gcm.c. For better performance, this removed code should be`
			`* reintegrated in the future to replace the C GHASH macro.`
			`*`
			`* 5. Added code to byte swap 16-byte input and output.`
			`*`
			`* 6. Folded in comments from the original C source with embedded assembly`
			`* (SB_w_shift_xor.c)`
			`*`
			`* 7. Renamed function and reordered parameters to match OpenSolaris:`
			`* Intel interface:`
			`* void galois_hash_asm(unsigned char hk, unsigned char s,`
			`* unsigned char *d, int length)`
			`* OpenSolaris OS interface:`
			`* void gcm_mul_pclmulqdq(uint64_t x_in, uint64_t y, uint64_t *res);`
			`* ====================================================================`
			`*/`


Add support for selecting encryption backend - Add two new module parameters to icp (icp_aes_impl, icp_gcm_impl) that control the crypto implementation. At the moment there is a choice between generic and aesni (on platforms that support it). - This enables support for AES-NI and PCLMULQDQ-NI on AMD Family 15h (bulldozer) and newer CPUs (zen). - Modify aes_key_t to track what implementation it was generated with as key schedules generated with various implementations are not necessarily interchangable. Reviewed by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Richard Laager <rlaager@wiktel.com> Signed-off-by: Nathaniel R. Lewis <linux.robotdude@gmail.com> Closes #7102 Closes #7103 2018-08-02 18:59:24 +00:00			`#if defined(lint) \|\| defined(__lint) /* lint */`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00
			`#include <sys/types.h>`

			`/* ARGSUSED */`
			`void`
			`gcm_mul_pclmulqdq(uint64_t x_in, uint64_t y, uint64_t *res) {`
			`}`

Add support for selecting encryption backend - Add two new module parameters to icp (icp_aes_impl, icp_gcm_impl) that control the crypto implementation. At the moment there is a choice between generic and aesni (on platforms that support it). - This enables support for AES-NI and PCLMULQDQ-NI on AMD Family 15h (bulldozer) and newer CPUs (zen). - Modify aes_key_t to track what implementation it was generated with as key schedules generated with various implementations are not necessarily interchangable. Reviewed by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Richard Laager <rlaager@wiktel.com> Signed-off-by: Nathaniel R. Lewis <linux.robotdude@gmail.com> Closes #7102 Closes #7103 2018-08-02 18:59:24 +00:00			`#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00
			`#define _ASM`
			`#include <sys/asm_linkage.h>`

			`/*`
			`* Use this mask to byte-swap a 16-byte integer with the pshufb instruction`
			`*/`

			`// static uint8_t byte_swap16_mask[] = {`
			`// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6 ,5, 4, 3, 2, 1, 0 };`
[icp] fpu and asm cleanup for linux Properly annotate functions and data section so that objtool does not complain when CONFIG_STACK_VALIDATION and CONFIG_FRAME_POINTER are enabled. Pass KERNELCPPFLAGS to assembler. Use kfpu_begin()/kfpu_end() to protect SIMD regions in Linux kernel. Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Closes #5872 Closes #5041 2017-03-07 20:59:31 +00:00			`.data`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00			`.align XMM_ALIGN`
			`.Lbyte_swap16_mask:`
			`.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0`


			`/*`
			`* void gcm_mul_pclmulqdq(uint64_t x_in, uint64_t y, uint64_t *res);`
			`*`
			`* Perform a carry-less multiplication (that is, use XOR instead of the`
			`* multiply operator) on P1 and P2 and place the result in P3.`
			`*`
			`* Byte swap the input and the output.`
			`*`
			`* Note: x_in, y, and res all point to a block of 20-byte numbers`
			`* (an array of two 64-bit integers).`
			`*`
			`* Note2: For kernel code, caller is responsible for ensuring`
			`* kpreempt_disable() has been called. This is because %xmm registers are`
			`* not saved/restored. Clear and set the CR0.TS bit on entry and exit,`
			`* respectively, if TS is set on entry. Otherwise, if TS is not set,`
			`* save and restore %xmm registers on the stack.`
			`*`
			`* Note3: Original Intel definition:`
			`* void galois_hash_asm(unsigned char hk, unsigned char s,`
			`* unsigned char *d, int length)`
			`*`
			`* Note4: Register/parameter mapping:`
			`* Intel:`
			`* Parameter 1: %rcx (copied to %xmm0) hk or x_in`
			`* Parameter 2: %rdx (copied to %xmm1) s or y`
			`* Parameter 3: %rdi (result) d or res`
			`* OpenSolaris:`
			`* Parameter 1: %rdi (copied to %xmm0) x_in`
			`* Parameter 2: %rsi (copied to %xmm1) y`
			`* Parameter 3: %rdx (result) res`
			`*/`

			`ENTRY_NP(gcm_mul_pclmulqdq)`
			`//`
			`// Copy Parameters`
			`//`
			`movdqu (%rdi), %xmm0 // P1`
			`movdqu (%rsi), %xmm1 // P2`

			`//`
			`// Byte swap 16-byte input`
			`//`
			`lea .Lbyte_swap16_mask(%rip), %rax`
Change movaps to movups in AES-NI code Currently, the ICP contains accelerated assembly code to be used specifically on CPUs with AES-NI enabled. This code makes heavy use of the movaps instruction which assumes that it will be provided aes keys that are 16 byte aligned. This assumption seems to hold on Illumos, but on Linux some kernel options such as 'slub_debug=P' will violate it. This patch changes all instances of this instruction to movups which is the same except that it can handle unaligned memory. This patch also adds a few flags which were accidentally never given to the assembly compiler, resulting in objtool warnings. Reviewed by: Gvozden Neskovic <neskovic@gmail.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-by: Nathaniel R. Lewis <linux.robotdude@gmail.com> Signed-off-by: Tom Caputi <tcaputi@datto.com> Closes #7065 Closes #7108 2018-01-31 23:17:56 +00:00			`movups (%rax), %xmm10`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00			`pshufb %xmm10, %xmm0`
			`pshufb %xmm10, %xmm1`


			`//`
			`// Multiply with the hash key`
			`//`
			`movdqu %xmm0, %xmm3`
			`pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0`

			`movdqu %xmm0, %xmm4`
			`pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1`

			`movdqu %xmm0, %xmm5`
			`pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0`
			`movdqu %xmm0, %xmm6`
			`pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1`

			`pxor %xmm5, %xmm4 // xmm4 holds a0b1 + a1b0`

			`movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5`
			`psrldq $8, %xmm4 // shift by xmm4 64 bits to the right`
			`pslldq $8, %xmm5 // shift by xmm5 64 bits to the left`
			`pxor %xmm5, %xmm3`
			`pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result`
			`// of the carry-less multiplication of`
			`// xmm0 by xmm1.`

			`// We shift the result of the multiplication by one bit position`
			`// to the left to cope for the fact that the bits are reversed.`
			`movdqu %xmm3, %xmm7`
			`movdqu %xmm6, %xmm8`
			`pslld $1, %xmm3`
			`pslld $1, %xmm6`
			`psrld $31, %xmm7`
			`psrld $31, %xmm8`
			`movdqu %xmm7, %xmm9`
			`pslldq $4, %xmm8`
			`pslldq $4, %xmm7`
			`psrldq $12, %xmm9`
			`por %xmm7, %xmm3`
			`por %xmm8, %xmm6`
			`por %xmm9, %xmm6`

			`//`
			`// First phase of the reduction`
			`//`
			`// Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts`
			`// independently.`
			`movdqu %xmm3, %xmm7`
			`movdqu %xmm3, %xmm8`
			`movdqu %xmm3, %xmm9`
			`pslld $31, %xmm7 // packed right shift shifting << 31`
			`pslld $30, %xmm8 // packed right shift shifting << 30`
			`pslld $25, %xmm9 // packed right shift shifting << 25`
			`pxor %xmm8, %xmm7 // xor the shifted versions`
			`pxor %xmm9, %xmm7`
			`movdqu %xmm7, %xmm8`
			`pslldq $12, %xmm7`
			`psrldq $4, %xmm8`
			`pxor %xmm7, %xmm3 // first phase of the reduction complete`

			`//`
			`// Second phase of the reduction`
			`//`
			`// Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these`
			`// shift operations.`
			`movdqu %xmm3, %xmm2`
			`movdqu %xmm3, %xmm4 // packed left shifting >> 1`
			`movdqu %xmm3, %xmm5`
			`psrld $1, %xmm2`
			`psrld $2, %xmm4 // packed left shifting >> 2`
			`psrld $7, %xmm5 // packed left shifting >> 7`
			`pxor %xmm4, %xmm2 // xor the shifted versions`
			`pxor %xmm5, %xmm2`
			`pxor %xmm8, %xmm2`
			`pxor %xmm2, %xmm3`
			`pxor %xmm3, %xmm6 // the result is in xmm6`

			`//`
			`// Byte swap 16-byte result`
			`//`
			`pshufb %xmm10, %xmm6 // %xmm10 has the swap mask`

			`//`
			`// Store the result`
			`//`
			`movdqu %xmm6, (%rdx) // P3`


			`//`
[icp] fpu and asm cleanup for linux Properly annotate functions and data section so that objtool does not complain when CONFIG_STACK_VALIDATION and CONFIG_FRAME_POINTER are enabled. Pass KERNELCPPFLAGS to assembler. Use kfpu_begin()/kfpu_end() to protect SIMD regions in Linux kernel. Reviewed-by: Tom Caputi <tcaputi@datto.com> Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov> Signed-off-by: Gvozden Neskovic <neskovic@gmail.com> Closes #5872 Closes #5041 2017-03-07 20:59:31 +00:00			`// Return`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00			`//`
Fix objtool: missing int3 after ret warning Resolve straight-line speculation warnings reported by objtool for x86_64 assembly on Linux when CONFIG_SLS is set. See the following LWN article for the complete details. https://lwn.net/Articles/877845/ Reviewed-by: Ryan Moeller <ryan@iXsystems.com> Reviewed-by: Alexander Motin <mav@FreeBSD.org> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #13528 Closes #13575 2022-06-20 23:36:21 +00:00			`RET`
Illumos Crypto Port module added to enable native encryption in zfs A port of the Illumos Crypto Framework to a Linux kernel module (found in module/icp). This is needed to do the actual encryption work. We cannot use the Linux kernel's built in crypto api because it is only exported to GPL-licensed modules. Having the ICP also means the crypto code can run on any of the other kernels under OpenZFS. I ended up porting over most of the internals of the framework, which means that porting over other API calls (if we need them) should be fairly easy. Specifically, I have ported over the API functions related to encryption, digests, macs, and crypto templates. The ICP is able to use assembly-accelerated encryption on amd64 machines and AES-NI instructions on Intel chips that support it. There are place-holder directories for similar assembly optimizations for other architectures (although they have not been written). Signed-off-by: Tom Caputi <tcaputi@datto.com> Signed-off-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Issue #4329 2016-05-12 14:51:24 +00:00			`SET_SIZE(gcm_mul_pclmulqdq)`

			`#endif /* lint \|\| __lint */`
icp: mark asm files with noexec stack If there is no explicit note in the .S files, the obj file will mark it as requiring an executable stack. This is unneeded and causes issues on hardened systems. More info: https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart Signed-off-by: Jason Zaman <jason@perfinion.com> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #4947 Closes #4962 2016-08-11 15:59:03 +00:00
			`#ifdef __ELF__`
			`.section .note.GNU-stack,"",%progbits`
			`#endif`