zfs/module/icp/include/aes/aes_impl.h

222 lines
6.5 KiB
C
Raw Normal View History

/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _AES_IMPL_H
#define _AES_IMPL_H
/*
* Common definitions used by AES.
*/
#ifdef __cplusplus
extern "C" {
#endif
#include <sys/zfs_context.h>
#include <sys/crypto/common.h>
#include <sys/asm_linkage.h>
/* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */
#define IS_P2ALIGNED2(v, w, a) \
((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0)
#define AES_BLOCK_LEN 16 /* bytes */
/* Round constant length, in number of 32-bit elements: */
#define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2))
#define AES_COPY_BLOCK(src, dst) \
(dst)[0] = (src)[0]; \
(dst)[1] = (src)[1]; \
(dst)[2] = (src)[2]; \
(dst)[3] = (src)[3]; \
(dst)[4] = (src)[4]; \
(dst)[5] = (src)[5]; \
(dst)[6] = (src)[6]; \
(dst)[7] = (src)[7]; \
(dst)[8] = (src)[8]; \
(dst)[9] = (src)[9]; \
(dst)[10] = (src)[10]; \
(dst)[11] = (src)[11]; \
(dst)[12] = (src)[12]; \
(dst)[13] = (src)[13]; \
(dst)[14] = (src)[14]; \
(dst)[15] = (src)[15]
#define AES_XOR_BLOCK(src, dst) \
(dst)[0] ^= (src)[0]; \
(dst)[1] ^= (src)[1]; \
(dst)[2] ^= (src)[2]; \
(dst)[3] ^= (src)[3]; \
(dst)[4] ^= (src)[4]; \
(dst)[5] ^= (src)[5]; \
(dst)[6] ^= (src)[6]; \
(dst)[7] ^= (src)[7]; \
(dst)[8] ^= (src)[8]; \
(dst)[9] ^= (src)[9]; \
(dst)[10] ^= (src)[10]; \
(dst)[11] ^= (src)[11]; \
(dst)[12] ^= (src)[12]; \
(dst)[13] ^= (src)[13]; \
(dst)[14] ^= (src)[14]; \
(dst)[15] ^= (src)[15]
/* AES key size definitions */
#define AES_MINBITS 128
#define AES_MAXBITS 256
/* AES key schedule may be implemented with 32- or 64-bit elements: */
#define AES_32BIT_KS 32
#define AES_64BIT_KS 64
#define MAX_AES_NR 14 /* Maximum number of rounds */
#define MAX_AES_NB 4 /* Number of columns comprising a state */
typedef union {
#ifdef sun4u
uint64_t ks64[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
#endif
uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)];
} aes_ks_t;
typedef struct aes_impl_ops aes_impl_ops_t;
ICP: Improve AES-GCM performance Currently SIMD accelerated AES-GCM performance is limited by two factors: a. The need to disable preemption and interrupts and save the FPU state before using it and to do the reverse when done. Due to the way the code is organized (see (b) below) we have to pay this price twice for each 16 byte GCM block processed. b. Most processing is done in C, operating on single GCM blocks. The use of SIMD instructions is limited to the AES encryption of the counter block (AES-NI) and the Galois multiplication (PCLMULQDQ). This leads to the FPU not being fully utilized for crypto operations. To solve (a) we do crypto processing in larger chunks while owning the FPU. An `icp_gcm_avx_chunk_size` module parameter was introduced to make this chunk size tweakable. It defaults to 32 KiB. This step alone roughly doubles performance. (b) is tackled by porting and using the highly optimized openssl AES-GCM assembler routines, which do all the processing (CTR, AES, GMULT) in a single routine. Both steps together result in up to 32x reduction of the time spend in the en/decryption routines, leading up to approximately 12x throughput increase for large (128 KiB) blocks. Lastly, this commit changes the default encryption algorithm from AES-CCM to AES-GCM when setting the `encryption=on` property. Reviewed-By: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed-By: Jason King <jason.king@joyent.com> Reviewed-By: Tom Caputi <tcaputi@datto.com> Reviewed-By: Richard Laager <rlaager@wiktel.com> Signed-off-by: Attila Fülöp <attila@fueloep.org> Closes #9749
2020-02-10 20:59:50 +00:00
/*
* The absolute offset of the encr_ks (0) and the nr (504) fields are hard
* coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly).
*/
typedef struct aes_key aes_key_t;
struct aes_key {
aes_ks_t encr_ks; /* encryption key schedule */
aes_ks_t decr_ks; /* decryption key schedule */
#ifdef __amd64
long double align128; /* Align fields above for Intel AES-NI */
#endif /* __amd64 */
const aes_impl_ops_t *ops; /* ops associated with this schedule */
int nr; /* number of rounds (10, 12, or 14) */
int type; /* key schedule size (32 or 64 bits) */
};
/*
* Core AES functions.
* ks and keysched are pointers to aes_key_t.
* They are declared void* as they are intended to be opaque types.
* Use function aes_alloc_keysched() to allocate memory for ks and keysched.
*/
extern void *aes_alloc_keysched(size_t *size, int kmflag);
extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits,
void *keysched);
extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);
extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt);
/*
* AES mode functions.
* The first 2 functions operate on 16-byte AES blocks.
*/
extern void aes_copy_block(uint8_t *in, uint8_t *out);
extern void aes_xor_block(uint8_t *data, uint8_t *dst);
/* Note: ctx is a pointer to aes_ctx_t defined in modes.h */
extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
crypto_data_t *out);
extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length,
crypto_data_t *out);
/*
* The following definitions and declarations are only used by AES FIPS POST
*/
#ifdef _AES_IMPL
typedef enum aes_mech_type {
AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */
AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */
AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */
AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */
AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */
AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */
AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */
} aes_mech_type_t;
#endif /* _AES_IMPL */
/*
* Methods used to define AES implementation
*
* @aes_gen_f Key generation
* @aes_enc_f Function encrypts one block
* @aes_dec_f Function decrypts one block
* @aes_will_work_f Function tests whether method will function
*/
typedef void (*aes_generate_f)(aes_key_t *, const uint32_t *, int);
typedef void (*aes_encrypt_f)(const uint32_t[], int,
const uint32_t[4], uint32_t[4]);
typedef void (*aes_decrypt_f)(const uint32_t[], int,
const uint32_t[4], uint32_t[4]);
typedef boolean_t (*aes_will_work_f)(void);
#define AES_IMPL_NAME_MAX (16)
struct aes_impl_ops {
aes_generate_f generate;
aes_encrypt_f encrypt;
aes_decrypt_f decrypt;
aes_will_work_f is_supported;
boolean_t needs_byteswap;
char name[AES_IMPL_NAME_MAX];
};
extern const aes_impl_ops_t aes_generic_impl;
#if defined(__x86_64)
extern const aes_impl_ops_t aes_x86_64_impl;
/* These functions are used to execute amd64 instructions for AMD or Intel: */
extern ASMABI int rijndael_key_setup_enc_amd64(uint32_t rk[],
const uint32_t cipherKey[], int keyBits);
extern ASMABI int rijndael_key_setup_dec_amd64(uint32_t rk[],
const uint32_t cipherKey[], int keyBits);
extern ASMABI void aes_encrypt_amd64(const uint32_t rk[], int Nr,
const uint32_t pt[4], uint32_t ct[4]);
extern ASMABI void aes_decrypt_amd64(const uint32_t rk[], int Nr,
const uint32_t ct[4], uint32_t pt[4]);
#endif
#if defined(__x86_64) && defined(HAVE_AES)
extern const aes_impl_ops_t aes_aesni_impl;
#endif
/*
* Initializes fastest implementation
*/
void aes_impl_init(void);
/*
Linux 5.0 compat: SIMD compatibility Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This is accomplished by leveraging the fact that by definition dedicated kernel threads never need to concern themselves with saving and restoring the user FPU state. Therefore, they may use the FPU as long as we can guarantee user tasks always restore their FPU state before context switching back to user space. For the 5.0 and 5.1 kernels disabling preemption and local interrupts is sufficient to allow the FPU to be used. All non-kernel threads will restore the preserved user FPU state. For 5.2 and latter kernels the user FPU state restoration will be skipped if the kernel determines the registers have not changed. Therefore, for these kernels we need to perform the additional step of saving and restoring the FPU registers. Invalidating the per-cpu global tracking the FPU state would force a restore but that functionality is private to the core x86 FPU implementation and unavailable. In practice, restricting SIMD to kernel threads is not a major restriction for ZFS. The vast majority of SIMD operations are already performed by the IO pipeline. The remaining cases are relatively infrequent and can be handled by the generic code without significant impact. The two most noteworthy cases are: 1) Decrypting the wrapping key for an encrypted dataset, i.e. `zfs load-key`. All other encryption and decryption operations will use the SIMD optimized implementations. 2) Generating the payload checksums for a `zfs send` stream. In order to avoid making any changes to the higher layers of ZFS all of the `*_get_ops()` functions were updated to take in to consideration the calling context. This allows for the fastest implementation to be used as appropriate (see kfpu_allowed()). The only other notable instance of SIMD operations being used outside a kernel thread was at module load time. This code was moved in to a taskq in order to accommodate the new kernel thread restriction. Finally, a few other modifications were made in order to further harden this code and facilitate testing. They include updating each implementations operations structure to be declared as a constant. And allowing "cycle" to be set when selecting the preferred ops in the kernel as well as user space. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #8754 Closes #8793 Closes #8965
2019-07-12 16:31:20 +00:00
* Returns optimal allowed AES implementation
*/
Linux 5.0 compat: SIMD compatibility Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS, and 5.0 and newer kernels. This is accomplished by leveraging the fact that by definition dedicated kernel threads never need to concern themselves with saving and restoring the user FPU state. Therefore, they may use the FPU as long as we can guarantee user tasks always restore their FPU state before context switching back to user space. For the 5.0 and 5.1 kernels disabling preemption and local interrupts is sufficient to allow the FPU to be used. All non-kernel threads will restore the preserved user FPU state. For 5.2 and latter kernels the user FPU state restoration will be skipped if the kernel determines the registers have not changed. Therefore, for these kernels we need to perform the additional step of saving and restoring the FPU registers. Invalidating the per-cpu global tracking the FPU state would force a restore but that functionality is private to the core x86 FPU implementation and unavailable. In practice, restricting SIMD to kernel threads is not a major restriction for ZFS. The vast majority of SIMD operations are already performed by the IO pipeline. The remaining cases are relatively infrequent and can be handled by the generic code without significant impact. The two most noteworthy cases are: 1) Decrypting the wrapping key for an encrypted dataset, i.e. `zfs load-key`. All other encryption and decryption operations will use the SIMD optimized implementations. 2) Generating the payload checksums for a `zfs send` stream. In order to avoid making any changes to the higher layers of ZFS all of the `*_get_ops()` functions were updated to take in to consideration the calling context. This allows for the fastest implementation to be used as appropriate (see kfpu_allowed()). The only other notable instance of SIMD operations being used outside a kernel thread was at module load time. This code was moved in to a taskq in order to accommodate the new kernel thread restriction. Finally, a few other modifications were made in order to further harden this code and facilitate testing. They include updating each implementations operations structure to be declared as a constant. And allowing "cycle" to be set when selecting the preferred ops in the kernel as well as user space. Reviewed-by: Tony Hutter <hutter2@llnl.gov> Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov> Closes #8754 Closes #8793 Closes #8965
2019-07-12 16:31:20 +00:00
const struct aes_impl_ops *aes_impl_get_ops(void);
#ifdef __cplusplus
}
#endif
#endif /* _AES_IMPL_H */