/*
 * zfs/module/icp/monocypher.c
 * (repository listing metadata: 378 lines, 11 KiB, C)
 */

/*
* Copyright (c) 2017-2019, Loup Vaillant
* All rights reserved.
*
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the
* distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* Monocypher 4.0.2 (Poly1305, Chacha20, and supporting utilities)
* adapted for OpenZFS by Rob Norris <robn@despairlabs.com>
*/
/*
* Note: this follows the Monocypher style rather than the OpenZFS style to
* keep the diff to the bare minimum. This is important for making it easy to
* compare the two and confirm that they are in fact the same. The diff should
* be almost entirely in deleted lines.
*/
#include "monocypher.h"
/////////////////
/// Utilities ///
/////////////////
// FOR_T: counted loop over [start, end) whose index `i` has the given type
// and is declared in the for statement itself. `end` is re-evaluated on
// every iteration.
#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
// FOR: FOR_T with a size_t index.
#define FOR(i, start, end) FOR_T(size_t, i, start, end)
// ZERO: sets the first `size` elements of buf to 0. Expands to a bare for
// statement (no braces, no trailing semicolon), so the caller supplies the
// semicolon and must not rely on it behaving like a single expression.
#define ZERO(buf, size) FOR(_i_, 0, size) (buf)[_i_] = 0
// WIPE_CTX: wipes the whole object that ctx points to.
#define WIPE_CTX(ctx) crypto_wipe(ctx , sizeof(*(ctx)))
// WIPE_BUFFER: wipes a whole array. The size comes from sizeof(buffer), so
// the argument must be an actual array, not a pointer to one.
#define WIPE_BUFFER(buffer) crypto_wipe(buffer, sizeof(buffer))
/*
* OpenZFS: userspace libicp build on Linux will already have MIN/MAX defined
* through sys/types.h -> sys/param.h. Undefine them and let Monocypher use its
* own, in case they change in some important way in the future.
*
* Note: both macros evaluate the selected argument twice, so arguments
* should be free of side effects.
*/
#undef MIN
#undef MAX
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
// Short aliases for the fixed-width integer types used throughout this file.
typedef int8_t i8;
typedef uint8_t u8;
typedef int16_t i16;
typedef uint32_t u32;
typedef int32_t i32;
typedef int64_t i64;
typedef uint64_t u64;
// All-zero buffer. In this file it stands in for the plaintext when
// crypto_chacha20_djb() is given a null plain_text and still has a partial
// final block to produce.
static const u8 zero[128] = {0};
// Returns the smallest non-negative y such that (x + y) % pow_2 == 0,
// i.e. how many more units are needed to reach the next multiple of pow_2
// (0 when x is already a multiple).
// pow_2 must be a power of 2.
// The negation of x is spelled ~x + 1 rather than -x to avoid compiler
// warnings about negating an unsigned value.
static size_t gap(size_t x, size_t pow_2)
{
	const size_t low_bits  = pow_2 - 1;
	const size_t minus_x   = ~x + 1;
	return minus_x & low_bits;
}
static u32 load32_le(const u8 s[4])
{
return
((u32)s[0] << 0) |
((u32)s[1] << 8) |
((u32)s[2] << 16) |
((u32)s[3] << 24);
}
// Decodes 8 bytes, least significant byte first, into a 64-bit word.
static u64 load64_le(const u8 s[8])
{
	const u64 low_word  = load32_le(s);
	const u64 high_word = load32_le(s + 4);
	return (high_word << 32) | low_word;
}
// Encodes a 32-bit word into 4 bytes, least significant byte first.
static void store32_le(u8 out[4], u32 in)
{
	for (unsigned byte = 0; byte < 4; byte++) {
		out[byte] = in & 0xff;
		in >>= 8;
	}
}
// Decodes `size` little-endian 32-bit words from src into dst.
// `size` counts words, not bytes: 4 * size bytes are read from src.
static void load32_le_buf(u32 *dst, const u8 *src, size_t size)
{
	for (size_t word = 0; word < size; word++) {
		dst[word] = load32_le(&src[4 * word]);
	}
}
// Rotates x left by n bits.
// Both shift counts are reduced modulo 32 so that no shift is ever by the
// full width of the type (which is undefined behaviour in C). For the
// counts actually used in this file (7, 8, 12, 16) the result is unchanged;
// n == 0 now returns x instead of being undefined.
static u32 rotl32(u32 x, u32 n)
{
	n &= 31;
	return (x << n) | (x >> ((32 - n) & 31));
}
// Branch-free test for "diff is non-zero".
// Returns 0 when diff == 0 and -1 otherwise (the final u64 -> int
// conversion of the all-ones value yields -1 on the usual two's-complement
// targets).
static int neq0(u64 diff)
{
	// Fold both 32-bit halves together: the result fits in 32 bits and
	// is zero exactly when diff is zero.
	const u64 upper  = diff >> 32;
	const u64 lower  = (u32)diff;
	const u64 folded = upper | lower;
	// folded - 1 only wraps around (setting bit 32 and above) when
	// folded is zero.
	const u64 is_zero = ((folded - 1) >> 32) & 1;
	return is_zero - 1;
}
// Returns a value that is zero if and only if the 16-byte buffers a and b
// hold the same bytes: the XOR differences of both 8-byte halves, OR-ed
// together.
static u64 x16(const u8 a[16], const u8 b[16])
{
	const u64 low_diff  = load64_le(a) ^ load64_le(b);
	const u64 high_diff = load64_le(a + 8) ^ load64_le(b + 8);
	return low_diff | high_diff;
}
int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }
// Overwrites `size` bytes starting at `secret` with zeroes.
// The stores go through a volatile pointer so the compiler has to emit
// them even if the memory is never read again.
void crypto_wipe(void *secret, size_t size)
{
	volatile u8 *bytes = (u8*)secret;
	for (size_t offset = 0; offset < size; offset++) {
		bytes[offset] = 0;
	}
}
/////////////////
/// Chacha 20 ///
/////////////////
// ChaCha20 quarter round: four add / xor / rotate steps (rotations by 16,
// 12, 8 and 7 bits) that update a, b, c and d in place.
// The arguments must be plain lvalues; each one is read and written several
// times. The expansion is a sequence of statements without a trailing
// semicolon, so it must be used as a statement of its own inside a braced
// block.
#define QUARTERROUND(a, b, c, d) \
a += b; d = rotl32(d ^ a, 16); \
c += d; b = rotl32(b ^ c, 12); \
a += b; d = rotl32(d ^ a, 8); \
c += d; b = rotl32(b ^ c, 7)
// Applies the 20 ChaCha20 rounds to the 16-word state `in` and writes the
// permuted words to `out`. The input words are not added back here; the
// callers in this file do that themselves.
static void chacha20_rounds(u32 out[16], const u32 in[16])
{
	// Scalar working copies: upstream measured these as making Chacha20
	// about 10% faster.
	u32 x0  = in[ 0], x1  = in[ 1], x2  = in[ 2], x3  = in[ 3];
	u32 x4  = in[ 4], x5  = in[ 5], x6  = in[ 6], x7  = in[ 7];
	u32 x8  = in[ 8], x9  = in[ 9], x10 = in[10], x11 = in[11];
	u32 x12 = in[12], x13 = in[13], x14 = in[14], x15 = in[15];

	// Each pass does one column round followed by one diagonal round.
	for (unsigned round = 0; round < 20; round += 2) {
		// Column round
		QUARTERROUND(x0, x4, x8 , x12);
		QUARTERROUND(x1, x5, x9 , x13);
		QUARTERROUND(x2, x6, x10, x14);
		QUARTERROUND(x3, x7, x11, x15);
		// Diagonal round
		QUARTERROUND(x0, x5, x10, x15);
		QUARTERROUND(x1, x6, x11, x12);
		QUARTERROUND(x2, x7, x8 , x13);
		QUARTERROUND(x3, x4, x9 , x14);
	}

	out[ 0] = x0;   out[ 1] = x1;   out[ 2] = x2;   out[ 3] = x3;
	out[ 4] = x4;   out[ 5] = x5;   out[ 6] = x6;   out[ 7] = x7;
	out[ 8] = x8;   out[ 9] = x9;   out[10] = x10;  out[11] = x11;
	out[12] = x12;  out[13] = x13;  out[14] = x14;  out[15] = x15;
}
// Constant loaded into state words 0-3 of every ChaCha20 block. Only the
// first 16 bytes are read; the string literal's terminating NUL is unused.
// The pointer itself is const-qualified so it cannot be reassigned.
static const u8 *const chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes
// ChaCha20 with a 64-bit block counter (state words 12-13) and a 64-bit
// nonce (state words 14-15).
//
// Writes text_size bytes to cipher_text: the keystream XOR-ed with
// plain_text, or the raw keystream when plain_text is null.
// Returns the block counter to use for the next call; a partial final
// block counts as a whole block.
static u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text,
                               size_t text_size, const u8 key[32], const u8 nonce[8],
                               u64 ctr)
{
	u32 state[16]; // constant | key | counter | nonce
	u32 block[16]; // result of the rounds for the current block

	load32_le_buf(state     , chacha20_constant, 4);
	load32_le_buf(state +  4, key              , 8);
	state[12] = (u32) ctr;
	state[13] = (u32)(ctr >> 32);
	load32_le_buf(state + 14, nonce            , 2);

	const size_t whole_blocks = text_size >> 6; // 64 bytes per block
	const size_t tail_bytes   = text_size & 63;

	// Whole blocks, one 32-bit word at a time
	for (size_t done = 0; done < whole_blocks; done++) {
		chacha20_rounds(block, state);
		if (plain_text == 0) {
			// No input: emit the keystream itself
			for (size_t w = 0; w < 16; w++) {
				store32_le(cipher_text, block[w] + state[w]);
				cipher_text += 4;
			}
		} else {
			for (size_t w = 0; w < 16; w++) {
				const u32 stream = block[w] + state[w];
				store32_le(cipher_text, stream ^ load32_le(plain_text));
				cipher_text += 4;
				plain_text  += 4;
			}
		}
		// 64-bit counter increment spread over two 32-bit words
		state[12]++;
		if (state[12] == 0) {
			state[13]++;
		}
	}

	// Trailing partial block: generate a full block of keystream in a
	// scratch buffer and use only the bytes that are needed.
	if (tail_bytes != 0) {
		const u8 *source = (plain_text == 0) ? zero : plain_text;
		u8 stream[64];
		chacha20_rounds(block, state);
		for (size_t w = 0; w < 16; w++) {
			store32_le(stream + w*4, block[w] + state[w]);
		}
		for (size_t b = 0; b < tail_bytes; b++) {
			cipher_text[b] = stream[b] ^ source[b];
		}
		WIPE_BUFFER(stream);
	}

	// The state counter was not advanced for the partial block, so
	// account for it here.
	const u64 next_ctr = state[12] + ((u64)state[13] << 32) + (tail_bytes != 0);

	WIPE_BUFFER(block);
	WIPE_BUFFER(state);
	return next_ctr;
}
// ChaCha20 with a 32-bit block counter and a 96-bit nonce, implemented on
// top of crypto_chacha20_djb(): the first 4 nonce bytes become the upper
// half of the 64-bit counter and the remaining 8 bytes are passed on as
// the nonce.
// Returns the low 32 bits of the next block counter.
// Note: because of that layout, a wrap of the 32-bit counter during
// processing carries into the state word that holds the first nonce bytes.
u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text,
                         size_t text_size,
                         const u8 key[32], const u8 nonce[12], u32 ctr)
{
	const u64 nonce_head = load32_le(nonce);
	const u64 wide_ctr   = (nonce_head << 32) + ctr;
	const u64 next_ctr   = crypto_chacha20_djb(cipher_text, plain_text,
	                                           text_size, key, nonce + 4,
	                                           wide_ctr);
	return (u32)next_ctr;
}
/////////////////
/// Poly 1305 ///
/////////////////
// Absorbs nb_blocks 16-byte blocks from `in` into the accumulator:
// for each block c, h = (h + c) * r, partially reduced modulo 2^130 - 5.
//
// `end` is added to the fifth (topmost) limb of every block before the
// multiplication. crypto_poly1305_update() passes 1;
// crypto_poly1305_final() passes 0 because it has already written the
// terminating 1 byte into the padded block itself.
//
// preconditions:
// ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
// ctx->r <= 0ffffffc_0ffffffc_0ffffffc_0fffffff
// end <= 1
// Postcondition:
// ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in,
size_t nb_blocks, unsigned end)
{
// Local all the things!
const u32 r0 = ctx->r[0];
const u32 r1 = ctx->r[1];
const u32 r2 = ctx->r[2];
const u32 r3 = ctx->r[3];
// rrN are the multipliers for products that wrap around past 2^130.
// r1..r3 have their two low bits cleared by crypto_poly1305_init(),
// so (r >> 2) + r equals (r >> 2) * 5 for them. r0 keeps its low bits,
// so they are split off into rr4.
const u32 rr0 = (r0 >> 2) * 5; // lose 2 bits...
const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5
const u32 rr2 = (r2 >> 2) + r2; // rr2 == (r2 >> 2) * 5
const u32 rr3 = (r3 >> 2) + r3; // rr3 == (r3 >> 2) * 5
const u32 rr4 = r0 & 3; // ...recover 2 bits
u32 h0 = ctx->h[0];
u32 h1 = ctx->h[1];
u32 h2 = ctx->h[2];
u32 h3 = ctx->h[3];
u32 h4 = ctx->h[4];
FOR (i, 0, nb_blocks) {
// h + c, without carry propagation
// (each block is read as four little-endian 32-bit words)
const u64 s0 = (u64)h0 + load32_le(in); in += 4;
const u64 s1 = (u64)h1 + load32_le(in); in += 4;
const u64 s2 = (u64)h2 + load32_le(in); in += 4;
const u64 s3 = (u64)h3 + load32_le(in); in += 4;
const u32 s4 = h4 + end;
// (h + c) * r, without carry propagation
const u64 x0 = s0*r0+ s1*rr3+ s2*rr2+ s3*rr1+ s4*rr0;
const u64 x1 = s0*r1+ s1*r0 + s2*rr3+ s3*rr2+ s4*rr1;
const u64 x2 = s0*r2+ s1*r1 + s2*r0 + s3*rr3+ s4*rr2;
const u64 x3 = s0*r3+ s1*r2 + s2*r1 + s3*r0 + s4*rr3;
const u32 x4 = s4*rr4;
// partial reduction modulo 2^130 - 5
// (everything above bit 130, i.e. u5 >> 2, is folded back in times 5)
const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
const u64 u0 = (u5 >> 2) * 5 + (x0 & 0xffffffff);
const u64 u1 = (u0 >> 32) + (x1 & 0xffffffff) + (x0 >> 32);
const u64 u2 = (u1 >> 32) + (x2 & 0xffffffff) + (x1 >> 32);
const u64 u3 = (u2 >> 32) + (x3 & 0xffffffff) + (x2 >> 32);
const u32 u4 = (u3 >> 32) + (u5 & 3); // u4 <= 4
// Update the hash
h0 = u0 & 0xffffffff;
h1 = u1 & 0xffffffff;
h2 = u2 & 0xffffffff;
h3 = u3 & 0xffffffff;
h4 = u4;
}
// Write the accumulator back to the context
ctx->h[0] = h0;
ctx->h[1] = h1;
ctx->h[2] = h2;
ctx->h[3] = h3;
ctx->h[4] = h4;
}
// Prepares ctx for a new Poly1305 computation with the given 32-byte
// one-time key: key[0..15] becomes r (with some bits cleared) and
// key[16..31] becomes the pad added at the very end.
void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
{
	// Accumulator starts at zero, block buffer starts empty
	for (size_t limb = 0; limb < 5; limb++) {
		ctx->h[limb] = 0;
	}
	ctx->c_idx = 0;

	load32_le_buf(ctx->r  , key     , 4);
	load32_le_buf(ctx->pad, key + 16, 4);

	// Clamp r: top 4 bits of every word are cleared, and the bottom
	// 2 bits of every word except the first.
	ctx->r[0] &= 0x0fffffff;
	for (size_t limb = 1; limb < 4; limb++) {
		ctx->r[limb] &= 0x0ffffffc;
	}
}
// Absorbs message_size bytes of message into the running Poly1305 state.
// Bytes that do not fill a whole 16-byte block are kept in ctx->c until a
// later call (or crypto_poly1305_final) completes the block.
void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
                            const u8 *message, size_t message_size)
{
	// Nothing to absorb; this also avoids doing pointer arithmetic on
	// a possibly-null message.
	if (message_size == 0) {
		return;
	}

	// 1. Top up a partially filled block buffer, as far as the input
	//    allows.
	size_t fill = gap(ctx->c_idx, 16);
	if (fill > message_size) {
		fill = message_size;
	}
	for (size_t k = 0; k < fill; k++) {
		ctx->c[ctx->c_idx] = message[k];
		ctx->c_idx++;
	}
	message      += fill;
	message_size -= fill;

	// 2. A full buffer is absorbed right away.
	if (ctx->c_idx == 16) {
		poly_blocks(ctx, ctx->c, 1, 1);
		ctx->c_idx = 0;
	}

	// 3. Absorb whole blocks straight from the caller's memory.
	const size_t whole_blocks = message_size >> 4;
	poly_blocks(ctx, message, whole_blocks, 1);
	message += whole_blocks << 4;

	// 4. Stash what is left (at most 15 bytes, so the buffer is never
	//    completed here).
	const size_t leftover = message_size & 15;
	for (size_t k = 0; k < leftover; k++) {
		ctx->c[ctx->c_idx] = message[k];
		ctx->c_idx++;
	}
}
// Finishes the Poly1305 computation: absorbs any buffered partial block,
// completes the reduction modulo 2^130 - 5, adds the pad, writes the
// 16-byte tag to mac, and wipes ctx.
void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
{
	// Pending partial block: append a single 1 byte right after the
	// data, zero the rest, and absorb it with end == 0. Placing the 1
	// according to the remaining input length adds less than 2^130 to
	// the block.
	if (ctx->c_idx != 0) {
		const size_t used = ctx->c_idx;
		ctx->c[used] = 1;
		for (size_t k = used + 1; k < 16; k++) {
			ctx->c[k] = 0;
		}
		poly_blocks(ctx, ctx->c, 1, 0);
	}

	// Decide whether 2^130 - 5 has to be subtracted by propagating the
	// carry of h + 5 through all five limbs.
	u64 carry = 5;
	for (size_t limb = 0; limb < 4; limb++) {
		carry = (carry + ctx->h[limb]) >> 32;
	}
	carry += ctx->h[4];
	carry  = (carry >> 2) * 5; // shift the carry back to the beginning
	// carry is now 5 times the number of subtractions needed (0 or 1)

	// mac = (h + carry + pad) mod 2^128, written out little-endian
	for (size_t limb = 0; limb < 4; limb++) {
		carry += (u64)ctx->h[limb] + ctx->pad[limb];
		store32_le(mac + limb*4, (u32)carry);
		carry >>= 32;
	}

	WIPE_CTX(ctx);
}