/*
 * Copyright (c) 2017-2019, Loup Vaillant
 * All rights reserved.
 *
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the
 *    distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Monocypher 4.0.2 (Poly1305, Chacha20, and supporting utilities)
 * adapted for OpenZFS by Rob Norris <robn@despairlabs.com>
 */

/*
 * Note: this follows the Monocypher style rather than the OpenZFS style to
 * keep the diff to the bare minimum. This is important for making it easy to
 * compare the two and confirm that they are in fact the same. The diff should
 * be almost entirely in deleted lines.
 */

#include "monocypher.h"

/////////////////
/// Utilities ///
/////////////////
#define FOR_T(type, i, start, end) for (type i = (start); i < (end); i++)
#define FOR(i, start, end)         FOR_T(size_t, i, start, end)
#define ZERO(buf, size)            FOR(_i_, 0, size) (buf)[_i_] = 0
#define WIPE_CTX(ctx)              crypto_wipe(ctx   , sizeof(*(ctx)))
#define WIPE_BUFFER(buffer)        crypto_wipe(buffer, sizeof(buffer))

/*
 * OpenZFS: userspace libicp build on Linux will already have MIN/MAX defined
 * through sys/types.h -> sys/param.h. Undefine them and let Monocypher use its
 * own, in case they change in some important way in the future.
 */
#undef MIN
#undef MAX
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
#define MAX(a, b) ((a) >= (b) ? (a) : (b))

typedef int8_t   i8;
typedef uint8_t  u8;
typedef int16_t  i16;
typedef uint16_t u16;
typedef int32_t  i32;
typedef uint32_t u32;
typedef int64_t  i64;
typedef uint64_t u64;

static const u8 zero[128] = {0};

// returns the smallest non-negative integer y such that
// (x + y) % pow_2 == 0
// Basically, y is the "gap" missing to align x.
// Only works when pow_2 is a power of 2.
// Note: we use ~x+1 instead of -x to avoid compiler warnings
static size_t gap(size_t x, size_t pow_2)
{
	return (~x + 1) & (pow_2 - 1);
}
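// For example, gap(13, 16) == 3, since (13 + 3) % 16 == 0, while
// gap(32, 16) == 0, since 32 is already a multiple of 16.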

static u32 load32_le(const u8 s[4])
{
	return
	    ((u32)s[0] <<  0) |
	    ((u32)s[1] <<  8) |
	    ((u32)s[2] << 16) |
	    ((u32)s[3] << 24);
}

static u64 load64_le(const u8 s[8])
{
	return load32_le(s) | ((u64)load32_le(s+4) << 32);
}

static void store32_le(u8 out[4], u32 in)
{
	out[0] =  in        & 0xff;
	out[1] = (in >>  8) & 0xff;
	out[2] = (in >> 16) & 0xff;
	out[3] = (in >> 24) & 0xff;
}

static void load32_le_buf (u32 *dst, const u8 *src, size_t size) {
	FOR(i, 0, size) { dst[i] = load32_le(src + i*4); }
}

static u32 rotl32(u32 x, u32 n) { return (x << n) ^ (x >> (32 - n)); }
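// For the rotation amounts used in this file (7, 8, 12, 16), the two
// shifted halves never overlap, so the XOR above is equivalent to OR.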

static int neq0(u64 diff)
{
	// constant time comparison to zero
	// return diff != 0 ? -1 : 0
	u64 half = (diff >> 32) | ((u32)diff);
	return (1 & ((half - 1) >> 32)) - 1;
}
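// Why this is branch-free: half fits in 32 bits, so when diff != 0,
// half - 1 cannot borrow past bit 31 and the expression yields -1;
// when diff == 0, half - 1 wraps to all ones and the expression yields 0.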

static u64 x16(const u8 a[16], const u8 b[16])
{
	return (load64_le(a + 0) ^ load64_le(b + 0))
	     | (load64_le(a + 8) ^ load64_le(b + 8));
}
int crypto_verify16(const u8 a[16], const u8 b[16]){ return neq0(x16(a, b)); }

void crypto_wipe(void *secret, size_t size)
{
	volatile u8 *v_secret = (u8*)secret;
	ZERO(v_secret, size);
}

/////////////////
/// Chacha 20 ///
/////////////////
#define QUARTERROUND(a, b, c, d)    \
	a += b;  d = rotl32(d ^ a, 16); \
	c += d;  b = rotl32(b ^ c, 12); \
	a += b;  d = rotl32(d ^ a,  8); \
	c += d;  b = rotl32(b ^ c,  7)

static void chacha20_rounds(u32 out[16], const u32 in[16])
{
	// The temporary variables make Chacha20 10% faster.
	u32 t0  = in[ 0];  u32 t1  = in[ 1];  u32 t2  = in[ 2];  u32 t3  = in[ 3];
	u32 t4  = in[ 4];  u32 t5  = in[ 5];  u32 t6  = in[ 6];  u32 t7  = in[ 7];
	u32 t8  = in[ 8];  u32 t9  = in[ 9];  u32 t10 = in[10];  u32 t11 = in[11];
	u32 t12 = in[12];  u32 t13 = in[13];  u32 t14 = in[14];  u32 t15 = in[15];

	FOR (i, 0, 10) { // 20 rounds, 2 rounds per loop.
		QUARTERROUND(t0, t4, t8 , t12); // column 0
		QUARTERROUND(t1, t5, t9 , t13); // column 1
		QUARTERROUND(t2, t6, t10, t14); // column 2
		QUARTERROUND(t3, t7, t11, t15); // column 3
		QUARTERROUND(t0, t5, t10, t15); // diagonal 0
		QUARTERROUND(t1, t6, t11, t12); // diagonal 1
		QUARTERROUND(t2, t7, t8 , t13); // diagonal 2
		QUARTERROUND(t3, t4, t9 , t14); // diagonal 3
	}
	out[ 0] = t0;   out[ 1] = t1;   out[ 2] = t2;   out[ 3] = t3;
	out[ 4] = t4;   out[ 5] = t5;   out[ 6] = t6;   out[ 7] = t7;
	out[ 8] = t8;   out[ 9] = t9;   out[10] = t10;  out[11] = t11;
	out[12] = t12;  out[13] = t13;  out[14] = t14;  out[15] = t15;
}

static const u8 *chacha20_constant = (const u8*)"expand 32-byte k"; // 16 bytes

static u64 crypto_chacha20_djb(u8 *cipher_text, const u8 *plain_text,
                               size_t text_size, const u8 key[32],
                               const u8 nonce[8], u64 ctr)
{
	u32 input[16];
	load32_le_buf(input     , chacha20_constant, 4);
	load32_le_buf(input +  4, key              , 8);
	load32_le_buf(input + 14, nonce            , 2);
	input[12] = (u32) ctr;
	input[13] = (u32)(ctr >> 32);

	// Whole blocks
	u32    pool[16];
	size_t nb_blocks = text_size >> 6;
	FOR (i, 0, nb_blocks) {
		chacha20_rounds(pool, input);
		if (plain_text != 0) {
			FOR (j, 0, 16) {
				u32 p = pool[j] + input[j];
				store32_le(cipher_text, p ^ load32_le(plain_text));
				cipher_text += 4;
				plain_text  += 4;
			}
		} else {
			FOR (j, 0, 16) {
				u32 p = pool[j] + input[j];
				store32_le(cipher_text, p);
				cipher_text += 4;
			}
		}
		input[12]++;
		if (input[12] == 0) {
			input[13]++;
		}
	}
	text_size &= 63;

	// Last (incomplete) block
	if (text_size > 0) {
		if (plain_text == 0) {
			plain_text = zero;
		}
		chacha20_rounds(pool, input);
		u8 tmp[64];
		FOR (i, 0, 16) {
			store32_le(tmp + i*4, pool[i] + input[i]);
		}
		FOR (i, 0, text_size) {
			cipher_text[i] = tmp[i] ^ plain_text[i];
		}
		WIPE_BUFFER(tmp);
	}
	ctr = input[12] + ((u64)input[13] << 32) + (text_size > 0);

	WIPE_BUFFER(pool);
	WIPE_BUFFER(input);
	return ctr;
}
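// The returned value is the counter to use for the next block: a long
// stream can be processed in successive calls by passing each call's
// return value as the ctr of the next, provided every chunk but the
// last is a multiple of 64 bytes.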

u32 crypto_chacha20_ietf(u8 *cipher_text, const u8 *plain_text,
                         size_t text_size,
                         const u8 key[32], const u8 nonce[12], u32 ctr)
{
	u64 big_ctr = ctr + ((u64)load32_le(nonce) << 32);
	return (u32)crypto_chacha20_djb(cipher_text, plain_text, text_size,
	                                key, nonce + 4, big_ctr);
}
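/*
 * Illustrative sketch (not part of the library): encrypting a buffer in
 * place with the IETF variant. key, nonce, and msg are placeholder names;
 * a real caller must never reuse a nonce with the same key.
 *
 *     u8 key[32]   = { ... };  // 32-byte secret key
 *     u8 nonce[12] = { ... };  // unique per message
 *     u8 msg[]     = "attack at dawn";
 *     crypto_chacha20_ietf(msg, msg, sizeof(msg) - 1, key, nonce, 0);
 *     // msg now holds the ciphertext; the same call decrypts it again.
 */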

/////////////////
/// Poly 1305 ///
/////////////////

// h = (h + c) * r
// preconditions:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
//   ctx->r <=   0ffffffc_0ffffffc_0ffffffc_0fffffff
//   end    <= 1
// Postcondition:
//   ctx->h <= 4_ffffffff_ffffffff_ffffffff_ffffffff
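//
// Note on the arithmetic: the hash is kept modulo 2^130 - 5, and
// 2^130 == 5 (mod 2^130 - 5). Bits that overflow past 2^130 during the
// multiplication can therefore be folded back in after multiplying them
// by 5; the rr* constants below precompute (r >> 2) * 5 for that purpose.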
static void poly_blocks(crypto_poly1305_ctx *ctx, const u8 *in,
                        size_t nb_blocks, unsigned end)
{
	// Local all the things!
	const u32 r0  = ctx->r[0];
	const u32 r1  = ctx->r[1];
	const u32 r2  = ctx->r[2];
	const u32 r3  = ctx->r[3];
	const u32 rr0 = (r0 >> 2) * 5;  // lose 2 bits...
	const u32 rr1 = (r1 >> 2) + r1; // rr1 == (r1 >> 2) * 5
	const u32 rr2 = (r2 >> 2) + r2; // rr2 == (r2 >> 2) * 5
	const u32 rr3 = (r3 >> 2) + r3; // rr3 == (r3 >> 2) * 5
	const u32 rr4 = r0 & 3;         // ...recover 2 bits
	u32 h0 = ctx->h[0];
	u32 h1 = ctx->h[1];
	u32 h2 = ctx->h[2];
	u32 h3 = ctx->h[3];
	u32 h4 = ctx->h[4];

	FOR (i, 0, nb_blocks) {
		// h + c, without carry propagation
		const u64 s0 = (u64)h0 + load32_le(in);  in += 4;
		const u64 s1 = (u64)h1 + load32_le(in);  in += 4;
		const u64 s2 = (u64)h2 + load32_le(in);  in += 4;
		const u64 s3 = (u64)h3 + load32_le(in);  in += 4;
		const u32 s4 =      h4 + end;

		// (h + c) * r, without carry propagation
		const u64 x0 = s0*r0 + s1*rr3 + s2*rr2 + s3*rr1 + s4*rr0;
		const u64 x1 = s0*r1 + s1*r0  + s2*rr3 + s3*rr2 + s4*rr1;
		const u64 x2 = s0*r2 + s1*r1  + s2*r0  + s3*rr3 + s4*rr2;
		const u64 x3 = s0*r3 + s1*r2  + s2*r1  + s3*r0  + s4*rr3;
		const u32 x4 = s4 * rr4;

		// partial reduction modulo 2^130 - 5
		const u32 u5 = x4 + (x3 >> 32); // u5 <= 7ffffff5
		const u64 u0 = (u5 >>  2) * 5 + (x0 & 0xffffffff);
		const u64 u1 = (u0 >> 32)     + (x1 & 0xffffffff) + (x0 >> 32);
		const u64 u2 = (u1 >> 32)     + (x2 & 0xffffffff) + (x1 >> 32);
		const u64 u3 = (u2 >> 32)     + (x3 & 0xffffffff) + (x2 >> 32);
		const u32 u4 = (u3 >> 32)     + (u5 & 3); // u4 <= 4

		// Update the hash
		h0 = u0 & 0xffffffff;
		h1 = u1 & 0xffffffff;
		h2 = u2 & 0xffffffff;
		h3 = u3 & 0xffffffff;
		h4 = u4;
	}
	ctx->h[0] = h0;
	ctx->h[1] = h1;
	ctx->h[2] = h2;
	ctx->h[3] = h3;
	ctx->h[4] = h4;
}

void crypto_poly1305_init(crypto_poly1305_ctx *ctx, const u8 key[32])
{
	ZERO(ctx->h, 5); // Initial hash is zero
	ctx->c_idx = 0;
	// load r and pad (r has some of its bits cleared)
	load32_le_buf(ctx->r  , key   , 4);
	load32_le_buf(ctx->pad, key+16, 4);
	FOR (i, 0, 1) { ctx->r[i] &= 0x0fffffff; }
	FOR (i, 1, 4) { ctx->r[i] &= 0x0ffffffc; }
}
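// The masks above implement standard Poly1305 "clamping": the top four
// bits of each 32-bit word of r are cleared (bounding the products in
// poly_blocks to 64 bits), and the low two bits of r[1..3] are cleared
// (making the (r >> 2) * 5 trick exact).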

void crypto_poly1305_update(crypto_poly1305_ctx *ctx,
                            const u8 *message, size_t message_size)
{
	// Avoid undefined NULL pointer increments with empty messages
	if (message_size == 0) {
		return;
	}

	// Align ourselves with block boundaries
	size_t aligned = MIN(gap(ctx->c_idx, 16), message_size);
	FOR (i, 0, aligned) {
		ctx->c[ctx->c_idx] = *message;
		ctx->c_idx++;
		message++;
		message_size--;
	}

	// If block is complete, process it
	if (ctx->c_idx == 16) {
		poly_blocks(ctx, ctx->c, 1, 1);
		ctx->c_idx = 0;
	}

	// Process the message block by block
	size_t nb_blocks = message_size >> 4;
	poly_blocks(ctx, message, nb_blocks, 1);
	message      += nb_blocks << 4;
	message_size &= 15;

	// remaining bytes (we never complete a block here)
	FOR (i, 0, message_size) {
		ctx->c[ctx->c_idx] = message[i];
		ctx->c_idx++;
	}
}

void crypto_poly1305_final(crypto_poly1305_ctx *ctx, u8 mac[16])
{
	// Process the last block (if any)
	// We move the final 1 according to remaining input length
	// (this will add less than 2^130 to the last input block)
	if (ctx->c_idx != 0) {
		ZERO(ctx->c + ctx->c_idx, 16 - ctx->c_idx);
		ctx->c[ctx->c_idx] = 1;
		poly_blocks(ctx, ctx->c, 1, 0);
	}

	// check if we should subtract 2^130-5 by performing the
	// corresponding carry propagation.
	u64 c = 5;
	FOR (i, 0, 4) {
		c  += ctx->h[i];
		c >>= 32;
	}
	c += ctx->h[4];
	c  = (c >> 2) * 5; // shift the carry back to the beginning
	// c now indicates how many times we should subtract 2^130-5 (0 or 1)
	FOR (i, 0, 4) {
		c += (u64)ctx->h[i] + ctx->pad[i];
		store32_le(mac + i*4, (u32)c);
		c = c >> 32;
	}
	WIPE_CTX(ctx);
}
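/*
 * Illustrative sketch (not part of the library): a one-shot Poly1305 tag.
 * one_time_key, msg, and msg_size are placeholder names. The 32-byte key
 * must be a fresh one-time key for every message, e.g. derived from the
 * first Chacha20 block as in RFC 8439.
 *
 *     crypto_poly1305_ctx ctx;
 *     u8 mac[16];
 *     crypto_poly1305_init  (&ctx, one_time_key);
 *     crypto_poly1305_update(&ctx, msg, msg_size);
 *     crypto_poly1305_final (&ctx, mac); // also wipes ctx
 */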