Introduce BLAKE3 checksums as an OpenZFS feature
This commit adds BLAKE3 checksums to OpenZFS, it has similar
performance to Edon-R, but without the caveats around the latter.
Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3
Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3
Short description of Wikipedia:
BLAKE3 is a cryptographic hash function based on Bao and BLAKE2,
created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and
Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real
World Crypto. BLAKE3 is a single algorithm with many desirable
features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE
and BLAKE2, which are algorithm families with multiple variants.
BLAKE3 has a binary tree structure, so it supports a practically
unlimited degree of parallelism (both SIMD and multithreading) given
enough input. The official Rust and C implementations are
dual-licensed as public domain (CC0) and the Apache License.
Along with adding the BLAKE3 hash into the OpenZFS infrastructure a
new benchmarking file called chksum_bench was introduced. When read
it reports the speed of the available checksum functions.
On Linux: cat /proc/spl/kstat/zfs/chksum_bench
On FreeBSD: sysctl kstat.zfs.misc.chksum_bench
This is an example output of an i3-1005G1 test system with Debian 11:
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1196 1602 1761 1749 1762 1759 1751
skein-generic 546 591 608 615 619 612 616
sha256-generic 240 300 316 314 304 285 276
sha512-generic 353 441 467 476 472 467 426
blake3-generic 308 313 313 313 312 313 312
blake3-sse2 402 1289 1423 1446 1432 1458 1413
blake3-sse41 427 1470 1625 1704 1679 1607 1629
blake3-avx2 428 1920 3095 3343 3356 3318 3204
blake3-avx512 473 2687 4905 5836 5844 5643 5374
Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1840 2458 2665 2719 2711 2723 2693
skein-generic 870 966 996 992 1003 1005 1009
sha256-generic 415 442 453 455 457 457 457
sha512-generic 608 690 711 718 719 720 721
blake3-generic 301 313 311 309 309 310 310
blake3-sse2 343 1865 2124 2188 2180 2181 2186
blake3-sse41 364 2091 2396 2509 2463 2482 2488
blake3-avx2 365 2590 4399 4971 4915 4802 4764
Output on Debian 5.10.0-9-powerpc64le system: (POWER 9)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1213 1703 1889 1918 1957 1902 1907
skein-generic 434 492 520 522 511 525 525
sha256-generic 167 183 187 188 188 187 188
sha512-generic 186 216 222 221 225 224 224
blake3-generic 153 152 154 153 151 153 153
blake3-sse2 391 1170 1366 1406 1428 1426 1414
blake3-sse41 352 1049 1212 1174 1262 1258 1259
Output on Debian 5.10.0-11-arm64 system: (Pi400)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 487 603 629 639 643 641 641
skein-generic 271 299 303 308 309 309 307
sha256-generic 117 127 128 130 130 129 130
sha512-generic 145 165 170 172 173 174 175
blake3-generic 81 29 71 89 89 89 89
blake3-sse2 112 323 368 379 380 371 374
blake3-sse41 101 315 357 368 369 364 360
Structurally, the new code is mainly split into these parts:
- 1x cross platform generic c variant: blake3_generic.c
- 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512)
- 2x assembly for ARMv8 (NEON converted from SSE2)
- 2x assembly for PPC64-LE (POWER8 converted from SSE2)
- one file for switching between the implementations
Note the PPC64 assembly requires the VSX instruction set and the
kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly.
Reviewed-by: Felix Dörre <felix@dogcraft.de>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Co-authored-by: Rich Ercolani <rincebrain@gmail.com>
Closes #10058
Closes #12918
2022-06-08 22:55:57 +00:00
|
|
|
/*
|
|
|
|
* CDDL HEADER START
|
|
|
|
*
|
|
|
|
* The contents of this file are subject to the terms of the
|
|
|
|
* Common Development and Distribution License (the "License").
|
|
|
|
* You may not use this file except in compliance with the License.
|
|
|
|
*
|
|
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
|
|
* See the License for the specific language governing permissions
|
|
|
|
* and limitations under the License.
|
|
|
|
*
|
|
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
|
|
*
|
|
|
|
* CDDL HEADER END
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
|
|
|
|
* Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
|
|
|
|
* Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
|
|
|
|
*/
|
|
|
|
|
|
|
|
#if defined(HAVE_SSE2)
|
|
|
|
|
|
|
|
#define _ASM
|
|
|
|
#include <sys/asm_linkage.h>
|
|
|
|
|
|
|
|
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
|
|
|
#if __has_include(<cet.h>)
|
|
|
|
#include <cet.h>
|
|
|
|
#endif
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#if !defined(_CET_ENDBR)
|
|
|
|
#define _CET_ENDBR
|
|
|
|
#endif
|
|
|
|
|
|
|
|
.intel_syntax noprefix
|
|
|
|
.global zfs_blake3_hash_many_sse2
|
|
|
|
.global zfs_blake3_compress_in_place_sse2
|
|
|
|
.global zfs_blake3_compress_xof_sse2
|
|
|
|
|
|
|
|
.text
|
|
|
|
.type zfs_blake3_hash_many_sse2,@function
|
|
|
|
.type zfs_blake3_compress_in_place_sse2,@function
|
|
|
|
.type zfs_blake3_compress_xof_sse2,@function
|
|
|
|
|
|
|
|
.p2align 6
|
|
|
|
zfs_blake3_hash_many_sse2:
|
|
|
|
_CET_ENDBR
|
|
|
|
push r15
|
|
|
|
push r14
|
|
|
|
push r13
|
|
|
|
push r12
|
|
|
|
push rbx
|
|
|
|
push rbp
|
|
|
|
mov rbp, rsp
|
|
|
|
sub rsp, 360
|
|
|
|
and rsp, 0xFFFFFFFFFFFFFFC0
|
|
|
|
neg r9d
|
|
|
|
movd xmm0, r9d
|
|
|
|
pshufd xmm0, xmm0, 0x00
|
|
|
|
movdqa xmmword ptr [rsp+0x130], xmm0
|
|
|
|
movdqa xmm1, xmm0
|
|
|
|
pand xmm1, xmmword ptr [ADD0+rip]
|
|
|
|
pand xmm0, xmmword ptr [ADD1+rip]
|
|
|
|
movdqa xmmword ptr [rsp+0x150], xmm0
|
|
|
|
movd xmm0, r8d
|
|
|
|
pshufd xmm0, xmm0, 0x00
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
movdqa xmmword ptr [rsp+0x110], xmm0
|
|
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
|
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
|
|
|
pcmpgtd xmm1, xmm0
|
|
|
|
shr r8, 32
|
|
|
|
movd xmm2, r8d
|
|
|
|
pshufd xmm2, xmm2, 0x00
|
|
|
|
psubd xmm2, xmm1
|
|
|
|
movdqa xmmword ptr [rsp+0x120], xmm2
|
|
|
|
mov rbx, qword ptr [rbp+0x50]
|
|
|
|
mov r15, rdx
|
|
|
|
shl r15, 6
|
|
|
|
movzx r13d, byte ptr [rbp+0x38]
|
|
|
|
movzx r12d, byte ptr [rbp+0x48]
|
|
|
|
cmp rsi, 4
|
|
|
|
jc 3f
|
|
|
|
2:
|
|
|
|
movdqu xmm3, xmmword ptr [rcx]
|
|
|
|
pshufd xmm0, xmm3, 0x00
|
|
|
|
pshufd xmm1, xmm3, 0x55
|
|
|
|
pshufd xmm2, xmm3, 0xAA
|
|
|
|
pshufd xmm3, xmm3, 0xFF
|
|
|
|
movdqu xmm7, xmmword ptr [rcx+0x10]
|
|
|
|
pshufd xmm4, xmm7, 0x00
|
|
|
|
pshufd xmm5, xmm7, 0x55
|
|
|
|
pshufd xmm6, xmm7, 0xAA
|
|
|
|
pshufd xmm7, xmm7, 0xFF
|
|
|
|
mov r8, qword ptr [rdi]
|
|
|
|
mov r9, qword ptr [rdi+0x8]
|
|
|
|
mov r10, qword ptr [rdi+0x10]
|
|
|
|
mov r11, qword ptr [rdi+0x18]
|
|
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
|
|
or eax, r13d
|
|
|
|
xor edx, edx
|
|
|
|
9:
|
|
|
|
mov r14d, eax
|
|
|
|
or eax, r12d
|
|
|
|
add rdx, 64
|
|
|
|
cmp rdx, r15
|
|
|
|
cmovne eax, r14d
|
|
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
|
|
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
|
|
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
|
|
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
|
|
|
|
movdqa xmm12, xmm8
|
|
|
|
punpckldq xmm8, xmm9
|
|
|
|
punpckhdq xmm12, xmm9
|
|
|
|
movdqa xmm14, xmm10
|
|
|
|
punpckldq xmm10, xmm11
|
|
|
|
punpckhdq xmm14, xmm11
|
|
|
|
movdqa xmm9, xmm8
|
|
|
|
punpcklqdq xmm8, xmm10
|
|
|
|
punpckhqdq xmm9, xmm10
|
|
|
|
movdqa xmm13, xmm12
|
|
|
|
punpcklqdq xmm12, xmm14
|
|
|
|
punpckhqdq xmm13, xmm14
|
|
|
|
movdqa xmmword ptr [rsp], xmm8
|
|
|
|
movdqa xmmword ptr [rsp+0x10], xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x20], xmm12
|
|
|
|
movdqa xmmword ptr [rsp+0x30], xmm13
|
|
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
|
|
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
|
|
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
|
|
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
|
|
|
|
movdqa xmm12, xmm8
|
|
|
|
punpckldq xmm8, xmm9
|
|
|
|
punpckhdq xmm12, xmm9
|
|
|
|
movdqa xmm14, xmm10
|
|
|
|
punpckldq xmm10, xmm11
|
|
|
|
punpckhdq xmm14, xmm11
|
|
|
|
movdqa xmm9, xmm8
|
|
|
|
punpcklqdq xmm8, xmm10
|
|
|
|
punpckhqdq xmm9, xmm10
|
|
|
|
movdqa xmm13, xmm12
|
|
|
|
punpcklqdq xmm12, xmm14
|
|
|
|
punpckhqdq xmm13, xmm14
|
|
|
|
movdqa xmmword ptr [rsp+0x40], xmm8
|
|
|
|
movdqa xmmword ptr [rsp+0x50], xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x60], xmm12
|
|
|
|
movdqa xmmword ptr [rsp+0x70], xmm13
|
|
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
|
|
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
|
|
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
|
|
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
|
|
|
|
movdqa xmm12, xmm8
|
|
|
|
punpckldq xmm8, xmm9
|
|
|
|
punpckhdq xmm12, xmm9
|
|
|
|
movdqa xmm14, xmm10
|
|
|
|
punpckldq xmm10, xmm11
|
|
|
|
punpckhdq xmm14, xmm11
|
|
|
|
movdqa xmm9, xmm8
|
|
|
|
punpcklqdq xmm8, xmm10
|
|
|
|
punpckhqdq xmm9, xmm10
|
|
|
|
movdqa xmm13, xmm12
|
|
|
|
punpcklqdq xmm12, xmm14
|
|
|
|
punpckhqdq xmm13, xmm14
|
|
|
|
movdqa xmmword ptr [rsp+0x80], xmm8
|
|
|
|
movdqa xmmword ptr [rsp+0x90], xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0xA0], xmm12
|
|
|
|
movdqa xmmword ptr [rsp+0xB0], xmm13
|
|
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
|
|
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
|
|
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
|
|
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
|
|
|
|
movdqa xmm12, xmm8
|
|
|
|
punpckldq xmm8, xmm9
|
|
|
|
punpckhdq xmm12, xmm9
|
|
|
|
movdqa xmm14, xmm10
|
|
|
|
punpckldq xmm10, xmm11
|
|
|
|
punpckhdq xmm14, xmm11
|
|
|
|
movdqa xmm9, xmm8
|
|
|
|
punpcklqdq xmm8, xmm10
|
|
|
|
punpckhqdq xmm9, xmm10
|
|
|
|
movdqa xmm13, xmm12
|
|
|
|
punpcklqdq xmm12, xmm14
|
|
|
|
punpckhqdq xmm13, xmm14
|
|
|
|
movdqa xmmword ptr [rsp+0xC0], xmm8
|
|
|
|
movdqa xmmword ptr [rsp+0xD0], xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0xE0], xmm12
|
|
|
|
movdqa xmmword ptr [rsp+0xF0], xmm13
|
|
|
|
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
|
|
|
|
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
|
|
|
|
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
|
|
|
|
movdqa xmm12, xmmword ptr [rsp+0x110]
|
|
|
|
movdqa xmm13, xmmword ptr [rsp+0x120]
|
|
|
|
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
|
|
|
|
movd xmm15, eax
|
|
|
|
pshufd xmm15, xmm15, 0x00
|
|
|
|
prefetcht0 [r8+rdx+0x80]
|
|
|
|
prefetcht0 [r9+rdx+0x80]
|
|
|
|
prefetcht0 [r10+rdx+0x80]
|
|
|
|
prefetcht0 [r11+rdx+0x80]
|
|
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xB0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x10]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xF0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x90]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm1, xmm5
|
|
|
|
paddd xmm2, xmm6
|
|
|
|
paddd xmm3, xmm7
|
|
|
|
pxor xmm12, xmm0
|
|
|
|
pxor xmm13, xmm1
|
|
|
|
pxor xmm14, xmm2
|
|
|
|
pxor xmm15, xmm3
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
paddd xmm9, xmm13
|
|
|
|
paddd xmm10, xmm14
|
|
|
|
paddd xmm11, xmm15
|
|
|
|
pxor xmm4, xmm8
|
|
|
|
pxor xmm5, xmm9
|
|
|
|
pxor xmm6, xmm10
|
|
|
|
pxor xmm7, xmm11
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xE0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x30]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm5, 20
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm6, 20
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm7, 20
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 12
|
|
|
|
pslld xmm4, 20
|
|
|
|
por xmm4, xmm8
|
|
|
|
paddd xmm0, xmmword ptr [rsp+0xA0]
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
|
|
paddd xmm2, xmmword ptr [rsp+0x40]
|
|
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm1, xmm6
|
|
|
|
paddd xmm2, xmm7
|
|
|
|
paddd xmm3, xmm4
|
|
|
|
pxor xmm15, xmm0
|
|
|
|
pxor xmm12, xmm1
|
|
|
|
pxor xmm13, xmm2
|
|
|
|
pxor xmm14, xmm3
|
|
|
|
movdqa xmm8, xmm15
|
|
|
|
psrld xmm15, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm15, xmm8
|
|
|
|
movdqa xmm8, xmm12
|
|
|
|
psrld xmm12, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm12, xmm8
|
|
|
|
movdqa xmm8, xmm13
|
|
|
|
psrld xmm13, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm13, xmm8
|
|
|
|
movdqa xmm8, xmm14
|
|
|
|
psrld xmm14, 8
|
|
|
|
pslld xmm8, 24
|
|
|
|
pxor xmm14, xmm8
|
|
|
|
paddd xmm10, xmm15
|
|
|
|
paddd xmm11, xmm12
|
|
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
paddd xmm9, xmm14
|
|
|
|
pxor xmm5, xmm10
|
|
|
|
pxor xmm6, xmm11
|
|
|
|
pxor xmm7, xmm8
|
|
|
|
pxor xmm4, xmm9
|
|
|
|
pxor xmm0, xmm8
|
|
|
|
pxor xmm1, xmm9
|
|
|
|
pxor xmm2, xmm10
|
|
|
|
pxor xmm3, xmm11
|
|
|
|
movdqa xmm8, xmm5
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm5, 25
|
|
|
|
por xmm5, xmm8
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm6, 25
|
|
|
|
por xmm6, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm7, 25
|
|
|
|
por xmm7, xmm8
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
psrld xmm8, 7
|
|
|
|
pslld xmm4, 25
|
|
|
|
por xmm4, xmm8
|
|
|
|
pxor xmm4, xmm12
|
|
|
|
pxor xmm5, xmm13
|
|
|
|
pxor xmm6, xmm14
|
|
|
|
pxor xmm7, xmm15
|
|
|
|
mov eax, r13d
|
|
|
|
jne 9b
|
|
|
|
movdqa xmm9, xmm0
|
|
|
|
punpckldq xmm0, xmm1
|
|
|
|
punpckhdq xmm9, xmm1
|
|
|
|
movdqa xmm11, xmm2
|
|
|
|
punpckldq xmm2, xmm3
|
|
|
|
punpckhdq xmm11, xmm3
|
|
|
|
movdqa xmm1, xmm0
|
|
|
|
punpcklqdq xmm0, xmm2
|
|
|
|
punpckhqdq xmm1, xmm2
|
|
|
|
movdqa xmm3, xmm9
|
|
|
|
punpcklqdq xmm9, xmm11
|
|
|
|
punpckhqdq xmm3, xmm11
|
|
|
|
movdqu xmmword ptr [rbx], xmm0
|
|
|
|
movdqu xmmword ptr [rbx+0x20], xmm1
|
|
|
|
movdqu xmmword ptr [rbx+0x40], xmm9
|
|
|
|
movdqu xmmword ptr [rbx+0x60], xmm3
|
|
|
|
movdqa xmm9, xmm4
|
|
|
|
punpckldq xmm4, xmm5
|
|
|
|
punpckhdq xmm9, xmm5
|
|
|
|
movdqa xmm11, xmm6
|
|
|
|
punpckldq xmm6, xmm7
|
|
|
|
punpckhdq xmm11, xmm7
|
|
|
|
movdqa xmm5, xmm4
|
|
|
|
punpcklqdq xmm4, xmm6
|
|
|
|
punpckhqdq xmm5, xmm6
|
|
|
|
movdqa xmm7, xmm9
|
|
|
|
punpcklqdq xmm9, xmm11
|
|
|
|
punpckhqdq xmm7, xmm11
|
|
|
|
movdqu xmmword ptr [rbx+0x10], xmm4
|
|
|
|
movdqu xmmword ptr [rbx+0x30], xmm5
|
|
|
|
movdqu xmmword ptr [rbx+0x50], xmm9
|
|
|
|
movdqu xmmword ptr [rbx+0x70], xmm7
|
|
|
|
movdqa xmm1, xmmword ptr [rsp+0x110]
|
|
|
|
movdqa xmm0, xmm1
|
|
|
|
paddd xmm1, xmmword ptr [rsp+0x150]
|
|
|
|
movdqa xmmword ptr [rsp+0x110], xmm1
|
|
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
|
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
|
|
|
pcmpgtd xmm0, xmm1
|
|
|
|
movdqa xmm1, xmmword ptr [rsp+0x120]
|
|
|
|
psubd xmm1, xmm0
|
|
|
|
movdqa xmmword ptr [rsp+0x120], xmm1
|
|
|
|
add rbx, 128
|
|
|
|
add rdi, 32
|
|
|
|
sub rsi, 4
|
|
|
|
cmp rsi, 4
|
|
|
|
jnc 2b
|
|
|
|
test rsi, rsi
|
|
|
|
jnz 3f
|
|
|
|
4:
|
|
|
|
mov rsp, rbp
|
|
|
|
pop rbp
|
|
|
|
pop rbx
|
|
|
|
pop r12
|
|
|
|
pop r13
|
|
|
|
pop r14
|
|
|
|
pop r15
|
2022-06-20 23:36:21 +00:00
|
|
|
RET
|
Introduce BLAKE3 checksums as an OpenZFS feature
This commit adds BLAKE3 checksums to OpenZFS, it has similar
performance to Edon-R, but without the caveats around the latter.
Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3
Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3
Short description of Wikipedia:
BLAKE3 is a cryptographic hash function based on Bao and BLAKE2,
created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and
Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real
World Crypto. BLAKE3 is a single algorithm with many desirable
features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE
and BLAKE2, which are algorithm families with multiple variants.
BLAKE3 has a binary tree structure, so it supports a practically
unlimited degree of parallelism (both SIMD and multithreading) given
enough input. The official Rust and C implementations are
dual-licensed as public domain (CC0) and the Apache License.
Along with adding the BLAKE3 hash into the OpenZFS infrastructure a
new benchmarking file called chksum_bench was introduced. When read
it reports the speed of the available checksum functions.
On Linux: cat /proc/spl/kstat/zfs/chksum_bench
On FreeBSD: sysctl kstat.zfs.misc.chksum_bench
This is an example output of an i3-1005G1 test system with Debian 11:
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1196 1602 1761 1749 1762 1759 1751
skein-generic 546 591 608 615 619 612 616
sha256-generic 240 300 316 314 304 285 276
sha512-generic 353 441 467 476 472 467 426
blake3-generic 308 313 313 313 312 313 312
blake3-sse2 402 1289 1423 1446 1432 1458 1413
blake3-sse41 427 1470 1625 1704 1679 1607 1629
blake3-avx2 428 1920 3095 3343 3356 3318 3204
blake3-avx512 473 2687 4905 5836 5844 5643 5374
Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1840 2458 2665 2719 2711 2723 2693
skein-generic 870 966 996 992 1003 1005 1009
sha256-generic 415 442 453 455 457 457 457
sha512-generic 608 690 711 718 719 720 721
blake3-generic 301 313 311 309 309 310 310
blake3-sse2 343 1865 2124 2188 2180 2181 2186
blake3-sse41 364 2091 2396 2509 2463 2482 2488
blake3-avx2 365 2590 4399 4971 4915 4802 4764
Output on Debian 5.10.0-9-powerpc64le system: (POWER 9)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1213 1703 1889 1918 1957 1902 1907
skein-generic 434 492 520 522 511 525 525
sha256-generic 167 183 187 188 188 187 188
sha512-generic 186 216 222 221 225 224 224
blake3-generic 153 152 154 153 151 153 153
blake3-sse2 391 1170 1366 1406 1428 1426 1414
blake3-sse41 352 1049 1212 1174 1262 1258 1259
Output on Debian 5.10.0-11-arm64 system: (Pi400)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 487 603 629 639 643 641 641
skein-generic 271 299 303 308 309 309 307
sha256-generic 117 127 128 130 130 129 130
sha512-generic 145 165 170 172 173 174 175
blake3-generic 81 29 71 89 89 89 89
blake3-sse2 112 323 368 379 380 371 374
blake3-sse41 101 315 357 368 369 364 360
Structurally, the new code is mainly split into these parts:
- 1x cross platform generic c variant: blake3_generic.c
- 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512)
- 2x assembly for ARMv8 (NEON converted from SSE2)
- 2x assembly for PPC64-LE (POWER8 converted from SSE2)
- one file for switching between the implementations
Note the PPC64 assembly requires the VSX instruction set and the
kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly.
Reviewed-by: Felix Dörre <felix@dogcraft.de>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Co-authored-by: Rich Ercolani <rincebrain@gmail.com>
Closes #10058
Closes #12918
2022-06-08 22:55:57 +00:00
|
|
|
.p2align 5
|
|
|
|
3:
|
|
|
|
test esi, 0x2
|
|
|
|
je 3f
|
|
|
|
movups xmm0, xmmword ptr [rcx]
|
|
|
|
movups xmm1, xmmword ptr [rcx+0x10]
|
|
|
|
movaps xmm8, xmm0
|
|
|
|
movaps xmm9, xmm1
|
|
|
|
movd xmm13, dword ptr [rsp+0x110]
|
|
|
|
movd xmm14, dword ptr [rsp+0x120]
|
|
|
|
punpckldq xmm13, xmm14
|
|
|
|
movaps xmmword ptr [rsp], xmm13
|
|
|
|
movd xmm14, dword ptr [rsp+0x114]
|
|
|
|
movd xmm13, dword ptr [rsp+0x124]
|
|
|
|
punpckldq xmm14, xmm13
|
|
|
|
movaps xmmword ptr [rsp+0x10], xmm14
|
|
|
|
mov r8, qword ptr [rdi]
|
|
|
|
mov r9, qword ptr [rdi+0x8]
|
|
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
|
|
or eax, r13d
|
|
|
|
xor edx, edx
|
|
|
|
2:
|
|
|
|
mov r14d, eax
|
|
|
|
or eax, r12d
|
|
|
|
add rdx, 64
|
|
|
|
cmp rdx, r15
|
|
|
|
cmovne eax, r14d
|
|
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
|
|
movaps xmm10, xmm2
|
|
|
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
|
|
|
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
|
|
|
movaps xmm3, xmm4
|
|
|
|
shufps xmm4, xmm5, 136
|
|
|
|
shufps xmm3, xmm5, 221
|
|
|
|
movaps xmm5, xmm3
|
|
|
|
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
|
|
|
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
|
|
|
movaps xmm3, xmm6
|
|
|
|
shufps xmm6, xmm7, 136
|
|
|
|
pshufd xmm6, xmm6, 0x93
|
|
|
|
shufps xmm3, xmm7, 221
|
|
|
|
pshufd xmm7, xmm3, 0x93
|
|
|
|
movups xmm12, xmmword ptr [r9+rdx-0x40]
|
|
|
|
movups xmm13, xmmword ptr [r9+rdx-0x30]
|
|
|
|
movaps xmm11, xmm12
|
|
|
|
shufps xmm12, xmm13, 136
|
|
|
|
shufps xmm11, xmm13, 221
|
|
|
|
movaps xmm13, xmm11
|
|
|
|
movups xmm14, xmmword ptr [r9+rdx-0x20]
|
|
|
|
movups xmm15, xmmword ptr [r9+rdx-0x10]
|
|
|
|
movaps xmm11, xmm14
|
|
|
|
shufps xmm14, xmm15, 136
|
|
|
|
pshufd xmm14, xmm14, 0x93
|
|
|
|
shufps xmm11, xmm15, 221
|
|
|
|
pshufd xmm15, xmm11, 0x93
|
|
|
|
shl rax, 0x20
|
|
|
|
or rax, 0x40
|
|
|
|
movq xmm3, rax
|
|
|
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
|
|
|
movaps xmm3, xmmword ptr [rsp]
|
|
|
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
|
|
|
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
|
|
|
|
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
|
|
|
|
mov al, 7
|
|
|
|
9:
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm8, xmm12
|
|
|
|
movaps xmmword ptr [rsp+0x20], xmm4
|
|
|
|
movaps xmmword ptr [rsp+0x30], xmm12
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
paddd xmm8, xmm9
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pxor xmm11, xmm8
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
pshuflw xmm11, xmm11, 0xB1
|
|
|
|
pshufhw xmm11, xmm11, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
paddd xmm10, xmm11
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
pxor xmm9, xmm10
|
|
|
|
movdqa xmm4, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm4, 12
|
|
|
|
por xmm1, xmm4
|
|
|
|
movdqa xmm4, xmm9
|
|
|
|
pslld xmm9, 20
|
|
|
|
psrld xmm4, 12
|
|
|
|
por xmm9, xmm4
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm8, xmm13
|
|
|
|
movaps xmmword ptr [rsp+0x40], xmm5
|
|
|
|
movaps xmmword ptr [rsp+0x50], xmm13
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
paddd xmm8, xmm9
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pxor xmm11, xmm8
|
|
|
|
movdqa xmm13, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm13, 24
|
|
|
|
pxor xmm3, xmm13
|
|
|
|
movdqa xmm13, xmm11
|
|
|
|
psrld xmm11, 8
|
|
|
|
pslld xmm13, 24
|
|
|
|
pxor xmm11, xmm13
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
paddd xmm10, xmm11
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
pxor xmm9, xmm10
|
|
|
|
movdqa xmm4, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm4, 7
|
|
|
|
por xmm1, xmm4
|
|
|
|
movdqa xmm4, xmm9
|
|
|
|
pslld xmm9, 25
|
|
|
|
psrld xmm4, 7
|
|
|
|
por xmm9, xmm4
|
|
|
|
pshufd xmm0, xmm0, 0x93
|
|
|
|
pshufd xmm8, xmm8, 0x93
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm11, xmm11, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x39
|
|
|
|
pshufd xmm10, xmm10, 0x39
|
|
|
|
paddd xmm0, xmm6
|
|
|
|
paddd xmm8, xmm14
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
paddd xmm8, xmm9
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pxor xmm11, xmm8
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
pshuflw xmm11, xmm11, 0xB1
|
|
|
|
pshufhw xmm11, xmm11, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
paddd xmm10, xmm11
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
pxor xmm9, xmm10
|
|
|
|
movdqa xmm4, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm4, 12
|
|
|
|
por xmm1, xmm4
|
|
|
|
movdqa xmm4, xmm9
|
|
|
|
pslld xmm9, 20
|
|
|
|
psrld xmm4, 12
|
|
|
|
por xmm9, xmm4
|
|
|
|
paddd xmm0, xmm7
|
|
|
|
paddd xmm8, xmm15
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
paddd xmm8, xmm9
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pxor xmm11, xmm8
|
|
|
|
movdqa xmm13, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm13, 24
|
|
|
|
pxor xmm3, xmm13
|
|
|
|
movdqa xmm13, xmm11
|
|
|
|
psrld xmm11, 8
|
|
|
|
pslld xmm13, 24
|
|
|
|
pxor xmm11, xmm13
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
paddd xmm10, xmm11
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
pxor xmm9, xmm10
|
|
|
|
movdqa xmm4, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm4, 7
|
|
|
|
por xmm1, xmm4
|
|
|
|
movdqa xmm4, xmm9
|
|
|
|
pslld xmm9, 25
|
|
|
|
psrld xmm4, 7
|
|
|
|
por xmm9, xmm4
|
|
|
|
pshufd xmm0, xmm0, 0x39
|
|
|
|
pshufd xmm8, xmm8, 0x39
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm11, xmm11, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x93
|
|
|
|
pshufd xmm10, xmm10, 0x93
|
|
|
|
dec al
|
|
|
|
je 9f
|
|
|
|
movdqa xmm12, xmmword ptr [rsp+0x20]
|
|
|
|
movdqa xmm5, xmmword ptr [rsp+0x40]
|
|
|
|
pshufd xmm13, xmm12, 0x0F
|
|
|
|
shufps xmm12, xmm5, 214
|
|
|
|
pshufd xmm4, xmm12, 0x39
|
|
|
|
movdqa xmm12, xmm6
|
|
|
|
shufps xmm12, xmm7, 250
|
|
|
|
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
|
|
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
|
|
por xmm13, xmm12
|
|
|
|
movdqa xmmword ptr [rsp+0x20], xmm13
|
|
|
|
movdqa xmm12, xmm7
|
|
|
|
punpcklqdq xmm12, xmm5
|
|
|
|
movdqa xmm13, xmm6
|
|
|
|
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
|
|
pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
|
|
por xmm12, xmm13
|
|
|
|
pshufd xmm12, xmm12, 0x78
|
|
|
|
punpckhdq xmm5, xmm7
|
|
|
|
punpckldq xmm6, xmm5
|
|
|
|
pshufd xmm7, xmm6, 0x1E
|
|
|
|
movdqa xmmword ptr [rsp+0x40], xmm12
|
|
|
|
movdqa xmm5, xmmword ptr [rsp+0x30]
|
|
|
|
movdqa xmm13, xmmword ptr [rsp+0x50]
|
|
|
|
pshufd xmm6, xmm5, 0x0F
|
|
|
|
shufps xmm5, xmm13, 214
|
|
|
|
pshufd xmm12, xmm5, 0x39
|
|
|
|
movdqa xmm5, xmm14
|
|
|
|
shufps xmm5, xmm15, 250
|
|
|
|
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
|
|
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
|
|
por xmm6, xmm5
|
|
|
|
movdqa xmm5, xmm15
|
|
|
|
punpcklqdq xmm5, xmm13
|
|
|
|
movdqa xmmword ptr [rsp+0x30], xmm2
|
|
|
|
movdqa xmm2, xmm14
|
|
|
|
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
|
|
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
|
|
por xmm5, xmm2
|
|
|
|
movdqa xmm2, xmmword ptr [rsp+0x30]
|
|
|
|
pshufd xmm5, xmm5, 0x78
|
|
|
|
punpckhdq xmm13, xmm15
|
|
|
|
punpckldq xmm14, xmm13
|
|
|
|
pshufd xmm15, xmm14, 0x1E
|
|
|
|
movdqa xmm13, xmm6
|
|
|
|
movdqa xmm14, xmm5
|
|
|
|
movdqa xmm5, xmmword ptr [rsp+0x20]
|
|
|
|
movdqa xmm6, xmmword ptr [rsp+0x40]
|
|
|
|
jmp 9b
|
|
|
|
9:
|
|
|
|
pxor xmm0, xmm2
|
|
|
|
pxor xmm1, xmm3
|
|
|
|
pxor xmm8, xmm10
|
|
|
|
pxor xmm9, xmm11
|
|
|
|
mov eax, r13d
|
|
|
|
cmp rdx, r15
|
|
|
|
jne 2b
|
|
|
|
movups xmmword ptr [rbx], xmm0
|
|
|
|
movups xmmword ptr [rbx+0x10], xmm1
|
|
|
|
movups xmmword ptr [rbx+0x20], xmm8
|
|
|
|
movups xmmword ptr [rbx+0x30], xmm9
|
|
|
|
mov eax, dword ptr [rsp+0x130]
|
|
|
|
neg eax
|
|
|
|
mov r10d, dword ptr [rsp+0x110+8*rax]
|
|
|
|
mov r11d, dword ptr [rsp+0x120+8*rax]
|
|
|
|
mov dword ptr [rsp+0x110], r10d
|
|
|
|
mov dword ptr [rsp+0x120], r11d
|
|
|
|
add rdi, 16
|
|
|
|
add rbx, 64
|
|
|
|
sub rsi, 2
|
|
|
|
3:
|
|
|
|
test esi, 0x1
|
|
|
|
je 4b
|
|
|
|
movups xmm0, xmmword ptr [rcx]
|
|
|
|
movups xmm1, xmmword ptr [rcx+0x10]
|
|
|
|
movd xmm13, dword ptr [rsp+0x110]
|
|
|
|
movd xmm14, dword ptr [rsp+0x120]
|
|
|
|
punpckldq xmm13, xmm14
|
|
|
|
mov r8, qword ptr [rdi]
|
|
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
|
|
or eax, r13d
|
|
|
|
xor edx, edx
|
|
|
|
2:
|
|
|
|
mov r14d, eax
|
|
|
|
or eax, r12d
|
|
|
|
add rdx, 64
|
|
|
|
cmp rdx, r15
|
|
|
|
cmovne eax, r14d
|
|
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
|
|
shl rax, 32
|
|
|
|
or rax, 64
|
|
|
|
movq xmm12, rax
|
|
|
|
movdqa xmm3, xmm13
|
|
|
|
punpcklqdq xmm3, xmm12
|
|
|
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
|
|
|
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
|
|
|
movaps xmm8, xmm4
|
|
|
|
shufps xmm4, xmm5, 136
|
|
|
|
shufps xmm8, xmm5, 221
|
|
|
|
movaps xmm5, xmm8
|
|
|
|
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
|
|
|
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
|
|
|
movaps xmm8, xmm6
|
|
|
|
shufps xmm6, xmm7, 136
|
|
|
|
pshufd xmm6, xmm6, 0x93
|
|
|
|
shufps xmm8, xmm7, 221
|
|
|
|
pshufd xmm7, xmm8, 0x93
|
|
|
|
mov al, 7
|
|
|
|
9:
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x93
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x39
|
|
|
|
paddd xmm0, xmm6
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm7
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x39
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x93
|
|
|
|
dec al
|
|
|
|
jz 9f
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
shufps xmm8, xmm5, 214
|
|
|
|
pshufd xmm9, xmm4, 0x0F
|
|
|
|
pshufd xmm4, xmm8, 0x39
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
shufps xmm8, xmm7, 250
|
|
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
|
|
por xmm9, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
punpcklqdq xmm8, xmm5
|
|
|
|
movdqa xmm10, xmm6
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
|
|
por xmm8, xmm10
|
|
|
|
pshufd xmm8, xmm8, 0x78
|
|
|
|
punpckhdq xmm5, xmm7
|
|
|
|
punpckldq xmm6, xmm5
|
|
|
|
pshufd xmm7, xmm6, 0x1E
|
|
|
|
movdqa xmm5, xmm9
|
|
|
|
movdqa xmm6, xmm8
|
|
|
|
jmp 9b
|
|
|
|
9:
|
|
|
|
pxor xmm0, xmm2
|
|
|
|
pxor xmm1, xmm3
|
|
|
|
mov eax, r13d
|
|
|
|
cmp rdx, r15
|
|
|
|
jne 2b
|
|
|
|
movups xmmword ptr [rbx], xmm0
|
|
|
|
movups xmmword ptr [rbx+0x10], xmm1
|
|
|
|
jmp 4b
|
|
|
|
|
|
|
|
.p2align 6
|
|
|
|
zfs_blake3_compress_in_place_sse2:
|
|
|
|
_CET_ENDBR
|
|
|
|
movups xmm0, xmmword ptr [rdi]
|
|
|
|
movups xmm1, xmmword ptr [rdi+0x10]
|
|
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
|
|
shl r8, 32
|
|
|
|
add rdx, r8
|
|
|
|
movq xmm3, rcx
|
|
|
|
movq xmm4, rdx
|
|
|
|
punpcklqdq xmm3, xmm4
|
|
|
|
movups xmm4, xmmword ptr [rsi]
|
|
|
|
movups xmm5, xmmword ptr [rsi+0x10]
|
|
|
|
movaps xmm8, xmm4
|
|
|
|
shufps xmm4, xmm5, 136
|
|
|
|
shufps xmm8, xmm5, 221
|
|
|
|
movaps xmm5, xmm8
|
|
|
|
movups xmm6, xmmword ptr [rsi+0x20]
|
|
|
|
movups xmm7, xmmword ptr [rsi+0x30]
|
|
|
|
movaps xmm8, xmm6
|
|
|
|
shufps xmm6, xmm7, 136
|
|
|
|
pshufd xmm6, xmm6, 0x93
|
|
|
|
shufps xmm8, xmm7, 221
|
|
|
|
pshufd xmm7, xmm8, 0x93
|
|
|
|
mov al, 7
|
|
|
|
9:
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x93
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x39
|
|
|
|
paddd xmm0, xmm6
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm7
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x39
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x93
|
|
|
|
dec al
|
|
|
|
jz 9f
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
shufps xmm8, xmm5, 214
|
|
|
|
pshufd xmm9, xmm4, 0x0F
|
|
|
|
pshufd xmm4, xmm8, 0x39
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
shufps xmm8, xmm7, 250
|
|
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
|
|
por xmm9, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
punpcklqdq xmm8, xmm5
|
|
|
|
movdqa xmm10, xmm6
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
|
|
por xmm8, xmm10
|
|
|
|
pshufd xmm8, xmm8, 0x78
|
|
|
|
punpckhdq xmm5, xmm7
|
|
|
|
punpckldq xmm6, xmm5
|
|
|
|
pshufd xmm7, xmm6, 0x1E
|
|
|
|
movdqa xmm5, xmm9
|
|
|
|
movdqa xmm6, xmm8
|
|
|
|
jmp 9b
|
|
|
|
9:
|
|
|
|
pxor xmm0, xmm2
|
|
|
|
pxor xmm1, xmm3
|
|
|
|
movups xmmword ptr [rdi], xmm0
|
|
|
|
movups xmmword ptr [rdi+0x10], xmm1
|
2022-06-20 23:36:21 +00:00
|
|
|
RET
|
Introduce BLAKE3 checksums as an OpenZFS feature
This commit adds BLAKE3 checksums to OpenZFS, it has similar
performance to Edon-R, but without the caveats around the latter.
Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3
Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3
Short description of Wikipedia:
BLAKE3 is a cryptographic hash function based on Bao and BLAKE2,
created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and
Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real
World Crypto. BLAKE3 is a single algorithm with many desirable
features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE
and BLAKE2, which are algorithm families with multiple variants.
BLAKE3 has a binary tree structure, so it supports a practically
unlimited degree of parallelism (both SIMD and multithreading) given
enough input. The official Rust and C implementations are
dual-licensed as public domain (CC0) and the Apache License.
Along with adding the BLAKE3 hash into the OpenZFS infrastructure a
new benchmarking file called chksum_bench was introduced. When read
it reports the speed of the available checksum functions.
On Linux: cat /proc/spl/kstat/zfs/chksum_bench
On FreeBSD: sysctl kstat.zfs.misc.chksum_bench
This is an example output of an i3-1005G1 test system with Debian 11:
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1196 1602 1761 1749 1762 1759 1751
skein-generic 546 591 608 615 619 612 616
sha256-generic 240 300 316 314 304 285 276
sha512-generic 353 441 467 476 472 467 426
blake3-generic 308 313 313 313 312 313 312
blake3-sse2 402 1289 1423 1446 1432 1458 1413
blake3-sse41 427 1470 1625 1704 1679 1607 1629
blake3-avx2 428 1920 3095 3343 3356 3318 3204
blake3-avx512 473 2687 4905 5836 5844 5643 5374
Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1840 2458 2665 2719 2711 2723 2693
skein-generic 870 966 996 992 1003 1005 1009
sha256-generic 415 442 453 455 457 457 457
sha512-generic 608 690 711 718 719 720 721
blake3-generic 301 313 311 309 309 310 310
blake3-sse2 343 1865 2124 2188 2180 2181 2186
blake3-sse41 364 2091 2396 2509 2463 2482 2488
blake3-avx2 365 2590 4399 4971 4915 4802 4764
Output on Debian 5.10.0-9-powerpc64le system: (POWER 9)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1213 1703 1889 1918 1957 1902 1907
skein-generic 434 492 520 522 511 525 525
sha256-generic 167 183 187 188 188 187 188
sha512-generic 186 216 222 221 225 224 224
blake3-generic 153 152 154 153 151 153 153
blake3-sse2 391 1170 1366 1406 1428 1426 1414
blake3-sse41 352 1049 1212 1174 1262 1258 1259
Output on Debian 5.10.0-11-arm64 system: (Pi400)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 487 603 629 639 643 641 641
skein-generic 271 299 303 308 309 309 307
sha256-generic 117 127 128 130 130 129 130
sha512-generic 145 165 170 172 173 174 175
blake3-generic 81 29 71 89 89 89 89
blake3-sse2 112 323 368 379 380 371 374
blake3-sse41 101 315 357 368 369 364 360
Structurally, the new code is mainly split into these parts:
- 1x cross platform generic c variant: blake3_generic.c
- 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512)
- 2x assembly for ARMv8 (NEON converted from SSE2)
- 2x assembly for PPC64-LE (POWER8 converted from SSE2)
- one file for switching between the implementations
Note the PPC64 assembly requires the VSX instruction set and the
kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly.
Reviewed-by: Felix Dörre <felix@dogcraft.de>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Co-authored-by: Rich Ercolani <rincebrain@gmail.com>
Closes #10058
Closes #12918
2022-06-08 22:55:57 +00:00
|
|
|
|
|
|
|
.p2align 6
|
|
|
|
zfs_blake3_compress_xof_sse2:
|
|
|
|
_CET_ENDBR
|
|
|
|
movups xmm0, xmmword ptr [rdi]
|
|
|
|
movups xmm1, xmmword ptr [rdi+0x10]
|
|
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
|
|
movzx eax, r8b
|
|
|
|
movzx edx, dl
|
|
|
|
shl rax, 32
|
|
|
|
add rdx, rax
|
|
|
|
movq xmm3, rcx
|
|
|
|
movq xmm4, rdx
|
|
|
|
punpcklqdq xmm3, xmm4
|
|
|
|
movups xmm4, xmmword ptr [rsi]
|
|
|
|
movups xmm5, xmmword ptr [rsi+0x10]
|
|
|
|
movaps xmm8, xmm4
|
|
|
|
shufps xmm4, xmm5, 136
|
|
|
|
shufps xmm8, xmm5, 221
|
|
|
|
movaps xmm5, xmm8
|
|
|
|
movups xmm6, xmmword ptr [rsi+0x20]
|
|
|
|
movups xmm7, xmmword ptr [rsi+0x30]
|
|
|
|
movaps xmm8, xmm6
|
|
|
|
shufps xmm6, xmm7, 136
|
|
|
|
pshufd xmm6, xmm6, 0x93
|
|
|
|
shufps xmm8, xmm7, 221
|
|
|
|
pshufd xmm7, xmm8, 0x93
|
|
|
|
mov al, 7
|
|
|
|
9:
|
|
|
|
paddd xmm0, xmm4
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm5
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x93
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x39
|
|
|
|
paddd xmm0, xmm6
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 20
|
|
|
|
psrld xmm11, 12
|
|
|
|
por xmm1, xmm11
|
|
|
|
paddd xmm0, xmm7
|
|
|
|
paddd xmm0, xmm1
|
|
|
|
pxor xmm3, xmm0
|
|
|
|
movdqa xmm14, xmm3
|
|
|
|
psrld xmm3, 8
|
|
|
|
pslld xmm14, 24
|
|
|
|
pxor xmm3, xmm14
|
|
|
|
paddd xmm2, xmm3
|
|
|
|
pxor xmm1, xmm2
|
|
|
|
movdqa xmm11, xmm1
|
|
|
|
pslld xmm1, 25
|
|
|
|
psrld xmm11, 7
|
|
|
|
por xmm1, xmm11
|
|
|
|
pshufd xmm0, xmm0, 0x39
|
|
|
|
pshufd xmm3, xmm3, 0x4E
|
|
|
|
pshufd xmm2, xmm2, 0x93
|
|
|
|
dec al
|
|
|
|
jz 9f
|
|
|
|
movdqa xmm8, xmm4
|
|
|
|
shufps xmm8, xmm5, 214
|
|
|
|
pshufd xmm9, xmm4, 0x0F
|
|
|
|
pshufd xmm4, xmm8, 0x39
|
|
|
|
movdqa xmm8, xmm6
|
|
|
|
shufps xmm8, xmm7, 250
|
|
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
|
|
por xmm9, xmm8
|
|
|
|
movdqa xmm8, xmm7
|
|
|
|
punpcklqdq xmm8, xmm5
|
|
|
|
movdqa xmm10, xmm6
|
|
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
|
|
por xmm8, xmm10
|
|
|
|
pshufd xmm8, xmm8, 0x78
|
|
|
|
punpckhdq xmm5, xmm7
|
|
|
|
punpckldq xmm6, xmm5
|
|
|
|
pshufd xmm7, xmm6, 0x1E
|
|
|
|
movdqa xmm5, xmm9
|
|
|
|
movdqa xmm6, xmm8
|
|
|
|
jmp 9b
|
|
|
|
9:
|
|
|
|
movdqu xmm4, xmmword ptr [rdi]
|
|
|
|
movdqu xmm5, xmmword ptr [rdi+0x10]
|
|
|
|
pxor xmm0, xmm2
|
|
|
|
pxor xmm1, xmm3
|
|
|
|
pxor xmm2, xmm4
|
|
|
|
pxor xmm3, xmm5
|
|
|
|
movups xmmword ptr [r9], xmm0
|
|
|
|
movups xmmword ptr [r9+0x10], xmm1
|
|
|
|
movups xmmword ptr [r9+0x20], xmm2
|
|
|
|
movups xmmword ptr [r9+0x30], xmm3
|
2022-06-20 23:36:21 +00:00
|
|
|
RET
|
Introduce BLAKE3 checksums as an OpenZFS feature
This commit adds BLAKE3 checksums to OpenZFS, it has similar
performance to Edon-R, but without the caveats around the latter.
Homepage of BLAKE3: https://github.com/BLAKE3-team/BLAKE3
Wikipedia: https://en.wikipedia.org/wiki/BLAKE_(hash_function)#BLAKE3
Short description of Wikipedia:
BLAKE3 is a cryptographic hash function based on Bao and BLAKE2,
created by Jack O'Connor, Jean-Philippe Aumasson, Samuel Neves, and
Zooko Wilcox-O'Hearn. It was announced on January 9, 2020, at Real
World Crypto. BLAKE3 is a single algorithm with many desirable
features (parallelism, XOF, KDF, PRF and MAC), in contrast to BLAKE
and BLAKE2, which are algorithm families with multiple variants.
BLAKE3 has a binary tree structure, so it supports a practically
unlimited degree of parallelism (both SIMD and multithreading) given
enough input. The official Rust and C implementations are
dual-licensed as public domain (CC0) and the Apache License.
Along with adding the BLAKE3 hash into the OpenZFS infrastructure a
new benchmarking file called chksum_bench was introduced. When read
it reports the speed of the available checksum functions.
On Linux: cat /proc/spl/kstat/zfs/chksum_bench
On FreeBSD: sysctl kstat.zfs.misc.chksum_bench
This is an example output of an i3-1005G1 test system with Debian 11:
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1196 1602 1761 1749 1762 1759 1751
skein-generic 546 591 608 615 619 612 616
sha256-generic 240 300 316 314 304 285 276
sha512-generic 353 441 467 476 472 467 426
blake3-generic 308 313 313 313 312 313 312
blake3-sse2 402 1289 1423 1446 1432 1458 1413
blake3-sse41 427 1470 1625 1704 1679 1607 1629
blake3-avx2 428 1920 3095 3343 3356 3318 3204
blake3-avx512 473 2687 4905 5836 5844 5643 5374
Output on Debian 5.10.0-10-amd64 system: (Ryzen 7 5800X)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1840 2458 2665 2719 2711 2723 2693
skein-generic 870 966 996 992 1003 1005 1009
sha256-generic 415 442 453 455 457 457 457
sha512-generic 608 690 711 718 719 720 721
blake3-generic 301 313 311 309 309 310 310
blake3-sse2 343 1865 2124 2188 2180 2181 2186
blake3-sse41 364 2091 2396 2509 2463 2482 2488
blake3-avx2 365 2590 4399 4971 4915 4802 4764
Output on Debian 5.10.0-9-powerpc64le system: (POWER 9)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 1213 1703 1889 1918 1957 1902 1907
skein-generic 434 492 520 522 511 525 525
sha256-generic 167 183 187 188 188 187 188
sha512-generic 186 216 222 221 225 224 224
blake3-generic 153 152 154 153 151 153 153
blake3-sse2 391 1170 1366 1406 1428 1426 1414
blake3-sse41 352 1049 1212 1174 1262 1258 1259
Output on Debian 5.10.0-11-arm64 system: (Pi400)
implementation 1k 4k 16k 64k 256k 1m 4m
edonr-generic 487 603 629 639 643 641 641
skein-generic 271 299 303 308 309 309 307
sha256-generic 117 127 128 130 130 129 130
sha512-generic 145 165 170 172 173 174 175
blake3-generic 81 29 71 89 89 89 89
blake3-sse2 112 323 368 379 380 371 374
blake3-sse41 101 315 357 368 369 364 360
Structurally, the new code is mainly split into these parts:
- 1x cross platform generic c variant: blake3_generic.c
- 4x assembly for X86-64 (SSE2, SSE4.1, AVX2, AVX512)
- 2x assembly for ARMv8 (NEON converted from SSE2)
- 2x assembly for PPC64-LE (POWER8 converted from SSE2)
- one file for switching between the implementations
Note the PPC64 assembly requires the VSX instruction set and the
kfpu_begin() / kfpu_end() calls on PowerPC were updated accordingly.
Reviewed-by: Felix Dörre <felix@dogcraft.de>
Reviewed-by: Ahelenia Ziemiańska <nabijaczleweli@nabijaczleweli.xyz>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tino Reichardt <milky-zfs@mcmilk.de>
Co-authored-by: Rich Ercolani <rincebrain@gmail.com>
Closes #10058
Closes #12918
2022-06-08 22:55:57 +00:00
|
|
|
|
|
|
|
.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2
|
|
|
|
.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2
|
|
|
|
.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2
|
|
|
|
|
|
|
|
#ifdef __APPLE__
|
|
|
|
.static_data
|
|
|
|
#else
|
|
|
|
.section .rodata
|
|
|
|
#endif
|
|
|
|
.p2align 6
|
|
|
|
BLAKE3_IV:
|
|
|
|
.long 0x6A09E667, 0xBB67AE85
|
|
|
|
.long 0x3C6EF372, 0xA54FF53A
|
|
|
|
ADD0:
|
|
|
|
.long 0, 1, 2, 3
|
|
|
|
ADD1:
|
|
|
|
.long 4, 4, 4, 4
|
|
|
|
BLAKE3_IV_0:
|
|
|
|
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
|
|
|
|
BLAKE3_IV_1:
|
|
|
|
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
|
|
|
|
BLAKE3_IV_2:
|
|
|
|
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
|
|
|
|
BLAKE3_IV_3:
|
|
|
|
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
|
|
|
|
BLAKE3_BLOCK_LEN:
|
|
|
|
.long 64, 64, 64, 64
|
|
|
|
CMP_MSB_MASK:
|
|
|
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
|
|
PBLENDW_0x33_MASK:
|
|
|
|
.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
|
|
|
|
PBLENDW_0xCC_MASK:
|
|
|
|
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
|
|
|
|
PBLENDW_0x3F_MASK:
|
|
|
|
.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
|
|
|
|
PBLENDW_0xC0_MASK:
|
|
|
|
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
|
|
|
|
|
|
|
|
#endif /* HAVE_SSE2 */
|
|
|
|
|
|
|
|
#ifdef __ELF__
|
|
|
|
.section .note.GNU-stack,"",%progbits
|
|
|
|
#endif
|