2324 lines
68 KiB
ArmAsm
2324 lines
68 KiB
ArmAsm
/*
|
|
* CDDL HEADER START
|
|
*
|
|
* The contents of this file are subject to the terms of the
|
|
* Common Development and Distribution License (the "License").
|
|
* You may not use this file except in compliance with the License.
|
|
*
|
|
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
|
|
* or http://www.opensolaris.org/os/licensing.
|
|
* See the License for the specific language governing permissions
|
|
* and limitations under the License.
|
|
*
|
|
* When distributing Covered Code, include this CDDL HEADER in each
|
|
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
|
|
* If applicable, add the following below this CDDL HEADER, with the
|
|
* fields enclosed by brackets "[]" replaced with your own identifying
|
|
* information: Portions Copyright [yyyy] [name of copyright owner]
|
|
*
|
|
* CDDL HEADER END
|
|
*/
|
|
|
|
/*
|
|
* Based on BLAKE3 v1.3.1, https://github.com/BLAKE3-team/BLAKE3
|
|
* Copyright (c) 2019-2020 Samuel Neves and Matthew Krupcale
|
|
* Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
|
|
*/
|
|
|
|
#if defined(HAVE_SSE2)
|
|
|
|
#define _ASM
|
|
#include <sys/asm_linkage.h>
|
|
|
|
#if defined(__ELF__) && defined(__CET__) && defined(__has_include)
|
|
#if __has_include(<cet.h>)
|
|
#include <cet.h>
|
|
#endif
|
|
#endif
|
|
|
|
#if !defined(_CET_ENDBR)
|
|
#define _CET_ENDBR
|
|
#endif
|
|
|
|
.intel_syntax noprefix
|
|
.global zfs_blake3_hash_many_sse2
|
|
.global zfs_blake3_compress_in_place_sse2
|
|
.global zfs_blake3_compress_xof_sse2
|
|
|
|
.text
|
|
.type zfs_blake3_hash_many_sse2,@function
|
|
.type zfs_blake3_compress_in_place_sse2,@function
|
|
.type zfs_blake3_compress_xof_sse2,@function
|
|
|
|
.p2align 6
|
|
zfs_blake3_hash_many_sse2:
|
|
_CET_ENDBR
|
|
push r15
|
|
push r14
|
|
push r13
|
|
push r12
|
|
push rbx
|
|
push rbp
|
|
mov rbp, rsp
|
|
sub rsp, 360
|
|
and rsp, 0xFFFFFFFFFFFFFFC0
|
|
neg r9d
|
|
movd xmm0, r9d
|
|
pshufd xmm0, xmm0, 0x00
|
|
movdqa xmmword ptr [rsp+0x130], xmm0
|
|
movdqa xmm1, xmm0
|
|
pand xmm1, xmmword ptr [ADD0+rip]
|
|
pand xmm0, xmmword ptr [ADD1+rip]
|
|
movdqa xmmword ptr [rsp+0x150], xmm0
|
|
movd xmm0, r8d
|
|
pshufd xmm0, xmm0, 0x00
|
|
paddd xmm0, xmm1
|
|
movdqa xmmword ptr [rsp+0x110], xmm0
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
|
pcmpgtd xmm1, xmm0
|
|
shr r8, 32
|
|
movd xmm2, r8d
|
|
pshufd xmm2, xmm2, 0x00
|
|
psubd xmm2, xmm1
|
|
movdqa xmmword ptr [rsp+0x120], xmm2
|
|
mov rbx, qword ptr [rbp+0x50]
|
|
mov r15, rdx
|
|
shl r15, 6
|
|
movzx r13d, byte ptr [rbp+0x38]
|
|
movzx r12d, byte ptr [rbp+0x48]
|
|
cmp rsi, 4
|
|
jc 3f
|
|
2:
|
|
movdqu xmm3, xmmword ptr [rcx]
|
|
pshufd xmm0, xmm3, 0x00
|
|
pshufd xmm1, xmm3, 0x55
|
|
pshufd xmm2, xmm3, 0xAA
|
|
pshufd xmm3, xmm3, 0xFF
|
|
movdqu xmm7, xmmword ptr [rcx+0x10]
|
|
pshufd xmm4, xmm7, 0x00
|
|
pshufd xmm5, xmm7, 0x55
|
|
pshufd xmm6, xmm7, 0xAA
|
|
pshufd xmm7, xmm7, 0xFF
|
|
mov r8, qword ptr [rdi]
|
|
mov r9, qword ptr [rdi+0x8]
|
|
mov r10, qword ptr [rdi+0x10]
|
|
mov r11, qword ptr [rdi+0x18]
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
9:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x40]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x40]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x40]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x40]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp], xmm8
|
|
movdqa xmmword ptr [rsp+0x10], xmm9
|
|
movdqa xmmword ptr [rsp+0x20], xmm12
|
|
movdqa xmmword ptr [rsp+0x30], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x30]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x30]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x30]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x30]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+0x40], xmm8
|
|
movdqa xmmword ptr [rsp+0x50], xmm9
|
|
movdqa xmmword ptr [rsp+0x60], xmm12
|
|
movdqa xmmword ptr [rsp+0x70], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x20]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x20]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x20]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x20]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+0x80], xmm8
|
|
movdqa xmmword ptr [rsp+0x90], xmm9
|
|
movdqa xmmword ptr [rsp+0xA0], xmm12
|
|
movdqa xmmword ptr [rsp+0xB0], xmm13
|
|
movdqu xmm8, xmmword ptr [r8+rdx-0x10]
|
|
movdqu xmm9, xmmword ptr [r9+rdx-0x10]
|
|
movdqu xmm10, xmmword ptr [r10+rdx-0x10]
|
|
movdqu xmm11, xmmword ptr [r11+rdx-0x10]
|
|
movdqa xmm12, xmm8
|
|
punpckldq xmm8, xmm9
|
|
punpckhdq xmm12, xmm9
|
|
movdqa xmm14, xmm10
|
|
punpckldq xmm10, xmm11
|
|
punpckhdq xmm14, xmm11
|
|
movdqa xmm9, xmm8
|
|
punpcklqdq xmm8, xmm10
|
|
punpckhqdq xmm9, xmm10
|
|
movdqa xmm13, xmm12
|
|
punpcklqdq xmm12, xmm14
|
|
punpckhqdq xmm13, xmm14
|
|
movdqa xmmword ptr [rsp+0xC0], xmm8
|
|
movdqa xmmword ptr [rsp+0xD0], xmm9
|
|
movdqa xmmword ptr [rsp+0xE0], xmm12
|
|
movdqa xmmword ptr [rsp+0xF0], xmm13
|
|
movdqa xmm9, xmmword ptr [BLAKE3_IV_1+rip]
|
|
movdqa xmm10, xmmword ptr [BLAKE3_IV_2+rip]
|
|
movdqa xmm11, xmmword ptr [BLAKE3_IV_3+rip]
|
|
movdqa xmm12, xmmword ptr [rsp+0x110]
|
|
movdqa xmm13, xmmword ptr [rsp+0x120]
|
|
movdqa xmm14, xmmword ptr [BLAKE3_BLOCK_LEN+rip]
|
|
movd xmm15, eax
|
|
pshufd xmm15, xmm15, 0x00
|
|
prefetcht0 [r8+rdx+0x80]
|
|
prefetcht0 [r9+rdx+0x80]
|
|
prefetcht0 [r10+rdx+0x80]
|
|
prefetcht0 [r11+rdx+0x80]
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
paddd xmm2, xmmword ptr [rsp+0x40]
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [BLAKE3_IV_0+rip]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x10]
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
paddd xmm2, xmmword ptr [rsp+0x50]
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x80]
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
paddd xmm2, xmmword ptr [rsp+0xC0]
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x90]
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
paddd xmm2, xmmword ptr [rsp+0xD0]
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x20]
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
paddd xmm2, xmmword ptr [rsp+0x70]
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x60]
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x10]
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
paddd xmm2, xmmword ptr [rsp+0x90]
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xB0]
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
paddd xmm2, xmmword ptr [rsp+0xE0]
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x30]
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
paddd xmm2, xmmword ptr [rsp+0xD0]
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x40]
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
paddd xmm2, xmmword ptr [rsp+0x20]
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x60]
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
paddd xmm2, xmmword ptr [rsp+0xB0]
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x50]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+0xF0]
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xA0]
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
paddd xmm2, xmmword ptr [rsp+0xE0]
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x70]
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
paddd xmm2, xmmword ptr [rsp+0x30]
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x40]
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
paddd xmm2, xmmword ptr [rsp+0x50]
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp]
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
paddd xmm2, xmmword ptr [rsp+0x80]
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xC0]
|
|
paddd xmm1, xmmword ptr [rsp+0x90]
|
|
paddd xmm2, xmmword ptr [rsp+0xF0]
|
|
paddd xmm3, xmmword ptr [rsp+0xE0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xD0]
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
paddd xmm2, xmmword ptr [rsp+0xA0]
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x70]
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
paddd xmm2, xmmword ptr [rsp]
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x20]
|
|
paddd xmm1, xmmword ptr [rsp+0x30]
|
|
paddd xmm2, xmmword ptr [rsp+0x10]
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x90]
|
|
paddd xmm1, xmmword ptr [rsp+0xB0]
|
|
paddd xmm2, xmmword ptr [rsp+0x80]
|
|
paddd xmm3, xmmword ptr [rsp+0xF0]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xE0]
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
paddd xmm2, xmmword ptr [rsp+0xC0]
|
|
paddd xmm3, xmmword ptr [rsp+0x10]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xD0]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+0x20]
|
|
paddd xmm3, xmmword ptr [rsp+0x40]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0x30]
|
|
paddd xmm1, xmmword ptr [rsp+0xA0]
|
|
paddd xmm2, xmmword ptr [rsp+0x60]
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xB0]
|
|
paddd xmm1, xmmword ptr [rsp+0x50]
|
|
paddd xmm2, xmmword ptr [rsp+0x10]
|
|
paddd xmm3, xmmword ptr [rsp+0x80]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xF0]
|
|
paddd xmm1, xmmword ptr [rsp]
|
|
paddd xmm2, xmmword ptr [rsp+0x90]
|
|
paddd xmm3, xmmword ptr [rsp+0x60]
|
|
paddd xmm0, xmm4
|
|
paddd xmm1, xmm5
|
|
paddd xmm2, xmm6
|
|
paddd xmm3, xmm7
|
|
pxor xmm12, xmm0
|
|
pxor xmm13, xmm1
|
|
pxor xmm14, xmm2
|
|
pxor xmm15, xmm3
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm12
|
|
paddd xmm9, xmm13
|
|
paddd xmm10, xmm14
|
|
paddd xmm11, xmm15
|
|
pxor xmm4, xmm8
|
|
pxor xmm5, xmm9
|
|
pxor xmm6, xmm10
|
|
pxor xmm7, xmm11
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xE0]
|
|
paddd xmm1, xmmword ptr [rsp+0x20]
|
|
paddd xmm2, xmmword ptr [rsp+0x30]
|
|
paddd xmm3, xmmword ptr [rsp+0x70]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
pshuflw xmm15, xmm15, 0xB1
|
|
pshufhw xmm15, xmm15, 0xB1
|
|
pshuflw xmm12, xmm12, 0xB1
|
|
pshufhw xmm12, xmm12, 0xB1
|
|
pshuflw xmm13, xmm13, 0xB1
|
|
pshufhw xmm13, xmm13, 0xB1
|
|
pshuflw xmm14, xmm14, 0xB1
|
|
pshufhw xmm14, xmm14, 0xB1
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
movdqa xmmword ptr [rsp+0x100], xmm8
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 12
|
|
pslld xmm5, 20
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 12
|
|
pslld xmm6, 20
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 12
|
|
pslld xmm7, 20
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 12
|
|
pslld xmm4, 20
|
|
por xmm4, xmm8
|
|
paddd xmm0, xmmword ptr [rsp+0xA0]
|
|
paddd xmm1, xmmword ptr [rsp+0xC0]
|
|
paddd xmm2, xmmword ptr [rsp+0x40]
|
|
paddd xmm3, xmmword ptr [rsp+0xD0]
|
|
paddd xmm0, xmm5
|
|
paddd xmm1, xmm6
|
|
paddd xmm2, xmm7
|
|
paddd xmm3, xmm4
|
|
pxor xmm15, xmm0
|
|
pxor xmm12, xmm1
|
|
pxor xmm13, xmm2
|
|
pxor xmm14, xmm3
|
|
movdqa xmm8, xmm15
|
|
psrld xmm15, 8
|
|
pslld xmm8, 24
|
|
pxor xmm15, xmm8
|
|
movdqa xmm8, xmm12
|
|
psrld xmm12, 8
|
|
pslld xmm8, 24
|
|
pxor xmm12, xmm8
|
|
movdqa xmm8, xmm13
|
|
psrld xmm13, 8
|
|
pslld xmm8, 24
|
|
pxor xmm13, xmm8
|
|
movdqa xmm8, xmm14
|
|
psrld xmm14, 8
|
|
pslld xmm8, 24
|
|
pxor xmm14, xmm8
|
|
paddd xmm10, xmm15
|
|
paddd xmm11, xmm12
|
|
movdqa xmm8, xmmword ptr [rsp+0x100]
|
|
paddd xmm8, xmm13
|
|
paddd xmm9, xmm14
|
|
pxor xmm5, xmm10
|
|
pxor xmm6, xmm11
|
|
pxor xmm7, xmm8
|
|
pxor xmm4, xmm9
|
|
pxor xmm0, xmm8
|
|
pxor xmm1, xmm9
|
|
pxor xmm2, xmm10
|
|
pxor xmm3, xmm11
|
|
movdqa xmm8, xmm5
|
|
psrld xmm8, 7
|
|
pslld xmm5, 25
|
|
por xmm5, xmm8
|
|
movdqa xmm8, xmm6
|
|
psrld xmm8, 7
|
|
pslld xmm6, 25
|
|
por xmm6, xmm8
|
|
movdqa xmm8, xmm7
|
|
psrld xmm8, 7
|
|
pslld xmm7, 25
|
|
por xmm7, xmm8
|
|
movdqa xmm8, xmm4
|
|
psrld xmm8, 7
|
|
pslld xmm4, 25
|
|
por xmm4, xmm8
|
|
pxor xmm4, xmm12
|
|
pxor xmm5, xmm13
|
|
pxor xmm6, xmm14
|
|
pxor xmm7, xmm15
|
|
mov eax, r13d
|
|
jne 9b
|
|
movdqa xmm9, xmm0
|
|
punpckldq xmm0, xmm1
|
|
punpckhdq xmm9, xmm1
|
|
movdqa xmm11, xmm2
|
|
punpckldq xmm2, xmm3
|
|
punpckhdq xmm11, xmm3
|
|
movdqa xmm1, xmm0
|
|
punpcklqdq xmm0, xmm2
|
|
punpckhqdq xmm1, xmm2
|
|
movdqa xmm3, xmm9
|
|
punpcklqdq xmm9, xmm11
|
|
punpckhqdq xmm3, xmm11
|
|
movdqu xmmword ptr [rbx], xmm0
|
|
movdqu xmmword ptr [rbx+0x20], xmm1
|
|
movdqu xmmword ptr [rbx+0x40], xmm9
|
|
movdqu xmmword ptr [rbx+0x60], xmm3
|
|
movdqa xmm9, xmm4
|
|
punpckldq xmm4, xmm5
|
|
punpckhdq xmm9, xmm5
|
|
movdqa xmm11, xmm6
|
|
punpckldq xmm6, xmm7
|
|
punpckhdq xmm11, xmm7
|
|
movdqa xmm5, xmm4
|
|
punpcklqdq xmm4, xmm6
|
|
punpckhqdq xmm5, xmm6
|
|
movdqa xmm7, xmm9
|
|
punpcklqdq xmm9, xmm11
|
|
punpckhqdq xmm7, xmm11
|
|
movdqu xmmword ptr [rbx+0x10], xmm4
|
|
movdqu xmmword ptr [rbx+0x30], xmm5
|
|
movdqu xmmword ptr [rbx+0x50], xmm9
|
|
movdqu xmmword ptr [rbx+0x70], xmm7
|
|
movdqa xmm1, xmmword ptr [rsp+0x110]
|
|
movdqa xmm0, xmm1
|
|
paddd xmm1, xmmword ptr [rsp+0x150]
|
|
movdqa xmmword ptr [rsp+0x110], xmm1
|
|
pxor xmm0, xmmword ptr [CMP_MSB_MASK+rip]
|
|
pxor xmm1, xmmword ptr [CMP_MSB_MASK+rip]
|
|
pcmpgtd xmm0, xmm1
|
|
movdqa xmm1, xmmword ptr [rsp+0x120]
|
|
psubd xmm1, xmm0
|
|
movdqa xmmword ptr [rsp+0x120], xmm1
|
|
add rbx, 128
|
|
add rdi, 32
|
|
sub rsi, 4
|
|
cmp rsi, 4
|
|
jnc 2b
|
|
test rsi, rsi
|
|
jnz 3f
|
|
4:
|
|
mov rsp, rbp
|
|
pop rbp
|
|
pop rbx
|
|
pop r12
|
|
pop r13
|
|
pop r14
|
|
pop r15
|
|
RET
|
|
.p2align 5
|
|
3:
|
|
test esi, 0x2
|
|
je 3f
|
|
movups xmm0, xmmword ptr [rcx]
|
|
movups xmm1, xmmword ptr [rcx+0x10]
|
|
movaps xmm8, xmm0
|
|
movaps xmm9, xmm1
|
|
movd xmm13, dword ptr [rsp+0x110]
|
|
movd xmm14, dword ptr [rsp+0x120]
|
|
punpckldq xmm13, xmm14
|
|
movaps xmmword ptr [rsp], xmm13
|
|
movd xmm14, dword ptr [rsp+0x114]
|
|
movd xmm13, dword ptr [rsp+0x124]
|
|
punpckldq xmm14, xmm13
|
|
movaps xmmword ptr [rsp+0x10], xmm14
|
|
mov r8, qword ptr [rdi]
|
|
mov r9, qword ptr [rdi+0x8]
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
2:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
movaps xmm10, xmm2
|
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
|
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
|
movaps xmm3, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm3, xmm5, 221
|
|
movaps xmm5, xmm3
|
|
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
|
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
|
movaps xmm3, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 0x93
|
|
shufps xmm3, xmm7, 221
|
|
pshufd xmm7, xmm3, 0x93
|
|
movups xmm12, xmmword ptr [r9+rdx-0x40]
|
|
movups xmm13, xmmword ptr [r9+rdx-0x30]
|
|
movaps xmm11, xmm12
|
|
shufps xmm12, xmm13, 136
|
|
shufps xmm11, xmm13, 221
|
|
movaps xmm13, xmm11
|
|
movups xmm14, xmmword ptr [r9+rdx-0x20]
|
|
movups xmm15, xmmword ptr [r9+rdx-0x10]
|
|
movaps xmm11, xmm14
|
|
shufps xmm14, xmm15, 136
|
|
pshufd xmm14, xmm14, 0x93
|
|
shufps xmm11, xmm15, 221
|
|
pshufd xmm15, xmm11, 0x93
|
|
shl rax, 0x20
|
|
or rax, 0x40
|
|
movq xmm3, rax
|
|
movdqa xmmword ptr [rsp+0x20], xmm3
|
|
movaps xmm3, xmmword ptr [rsp]
|
|
movaps xmm11, xmmword ptr [rsp+0x10]
|
|
punpcklqdq xmm3, xmmword ptr [rsp+0x20]
|
|
punpcklqdq xmm11, xmmword ptr [rsp+0x20]
|
|
mov al, 7
|
|
9:
|
|
paddd xmm0, xmm4
|
|
paddd xmm8, xmm12
|
|
movaps xmmword ptr [rsp+0x20], xmm4
|
|
movaps xmmword ptr [rsp+0x30], xmm12
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
pshuflw xmm11, xmm11, 0xB1
|
|
pshufhw xmm11, xmm11, 0xB1
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm4, 12
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 20
|
|
psrld xmm4, 12
|
|
por xmm9, xmm4
|
|
paddd xmm0, xmm5
|
|
paddd xmm8, xmm13
|
|
movaps xmmword ptr [rsp+0x40], xmm5
|
|
movaps xmmword ptr [rsp+0x50], xmm13
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
movdqa xmm13, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm13, 24
|
|
pxor xmm3, xmm13
|
|
movdqa xmm13, xmm11
|
|
psrld xmm11, 8
|
|
pslld xmm13, 24
|
|
pxor xmm11, xmm13
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm4, 7
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 25
|
|
psrld xmm4, 7
|
|
por xmm9, xmm4
|
|
pshufd xmm0, xmm0, 0x93
|
|
pshufd xmm8, xmm8, 0x93
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm11, xmm11, 0x4E
|
|
pshufd xmm2, xmm2, 0x39
|
|
pshufd xmm10, xmm10, 0x39
|
|
paddd xmm0, xmm6
|
|
paddd xmm8, xmm14
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
pshuflw xmm11, xmm11, 0xB1
|
|
pshufhw xmm11, xmm11, 0xB1
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm4, 12
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 20
|
|
psrld xmm4, 12
|
|
por xmm9, xmm4
|
|
paddd xmm0, xmm7
|
|
paddd xmm8, xmm15
|
|
paddd xmm0, xmm1
|
|
paddd xmm8, xmm9
|
|
pxor xmm3, xmm0
|
|
pxor xmm11, xmm8
|
|
movdqa xmm13, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm13, 24
|
|
pxor xmm3, xmm13
|
|
movdqa xmm13, xmm11
|
|
psrld xmm11, 8
|
|
pslld xmm13, 24
|
|
pxor xmm11, xmm13
|
|
paddd xmm2, xmm3
|
|
paddd xmm10, xmm11
|
|
pxor xmm1, xmm2
|
|
pxor xmm9, xmm10
|
|
movdqa xmm4, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm4, 7
|
|
por xmm1, xmm4
|
|
movdqa xmm4, xmm9
|
|
pslld xmm9, 25
|
|
psrld xmm4, 7
|
|
por xmm9, xmm4
|
|
pshufd xmm0, xmm0, 0x39
|
|
pshufd xmm8, xmm8, 0x39
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm11, xmm11, 0x4E
|
|
pshufd xmm2, xmm2, 0x93
|
|
pshufd xmm10, xmm10, 0x93
|
|
dec al
|
|
je 9f
|
|
movdqa xmm12, xmmword ptr [rsp+0x20]
|
|
movdqa xmm5, xmmword ptr [rsp+0x40]
|
|
pshufd xmm13, xmm12, 0x0F
|
|
shufps xmm12, xmm5, 214
|
|
pshufd xmm4, xmm12, 0x39
|
|
movdqa xmm12, xmm6
|
|
shufps xmm12, xmm7, 250
|
|
pand xmm13, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
pand xmm12, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
por xmm13, xmm12
|
|
movdqa xmmword ptr [rsp+0x20], xmm13
|
|
movdqa xmm12, xmm7
|
|
punpcklqdq xmm12, xmm5
|
|
movdqa xmm13, xmm6
|
|
pand xmm12, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
pand xmm13, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
por xmm12, xmm13
|
|
pshufd xmm12, xmm12, 0x78
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 0x1E
|
|
movdqa xmmword ptr [rsp+0x40], xmm12
|
|
movdqa xmm5, xmmword ptr [rsp+0x30]
|
|
movdqa xmm13, xmmword ptr [rsp+0x50]
|
|
pshufd xmm6, xmm5, 0x0F
|
|
shufps xmm5, xmm13, 214
|
|
pshufd xmm12, xmm5, 0x39
|
|
movdqa xmm5, xmm14
|
|
shufps xmm5, xmm15, 250
|
|
pand xmm6, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
pand xmm5, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
por xmm6, xmm5
|
|
movdqa xmm5, xmm15
|
|
punpcklqdq xmm5, xmm13
|
|
movdqa xmmword ptr [rsp+0x30], xmm2
|
|
movdqa xmm2, xmm14
|
|
pand xmm5, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
pand xmm2, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
por xmm5, xmm2
|
|
movdqa xmm2, xmmword ptr [rsp+0x30]
|
|
pshufd xmm5, xmm5, 0x78
|
|
punpckhdq xmm13, xmm15
|
|
punpckldq xmm14, xmm13
|
|
pshufd xmm15, xmm14, 0x1E
|
|
movdqa xmm13, xmm6
|
|
movdqa xmm14, xmm5
|
|
movdqa xmm5, xmmword ptr [rsp+0x20]
|
|
movdqa xmm6, xmmword ptr [rsp+0x40]
|
|
jmp 9b
|
|
9:
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
pxor xmm8, xmm10
|
|
pxor xmm9, xmm11
|
|
mov eax, r13d
|
|
cmp rdx, r15
|
|
jne 2b
|
|
movups xmmword ptr [rbx], xmm0
|
|
movups xmmword ptr [rbx+0x10], xmm1
|
|
movups xmmword ptr [rbx+0x20], xmm8
|
|
movups xmmword ptr [rbx+0x30], xmm9
|
|
mov eax, dword ptr [rsp+0x130]
|
|
neg eax
|
|
mov r10d, dword ptr [rsp+0x110+8*rax]
|
|
mov r11d, dword ptr [rsp+0x120+8*rax]
|
|
mov dword ptr [rsp+0x110], r10d
|
|
mov dword ptr [rsp+0x120], r11d
|
|
add rdi, 16
|
|
add rbx, 64
|
|
sub rsi, 2
|
|
3:
|
|
test esi, 0x1
|
|
je 4b
|
|
movups xmm0, xmmword ptr [rcx]
|
|
movups xmm1, xmmword ptr [rcx+0x10]
|
|
movd xmm13, dword ptr [rsp+0x110]
|
|
movd xmm14, dword ptr [rsp+0x120]
|
|
punpckldq xmm13, xmm14
|
|
mov r8, qword ptr [rdi]
|
|
movzx eax, byte ptr [rbp+0x40]
|
|
or eax, r13d
|
|
xor edx, edx
|
|
2:
|
|
mov r14d, eax
|
|
or eax, r12d
|
|
add rdx, 64
|
|
cmp rdx, r15
|
|
cmovne eax, r14d
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
shl rax, 32
|
|
or rax, 64
|
|
movq xmm12, rax
|
|
movdqa xmm3, xmm13
|
|
punpcklqdq xmm3, xmm12
|
|
movups xmm4, xmmword ptr [r8+rdx-0x40]
|
|
movups xmm5, xmmword ptr [r8+rdx-0x30]
|
|
movaps xmm8, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm8, xmm5, 221
|
|
movaps xmm5, xmm8
|
|
movups xmm6, xmmword ptr [r8+rdx-0x20]
|
|
movups xmm7, xmmword ptr [r8+rdx-0x10]
|
|
movaps xmm8, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 0x93
|
|
shufps xmm8, xmm7, 221
|
|
pshufd xmm7, xmm8, 0x93
|
|
mov al, 7
|
|
9:
|
|
paddd xmm0, xmm4
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm5
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x93
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x39
|
|
paddd xmm0, xmm6
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm7
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x39
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x93
|
|
dec al
|
|
jz 9f
|
|
movdqa xmm8, xmm4
|
|
shufps xmm8, xmm5, 214
|
|
pshufd xmm9, xmm4, 0x0F
|
|
pshufd xmm4, xmm8, 0x39
|
|
movdqa xmm8, xmm6
|
|
shufps xmm8, xmm7, 250
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
por xmm9, xmm8
|
|
movdqa xmm8, xmm7
|
|
punpcklqdq xmm8, xmm5
|
|
movdqa xmm10, xmm6
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
por xmm8, xmm10
|
|
pshufd xmm8, xmm8, 0x78
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 0x1E
|
|
movdqa xmm5, xmm9
|
|
movdqa xmm6, xmm8
|
|
jmp 9b
|
|
9:
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
mov eax, r13d
|
|
cmp rdx, r15
|
|
jne 2b
|
|
movups xmmword ptr [rbx], xmm0
|
|
movups xmmword ptr [rbx+0x10], xmm1
|
|
jmp 4b
|
|
|
|
.p2align 6
|
|
zfs_blake3_compress_in_place_sse2:
|
|
_CET_ENDBR
|
|
movups xmm0, xmmword ptr [rdi]
|
|
movups xmm1, xmmword ptr [rdi+0x10]
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
shl r8, 32
|
|
add rdx, r8
|
|
movq xmm3, rcx
|
|
movq xmm4, rdx
|
|
punpcklqdq xmm3, xmm4
|
|
movups xmm4, xmmword ptr [rsi]
|
|
movups xmm5, xmmword ptr [rsi+0x10]
|
|
movaps xmm8, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm8, xmm5, 221
|
|
movaps xmm5, xmm8
|
|
movups xmm6, xmmword ptr [rsi+0x20]
|
|
movups xmm7, xmmword ptr [rsi+0x30]
|
|
movaps xmm8, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 0x93
|
|
shufps xmm8, xmm7, 221
|
|
pshufd xmm7, xmm8, 0x93
|
|
mov al, 7
|
|
9:
|
|
paddd xmm0, xmm4
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm5
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x93
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x39
|
|
paddd xmm0, xmm6
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm7
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x39
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x93
|
|
dec al
|
|
jz 9f
|
|
movdqa xmm8, xmm4
|
|
shufps xmm8, xmm5, 214
|
|
pshufd xmm9, xmm4, 0x0F
|
|
pshufd xmm4, xmm8, 0x39
|
|
movdqa xmm8, xmm6
|
|
shufps xmm8, xmm7, 250
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
por xmm9, xmm8
|
|
movdqa xmm8, xmm7
|
|
punpcklqdq xmm8, xmm5
|
|
movdqa xmm10, xmm6
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
por xmm8, xmm10
|
|
pshufd xmm8, xmm8, 0x78
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 0x1E
|
|
movdqa xmm5, xmm9
|
|
movdqa xmm6, xmm8
|
|
jmp 9b
|
|
9:
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
movups xmmword ptr [rdi], xmm0
|
|
movups xmmword ptr [rdi+0x10], xmm1
|
|
RET
|
|
|
|
.p2align 6
|
|
zfs_blake3_compress_xof_sse2:
|
|
_CET_ENDBR
|
|
movups xmm0, xmmword ptr [rdi]
|
|
movups xmm1, xmmword ptr [rdi+0x10]
|
|
movaps xmm2, xmmword ptr [BLAKE3_IV+rip]
|
|
movzx eax, r8b
|
|
movzx edx, dl
|
|
shl rax, 32
|
|
add rdx, rax
|
|
movq xmm3, rcx
|
|
movq xmm4, rdx
|
|
punpcklqdq xmm3, xmm4
|
|
movups xmm4, xmmword ptr [rsi]
|
|
movups xmm5, xmmword ptr [rsi+0x10]
|
|
movaps xmm8, xmm4
|
|
shufps xmm4, xmm5, 136
|
|
shufps xmm8, xmm5, 221
|
|
movaps xmm5, xmm8
|
|
movups xmm6, xmmword ptr [rsi+0x20]
|
|
movups xmm7, xmmword ptr [rsi+0x30]
|
|
movaps xmm8, xmm6
|
|
shufps xmm6, xmm7, 136
|
|
pshufd xmm6, xmm6, 0x93
|
|
shufps xmm8, xmm7, 221
|
|
pshufd xmm7, xmm8, 0x93
|
|
mov al, 7
|
|
9:
|
|
paddd xmm0, xmm4
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm5
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x93
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x39
|
|
paddd xmm0, xmm6
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
pshuflw xmm3, xmm3, 0xB1
|
|
pshufhw xmm3, xmm3, 0xB1
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 20
|
|
psrld xmm11, 12
|
|
por xmm1, xmm11
|
|
paddd xmm0, xmm7
|
|
paddd xmm0, xmm1
|
|
pxor xmm3, xmm0
|
|
movdqa xmm14, xmm3
|
|
psrld xmm3, 8
|
|
pslld xmm14, 24
|
|
pxor xmm3, xmm14
|
|
paddd xmm2, xmm3
|
|
pxor xmm1, xmm2
|
|
movdqa xmm11, xmm1
|
|
pslld xmm1, 25
|
|
psrld xmm11, 7
|
|
por xmm1, xmm11
|
|
pshufd xmm0, xmm0, 0x39
|
|
pshufd xmm3, xmm3, 0x4E
|
|
pshufd xmm2, xmm2, 0x93
|
|
dec al
|
|
jz 9f
|
|
movdqa xmm8, xmm4
|
|
shufps xmm8, xmm5, 214
|
|
pshufd xmm9, xmm4, 0x0F
|
|
pshufd xmm4, xmm8, 0x39
|
|
movdqa xmm8, xmm6
|
|
shufps xmm8, xmm7, 250
|
|
pand xmm9, xmmword ptr [PBLENDW_0x33_MASK+rip]
|
|
pand xmm8, xmmword ptr [PBLENDW_0xCC_MASK+rip]
|
|
por xmm9, xmm8
|
|
movdqa xmm8, xmm7
|
|
punpcklqdq xmm8, xmm5
|
|
movdqa xmm10, xmm6
|
|
pand xmm8, xmmword ptr [PBLENDW_0x3F_MASK+rip]
|
|
pand xmm10, xmmword ptr [PBLENDW_0xC0_MASK+rip]
|
|
por xmm8, xmm10
|
|
pshufd xmm8, xmm8, 0x78
|
|
punpckhdq xmm5, xmm7
|
|
punpckldq xmm6, xmm5
|
|
pshufd xmm7, xmm6, 0x1E
|
|
movdqa xmm5, xmm9
|
|
movdqa xmm6, xmm8
|
|
jmp 9b
|
|
9:
|
|
movdqu xmm4, xmmword ptr [rdi]
|
|
movdqu xmm5, xmmword ptr [rdi+0x10]
|
|
pxor xmm0, xmm2
|
|
pxor xmm1, xmm3
|
|
pxor xmm2, xmm4
|
|
pxor xmm3, xmm5
|
|
movups xmmword ptr [r9], xmm0
|
|
movups xmmword ptr [r9+0x10], xmm1
|
|
movups xmmword ptr [r9+0x20], xmm2
|
|
movups xmmword ptr [r9+0x30], xmm3
|
|
RET
|
|
|
|
.size zfs_blake3_hash_many_sse2, . - zfs_blake3_hash_many_sse2
|
|
.size zfs_blake3_compress_in_place_sse2, . - zfs_blake3_compress_in_place_sse2
|
|
.size zfs_blake3_compress_xof_sse2, . - zfs_blake3_compress_xof_sse2
|
|
|
|
#ifdef __APPLE__
|
|
.static_data
|
|
#else
|
|
.section .rodata
|
|
#endif
|
|
.p2align 6
|
|
BLAKE3_IV:
|
|
.long 0x6A09E667, 0xBB67AE85
|
|
.long 0x3C6EF372, 0xA54FF53A
|
|
ADD0:
|
|
.long 0, 1, 2, 3
|
|
ADD1:
|
|
.long 4, 4, 4, 4
|
|
BLAKE3_IV_0:
|
|
.long 0x6A09E667, 0x6A09E667, 0x6A09E667, 0x6A09E667
|
|
BLAKE3_IV_1:
|
|
.long 0xBB67AE85, 0xBB67AE85, 0xBB67AE85, 0xBB67AE85
|
|
BLAKE3_IV_2:
|
|
.long 0x3C6EF372, 0x3C6EF372, 0x3C6EF372, 0x3C6EF372
|
|
BLAKE3_IV_3:
|
|
.long 0xA54FF53A, 0xA54FF53A, 0xA54FF53A, 0xA54FF53A
|
|
BLAKE3_BLOCK_LEN:
|
|
.long 64, 64, 64, 64
|
|
CMP_MSB_MASK:
|
|
.long 0x80000000, 0x80000000, 0x80000000, 0x80000000
|
|
PBLENDW_0x33_MASK:
|
|
.long 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000
|
|
PBLENDW_0xCC_MASK:
|
|
.long 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF
|
|
PBLENDW_0x3F_MASK:
|
|
.long 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000
|
|
PBLENDW_0xC0_MASK:
|
|
.long 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF
|
|
|
|
#endif /* HAVE_SSE2 */
|
|
|
|
#ifdef __ELF__
|
|
.section .note.GNU-stack,"",%progbits
|
|
#endif
|