/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>

.section .rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

.section .rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

.section .rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

.section .rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

.section .rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003

.text

SYM_FUNC_START(chacha_2block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 2 data blocks output, o
	# %rdx: up to 2 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts two ChaCha blocks by loading the state
	# matrix twice across four AVX registers. It performs matrix operations
	# on four words in each matrix in parallel, but requires shuffling to
	# rearrange the words after each round.
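	#
	# For reference, the ChaCha quarter-round that each double round
	# below applies first to the columns and then to the diagonals of
	# the state (C-like sketch):
	#
	#	a += b; d ^= a; d = rotl32(d, 16);
	#	c += d; b ^= c; b = rotl32(b, 12);
	#	a += b; d ^= a; d = rotl32(d, 8);
	#	c += d; b ^= c; b = rotl32(b, 7);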

	vzeroupper

	# x0..3[0-1] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	mov		%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround
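
	# The keystream is x + s (feed-forward); the low 128-bit lane of
	# each sum holds a row of the first block, the high lane the same
	# row of the second. Every 16-byte chunk is bounds-checked against
	# the requested length in %rax, branching to .Lxorpart2 once less
	# than a full chunk remains.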
	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jl		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jl		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jl		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jl		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jl		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jl		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jl		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jl		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	RET

.Lxorpart2:
	# xor remaining bytes from partial register into output
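	# Strategy: round the byte count in %rax down to a 16-byte
	# boundary, copy the remaining input bytes to an aligned scratch
	# slot on the stack, XOR the partial keystream register in %xmm7
	# into them, then copy the result to the output. %r10 preserves
	# the original stack pointer.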
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone2
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone2

SYM_FUNC_END(chacha_2block_xor_avx2)

SYM_FUNC_START(chacha_4block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four ChaCha blocks by loading the state
	# matrix four times across eight AVX registers. It performs matrix
	# operations on four words in two matrices in parallel, sequentially
	# to the operations on the four words of the other two matrices.
	# Since the required word shuffling has a rather high latency, we
	# can do the arithmetic on two matrix-pairs without much slowdown.
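	#
	# %ymm0..3 hold the working state for the first block pair (low
	# and high 128-bit lane respectively), %ymm4..7 the state for the
	# second pair. %ymm11..13 keep the saved first three input rows,
	# %ymm14/%ymm15 the counter-adjusted fourth row for each pair;
	# %ymm8/%ymm9 hold the ROT8/ROT16 masks and %ymm10 is scratch.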

	vzeroupper

	# x0..3[0-3] = s0..3
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jl		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jl		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jl		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jl		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jl		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jl		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jl		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jl		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)
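
	# The third and fourth blocks are produced the same way from the
	# second state pair in %ymm4..7; row 3 uses %ymm15, the saved copy
	# holding the counters for this pair.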
	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jl		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jl		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_avx2)

SYM_FUNC_START(chacha_8block_xor_avx2)
	# %rdi: Input state matrix, s
	# %rsi: up to 8 data blocks output, o
	# %rdx: up to 8 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts eight consecutive ChaCha blocks by loading
	# the state matrix in AVX registers eight times. As we need some
	# scratch registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XOR
	# step, we transpose the matrix by interleaving 32-, 64- and then
	# 128-bit words, which allows us to do the XOR in AVX registers.
	# 8/16-bit word rotation is done with the slightly better performing
	# byte shuffling; 7/12-bit word rotation uses traditional shift+OR.
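	#
	# During the double rounds, x0..x3 live in the four 32-byte stack
	# slots at 0x00/0x20/0x40/0x60(%rsp) and x4..x15 in %ymm4..%ymm15;
	# %ymm0 is scratch, %ymm1 holds CTRINC, %ymm2 ROT8 and %ymm3 ROT16.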

	vzeroupper
	# 4 * 32 byte stack, 32-byte aligned
	lea		8(%rsp),%r10
	and		$~31, %rsp
	sub		$0x80, %rsp
	mov		%rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa		%ymm0,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm3,0x60(%rsp)

	vmovdqa		CTRINC(%rip),%ymm1
	vmovdqa		ROT8(%rip),%ymm2
	vmovdqa		ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12
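
	# Each iteration of the loop below performs one double round:
	# quarter-rounds on the four columns (x0,x4,x8,x12)..(x3,x7,x11,x15),
	# followed by quarter-rounds on the four diagonals
	# (x0,x5,x10,x15)..(x3,x4,x9,x14).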
.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
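
	# second half of the double round: the same quarter-round applied
	# to the diagonals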
	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd		0x20(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd		0x40(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd		0x60(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd		%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd		%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd		%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd		%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd		%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd		%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd		%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd		%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd		%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12
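
	# At this point each register (or stack slot) holds one state word
	# for all eight blocks, one dword per 32-bit lane. The 32-, 64- and
	# 128-bit interleaves below transpose this layout so that each
	# register ends up with 32 contiguous keystream bytes belonging to
	# a single block.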
	# interleave 32-bit words in state n, n+1
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm1,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa		%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa		%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa		%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa		%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15
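
	# vperm2i128 $0x20 combines the low 128-bit lanes of its two
	# sources (block n), $0x31 the high lanes (block n+4).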
	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa		0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp		$0x0020,%rax
	jl		.Lxorpart8
	vpxor		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0000(%rsi)
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rax
	jl		.Lxorpart8
	vpxor		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa		0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp		$0x0060,%rax
	jl		.Lxorpart8
	vpxor		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rax
	jl		.Lxorpart8
	vpxor		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa		0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rax
	jl		.Lxorpart8
	vpxor		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rax
	jl		.Lxorpart8
	vpxor		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa		0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp		$0x00e0,%rax
	jl		.Lxorpart8
	vpxor		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rax
	jl		.Lxorpart8
	vpxor		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa		%ymm4,%ymm0
	cmp		$0x0120,%rax
	jl		.Lxorpart8
	vpxor		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0100(%rsi)

	vmovdqa		%ymm12,%ymm0
	cmp		$0x0140,%rax
	jl		.Lxorpart8
	vpxor		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0120(%rsi)

	vmovdqa		%ymm6,%ymm0
	cmp		$0x0160,%rax
	jl		.Lxorpart8
	vpxor		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0140(%rsi)

	vmovdqa		%ymm14,%ymm0
	cmp		$0x0180,%rax
	jl		.Lxorpart8
	vpxor		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0160(%rsi)

	vmovdqa		%ymm5,%ymm0
	cmp		$0x01a0,%rax
	jl		.Lxorpart8
	vpxor		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0180(%rsi)

	vmovdqa		%ymm13,%ymm0
	cmp		$0x01c0,%rax
	jl		.Lxorpart8
	vpxor		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01a0(%rsi)

	vmovdqa		%ymm7,%ymm0
	cmp		$0x01e0,%rax
	jl		.Lxorpart8
	vpxor		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01c0(%rsi)

	vmovdqa		%ymm15,%ymm0
	cmp		$0x0200,%rax
	jl		.Lxorpart8
	vpxor		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	lea		-8(%r10),%rsp
	RET

.Lxorpart8:
	# xor remaining bytes from partial register into output
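	# The stack is already 32-byte aligned with scratch space at
	# 0x00(%rsp) from the function prologue, so the tail bytes are
	# bounced through it directly; chunks are 32 bytes here, hence
	# the 0x1f mask.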
	mov		%rax,%r9
	and		$0x1f,%r9
	jz		.Ldone8
	and		$~0x1f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	vpxor		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone8

SYM_FUNC_END(chacha_8block_xor_avx2)