/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 */

#include <linux/linkage.h>
#include <asm/frame.h>

.section .rodata.cst16.ROT8, "aM", @progbits, 16
.align 16
ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
.section .rodata.cst16.ROT16, "aM", @progbits, 16
.align 16
ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
.section .rodata.cst16.CTRINC, "aM", @progbits, 16
.align 16
CTRINC:	.octa 0x00000003000000020000000100000000

.text

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3. This
 * function performs matrix operations on four words in parallel, but requires
 * shuffling to rearrange the words after each round. 8/16-bit word rotation is
 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
 * rotation uses traditional shift+OR.
 *
 * The round count is given in %r8d.
 *
 * Clobbers: %r8d, %xmm4-%xmm7
 */
SYM_FUNC_START_LOCAL(chacha_permute)

	movdqa		ROT8(%rip),%xmm4
	movdqa		ROT16(%rip),%xmm5

.Ldoubleround:
	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm3,%xmm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm5,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm6
	pslld		$12,%xmm6
	psrld		$20,%xmm1
	por		%xmm6,%xmm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	paddd		%xmm1,%xmm0
	pxor		%xmm0,%xmm3
	pshufb		%xmm4,%xmm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	paddd		%xmm3,%xmm2
	pxor		%xmm2,%xmm1
	movdqa		%xmm1,%xmm7
	pslld		$7,%xmm7
	psrld		$25,%xmm1
	por		%xmm7,%xmm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	pshufd		$0x93,%xmm1,%xmm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	pshufd		$0x4e,%xmm2,%xmm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	pshufd		$0x39,%xmm3,%xmm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	RET
SYM_FUNC_END(chacha_permute)

SYM_FUNC_START(chacha_block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 1 data block output, o
	# %rdx: up to 1 data block input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds
	FRAME_BEGIN

	# x0..3 = s0..3
	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3
	movdqa		%xmm0,%xmm8
	movdqa		%xmm1,%xmm9
	movdqa		%xmm2,%xmm10
	movdqa		%xmm3,%xmm11

	mov		%rcx,%rax
	call		chacha_permute

	# o0 = i0 ^ (x0 + s0)
	paddd		%xmm8,%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart
	movdqu		0x00(%rdx),%xmm4
	pxor		%xmm4,%xmm0
	movdqu		%xmm0,0x00(%rsi)
	# o1 = i1 ^ (x1 + s1)
	paddd		%xmm9,%xmm1
	movdqa		%xmm1,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart
	movdqu		0x10(%rdx),%xmm0
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)
	# o2 = i2 ^ (x2 + s2)
	paddd		%xmm10,%xmm2
	movdqa		%xmm2,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart
	movdqu		0x20(%rdx),%xmm0
	pxor		%xmm2,%xmm0
	movdqu		%xmm0,0x20(%rsi)
	# o3 = i3 ^ (x3 + s3)
	paddd		%xmm11,%xmm3
	movdqa		%xmm3,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart
	movdqu		0x30(%rdx),%xmm0
	pxor		%xmm3,%xmm0
	movdqu		%xmm0,0x30(%rsi)

.Ldone:
	FRAME_END
	RET

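	# The tail handling below is roughly the following C, given here only
	# as an illustrative sketch ("buf" names the aligned stack scratch
	# area set up below, "keystream" the block left in %xmm0, and
	# xor_block16() a hypothetical helper standing in for the pxor/movdqa
	# pair):
	#
	#	size_t part = len & 0x0f;	/* leftover byte count        */
	#	size_t off  = len & ~0x0fUL;	/* offset of the partial chunk */
	#	memcpy(buf, in + off, part);
	#	xor_block16(buf, keystream);	/* XOR all 16 bytes on stack   */
	#	memcpy(out + off, buf, part);	/* copy back only the tail     */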
.Lxorpart:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	lea		-8(%r10),%rsp
	jmp		.Ldone

SYM_FUNC_END(chacha_block_xor_ssse3)

SYM_FUNC_START(hchacha_block_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: output (8 32-bit words)
	# %edx: nrounds
	FRAME_BEGIN

	movdqu		0x00(%rdi),%xmm0
	movdqu		0x10(%rdi),%xmm1
	movdqu		0x20(%rdi),%xmm2
	movdqu		0x30(%rdi),%xmm3

	mov		%edx,%r8d
	call		chacha_permute

	movdqu		%xmm0,0x00(%rsi)
	movdqu		%xmm3,0x10(%rsi)

	FRAME_END
	RET
SYM_FUNC_END(hchacha_block_ssse3)

SYM_FUNC_START(chacha_4block_xor_ssse3)
	# %rdi: Input state matrix, s
	# %rsi: up to 4 data blocks output, o
	# %rdx: up to 4 data blocks input, i
	# %rcx: input/output length in bytes
	# %r8d: nrounds

	# This function encrypts four consecutive ChaCha blocks by loading the
	# state matrix in SSE registers four times. As we need some scratch
	# registers, we save the first four registers on the stack. The
	# algorithm performs each operation on the corresponding word of each
	# state matrix, hence requires no word shuffling. For the final XORing
	# step we transpose the matrix by interleaving 32- and then 64-bit
	# words, which allows us to do XOR in SSE registers. 8/16-bit word
	# rotation is done with the slightly better performing SSSE3 byte
	# shuffling, 7/12-bit word rotation uses traditional shift+OR.
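	#
	# As a hedged C-like sketch of the data layout only (the "v4" vector
	# type and the loop are illustrative, not code used elsewhere):
	#
	#	v4 x[16];				/* 4 x u32 lanes     */
	#	for (i = 0; i < 16; i++)
	#		x[i] = (v4){ s[i], s[i], s[i], s[i] };	/* broadcast */
	#	x[12] += (v4){ 0, 1, 2, 3 };	/* CTRINC: per-block counters */
	#
	# so lane b of every x[i] always holds state word i of block b, and
	# the double rounds below operate lane-wise.
	#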
	lea		8(%rsp),%r10
	sub		$0x80,%rsp
	and		$~63,%rsp
	mov		%rcx,%rax

	# x0..15[0-3] = s0..3[0..3]
	movq		0x00(%rdi),%xmm1
	pshufd		$0x00,%xmm1,%xmm0
	pshufd		$0x55,%xmm1,%xmm1
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	movq		0x10(%rdi),%xmm5
	pshufd		$0x00,%xmm5,%xmm4
	pshufd		$0x55,%xmm5,%xmm5
	movq		0x18(%rdi),%xmm7
	pshufd		$0x00,%xmm7,%xmm6
	pshufd		$0x55,%xmm7,%xmm7
	movq		0x20(%rdi),%xmm9
	pshufd		$0x00,%xmm9,%xmm8
	pshufd		$0x55,%xmm9,%xmm9
	movq		0x28(%rdi),%xmm11
	pshufd		$0x00,%xmm11,%xmm10
	pshufd		$0x55,%xmm11,%xmm11
	movq		0x30(%rdi),%xmm13
	pshufd		$0x00,%xmm13,%xmm12
	pshufd		$0x55,%xmm13,%xmm13
	movq		0x38(%rdi),%xmm15
	pshufd		$0x00,%xmm15,%xmm14
	pshufd		$0x55,%xmm15,%xmm15
	# x0..3 on stack
	movdqa		%xmm0,0x00(%rsp)
	movdqa		%xmm1,0x10(%rsp)
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm3,0x30(%rsp)

	movdqa		CTRINC(%rip),%xmm1
	movdqa		ROT8(%rip),%xmm2
	movdqa		ROT16(%rip),%xmm3

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

.Ldoubleround4:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	paddd		%xmm12,%xmm8
	pxor		%xmm8,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	paddd		%xmm13,%xmm9
	pxor		%xmm9,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	paddd		%xmm14,%xmm10
	pxor		%xmm10,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	paddd		%xmm15,%xmm11
	pxor		%xmm11,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm3,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm3,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm3,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm3,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$12,%xmm0
	psrld		$20,%xmm4
	por		%xmm0,%xmm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	movdqa		0x00(%rsp),%xmm0
	paddd		%xmm5,%xmm0
	movdqa		%xmm0,0x00(%rsp)
	pxor		%xmm0,%xmm15
	pshufb		%xmm2,%xmm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	movdqa		0x10(%rsp),%xmm0
	paddd		%xmm6,%xmm0
	movdqa		%xmm0,0x10(%rsp)
	pxor		%xmm0,%xmm12
	pshufb		%xmm2,%xmm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	movdqa		0x20(%rsp),%xmm0
	paddd		%xmm7,%xmm0
	movdqa		%xmm0,0x20(%rsp)
	pxor		%xmm0,%xmm13
	pshufb		%xmm2,%xmm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	movdqa		0x30(%rsp),%xmm0
	paddd		%xmm4,%xmm0
	movdqa		%xmm0,0x30(%rsp)
	pxor		%xmm0,%xmm14
	pshufb		%xmm2,%xmm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	paddd		%xmm15,%xmm10
	pxor		%xmm10,%xmm5
	movdqa		%xmm5,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm5
	por		%xmm0,%xmm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	paddd		%xmm12,%xmm11
	pxor		%xmm11,%xmm6
	movdqa		%xmm6,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm6
	por		%xmm0,%xmm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	paddd		%xmm13,%xmm8
	pxor		%xmm8,%xmm7
	movdqa		%xmm7,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm7
	por		%xmm0,%xmm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	paddd		%xmm14,%xmm9
	pxor		%xmm9,%xmm4
	movdqa		%xmm4,%xmm0
	pslld		$7,%xmm0
	psrld		$25,%xmm4
	por		%xmm0,%xmm4

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# x0[0-3] += s0[0]
	# x1[0-3] += s0[1]
	movq		0x00(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x00(%rsp),%xmm2
	movdqa		%xmm2,0x00(%rsp)
	paddd		0x10(%rsp),%xmm3
	movdqa		%xmm3,0x10(%rsp)
	# x2[0-3] += s0[2]
	# x3[0-3] += s0[3]
	movq		0x08(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		0x20(%rsp),%xmm2
	movdqa		%xmm2,0x20(%rsp)
	paddd		0x30(%rsp),%xmm3
	movdqa		%xmm3,0x30(%rsp)

	# x4[0-3] += s1[0]
	# x5[0-3] += s1[1]
	movq		0x10(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm4
	paddd		%xmm3,%xmm5
	# x6[0-3] += s1[2]
	# x7[0-3] += s1[3]
	movq		0x18(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm6
	paddd		%xmm3,%xmm7

	# x8[0-3] += s2[0]
	# x9[0-3] += s2[1]
	movq		0x20(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm8
	paddd		%xmm3,%xmm9
	# x10[0-3] += s2[2]
	# x11[0-3] += s2[3]
	movq		0x28(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm10
	paddd		%xmm3,%xmm11

	# x12[0-3] += s3[0]
	# x13[0-3] += s3[1]
	movq		0x30(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm12
	paddd		%xmm3,%xmm13
	# x14[0-3] += s3[2]
	# x15[0-3] += s3[3]
	movq		0x38(%rdi),%xmm3
	pshufd		$0x00,%xmm3,%xmm2
	pshufd		$0x55,%xmm3,%xmm3
	paddd		%xmm2,%xmm14
	paddd		%xmm3,%xmm15

	# x12 += counter values 0-3
	paddd		%xmm1,%xmm12

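	# The two interleave passes below amount to a 4x4 transpose of 32-bit
	# words: before them, vector n holds word n of all four blocks; after
	# them, each vector holds four consecutive words of a single block and
	# can be XORed directly against a 16-byte chunk of input. A rough
	# sketch for one group of four vectors, with lanes written as
	# block:word (the resulting block order is what the store/XOR sequence
	# further below relies on):
	#
	#	{ 0:0 1:0 2:0 3:0 }		{ 0:0 0:1 0:2 0:3 }   block 0
	#	{ 0:1 1:1 2:1 3:1 }   --->	{ 2:0 2:1 2:2 2:3 }   block 2
	#	{ 0:2 1:2 2:2 3:2 }		{ 1:0 1:1 1:2 1:3 }   block 1
	#	{ 0:3 1:3 2:3 3:3 }		{ 3:0 3:1 3:2 3:3 }   block 3
	#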
	# interleave 32-bit words in state n, n+1
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x10(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x10(%rsp)
	movdqa		0x20(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpckldq	%xmm1,%xmm2
	punpckhdq	%xmm1,%xmm0
	movdqa		%xmm2,0x20(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpckldq	%xmm5,%xmm4
	punpckhdq	%xmm5,%xmm0
	movdqa		%xmm0,%xmm5
	movdqa		%xmm6,%xmm0
	punpckldq	%xmm7,%xmm6
	punpckhdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm9,%xmm0
	movdqa		%xmm0,%xmm9
	movdqa		%xmm10,%xmm0
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpckldq	%xmm13,%xmm12
	punpckhdq	%xmm13,%xmm0
	movdqa		%xmm0,%xmm13
	movdqa		%xmm14,%xmm0
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# interleave 64-bit words in state n, n+2
	movdqa		0x00(%rsp),%xmm0
	movdqa		0x20(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x00(%rsp)
	movdqa		%xmm0,0x20(%rsp)
	movdqa		0x10(%rsp),%xmm0
	movdqa		0x30(%rsp),%xmm1
	movdqa		%xmm0,%xmm2
	punpcklqdq	%xmm1,%xmm2
	punpckhqdq	%xmm1,%xmm0
	movdqa		%xmm2,0x10(%rsp)
	movdqa		%xmm0,0x30(%rsp)
	movdqa		%xmm4,%xmm0
	punpcklqdq	%xmm6,%xmm4
	punpckhqdq	%xmm6,%xmm0
	movdqa		%xmm0,%xmm6
	movdqa		%xmm5,%xmm0
	punpcklqdq	%xmm7,%xmm5
	punpckhqdq	%xmm7,%xmm0
	movdqa		%xmm0,%xmm7
	movdqa		%xmm8,%xmm0
	punpcklqdq	%xmm10,%xmm8
	punpckhqdq	%xmm10,%xmm0
	movdqa		%xmm0,%xmm10
	movdqa		%xmm9,%xmm0
	punpcklqdq	%xmm11,%xmm9
	punpckhqdq	%xmm11,%xmm0
	movdqa		%xmm0,%xmm11
	movdqa		%xmm12,%xmm0
	punpcklqdq	%xmm14,%xmm12
	punpckhqdq	%xmm14,%xmm0
	movdqa		%xmm0,%xmm14
	movdqa		%xmm13,%xmm0
	punpcklqdq	%xmm15,%xmm13
	punpckhqdq	%xmm15,%xmm0
	movdqa		%xmm0,%xmm15

	# xor with corresponding input, write to output
	movdqa		0x00(%rsp),%xmm0
	cmp		$0x10,%rax
	jl		.Lxorpart4
	movdqu		0x00(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x00(%rsi)

	movdqu		%xmm4,%xmm0
	cmp		$0x20,%rax
	jl		.Lxorpart4
	movdqu		0x10(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x10(%rsi)

	movdqu		%xmm8,%xmm0
	cmp		$0x30,%rax
	jl		.Lxorpart4
	movdqu		0x20(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x20(%rsi)

	movdqu		%xmm12,%xmm0
	cmp		$0x40,%rax
	jl		.Lxorpart4
	movdqu		0x30(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x30(%rsi)

	movdqa		0x20(%rsp),%xmm0
	cmp		$0x50,%rax
	jl		.Lxorpart4
	movdqu		0x40(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x40(%rsi)

	movdqu		%xmm6,%xmm0
	cmp		$0x60,%rax
	jl		.Lxorpart4
	movdqu		0x50(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x50(%rsi)

	movdqu		%xmm10,%xmm0
	cmp		$0x70,%rax
	jl		.Lxorpart4
	movdqu		0x60(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x60(%rsi)

	movdqu		%xmm14,%xmm0
	cmp		$0x80,%rax
	jl		.Lxorpart4
	movdqu		0x70(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x70(%rsi)

	movdqa		0x10(%rsp),%xmm0
	cmp		$0x90,%rax
	jl		.Lxorpart4
	movdqu		0x80(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x80(%rsi)

	movdqu		%xmm5,%xmm0
	cmp		$0xa0,%rax
	jl		.Lxorpart4
	movdqu		0x90(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0x90(%rsi)

	movdqu		%xmm9,%xmm0
	cmp		$0xb0,%rax
	jl		.Lxorpart4
	movdqu		0xa0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xa0(%rsi)

	movdqu		%xmm13,%xmm0
	cmp		$0xc0,%rax
	jl		.Lxorpart4
	movdqu		0xb0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xb0(%rsi)

	movdqa		0x30(%rsp),%xmm0
	cmp		$0xd0,%rax
	jl		.Lxorpart4
	movdqu		0xc0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xc0(%rsi)

	movdqu		%xmm7,%xmm0
	cmp		$0xe0,%rax
	jl		.Lxorpart4
	movdqu		0xd0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xd0(%rsi)

	movdqu		%xmm11,%xmm0
	cmp		$0xf0,%rax
	jl		.Lxorpart4
	movdqu		0xe0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xe0(%rsi)

	movdqu		%xmm15,%xmm0
	cmp		$0x100,%rax
	jl		.Lxorpart4
	movdqu		0xf0(%rdx),%xmm1
	pxor		%xmm1,%xmm0
	movdqu		%xmm0,0xf0(%rsi)

.Ldone4:
	lea		-8(%r10),%rsp
	RET

.Lxorpart4:
	# xor remaining bytes from partial register into output
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	and		$~0x0f,%rax

	mov		%rsi,%r11

	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	pxor		0x00(%rsp),%xmm0
	movdqa		%xmm0,0x00(%rsp)

	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone4

SYM_FUNC_END(chacha_4block_xor_ssse3)