#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
#
# Copyright (C) 2017-2018 Samuel Neves <sneves@dei.uc.pt>. All Rights Reserved.
# Copyright (C) 2017-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
# Copyright (C) 2006-2017 CRYPTOGAMS by <appro@openssl.org>. All Rights Reserved.
#
# This code is taken from the OpenSSL project but the author, Andy Polyakov,
# has relicensed it under the licenses specified in the SPDX header above.
# The original headers, including the original license headers, are
# included below for completeness.
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Initial release.
#
# December 2016
#
# Add AVX512F+VL+BW code path.
#
# November 2017
#
# Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be
# executed even on Knights Landing. Trigger for modification was
# observation that AVX512 code paths can negatively affect overall
# Skylake-X system performance. Since we are likely to suppress
# AVX512F capability flag [at least on Skylake-X], conversion serves
# as kind of "investment protection". Note that next *lake processor,
# Cannonlake, has AVX512IFMA code path to execute...
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
#		IALU/gcc-4.8(*)	AVX(**)		AVX2	AVX-512
# P4		4.46/+120%	-
# Core 2	2.41/+90%	-
# Westmere	1.88/+120%	-
# Sandy Bridge	1.39/+140%	1.10
# Haswell	1.14/+175%	1.11		0.65
# Skylake[-X]	1.13/+120%	0.96		0.51	[0.35]
# Silvermont	2.83/+95%	-
# Knights L	3.60/?		1.65		1.10	0.41(***)
# Goldmont	1.70/+180%	-
# VIA Nano	1.82/+150%	-
# Sledgehammer	1.38/+160%	-
# Bulldozer	2.30/+130%	0.97
# Ryzen		1.15/+200%	1.08		1.18
#
# (*)	improvement coefficients relative to clang are more modest and
#	are ~50% on most processors, in both cases we are comparing to
#	__int128 code;
# (**)	SSE2 implementation was attempted, but among non-AVX processors
#	it was faster than integer-only code only on older Intel P4 and
#	Core processors, 50-30%, less newer processor is, but slower on
#	contemporary ones, for example almost 2x slower on Atom, and as
#	former are naturally disappearing, SSE2 is deemed unnecessary;
# (***)	strangely enough performance seems to vary from core to core,
#	listed result is best case;

# Command-line handling: either "flavour output" or a single output path.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$kernel=0; $kernel=1 if (!$flavour && !$output);

if (!$kernel) {
	# Locate the perlasm translator and pipe our output through it.
	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
	*STDOUT=*OUT;

	# Probe assembler capabilities to pick the widest usable SIMD path:
	# $avx = 0 (none), 1 (AVX), 2 (AVX2), 3 (AVX-512).
	if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
		$avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25);
	}

	if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
		$avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12);
		$avx += 1 if ($1==2.11 && $2>=8);
	}

	if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
		$avx = ($1>=10) + ($1>=11);
	}

	if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
		$avx = ($2>=3.0) + ($2>3.0);
	}
} else {
	$avx = 4; # The kernel uses ifdefs for this.
}

# Emit a function-start directive block for $name. In kernel builds the
# SYM_FUNC_START macro (from <linux/linkage.h>) handles globl/type/CFI;
# otherwise classic ELF directives are produced. $nargs is only used by
# the non-kernel .type annotation consumed by x86_64-xlate.pl.
sub declare_function() {
	my ($name, $align, $nargs) = @_;
	if($kernel) {
		$code .= ".align $align\n";
		$code .= "SYM_FUNC_START($name)\n";
		$code .= ".L$name:\n";
	} else {
		$code .= ".globl $name\n";
		$code .= ".type $name,\@function,$nargs\n";
		$code .= ".align $align\n";
		$code .= "$name:\n";
	}
}

# Emit the matching function-end directive for $name.
sub end_function() {
	my ($name) = @_;
	if($kernel) {
		$code .= "SYM_FUNC_END($name)\n";
	} else {
		$code .= ".size $name,.-$name\n";
	}
}

$code.=<<___ if $kernel;
#include <linux/linkage.h>
___

if ($avx) {
$code.=<<___ if $kernel;
.section .rodata
___
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	`1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
___
}
$code.=<<___ if (!$kernel);
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Argument/scratch register assignments shared by all scalar code paths.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
my ($h0,$h1,$h2)=("%r14","%rbx","%r10");

# One scalar Poly1305 multiply-and-reduce step.
sub poly1305_iteration {
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2
___
}

########################################################################
# Layout of opaque area is following.
#
# unsigned __int64 h[3];	# current hash value base 2^64
# unsigned __int64 r[2];	# key value base 2^64

$code.=<<___;
.text
___
$code.=<<___ if (!$kernel);
.extern	OPENSSL_ia32cap_P

.globl	poly1305_init_x86_64
.hidden	poly1305_init_x86_64
.globl	poly1305_blocks_x86_64
.hidden	poly1305_blocks_x86_64
.globl	poly1305_emit_x86_64
.hidden	poly1305_emit_x86_64
___
&declare_function("poly1305_init_x86_64", 32, 3);
$code.=<<___;
	xor	%eax,%eax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key
___
$code.=<<___ if (!$kernel);
	lea	poly1305_blocks_x86_64(%rip),%r10
	lea	poly1305_emit_x86_64(%rip),%r11
___
$code.=<<___	if (!$kernel && $avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if (!$kernel && $avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
272*4882a593Smuzhiyun cmovc %rax,%r10 273*4882a593Smuzhiyun___ 274*4882a593Smuzhiyun$code.=<<___ if (!$kernel && $avx>3); 275*4882a593Smuzhiyun mov \$`(1<<31|1<<21|1<<16)`,%rax 276*4882a593Smuzhiyun shr \$32,%r9 277*4882a593Smuzhiyun and %rax,%r9 278*4882a593Smuzhiyun cmp %rax,%r9 279*4882a593Smuzhiyun je .Linit_base2_44 280*4882a593Smuzhiyun___ 281*4882a593Smuzhiyun$code.=<<___; 282*4882a593Smuzhiyun mov \$0x0ffffffc0fffffff,%rax 283*4882a593Smuzhiyun mov \$0x0ffffffc0ffffffc,%rcx 284*4882a593Smuzhiyun and 0($inp),%rax 285*4882a593Smuzhiyun and 8($inp),%rcx 286*4882a593Smuzhiyun mov %rax,24($ctx) 287*4882a593Smuzhiyun mov %rcx,32($ctx) 288*4882a593Smuzhiyun___ 289*4882a593Smuzhiyun$code.=<<___ if (!$kernel && $flavour !~ /elf32/); 290*4882a593Smuzhiyun mov %r10,0(%rdx) 291*4882a593Smuzhiyun mov %r11,8(%rdx) 292*4882a593Smuzhiyun___ 293*4882a593Smuzhiyun$code.=<<___ if (!$kernel && $flavour =~ /elf32/); 294*4882a593Smuzhiyun mov %r10d,0(%rdx) 295*4882a593Smuzhiyun mov %r11d,4(%rdx) 296*4882a593Smuzhiyun___ 297*4882a593Smuzhiyun$code.=<<___; 298*4882a593Smuzhiyun mov \$1,%eax 299*4882a593Smuzhiyun.Lno_key: 300*4882a593Smuzhiyun RET 301*4882a593Smuzhiyun___ 302*4882a593Smuzhiyun&end_function("poly1305_init_x86_64"); 303*4882a593Smuzhiyun 304*4882a593Smuzhiyun&declare_function("poly1305_blocks_x86_64", 32, 4); 305*4882a593Smuzhiyun$code.=<<___; 306*4882a593Smuzhiyun.cfi_startproc 307*4882a593Smuzhiyun.Lblocks: 308*4882a593Smuzhiyun shr \$4,$len 309*4882a593Smuzhiyun jz .Lno_data # too short 310*4882a593Smuzhiyun 311*4882a593Smuzhiyun push %rbx 312*4882a593Smuzhiyun.cfi_push %rbx 313*4882a593Smuzhiyun push %r12 314*4882a593Smuzhiyun.cfi_push %r12 315*4882a593Smuzhiyun push %r13 316*4882a593Smuzhiyun.cfi_push %r13 317*4882a593Smuzhiyun push %r14 318*4882a593Smuzhiyun.cfi_push %r14 319*4882a593Smuzhiyun push %r15 320*4882a593Smuzhiyun.cfi_push %r15 321*4882a593Smuzhiyun push $ctx 322*4882a593Smuzhiyun.cfi_push $ctx 323*4882a593Smuzhiyun.Lblocks_body: 
324*4882a593Smuzhiyun 325*4882a593Smuzhiyun mov $len,%r15 # reassign $len 326*4882a593Smuzhiyun 327*4882a593Smuzhiyun mov 24($ctx),$r0 # load r 328*4882a593Smuzhiyun mov 32($ctx),$s1 329*4882a593Smuzhiyun 330*4882a593Smuzhiyun mov 0($ctx),$h0 # load hash value 331*4882a593Smuzhiyun mov 8($ctx),$h1 332*4882a593Smuzhiyun mov 16($ctx),$h2 333*4882a593Smuzhiyun 334*4882a593Smuzhiyun mov $s1,$r1 335*4882a593Smuzhiyun shr \$2,$s1 336*4882a593Smuzhiyun mov $r1,%rax 337*4882a593Smuzhiyun add $r1,$s1 # s1 = r1 + (r1 >> 2) 338*4882a593Smuzhiyun jmp .Loop 339*4882a593Smuzhiyun 340*4882a593Smuzhiyun.align 32 341*4882a593Smuzhiyun.Loop: 342*4882a593Smuzhiyun add 0($inp),$h0 # accumulate input 343*4882a593Smuzhiyun adc 8($inp),$h1 344*4882a593Smuzhiyun lea 16($inp),$inp 345*4882a593Smuzhiyun adc $padbit,$h2 346*4882a593Smuzhiyun___ 347*4882a593Smuzhiyun 348*4882a593Smuzhiyun &poly1305_iteration(); 349*4882a593Smuzhiyun 350*4882a593Smuzhiyun$code.=<<___; 351*4882a593Smuzhiyun mov $r1,%rax 352*4882a593Smuzhiyun dec %r15 # len-=16 353*4882a593Smuzhiyun jnz .Loop 354*4882a593Smuzhiyun 355*4882a593Smuzhiyun mov 0(%rsp),$ctx 356*4882a593Smuzhiyun.cfi_restore $ctx 357*4882a593Smuzhiyun 358*4882a593Smuzhiyun mov $h0,0($ctx) # store hash value 359*4882a593Smuzhiyun mov $h1,8($ctx) 360*4882a593Smuzhiyun mov $h2,16($ctx) 361*4882a593Smuzhiyun 362*4882a593Smuzhiyun mov 8(%rsp),%r15 363*4882a593Smuzhiyun.cfi_restore %r15 364*4882a593Smuzhiyun mov 16(%rsp),%r14 365*4882a593Smuzhiyun.cfi_restore %r14 366*4882a593Smuzhiyun mov 24(%rsp),%r13 367*4882a593Smuzhiyun.cfi_restore %r13 368*4882a593Smuzhiyun mov 32(%rsp),%r12 369*4882a593Smuzhiyun.cfi_restore %r12 370*4882a593Smuzhiyun mov 40(%rsp),%rbx 371*4882a593Smuzhiyun.cfi_restore %rbx 372*4882a593Smuzhiyun lea 48(%rsp),%rsp 373*4882a593Smuzhiyun.cfi_adjust_cfa_offset -48 374*4882a593Smuzhiyun.Lno_data: 375*4882a593Smuzhiyun.Lblocks_epilogue: 376*4882a593Smuzhiyun RET 377*4882a593Smuzhiyun.cfi_endproc 378*4882a593Smuzhiyun___ 
&end_function("poly1305_blocks_x86_64");

&declare_function("poly1305_emit_x86_64", 32, 3);
$code.=<<___;
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
___
&end_function("poly1305_emit_x86_64");
if ($avx) {

########################################################################
# Layout of opaque area is following.
#
# unsigned __int32 h[5];	# current hash value base 2^26
# unsigned __int32 is_base2_26;
# unsigned __int64 r[2];	# key value base 2^64
# unsigned __int64 pad;
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.

my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
    map("%xmm$_",(0..15));

$code.=<<___;
.type	__poly1305_block,\@abi-omnipotent
.align	32
__poly1305_block:
	push $ctx
___
	&poly1305_iteration();
$code.=<<___;
	pop $ctx
	RET
.size	__poly1305_block,.-__poly1305_block

.type	__poly1305_init_avx,\@abi-omnipotent
.align	32
__poly1305_init_avx:
	push %rbp
	mov %rsp,%rbp
	mov	$r0,$h0
	mov	$r1,$h1
	xor	$h2,$h2

	lea	48+64($ctx),$ctx	# size optimization

	mov	$r1,%rax
	call	__poly1305_block	# r^2

	mov	\$0x3ffffff,%eax	# save interleaved r^2 and r base 2^26
	mov	\$0x3ffffff,%edx
	mov	$h0,$d1
	and	$h0#d,%eax
	mov	$r0,$d2
	and	$r0#d,%edx
	mov	%eax,`16*0+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*0+4-64`($ctx)
	shr	\$26,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*1+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*1+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*2+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*2+4-64`($ctx)
	shr	\$26,$d2

	mov	$h1,%rax
	mov	$r1,%rdx
	shl	\$12,%rax
	shl	\$12,%rdx
	or	$d1,%rax
	or	$d2,%rdx
	and	\$0x3ffffff,%eax
	and	\$0x3ffffff,%edx
	mov	%eax,`16*3+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*3+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*4+0-64`($ctx)
	mov	$h1,$d1
	mov	%edx,`16*4+4-64`($ctx)
	mov	$r1,$d2

	mov	\$0x3ffffff,%eax
	mov	\$0x3ffffff,%edx
	shr	\$14,$d1
	shr	\$14,$d2
	and	$d1#d,%eax
	and	$d2#d,%edx
	mov	%eax,`16*5+0-64`($ctx)
	lea	(%rax,%rax,4),%eax	# *5
	mov	%edx,`16*5+4-64`($ctx)
	lea	(%rdx,%rdx,4),%edx	# *5
	mov	%eax,`16*6+0-64`($ctx)
	shr	\$26,$d1
	mov	%edx,`16*6+4-64`($ctx)
	shr	\$26,$d2

	mov	$h2,%rax
	shl	\$24,%rax
	or	%rax,$d1
	mov	$d1#d,`16*7+0-64`($ctx)
	lea	($d1,$d1,4),$d1		# *5
	mov	$d2#d,`16*7+4-64`($ctx)
	lea	($d2,$d2,4),$d2		# *5
	mov	$d1#d,`16*8+0-64`($ctx)
	mov	$d2#d,`16*8+4-64`($ctx)

	mov	$r1,%rax
	call	__poly1305_block	# r^3

518*4882a593Smuzhiyun mov \$0x3ffffff,%eax # save r^3 base 2^26 519*4882a593Smuzhiyun mov $h0,$d1 520*4882a593Smuzhiyun and $h0#d,%eax 521*4882a593Smuzhiyun shr \$26,$d1 522*4882a593Smuzhiyun mov %eax,`16*0+12-64`($ctx) 523*4882a593Smuzhiyun 524*4882a593Smuzhiyun mov \$0x3ffffff,%edx 525*4882a593Smuzhiyun and $d1#d,%edx 526*4882a593Smuzhiyun mov %edx,`16*1+12-64`($ctx) 527*4882a593Smuzhiyun lea (%rdx,%rdx,4),%edx # *5 528*4882a593Smuzhiyun shr \$26,$d1 529*4882a593Smuzhiyun mov %edx,`16*2+12-64`($ctx) 530*4882a593Smuzhiyun 531*4882a593Smuzhiyun mov $h1,%rax 532*4882a593Smuzhiyun shl \$12,%rax 533*4882a593Smuzhiyun or $d1,%rax 534*4882a593Smuzhiyun and \$0x3ffffff,%eax 535*4882a593Smuzhiyun mov %eax,`16*3+12-64`($ctx) 536*4882a593Smuzhiyun lea (%rax,%rax,4),%eax # *5 537*4882a593Smuzhiyun mov $h1,$d1 538*4882a593Smuzhiyun mov %eax,`16*4+12-64`($ctx) 539*4882a593Smuzhiyun 540*4882a593Smuzhiyun mov \$0x3ffffff,%edx 541*4882a593Smuzhiyun shr \$14,$d1 542*4882a593Smuzhiyun and $d1#d,%edx 543*4882a593Smuzhiyun mov %edx,`16*5+12-64`($ctx) 544*4882a593Smuzhiyun lea (%rdx,%rdx,4),%edx # *5 545*4882a593Smuzhiyun shr \$26,$d1 546*4882a593Smuzhiyun mov %edx,`16*6+12-64`($ctx) 547*4882a593Smuzhiyun 548*4882a593Smuzhiyun mov $h2,%rax 549*4882a593Smuzhiyun shl \$24,%rax 550*4882a593Smuzhiyun or %rax,$d1 551*4882a593Smuzhiyun mov $d1#d,`16*7+12-64`($ctx) 552*4882a593Smuzhiyun lea ($d1,$d1,4),$d1 # *5 553*4882a593Smuzhiyun mov $d1#d,`16*8+12-64`($ctx) 554*4882a593Smuzhiyun 555*4882a593Smuzhiyun mov $r1,%rax 556*4882a593Smuzhiyun call __poly1305_block # r^4 557*4882a593Smuzhiyun 558*4882a593Smuzhiyun mov \$0x3ffffff,%eax # save r^4 base 2^26 559*4882a593Smuzhiyun mov $h0,$d1 560*4882a593Smuzhiyun and $h0#d,%eax 561*4882a593Smuzhiyun shr \$26,$d1 562*4882a593Smuzhiyun mov %eax,`16*0+8-64`($ctx) 563*4882a593Smuzhiyun 564*4882a593Smuzhiyun mov \$0x3ffffff,%edx 565*4882a593Smuzhiyun and $d1#d,%edx 566*4882a593Smuzhiyun mov %edx,`16*1+8-64`($ctx) 567*4882a593Smuzhiyun lea 
(%rdx,%rdx,4),%edx # *5 568*4882a593Smuzhiyun shr \$26,$d1 569*4882a593Smuzhiyun mov %edx,`16*2+8-64`($ctx) 570*4882a593Smuzhiyun 571*4882a593Smuzhiyun mov $h1,%rax 572*4882a593Smuzhiyun shl \$12,%rax 573*4882a593Smuzhiyun or $d1,%rax 574*4882a593Smuzhiyun and \$0x3ffffff,%eax 575*4882a593Smuzhiyun mov %eax,`16*3+8-64`($ctx) 576*4882a593Smuzhiyun lea (%rax,%rax,4),%eax # *5 577*4882a593Smuzhiyun mov $h1,$d1 578*4882a593Smuzhiyun mov %eax,`16*4+8-64`($ctx) 579*4882a593Smuzhiyun 580*4882a593Smuzhiyun mov \$0x3ffffff,%edx 581*4882a593Smuzhiyun shr \$14,$d1 582*4882a593Smuzhiyun and $d1#d,%edx 583*4882a593Smuzhiyun mov %edx,`16*5+8-64`($ctx) 584*4882a593Smuzhiyun lea (%rdx,%rdx,4),%edx # *5 585*4882a593Smuzhiyun shr \$26,$d1 586*4882a593Smuzhiyun mov %edx,`16*6+8-64`($ctx) 587*4882a593Smuzhiyun 588*4882a593Smuzhiyun mov $h2,%rax 589*4882a593Smuzhiyun shl \$24,%rax 590*4882a593Smuzhiyun or %rax,$d1 591*4882a593Smuzhiyun mov $d1#d,`16*7+8-64`($ctx) 592*4882a593Smuzhiyun lea ($d1,$d1,4),$d1 # *5 593*4882a593Smuzhiyun mov $d1#d,`16*8+8-64`($ctx) 594*4882a593Smuzhiyun 595*4882a593Smuzhiyun lea -48-64($ctx),$ctx # size [de-]optimization 596*4882a593Smuzhiyun pop %rbp 597*4882a593Smuzhiyun RET 598*4882a593Smuzhiyun.size __poly1305_init_avx,.-__poly1305_init_avx 599*4882a593Smuzhiyun___ 600*4882a593Smuzhiyun 601*4882a593Smuzhiyun&declare_function("poly1305_blocks_avx", 32, 4); 602*4882a593Smuzhiyun$code.=<<___; 603*4882a593Smuzhiyun.cfi_startproc 604*4882a593Smuzhiyun mov 20($ctx),%r8d # is_base2_26 605*4882a593Smuzhiyun cmp \$128,$len 606*4882a593Smuzhiyun jae .Lblocks_avx 607*4882a593Smuzhiyun test %r8d,%r8d 608*4882a593Smuzhiyun jz .Lblocks 609*4882a593Smuzhiyun 610*4882a593Smuzhiyun.Lblocks_avx: 611*4882a593Smuzhiyun and \$-16,$len 612*4882a593Smuzhiyun jz .Lno_data_avx 613*4882a593Smuzhiyun 614*4882a593Smuzhiyun vzeroupper 615*4882a593Smuzhiyun 616*4882a593Smuzhiyun test %r8d,%r8d 617*4882a593Smuzhiyun jz .Lbase2_64_avx 618*4882a593Smuzhiyun 619*4882a593Smuzhiyun test 
\$31,$len 620*4882a593Smuzhiyun jz .Leven_avx 621*4882a593Smuzhiyun 622*4882a593Smuzhiyun push %rbp 623*4882a593Smuzhiyun.cfi_push %rbp 624*4882a593Smuzhiyun mov %rsp,%rbp 625*4882a593Smuzhiyun push %rbx 626*4882a593Smuzhiyun.cfi_push %rbx 627*4882a593Smuzhiyun push %r12 628*4882a593Smuzhiyun.cfi_push %r12 629*4882a593Smuzhiyun push %r13 630*4882a593Smuzhiyun.cfi_push %r13 631*4882a593Smuzhiyun push %r14 632*4882a593Smuzhiyun.cfi_push %r14 633*4882a593Smuzhiyun push %r15 634*4882a593Smuzhiyun.cfi_push %r15 635*4882a593Smuzhiyun.Lblocks_avx_body: 636*4882a593Smuzhiyun 637*4882a593Smuzhiyun mov $len,%r15 # reassign $len 638*4882a593Smuzhiyun 639*4882a593Smuzhiyun mov 0($ctx),$d1 # load hash value 640*4882a593Smuzhiyun mov 8($ctx),$d2 641*4882a593Smuzhiyun mov 16($ctx),$h2#d 642*4882a593Smuzhiyun 643*4882a593Smuzhiyun mov 24($ctx),$r0 # load r 644*4882a593Smuzhiyun mov 32($ctx),$s1 645*4882a593Smuzhiyun 646*4882a593Smuzhiyun ################################# base 2^26 -> base 2^64 647*4882a593Smuzhiyun mov $d1#d,$h0#d 648*4882a593Smuzhiyun and \$`-1*(1<<31)`,$d1 649*4882a593Smuzhiyun mov $d2,$r1 # borrow $r1 650*4882a593Smuzhiyun mov $d2#d,$h1#d 651*4882a593Smuzhiyun and \$`-1*(1<<31)`,$d2 652*4882a593Smuzhiyun 653*4882a593Smuzhiyun shr \$6,$d1 654*4882a593Smuzhiyun shl \$52,$r1 655*4882a593Smuzhiyun add $d1,$h0 656*4882a593Smuzhiyun shr \$12,$h1 657*4882a593Smuzhiyun shr \$18,$d2 658*4882a593Smuzhiyun add $r1,$h0 659*4882a593Smuzhiyun adc $d2,$h1 660*4882a593Smuzhiyun 661*4882a593Smuzhiyun mov $h2,$d1 662*4882a593Smuzhiyun shl \$40,$d1 663*4882a593Smuzhiyun shr \$24,$h2 664*4882a593Smuzhiyun add $d1,$h1 665*4882a593Smuzhiyun adc \$0,$h2 # can be partially reduced... 666*4882a593Smuzhiyun 667*4882a593Smuzhiyun mov \$-4,$d2 # ... 
so reduce 668*4882a593Smuzhiyun mov $h2,$d1 669*4882a593Smuzhiyun and $h2,$d2 670*4882a593Smuzhiyun shr \$2,$d1 671*4882a593Smuzhiyun and \$3,$h2 672*4882a593Smuzhiyun add $d2,$d1 # =*5 673*4882a593Smuzhiyun add $d1,$h0 674*4882a593Smuzhiyun adc \$0,$h1 675*4882a593Smuzhiyun adc \$0,$h2 676*4882a593Smuzhiyun 677*4882a593Smuzhiyun mov $s1,$r1 678*4882a593Smuzhiyun mov $s1,%rax 679*4882a593Smuzhiyun shr \$2,$s1 680*4882a593Smuzhiyun add $r1,$s1 # s1 = r1 + (r1 >> 2) 681*4882a593Smuzhiyun 682*4882a593Smuzhiyun add 0($inp),$h0 # accumulate input 683*4882a593Smuzhiyun adc 8($inp),$h1 684*4882a593Smuzhiyun lea 16($inp),$inp 685*4882a593Smuzhiyun adc $padbit,$h2 686*4882a593Smuzhiyun 687*4882a593Smuzhiyun call __poly1305_block 688*4882a593Smuzhiyun 689*4882a593Smuzhiyun test $padbit,$padbit # if $padbit is zero, 690*4882a593Smuzhiyun jz .Lstore_base2_64_avx # store hash in base 2^64 format 691*4882a593Smuzhiyun 692*4882a593Smuzhiyun ################################# base 2^64 -> base 2^26 693*4882a593Smuzhiyun mov $h0,%rax 694*4882a593Smuzhiyun mov $h0,%rdx 695*4882a593Smuzhiyun shr \$52,$h0 696*4882a593Smuzhiyun mov $h1,$r0 697*4882a593Smuzhiyun mov $h1,$r1 698*4882a593Smuzhiyun shr \$26,%rdx 699*4882a593Smuzhiyun and \$0x3ffffff,%rax # h[0] 700*4882a593Smuzhiyun shl \$12,$r0 701*4882a593Smuzhiyun and \$0x3ffffff,%rdx # h[1] 702*4882a593Smuzhiyun shr \$14,$h1 703*4882a593Smuzhiyun or $r0,$h0 704*4882a593Smuzhiyun shl \$24,$h2 705*4882a593Smuzhiyun and \$0x3ffffff,$h0 # h[2] 706*4882a593Smuzhiyun shr \$40,$r1 707*4882a593Smuzhiyun and \$0x3ffffff,$h1 # h[3] 708*4882a593Smuzhiyun or $r1,$h2 # h[4] 709*4882a593Smuzhiyun 710*4882a593Smuzhiyun sub \$16,%r15 711*4882a593Smuzhiyun jz .Lstore_base2_26_avx 712*4882a593Smuzhiyun 713*4882a593Smuzhiyun vmovd %rax#d,$H0 714*4882a593Smuzhiyun vmovd %rdx#d,$H1 715*4882a593Smuzhiyun vmovd $h0#d,$H2 716*4882a593Smuzhiyun vmovd $h1#d,$H3 717*4882a593Smuzhiyun vmovd $h2#d,$H4 718*4882a593Smuzhiyun jmp .Lproceed_avx 719*4882a593Smuzhiyun 
720*4882a593Smuzhiyun.align 32 721*4882a593Smuzhiyun.Lstore_base2_64_avx: 722*4882a593Smuzhiyun mov $h0,0($ctx) 723*4882a593Smuzhiyun mov $h1,8($ctx) 724*4882a593Smuzhiyun mov $h2,16($ctx) # note that is_base2_26 is zeroed 725*4882a593Smuzhiyun jmp .Ldone_avx 726*4882a593Smuzhiyun 727*4882a593Smuzhiyun.align 16 728*4882a593Smuzhiyun.Lstore_base2_26_avx: 729*4882a593Smuzhiyun mov %rax#d,0($ctx) # store hash value base 2^26 730*4882a593Smuzhiyun mov %rdx#d,4($ctx) 731*4882a593Smuzhiyun mov $h0#d,8($ctx) 732*4882a593Smuzhiyun mov $h1#d,12($ctx) 733*4882a593Smuzhiyun mov $h2#d,16($ctx) 734*4882a593Smuzhiyun.align 16 735*4882a593Smuzhiyun.Ldone_avx: 736*4882a593Smuzhiyun pop %r15 737*4882a593Smuzhiyun.cfi_restore %r15 738*4882a593Smuzhiyun pop %r14 739*4882a593Smuzhiyun.cfi_restore %r14 740*4882a593Smuzhiyun pop %r13 741*4882a593Smuzhiyun.cfi_restore %r13 742*4882a593Smuzhiyun pop %r12 743*4882a593Smuzhiyun.cfi_restore %r12 744*4882a593Smuzhiyun pop %rbx 745*4882a593Smuzhiyun.cfi_restore %rbx 746*4882a593Smuzhiyun pop %rbp 747*4882a593Smuzhiyun.cfi_restore %rbp 748*4882a593Smuzhiyun.Lno_data_avx: 749*4882a593Smuzhiyun.Lblocks_avx_epilogue: 750*4882a593Smuzhiyun RET 751*4882a593Smuzhiyun.cfi_endproc 752*4882a593Smuzhiyun 753*4882a593Smuzhiyun.align 32 754*4882a593Smuzhiyun.Lbase2_64_avx: 755*4882a593Smuzhiyun.cfi_startproc 756*4882a593Smuzhiyun push %rbp 757*4882a593Smuzhiyun.cfi_push %rbp 758*4882a593Smuzhiyun mov %rsp,%rbp 759*4882a593Smuzhiyun push %rbx 760*4882a593Smuzhiyun.cfi_push %rbx 761*4882a593Smuzhiyun push %r12 762*4882a593Smuzhiyun.cfi_push %r12 763*4882a593Smuzhiyun push %r13 764*4882a593Smuzhiyun.cfi_push %r13 765*4882a593Smuzhiyun push %r14 766*4882a593Smuzhiyun.cfi_push %r14 767*4882a593Smuzhiyun push %r15 768*4882a593Smuzhiyun.cfi_push %r15 769*4882a593Smuzhiyun.Lbase2_64_avx_body: 770*4882a593Smuzhiyun 771*4882a593Smuzhiyun mov $len,%r15 # reassign $len 772*4882a593Smuzhiyun 773*4882a593Smuzhiyun mov 24($ctx),$r0 # load r 774*4882a593Smuzhiyun mov 
32($ctx),$s1 775*4882a593Smuzhiyun 776*4882a593Smuzhiyun mov 0($ctx),$h0 # load hash value 777*4882a593Smuzhiyun mov 8($ctx),$h1 778*4882a593Smuzhiyun mov 16($ctx),$h2#d 779*4882a593Smuzhiyun 780*4882a593Smuzhiyun mov $s1,$r1 781*4882a593Smuzhiyun mov $s1,%rax 782*4882a593Smuzhiyun shr \$2,$s1 783*4882a593Smuzhiyun add $r1,$s1 # s1 = r1 + (r1 >> 2) 784*4882a593Smuzhiyun 785*4882a593Smuzhiyun test \$31,$len 786*4882a593Smuzhiyun jz .Linit_avx 787*4882a593Smuzhiyun 788*4882a593Smuzhiyun add 0($inp),$h0 # accumulate input 789*4882a593Smuzhiyun adc 8($inp),$h1 790*4882a593Smuzhiyun lea 16($inp),$inp 791*4882a593Smuzhiyun adc $padbit,$h2 792*4882a593Smuzhiyun sub \$16,%r15 793*4882a593Smuzhiyun 794*4882a593Smuzhiyun call __poly1305_block 795*4882a593Smuzhiyun 796*4882a593Smuzhiyun.Linit_avx: 797*4882a593Smuzhiyun ################################# base 2^64 -> base 2^26 798*4882a593Smuzhiyun mov $h0,%rax 799*4882a593Smuzhiyun mov $h0,%rdx 800*4882a593Smuzhiyun shr \$52,$h0 801*4882a593Smuzhiyun mov $h1,$d1 802*4882a593Smuzhiyun mov $h1,$d2 803*4882a593Smuzhiyun shr \$26,%rdx 804*4882a593Smuzhiyun and \$0x3ffffff,%rax # h[0] 805*4882a593Smuzhiyun shl \$12,$d1 806*4882a593Smuzhiyun and \$0x3ffffff,%rdx # h[1] 807*4882a593Smuzhiyun shr \$14,$h1 808*4882a593Smuzhiyun or $d1,$h0 809*4882a593Smuzhiyun shl \$24,$h2 810*4882a593Smuzhiyun and \$0x3ffffff,$h0 # h[2] 811*4882a593Smuzhiyun shr \$40,$d2 812*4882a593Smuzhiyun and \$0x3ffffff,$h1 # h[3] 813*4882a593Smuzhiyun or $d2,$h2 # h[4] 814*4882a593Smuzhiyun 815*4882a593Smuzhiyun vmovd %rax#d,$H0 816*4882a593Smuzhiyun vmovd %rdx#d,$H1 817*4882a593Smuzhiyun vmovd $h0#d,$H2 818*4882a593Smuzhiyun vmovd $h1#d,$H3 819*4882a593Smuzhiyun vmovd $h2#d,$H4 820*4882a593Smuzhiyun movl \$1,20($ctx) # set is_base2_26 821*4882a593Smuzhiyun 822*4882a593Smuzhiyun call __poly1305_init_avx 823*4882a593Smuzhiyun 824*4882a593Smuzhiyun.Lproceed_avx: 825*4882a593Smuzhiyun mov %r15,$len 826*4882a593Smuzhiyun pop %r15 827*4882a593Smuzhiyun.cfi_restore 
%r15 828*4882a593Smuzhiyun pop %r14 829*4882a593Smuzhiyun.cfi_restore %r14 830*4882a593Smuzhiyun pop %r13 831*4882a593Smuzhiyun.cfi_restore %r13 832*4882a593Smuzhiyun pop %r12 833*4882a593Smuzhiyun.cfi_restore %r12 834*4882a593Smuzhiyun pop %rbx 835*4882a593Smuzhiyun.cfi_restore %rbx 836*4882a593Smuzhiyun pop %rbp 837*4882a593Smuzhiyun.cfi_restore %rbp 838*4882a593Smuzhiyun.Lbase2_64_avx_epilogue: 839*4882a593Smuzhiyun jmp .Ldo_avx 840*4882a593Smuzhiyun.cfi_endproc 841*4882a593Smuzhiyun 842*4882a593Smuzhiyun.align 32 843*4882a593Smuzhiyun.Leven_avx: 844*4882a593Smuzhiyun.cfi_startproc 845*4882a593Smuzhiyun vmovd 4*0($ctx),$H0 # load hash value 846*4882a593Smuzhiyun vmovd 4*1($ctx),$H1 847*4882a593Smuzhiyun vmovd 4*2($ctx),$H2 848*4882a593Smuzhiyun vmovd 4*3($ctx),$H3 849*4882a593Smuzhiyun vmovd 4*4($ctx),$H4 850*4882a593Smuzhiyun 851*4882a593Smuzhiyun.Ldo_avx: 852*4882a593Smuzhiyun___ 853*4882a593Smuzhiyun$code.=<<___ if (!$win64); 854*4882a593Smuzhiyun lea 8(%rsp),%r10 855*4882a593Smuzhiyun.cfi_def_cfa_register %r10 856*4882a593Smuzhiyun and \$-32,%rsp 857*4882a593Smuzhiyun sub \$-8,%rsp 858*4882a593Smuzhiyun lea -0x58(%rsp),%r11 859*4882a593Smuzhiyun sub \$0x178,%rsp 860*4882a593Smuzhiyun___ 861*4882a593Smuzhiyun$code.=<<___ if ($win64); 862*4882a593Smuzhiyun lea -0xf8(%rsp),%r11 863*4882a593Smuzhiyun sub \$0x218,%rsp 864*4882a593Smuzhiyun vmovdqa %xmm6,0x50(%r11) 865*4882a593Smuzhiyun vmovdqa %xmm7,0x60(%r11) 866*4882a593Smuzhiyun vmovdqa %xmm8,0x70(%r11) 867*4882a593Smuzhiyun vmovdqa %xmm9,0x80(%r11) 868*4882a593Smuzhiyun vmovdqa %xmm10,0x90(%r11) 869*4882a593Smuzhiyun vmovdqa %xmm11,0xa0(%r11) 870*4882a593Smuzhiyun vmovdqa %xmm12,0xb0(%r11) 871*4882a593Smuzhiyun vmovdqa %xmm13,0xc0(%r11) 872*4882a593Smuzhiyun vmovdqa %xmm14,0xd0(%r11) 873*4882a593Smuzhiyun vmovdqa %xmm15,0xe0(%r11) 874*4882a593Smuzhiyun.Ldo_avx_body: 875*4882a593Smuzhiyun___ 876*4882a593Smuzhiyun$code.=<<___; 877*4882a593Smuzhiyun sub \$64,$len 878*4882a593Smuzhiyun lea -32($inp),%rax 
879*4882a593Smuzhiyun cmovc %rax,$inp 880*4882a593Smuzhiyun 881*4882a593Smuzhiyun vmovdqu `16*3`($ctx),$D4 # preload r0^2 882*4882a593Smuzhiyun lea `16*3+64`($ctx),$ctx # size optimization 883*4882a593Smuzhiyun lea .Lconst(%rip),%rcx 884*4882a593Smuzhiyun 885*4882a593Smuzhiyun ################################################################ 886*4882a593Smuzhiyun # load input 887*4882a593Smuzhiyun vmovdqu 16*2($inp),$T0 888*4882a593Smuzhiyun vmovdqu 16*3($inp),$T1 889*4882a593Smuzhiyun vmovdqa 64(%rcx),$MASK # .Lmask26 890*4882a593Smuzhiyun 891*4882a593Smuzhiyun vpsrldq \$6,$T0,$T2 # splat input 892*4882a593Smuzhiyun vpsrldq \$6,$T1,$T3 893*4882a593Smuzhiyun vpunpckhqdq $T1,$T0,$T4 # 4 894*4882a593Smuzhiyun vpunpcklqdq $T1,$T0,$T0 # 0:1 895*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T3 # 2:3 896*4882a593Smuzhiyun 897*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 898*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 899*4882a593Smuzhiyun vpand $MASK,$T0,$T0 # 0 900*4882a593Smuzhiyun vpsrlq \$4,$T3,$T2 901*4882a593Smuzhiyun vpand $MASK,$T1,$T1 # 1 902*4882a593Smuzhiyun vpsrlq \$30,$T3,$T3 903*4882a593Smuzhiyun vpand $MASK,$T2,$T2 # 2 904*4882a593Smuzhiyun vpand $MASK,$T3,$T3 # 3 905*4882a593Smuzhiyun vpor 32(%rcx),$T4,$T4 # padbit, yes, always 906*4882a593Smuzhiyun 907*4882a593Smuzhiyun jbe .Lskip_loop_avx 908*4882a593Smuzhiyun 909*4882a593Smuzhiyun # expand and copy pre-calculated table to stack 910*4882a593Smuzhiyun vmovdqu `16*1-64`($ctx),$D1 911*4882a593Smuzhiyun vmovdqu `16*2-64`($ctx),$D2 912*4882a593Smuzhiyun vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 913*4882a593Smuzhiyun vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 914*4882a593Smuzhiyun vmovdqa $D3,-0x90(%r11) 915*4882a593Smuzhiyun vmovdqa $D0,0x00(%rsp) 916*4882a593Smuzhiyun vpshufd \$0xEE,$D1,$D4 917*4882a593Smuzhiyun vmovdqu `16*3-64`($ctx),$D0 918*4882a593Smuzhiyun vpshufd \$0x44,$D1,$D1 919*4882a593Smuzhiyun vmovdqa $D4,-0x80(%r11) 920*4882a593Smuzhiyun vmovdqa $D1,0x10(%rsp) 921*4882a593Smuzhiyun vpshufd \$0xEE,$D2,$D3 
922*4882a593Smuzhiyun vmovdqu `16*4-64`($ctx),$D1 923*4882a593Smuzhiyun vpshufd \$0x44,$D2,$D2 924*4882a593Smuzhiyun vmovdqa $D3,-0x70(%r11) 925*4882a593Smuzhiyun vmovdqa $D2,0x20(%rsp) 926*4882a593Smuzhiyun vpshufd \$0xEE,$D0,$D4 927*4882a593Smuzhiyun vmovdqu `16*5-64`($ctx),$D2 928*4882a593Smuzhiyun vpshufd \$0x44,$D0,$D0 929*4882a593Smuzhiyun vmovdqa $D4,-0x60(%r11) 930*4882a593Smuzhiyun vmovdqa $D0,0x30(%rsp) 931*4882a593Smuzhiyun vpshufd \$0xEE,$D1,$D3 932*4882a593Smuzhiyun vmovdqu `16*6-64`($ctx),$D0 933*4882a593Smuzhiyun vpshufd \$0x44,$D1,$D1 934*4882a593Smuzhiyun vmovdqa $D3,-0x50(%r11) 935*4882a593Smuzhiyun vmovdqa $D1,0x40(%rsp) 936*4882a593Smuzhiyun vpshufd \$0xEE,$D2,$D4 937*4882a593Smuzhiyun vmovdqu `16*7-64`($ctx),$D1 938*4882a593Smuzhiyun vpshufd \$0x44,$D2,$D2 939*4882a593Smuzhiyun vmovdqa $D4,-0x40(%r11) 940*4882a593Smuzhiyun vmovdqa $D2,0x50(%rsp) 941*4882a593Smuzhiyun vpshufd \$0xEE,$D0,$D3 942*4882a593Smuzhiyun vmovdqu `16*8-64`($ctx),$D2 943*4882a593Smuzhiyun vpshufd \$0x44,$D0,$D0 944*4882a593Smuzhiyun vmovdqa $D3,-0x30(%r11) 945*4882a593Smuzhiyun vmovdqa $D0,0x60(%rsp) 946*4882a593Smuzhiyun vpshufd \$0xEE,$D1,$D4 947*4882a593Smuzhiyun vpshufd \$0x44,$D1,$D1 948*4882a593Smuzhiyun vmovdqa $D4,-0x20(%r11) 949*4882a593Smuzhiyun vmovdqa $D1,0x70(%rsp) 950*4882a593Smuzhiyun vpshufd \$0xEE,$D2,$D3 951*4882a593Smuzhiyun vmovdqa 0x00(%rsp),$D4 # preload r0^2 952*4882a593Smuzhiyun vpshufd \$0x44,$D2,$D2 953*4882a593Smuzhiyun vmovdqa $D3,-0x10(%r11) 954*4882a593Smuzhiyun vmovdqa $D2,0x80(%rsp) 955*4882a593Smuzhiyun 956*4882a593Smuzhiyun jmp .Loop_avx 957*4882a593Smuzhiyun 958*4882a593Smuzhiyun.align 32 959*4882a593Smuzhiyun.Loop_avx: 960*4882a593Smuzhiyun ################################################################ 961*4882a593Smuzhiyun # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 962*4882a593Smuzhiyun # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r 963*4882a593Smuzhiyun # \___________________/ 964*4882a593Smuzhiyun # 
((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 965*4882a593Smuzhiyun # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r 966*4882a593Smuzhiyun # \___________________/ \____________________/ 967*4882a593Smuzhiyun # 968*4882a593Smuzhiyun # Note that we start with inp[2:3]*r^2. This is because it 969*4882a593Smuzhiyun # doesn't depend on reduction in previous iteration. 970*4882a593Smuzhiyun ################################################################ 971*4882a593Smuzhiyun # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 972*4882a593Smuzhiyun # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 973*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 974*4882a593Smuzhiyun # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 975*4882a593Smuzhiyun # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 976*4882a593Smuzhiyun # 977*4882a593Smuzhiyun # though note that $Tx and $Hx are "reversed" in this section, 978*4882a593Smuzhiyun # and $D4 is preloaded with r0^2... 979*4882a593Smuzhiyun 980*4882a593Smuzhiyun vpmuludq $T0,$D4,$D0 # d0 = h0*r0 981*4882a593Smuzhiyun vpmuludq $T1,$D4,$D1 # d1 = h1*r0 982*4882a593Smuzhiyun vmovdqa $H2,0x20(%r11) # offload hash 983*4882a593Smuzhiyun vpmuludq $T2,$D4,$D2 # d3 = h2*r0 984*4882a593Smuzhiyun vmovdqa 0x10(%rsp),$H2 # r1^2 985*4882a593Smuzhiyun vpmuludq $T3,$D4,$D3 # d3 = h3*r0 986*4882a593Smuzhiyun vpmuludq $T4,$D4,$D4 # d4 = h4*r0 987*4882a593Smuzhiyun 988*4882a593Smuzhiyun vmovdqa $H0,0x00(%r11) # 989*4882a593Smuzhiyun vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 990*4882a593Smuzhiyun vmovdqa $H1,0x10(%r11) # 991*4882a593Smuzhiyun vpmuludq $T3,$H2,$H1 # h3*r1 992*4882a593Smuzhiyun vpaddq $H0,$D0,$D0 # d0 += h4*s1 993*4882a593Smuzhiyun vpaddq $H1,$D4,$D4 # d4 += h3*r1 994*4882a593Smuzhiyun vmovdqa $H3,0x30(%r11) # 995*4882a593Smuzhiyun vpmuludq $T2,$H2,$H0 # h2*r1 996*4882a593Smuzhiyun vpmuludq $T1,$H2,$H1 # h1*r1 997*4882a593Smuzhiyun vpaddq $H0,$D3,$D3 # d3 += h2*r1 998*4882a593Smuzhiyun vmovdqa 0x30(%rsp),$H3 # 
r2^2 999*4882a593Smuzhiyun vpaddq $H1,$D2,$D2 # d2 += h1*r1 1000*4882a593Smuzhiyun vmovdqa $H4,0x40(%r11) # 1001*4882a593Smuzhiyun vpmuludq $T0,$H2,$H2 # h0*r1 1002*4882a593Smuzhiyun vpmuludq $T2,$H3,$H0 # h2*r2 1003*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h0*r1 1004*4882a593Smuzhiyun 1005*4882a593Smuzhiyun vmovdqa 0x40(%rsp),$H4 # s2^2 1006*4882a593Smuzhiyun vpaddq $H0,$D4,$D4 # d4 += h2*r2 1007*4882a593Smuzhiyun vpmuludq $T1,$H3,$H1 # h1*r2 1008*4882a593Smuzhiyun vpmuludq $T0,$H3,$H3 # h0*r2 1009*4882a593Smuzhiyun vpaddq $H1,$D3,$D3 # d3 += h1*r2 1010*4882a593Smuzhiyun vmovdqa 0x50(%rsp),$H2 # r3^2 1011*4882a593Smuzhiyun vpaddq $H3,$D2,$D2 # d2 += h0*r2 1012*4882a593Smuzhiyun vpmuludq $T4,$H4,$H0 # h4*s2 1013*4882a593Smuzhiyun vpmuludq $T3,$H4,$H4 # h3*s2 1014*4882a593Smuzhiyun vpaddq $H0,$D1,$D1 # d1 += h4*s2 1015*4882a593Smuzhiyun vmovdqa 0x60(%rsp),$H3 # s3^2 1016*4882a593Smuzhiyun vpaddq $H4,$D0,$D0 # d0 += h3*s2 1017*4882a593Smuzhiyun 1018*4882a593Smuzhiyun vmovdqa 0x80(%rsp),$H4 # s4^2 1019*4882a593Smuzhiyun vpmuludq $T1,$H2,$H1 # h1*r3 1020*4882a593Smuzhiyun vpmuludq $T0,$H2,$H2 # h0*r3 1021*4882a593Smuzhiyun vpaddq $H1,$D4,$D4 # d4 += h1*r3 1022*4882a593Smuzhiyun vpaddq $H2,$D3,$D3 # d3 += h0*r3 1023*4882a593Smuzhiyun vpmuludq $T4,$H3,$H0 # h4*s3 1024*4882a593Smuzhiyun vpmuludq $T3,$H3,$H1 # h3*s3 1025*4882a593Smuzhiyun vpaddq $H0,$D2,$D2 # d2 += h4*s3 1026*4882a593Smuzhiyun vmovdqu 16*0($inp),$H0 # load input 1027*4882a593Smuzhiyun vpaddq $H1,$D1,$D1 # d1 += h3*s3 1028*4882a593Smuzhiyun vpmuludq $T2,$H3,$H3 # h2*s3 1029*4882a593Smuzhiyun vpmuludq $T2,$H4,$T2 # h2*s4 1030*4882a593Smuzhiyun vpaddq $H3,$D0,$D0 # d0 += h2*s3 1031*4882a593Smuzhiyun 1032*4882a593Smuzhiyun vmovdqu 16*1($inp),$H1 # 1033*4882a593Smuzhiyun vpaddq $T2,$D1,$D1 # d1 += h2*s4 1034*4882a593Smuzhiyun vpmuludq $T3,$H4,$T3 # h3*s4 1035*4882a593Smuzhiyun vpmuludq $T4,$H4,$T4 # h4*s4 1036*4882a593Smuzhiyun vpsrldq \$6,$H0,$H2 # splat input 1037*4882a593Smuzhiyun vpaddq $T3,$D2,$D2 # d2 
+= h3*s4 1038*4882a593Smuzhiyun vpaddq $T4,$D3,$D3 # d3 += h4*s4 1039*4882a593Smuzhiyun vpsrldq \$6,$H1,$H3 # 1040*4882a593Smuzhiyun vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 1041*4882a593Smuzhiyun vpmuludq $T1,$H4,$T0 # h1*s4 1042*4882a593Smuzhiyun vpunpckhqdq $H1,$H0,$H4 # 4 1043*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h0*r4 1044*4882a593Smuzhiyun vmovdqa -0x90(%r11),$T4 # r0^4 1045*4882a593Smuzhiyun vpaddq $T0,$D0,$D0 # d0 += h1*s4 1046*4882a593Smuzhiyun 1047*4882a593Smuzhiyun vpunpcklqdq $H1,$H0,$H0 # 0:1 1048*4882a593Smuzhiyun vpunpcklqdq $H3,$H2,$H3 # 2:3 1049*4882a593Smuzhiyun 1050*4882a593Smuzhiyun #vpsrlq \$40,$H4,$H4 # 4 1051*4882a593Smuzhiyun vpsrldq \$`40/8`,$H4,$H4 # 4 1052*4882a593Smuzhiyun vpsrlq \$26,$H0,$H1 1053*4882a593Smuzhiyun vpand $MASK,$H0,$H0 # 0 1054*4882a593Smuzhiyun vpsrlq \$4,$H3,$H2 1055*4882a593Smuzhiyun vpand $MASK,$H1,$H1 # 1 1056*4882a593Smuzhiyun vpand 0(%rcx),$H4,$H4 # .Lmask24 1057*4882a593Smuzhiyun vpsrlq \$30,$H3,$H3 1058*4882a593Smuzhiyun vpand $MASK,$H2,$H2 # 2 1059*4882a593Smuzhiyun vpand $MASK,$H3,$H3 # 3 1060*4882a593Smuzhiyun vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1061*4882a593Smuzhiyun 1062*4882a593Smuzhiyun vpaddq 0x00(%r11),$H0,$H0 # add hash value 1063*4882a593Smuzhiyun vpaddq 0x10(%r11),$H1,$H1 1064*4882a593Smuzhiyun vpaddq 0x20(%r11),$H2,$H2 1065*4882a593Smuzhiyun vpaddq 0x30(%r11),$H3,$H3 1066*4882a593Smuzhiyun vpaddq 0x40(%r11),$H4,$H4 1067*4882a593Smuzhiyun 1068*4882a593Smuzhiyun lea 16*2($inp),%rax 1069*4882a593Smuzhiyun lea 16*4($inp),$inp 1070*4882a593Smuzhiyun sub \$64,$len 1071*4882a593Smuzhiyun cmovc %rax,$inp 1072*4882a593Smuzhiyun 1073*4882a593Smuzhiyun ################################################################ 1074*4882a593Smuzhiyun # Now we accumulate (inp[0:1]+hash)*r^4 1075*4882a593Smuzhiyun ################################################################ 1076*4882a593Smuzhiyun # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1077*4882a593Smuzhiyun # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 
1078*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1079*4882a593Smuzhiyun # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1080*4882a593Smuzhiyun # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1081*4882a593Smuzhiyun 1082*4882a593Smuzhiyun vpmuludq $H0,$T4,$T0 # h0*r0 1083*4882a593Smuzhiyun vpmuludq $H1,$T4,$T1 # h1*r0 1084*4882a593Smuzhiyun vpaddq $T0,$D0,$D0 1085*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 1086*4882a593Smuzhiyun vmovdqa -0x80(%r11),$T2 # r1^4 1087*4882a593Smuzhiyun vpmuludq $H2,$T4,$T0 # h2*r0 1088*4882a593Smuzhiyun vpmuludq $H3,$T4,$T1 # h3*r0 1089*4882a593Smuzhiyun vpaddq $T0,$D2,$D2 1090*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 1091*4882a593Smuzhiyun vpmuludq $H4,$T4,$T4 # h4*r0 1092*4882a593Smuzhiyun vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 1093*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 1094*4882a593Smuzhiyun 1095*4882a593Smuzhiyun vpaddq $T0,$D0,$D0 # d0 += h4*s1 1096*4882a593Smuzhiyun vpmuludq $H2,$T2,$T1 # h2*r1 1097*4882a593Smuzhiyun vpmuludq $H3,$T2,$T0 # h3*r1 1098*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 # d3 += h2*r1 1099*4882a593Smuzhiyun vmovdqa -0x60(%r11),$T3 # r2^4 1100*4882a593Smuzhiyun vpaddq $T0,$D4,$D4 # d4 += h3*r1 1101*4882a593Smuzhiyun vpmuludq $H1,$T2,$T1 # h1*r1 1102*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r1 1103*4882a593Smuzhiyun vpaddq $T1,$D2,$D2 # d2 += h1*r1 1104*4882a593Smuzhiyun vpaddq $T2,$D1,$D1 # d1 += h0*r1 1105*4882a593Smuzhiyun 1106*4882a593Smuzhiyun vmovdqa -0x50(%r11),$T4 # s2^4 1107*4882a593Smuzhiyun vpmuludq $H2,$T3,$T0 # h2*r2 1108*4882a593Smuzhiyun vpmuludq $H1,$T3,$T1 # h1*r2 1109*4882a593Smuzhiyun vpaddq $T0,$D4,$D4 # d4 += h2*r2 1110*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 # d3 += h1*r2 1111*4882a593Smuzhiyun vmovdqa -0x40(%r11),$T2 # r3^4 1112*4882a593Smuzhiyun vpmuludq $H0,$T3,$T3 # h0*r2 1113*4882a593Smuzhiyun vpmuludq $H4,$T4,$T0 # h4*s2 1114*4882a593Smuzhiyun vpaddq $T3,$D2,$D2 # d2 += h0*r2 1115*4882a593Smuzhiyun vpaddq $T0,$D1,$D1 # d1 += h4*s2 1116*4882a593Smuzhiyun vmovdqa 
-0x30(%r11),$T3 # s3^4 1117*4882a593Smuzhiyun vpmuludq $H3,$T4,$T4 # h3*s2 1118*4882a593Smuzhiyun vpmuludq $H1,$T2,$T1 # h1*r3 1119*4882a593Smuzhiyun vpaddq $T4,$D0,$D0 # d0 += h3*s2 1120*4882a593Smuzhiyun 1121*4882a593Smuzhiyun vmovdqa -0x10(%r11),$T4 # s4^4 1122*4882a593Smuzhiyun vpaddq $T1,$D4,$D4 # d4 += h1*r3 1123*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r3 1124*4882a593Smuzhiyun vpmuludq $H4,$T3,$T0 # h4*s3 1125*4882a593Smuzhiyun vpaddq $T2,$D3,$D3 # d3 += h0*r3 1126*4882a593Smuzhiyun vpaddq $T0,$D2,$D2 # d2 += h4*s3 1127*4882a593Smuzhiyun vmovdqu 16*2($inp),$T0 # load input 1128*4882a593Smuzhiyun vpmuludq $H3,$T3,$T2 # h3*s3 1129*4882a593Smuzhiyun vpmuludq $H2,$T3,$T3 # h2*s3 1130*4882a593Smuzhiyun vpaddq $T2,$D1,$D1 # d1 += h3*s3 1131*4882a593Smuzhiyun vmovdqu 16*3($inp),$T1 # 1132*4882a593Smuzhiyun vpaddq $T3,$D0,$D0 # d0 += h2*s3 1133*4882a593Smuzhiyun 1134*4882a593Smuzhiyun vpmuludq $H2,$T4,$H2 # h2*s4 1135*4882a593Smuzhiyun vpmuludq $H3,$T4,$H3 # h3*s4 1136*4882a593Smuzhiyun vpsrldq \$6,$T0,$T2 # splat input 1137*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h2*s4 1138*4882a593Smuzhiyun vpmuludq $H4,$T4,$H4 # h4*s4 1139*4882a593Smuzhiyun vpsrldq \$6,$T1,$T3 # 1140*4882a593Smuzhiyun vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 1141*4882a593Smuzhiyun vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 1142*4882a593Smuzhiyun vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 1143*4882a593Smuzhiyun vpmuludq $H1,$T4,$H0 1144*4882a593Smuzhiyun vpunpckhqdq $T1,$T0,$T4 # 4 1145*4882a593Smuzhiyun vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1146*4882a593Smuzhiyun vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1147*4882a593Smuzhiyun 1148*4882a593Smuzhiyun vpunpcklqdq $T1,$T0,$T0 # 0:1 1149*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T3 # 2:3 1150*4882a593Smuzhiyun 1151*4882a593Smuzhiyun #vpsrlq \$40,$T4,$T4 # 4 1152*4882a593Smuzhiyun vpsrldq \$`40/8`,$T4,$T4 # 4 1153*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 1154*4882a593Smuzhiyun vmovdqa 0x00(%rsp),$D4 # preload r0^2 1155*4882a593Smuzhiyun vpand $MASK,$T0,$T0 # 0 
1156*4882a593Smuzhiyun vpsrlq \$4,$T3,$T2 1157*4882a593Smuzhiyun vpand $MASK,$T1,$T1 # 1 1158*4882a593Smuzhiyun vpand 0(%rcx),$T4,$T4 # .Lmask24 1159*4882a593Smuzhiyun vpsrlq \$30,$T3,$T3 1160*4882a593Smuzhiyun vpand $MASK,$T2,$T2 # 2 1161*4882a593Smuzhiyun vpand $MASK,$T3,$T3 # 3 1162*4882a593Smuzhiyun vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1163*4882a593Smuzhiyun 1164*4882a593Smuzhiyun ################################################################ 1165*4882a593Smuzhiyun # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein 1166*4882a593Smuzhiyun # and P. Schwabe 1167*4882a593Smuzhiyun 1168*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 1169*4882a593Smuzhiyun vpand $MASK,$H3,$H3 1170*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 1171*4882a593Smuzhiyun 1172*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 1173*4882a593Smuzhiyun vpand $MASK,$H0,$H0 1174*4882a593Smuzhiyun vpaddq $D0,$D1,$H1 # h0 -> h1 1175*4882a593Smuzhiyun 1176*4882a593Smuzhiyun vpsrlq \$26,$H4,$D0 1177*4882a593Smuzhiyun vpand $MASK,$H4,$H4 1178*4882a593Smuzhiyun 1179*4882a593Smuzhiyun vpsrlq \$26,$H1,$D1 1180*4882a593Smuzhiyun vpand $MASK,$H1,$H1 1181*4882a593Smuzhiyun vpaddq $D1,$H2,$H2 # h1 -> h2 1182*4882a593Smuzhiyun 1183*4882a593Smuzhiyun vpaddq $D0,$H0,$H0 1184*4882a593Smuzhiyun vpsllq \$2,$D0,$D0 1185*4882a593Smuzhiyun vpaddq $D0,$H0,$H0 # h4 -> h0 1186*4882a593Smuzhiyun 1187*4882a593Smuzhiyun vpsrlq \$26,$H2,$D2 1188*4882a593Smuzhiyun vpand $MASK,$H2,$H2 1189*4882a593Smuzhiyun vpaddq $D2,$H3,$H3 # h2 -> h3 1190*4882a593Smuzhiyun 1191*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 1192*4882a593Smuzhiyun vpand $MASK,$H0,$H0 1193*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 1194*4882a593Smuzhiyun 1195*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 1196*4882a593Smuzhiyun vpand $MASK,$H3,$H3 1197*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 1198*4882a593Smuzhiyun 1199*4882a593Smuzhiyun ja .Loop_avx 1200*4882a593Smuzhiyun 1201*4882a593Smuzhiyun.Lskip_loop_avx: 1202*4882a593Smuzhiyun 
################################################################ 1203*4882a593Smuzhiyun # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 1204*4882a593Smuzhiyun 1205*4882a593Smuzhiyun vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 1206*4882a593Smuzhiyun add \$32,$len 1207*4882a593Smuzhiyun jnz .Long_tail_avx 1208*4882a593Smuzhiyun 1209*4882a593Smuzhiyun vpaddq $H2,$T2,$T2 1210*4882a593Smuzhiyun vpaddq $H0,$T0,$T0 1211*4882a593Smuzhiyun vpaddq $H1,$T1,$T1 1212*4882a593Smuzhiyun vpaddq $H3,$T3,$T3 1213*4882a593Smuzhiyun vpaddq $H4,$T4,$T4 1214*4882a593Smuzhiyun 1215*4882a593Smuzhiyun.Long_tail_avx: 1216*4882a593Smuzhiyun vmovdqa $H2,0x20(%r11) 1217*4882a593Smuzhiyun vmovdqa $H0,0x00(%r11) 1218*4882a593Smuzhiyun vmovdqa $H1,0x10(%r11) 1219*4882a593Smuzhiyun vmovdqa $H3,0x30(%r11) 1220*4882a593Smuzhiyun vmovdqa $H4,0x40(%r11) 1221*4882a593Smuzhiyun 1222*4882a593Smuzhiyun # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1223*4882a593Smuzhiyun # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1224*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1225*4882a593Smuzhiyun # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1226*4882a593Smuzhiyun # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1227*4882a593Smuzhiyun 1228*4882a593Smuzhiyun vpmuludq $T2,$D4,$D2 # d2 = h2*r0 1229*4882a593Smuzhiyun vpmuludq $T0,$D4,$D0 # d0 = h0*r0 1230*4882a593Smuzhiyun vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n 1231*4882a593Smuzhiyun vpmuludq $T1,$D4,$D1 # d1 = h1*r0 1232*4882a593Smuzhiyun vpmuludq $T3,$D4,$D3 # d3 = h3*r0 1233*4882a593Smuzhiyun vpmuludq $T4,$D4,$D4 # d4 = h4*r0 1234*4882a593Smuzhiyun 1235*4882a593Smuzhiyun vpmuludq $T3,$H2,$H0 # h3*r1 1236*4882a593Smuzhiyun vpaddq $H0,$D4,$D4 # d4 += h3*r1 1237*4882a593Smuzhiyun vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n 1238*4882a593Smuzhiyun vpmuludq $T2,$H2,$H1 # h2*r1 1239*4882a593Smuzhiyun vpaddq $H1,$D3,$D3 # d3 += h2*r1 1240*4882a593Smuzhiyun vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n 1241*4882a593Smuzhiyun vpmuludq $T1,$H2,$H0 
# h1*r1 1242*4882a593Smuzhiyun vpaddq $H0,$D2,$D2 # d2 += h1*r1 1243*4882a593Smuzhiyun vpmuludq $T0,$H2,$H2 # h0*r1 1244*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h0*r1 1245*4882a593Smuzhiyun vpmuludq $T4,$H3,$H3 # h4*s1 1246*4882a593Smuzhiyun vpaddq $H3,$D0,$D0 # d0 += h4*s1 1247*4882a593Smuzhiyun 1248*4882a593Smuzhiyun vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n 1249*4882a593Smuzhiyun vpmuludq $T2,$H4,$H1 # h2*r2 1250*4882a593Smuzhiyun vpaddq $H1,$D4,$D4 # d4 += h2*r2 1251*4882a593Smuzhiyun vpmuludq $T1,$H4,$H0 # h1*r2 1252*4882a593Smuzhiyun vpaddq $H0,$D3,$D3 # d3 += h1*r2 1253*4882a593Smuzhiyun vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n 1254*4882a593Smuzhiyun vpmuludq $T0,$H4,$H4 # h0*r2 1255*4882a593Smuzhiyun vpaddq $H4,$D2,$D2 # d2 += h0*r2 1256*4882a593Smuzhiyun vpmuludq $T4,$H2,$H1 # h4*s2 1257*4882a593Smuzhiyun vpaddq $H1,$D1,$D1 # d1 += h4*s2 1258*4882a593Smuzhiyun vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n 1259*4882a593Smuzhiyun vpmuludq $T3,$H2,$H2 # h3*s2 1260*4882a593Smuzhiyun vpaddq $H2,$D0,$D0 # d0 += h3*s2 1261*4882a593Smuzhiyun 1262*4882a593Smuzhiyun vpmuludq $T1,$H3,$H0 # h1*r3 1263*4882a593Smuzhiyun vpaddq $H0,$D4,$D4 # d4 += h1*r3 1264*4882a593Smuzhiyun vpmuludq $T0,$H3,$H3 # h0*r3 1265*4882a593Smuzhiyun vpaddq $H3,$D3,$D3 # d3 += h0*r3 1266*4882a593Smuzhiyun vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n 1267*4882a593Smuzhiyun vpmuludq $T4,$H4,$H1 # h4*s3 1268*4882a593Smuzhiyun vpaddq $H1,$D2,$D2 # d2 += h4*s3 1269*4882a593Smuzhiyun vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n 1270*4882a593Smuzhiyun vpmuludq $T3,$H4,$H0 # h3*s3 1271*4882a593Smuzhiyun vpaddq $H0,$D1,$D1 # d1 += h3*s3 1272*4882a593Smuzhiyun vpmuludq $T2,$H4,$H4 # h2*s3 1273*4882a593Smuzhiyun vpaddq $H4,$D0,$D0 # d0 += h2*s3 1274*4882a593Smuzhiyun 1275*4882a593Smuzhiyun vpmuludq $T0,$H2,$H2 # h0*r4 1276*4882a593Smuzhiyun vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 1277*4882a593Smuzhiyun vpmuludq $T4,$H3,$H1 # h4*s4 1278*4882a593Smuzhiyun vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 
1279*4882a593Smuzhiyun vpmuludq $T3,$H3,$H0 # h3*s4 1280*4882a593Smuzhiyun vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 1281*4882a593Smuzhiyun vpmuludq $T2,$H3,$H1 # h2*s4 1282*4882a593Smuzhiyun vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 1283*4882a593Smuzhiyun vpmuludq $T1,$H3,$H3 # h1*s4 1284*4882a593Smuzhiyun vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 1285*4882a593Smuzhiyun 1286*4882a593Smuzhiyun jz .Lshort_tail_avx 1287*4882a593Smuzhiyun 1288*4882a593Smuzhiyun vmovdqu 16*0($inp),$H0 # load input 1289*4882a593Smuzhiyun vmovdqu 16*1($inp),$H1 1290*4882a593Smuzhiyun 1291*4882a593Smuzhiyun vpsrldq \$6,$H0,$H2 # splat input 1292*4882a593Smuzhiyun vpsrldq \$6,$H1,$H3 1293*4882a593Smuzhiyun vpunpckhqdq $H1,$H0,$H4 # 4 1294*4882a593Smuzhiyun vpunpcklqdq $H1,$H0,$H0 # 0:1 1295*4882a593Smuzhiyun vpunpcklqdq $H3,$H2,$H3 # 2:3 1296*4882a593Smuzhiyun 1297*4882a593Smuzhiyun vpsrlq \$40,$H4,$H4 # 4 1298*4882a593Smuzhiyun vpsrlq \$26,$H0,$H1 1299*4882a593Smuzhiyun vpand $MASK,$H0,$H0 # 0 1300*4882a593Smuzhiyun vpsrlq \$4,$H3,$H2 1301*4882a593Smuzhiyun vpand $MASK,$H1,$H1 # 1 1302*4882a593Smuzhiyun vpsrlq \$30,$H3,$H3 1303*4882a593Smuzhiyun vpand $MASK,$H2,$H2 # 2 1304*4882a593Smuzhiyun vpand $MASK,$H3,$H3 # 3 1305*4882a593Smuzhiyun vpor 32(%rcx),$H4,$H4 # padbit, yes, always 1306*4882a593Smuzhiyun 1307*4882a593Smuzhiyun vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 1308*4882a593Smuzhiyun vpaddq 0x00(%r11),$H0,$H0 1309*4882a593Smuzhiyun vpaddq 0x10(%r11),$H1,$H1 1310*4882a593Smuzhiyun vpaddq 0x20(%r11),$H2,$H2 1311*4882a593Smuzhiyun vpaddq 0x30(%r11),$H3,$H3 1312*4882a593Smuzhiyun vpaddq 0x40(%r11),$H4,$H4 1313*4882a593Smuzhiyun 1314*4882a593Smuzhiyun ################################################################ 1315*4882a593Smuzhiyun # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate 1316*4882a593Smuzhiyun 1317*4882a593Smuzhiyun vpmuludq $H0,$T4,$T0 # h0*r0 1318*4882a593Smuzhiyun vpaddq $T0,$D0,$D0 # d0 += h0*r0 1319*4882a593Smuzhiyun vpmuludq $H1,$T4,$T1 # h1*r0 
1320*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 # d1 += h1*r0 1321*4882a593Smuzhiyun vpmuludq $H2,$T4,$T0 # h2*r0 1322*4882a593Smuzhiyun vpaddq $T0,$D2,$D2 # d2 += h2*r0 1323*4882a593Smuzhiyun vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n 1324*4882a593Smuzhiyun vpmuludq $H3,$T4,$T1 # h3*r0 1325*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 # d3 += h3*r0 1326*4882a593Smuzhiyun vpmuludq $H4,$T4,$T4 # h4*r0 1327*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h4*r0 1328*4882a593Smuzhiyun 1329*4882a593Smuzhiyun vpmuludq $H3,$T2,$T0 # h3*r1 1330*4882a593Smuzhiyun vpaddq $T0,$D4,$D4 # d4 += h3*r1 1331*4882a593Smuzhiyun vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 1332*4882a593Smuzhiyun vpmuludq $H2,$T2,$T1 # h2*r1 1333*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 # d3 += h2*r1 1334*4882a593Smuzhiyun vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 1335*4882a593Smuzhiyun vpmuludq $H1,$T2,$T0 # h1*r1 1336*4882a593Smuzhiyun vpaddq $T0,$D2,$D2 # d2 += h1*r1 1337*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r1 1338*4882a593Smuzhiyun vpaddq $T2,$D1,$D1 # d1 += h0*r1 1339*4882a593Smuzhiyun vpmuludq $H4,$T3,$T3 # h4*s1 1340*4882a593Smuzhiyun vpaddq $T3,$D0,$D0 # d0 += h4*s1 1341*4882a593Smuzhiyun 1342*4882a593Smuzhiyun vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 1343*4882a593Smuzhiyun vpmuludq $H2,$T4,$T1 # h2*r2 1344*4882a593Smuzhiyun vpaddq $T1,$D4,$D4 # d4 += h2*r2 1345*4882a593Smuzhiyun vpmuludq $H1,$T4,$T0 # h1*r2 1346*4882a593Smuzhiyun vpaddq $T0,$D3,$D3 # d3 += h1*r2 1347*4882a593Smuzhiyun vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 1348*4882a593Smuzhiyun vpmuludq $H0,$T4,$T4 # h0*r2 1349*4882a593Smuzhiyun vpaddq $T4,$D2,$D2 # d2 += h0*r2 1350*4882a593Smuzhiyun vpmuludq $H4,$T2,$T1 # h4*s2 1351*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 # d1 += h4*s2 1352*4882a593Smuzhiyun vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 1353*4882a593Smuzhiyun vpmuludq $H3,$T2,$T2 # h3*s2 1354*4882a593Smuzhiyun vpaddq $T2,$D0,$D0 # d0 += h3*s2 1355*4882a593Smuzhiyun 1356*4882a593Smuzhiyun vpmuludq $H1,$T3,$T0 # h1*r3 1357*4882a593Smuzhiyun vpaddq 
$T0,$D4,$D4 # d4 += h1*r3 1358*4882a593Smuzhiyun vpmuludq $H0,$T3,$T3 # h0*r3 1359*4882a593Smuzhiyun vpaddq $T3,$D3,$D3 # d3 += h0*r3 1360*4882a593Smuzhiyun vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 1361*4882a593Smuzhiyun vpmuludq $H4,$T4,$T1 # h4*s3 1362*4882a593Smuzhiyun vpaddq $T1,$D2,$D2 # d2 += h4*s3 1363*4882a593Smuzhiyun vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 1364*4882a593Smuzhiyun vpmuludq $H3,$T4,$T0 # h3*s3 1365*4882a593Smuzhiyun vpaddq $T0,$D1,$D1 # d1 += h3*s3 1366*4882a593Smuzhiyun vpmuludq $H2,$T4,$T4 # h2*s3 1367*4882a593Smuzhiyun vpaddq $T4,$D0,$D0 # d0 += h2*s3 1368*4882a593Smuzhiyun 1369*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r4 1370*4882a593Smuzhiyun vpaddq $T2,$D4,$D4 # d4 += h0*r4 1371*4882a593Smuzhiyun vpmuludq $H4,$T3,$T1 # h4*s4 1372*4882a593Smuzhiyun vpaddq $T1,$D3,$D3 # d3 += h4*s4 1373*4882a593Smuzhiyun vpmuludq $H3,$T3,$T0 # h3*s4 1374*4882a593Smuzhiyun vpaddq $T0,$D2,$D2 # d2 += h3*s4 1375*4882a593Smuzhiyun vpmuludq $H2,$T3,$T1 # h2*s4 1376*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 # d1 += h2*s4 1377*4882a593Smuzhiyun vpmuludq $H1,$T3,$T3 # h1*s4 1378*4882a593Smuzhiyun vpaddq $T3,$D0,$D0 # d0 += h1*s4 1379*4882a593Smuzhiyun 1380*4882a593Smuzhiyun.Lshort_tail_avx: 1381*4882a593Smuzhiyun ################################################################ 1382*4882a593Smuzhiyun # horizontal addition 1383*4882a593Smuzhiyun 1384*4882a593Smuzhiyun vpsrldq \$8,$D4,$T4 1385*4882a593Smuzhiyun vpsrldq \$8,$D3,$T3 1386*4882a593Smuzhiyun vpsrldq \$8,$D1,$T1 1387*4882a593Smuzhiyun vpsrldq \$8,$D0,$T0 1388*4882a593Smuzhiyun vpsrldq \$8,$D2,$T2 1389*4882a593Smuzhiyun vpaddq $T3,$D3,$D3 1390*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 1391*4882a593Smuzhiyun vpaddq $T0,$D0,$D0 1392*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 1393*4882a593Smuzhiyun vpaddq $T2,$D2,$D2 1394*4882a593Smuzhiyun 1395*4882a593Smuzhiyun ################################################################ 1396*4882a593Smuzhiyun # lazy reduction 1397*4882a593Smuzhiyun 1398*4882a593Smuzhiyun vpsrlq 
\$26,$D3,$H3 1399*4882a593Smuzhiyun vpand $MASK,$D3,$D3 1400*4882a593Smuzhiyun vpaddq $H3,$D4,$D4 # h3 -> h4 1401*4882a593Smuzhiyun 1402*4882a593Smuzhiyun vpsrlq \$26,$D0,$H0 1403*4882a593Smuzhiyun vpand $MASK,$D0,$D0 1404*4882a593Smuzhiyun vpaddq $H0,$D1,$D1 # h0 -> h1 1405*4882a593Smuzhiyun 1406*4882a593Smuzhiyun vpsrlq \$26,$D4,$H4 1407*4882a593Smuzhiyun vpand $MASK,$D4,$D4 1408*4882a593Smuzhiyun 1409*4882a593Smuzhiyun vpsrlq \$26,$D1,$H1 1410*4882a593Smuzhiyun vpand $MASK,$D1,$D1 1411*4882a593Smuzhiyun vpaddq $H1,$D2,$D2 # h1 -> h2 1412*4882a593Smuzhiyun 1413*4882a593Smuzhiyun vpaddq $H4,$D0,$D0 1414*4882a593Smuzhiyun vpsllq \$2,$H4,$H4 1415*4882a593Smuzhiyun vpaddq $H4,$D0,$D0 # h4 -> h0 1416*4882a593Smuzhiyun 1417*4882a593Smuzhiyun vpsrlq \$26,$D2,$H2 1418*4882a593Smuzhiyun vpand $MASK,$D2,$D2 1419*4882a593Smuzhiyun vpaddq $H2,$D3,$D3 # h2 -> h3 1420*4882a593Smuzhiyun 1421*4882a593Smuzhiyun vpsrlq \$26,$D0,$H0 1422*4882a593Smuzhiyun vpand $MASK,$D0,$D0 1423*4882a593Smuzhiyun vpaddq $H0,$D1,$D1 # h0 -> h1 1424*4882a593Smuzhiyun 1425*4882a593Smuzhiyun vpsrlq \$26,$D3,$H3 1426*4882a593Smuzhiyun vpand $MASK,$D3,$D3 1427*4882a593Smuzhiyun vpaddq $H3,$D4,$D4 # h3 -> h4 1428*4882a593Smuzhiyun 1429*4882a593Smuzhiyun vmovd $D0,`4*0-48-64`($ctx) # save partially reduced 1430*4882a593Smuzhiyun vmovd $D1,`4*1-48-64`($ctx) 1431*4882a593Smuzhiyun vmovd $D2,`4*2-48-64`($ctx) 1432*4882a593Smuzhiyun vmovd $D3,`4*3-48-64`($ctx) 1433*4882a593Smuzhiyun vmovd $D4,`4*4-48-64`($ctx) 1434*4882a593Smuzhiyun___ 1435*4882a593Smuzhiyun$code.=<<___ if ($win64); 1436*4882a593Smuzhiyun vmovdqa 0x50(%r11),%xmm6 1437*4882a593Smuzhiyun vmovdqa 0x60(%r11),%xmm7 1438*4882a593Smuzhiyun vmovdqa 0x70(%r11),%xmm8 1439*4882a593Smuzhiyun vmovdqa 0x80(%r11),%xmm9 1440*4882a593Smuzhiyun vmovdqa 0x90(%r11),%xmm10 1441*4882a593Smuzhiyun vmovdqa 0xa0(%r11),%xmm11 1442*4882a593Smuzhiyun vmovdqa 0xb0(%r11),%xmm12 1443*4882a593Smuzhiyun vmovdqa 0xc0(%r11),%xmm13 1444*4882a593Smuzhiyun vmovdqa 
0xd0(%r11),%xmm14 1445*4882a593Smuzhiyun vmovdqa 0xe0(%r11),%xmm15 1446*4882a593Smuzhiyun lea 0xf8(%r11),%rsp 1447*4882a593Smuzhiyun.Ldo_avx_epilogue: 1448*4882a593Smuzhiyun___ 1449*4882a593Smuzhiyun$code.=<<___ if (!$win64); 1450*4882a593Smuzhiyun lea -8(%r10),%rsp 1451*4882a593Smuzhiyun.cfi_def_cfa_register %rsp 1452*4882a593Smuzhiyun___ 1453*4882a593Smuzhiyun$code.=<<___; 1454*4882a593Smuzhiyun vzeroupper 1455*4882a593Smuzhiyun RET 1456*4882a593Smuzhiyun.cfi_endproc 1457*4882a593Smuzhiyun___ 1458*4882a593Smuzhiyun&end_function("poly1305_blocks_avx"); 1459*4882a593Smuzhiyun 1460*4882a593Smuzhiyun&declare_function("poly1305_emit_avx", 32, 3); 1461*4882a593Smuzhiyun$code.=<<___; 1462*4882a593Smuzhiyun cmpl \$0,20($ctx) # is_base2_26? 1463*4882a593Smuzhiyun je .Lemit 1464*4882a593Smuzhiyun 1465*4882a593Smuzhiyun mov 0($ctx),%eax # load hash value base 2^26 1466*4882a593Smuzhiyun mov 4($ctx),%ecx 1467*4882a593Smuzhiyun mov 8($ctx),%r8d 1468*4882a593Smuzhiyun mov 12($ctx),%r11d 1469*4882a593Smuzhiyun mov 16($ctx),%r10d 1470*4882a593Smuzhiyun 1471*4882a593Smuzhiyun shl \$26,%rcx # base 2^26 -> base 2^64 1472*4882a593Smuzhiyun mov %r8,%r9 1473*4882a593Smuzhiyun shl \$52,%r8 1474*4882a593Smuzhiyun add %rcx,%rax 1475*4882a593Smuzhiyun shr \$12,%r9 1476*4882a593Smuzhiyun add %rax,%r8 # h0 1477*4882a593Smuzhiyun adc \$0,%r9 1478*4882a593Smuzhiyun 1479*4882a593Smuzhiyun shl \$14,%r11 1480*4882a593Smuzhiyun mov %r10,%rax 1481*4882a593Smuzhiyun shr \$24,%r10 1482*4882a593Smuzhiyun add %r11,%r9 1483*4882a593Smuzhiyun shl \$40,%rax 1484*4882a593Smuzhiyun add %rax,%r9 # h1 1485*4882a593Smuzhiyun adc \$0,%r10 # h2 1486*4882a593Smuzhiyun 1487*4882a593Smuzhiyun mov %r10,%rax # could be partially reduced, so reduce 1488*4882a593Smuzhiyun mov %r10,%rcx 1489*4882a593Smuzhiyun and \$3,%r10 1490*4882a593Smuzhiyun shr \$2,%rax 1491*4882a593Smuzhiyun and \$-4,%rcx 1492*4882a593Smuzhiyun add %rcx,%rax 1493*4882a593Smuzhiyun add %rax,%r8 1494*4882a593Smuzhiyun adc \$0,%r9 
1495*4882a593Smuzhiyun adc \$0,%r10 1496*4882a593Smuzhiyun 1497*4882a593Smuzhiyun mov %r8,%rax 1498*4882a593Smuzhiyun add \$5,%r8 # compare to modulus 1499*4882a593Smuzhiyun mov %r9,%rcx 1500*4882a593Smuzhiyun adc \$0,%r9 1501*4882a593Smuzhiyun adc \$0,%r10 1502*4882a593Smuzhiyun shr \$2,%r10 # did 130-bit value overflow? 1503*4882a593Smuzhiyun cmovnz %r8,%rax 1504*4882a593Smuzhiyun cmovnz %r9,%rcx 1505*4882a593Smuzhiyun 1506*4882a593Smuzhiyun add 0($nonce),%rax # accumulate nonce 1507*4882a593Smuzhiyun adc 8($nonce),%rcx 1508*4882a593Smuzhiyun mov %rax,0($mac) # write result 1509*4882a593Smuzhiyun mov %rcx,8($mac) 1510*4882a593Smuzhiyun 1511*4882a593Smuzhiyun RET 1512*4882a593Smuzhiyun___ 1513*4882a593Smuzhiyun&end_function("poly1305_emit_avx"); 1514*4882a593Smuzhiyun 1515*4882a593Smuzhiyunif ($avx>1) { 1516*4882a593Smuzhiyun 1517*4882a593Smuzhiyunmy ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = 1518*4882a593Smuzhiyun map("%ymm$_",(0..15)); 1519*4882a593Smuzhiyunmy $S4=$MASK; 1520*4882a593Smuzhiyun 1521*4882a593Smuzhiyunsub poly1305_blocks_avxN { 1522*4882a593Smuzhiyun my ($avx512) = @_; 1523*4882a593Smuzhiyun my $suffix = $avx512 ? 
"_avx512" : ""; 1524*4882a593Smuzhiyun$code.=<<___; 1525*4882a593Smuzhiyun.cfi_startproc 1526*4882a593Smuzhiyun mov 20($ctx),%r8d # is_base2_26 1527*4882a593Smuzhiyun cmp \$128,$len 1528*4882a593Smuzhiyun jae .Lblocks_avx2$suffix 1529*4882a593Smuzhiyun test %r8d,%r8d 1530*4882a593Smuzhiyun jz .Lblocks 1531*4882a593Smuzhiyun 1532*4882a593Smuzhiyun.Lblocks_avx2$suffix: 1533*4882a593Smuzhiyun and \$-16,$len 1534*4882a593Smuzhiyun jz .Lno_data_avx2$suffix 1535*4882a593Smuzhiyun 1536*4882a593Smuzhiyun vzeroupper 1537*4882a593Smuzhiyun 1538*4882a593Smuzhiyun test %r8d,%r8d 1539*4882a593Smuzhiyun jz .Lbase2_64_avx2$suffix 1540*4882a593Smuzhiyun 1541*4882a593Smuzhiyun test \$63,$len 1542*4882a593Smuzhiyun jz .Leven_avx2$suffix 1543*4882a593Smuzhiyun 1544*4882a593Smuzhiyun push %rbp 1545*4882a593Smuzhiyun.cfi_push %rbp 1546*4882a593Smuzhiyun mov %rsp,%rbp 1547*4882a593Smuzhiyun push %rbx 1548*4882a593Smuzhiyun.cfi_push %rbx 1549*4882a593Smuzhiyun push %r12 1550*4882a593Smuzhiyun.cfi_push %r12 1551*4882a593Smuzhiyun push %r13 1552*4882a593Smuzhiyun.cfi_push %r13 1553*4882a593Smuzhiyun push %r14 1554*4882a593Smuzhiyun.cfi_push %r14 1555*4882a593Smuzhiyun push %r15 1556*4882a593Smuzhiyun.cfi_push %r15 1557*4882a593Smuzhiyun.Lblocks_avx2_body$suffix: 1558*4882a593Smuzhiyun 1559*4882a593Smuzhiyun mov $len,%r15 # reassign $len 1560*4882a593Smuzhiyun 1561*4882a593Smuzhiyun mov 0($ctx),$d1 # load hash value 1562*4882a593Smuzhiyun mov 8($ctx),$d2 1563*4882a593Smuzhiyun mov 16($ctx),$h2#d 1564*4882a593Smuzhiyun 1565*4882a593Smuzhiyun mov 24($ctx),$r0 # load r 1566*4882a593Smuzhiyun mov 32($ctx),$s1 1567*4882a593Smuzhiyun 1568*4882a593Smuzhiyun ################################# base 2^26 -> base 2^64 1569*4882a593Smuzhiyun mov $d1#d,$h0#d 1570*4882a593Smuzhiyun and \$`-1*(1<<31)`,$d1 1571*4882a593Smuzhiyun mov $d2,$r1 # borrow $r1 1572*4882a593Smuzhiyun mov $d2#d,$h1#d 1573*4882a593Smuzhiyun and \$`-1*(1<<31)`,$d2 1574*4882a593Smuzhiyun 1575*4882a593Smuzhiyun shr \$6,$d1 
1576*4882a593Smuzhiyun shl \$52,$r1 1577*4882a593Smuzhiyun add $d1,$h0 1578*4882a593Smuzhiyun shr \$12,$h1 1579*4882a593Smuzhiyun shr \$18,$d2 1580*4882a593Smuzhiyun add $r1,$h0 1581*4882a593Smuzhiyun adc $d2,$h1 1582*4882a593Smuzhiyun 1583*4882a593Smuzhiyun mov $h2,$d1 1584*4882a593Smuzhiyun shl \$40,$d1 1585*4882a593Smuzhiyun shr \$24,$h2 1586*4882a593Smuzhiyun add $d1,$h1 1587*4882a593Smuzhiyun adc \$0,$h2 # can be partially reduced... 1588*4882a593Smuzhiyun 1589*4882a593Smuzhiyun mov \$-4,$d2 # ... so reduce 1590*4882a593Smuzhiyun mov $h2,$d1 1591*4882a593Smuzhiyun and $h2,$d2 1592*4882a593Smuzhiyun shr \$2,$d1 1593*4882a593Smuzhiyun and \$3,$h2 1594*4882a593Smuzhiyun add $d2,$d1 # =*5 1595*4882a593Smuzhiyun add $d1,$h0 1596*4882a593Smuzhiyun adc \$0,$h1 1597*4882a593Smuzhiyun adc \$0,$h2 1598*4882a593Smuzhiyun 1599*4882a593Smuzhiyun mov $s1,$r1 1600*4882a593Smuzhiyun mov $s1,%rax 1601*4882a593Smuzhiyun shr \$2,$s1 1602*4882a593Smuzhiyun add $r1,$s1 # s1 = r1 + (r1 >> 2) 1603*4882a593Smuzhiyun 1604*4882a593Smuzhiyun.Lbase2_26_pre_avx2$suffix: 1605*4882a593Smuzhiyun add 0($inp),$h0 # accumulate input 1606*4882a593Smuzhiyun adc 8($inp),$h1 1607*4882a593Smuzhiyun lea 16($inp),$inp 1608*4882a593Smuzhiyun adc $padbit,$h2 1609*4882a593Smuzhiyun sub \$16,%r15 1610*4882a593Smuzhiyun 1611*4882a593Smuzhiyun call __poly1305_block 1612*4882a593Smuzhiyun mov $r1,%rax 1613*4882a593Smuzhiyun 1614*4882a593Smuzhiyun test \$63,%r15 1615*4882a593Smuzhiyun jnz .Lbase2_26_pre_avx2$suffix 1616*4882a593Smuzhiyun 1617*4882a593Smuzhiyun test $padbit,$padbit # if $padbit is zero, 1618*4882a593Smuzhiyun jz .Lstore_base2_64_avx2$suffix # store hash in base 2^64 format 1619*4882a593Smuzhiyun 1620*4882a593Smuzhiyun ################################# base 2^64 -> base 2^26 1621*4882a593Smuzhiyun mov $h0,%rax 1622*4882a593Smuzhiyun mov $h0,%rdx 1623*4882a593Smuzhiyun shr \$52,$h0 1624*4882a593Smuzhiyun mov $h1,$r0 1625*4882a593Smuzhiyun mov $h1,$r1 1626*4882a593Smuzhiyun shr \$26,%rdx 
1627*4882a593Smuzhiyun and \$0x3ffffff,%rax # h[0] 1628*4882a593Smuzhiyun shl \$12,$r0 1629*4882a593Smuzhiyun and \$0x3ffffff,%rdx # h[1] 1630*4882a593Smuzhiyun shr \$14,$h1 1631*4882a593Smuzhiyun or $r0,$h0 1632*4882a593Smuzhiyun shl \$24,$h2 1633*4882a593Smuzhiyun and \$0x3ffffff,$h0 # h[2] 1634*4882a593Smuzhiyun shr \$40,$r1 1635*4882a593Smuzhiyun and \$0x3ffffff,$h1 # h[3] 1636*4882a593Smuzhiyun or $r1,$h2 # h[4] 1637*4882a593Smuzhiyun 1638*4882a593Smuzhiyun test %r15,%r15 1639*4882a593Smuzhiyun jz .Lstore_base2_26_avx2$suffix 1640*4882a593Smuzhiyun 1641*4882a593Smuzhiyun vmovd %rax#d,%x#$H0 1642*4882a593Smuzhiyun vmovd %rdx#d,%x#$H1 1643*4882a593Smuzhiyun vmovd $h0#d,%x#$H2 1644*4882a593Smuzhiyun vmovd $h1#d,%x#$H3 1645*4882a593Smuzhiyun vmovd $h2#d,%x#$H4 1646*4882a593Smuzhiyun jmp .Lproceed_avx2$suffix 1647*4882a593Smuzhiyun 1648*4882a593Smuzhiyun.align 32 1649*4882a593Smuzhiyun.Lstore_base2_64_avx2$suffix: 1650*4882a593Smuzhiyun mov $h0,0($ctx) 1651*4882a593Smuzhiyun mov $h1,8($ctx) 1652*4882a593Smuzhiyun mov $h2,16($ctx) # note that is_base2_26 is zeroed 1653*4882a593Smuzhiyun jmp .Ldone_avx2$suffix 1654*4882a593Smuzhiyun 1655*4882a593Smuzhiyun.align 16 1656*4882a593Smuzhiyun.Lstore_base2_26_avx2$suffix: 1657*4882a593Smuzhiyun mov %rax#d,0($ctx) # store hash value base 2^26 1658*4882a593Smuzhiyun mov %rdx#d,4($ctx) 1659*4882a593Smuzhiyun mov $h0#d,8($ctx) 1660*4882a593Smuzhiyun mov $h1#d,12($ctx) 1661*4882a593Smuzhiyun mov $h2#d,16($ctx) 1662*4882a593Smuzhiyun.align 16 1663*4882a593Smuzhiyun.Ldone_avx2$suffix: 1664*4882a593Smuzhiyun pop %r15 1665*4882a593Smuzhiyun.cfi_restore %r15 1666*4882a593Smuzhiyun pop %r14 1667*4882a593Smuzhiyun.cfi_restore %r14 1668*4882a593Smuzhiyun pop %r13 1669*4882a593Smuzhiyun.cfi_restore %r13 1670*4882a593Smuzhiyun pop %r12 1671*4882a593Smuzhiyun.cfi_restore %r12 1672*4882a593Smuzhiyun pop %rbx 1673*4882a593Smuzhiyun.cfi_restore %rbx 1674*4882a593Smuzhiyun pop %rbp 1675*4882a593Smuzhiyun.cfi_restore %rbp 
1676*4882a593Smuzhiyun.Lno_data_avx2$suffix: 1677*4882a593Smuzhiyun.Lblocks_avx2_epilogue$suffix: 1678*4882a593Smuzhiyun RET 1679*4882a593Smuzhiyun.cfi_endproc 1680*4882a593Smuzhiyun 1681*4882a593Smuzhiyun.align 32 1682*4882a593Smuzhiyun.Lbase2_64_avx2$suffix: 1683*4882a593Smuzhiyun.cfi_startproc 1684*4882a593Smuzhiyun push %rbp 1685*4882a593Smuzhiyun.cfi_push %rbp 1686*4882a593Smuzhiyun mov %rsp,%rbp 1687*4882a593Smuzhiyun push %rbx 1688*4882a593Smuzhiyun.cfi_push %rbx 1689*4882a593Smuzhiyun push %r12 1690*4882a593Smuzhiyun.cfi_push %r12 1691*4882a593Smuzhiyun push %r13 1692*4882a593Smuzhiyun.cfi_push %r13 1693*4882a593Smuzhiyun push %r14 1694*4882a593Smuzhiyun.cfi_push %r14 1695*4882a593Smuzhiyun push %r15 1696*4882a593Smuzhiyun.cfi_push %r15 1697*4882a593Smuzhiyun.Lbase2_64_avx2_body$suffix: 1698*4882a593Smuzhiyun 1699*4882a593Smuzhiyun mov $len,%r15 # reassign $len 1700*4882a593Smuzhiyun 1701*4882a593Smuzhiyun mov 24($ctx),$r0 # load r 1702*4882a593Smuzhiyun mov 32($ctx),$s1 1703*4882a593Smuzhiyun 1704*4882a593Smuzhiyun mov 0($ctx),$h0 # load hash value 1705*4882a593Smuzhiyun mov 8($ctx),$h1 1706*4882a593Smuzhiyun mov 16($ctx),$h2#d 1707*4882a593Smuzhiyun 1708*4882a593Smuzhiyun mov $s1,$r1 1709*4882a593Smuzhiyun mov $s1,%rax 1710*4882a593Smuzhiyun shr \$2,$s1 1711*4882a593Smuzhiyun add $r1,$s1 # s1 = r1 + (r1 >> 2) 1712*4882a593Smuzhiyun 1713*4882a593Smuzhiyun test \$63,$len 1714*4882a593Smuzhiyun jz .Linit_avx2$suffix 1715*4882a593Smuzhiyun 1716*4882a593Smuzhiyun.Lbase2_64_pre_avx2$suffix: 1717*4882a593Smuzhiyun add 0($inp),$h0 # accumulate input 1718*4882a593Smuzhiyun adc 8($inp),$h1 1719*4882a593Smuzhiyun lea 16($inp),$inp 1720*4882a593Smuzhiyun adc $padbit,$h2 1721*4882a593Smuzhiyun sub \$16,%r15 1722*4882a593Smuzhiyun 1723*4882a593Smuzhiyun call __poly1305_block 1724*4882a593Smuzhiyun mov $r1,%rax 1725*4882a593Smuzhiyun 1726*4882a593Smuzhiyun test \$63,%r15 1727*4882a593Smuzhiyun jnz .Lbase2_64_pre_avx2$suffix 1728*4882a593Smuzhiyun 
1729*4882a593Smuzhiyun.Linit_avx2$suffix: 1730*4882a593Smuzhiyun ################################# base 2^64 -> base 2^26 1731*4882a593Smuzhiyun mov $h0,%rax 1732*4882a593Smuzhiyun mov $h0,%rdx 1733*4882a593Smuzhiyun shr \$52,$h0 1734*4882a593Smuzhiyun mov $h1,$d1 1735*4882a593Smuzhiyun mov $h1,$d2 1736*4882a593Smuzhiyun shr \$26,%rdx 1737*4882a593Smuzhiyun and \$0x3ffffff,%rax # h[0] 1738*4882a593Smuzhiyun shl \$12,$d1 1739*4882a593Smuzhiyun and \$0x3ffffff,%rdx # h[1] 1740*4882a593Smuzhiyun shr \$14,$h1 1741*4882a593Smuzhiyun or $d1,$h0 1742*4882a593Smuzhiyun shl \$24,$h2 1743*4882a593Smuzhiyun and \$0x3ffffff,$h0 # h[2] 1744*4882a593Smuzhiyun shr \$40,$d2 1745*4882a593Smuzhiyun and \$0x3ffffff,$h1 # h[3] 1746*4882a593Smuzhiyun or $d2,$h2 # h[4] 1747*4882a593Smuzhiyun 1748*4882a593Smuzhiyun vmovd %rax#d,%x#$H0 1749*4882a593Smuzhiyun vmovd %rdx#d,%x#$H1 1750*4882a593Smuzhiyun vmovd $h0#d,%x#$H2 1751*4882a593Smuzhiyun vmovd $h1#d,%x#$H3 1752*4882a593Smuzhiyun vmovd $h2#d,%x#$H4 1753*4882a593Smuzhiyun movl \$1,20($ctx) # set is_base2_26 1754*4882a593Smuzhiyun 1755*4882a593Smuzhiyun call __poly1305_init_avx 1756*4882a593Smuzhiyun 1757*4882a593Smuzhiyun.Lproceed_avx2$suffix: 1758*4882a593Smuzhiyun mov %r15,$len # restore $len 1759*4882a593Smuzhiyun___ 1760*4882a593Smuzhiyun$code.=<<___ if (!$kernel); 1761*4882a593Smuzhiyun mov OPENSSL_ia32cap_P+8(%rip),%r9d 1762*4882a593Smuzhiyun mov \$`(1<<31|1<<30|1<<16)`,%r11d 1763*4882a593Smuzhiyun___ 1764*4882a593Smuzhiyun$code.=<<___; 1765*4882a593Smuzhiyun pop %r15 1766*4882a593Smuzhiyun.cfi_restore %r15 1767*4882a593Smuzhiyun pop %r14 1768*4882a593Smuzhiyun.cfi_restore %r14 1769*4882a593Smuzhiyun pop %r13 1770*4882a593Smuzhiyun.cfi_restore %r13 1771*4882a593Smuzhiyun pop %r12 1772*4882a593Smuzhiyun.cfi_restore %r12 1773*4882a593Smuzhiyun pop %rbx 1774*4882a593Smuzhiyun.cfi_restore %rbx 1775*4882a593Smuzhiyun pop %rbp 1776*4882a593Smuzhiyun.cfi_restore %rbp 1777*4882a593Smuzhiyun.Lbase2_64_avx2_epilogue$suffix: 
1778*4882a593Smuzhiyun jmp .Ldo_avx2$suffix 1779*4882a593Smuzhiyun.cfi_endproc 1780*4882a593Smuzhiyun 1781*4882a593Smuzhiyun.align 32 1782*4882a593Smuzhiyun.Leven_avx2$suffix: 1783*4882a593Smuzhiyun.cfi_startproc 1784*4882a593Smuzhiyun___ 1785*4882a593Smuzhiyun$code.=<<___ if (!$kernel); 1786*4882a593Smuzhiyun mov OPENSSL_ia32cap_P+8(%rip),%r9d 1787*4882a593Smuzhiyun___ 1788*4882a593Smuzhiyun$code.=<<___; 1789*4882a593Smuzhiyun vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 1790*4882a593Smuzhiyun vmovd 4*1($ctx),%x#$H1 1791*4882a593Smuzhiyun vmovd 4*2($ctx),%x#$H2 1792*4882a593Smuzhiyun vmovd 4*3($ctx),%x#$H3 1793*4882a593Smuzhiyun vmovd 4*4($ctx),%x#$H4 1794*4882a593Smuzhiyun 1795*4882a593Smuzhiyun.Ldo_avx2$suffix: 1796*4882a593Smuzhiyun___ 1797*4882a593Smuzhiyun$code.=<<___ if (!$kernel && $avx>2); 1798*4882a593Smuzhiyun cmp \$512,$len 1799*4882a593Smuzhiyun jb .Lskip_avx512 1800*4882a593Smuzhiyun and %r11d,%r9d 1801*4882a593Smuzhiyun test \$`1<<16`,%r9d # check for AVX512F 1802*4882a593Smuzhiyun jnz .Lblocks_avx512 1803*4882a593Smuzhiyun.Lskip_avx512$suffix: 1804*4882a593Smuzhiyun___ 1805*4882a593Smuzhiyun$code.=<<___ if ($avx > 2 && $avx512 && $kernel); 1806*4882a593Smuzhiyun cmp \$512,$len 1807*4882a593Smuzhiyun jae .Lblocks_avx512 1808*4882a593Smuzhiyun___ 1809*4882a593Smuzhiyun$code.=<<___ if (!$win64); 1810*4882a593Smuzhiyun lea 8(%rsp),%r10 1811*4882a593Smuzhiyun.cfi_def_cfa_register %r10 1812*4882a593Smuzhiyun sub \$0x128,%rsp 1813*4882a593Smuzhiyun___ 1814*4882a593Smuzhiyun$code.=<<___ if ($win64); 1815*4882a593Smuzhiyun lea 8(%rsp),%r10 1816*4882a593Smuzhiyun sub \$0x1c8,%rsp 1817*4882a593Smuzhiyun vmovdqa %xmm6,-0xb0(%r10) 1818*4882a593Smuzhiyun vmovdqa %xmm7,-0xa0(%r10) 1819*4882a593Smuzhiyun vmovdqa %xmm8,-0x90(%r10) 1820*4882a593Smuzhiyun vmovdqa %xmm9,-0x80(%r10) 1821*4882a593Smuzhiyun vmovdqa %xmm10,-0x70(%r10) 1822*4882a593Smuzhiyun vmovdqa %xmm11,-0x60(%r10) 1823*4882a593Smuzhiyun vmovdqa %xmm12,-0x50(%r10) 1824*4882a593Smuzhiyun vmovdqa 
%xmm13,-0x40(%r10) 1825*4882a593Smuzhiyun vmovdqa %xmm14,-0x30(%r10) 1826*4882a593Smuzhiyun vmovdqa %xmm15,-0x20(%r10) 1827*4882a593Smuzhiyun.Ldo_avx2_body$suffix: 1828*4882a593Smuzhiyun___ 1829*4882a593Smuzhiyun$code.=<<___; 1830*4882a593Smuzhiyun lea .Lconst(%rip),%rcx 1831*4882a593Smuzhiyun lea 48+64($ctx),$ctx # size optimization 1832*4882a593Smuzhiyun vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 1833*4882a593Smuzhiyun 1834*4882a593Smuzhiyun # expand and copy pre-calculated table to stack 1835*4882a593Smuzhiyun vmovdqu `16*0-64`($ctx),%x#$T2 1836*4882a593Smuzhiyun and \$-512,%rsp 1837*4882a593Smuzhiyun vmovdqu `16*1-64`($ctx),%x#$T3 1838*4882a593Smuzhiyun vmovdqu `16*2-64`($ctx),%x#$T4 1839*4882a593Smuzhiyun vmovdqu `16*3-64`($ctx),%x#$D0 1840*4882a593Smuzhiyun vmovdqu `16*4-64`($ctx),%x#$D1 1841*4882a593Smuzhiyun vmovdqu `16*5-64`($ctx),%x#$D2 1842*4882a593Smuzhiyun lea 0x90(%rsp),%rax # size optimization 1843*4882a593Smuzhiyun vmovdqu `16*6-64`($ctx),%x#$D3 1844*4882a593Smuzhiyun vpermd $T2,$T0,$T2 # 00003412 -> 14243444 1845*4882a593Smuzhiyun vmovdqu `16*7-64`($ctx),%x#$D4 1846*4882a593Smuzhiyun vpermd $T3,$T0,$T3 1847*4882a593Smuzhiyun vmovdqu `16*8-64`($ctx),%x#$MASK 1848*4882a593Smuzhiyun vpermd $T4,$T0,$T4 1849*4882a593Smuzhiyun vmovdqa $T2,0x00(%rsp) 1850*4882a593Smuzhiyun vpermd $D0,$T0,$D0 1851*4882a593Smuzhiyun vmovdqa $T3,0x20-0x90(%rax) 1852*4882a593Smuzhiyun vpermd $D1,$T0,$D1 1853*4882a593Smuzhiyun vmovdqa $T4,0x40-0x90(%rax) 1854*4882a593Smuzhiyun vpermd $D2,$T0,$D2 1855*4882a593Smuzhiyun vmovdqa $D0,0x60-0x90(%rax) 1856*4882a593Smuzhiyun vpermd $D3,$T0,$D3 1857*4882a593Smuzhiyun vmovdqa $D1,0x80-0x90(%rax) 1858*4882a593Smuzhiyun vpermd $D4,$T0,$D4 1859*4882a593Smuzhiyun vmovdqa $D2,0xa0-0x90(%rax) 1860*4882a593Smuzhiyun vpermd $MASK,$T0,$MASK 1861*4882a593Smuzhiyun vmovdqa $D3,0xc0-0x90(%rax) 1862*4882a593Smuzhiyun vmovdqa $D4,0xe0-0x90(%rax) 1863*4882a593Smuzhiyun vmovdqa $MASK,0x100-0x90(%rax) 1864*4882a593Smuzhiyun vmovdqa 64(%rcx),$MASK # .Lmask26 
1865*4882a593Smuzhiyun 1866*4882a593Smuzhiyun ################################################################ 1867*4882a593Smuzhiyun # load input 1868*4882a593Smuzhiyun vmovdqu 16*0($inp),%x#$T0 1869*4882a593Smuzhiyun vmovdqu 16*1($inp),%x#$T1 1870*4882a593Smuzhiyun vinserti128 \$1,16*2($inp),$T0,$T0 1871*4882a593Smuzhiyun vinserti128 \$1,16*3($inp),$T1,$T1 1872*4882a593Smuzhiyun lea 16*4($inp),$inp 1873*4882a593Smuzhiyun 1874*4882a593Smuzhiyun vpsrldq \$6,$T0,$T2 # splat input 1875*4882a593Smuzhiyun vpsrldq \$6,$T1,$T3 1876*4882a593Smuzhiyun vpunpckhqdq $T1,$T0,$T4 # 4 1877*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T2 # 2:3 1878*4882a593Smuzhiyun vpunpcklqdq $T1,$T0,$T0 # 0:1 1879*4882a593Smuzhiyun 1880*4882a593Smuzhiyun vpsrlq \$30,$T2,$T3 1881*4882a593Smuzhiyun vpsrlq \$4,$T2,$T2 1882*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 1883*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 1884*4882a593Smuzhiyun vpand $MASK,$T2,$T2 # 2 1885*4882a593Smuzhiyun vpand $MASK,$T0,$T0 # 0 1886*4882a593Smuzhiyun vpand $MASK,$T1,$T1 # 1 1887*4882a593Smuzhiyun vpand $MASK,$T3,$T3 # 3 1888*4882a593Smuzhiyun vpor 32(%rcx),$T4,$T4 # padbit, yes, always 1889*4882a593Smuzhiyun 1890*4882a593Smuzhiyun vpaddq $H2,$T2,$H2 # accumulate input 1891*4882a593Smuzhiyun sub \$64,$len 1892*4882a593Smuzhiyun jz .Ltail_avx2$suffix 1893*4882a593Smuzhiyun jmp .Loop_avx2$suffix 1894*4882a593Smuzhiyun 1895*4882a593Smuzhiyun.align 32 1896*4882a593Smuzhiyun.Loop_avx2$suffix: 1897*4882a593Smuzhiyun ################################################################ 1898*4882a593Smuzhiyun # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 1899*4882a593Smuzhiyun # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 1900*4882a593Smuzhiyun # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 1901*4882a593Smuzhiyun # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 1902*4882a593Smuzhiyun # \________/\__________/ 1903*4882a593Smuzhiyun ################################################################ 1904*4882a593Smuzhiyun #vpaddq $H2,$T2,$H2 # accumulate input 
1905*4882a593Smuzhiyun vpaddq $H0,$T0,$H0 1906*4882a593Smuzhiyun vmovdqa `32*0`(%rsp),$T0 # r0^4 1907*4882a593Smuzhiyun vpaddq $H1,$T1,$H1 1908*4882a593Smuzhiyun vmovdqa `32*1`(%rsp),$T1 # r1^4 1909*4882a593Smuzhiyun vpaddq $H3,$T3,$H3 1910*4882a593Smuzhiyun vmovdqa `32*3`(%rsp),$T2 # r2^4 1911*4882a593Smuzhiyun vpaddq $H4,$T4,$H4 1912*4882a593Smuzhiyun vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 1913*4882a593Smuzhiyun vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 1914*4882a593Smuzhiyun 1915*4882a593Smuzhiyun # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 1916*4882a593Smuzhiyun # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 1917*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1918*4882a593Smuzhiyun # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 1919*4882a593Smuzhiyun # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 1920*4882a593Smuzhiyun # 1921*4882a593Smuzhiyun # however, as h2 is "chronologically" first one available pull 1922*4882a593Smuzhiyun # corresponding operations up, so it's 1923*4882a593Smuzhiyun # 1924*4882a593Smuzhiyun # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 1925*4882a593Smuzhiyun # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 1926*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 1927*4882a593Smuzhiyun # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 1928*4882a593Smuzhiyun # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 1929*4882a593Smuzhiyun 1930*4882a593Smuzhiyun vpmuludq $H2,$T0,$D2 # d2 = h2*r0 1931*4882a593Smuzhiyun vpmuludq $H2,$T1,$D3 # d3 = h2*r1 1932*4882a593Smuzhiyun vpmuludq $H2,$T2,$D4 # d4 = h2*r2 1933*4882a593Smuzhiyun vpmuludq $H2,$T3,$D0 # d0 = h2*s3 1934*4882a593Smuzhiyun vpmuludq $H2,$S4,$D1 # d1 = h2*s4 1935*4882a593Smuzhiyun 1936*4882a593Smuzhiyun vpmuludq $H0,$T1,$T4 # h0*r1 1937*4882a593Smuzhiyun vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp 1938*4882a593Smuzhiyun vpaddq $T4,$D1,$D1 # d1 += h0*r1 1939*4882a593Smuzhiyun vpaddq $H2,$D2,$D2 # d2 += h1*r1 1940*4882a593Smuzhiyun vpmuludq 
$H3,$T1,$T4 # h3*r1 1941*4882a593Smuzhiyun vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 1942*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h3*r1 1943*4882a593Smuzhiyun vpaddq $H2,$D0,$D0 # d0 += h4*s1 1944*4882a593Smuzhiyun vmovdqa `32*4-0x90`(%rax),$T1 # s2 1945*4882a593Smuzhiyun 1946*4882a593Smuzhiyun vpmuludq $H0,$T0,$T4 # h0*r0 1947*4882a593Smuzhiyun vpmuludq $H1,$T0,$H2 # h1*r0 1948*4882a593Smuzhiyun vpaddq $T4,$D0,$D0 # d0 += h0*r0 1949*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h1*r0 1950*4882a593Smuzhiyun vpmuludq $H3,$T0,$T4 # h3*r0 1951*4882a593Smuzhiyun vpmuludq $H4,$T0,$H2 # h4*r0 1952*4882a593Smuzhiyun vmovdqu 16*0($inp),%x#$T0 # load input 1953*4882a593Smuzhiyun vpaddq $T4,$D3,$D3 # d3 += h3*r0 1954*4882a593Smuzhiyun vpaddq $H2,$D4,$D4 # d4 += h4*r0 1955*4882a593Smuzhiyun vinserti128 \$1,16*2($inp),$T0,$T0 1956*4882a593Smuzhiyun 1957*4882a593Smuzhiyun vpmuludq $H3,$T1,$T4 # h3*s2 1958*4882a593Smuzhiyun vpmuludq $H4,$T1,$H2 # h4*s2 1959*4882a593Smuzhiyun vmovdqu 16*1($inp),%x#$T1 1960*4882a593Smuzhiyun vpaddq $T4,$D0,$D0 # d0 += h3*s2 1961*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h4*s2 1962*4882a593Smuzhiyun vmovdqa `32*5-0x90`(%rax),$H2 # r3 1963*4882a593Smuzhiyun vpmuludq $H1,$T2,$T4 # h1*r2 1964*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r2 1965*4882a593Smuzhiyun vpaddq $T4,$D3,$D3 # d3 += h1*r2 1966*4882a593Smuzhiyun vpaddq $T2,$D2,$D2 # d2 += h0*r2 1967*4882a593Smuzhiyun vinserti128 \$1,16*3($inp),$T1,$T1 1968*4882a593Smuzhiyun lea 16*4($inp),$inp 1969*4882a593Smuzhiyun 1970*4882a593Smuzhiyun vpmuludq $H1,$H2,$T4 # h1*r3 1971*4882a593Smuzhiyun vpmuludq $H0,$H2,$H2 # h0*r3 1972*4882a593Smuzhiyun vpsrldq \$6,$T0,$T2 # splat input 1973*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h1*r3 1974*4882a593Smuzhiyun vpaddq $H2,$D3,$D3 # d3 += h0*r3 1975*4882a593Smuzhiyun vpmuludq $H3,$T3,$T4 # h3*s3 1976*4882a593Smuzhiyun vpmuludq $H4,$T3,$H2 # h4*s3 1977*4882a593Smuzhiyun vpsrldq \$6,$T1,$T3 1978*4882a593Smuzhiyun vpaddq $T4,$D1,$D1 # d1 += h3*s3 
1979*4882a593Smuzhiyun vpaddq $H2,$D2,$D2 # d2 += h4*s3 1980*4882a593Smuzhiyun vpunpckhqdq $T1,$T0,$T4 # 4 1981*4882a593Smuzhiyun 1982*4882a593Smuzhiyun vpmuludq $H3,$S4,$H3 # h3*s4 1983*4882a593Smuzhiyun vpmuludq $H4,$S4,$H4 # h4*s4 1984*4882a593Smuzhiyun vpunpcklqdq $T1,$T0,$T0 # 0:1 1985*4882a593Smuzhiyun vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 1986*4882a593Smuzhiyun vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 1987*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T3 # 2:3 1988*4882a593Smuzhiyun vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 1989*4882a593Smuzhiyun vpmuludq $H1,$S4,$H0 # h1*s4 1990*4882a593Smuzhiyun vmovdqa 64(%rcx),$MASK # .Lmask26 1991*4882a593Smuzhiyun vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 1992*4882a593Smuzhiyun vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 1993*4882a593Smuzhiyun 1994*4882a593Smuzhiyun ################################################################ 1995*4882a593Smuzhiyun # lazy reduction (interleaved with tail of input splat) 1996*4882a593Smuzhiyun 1997*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 1998*4882a593Smuzhiyun vpand $MASK,$H3,$H3 1999*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2000*4882a593Smuzhiyun 2001*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2002*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2003*4882a593Smuzhiyun vpaddq $D0,$D1,$H1 # h0 -> h1 2004*4882a593Smuzhiyun 2005*4882a593Smuzhiyun vpsrlq \$26,$H4,$D4 2006*4882a593Smuzhiyun vpand $MASK,$H4,$H4 2007*4882a593Smuzhiyun 2008*4882a593Smuzhiyun vpsrlq \$4,$T3,$T2 2009*4882a593Smuzhiyun 2010*4882a593Smuzhiyun vpsrlq \$26,$H1,$D1 2011*4882a593Smuzhiyun vpand $MASK,$H1,$H1 2012*4882a593Smuzhiyun vpaddq $D1,$H2,$H2 # h1 -> h2 2013*4882a593Smuzhiyun 2014*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 2015*4882a593Smuzhiyun vpsllq \$2,$D4,$D4 2016*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 # h4 -> h0 2017*4882a593Smuzhiyun 2018*4882a593Smuzhiyun vpand $MASK,$T2,$T2 # 2 2019*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 2020*4882a593Smuzhiyun 2021*4882a593Smuzhiyun vpsrlq \$26,$H2,$D2 2022*4882a593Smuzhiyun vpand $MASK,$H2,$H2 
2023*4882a593Smuzhiyun vpaddq $D2,$H3,$H3 # h2 -> h3 2024*4882a593Smuzhiyun 2025*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # modulo-scheduled 2026*4882a593Smuzhiyun vpsrlq \$30,$T3,$T3 2027*4882a593Smuzhiyun 2028*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2029*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2030*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2031*4882a593Smuzhiyun 2032*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 2033*4882a593Smuzhiyun 2034*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2035*4882a593Smuzhiyun vpand $MASK,$H3,$H3 2036*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2037*4882a593Smuzhiyun 2038*4882a593Smuzhiyun vpand $MASK,$T0,$T0 # 0 2039*4882a593Smuzhiyun vpand $MASK,$T1,$T1 # 1 2040*4882a593Smuzhiyun vpand $MASK,$T3,$T3 # 3 2041*4882a593Smuzhiyun vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2042*4882a593Smuzhiyun 2043*4882a593Smuzhiyun sub \$64,$len 2044*4882a593Smuzhiyun jnz .Loop_avx2$suffix 2045*4882a593Smuzhiyun 2046*4882a593Smuzhiyun .byte 0x66,0x90 2047*4882a593Smuzhiyun.Ltail_avx2$suffix: 2048*4882a593Smuzhiyun ################################################################ 2049*4882a593Smuzhiyun # while above multiplications were by r^4 in all lanes, in last 2050*4882a593Smuzhiyun # iteration we multiply least significant lane by r^4 and most 2051*4882a593Smuzhiyun # significant one by r, so copy of above except that references 2052*4882a593Smuzhiyun # to the precomputed table are displaced by 4... 
2053*4882a593Smuzhiyun 2054*4882a593Smuzhiyun #vpaddq $H2,$T2,$H2 # accumulate input 2055*4882a593Smuzhiyun vpaddq $H0,$T0,$H0 2056*4882a593Smuzhiyun vmovdqu `32*0+4`(%rsp),$T0 # r0^4 2057*4882a593Smuzhiyun vpaddq $H1,$T1,$H1 2058*4882a593Smuzhiyun vmovdqu `32*1+4`(%rsp),$T1 # r1^4 2059*4882a593Smuzhiyun vpaddq $H3,$T3,$H3 2060*4882a593Smuzhiyun vmovdqu `32*3+4`(%rsp),$T2 # r2^4 2061*4882a593Smuzhiyun vpaddq $H4,$T4,$H4 2062*4882a593Smuzhiyun vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 2063*4882a593Smuzhiyun vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 2064*4882a593Smuzhiyun 2065*4882a593Smuzhiyun vpmuludq $H2,$T0,$D2 # d2 = h2*r0 2066*4882a593Smuzhiyun vpmuludq $H2,$T1,$D3 # d3 = h2*r1 2067*4882a593Smuzhiyun vpmuludq $H2,$T2,$D4 # d4 = h2*r2 2068*4882a593Smuzhiyun vpmuludq $H2,$T3,$D0 # d0 = h2*s3 2069*4882a593Smuzhiyun vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2070*4882a593Smuzhiyun 2071*4882a593Smuzhiyun vpmuludq $H0,$T1,$T4 # h0*r1 2072*4882a593Smuzhiyun vpmuludq $H1,$T1,$H2 # h1*r1 2073*4882a593Smuzhiyun vpaddq $T4,$D1,$D1 # d1 += h0*r1 2074*4882a593Smuzhiyun vpaddq $H2,$D2,$D2 # d2 += h1*r1 2075*4882a593Smuzhiyun vpmuludq $H3,$T1,$T4 # h3*r1 2076*4882a593Smuzhiyun vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 2077*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h3*r1 2078*4882a593Smuzhiyun vpaddq $H2,$D0,$D0 # d0 += h4*s1 2079*4882a593Smuzhiyun 2080*4882a593Smuzhiyun vpmuludq $H0,$T0,$T4 # h0*r0 2081*4882a593Smuzhiyun vpmuludq $H1,$T0,$H2 # h1*r0 2082*4882a593Smuzhiyun vpaddq $T4,$D0,$D0 # d0 += h0*r0 2083*4882a593Smuzhiyun vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 2084*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h1*r0 2085*4882a593Smuzhiyun vpmuludq $H3,$T0,$T4 # h3*r0 2086*4882a593Smuzhiyun vpmuludq $H4,$T0,$H2 # h4*r0 2087*4882a593Smuzhiyun vpaddq $T4,$D3,$D3 # d3 += h3*r0 2088*4882a593Smuzhiyun vpaddq $H2,$D4,$D4 # d4 += h4*r0 2089*4882a593Smuzhiyun 2090*4882a593Smuzhiyun vpmuludq $H3,$T1,$T4 # h3*s2 2091*4882a593Smuzhiyun vpmuludq $H4,$T1,$H2 # h4*s2 2092*4882a593Smuzhiyun vpaddq 
$T4,$D0,$D0 # d0 += h3*s2 2093*4882a593Smuzhiyun vpaddq $H2,$D1,$D1 # d1 += h4*s2 2094*4882a593Smuzhiyun vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 2095*4882a593Smuzhiyun vpmuludq $H1,$T2,$T4 # h1*r2 2096*4882a593Smuzhiyun vpmuludq $H0,$T2,$T2 # h0*r2 2097*4882a593Smuzhiyun vpaddq $T4,$D3,$D3 # d3 += h1*r2 2098*4882a593Smuzhiyun vpaddq $T2,$D2,$D2 # d2 += h0*r2 2099*4882a593Smuzhiyun 2100*4882a593Smuzhiyun vpmuludq $H1,$H2,$T4 # h1*r3 2101*4882a593Smuzhiyun vpmuludq $H0,$H2,$H2 # h0*r3 2102*4882a593Smuzhiyun vpaddq $T4,$D4,$D4 # d4 += h1*r3 2103*4882a593Smuzhiyun vpaddq $H2,$D3,$D3 # d3 += h0*r3 2104*4882a593Smuzhiyun vpmuludq $H3,$T3,$T4 # h3*s3 2105*4882a593Smuzhiyun vpmuludq $H4,$T3,$H2 # h4*s3 2106*4882a593Smuzhiyun vpaddq $T4,$D1,$D1 # d1 += h3*s3 2107*4882a593Smuzhiyun vpaddq $H2,$D2,$D2 # d2 += h4*s3 2108*4882a593Smuzhiyun 2109*4882a593Smuzhiyun vpmuludq $H3,$S4,$H3 # h3*s4 2110*4882a593Smuzhiyun vpmuludq $H4,$S4,$H4 # h4*s4 2111*4882a593Smuzhiyun vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 2112*4882a593Smuzhiyun vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 2113*4882a593Smuzhiyun vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 2114*4882a593Smuzhiyun vpmuludq $H1,$S4,$H0 # h1*s4 2115*4882a593Smuzhiyun vmovdqa 64(%rcx),$MASK # .Lmask26 2116*4882a593Smuzhiyun vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 2117*4882a593Smuzhiyun vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 2118*4882a593Smuzhiyun 2119*4882a593Smuzhiyun ################################################################ 2120*4882a593Smuzhiyun # horizontal addition 2121*4882a593Smuzhiyun 2122*4882a593Smuzhiyun vpsrldq \$8,$D1,$T1 2123*4882a593Smuzhiyun vpsrldq \$8,$H2,$T2 2124*4882a593Smuzhiyun vpsrldq \$8,$H3,$T3 2125*4882a593Smuzhiyun vpsrldq \$8,$H4,$T4 2126*4882a593Smuzhiyun vpsrldq \$8,$H0,$T0 2127*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 2128*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 2129*4882a593Smuzhiyun vpaddq $T3,$H3,$H3 2130*4882a593Smuzhiyun vpaddq $T4,$H4,$H4 2131*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 2132*4882a593Smuzhiyun 
2133*4882a593Smuzhiyun vpermq \$0x2,$H3,$T3 2134*4882a593Smuzhiyun vpermq \$0x2,$H4,$T4 2135*4882a593Smuzhiyun vpermq \$0x2,$H0,$T0 2136*4882a593Smuzhiyun vpermq \$0x2,$D1,$T1 2137*4882a593Smuzhiyun vpermq \$0x2,$H2,$T2 2138*4882a593Smuzhiyun vpaddq $T3,$H3,$H3 2139*4882a593Smuzhiyun vpaddq $T4,$H4,$H4 2140*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 2141*4882a593Smuzhiyun vpaddq $T1,$D1,$D1 2142*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 2143*4882a593Smuzhiyun 2144*4882a593Smuzhiyun ################################################################ 2145*4882a593Smuzhiyun # lazy reduction 2146*4882a593Smuzhiyun 2147*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2148*4882a593Smuzhiyun vpand $MASK,$H3,$H3 2149*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2150*4882a593Smuzhiyun 2151*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2152*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2153*4882a593Smuzhiyun vpaddq $D0,$D1,$H1 # h0 -> h1 2154*4882a593Smuzhiyun 2155*4882a593Smuzhiyun vpsrlq \$26,$H4,$D4 2156*4882a593Smuzhiyun vpand $MASK,$H4,$H4 2157*4882a593Smuzhiyun 2158*4882a593Smuzhiyun vpsrlq \$26,$H1,$D1 2159*4882a593Smuzhiyun vpand $MASK,$H1,$H1 2160*4882a593Smuzhiyun vpaddq $D1,$H2,$H2 # h1 -> h2 2161*4882a593Smuzhiyun 2162*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 2163*4882a593Smuzhiyun vpsllq \$2,$D4,$D4 2164*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 # h4 -> h0 2165*4882a593Smuzhiyun 2166*4882a593Smuzhiyun vpsrlq \$26,$H2,$D2 2167*4882a593Smuzhiyun vpand $MASK,$H2,$H2 2168*4882a593Smuzhiyun vpaddq $D2,$H3,$H3 # h2 -> h3 2169*4882a593Smuzhiyun 2170*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2171*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2172*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2173*4882a593Smuzhiyun 2174*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2175*4882a593Smuzhiyun vpand $MASK,$H3,$H3 2176*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2177*4882a593Smuzhiyun 2178*4882a593Smuzhiyun vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2179*4882a593Smuzhiyun vmovd %x#$H1,`4*1-48-64`($ctx) 2180*4882a593Smuzhiyun 
vmovd %x#$H2,`4*2-48-64`($ctx) 2181*4882a593Smuzhiyun vmovd %x#$H3,`4*3-48-64`($ctx) 2182*4882a593Smuzhiyun vmovd %x#$H4,`4*4-48-64`($ctx) 2183*4882a593Smuzhiyun___ 2184*4882a593Smuzhiyun$code.=<<___ if ($win64); 2185*4882a593Smuzhiyun vmovdqa -0xb0(%r10),%xmm6 2186*4882a593Smuzhiyun vmovdqa -0xa0(%r10),%xmm7 2187*4882a593Smuzhiyun vmovdqa -0x90(%r10),%xmm8 2188*4882a593Smuzhiyun vmovdqa -0x80(%r10),%xmm9 2189*4882a593Smuzhiyun vmovdqa -0x70(%r10),%xmm10 2190*4882a593Smuzhiyun vmovdqa -0x60(%r10),%xmm11 2191*4882a593Smuzhiyun vmovdqa -0x50(%r10),%xmm12 2192*4882a593Smuzhiyun vmovdqa -0x40(%r10),%xmm13 2193*4882a593Smuzhiyun vmovdqa -0x30(%r10),%xmm14 2194*4882a593Smuzhiyun vmovdqa -0x20(%r10),%xmm15 2195*4882a593Smuzhiyun lea -8(%r10),%rsp 2196*4882a593Smuzhiyun.Ldo_avx2_epilogue$suffix: 2197*4882a593Smuzhiyun___ 2198*4882a593Smuzhiyun$code.=<<___ if (!$win64); 2199*4882a593Smuzhiyun lea -8(%r10),%rsp 2200*4882a593Smuzhiyun.cfi_def_cfa_register %rsp 2201*4882a593Smuzhiyun___ 2202*4882a593Smuzhiyun$code.=<<___; 2203*4882a593Smuzhiyun vzeroupper 2204*4882a593Smuzhiyun RET 2205*4882a593Smuzhiyun.cfi_endproc 2206*4882a593Smuzhiyun___ 2207*4882a593Smuzhiyunif($avx > 2 && $avx512) { 2208*4882a593Smuzhiyunmy ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); 2209*4882a593Smuzhiyunmy ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); 2210*4882a593Smuzhiyunmy $PADBIT="%zmm30"; 2211*4882a593Smuzhiyun 2212*4882a593Smuzhiyunmap(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain 2213*4882a593Smuzhiyunmap(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); 2214*4882a593Smuzhiyunmap(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); 2215*4882a593Smuzhiyunmap(s/%y/%z/,($MASK)); 2216*4882a593Smuzhiyun 2217*4882a593Smuzhiyun$code.=<<___; 2218*4882a593Smuzhiyun.cfi_startproc 2219*4882a593Smuzhiyun.Lblocks_avx512: 2220*4882a593Smuzhiyun mov \$15,%eax 2221*4882a593Smuzhiyun kmovw %eax,%k2 2222*4882a593Smuzhiyun___ 2223*4882a593Smuzhiyun$code.=<<___ if (!$win64); 2224*4882a593Smuzhiyun lea 8(%rsp),%r10 
2225*4882a593Smuzhiyun.cfi_def_cfa_register %r10 2226*4882a593Smuzhiyun sub \$0x128,%rsp 2227*4882a593Smuzhiyun___ 2228*4882a593Smuzhiyun$code.=<<___ if ($win64); 2229*4882a593Smuzhiyun lea 8(%rsp),%r10 2230*4882a593Smuzhiyun sub \$0x1c8,%rsp 2231*4882a593Smuzhiyun vmovdqa %xmm6,-0xb0(%r10) 2232*4882a593Smuzhiyun vmovdqa %xmm7,-0xa0(%r10) 2233*4882a593Smuzhiyun vmovdqa %xmm8,-0x90(%r10) 2234*4882a593Smuzhiyun vmovdqa %xmm9,-0x80(%r10) 2235*4882a593Smuzhiyun vmovdqa %xmm10,-0x70(%r10) 2236*4882a593Smuzhiyun vmovdqa %xmm11,-0x60(%r10) 2237*4882a593Smuzhiyun vmovdqa %xmm12,-0x50(%r10) 2238*4882a593Smuzhiyun vmovdqa %xmm13,-0x40(%r10) 2239*4882a593Smuzhiyun vmovdqa %xmm14,-0x30(%r10) 2240*4882a593Smuzhiyun vmovdqa %xmm15,-0x20(%r10) 2241*4882a593Smuzhiyun.Ldo_avx512_body: 2242*4882a593Smuzhiyun___ 2243*4882a593Smuzhiyun$code.=<<___; 2244*4882a593Smuzhiyun lea .Lconst(%rip),%rcx 2245*4882a593Smuzhiyun lea 48+64($ctx),$ctx # size optimization 2246*4882a593Smuzhiyun vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 2247*4882a593Smuzhiyun 2248*4882a593Smuzhiyun # expand pre-calculated table 2249*4882a593Smuzhiyun vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} 2250*4882a593Smuzhiyun and \$-512,%rsp 2251*4882a593Smuzhiyun vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} 2252*4882a593Smuzhiyun mov \$0x20,%rax 2253*4882a593Smuzhiyun vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} 2254*4882a593Smuzhiyun vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} 2255*4882a593Smuzhiyun vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} 2256*4882a593Smuzhiyun vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} 2257*4882a593Smuzhiyun vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} 2258*4882a593Smuzhiyun vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} 2259*4882a593Smuzhiyun vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} 2260*4882a593Smuzhiyun vpermd $D0,$T2,$R0 # 00003412 -> 14243444 2261*4882a593Smuzhiyun vpbroadcastq 64(%rcx),$MASK # .Lmask26 2262*4882a593Smuzhiyun vpermd $D1,$T2,$R1 2263*4882a593Smuzhiyun vpermd $T0,$T2,$S1 2264*4882a593Smuzhiyun vpermd $D2,$T2,$R2 2265*4882a593Smuzhiyun vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 2266*4882a593Smuzhiyun vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 2267*4882a593Smuzhiyun vpermd $T1,$T2,$S2 2268*4882a593Smuzhiyun vmovdqu64 $R1,0x00(%rsp,%rax){%k2} 2269*4882a593Smuzhiyun vpsrlq \$32,$R1,$T1 2270*4882a593Smuzhiyun vpermd $D3,$T2,$R3 2271*4882a593Smuzhiyun vmovdqa64 $S1,0x40(%rsp){%k2} 2272*4882a593Smuzhiyun vpermd $T3,$T2,$S3 2273*4882a593Smuzhiyun vpermd $D4,$T2,$R4 2274*4882a593Smuzhiyun vmovdqu64 $R2,0x40(%rsp,%rax){%k2} 2275*4882a593Smuzhiyun vpermd $T4,$T2,$S4 2276*4882a593Smuzhiyun vmovdqa64 $S2,0x80(%rsp){%k2} 2277*4882a593Smuzhiyun vmovdqu64 $R3,0x80(%rsp,%rax){%k2} 2278*4882a593Smuzhiyun vmovdqa64 $S3,0xc0(%rsp){%k2} 2279*4882a593Smuzhiyun vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} 2280*4882a593Smuzhiyun vmovdqa64 $S4,0x100(%rsp){%k2} 2281*4882a593Smuzhiyun 2282*4882a593Smuzhiyun ################################################################ 2283*4882a593Smuzhiyun # calculate 5th through 8th powers of the key 2284*4882a593Smuzhiyun # 2285*4882a593Smuzhiyun # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 2286*4882a593Smuzhiyun # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 2287*4882a593Smuzhiyun # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 2288*4882a593Smuzhiyun # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 2289*4882a593Smuzhiyun # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 2290*4882a593Smuzhiyun 2291*4882a593Smuzhiyun vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 2292*4882a593Smuzhiyun vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 2293*4882a593Smuzhiyun vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 2294*4882a593Smuzhiyun vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 2295*4882a593Smuzhiyun vpmuludq $T0,$R4,$D4 # d4 
= r0'*r4 2296*4882a593Smuzhiyun vpsrlq \$32,$R2,$T2 2297*4882a593Smuzhiyun 2298*4882a593Smuzhiyun vpmuludq $T1,$S4,$M0 2299*4882a593Smuzhiyun vpmuludq $T1,$R0,$M1 2300*4882a593Smuzhiyun vpmuludq $T1,$R1,$M2 2301*4882a593Smuzhiyun vpmuludq $T1,$R2,$M3 2302*4882a593Smuzhiyun vpmuludq $T1,$R3,$M4 2303*4882a593Smuzhiyun vpsrlq \$32,$R3,$T3 2304*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 2305*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += r1'*r0 2306*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += r1'*r1 2307*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += r1'*r2 2308*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += r1'*r3 2309*4882a593Smuzhiyun 2310*4882a593Smuzhiyun vpmuludq $T2,$S3,$M0 2311*4882a593Smuzhiyun vpmuludq $T2,$S4,$M1 2312*4882a593Smuzhiyun vpmuludq $T2,$R1,$M3 2313*4882a593Smuzhiyun vpmuludq $T2,$R2,$M4 2314*4882a593Smuzhiyun vpmuludq $T2,$R0,$M2 2315*4882a593Smuzhiyun vpsrlq \$32,$R4,$T4 2316*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 2317*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 2318*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += r2'*r1 2319*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += r2'*r2 2320*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += r2'*r0 2321*4882a593Smuzhiyun 2322*4882a593Smuzhiyun vpmuludq $T3,$S2,$M0 2323*4882a593Smuzhiyun vpmuludq $T3,$R0,$M3 2324*4882a593Smuzhiyun vpmuludq $T3,$R1,$M4 2325*4882a593Smuzhiyun vpmuludq $T3,$S3,$M1 2326*4882a593Smuzhiyun vpmuludq $T3,$S4,$M2 2327*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 2328*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += r3'*r0 2329*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += r3'*r1 2330*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 2331*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 2332*4882a593Smuzhiyun 2333*4882a593Smuzhiyun vpmuludq $T4,$S4,$M3 2334*4882a593Smuzhiyun vpmuludq $T4,$R0,$M4 2335*4882a593Smuzhiyun vpmuludq $T4,$S1,$M0 2336*4882a593Smuzhiyun vpmuludq $T4,$S2,$M1 2337*4882a593Smuzhiyun vpmuludq $T4,$S3,$M2 2338*4882a593Smuzhiyun 
vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 2339*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += r2'*r0 2340*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 2341*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 2342*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 2343*4882a593Smuzhiyun 2344*4882a593Smuzhiyun ################################################################ 2345*4882a593Smuzhiyun # load input 2346*4882a593Smuzhiyun vmovdqu64 16*0($inp),%z#$T3 2347*4882a593Smuzhiyun vmovdqu64 16*4($inp),%z#$T4 2348*4882a593Smuzhiyun lea 16*8($inp),$inp 2349*4882a593Smuzhiyun 2350*4882a593Smuzhiyun ################################################################ 2351*4882a593Smuzhiyun # lazy reduction 2352*4882a593Smuzhiyun 2353*4882a593Smuzhiyun vpsrlq \$26,$D3,$M3 2354*4882a593Smuzhiyun vpandq $MASK,$D3,$D3 2355*4882a593Smuzhiyun vpaddq $M3,$D4,$D4 # d3 -> d4 2356*4882a593Smuzhiyun 2357*4882a593Smuzhiyun vpsrlq \$26,$D0,$M0 2358*4882a593Smuzhiyun vpandq $MASK,$D0,$D0 2359*4882a593Smuzhiyun vpaddq $M0,$D1,$D1 # d0 -> d1 2360*4882a593Smuzhiyun 2361*4882a593Smuzhiyun vpsrlq \$26,$D4,$M4 2362*4882a593Smuzhiyun vpandq $MASK,$D4,$D4 2363*4882a593Smuzhiyun 2364*4882a593Smuzhiyun vpsrlq \$26,$D1,$M1 2365*4882a593Smuzhiyun vpandq $MASK,$D1,$D1 2366*4882a593Smuzhiyun vpaddq $M1,$D2,$D2 # d1 -> d2 2367*4882a593Smuzhiyun 2368*4882a593Smuzhiyun vpaddq $M4,$D0,$D0 2369*4882a593Smuzhiyun vpsllq \$2,$M4,$M4 2370*4882a593Smuzhiyun vpaddq $M4,$D0,$D0 # d4 -> d0 2371*4882a593Smuzhiyun 2372*4882a593Smuzhiyun vpsrlq \$26,$D2,$M2 2373*4882a593Smuzhiyun vpandq $MASK,$D2,$D2 2374*4882a593Smuzhiyun vpaddq $M2,$D3,$D3 # d2 -> d3 2375*4882a593Smuzhiyun 2376*4882a593Smuzhiyun vpsrlq \$26,$D0,$M0 2377*4882a593Smuzhiyun vpandq $MASK,$D0,$D0 2378*4882a593Smuzhiyun vpaddq $M0,$D1,$D1 # d0 -> d1 2379*4882a593Smuzhiyun 2380*4882a593Smuzhiyun vpsrlq \$26,$D3,$M3 2381*4882a593Smuzhiyun vpandq $MASK,$D3,$D3 2382*4882a593Smuzhiyun vpaddq $M3,$D4,$D4 # d3 -> d4 2383*4882a593Smuzhiyun 
2384*4882a593Smuzhiyun ################################################################ 2385*4882a593Smuzhiyun # at this point we have 14243444 in $R0-$S4 and 05060708 in 2386*4882a593Smuzhiyun # $D0-$D4, ... 2387*4882a593Smuzhiyun 2388*4882a593Smuzhiyun vpunpcklqdq $T4,$T3,$T0 # transpose input 2389*4882a593Smuzhiyun vpunpckhqdq $T4,$T3,$T4 2390*4882a593Smuzhiyun 2391*4882a593Smuzhiyun # ... since input 64-bit lanes are ordered as 73625140, we could 2392*4882a593Smuzhiyun # "vperm" it to 76543210 (here and in each loop iteration), *or* 2393*4882a593Smuzhiyun # we could just flow along, hence the goal for $R0-$S4 is 2394*4882a593Smuzhiyun # 1858286838784888 ... 2395*4882a593Smuzhiyun 2396*4882a593Smuzhiyun vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: 2397*4882a593Smuzhiyun mov \$0x7777,%eax 2398*4882a593Smuzhiyun kmovw %eax,%k1 2399*4882a593Smuzhiyun 2400*4882a593Smuzhiyun vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- 2401*4882a593Smuzhiyun vpermd $R1,$M0,$R1 2402*4882a593Smuzhiyun vpermd $R2,$M0,$R2 2403*4882a593Smuzhiyun vpermd $R3,$M0,$R3 2404*4882a593Smuzhiyun vpermd $R4,$M0,$R4 2405*4882a593Smuzhiyun 2406*4882a593Smuzhiyun vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 2407*4882a593Smuzhiyun vpermd $D1,$M0,${R1}{%k1} 2408*4882a593Smuzhiyun vpermd $D2,$M0,${R2}{%k1} 2409*4882a593Smuzhiyun vpermd $D3,$M0,${R3}{%k1} 2410*4882a593Smuzhiyun vpermd $D4,$M0,${R4}{%k1} 2411*4882a593Smuzhiyun 2412*4882a593Smuzhiyun vpslld \$2,$R1,$S1 # *5 2413*4882a593Smuzhiyun vpslld \$2,$R2,$S2 2414*4882a593Smuzhiyun vpslld \$2,$R3,$S3 2415*4882a593Smuzhiyun vpslld \$2,$R4,$S4 2416*4882a593Smuzhiyun vpaddd $R1,$S1,$S1 2417*4882a593Smuzhiyun vpaddd $R2,$S2,$S2 2418*4882a593Smuzhiyun vpaddd $R3,$S3,$S3 2419*4882a593Smuzhiyun vpaddd $R4,$S4,$S4 2420*4882a593Smuzhiyun 2421*4882a593Smuzhiyun vpbroadcastq 32(%rcx),$PADBIT # .L129 2422*4882a593Smuzhiyun 2423*4882a593Smuzhiyun vpsrlq \$52,$T0,$T2 # splat input 2424*4882a593Smuzhiyun vpsllq \$12,$T4,$T3 2425*4882a593Smuzhiyun vporq 
$T3,$T2,$T2 2426*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 2427*4882a593Smuzhiyun vpsrlq \$14,$T4,$T3 2428*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 2429*4882a593Smuzhiyun vpandq $MASK,$T2,$T2 # 2 2430*4882a593Smuzhiyun vpandq $MASK,$T0,$T0 # 0 2431*4882a593Smuzhiyun #vpandq $MASK,$T1,$T1 # 1 2432*4882a593Smuzhiyun #vpandq $MASK,$T3,$T3 # 3 2433*4882a593Smuzhiyun #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2434*4882a593Smuzhiyun 2435*4882a593Smuzhiyun vpaddq $H2,$T2,$H2 # accumulate input 2436*4882a593Smuzhiyun sub \$192,$len 2437*4882a593Smuzhiyun jbe .Ltail_avx512 2438*4882a593Smuzhiyun jmp .Loop_avx512 2439*4882a593Smuzhiyun 2440*4882a593Smuzhiyun.align 32 2441*4882a593Smuzhiyun.Loop_avx512: 2442*4882a593Smuzhiyun ################################################################ 2443*4882a593Smuzhiyun # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 2444*4882a593Smuzhiyun # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 2445*4882a593Smuzhiyun # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 2446*4882a593Smuzhiyun # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 2447*4882a593Smuzhiyun # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 2448*4882a593Smuzhiyun # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 2449*4882a593Smuzhiyun # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 2450*4882a593Smuzhiyun # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 2451*4882a593Smuzhiyun # \________/\___________/ 2452*4882a593Smuzhiyun ################################################################ 2453*4882a593Smuzhiyun #vpaddq $H2,$T2,$H2 # accumulate input 2454*4882a593Smuzhiyun 2455*4882a593Smuzhiyun # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 2456*4882a593Smuzhiyun # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 2457*4882a593Smuzhiyun # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 2458*4882a593Smuzhiyun # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 2459*4882a593Smuzhiyun # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 2460*4882a593Smuzhiyun # 2461*4882a593Smuzhiyun # however, as h2 is "chronologically" first one available pull 
2462*4882a593Smuzhiyun # corresponding operations up, so it's 2463*4882a593Smuzhiyun # 2464*4882a593Smuzhiyun # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 2465*4882a593Smuzhiyun # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 2466*4882a593Smuzhiyun # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 2467*4882a593Smuzhiyun # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 2468*4882a593Smuzhiyun # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 2469*4882a593Smuzhiyun 2470*4882a593Smuzhiyun vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2471*4882a593Smuzhiyun vpaddq $H0,$T0,$H0 2472*4882a593Smuzhiyun vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2473*4882a593Smuzhiyun vpandq $MASK,$T1,$T1 # 1 2474*4882a593Smuzhiyun vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2475*4882a593Smuzhiyun vpandq $MASK,$T3,$T3 # 3 2476*4882a593Smuzhiyun vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2477*4882a593Smuzhiyun vporq $PADBIT,$T4,$T4 # padbit, yes, always 2478*4882a593Smuzhiyun vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2479*4882a593Smuzhiyun vpaddq $H1,$T1,$H1 # accumulate input 2480*4882a593Smuzhiyun vpaddq $H3,$T3,$H3 2481*4882a593Smuzhiyun vpaddq $H4,$T4,$H4 2482*4882a593Smuzhiyun 2483*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T3 # load input 2484*4882a593Smuzhiyun vmovdqu64 16*4($inp),$T4 2485*4882a593Smuzhiyun lea 16*8($inp),$inp 2486*4882a593Smuzhiyun vpmuludq $H0,$R3,$M3 2487*4882a593Smuzhiyun vpmuludq $H0,$R4,$M4 2488*4882a593Smuzhiyun vpmuludq $H0,$R0,$M0 2489*4882a593Smuzhiyun vpmuludq $H0,$R1,$M1 2490*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h0*r3 2491*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h0*r4 2492*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += h0*r0 2493*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h0*r1 2494*4882a593Smuzhiyun 2495*4882a593Smuzhiyun vpmuludq $H1,$R2,$M3 2496*4882a593Smuzhiyun vpmuludq $H1,$R3,$M4 2497*4882a593Smuzhiyun vpmuludq $H1,$S4,$M0 2498*4882a593Smuzhiyun vpmuludq $H0,$R2,$M2 2499*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h1*r2 2500*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h1*r3 
2501*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += h1*s4 2502*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h0*r2 2503*4882a593Smuzhiyun 2504*4882a593Smuzhiyun vpunpcklqdq $T4,$T3,$T0 # transpose input 2505*4882a593Smuzhiyun vpunpckhqdq $T4,$T3,$T4 2506*4882a593Smuzhiyun 2507*4882a593Smuzhiyun vpmuludq $H3,$R0,$M3 2508*4882a593Smuzhiyun vpmuludq $H3,$R1,$M4 2509*4882a593Smuzhiyun vpmuludq $H1,$R0,$M1 2510*4882a593Smuzhiyun vpmuludq $H1,$R1,$M2 2511*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h3*r0 2512*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h3*r1 2513*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h1*r0 2514*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h1*r1 2515*4882a593Smuzhiyun 2516*4882a593Smuzhiyun vpmuludq $H4,$S4,$M3 2517*4882a593Smuzhiyun vpmuludq $H4,$R0,$M4 2518*4882a593Smuzhiyun vpmuludq $H3,$S2,$M0 2519*4882a593Smuzhiyun vpmuludq $H3,$S3,$M1 2520*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h4*s4 2521*4882a593Smuzhiyun vpmuludq $H3,$S4,$M2 2522*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h4*r0 2523*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += h3*s2 2524*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h3*s3 2525*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h3*s4 2526*4882a593Smuzhiyun 2527*4882a593Smuzhiyun vpmuludq $H4,$S1,$M0 2528*4882a593Smuzhiyun vpmuludq $H4,$S2,$M1 2529*4882a593Smuzhiyun vpmuludq $H4,$S3,$M2 2530*4882a593Smuzhiyun vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2531*4882a593Smuzhiyun vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2532*4882a593Smuzhiyun vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2533*4882a593Smuzhiyun 2534*4882a593Smuzhiyun ################################################################ 2535*4882a593Smuzhiyun # lazy reduction (interleaved with input splat) 2536*4882a593Smuzhiyun 2537*4882a593Smuzhiyun vpsrlq \$52,$T0,$T2 # splat input 2538*4882a593Smuzhiyun vpsllq \$12,$T4,$T3 2539*4882a593Smuzhiyun 2540*4882a593Smuzhiyun vpsrlq \$26,$D3,$H3 2541*4882a593Smuzhiyun vpandq $MASK,$D3,$D3 2542*4882a593Smuzhiyun vpaddq $H3,$D4,$H4 # h3 -> h4 
2543*4882a593Smuzhiyun 2544*4882a593Smuzhiyun vporq $T3,$T2,$T2 2545*4882a593Smuzhiyun 2546*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2547*4882a593Smuzhiyun vpandq $MASK,$H0,$H0 2548*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2549*4882a593Smuzhiyun 2550*4882a593Smuzhiyun vpandq $MASK,$T2,$T2 # 2 2551*4882a593Smuzhiyun 2552*4882a593Smuzhiyun vpsrlq \$26,$H4,$D4 2553*4882a593Smuzhiyun vpandq $MASK,$H4,$H4 2554*4882a593Smuzhiyun 2555*4882a593Smuzhiyun vpsrlq \$26,$H1,$D1 2556*4882a593Smuzhiyun vpandq $MASK,$H1,$H1 2557*4882a593Smuzhiyun vpaddq $D1,$H2,$H2 # h1 -> h2 2558*4882a593Smuzhiyun 2559*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 2560*4882a593Smuzhiyun vpsllq \$2,$D4,$D4 2561*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 # h4 -> h0 2562*4882a593Smuzhiyun 2563*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # modulo-scheduled 2564*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 2565*4882a593Smuzhiyun 2566*4882a593Smuzhiyun vpsrlq \$26,$H2,$D2 2567*4882a593Smuzhiyun vpandq $MASK,$H2,$H2 2568*4882a593Smuzhiyun vpaddq $D2,$D3,$H3 # h2 -> h3 2569*4882a593Smuzhiyun 2570*4882a593Smuzhiyun vpsrlq \$14,$T4,$T3 2571*4882a593Smuzhiyun 2572*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2573*4882a593Smuzhiyun vpandq $MASK,$H0,$H0 2574*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2575*4882a593Smuzhiyun 2576*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 2577*4882a593Smuzhiyun 2578*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2579*4882a593Smuzhiyun vpandq $MASK,$H3,$H3 2580*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2581*4882a593Smuzhiyun 2582*4882a593Smuzhiyun vpandq $MASK,$T0,$T0 # 0 2583*4882a593Smuzhiyun #vpandq $MASK,$T1,$T1 # 1 2584*4882a593Smuzhiyun #vpandq $MASK,$T3,$T3 # 3 2585*4882a593Smuzhiyun #vporq $PADBIT,$T4,$T4 # padbit, yes, always 2586*4882a593Smuzhiyun 2587*4882a593Smuzhiyun sub \$128,$len 2588*4882a593Smuzhiyun ja .Loop_avx512 2589*4882a593Smuzhiyun 2590*4882a593Smuzhiyun.Ltail_avx512: 2591*4882a593Smuzhiyun ################################################################ 2592*4882a593Smuzhiyun # while 
above multiplications were by r^8 in all lanes, in last 2593*4882a593Smuzhiyun # iteration we multiply least significant lane by r^8 and most 2594*4882a593Smuzhiyun # significant one by r, that's why table gets shifted... 2595*4882a593Smuzhiyun 2596*4882a593Smuzhiyun vpsrlq \$32,$R0,$R0 # 0105020603070408 2597*4882a593Smuzhiyun vpsrlq \$32,$R1,$R1 2598*4882a593Smuzhiyun vpsrlq \$32,$R2,$R2 2599*4882a593Smuzhiyun vpsrlq \$32,$S3,$S3 2600*4882a593Smuzhiyun vpsrlq \$32,$S4,$S4 2601*4882a593Smuzhiyun vpsrlq \$32,$R3,$R3 2602*4882a593Smuzhiyun vpsrlq \$32,$R4,$R4 2603*4882a593Smuzhiyun vpsrlq \$32,$S1,$S1 2604*4882a593Smuzhiyun vpsrlq \$32,$S2,$S2 2605*4882a593Smuzhiyun 2606*4882a593Smuzhiyun ################################################################ 2607*4882a593Smuzhiyun # load either next or last 64 byte of input 2608*4882a593Smuzhiyun lea ($inp,$len),$inp 2609*4882a593Smuzhiyun 2610*4882a593Smuzhiyun #vpaddq $H2,$T2,$H2 # accumulate input 2611*4882a593Smuzhiyun vpaddq $H0,$T0,$H0 2612*4882a593Smuzhiyun 2613*4882a593Smuzhiyun vpmuludq $H2,$R1,$D3 # d3 = h2*r1 2614*4882a593Smuzhiyun vpmuludq $H2,$R2,$D4 # d4 = h2*r2 2615*4882a593Smuzhiyun vpmuludq $H2,$S3,$D0 # d0 = h2*s3 2616*4882a593Smuzhiyun vpandq $MASK,$T1,$T1 # 1 2617*4882a593Smuzhiyun vpmuludq $H2,$S4,$D1 # d1 = h2*s4 2618*4882a593Smuzhiyun vpandq $MASK,$T3,$T3 # 3 2619*4882a593Smuzhiyun vpmuludq $H2,$R0,$D2 # d2 = h2*r0 2620*4882a593Smuzhiyun vporq $PADBIT,$T4,$T4 # padbit, yes, always 2621*4882a593Smuzhiyun vpaddq $H1,$T1,$H1 # accumulate input 2622*4882a593Smuzhiyun vpaddq $H3,$T3,$H3 2623*4882a593Smuzhiyun vpaddq $H4,$T4,$H4 2624*4882a593Smuzhiyun 2625*4882a593Smuzhiyun vmovdqu 16*0($inp),%x#$T0 2626*4882a593Smuzhiyun vpmuludq $H0,$R3,$M3 2627*4882a593Smuzhiyun vpmuludq $H0,$R4,$M4 2628*4882a593Smuzhiyun vpmuludq $H0,$R0,$M0 2629*4882a593Smuzhiyun vpmuludq $H0,$R1,$M1 2630*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h0*r3 2631*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h0*r4 2632*4882a593Smuzhiyun 
vpaddq $M0,$D0,$D0 # d0 += h0*r0 2633*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h0*r1 2634*4882a593Smuzhiyun 2635*4882a593Smuzhiyun vmovdqu 16*1($inp),%x#$T1 2636*4882a593Smuzhiyun vpmuludq $H1,$R2,$M3 2637*4882a593Smuzhiyun vpmuludq $H1,$R3,$M4 2638*4882a593Smuzhiyun vpmuludq $H1,$S4,$M0 2639*4882a593Smuzhiyun vpmuludq $H0,$R2,$M2 2640*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h1*r2 2641*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h1*r3 2642*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += h1*s4 2643*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h0*r2 2644*4882a593Smuzhiyun 2645*4882a593Smuzhiyun vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 2646*4882a593Smuzhiyun vpmuludq $H3,$R0,$M3 2647*4882a593Smuzhiyun vpmuludq $H3,$R1,$M4 2648*4882a593Smuzhiyun vpmuludq $H1,$R0,$M1 2649*4882a593Smuzhiyun vpmuludq $H1,$R1,$M2 2650*4882a593Smuzhiyun vpaddq $M3,$D3,$D3 # d3 += h3*r0 2651*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h3*r1 2652*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h1*r0 2653*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h1*r1 2654*4882a593Smuzhiyun 2655*4882a593Smuzhiyun vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 2656*4882a593Smuzhiyun vpmuludq $H4,$S4,$M3 2657*4882a593Smuzhiyun vpmuludq $H4,$R0,$M4 2658*4882a593Smuzhiyun vpmuludq $H3,$S2,$M0 2659*4882a593Smuzhiyun vpmuludq $H3,$S3,$M1 2660*4882a593Smuzhiyun vpmuludq $H3,$S4,$M2 2661*4882a593Smuzhiyun vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 2662*4882a593Smuzhiyun vpaddq $M4,$D4,$D4 # d4 += h4*r0 2663*4882a593Smuzhiyun vpaddq $M0,$D0,$D0 # d0 += h3*s2 2664*4882a593Smuzhiyun vpaddq $M1,$D1,$D1 # d1 += h3*s3 2665*4882a593Smuzhiyun vpaddq $M2,$D2,$D2 # d2 += h3*s4 2666*4882a593Smuzhiyun 2667*4882a593Smuzhiyun vpmuludq $H4,$S1,$M0 2668*4882a593Smuzhiyun vpmuludq $H4,$S2,$M1 2669*4882a593Smuzhiyun vpmuludq $H4,$S3,$M2 2670*4882a593Smuzhiyun vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 2671*4882a593Smuzhiyun vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 2672*4882a593Smuzhiyun vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 2673*4882a593Smuzhiyun 
2674*4882a593Smuzhiyun ################################################################ 2675*4882a593Smuzhiyun # horizontal addition 2676*4882a593Smuzhiyun 2677*4882a593Smuzhiyun mov \$1,%eax 2678*4882a593Smuzhiyun vpermq \$0xb1,$H3,$D3 2679*4882a593Smuzhiyun vpermq \$0xb1,$D4,$H4 2680*4882a593Smuzhiyun vpermq \$0xb1,$H0,$D0 2681*4882a593Smuzhiyun vpermq \$0xb1,$H1,$D1 2682*4882a593Smuzhiyun vpermq \$0xb1,$H2,$D2 2683*4882a593Smuzhiyun vpaddq $D3,$H3,$H3 2684*4882a593Smuzhiyun vpaddq $D4,$H4,$H4 2685*4882a593Smuzhiyun vpaddq $D0,$H0,$H0 2686*4882a593Smuzhiyun vpaddq $D1,$H1,$H1 2687*4882a593Smuzhiyun vpaddq $D2,$H2,$H2 2688*4882a593Smuzhiyun 2689*4882a593Smuzhiyun kmovw %eax,%k3 2690*4882a593Smuzhiyun vpermq \$0x2,$H3,$D3 2691*4882a593Smuzhiyun vpermq \$0x2,$H4,$D4 2692*4882a593Smuzhiyun vpermq \$0x2,$H0,$D0 2693*4882a593Smuzhiyun vpermq \$0x2,$H1,$D1 2694*4882a593Smuzhiyun vpermq \$0x2,$H2,$D2 2695*4882a593Smuzhiyun vpaddq $D3,$H3,$H3 2696*4882a593Smuzhiyun vpaddq $D4,$H4,$H4 2697*4882a593Smuzhiyun vpaddq $D0,$H0,$H0 2698*4882a593Smuzhiyun vpaddq $D1,$H1,$H1 2699*4882a593Smuzhiyun vpaddq $D2,$H2,$H2 2700*4882a593Smuzhiyun 2701*4882a593Smuzhiyun vextracti64x4 \$0x1,$H3,%y#$D3 2702*4882a593Smuzhiyun vextracti64x4 \$0x1,$H4,%y#$D4 2703*4882a593Smuzhiyun vextracti64x4 \$0x1,$H0,%y#$D0 2704*4882a593Smuzhiyun vextracti64x4 \$0x1,$H1,%y#$D1 2705*4882a593Smuzhiyun vextracti64x4 \$0x1,$H2,%y#$D2 2706*4882a593Smuzhiyun vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case 2707*4882a593Smuzhiyun vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 2708*4882a593Smuzhiyun vpaddq $D0,$H0,${H0}{%k3}{z} 2709*4882a593Smuzhiyun vpaddq $D1,$H1,${H1}{%k3}{z} 2710*4882a593Smuzhiyun vpaddq $D2,$H2,${H2}{%k3}{z} 2711*4882a593Smuzhiyun___ 2712*4882a593Smuzhiyunmap(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); 2713*4882a593Smuzhiyunmap(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); 2714*4882a593Smuzhiyun$code.=<<___; 2715*4882a593Smuzhiyun 
################################################################ 2716*4882a593Smuzhiyun # lazy reduction (interleaved with input splat) 2717*4882a593Smuzhiyun 2718*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2719*4882a593Smuzhiyun vpand $MASK,$H3,$H3 2720*4882a593Smuzhiyun vpsrldq \$6,$T0,$T2 # splat input 2721*4882a593Smuzhiyun vpsrldq \$6,$T1,$T3 2722*4882a593Smuzhiyun vpunpckhqdq $T1,$T0,$T4 # 4 2723*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2724*4882a593Smuzhiyun 2725*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2726*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2727*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T2 # 2:3 2728*4882a593Smuzhiyun vpunpcklqdq $T1,$T0,$T0 # 0:1 2729*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2730*4882a593Smuzhiyun 2731*4882a593Smuzhiyun vpsrlq \$26,$H4,$D4 2732*4882a593Smuzhiyun vpand $MASK,$H4,$H4 2733*4882a593Smuzhiyun 2734*4882a593Smuzhiyun vpsrlq \$26,$H1,$D1 2735*4882a593Smuzhiyun vpand $MASK,$H1,$H1 2736*4882a593Smuzhiyun vpsrlq \$30,$T2,$T3 2737*4882a593Smuzhiyun vpsrlq \$4,$T2,$T2 2738*4882a593Smuzhiyun vpaddq $D1,$H2,$H2 # h1 -> h2 2739*4882a593Smuzhiyun 2740*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 2741*4882a593Smuzhiyun vpsllq \$2,$D4,$D4 2742*4882a593Smuzhiyun vpsrlq \$26,$T0,$T1 2743*4882a593Smuzhiyun vpsrlq \$40,$T4,$T4 # 4 2744*4882a593Smuzhiyun vpaddq $D4,$H0,$H0 # h4 -> h0 2745*4882a593Smuzhiyun 2746*4882a593Smuzhiyun vpsrlq \$26,$H2,$D2 2747*4882a593Smuzhiyun vpand $MASK,$H2,$H2 2748*4882a593Smuzhiyun vpand $MASK,$T2,$T2 # 2 2749*4882a593Smuzhiyun vpand $MASK,$T0,$T0 # 0 2750*4882a593Smuzhiyun vpaddq $D2,$H3,$H3 # h2 -> h3 2751*4882a593Smuzhiyun 2752*4882a593Smuzhiyun vpsrlq \$26,$H0,$D0 2753*4882a593Smuzhiyun vpand $MASK,$H0,$H0 2754*4882a593Smuzhiyun vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 2755*4882a593Smuzhiyun vpand $MASK,$T1,$T1 # 1 2756*4882a593Smuzhiyun vpaddq $D0,$H1,$H1 # h0 -> h1 2757*4882a593Smuzhiyun 2758*4882a593Smuzhiyun vpsrlq \$26,$H3,$D3 2759*4882a593Smuzhiyun vpand $MASK,$H3,$H3 2760*4882a593Smuzhiyun 
vpand $MASK,$T3,$T3 # 3 2761*4882a593Smuzhiyun vpor 32(%rcx),$T4,$T4 # padbit, yes, always 2762*4882a593Smuzhiyun vpaddq $D3,$H4,$H4 # h3 -> h4 2763*4882a593Smuzhiyun 2764*4882a593Smuzhiyun lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 2765*4882a593Smuzhiyun add \$64,$len 2766*4882a593Smuzhiyun jnz .Ltail_avx2$suffix 2767*4882a593Smuzhiyun 2768*4882a593Smuzhiyun vpsubq $T2,$H2,$H2 # undo input accumulation 2769*4882a593Smuzhiyun vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced 2770*4882a593Smuzhiyun vmovd %x#$H1,`4*1-48-64`($ctx) 2771*4882a593Smuzhiyun vmovd %x#$H2,`4*2-48-64`($ctx) 2772*4882a593Smuzhiyun vmovd %x#$H3,`4*3-48-64`($ctx) 2773*4882a593Smuzhiyun vmovd %x#$H4,`4*4-48-64`($ctx) 2774*4882a593Smuzhiyun vzeroall 2775*4882a593Smuzhiyun___ 2776*4882a593Smuzhiyun$code.=<<___ if ($win64); 2777*4882a593Smuzhiyun movdqa -0xb0(%r10),%xmm6 2778*4882a593Smuzhiyun movdqa -0xa0(%r10),%xmm7 2779*4882a593Smuzhiyun movdqa -0x90(%r10),%xmm8 2780*4882a593Smuzhiyun movdqa -0x80(%r10),%xmm9 2781*4882a593Smuzhiyun movdqa -0x70(%r10),%xmm10 2782*4882a593Smuzhiyun movdqa -0x60(%r10),%xmm11 2783*4882a593Smuzhiyun movdqa -0x50(%r10),%xmm12 2784*4882a593Smuzhiyun movdqa -0x40(%r10),%xmm13 2785*4882a593Smuzhiyun movdqa -0x30(%r10),%xmm14 2786*4882a593Smuzhiyun movdqa -0x20(%r10),%xmm15 2787*4882a593Smuzhiyun lea -8(%r10),%rsp 2788*4882a593Smuzhiyun.Ldo_avx512_epilogue: 2789*4882a593Smuzhiyun___ 2790*4882a593Smuzhiyun$code.=<<___ if (!$win64); 2791*4882a593Smuzhiyun lea -8(%r10),%rsp 2792*4882a593Smuzhiyun.cfi_def_cfa_register %rsp 2793*4882a593Smuzhiyun___ 2794*4882a593Smuzhiyun$code.=<<___; 2795*4882a593Smuzhiyun RET 2796*4882a593Smuzhiyun.cfi_endproc 2797*4882a593Smuzhiyun___ 2798*4882a593Smuzhiyun 2799*4882a593Smuzhiyun} 2800*4882a593Smuzhiyun 2801*4882a593Smuzhiyun} 2802*4882a593Smuzhiyun 2803*4882a593Smuzhiyun&declare_function("poly1305_blocks_avx2", 32, 4); 2804*4882a593Smuzhiyunpoly1305_blocks_avxN(0); 
2805*4882a593Smuzhiyun&end_function("poly1305_blocks_avx2"); 2806*4882a593Smuzhiyun 2807*4882a593Smuzhiyun####################################################################### 2808*4882a593Smuzhiyunif ($avx>2) { 2809*4882a593Smuzhiyun# On entry we have input length divisible by 64. But since inner loop 2810*4882a593Smuzhiyun# processes 128 bytes per iteration, cases when length is not divisible 2811*4882a593Smuzhiyun# by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this 2812*4882a593Smuzhiyun# reason stack layout is kept identical to poly1305_blocks_avx2. If not 2813*4882a593Smuzhiyun# for this tail, we wouldn't have to even allocate stack frame... 2814*4882a593Smuzhiyun 2815*4882a593Smuzhiyunif($kernel) { 2816*4882a593Smuzhiyun $code .= "#ifdef CONFIG_AS_AVX512\n"; 2817*4882a593Smuzhiyun} 2818*4882a593Smuzhiyun 2819*4882a593Smuzhiyun&declare_function("poly1305_blocks_avx512", 32, 4); 2820*4882a593Smuzhiyunpoly1305_blocks_avxN(1); 2821*4882a593Smuzhiyun&end_function("poly1305_blocks_avx512"); 2822*4882a593Smuzhiyun 2823*4882a593Smuzhiyunif ($kernel) { 2824*4882a593Smuzhiyun $code .= "#endif\n"; 2825*4882a593Smuzhiyun} 2826*4882a593Smuzhiyun 2827*4882a593Smuzhiyunif (!$kernel && $avx>3) { 2828*4882a593Smuzhiyun######################################################################## 2829*4882a593Smuzhiyun# VPMADD52 version using 2^44 radix. 2830*4882a593Smuzhiyun# 2831*4882a593Smuzhiyun# One can argue that base 2^52 would be more natural. Well, even though 2832*4882a593Smuzhiyun# some operations would be more natural, one has to recognize couple of 2833*4882a593Smuzhiyun# things. Base 2^52 doesn't provide advantage over base 2^44 if you look 2834*4882a593Smuzhiyun# at amount of multiply-n-accumulate operations. 
Secondly, it makes it 2835*4882a593Smuzhiyun# impossible to pre-compute multiples of 5 [referred to as s[]/sN in 2836*4882a593Smuzhiyun# reference implementations], which means that more such operations 2837*4882a593Smuzhiyun# would have to be performed in inner loop, which in turn makes critical 2838*4882a593Smuzhiyun# path longer. In other words, even though base 2^44 reduction might 2839*4882a593Smuzhiyun# look less elegant, overall critical path is actually shorter... 2840*4882a593Smuzhiyun 2841*4882a593Smuzhiyun######################################################################## 2842*4882a593Smuzhiyun# Layout of opaque area is following. 2843*4882a593Smuzhiyun# 2844*4882a593Smuzhiyun# unsigned __int64 h[3]; # current hash value base 2^44 2845*4882a593Smuzhiyun# unsigned __int64 s[2]; # key value*20 base 2^44 2846*4882a593Smuzhiyun# unsigned __int64 r[3]; # key value base 2^44 2847*4882a593Smuzhiyun# struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; 2848*4882a593Smuzhiyun# # r^n positions reflect 2849*4882a593Smuzhiyun# # placement in register, not 2850*4882a593Smuzhiyun# # memory, R[3] is R[1]*20 2851*4882a593Smuzhiyun 2852*4882a593Smuzhiyun$code.=<<___; 2853*4882a593Smuzhiyun.type poly1305_init_base2_44,\@function,3 2854*4882a593Smuzhiyun.align 32 2855*4882a593Smuzhiyunpoly1305_init_base2_44: 2856*4882a593Smuzhiyun xor %eax,%eax 2857*4882a593Smuzhiyun mov %rax,0($ctx) # initialize hash value 2858*4882a593Smuzhiyun mov %rax,8($ctx) 2859*4882a593Smuzhiyun mov %rax,16($ctx) 2860*4882a593Smuzhiyun 2861*4882a593Smuzhiyun.Linit_base2_44: 2862*4882a593Smuzhiyun lea poly1305_blocks_vpmadd52(%rip),%r10 2863*4882a593Smuzhiyun lea poly1305_emit_base2_44(%rip),%r11 2864*4882a593Smuzhiyun 2865*4882a593Smuzhiyun mov \$0x0ffffffc0fffffff,%rax 2866*4882a593Smuzhiyun mov \$0x0ffffffc0ffffffc,%rcx 2867*4882a593Smuzhiyun and 0($inp),%rax 2868*4882a593Smuzhiyun mov \$0x00000fffffffffff,%r8 2869*4882a593Smuzhiyun and 8($inp),%rcx 2870*4882a593Smuzhiyun mov 
\$0x00000fffffffffff,%r9 2871*4882a593Smuzhiyun and %rax,%r8 2872*4882a593Smuzhiyun shrd \$44,%rcx,%rax 2873*4882a593Smuzhiyun mov %r8,40($ctx) # r0 2874*4882a593Smuzhiyun and %r9,%rax 2875*4882a593Smuzhiyun shr \$24,%rcx 2876*4882a593Smuzhiyun mov %rax,48($ctx) # r1 2877*4882a593Smuzhiyun lea (%rax,%rax,4),%rax # *5 2878*4882a593Smuzhiyun mov %rcx,56($ctx) # r2 2879*4882a593Smuzhiyun shl \$2,%rax # magic <<2 2880*4882a593Smuzhiyun lea (%rcx,%rcx,4),%rcx # *5 2881*4882a593Smuzhiyun shl \$2,%rcx # magic <<2 2882*4882a593Smuzhiyun mov %rax,24($ctx) # s1 2883*4882a593Smuzhiyun mov %rcx,32($ctx) # s2 2884*4882a593Smuzhiyun movq \$-1,64($ctx) # write impossible value 2885*4882a593Smuzhiyun___ 2886*4882a593Smuzhiyun$code.=<<___ if ($flavour !~ /elf32/); 2887*4882a593Smuzhiyun mov %r10,0(%rdx) 2888*4882a593Smuzhiyun mov %r11,8(%rdx) 2889*4882a593Smuzhiyun___ 2890*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /elf32/); 2891*4882a593Smuzhiyun mov %r10d,0(%rdx) 2892*4882a593Smuzhiyun mov %r11d,4(%rdx) 2893*4882a593Smuzhiyun___ 2894*4882a593Smuzhiyun$code.=<<___; 2895*4882a593Smuzhiyun mov \$1,%eax 2896*4882a593Smuzhiyun RET 2897*4882a593Smuzhiyun.size poly1305_init_base2_44,.-poly1305_init_base2_44 2898*4882a593Smuzhiyun___ 2899*4882a593Smuzhiyun{ 2900*4882a593Smuzhiyunmy ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); 2901*4882a593Smuzhiyunmy ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); 2902*4882a593Smuzhiyunmy ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); 2903*4882a593Smuzhiyun 2904*4882a593Smuzhiyun$code.=<<___; 2905*4882a593Smuzhiyun.type poly1305_blocks_vpmadd52,\@function,4 2906*4882a593Smuzhiyun.align 32 2907*4882a593Smuzhiyunpoly1305_blocks_vpmadd52: 2908*4882a593Smuzhiyun shr \$4,$len 2909*4882a593Smuzhiyun jz .Lno_data_vpmadd52 # too short 2910*4882a593Smuzhiyun 2911*4882a593Smuzhiyun shl \$40,$padbit 2912*4882a593Smuzhiyun mov 64($ctx),%r8 # peek on power of the key 2913*4882a593Smuzhiyun 
2914*4882a593Smuzhiyun # if powers of the key are not calculated yet, process up to 3 2915*4882a593Smuzhiyun # blocks with this single-block subroutine, otherwise ensure that 2916*4882a593Smuzhiyun # length is divisible by 2 blocks and pass the rest down to next 2917*4882a593Smuzhiyun # subroutine... 2918*4882a593Smuzhiyun 2919*4882a593Smuzhiyun mov \$3,%rax 2920*4882a593Smuzhiyun mov \$1,%r10 2921*4882a593Smuzhiyun cmp \$4,$len # is input long 2922*4882a593Smuzhiyun cmovae %r10,%rax 2923*4882a593Smuzhiyun test %r8,%r8 # is power value impossible? 2924*4882a593Smuzhiyun cmovns %r10,%rax 2925*4882a593Smuzhiyun 2926*4882a593Smuzhiyun and $len,%rax # is input of favourable length? 2927*4882a593Smuzhiyun jz .Lblocks_vpmadd52_4x 2928*4882a593Smuzhiyun 2929*4882a593Smuzhiyun sub %rax,$len 2930*4882a593Smuzhiyun mov \$7,%r10d 2931*4882a593Smuzhiyun mov \$1,%r11d 2932*4882a593Smuzhiyun kmovw %r10d,%k7 2933*4882a593Smuzhiyun lea .L2_44_inp_permd(%rip),%r10 2934*4882a593Smuzhiyun kmovw %r11d,%k1 2935*4882a593Smuzhiyun 2936*4882a593Smuzhiyun vmovq $padbit,%x#$PAD 2937*4882a593Smuzhiyun vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd 2938*4882a593Smuzhiyun vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift 2939*4882a593Smuzhiyun vpermq \$0xcf,$PAD,$PAD 2940*4882a593Smuzhiyun vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask 2941*4882a593Smuzhiyun 2942*4882a593Smuzhiyun vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value 2943*4882a593Smuzhiyun vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys 2944*4882a593Smuzhiyun vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} 2945*4882a593Smuzhiyun vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} 2946*4882a593Smuzhiyun 2947*4882a593Smuzhiyun vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt 2948*4882a593Smuzhiyun vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft 2949*4882a593Smuzhiyun 2950*4882a593Smuzhiyun jmp .Loop_vpmadd52 2951*4882a593Smuzhiyun 2952*4882a593Smuzhiyun.align 32 2953*4882a593Smuzhiyun.Loop_vpmadd52: 2954*4882a593Smuzhiyun vmovdqu32 0($inp),%x#$T0 # load 
input as ----3210 2955*4882a593Smuzhiyun lea 16($inp),$inp 2956*4882a593Smuzhiyun 2957*4882a593Smuzhiyun vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 2958*4882a593Smuzhiyun vpsrlvq $inp_shift,$T0,$T0 2959*4882a593Smuzhiyun vpandq $reduc_mask,$T0,$T0 2960*4882a593Smuzhiyun vporq $PAD,$T0,$T0 2961*4882a593Smuzhiyun 2962*4882a593Smuzhiyun vpaddq $T0,$Dlo,$Dlo # accumulate input 2963*4882a593Smuzhiyun 2964*4882a593Smuzhiyun vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value 2965*4882a593Smuzhiyun vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} 2966*4882a593Smuzhiyun vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} 2967*4882a593Smuzhiyun 2968*4882a593Smuzhiyun vpxord $Dlo,$Dlo,$Dlo 2969*4882a593Smuzhiyun vpxord $Dhi,$Dhi,$Dhi 2970*4882a593Smuzhiyun 2971*4882a593Smuzhiyun vpmadd52luq $r2r1r0,$H0,$Dlo 2972*4882a593Smuzhiyun vpmadd52huq $r2r1r0,$H0,$Dhi 2973*4882a593Smuzhiyun 2974*4882a593Smuzhiyun vpmadd52luq $r1r0s2,$H1,$Dlo 2975*4882a593Smuzhiyun vpmadd52huq $r1r0s2,$H1,$Dhi 2976*4882a593Smuzhiyun 2977*4882a593Smuzhiyun vpmadd52luq $r0s2s1,$H2,$Dlo 2978*4882a593Smuzhiyun vpmadd52huq $r0s2s1,$H2,$Dhi 2979*4882a593Smuzhiyun 2980*4882a593Smuzhiyun vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword 2981*4882a593Smuzhiyun vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword 2982*4882a593Smuzhiyun vpandq $reduc_mask,$Dlo,$Dlo 2983*4882a593Smuzhiyun 2984*4882a593Smuzhiyun vpaddq $T0,$Dhi,$Dhi 2985*4882a593Smuzhiyun 2986*4882a593Smuzhiyun vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword 2987*4882a593Smuzhiyun 2988*4882a593Smuzhiyun vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) 2989*4882a593Smuzhiyun 2990*4882a593Smuzhiyun vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word 2991*4882a593Smuzhiyun vpandq $reduc_mask,$Dlo,$Dlo 2992*4882a593Smuzhiyun 2993*4882a593Smuzhiyun vpermq \$0b10010011,$T0,$T0 2994*4882a593Smuzhiyun 2995*4882a593Smuzhiyun vpaddq $T0,$Dlo,$Dlo 2996*4882a593Smuzhiyun 2997*4882a593Smuzhiyun vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} 2998*4882a593Smuzhiyun 2999*4882a593Smuzhiyun vpaddq 
$T0,$Dlo,$Dlo 3000*4882a593Smuzhiyun vpsllq \$2,$T0,$T0 3001*4882a593Smuzhiyun 3002*4882a593Smuzhiyun vpaddq $T0,$Dlo,$Dlo 3003*4882a593Smuzhiyun 3004*4882a593Smuzhiyun dec %rax # len-=16 3005*4882a593Smuzhiyun jnz .Loop_vpmadd52 3006*4882a593Smuzhiyun 3007*4882a593Smuzhiyun vmovdqu64 $Dlo,0($ctx){%k7} # store hash value 3008*4882a593Smuzhiyun 3009*4882a593Smuzhiyun test $len,$len 3010*4882a593Smuzhiyun jnz .Lblocks_vpmadd52_4x 3011*4882a593Smuzhiyun 3012*4882a593Smuzhiyun.Lno_data_vpmadd52: 3013*4882a593Smuzhiyun RET 3014*4882a593Smuzhiyun.size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 3015*4882a593Smuzhiyun___ 3016*4882a593Smuzhiyun} 3017*4882a593Smuzhiyun{ 3018*4882a593Smuzhiyun######################################################################## 3019*4882a593Smuzhiyun# As implied by its name 4x subroutine processes 4 blocks in parallel 3020*4882a593Smuzhiyun# (but handles even 4*n+2 blocks lengths). It takes up to 4th key power 3021*4882a593Smuzhiyun# and is handled in 256-bit %ymm registers. 
3022*4882a593Smuzhiyun 3023*4882a593Smuzhiyunmy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3024*4882a593Smuzhiyunmy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3025*4882a593Smuzhiyunmy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3026*4882a593Smuzhiyun 3027*4882a593Smuzhiyun$code.=<<___; 3028*4882a593Smuzhiyun.type poly1305_blocks_vpmadd52_4x,\@function,4 3029*4882a593Smuzhiyun.align 32 3030*4882a593Smuzhiyunpoly1305_blocks_vpmadd52_4x: 3031*4882a593Smuzhiyun shr \$4,$len 3032*4882a593Smuzhiyun jz .Lno_data_vpmadd52_4x # too short 3033*4882a593Smuzhiyun 3034*4882a593Smuzhiyun shl \$40,$padbit 3035*4882a593Smuzhiyun mov 64($ctx),%r8 # peek on power of the key 3036*4882a593Smuzhiyun 3037*4882a593Smuzhiyun.Lblocks_vpmadd52_4x: 3038*4882a593Smuzhiyun vpbroadcastq $padbit,$PAD 3039*4882a593Smuzhiyun 3040*4882a593Smuzhiyun vmovdqa64 .Lx_mask44(%rip),$mask44 3041*4882a593Smuzhiyun mov \$5,%eax 3042*4882a593Smuzhiyun vmovdqa64 .Lx_mask42(%rip),$mask42 3043*4882a593Smuzhiyun kmovw %eax,%k1 # used in 2x path 3044*4882a593Smuzhiyun 3045*4882a593Smuzhiyun test %r8,%r8 # is power value impossible? 3046*4882a593Smuzhiyun js .Linit_vpmadd52 # if it is, then init R[4] 3047*4882a593Smuzhiyun 3048*4882a593Smuzhiyun vmovq 0($ctx),%x#$H0 # load current hash value 3049*4882a593Smuzhiyun vmovq 8($ctx),%x#$H1 3050*4882a593Smuzhiyun vmovq 16($ctx),%x#$H2 3051*4882a593Smuzhiyun 3052*4882a593Smuzhiyun test \$3,$len # is length 4*n+2? 
3053*4882a593Smuzhiyun jnz .Lblocks_vpmadd52_2x_do 3054*4882a593Smuzhiyun 3055*4882a593Smuzhiyun.Lblocks_vpmadd52_4x_do: 3056*4882a593Smuzhiyun vpbroadcastq 64($ctx),$R0 # load 4th power of the key 3057*4882a593Smuzhiyun vpbroadcastq 96($ctx),$R1 3058*4882a593Smuzhiyun vpbroadcastq 128($ctx),$R2 3059*4882a593Smuzhiyun vpbroadcastq 160($ctx),$S1 3060*4882a593Smuzhiyun 3061*4882a593Smuzhiyun.Lblocks_vpmadd52_4x_key_loaded: 3062*4882a593Smuzhiyun vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3063*4882a593Smuzhiyun vpaddq $R2,$S2,$S2 3064*4882a593Smuzhiyun vpsllq \$2,$S2,$S2 3065*4882a593Smuzhiyun 3066*4882a593Smuzhiyun test \$7,$len # is len 8*n? 3067*4882a593Smuzhiyun jz .Lblocks_vpmadd52_8x 3068*4882a593Smuzhiyun 3069*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T2 # load data 3070*4882a593Smuzhiyun vmovdqu64 16*2($inp),$T3 3071*4882a593Smuzhiyun lea 16*4($inp),$inp 3072*4882a593Smuzhiyun 3073*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T1 # transpose data 3074*4882a593Smuzhiyun vpunpckhqdq $T3,$T2,$T3 3075*4882a593Smuzhiyun 3076*4882a593Smuzhiyun # at this point 64-bit lanes are ordered as 3-1-2-0 3077*4882a593Smuzhiyun 3078*4882a593Smuzhiyun vpsrlq \$24,$T3,$T2 # splat the data 3079*4882a593Smuzhiyun vporq $PAD,$T2,$T2 3080*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # accumulate input 3081*4882a593Smuzhiyun vpandq $mask44,$T1,$T0 3082*4882a593Smuzhiyun vpsrlq \$44,$T1,$T1 3083*4882a593Smuzhiyun vpsllq \$20,$T3,$T3 3084*4882a593Smuzhiyun vporq $T3,$T1,$T1 3085*4882a593Smuzhiyun vpandq $mask44,$T1,$T1 3086*4882a593Smuzhiyun 3087*4882a593Smuzhiyun sub \$4,$len 3088*4882a593Smuzhiyun jz .Ltail_vpmadd52_4x 3089*4882a593Smuzhiyun jmp .Loop_vpmadd52_4x 3090*4882a593Smuzhiyun ud2 3091*4882a593Smuzhiyun 3092*4882a593Smuzhiyun.align 32 3093*4882a593Smuzhiyun.Linit_vpmadd52: 3094*4882a593Smuzhiyun vmovq 24($ctx),%x#$S1 # load key 3095*4882a593Smuzhiyun vmovq 56($ctx),%x#$H2 3096*4882a593Smuzhiyun vmovq 32($ctx),%x#$S2 3097*4882a593Smuzhiyun vmovq 40($ctx),%x#$R0 3098*4882a593Smuzhiyun vmovq 
48($ctx),%x#$R1 3099*4882a593Smuzhiyun 3100*4882a593Smuzhiyun vmovdqa $R0,$H0 3101*4882a593Smuzhiyun vmovdqa $R1,$H1 3102*4882a593Smuzhiyun vmovdqa $H2,$R2 3103*4882a593Smuzhiyun 3104*4882a593Smuzhiyun mov \$2,%eax 3105*4882a593Smuzhiyun 3106*4882a593Smuzhiyun.Lmul_init_vpmadd52: 3107*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3108*4882a593Smuzhiyun vpmadd52luq $H2,$S1,$D0lo 3109*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3110*4882a593Smuzhiyun vpmadd52huq $H2,$S1,$D0hi 3111*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3112*4882a593Smuzhiyun vpmadd52luq $H2,$S2,$D1lo 3113*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3114*4882a593Smuzhiyun vpmadd52huq $H2,$S2,$D1hi 3115*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3116*4882a593Smuzhiyun vpmadd52luq $H2,$R0,$D2lo 3117*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3118*4882a593Smuzhiyun vpmadd52huq $H2,$R0,$D2hi 3119*4882a593Smuzhiyun 3120*4882a593Smuzhiyun vpmadd52luq $H0,$R0,$D0lo 3121*4882a593Smuzhiyun vpmadd52huq $H0,$R0,$D0hi 3122*4882a593Smuzhiyun vpmadd52luq $H0,$R1,$D1lo 3123*4882a593Smuzhiyun vpmadd52huq $H0,$R1,$D1hi 3124*4882a593Smuzhiyun vpmadd52luq $H0,$R2,$D2lo 3125*4882a593Smuzhiyun vpmadd52huq $H0,$R2,$D2hi 3126*4882a593Smuzhiyun 3127*4882a593Smuzhiyun vpmadd52luq $H1,$S2,$D0lo 3128*4882a593Smuzhiyun vpmadd52huq $H1,$S2,$D0hi 3129*4882a593Smuzhiyun vpmadd52luq $H1,$R0,$D1lo 3130*4882a593Smuzhiyun vpmadd52huq $H1,$R0,$D1hi 3131*4882a593Smuzhiyun vpmadd52luq $H1,$R1,$D2lo 3132*4882a593Smuzhiyun vpmadd52huq $H1,$R1,$D2hi 3133*4882a593Smuzhiyun 3134*4882a593Smuzhiyun ################################################################ 3135*4882a593Smuzhiyun # partial reduction 3136*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3137*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3138*4882a593Smuzhiyun vpandq $mask44,$D0lo,$H0 3139*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3140*4882a593Smuzhiyun 3141*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3142*4882a593Smuzhiyun 3143*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 
3144*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3145*4882a593Smuzhiyun vpandq $mask44,$D1lo,$H1 3146*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3147*4882a593Smuzhiyun 3148*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3149*4882a593Smuzhiyun 3150*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3151*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3152*4882a593Smuzhiyun vpandq $mask42,$D2lo,$H2 3153*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3154*4882a593Smuzhiyun 3155*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3156*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3157*4882a593Smuzhiyun 3158*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3159*4882a593Smuzhiyun 3160*4882a593Smuzhiyun vpsrlq \$44,$H0,$tmp # additional step 3161*4882a593Smuzhiyun vpandq $mask44,$H0,$H0 3162*4882a593Smuzhiyun 3163*4882a593Smuzhiyun vpaddq $tmp,$H1,$H1 3164*4882a593Smuzhiyun 3165*4882a593Smuzhiyun dec %eax 3166*4882a593Smuzhiyun jz .Ldone_init_vpmadd52 3167*4882a593Smuzhiyun 3168*4882a593Smuzhiyun vpunpcklqdq $R1,$H1,$R1 # 1,2 3169*4882a593Smuzhiyun vpbroadcastq %x#$H1,%x#$H1 # 2,2 3170*4882a593Smuzhiyun vpunpcklqdq $R2,$H2,$R2 3171*4882a593Smuzhiyun vpbroadcastq %x#$H2,%x#$H2 3172*4882a593Smuzhiyun vpunpcklqdq $R0,$H0,$R0 3173*4882a593Smuzhiyun vpbroadcastq %x#$H0,%x#$H0 3174*4882a593Smuzhiyun 3175*4882a593Smuzhiyun vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3176*4882a593Smuzhiyun vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3177*4882a593Smuzhiyun vpaddq $R1,$S1,$S1 3178*4882a593Smuzhiyun vpaddq $R2,$S2,$S2 3179*4882a593Smuzhiyun vpsllq \$2,$S1,$S1 3180*4882a593Smuzhiyun vpsllq \$2,$S2,$S2 3181*4882a593Smuzhiyun 3182*4882a593Smuzhiyun jmp .Lmul_init_vpmadd52 3183*4882a593Smuzhiyun ud2 3184*4882a593Smuzhiyun 3185*4882a593Smuzhiyun.align 32 3186*4882a593Smuzhiyun.Ldone_init_vpmadd52: 3187*4882a593Smuzhiyun vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 3188*4882a593Smuzhiyun vinserti128 \$1,%x#$R2,$H2,$R2 3189*4882a593Smuzhiyun vinserti128 \$1,%x#$R0,$H0,$R0 3190*4882a593Smuzhiyun 3191*4882a593Smuzhiyun vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 
3192*4882a593Smuzhiyun vpermq \$0b11011000,$R2,$R2 3193*4882a593Smuzhiyun vpermq \$0b11011000,$R0,$R0 3194*4882a593Smuzhiyun 3195*4882a593Smuzhiyun vpsllq \$2,$R1,$S1 # S1 = R1*5*4 3196*4882a593Smuzhiyun vpaddq $R1,$S1,$S1 3197*4882a593Smuzhiyun vpsllq \$2,$S1,$S1 3198*4882a593Smuzhiyun 3199*4882a593Smuzhiyun vmovq 0($ctx),%x#$H0 # load current hash value 3200*4882a593Smuzhiyun vmovq 8($ctx),%x#$H1 3201*4882a593Smuzhiyun vmovq 16($ctx),%x#$H2 3202*4882a593Smuzhiyun 3203*4882a593Smuzhiyun test \$3,$len # is length 4*n+2? 3204*4882a593Smuzhiyun jnz .Ldone_init_vpmadd52_2x 3205*4882a593Smuzhiyun 3206*4882a593Smuzhiyun vmovdqu64 $R0,64($ctx) # save key powers 3207*4882a593Smuzhiyun vpbroadcastq %x#$R0,$R0 # broadcast 4th power 3208*4882a593Smuzhiyun vmovdqu64 $R1,96($ctx) 3209*4882a593Smuzhiyun vpbroadcastq %x#$R1,$R1 3210*4882a593Smuzhiyun vmovdqu64 $R2,128($ctx) 3211*4882a593Smuzhiyun vpbroadcastq %x#$R2,$R2 3212*4882a593Smuzhiyun vmovdqu64 $S1,160($ctx) 3213*4882a593Smuzhiyun vpbroadcastq %x#$S1,$S1 3214*4882a593Smuzhiyun 3215*4882a593Smuzhiyun jmp .Lblocks_vpmadd52_4x_key_loaded 3216*4882a593Smuzhiyun ud2 3217*4882a593Smuzhiyun 3218*4882a593Smuzhiyun.align 32 3219*4882a593Smuzhiyun.Ldone_init_vpmadd52_2x: 3220*4882a593Smuzhiyun vmovdqu64 $R0,64($ctx) # save key powers 3221*4882a593Smuzhiyun vpsrldq \$8,$R0,$R0 # 0-1-0-2 3222*4882a593Smuzhiyun vmovdqu64 $R1,96($ctx) 3223*4882a593Smuzhiyun vpsrldq \$8,$R1,$R1 3224*4882a593Smuzhiyun vmovdqu64 $R2,128($ctx) 3225*4882a593Smuzhiyun vpsrldq \$8,$R2,$R2 3226*4882a593Smuzhiyun vmovdqu64 $S1,160($ctx) 3227*4882a593Smuzhiyun vpsrldq \$8,$S1,$S1 3228*4882a593Smuzhiyun jmp .Lblocks_vpmadd52_2x_key_loaded 3229*4882a593Smuzhiyun ud2 3230*4882a593Smuzhiyun 3231*4882a593Smuzhiyun.align 32 3232*4882a593Smuzhiyun.Lblocks_vpmadd52_2x_do: 3233*4882a593Smuzhiyun vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers 3234*4882a593Smuzhiyun vmovdqu64 160+8($ctx),${S1}{%k1}{z} 3235*4882a593Smuzhiyun vmovdqu64 
64+8($ctx),${R0}{%k1}{z} 3236*4882a593Smuzhiyun vmovdqu64 96+8($ctx),${R1}{%k1}{z} 3237*4882a593Smuzhiyun 3238*4882a593Smuzhiyun.Lblocks_vpmadd52_2x_key_loaded: 3239*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T2 # load data 3240*4882a593Smuzhiyun vpxorq $T3,$T3,$T3 3241*4882a593Smuzhiyun lea 16*2($inp),$inp 3242*4882a593Smuzhiyun 3243*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T1 # transpose data 3244*4882a593Smuzhiyun vpunpckhqdq $T3,$T2,$T3 3245*4882a593Smuzhiyun 3246*4882a593Smuzhiyun # at this point 64-bit lanes are ordered as x-1-x-0 3247*4882a593Smuzhiyun 3248*4882a593Smuzhiyun vpsrlq \$24,$T3,$T2 # splat the data 3249*4882a593Smuzhiyun vporq $PAD,$T2,$T2 3250*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # accumulate input 3251*4882a593Smuzhiyun vpandq $mask44,$T1,$T0 3252*4882a593Smuzhiyun vpsrlq \$44,$T1,$T1 3253*4882a593Smuzhiyun vpsllq \$20,$T3,$T3 3254*4882a593Smuzhiyun vporq $T3,$T1,$T1 3255*4882a593Smuzhiyun vpandq $mask44,$T1,$T1 3256*4882a593Smuzhiyun 3257*4882a593Smuzhiyun jmp .Ltail_vpmadd52_2x 3258*4882a593Smuzhiyun ud2 3259*4882a593Smuzhiyun 3260*4882a593Smuzhiyun.align 32 3261*4882a593Smuzhiyun.Loop_vpmadd52_4x: 3262*4882a593Smuzhiyun #vpaddq $T2,$H2,$H2 # accumulate input 3263*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 3264*4882a593Smuzhiyun vpaddq $T1,$H1,$H1 3265*4882a593Smuzhiyun 3266*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3267*4882a593Smuzhiyun vpmadd52luq $H2,$S1,$D0lo 3268*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3269*4882a593Smuzhiyun vpmadd52huq $H2,$S1,$D0hi 3270*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3271*4882a593Smuzhiyun vpmadd52luq $H2,$S2,$D1lo 3272*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3273*4882a593Smuzhiyun vpmadd52huq $H2,$S2,$D1hi 3274*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3275*4882a593Smuzhiyun vpmadd52luq $H2,$R0,$D2lo 3276*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3277*4882a593Smuzhiyun vpmadd52huq $H2,$R0,$D2hi 3278*4882a593Smuzhiyun 3279*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T2 # load data 3280*4882a593Smuzhiyun 
vmovdqu64 16*2($inp),$T3 3281*4882a593Smuzhiyun lea 16*4($inp),$inp 3282*4882a593Smuzhiyun vpmadd52luq $H0,$R0,$D0lo 3283*4882a593Smuzhiyun vpmadd52huq $H0,$R0,$D0hi 3284*4882a593Smuzhiyun vpmadd52luq $H0,$R1,$D1lo 3285*4882a593Smuzhiyun vpmadd52huq $H0,$R1,$D1hi 3286*4882a593Smuzhiyun vpmadd52luq $H0,$R2,$D2lo 3287*4882a593Smuzhiyun vpmadd52huq $H0,$R2,$D2hi 3288*4882a593Smuzhiyun 3289*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T1 # transpose data 3290*4882a593Smuzhiyun vpunpckhqdq $T3,$T2,$T3 3291*4882a593Smuzhiyun vpmadd52luq $H1,$S2,$D0lo 3292*4882a593Smuzhiyun vpmadd52huq $H1,$S2,$D0hi 3293*4882a593Smuzhiyun vpmadd52luq $H1,$R0,$D1lo 3294*4882a593Smuzhiyun vpmadd52huq $H1,$R0,$D1hi 3295*4882a593Smuzhiyun vpmadd52luq $H1,$R1,$D2lo 3296*4882a593Smuzhiyun vpmadd52huq $H1,$R1,$D2hi 3297*4882a593Smuzhiyun 3298*4882a593Smuzhiyun ################################################################ 3299*4882a593Smuzhiyun # partial reduction (interleaved with data splat) 3300*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3301*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3302*4882a593Smuzhiyun vpandq $mask44,$D0lo,$H0 3303*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3304*4882a593Smuzhiyun 3305*4882a593Smuzhiyun vpsrlq \$24,$T3,$T2 3306*4882a593Smuzhiyun vporq $PAD,$T2,$T2 3307*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3308*4882a593Smuzhiyun 3309*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 3310*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3311*4882a593Smuzhiyun vpandq $mask44,$D1lo,$H1 3312*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3313*4882a593Smuzhiyun 3314*4882a593Smuzhiyun vpandq $mask44,$T1,$T0 3315*4882a593Smuzhiyun vpsrlq \$44,$T1,$T1 3316*4882a593Smuzhiyun vpsllq \$20,$T3,$T3 3317*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3318*4882a593Smuzhiyun 3319*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3320*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3321*4882a593Smuzhiyun vpandq $mask42,$D2lo,$H2 3322*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3323*4882a593Smuzhiyun 3324*4882a593Smuzhiyun vpaddq 
$T2,$H2,$H2 # accumulate input 3325*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3326*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3327*4882a593Smuzhiyun 3328*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3329*4882a593Smuzhiyun vporq $T3,$T1,$T1 3330*4882a593Smuzhiyun vpandq $mask44,$T1,$T1 3331*4882a593Smuzhiyun 3332*4882a593Smuzhiyun vpsrlq \$44,$H0,$tmp # additional step 3333*4882a593Smuzhiyun vpandq $mask44,$H0,$H0 3334*4882a593Smuzhiyun 3335*4882a593Smuzhiyun vpaddq $tmp,$H1,$H1 3336*4882a593Smuzhiyun 3337*4882a593Smuzhiyun sub \$4,$len # len-=64 3338*4882a593Smuzhiyun jnz .Loop_vpmadd52_4x 3339*4882a593Smuzhiyun 3340*4882a593Smuzhiyun.Ltail_vpmadd52_4x: 3341*4882a593Smuzhiyun vmovdqu64 128($ctx),$R2 # load all key powers 3342*4882a593Smuzhiyun vmovdqu64 160($ctx),$S1 3343*4882a593Smuzhiyun vmovdqu64 64($ctx),$R0 3344*4882a593Smuzhiyun vmovdqu64 96($ctx),$R1 3345*4882a593Smuzhiyun 3346*4882a593Smuzhiyun.Ltail_vpmadd52_2x: 3347*4882a593Smuzhiyun vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3348*4882a593Smuzhiyun vpaddq $R2,$S2,$S2 3349*4882a593Smuzhiyun vpsllq \$2,$S2,$S2 3350*4882a593Smuzhiyun 3351*4882a593Smuzhiyun #vpaddq $T2,$H2,$H2 # accumulate input 3352*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 3353*4882a593Smuzhiyun vpaddq $T1,$H1,$H1 3354*4882a593Smuzhiyun 3355*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3356*4882a593Smuzhiyun vpmadd52luq $H2,$S1,$D0lo 3357*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3358*4882a593Smuzhiyun vpmadd52huq $H2,$S1,$D0hi 3359*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3360*4882a593Smuzhiyun vpmadd52luq $H2,$S2,$D1lo 3361*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3362*4882a593Smuzhiyun vpmadd52huq $H2,$S2,$D1hi 3363*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3364*4882a593Smuzhiyun vpmadd52luq $H2,$R0,$D2lo 3365*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3366*4882a593Smuzhiyun vpmadd52huq $H2,$R0,$D2hi 3367*4882a593Smuzhiyun 3368*4882a593Smuzhiyun vpmadd52luq $H0,$R0,$D0lo 3369*4882a593Smuzhiyun vpmadd52huq $H0,$R0,$D0hi 3370*4882a593Smuzhiyun vpmadd52luq 
$H0,$R1,$D1lo 3371*4882a593Smuzhiyun vpmadd52huq $H0,$R1,$D1hi 3372*4882a593Smuzhiyun vpmadd52luq $H0,$R2,$D2lo 3373*4882a593Smuzhiyun vpmadd52huq $H0,$R2,$D2hi 3374*4882a593Smuzhiyun 3375*4882a593Smuzhiyun vpmadd52luq $H1,$S2,$D0lo 3376*4882a593Smuzhiyun vpmadd52huq $H1,$S2,$D0hi 3377*4882a593Smuzhiyun vpmadd52luq $H1,$R0,$D1lo 3378*4882a593Smuzhiyun vpmadd52huq $H1,$R0,$D1hi 3379*4882a593Smuzhiyun vpmadd52luq $H1,$R1,$D2lo 3380*4882a593Smuzhiyun vpmadd52huq $H1,$R1,$D2hi 3381*4882a593Smuzhiyun 3382*4882a593Smuzhiyun ################################################################ 3383*4882a593Smuzhiyun # horizontal addition 3384*4882a593Smuzhiyun 3385*4882a593Smuzhiyun mov \$1,%eax 3386*4882a593Smuzhiyun kmovw %eax,%k1 3387*4882a593Smuzhiyun vpsrldq \$8,$D0lo,$T0 3388*4882a593Smuzhiyun vpsrldq \$8,$D0hi,$H0 3389*4882a593Smuzhiyun vpsrldq \$8,$D1lo,$T1 3390*4882a593Smuzhiyun vpsrldq \$8,$D1hi,$H1 3391*4882a593Smuzhiyun vpaddq $T0,$D0lo,$D0lo 3392*4882a593Smuzhiyun vpaddq $H0,$D0hi,$D0hi 3393*4882a593Smuzhiyun vpsrldq \$8,$D2lo,$T2 3394*4882a593Smuzhiyun vpsrldq \$8,$D2hi,$H2 3395*4882a593Smuzhiyun vpaddq $T1,$D1lo,$D1lo 3396*4882a593Smuzhiyun vpaddq $H1,$D1hi,$D1hi 3397*4882a593Smuzhiyun vpermq \$0x2,$D0lo,$T0 3398*4882a593Smuzhiyun vpermq \$0x2,$D0hi,$H0 3399*4882a593Smuzhiyun vpaddq $T2,$D2lo,$D2lo 3400*4882a593Smuzhiyun vpaddq $H2,$D2hi,$D2hi 3401*4882a593Smuzhiyun 3402*4882a593Smuzhiyun vpermq \$0x2,$D1lo,$T1 3403*4882a593Smuzhiyun vpermq \$0x2,$D1hi,$H1 3404*4882a593Smuzhiyun vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3405*4882a593Smuzhiyun vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3406*4882a593Smuzhiyun vpermq \$0x2,$D2lo,$T2 3407*4882a593Smuzhiyun vpermq \$0x2,$D2hi,$H2 3408*4882a593Smuzhiyun vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3409*4882a593Smuzhiyun vpaddq $H1,$D1hi,${D1hi}{%k1}{z} 3410*4882a593Smuzhiyun vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3411*4882a593Smuzhiyun vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3412*4882a593Smuzhiyun 3413*4882a593Smuzhiyun 
################################################################ 3414*4882a593Smuzhiyun # partial reduction 3415*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3416*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3417*4882a593Smuzhiyun vpandq $mask44,$D0lo,$H0 3418*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3419*4882a593Smuzhiyun 3420*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3421*4882a593Smuzhiyun 3422*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 3423*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3424*4882a593Smuzhiyun vpandq $mask44,$D1lo,$H1 3425*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3426*4882a593Smuzhiyun 3427*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3428*4882a593Smuzhiyun 3429*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3430*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3431*4882a593Smuzhiyun vpandq $mask42,$D2lo,$H2 3432*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3433*4882a593Smuzhiyun 3434*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3435*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3436*4882a593Smuzhiyun 3437*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3438*4882a593Smuzhiyun 3439*4882a593Smuzhiyun vpsrlq \$44,$H0,$tmp # additional step 3440*4882a593Smuzhiyun vpandq $mask44,$H0,$H0 3441*4882a593Smuzhiyun 3442*4882a593Smuzhiyun vpaddq $tmp,$H1,$H1 3443*4882a593Smuzhiyun # at this point $len is 3444*4882a593Smuzhiyun # either 4*n+2 or 0... 
3445*4882a593Smuzhiyun sub \$2,$len # len-=32 3446*4882a593Smuzhiyun ja .Lblocks_vpmadd52_4x_do 3447*4882a593Smuzhiyun 3448*4882a593Smuzhiyun vmovq %x#$H0,0($ctx) 3449*4882a593Smuzhiyun vmovq %x#$H1,8($ctx) 3450*4882a593Smuzhiyun vmovq %x#$H2,16($ctx) 3451*4882a593Smuzhiyun vzeroall 3452*4882a593Smuzhiyun 3453*4882a593Smuzhiyun.Lno_data_vpmadd52_4x: 3454*4882a593Smuzhiyun RET 3455*4882a593Smuzhiyun.size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x 3456*4882a593Smuzhiyun___ 3457*4882a593Smuzhiyun} 3458*4882a593Smuzhiyun{ 3459*4882a593Smuzhiyun######################################################################## 3460*4882a593Smuzhiyun# As implied by its name 8x subroutine processes 8 blocks in parallel... 3461*4882a593Smuzhiyun# This is intermediate version, as it's used only in cases when input 3462*4882a593Smuzhiyun# length is either 8*n, 8*n+1 or 8*n+2... 3463*4882a593Smuzhiyun 3464*4882a593Smuzhiyunmy ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); 3465*4882a593Smuzhiyunmy ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); 3466*4882a593Smuzhiyunmy ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); 3467*4882a593Smuzhiyunmy ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); 3468*4882a593Smuzhiyun 3469*4882a593Smuzhiyun$code.=<<___; 3470*4882a593Smuzhiyun.type poly1305_blocks_vpmadd52_8x,\@function,4 3471*4882a593Smuzhiyun.align 32 3472*4882a593Smuzhiyunpoly1305_blocks_vpmadd52_8x: 3473*4882a593Smuzhiyun shr \$4,$len 3474*4882a593Smuzhiyun jz .Lno_data_vpmadd52_8x # too short 3475*4882a593Smuzhiyun 3476*4882a593Smuzhiyun shl \$40,$padbit 3477*4882a593Smuzhiyun mov 64($ctx),%r8 # peek on power of the key 3478*4882a593Smuzhiyun 3479*4882a593Smuzhiyun vmovdqa64 .Lx_mask44(%rip),$mask44 3480*4882a593Smuzhiyun vmovdqa64 .Lx_mask42(%rip),$mask42 3481*4882a593Smuzhiyun 3482*4882a593Smuzhiyun test %r8,%r8 # is power value impossible? 
3483*4882a593Smuzhiyun js .Linit_vpmadd52 # if it is, then init R[4] 3484*4882a593Smuzhiyun 3485*4882a593Smuzhiyun vmovq 0($ctx),%x#$H0 # load current hash value 3486*4882a593Smuzhiyun vmovq 8($ctx),%x#$H1 3487*4882a593Smuzhiyun vmovq 16($ctx),%x#$H2 3488*4882a593Smuzhiyun 3489*4882a593Smuzhiyun.Lblocks_vpmadd52_8x: 3490*4882a593Smuzhiyun ################################################################ 3491*4882a593Smuzhiyun # fist we calculate more key powers 3492*4882a593Smuzhiyun 3493*4882a593Smuzhiyun vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers 3494*4882a593Smuzhiyun vmovdqu64 160($ctx),$S1 3495*4882a593Smuzhiyun vmovdqu64 64($ctx),$R0 3496*4882a593Smuzhiyun vmovdqu64 96($ctx),$R1 3497*4882a593Smuzhiyun 3498*4882a593Smuzhiyun vpsllq \$2,$R2,$S2 # S2 = R2*5*4 3499*4882a593Smuzhiyun vpaddq $R2,$S2,$S2 3500*4882a593Smuzhiyun vpsllq \$2,$S2,$S2 3501*4882a593Smuzhiyun 3502*4882a593Smuzhiyun vpbroadcastq %x#$R2,$RR2 # broadcast 4th power 3503*4882a593Smuzhiyun vpbroadcastq %x#$R0,$RR0 3504*4882a593Smuzhiyun vpbroadcastq %x#$R1,$RR1 3505*4882a593Smuzhiyun 3506*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3507*4882a593Smuzhiyun vpmadd52luq $RR2,$S1,$D0lo 3508*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3509*4882a593Smuzhiyun vpmadd52huq $RR2,$S1,$D0hi 3510*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3511*4882a593Smuzhiyun vpmadd52luq $RR2,$S2,$D1lo 3512*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3513*4882a593Smuzhiyun vpmadd52huq $RR2,$S2,$D1hi 3514*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3515*4882a593Smuzhiyun vpmadd52luq $RR2,$R0,$D2lo 3516*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3517*4882a593Smuzhiyun vpmadd52huq $RR2,$R0,$D2hi 3518*4882a593Smuzhiyun 3519*4882a593Smuzhiyun vpmadd52luq $RR0,$R0,$D0lo 3520*4882a593Smuzhiyun vpmadd52huq $RR0,$R0,$D0hi 3521*4882a593Smuzhiyun vpmadd52luq $RR0,$R1,$D1lo 3522*4882a593Smuzhiyun vpmadd52huq $RR0,$R1,$D1hi 3523*4882a593Smuzhiyun vpmadd52luq $RR0,$R2,$D2lo 3524*4882a593Smuzhiyun vpmadd52huq $RR0,$R2,$D2hi 
3525*4882a593Smuzhiyun 3526*4882a593Smuzhiyun vpmadd52luq $RR1,$S2,$D0lo 3527*4882a593Smuzhiyun vpmadd52huq $RR1,$S2,$D0hi 3528*4882a593Smuzhiyun vpmadd52luq $RR1,$R0,$D1lo 3529*4882a593Smuzhiyun vpmadd52huq $RR1,$R0,$D1hi 3530*4882a593Smuzhiyun vpmadd52luq $RR1,$R1,$D2lo 3531*4882a593Smuzhiyun vpmadd52huq $RR1,$R1,$D2hi 3532*4882a593Smuzhiyun 3533*4882a593Smuzhiyun ################################################################ 3534*4882a593Smuzhiyun # partial reduction 3535*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3536*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3537*4882a593Smuzhiyun vpandq $mask44,$D0lo,$RR0 3538*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3539*4882a593Smuzhiyun 3540*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3541*4882a593Smuzhiyun 3542*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 3543*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3544*4882a593Smuzhiyun vpandq $mask44,$D1lo,$RR1 3545*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3546*4882a593Smuzhiyun 3547*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3548*4882a593Smuzhiyun 3549*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3550*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3551*4882a593Smuzhiyun vpandq $mask42,$D2lo,$RR2 3552*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3553*4882a593Smuzhiyun 3554*4882a593Smuzhiyun vpaddq $D2hi,$RR0,$RR0 3555*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3556*4882a593Smuzhiyun 3557*4882a593Smuzhiyun vpaddq $D2hi,$RR0,$RR0 3558*4882a593Smuzhiyun 3559*4882a593Smuzhiyun vpsrlq \$44,$RR0,$tmp # additional step 3560*4882a593Smuzhiyun vpandq $mask44,$RR0,$RR0 3561*4882a593Smuzhiyun 3562*4882a593Smuzhiyun vpaddq $tmp,$RR1,$RR1 3563*4882a593Smuzhiyun 3564*4882a593Smuzhiyun ################################################################ 3565*4882a593Smuzhiyun # At this point Rx holds 1324 powers, RRx - 5768, and the goal 3566*4882a593Smuzhiyun # is 15263748, which reflects how data is loaded... 
3567*4882a593Smuzhiyun 3568*4882a593Smuzhiyun vpunpcklqdq $R2,$RR2,$T2 # 3748 3569*4882a593Smuzhiyun vpunpckhqdq $R2,$RR2,$R2 # 1526 3570*4882a593Smuzhiyun vpunpcklqdq $R0,$RR0,$T0 3571*4882a593Smuzhiyun vpunpckhqdq $R0,$RR0,$R0 3572*4882a593Smuzhiyun vpunpcklqdq $R1,$RR1,$T1 3573*4882a593Smuzhiyun vpunpckhqdq $R1,$RR1,$R1 3574*4882a593Smuzhiyun___ 3575*4882a593Smuzhiyun######## switch to %zmm 3576*4882a593Smuzhiyunmap(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3577*4882a593Smuzhiyunmap(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3578*4882a593Smuzhiyunmap(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3579*4882a593Smuzhiyunmap(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); 3580*4882a593Smuzhiyun 3581*4882a593Smuzhiyun$code.=<<___; 3582*4882a593Smuzhiyun vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 3583*4882a593Smuzhiyun vshufi64x2 \$0x44,$R0,$T0,$RR0 3584*4882a593Smuzhiyun vshufi64x2 \$0x44,$R1,$T1,$RR1 3585*4882a593Smuzhiyun 3586*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T2 # load data 3587*4882a593Smuzhiyun vmovdqu64 16*4($inp),$T3 3588*4882a593Smuzhiyun lea 16*8($inp),$inp 3589*4882a593Smuzhiyun 3590*4882a593Smuzhiyun vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 3591*4882a593Smuzhiyun vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 3592*4882a593Smuzhiyun vpaddq $RR2,$SS2,$SS2 3593*4882a593Smuzhiyun vpaddq $RR1,$SS1,$SS1 3594*4882a593Smuzhiyun vpsllq \$2,$SS2,$SS2 3595*4882a593Smuzhiyun vpsllq \$2,$SS1,$SS1 3596*4882a593Smuzhiyun 3597*4882a593Smuzhiyun vpbroadcastq $padbit,$PAD 3598*4882a593Smuzhiyun vpbroadcastq %x#$mask44,$mask44 3599*4882a593Smuzhiyun vpbroadcastq %x#$mask42,$mask42 3600*4882a593Smuzhiyun 3601*4882a593Smuzhiyun vpbroadcastq %x#$SS1,$S1 # broadcast 8th power 3602*4882a593Smuzhiyun vpbroadcastq %x#$SS2,$S2 3603*4882a593Smuzhiyun vpbroadcastq %x#$RR0,$R0 3604*4882a593Smuzhiyun vpbroadcastq %x#$RR1,$R1 3605*4882a593Smuzhiyun vpbroadcastq %x#$RR2,$R2 3606*4882a593Smuzhiyun 3607*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T1 # transpose data 3608*4882a593Smuzhiyun vpunpckhqdq 
$T3,$T2,$T3 3609*4882a593Smuzhiyun 3610*4882a593Smuzhiyun # at this point 64-bit lanes are ordered as 73625140 3611*4882a593Smuzhiyun 3612*4882a593Smuzhiyun vpsrlq \$24,$T3,$T2 # splat the data 3613*4882a593Smuzhiyun vporq $PAD,$T2,$T2 3614*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # accumulate input 3615*4882a593Smuzhiyun vpandq $mask44,$T1,$T0 3616*4882a593Smuzhiyun vpsrlq \$44,$T1,$T1 3617*4882a593Smuzhiyun vpsllq \$20,$T3,$T3 3618*4882a593Smuzhiyun vporq $T3,$T1,$T1 3619*4882a593Smuzhiyun vpandq $mask44,$T1,$T1 3620*4882a593Smuzhiyun 3621*4882a593Smuzhiyun sub \$8,$len 3622*4882a593Smuzhiyun jz .Ltail_vpmadd52_8x 3623*4882a593Smuzhiyun jmp .Loop_vpmadd52_8x 3624*4882a593Smuzhiyun 3625*4882a593Smuzhiyun.align 32 3626*4882a593Smuzhiyun.Loop_vpmadd52_8x: 3627*4882a593Smuzhiyun #vpaddq $T2,$H2,$H2 # accumulate input 3628*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 3629*4882a593Smuzhiyun vpaddq $T1,$H1,$H1 3630*4882a593Smuzhiyun 3631*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3632*4882a593Smuzhiyun vpmadd52luq $H2,$S1,$D0lo 3633*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3634*4882a593Smuzhiyun vpmadd52huq $H2,$S1,$D0hi 3635*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3636*4882a593Smuzhiyun vpmadd52luq $H2,$S2,$D1lo 3637*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3638*4882a593Smuzhiyun vpmadd52huq $H2,$S2,$D1hi 3639*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3640*4882a593Smuzhiyun vpmadd52luq $H2,$R0,$D2lo 3641*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3642*4882a593Smuzhiyun vpmadd52huq $H2,$R0,$D2hi 3643*4882a593Smuzhiyun 3644*4882a593Smuzhiyun vmovdqu64 16*0($inp),$T2 # load data 3645*4882a593Smuzhiyun vmovdqu64 16*4($inp),$T3 3646*4882a593Smuzhiyun lea 16*8($inp),$inp 3647*4882a593Smuzhiyun vpmadd52luq $H0,$R0,$D0lo 3648*4882a593Smuzhiyun vpmadd52huq $H0,$R0,$D0hi 3649*4882a593Smuzhiyun vpmadd52luq $H0,$R1,$D1lo 3650*4882a593Smuzhiyun vpmadd52huq $H0,$R1,$D1hi 3651*4882a593Smuzhiyun vpmadd52luq $H0,$R2,$D2lo 3652*4882a593Smuzhiyun vpmadd52huq $H0,$R2,$D2hi 
3653*4882a593Smuzhiyun 3654*4882a593Smuzhiyun vpunpcklqdq $T3,$T2,$T1 # transpose data 3655*4882a593Smuzhiyun vpunpckhqdq $T3,$T2,$T3 3656*4882a593Smuzhiyun vpmadd52luq $H1,$S2,$D0lo 3657*4882a593Smuzhiyun vpmadd52huq $H1,$S2,$D0hi 3658*4882a593Smuzhiyun vpmadd52luq $H1,$R0,$D1lo 3659*4882a593Smuzhiyun vpmadd52huq $H1,$R0,$D1hi 3660*4882a593Smuzhiyun vpmadd52luq $H1,$R1,$D2lo 3661*4882a593Smuzhiyun vpmadd52huq $H1,$R1,$D2hi 3662*4882a593Smuzhiyun 3663*4882a593Smuzhiyun ################################################################ 3664*4882a593Smuzhiyun # partial reduction (interleaved with data splat) 3665*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3666*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3667*4882a593Smuzhiyun vpandq $mask44,$D0lo,$H0 3668*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3669*4882a593Smuzhiyun 3670*4882a593Smuzhiyun vpsrlq \$24,$T3,$T2 3671*4882a593Smuzhiyun vporq $PAD,$T2,$T2 3672*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3673*4882a593Smuzhiyun 3674*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 3675*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3676*4882a593Smuzhiyun vpandq $mask44,$D1lo,$H1 3677*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3678*4882a593Smuzhiyun 3679*4882a593Smuzhiyun vpandq $mask44,$T1,$T0 3680*4882a593Smuzhiyun vpsrlq \$44,$T1,$T1 3681*4882a593Smuzhiyun vpsllq \$20,$T3,$T3 3682*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3683*4882a593Smuzhiyun 3684*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3685*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3686*4882a593Smuzhiyun vpandq $mask42,$D2lo,$H2 3687*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3688*4882a593Smuzhiyun 3689*4882a593Smuzhiyun vpaddq $T2,$H2,$H2 # accumulate input 3690*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3691*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3692*4882a593Smuzhiyun 3693*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3694*4882a593Smuzhiyun vporq $T3,$T1,$T1 3695*4882a593Smuzhiyun vpandq $mask44,$T1,$T1 3696*4882a593Smuzhiyun 3697*4882a593Smuzhiyun vpsrlq \$44,$H0,$tmp # additional step 
3698*4882a593Smuzhiyun vpandq $mask44,$H0,$H0 3699*4882a593Smuzhiyun 3700*4882a593Smuzhiyun vpaddq $tmp,$H1,$H1 3701*4882a593Smuzhiyun 3702*4882a593Smuzhiyun sub \$8,$len # len-=128 3703*4882a593Smuzhiyun jnz .Loop_vpmadd52_8x 3704*4882a593Smuzhiyun 3705*4882a593Smuzhiyun.Ltail_vpmadd52_8x: 3706*4882a593Smuzhiyun #vpaddq $T2,$H2,$H2 # accumulate input 3707*4882a593Smuzhiyun vpaddq $T0,$H0,$H0 3708*4882a593Smuzhiyun vpaddq $T1,$H1,$H1 3709*4882a593Smuzhiyun 3710*4882a593Smuzhiyun vpxorq $D0lo,$D0lo,$D0lo 3711*4882a593Smuzhiyun vpmadd52luq $H2,$SS1,$D0lo 3712*4882a593Smuzhiyun vpxorq $D0hi,$D0hi,$D0hi 3713*4882a593Smuzhiyun vpmadd52huq $H2,$SS1,$D0hi 3714*4882a593Smuzhiyun vpxorq $D1lo,$D1lo,$D1lo 3715*4882a593Smuzhiyun vpmadd52luq $H2,$SS2,$D1lo 3716*4882a593Smuzhiyun vpxorq $D1hi,$D1hi,$D1hi 3717*4882a593Smuzhiyun vpmadd52huq $H2,$SS2,$D1hi 3718*4882a593Smuzhiyun vpxorq $D2lo,$D2lo,$D2lo 3719*4882a593Smuzhiyun vpmadd52luq $H2,$RR0,$D2lo 3720*4882a593Smuzhiyun vpxorq $D2hi,$D2hi,$D2hi 3721*4882a593Smuzhiyun vpmadd52huq $H2,$RR0,$D2hi 3722*4882a593Smuzhiyun 3723*4882a593Smuzhiyun vpmadd52luq $H0,$RR0,$D0lo 3724*4882a593Smuzhiyun vpmadd52huq $H0,$RR0,$D0hi 3725*4882a593Smuzhiyun vpmadd52luq $H0,$RR1,$D1lo 3726*4882a593Smuzhiyun vpmadd52huq $H0,$RR1,$D1hi 3727*4882a593Smuzhiyun vpmadd52luq $H0,$RR2,$D2lo 3728*4882a593Smuzhiyun vpmadd52huq $H0,$RR2,$D2hi 3729*4882a593Smuzhiyun 3730*4882a593Smuzhiyun vpmadd52luq $H1,$SS2,$D0lo 3731*4882a593Smuzhiyun vpmadd52huq $H1,$SS2,$D0hi 3732*4882a593Smuzhiyun vpmadd52luq $H1,$RR0,$D1lo 3733*4882a593Smuzhiyun vpmadd52huq $H1,$RR0,$D1hi 3734*4882a593Smuzhiyun vpmadd52luq $H1,$RR1,$D2lo 3735*4882a593Smuzhiyun vpmadd52huq $H1,$RR1,$D2hi 3736*4882a593Smuzhiyun 3737*4882a593Smuzhiyun ################################################################ 3738*4882a593Smuzhiyun # horizontal addition 3739*4882a593Smuzhiyun 3740*4882a593Smuzhiyun mov \$1,%eax 3741*4882a593Smuzhiyun kmovw %eax,%k1 3742*4882a593Smuzhiyun vpsrldq \$8,$D0lo,$T0 
3743*4882a593Smuzhiyun vpsrldq \$8,$D0hi,$H0 3744*4882a593Smuzhiyun vpsrldq \$8,$D1lo,$T1 3745*4882a593Smuzhiyun vpsrldq \$8,$D1hi,$H1 3746*4882a593Smuzhiyun vpaddq $T0,$D0lo,$D0lo 3747*4882a593Smuzhiyun vpaddq $H0,$D0hi,$D0hi 3748*4882a593Smuzhiyun vpsrldq \$8,$D2lo,$T2 3749*4882a593Smuzhiyun vpsrldq \$8,$D2hi,$H2 3750*4882a593Smuzhiyun vpaddq $T1,$D1lo,$D1lo 3751*4882a593Smuzhiyun vpaddq $H1,$D1hi,$D1hi 3752*4882a593Smuzhiyun vpermq \$0x2,$D0lo,$T0 3753*4882a593Smuzhiyun vpermq \$0x2,$D0hi,$H0 3754*4882a593Smuzhiyun vpaddq $T2,$D2lo,$D2lo 3755*4882a593Smuzhiyun vpaddq $H2,$D2hi,$D2hi 3756*4882a593Smuzhiyun 3757*4882a593Smuzhiyun vpermq \$0x2,$D1lo,$T1 3758*4882a593Smuzhiyun vpermq \$0x2,$D1hi,$H1 3759*4882a593Smuzhiyun vpaddq $T0,$D0lo,$D0lo 3760*4882a593Smuzhiyun vpaddq $H0,$D0hi,$D0hi 3761*4882a593Smuzhiyun vpermq \$0x2,$D2lo,$T2 3762*4882a593Smuzhiyun vpermq \$0x2,$D2hi,$H2 3763*4882a593Smuzhiyun vpaddq $T1,$D1lo,$D1lo 3764*4882a593Smuzhiyun vpaddq $H1,$D1hi,$D1hi 3765*4882a593Smuzhiyun vextracti64x4 \$1,$D0lo,%y#$T0 3766*4882a593Smuzhiyun vextracti64x4 \$1,$D0hi,%y#$H0 3767*4882a593Smuzhiyun vpaddq $T2,$D2lo,$D2lo 3768*4882a593Smuzhiyun vpaddq $H2,$D2hi,$D2hi 3769*4882a593Smuzhiyun 3770*4882a593Smuzhiyun vextracti64x4 \$1,$D1lo,%y#$T1 3771*4882a593Smuzhiyun vextracti64x4 \$1,$D1hi,%y#$H1 3772*4882a593Smuzhiyun vextracti64x4 \$1,$D2lo,%y#$T2 3773*4882a593Smuzhiyun vextracti64x4 \$1,$D2hi,%y#$H2 3774*4882a593Smuzhiyun___ 3775*4882a593Smuzhiyun######## switch back to %ymm 3776*4882a593Smuzhiyunmap(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); 3777*4882a593Smuzhiyunmap(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); 3778*4882a593Smuzhiyunmap(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); 3779*4882a593Smuzhiyun 3780*4882a593Smuzhiyun$code.=<<___; 3781*4882a593Smuzhiyun vpaddq $T0,$D0lo,${D0lo}{%k1}{z} 3782*4882a593Smuzhiyun vpaddq $H0,$D0hi,${D0hi}{%k1}{z} 3783*4882a593Smuzhiyun vpaddq $T1,$D1lo,${D1lo}{%k1}{z} 3784*4882a593Smuzhiyun vpaddq 
$H1,$D1hi,${D1hi}{%k1}{z} 3785*4882a593Smuzhiyun vpaddq $T2,$D2lo,${D2lo}{%k1}{z} 3786*4882a593Smuzhiyun vpaddq $H2,$D2hi,${D2hi}{%k1}{z} 3787*4882a593Smuzhiyun 3788*4882a593Smuzhiyun ################################################################ 3789*4882a593Smuzhiyun # partial reduction 3790*4882a593Smuzhiyun vpsrlq \$44,$D0lo,$tmp 3791*4882a593Smuzhiyun vpsllq \$8,$D0hi,$D0hi 3792*4882a593Smuzhiyun vpandq $mask44,$D0lo,$H0 3793*4882a593Smuzhiyun vpaddq $tmp,$D0hi,$D0hi 3794*4882a593Smuzhiyun 3795*4882a593Smuzhiyun vpaddq $D0hi,$D1lo,$D1lo 3796*4882a593Smuzhiyun 3797*4882a593Smuzhiyun vpsrlq \$44,$D1lo,$tmp 3798*4882a593Smuzhiyun vpsllq \$8,$D1hi,$D1hi 3799*4882a593Smuzhiyun vpandq $mask44,$D1lo,$H1 3800*4882a593Smuzhiyun vpaddq $tmp,$D1hi,$D1hi 3801*4882a593Smuzhiyun 3802*4882a593Smuzhiyun vpaddq $D1hi,$D2lo,$D2lo 3803*4882a593Smuzhiyun 3804*4882a593Smuzhiyun vpsrlq \$42,$D2lo,$tmp 3805*4882a593Smuzhiyun vpsllq \$10,$D2hi,$D2hi 3806*4882a593Smuzhiyun vpandq $mask42,$D2lo,$H2 3807*4882a593Smuzhiyun vpaddq $tmp,$D2hi,$D2hi 3808*4882a593Smuzhiyun 3809*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3810*4882a593Smuzhiyun vpsllq \$2,$D2hi,$D2hi 3811*4882a593Smuzhiyun 3812*4882a593Smuzhiyun vpaddq $D2hi,$H0,$H0 3813*4882a593Smuzhiyun 3814*4882a593Smuzhiyun vpsrlq \$44,$H0,$tmp # additional step 3815*4882a593Smuzhiyun vpandq $mask44,$H0,$H0 3816*4882a593Smuzhiyun 3817*4882a593Smuzhiyun vpaddq $tmp,$H1,$H1 3818*4882a593Smuzhiyun 3819*4882a593Smuzhiyun ################################################################ 3820*4882a593Smuzhiyun 3821*4882a593Smuzhiyun vmovq %x#$H0,0($ctx) 3822*4882a593Smuzhiyun vmovq %x#$H1,8($ctx) 3823*4882a593Smuzhiyun vmovq %x#$H2,16($ctx) 3824*4882a593Smuzhiyun vzeroall 3825*4882a593Smuzhiyun 3826*4882a593Smuzhiyun.Lno_data_vpmadd52_8x: 3827*4882a593Smuzhiyun RET 3828*4882a593Smuzhiyun.size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x 3829*4882a593Smuzhiyun___ 3830*4882a593Smuzhiyun} 3831*4882a593Smuzhiyun$code.=<<___; 
.type	poly1305_emit_base2_44,\@function,3
.align	32
poly1305_emit_base2_44:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r9,%rax
	shr	\$20,%r9
	shl	\$44,%rax
	mov	%r10,%rcx
	shr	\$40,%r10
	shl	\$24,%rcx

	add	%rax,%r8
	adc	%rcx,%r9
	adc	\$0,%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	RET
.size	poly1305_emit_base2_44,.-poly1305_emit_base2_44
___
}	}	}	# close nested scopes opened before this chunk
}

# Userspace-only (non-kernel) chacha20-poly1305 helpers.  Both routines XOR
# the input at ($inp,$otp) with the 16-byte-block keystream buffer at $otp,
# write the result to ($out,$otp), store the *ciphertext* back into the $otp
# buffer and zero-pad that buffer to the next 16-byte boundary; the return
# value (%rax) is the pointer just past the padded data.  The ciphertext kept
# at $otp is presumably what gets MAC'ed by Poly1305 afterwards — TODO confirm
# against the callers.
if (!$kernel)
{	# chacha20-poly1305 helpers
my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                  ("%rdi","%rsi","%rdx","%rcx"); # Unix order
$code.=<<___;
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,\@abi-omnipotent
.align	16
xor128_encrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	($inp,$otp),%xmm0
	pxor	($otp),%xmm0
	movdqu	%xmm0,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_enc_xmm

	and	\$15,%r10		# len % 16
	jz	.Ldone_enc

.Ltail_enc:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
.Loop_enc_byte:
	mov	($inp,$otp),%al
	xor	($otp),%al
	mov	%al,($out,$otp)
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_enc_byte

	xor	%eax,%eax
.Loop_enc_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_enc_pad

.Ldone_enc:
	mov	$otp,%rax
	RET
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad

.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,\@abi-omnipotent
.align	16
xor128_decrypt_n_pad:
	sub	$otp,$inp
	sub	$otp,$out
	mov	$len,%r10		# put len aside
	shr	\$4,$len		# len / 16
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	($inp,$otp),%xmm0
	movdqa	($otp),%xmm1
	pxor	%xmm0,%xmm1
	movdqu	%xmm1,($out,$otp)
	movdqa	%xmm0,($otp)
	lea	16($otp),$otp
	dec	$len
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1
	and	\$15,%r10		# len % 16
	jz	.Ldone_dec

.Ltail_dec:
	mov	\$16,$len
	sub	%r10,$len
	xor	%eax,%eax
	xor	%r11d,%r11d
.Loop_dec_byte:
	mov	($inp,$otp),%r11b
	mov	($otp),%al
	xor	%r11b,%al
	mov	%al,($out,$otp)
	mov	%r11b,($otp)
	lea	1($otp),$otp
	dec	%r10
	jnz	.Loop_dec_byte

	xor	%eax,%eax
.Loop_dec_pad:
	mov	%al,($otp)
	lea	1($otp),$otp
	dec	$len
	jnz	.Loop_dec_pad

.Ldone_dec:
	mov	$otp,%rax
	RET
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception-handling support: se_handler restores the
# callee-saved GPRs from the frame of the interrupted Poly1305 routine,
# avx_handler additionally restores xmm6-xmm15 from the stashed SIMD frame;
# both then chain to RtlVirtualUnwind.  The .pdata/.xdata tables below bind
# each code range to its handler.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%ecx,%ecx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	RET
.size	avx_handler,.-avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_init_x86_64
	.rva	.LSEH_end_poly1305_init_x86_64
	.rva	.LSEH_info_poly1305_init_x86_64

	.rva	.LSEH_begin_poly1305_blocks_x86_64
	.rva	.LSEH_end_poly1305_blocks_x86_64
	.rva	.LSEH_info_poly1305_blocks_x86_64

	.rva	.LSEH_begin_poly1305_emit_x86_64
	.rva	.LSEH_end_poly1305_emit_x86_64
	.rva	.LSEH_info_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___ if ($avx>2);
	.rva	.LSEH_begin_poly1305_blocks_avx512
	.rva	.LSEH_end_poly1305_blocks_avx512
	.rva	.LSEH_info_poly1305_blocks_avx512
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_poly1305_init_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_init_x86_64,.LSEH_begin_poly1305_init_x86_64

.LSEH_info_poly1305_blocks_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit_x86_64:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_x86_64,.LSEH_begin_poly1305_emit_x86_64
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx>2);
.LSEH_info_poly1305_blocks_avx512:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx512_body,.Ldo_avx512_epilogue		# HandlerData[]
___
}

# Re-emit this script's own leading '#' comment block (license, provenance)
# as C++-style "//" comments at the top of the generated .S file, stopping at
# the first line that is neither a comment nor blank.
# NOTE(review): 2-arg bareword open, unchecked — upstream cryptogams style,
# deliberately left as-is.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/\/\// and !/^$/);
	print;
}
close SELF;

# Post-process $code line by line:
#  - expand `...` spans by eval'ing them as Perl;
#  - rewrite the #d register-suffix notation to 32-bit register names
#    (%rax#d -> %eax, %r10#d -> %r10d);
#  - collapse the %x#/%y#/%z# cross-width register aliases to plain
#    xmm/ymm/zmm names;
#  - for kernel builds, strip the argument count from .type directives,
#    map \@abi-omnipotent to \@function and drop all .cfi directives.
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%[yz]/%x/g	or s/%y#%z/%y/g or s/%z#%[yz]/%z/g;

	if ($kernel) {
		s/(^\.type.*),[0-9]+$/$1/;
		s/(^\.type.*),\@abi-omnipotent+$/$1,\@function/;
		next if /^\.cfi.*/;
	}

	print $_,"\n";
}
# Catch buffered write errors on the generated output; a truncated .S file
# must not look like success.
close STDOUT or die "error closing STDOUT: $!";