Lines Matching +full:3 +full:d1

2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
149 .long 2,2,2,3,2,0,2,1
151 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
154 .long 0,1,1,2,2,3,7,7
180 my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
195 mov %rdx,$d1
205 adc %rdx,$d1
209 mov $d1,$h1
220 and \$3,$h2
231 # unsigned __int64 h[3]; # current hash value base 2^64
247 &declare_function("poly1305_init_x86_64", 32, 3);
274 $code.=<<___ if (!$kernel && $avx>3);
381 &declare_function("poly1305_emit_x86_64", 32, 3);
414 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
420 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
451 mov $h0,$d1
456 shr \$26,$d1
462 and $d1#d,%eax
469 shr \$26,$d1
477 or $d1,%rax
481 mov %eax,`16*3+0-64`($ctx)
483 mov %edx,`16*3+4-64`($ctx)
486 mov $h1,$d1
492 shr \$14,$d1
494 and $d1#d,%eax
501 shr \$26,$d1
507 or %rax,$d1
508 mov $d1#d,`16*7+0-64`($ctx)
509 lea ($d1,$d1,4),$d1 # *5
512 mov $d1#d,`16*8+0-64`($ctx)
516 call __poly1305_block # r^3
518 mov \$0x3ffffff,%eax # save r^3 base 2^26
519 mov $h0,$d1
521 shr \$26,$d1
525 and $d1#d,%edx
528 shr \$26,$d1
533 or $d1,%rax
535 mov %eax,`16*3+12-64`($ctx)
537 mov $h1,$d1
541 shr \$14,$d1
542 and $d1#d,%edx
545 shr \$26,$d1
550 or %rax,$d1
551 mov $d1#d,`16*7+12-64`($ctx)
552 lea ($d1,$d1,4),$d1 # *5
553 mov $d1#d,`16*8+12-64`($ctx)
559 mov $h0,$d1
561 shr \$26,$d1
565 and $d1#d,%edx
568 shr \$26,$d1
573 or $d1,%rax
575 mov %eax,`16*3+8-64`($ctx)
577 mov $h1,$d1
581 shr \$14,$d1
582 and $d1#d,%edx
585 shr \$26,$d1
590 or %rax,$d1
591 mov $d1#d,`16*7+8-64`($ctx)
592 lea ($d1,$d1,4),$d1 # *5
593 mov $d1#d,`16*8+8-64`($ctx)
639 mov 0($ctx),$d1 # load hash value
647 mov $d1#d,$h0#d
648 and \$`-1*(1<<31)`,$d1
653 shr \$6,$d1
655 add $d1,$h0
661 mov $h2,$d1
662 shl \$40,$d1
664 add $d1,$h1
668 mov $h2,$d1
670 shr \$2,$d1
671 and \$3,$h2
672 add $d2,$d1 # =*5
673 add $d1,$h0
707 and \$0x3ffffff,$h1 # h[3]
801 mov $h1,$d1
805 shl \$12,$d1
808 or $d1,$h0
812 and \$0x3ffffff,$h1 # h[3]
848 vmovd 4*3($ctx),$H3
881 vmovdqu `16*3`($ctx),$D4 # preload r0^2
882 lea `16*3+64`($ctx),$ctx # size optimization
888 vmovdqu 16*3($inp),$T1
895 vpunpcklqdq $T3,$T2,$T3 # 2:3
904 vpand $MASK,$T3,$T3 # 3
910 vmovdqu `16*1-64`($ctx),$D1
916 vpshufd \$0xEE,$D1,$D4
917 vmovdqu `16*3-64`($ctx),$D0
918 vpshufd \$0x44,$D1,$D1
920 vmovdqa $D1,0x10(%rsp)
922 vmovdqu `16*4-64`($ctx),$D1
931 vpshufd \$0xEE,$D1,$D3
933 vpshufd \$0x44,$D1,$D1
935 vmovdqa $D1,0x40(%rsp)
937 vmovdqu `16*7-64`($ctx),$D1
946 vpshufd \$0xEE,$D1,$D4
947 vpshufd \$0x44,$D1,$D1
949 vmovdqa $D1,0x70(%rsp)
962 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
965 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
968 # Note that we start with inp[2:3]*r^2. This is because it
974 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
981 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1003 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1014 vpaddq $H0,$D1,$D1 # d1 += h4*s2
1027 vpaddq $H1,$D1,$D1 # d1 += h3*s3
1033 vpaddq $T2,$D1,$D1 # d1 += h2*s4
1048 vpunpcklqdq $H3,$H2,$H3 # 2:3
1059 vpand $MASK,$H3,$H3 # 3
1079 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1085 vpaddq $T1,$D1,$D1
1104 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1115 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1130 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1131 vmovdqu 16*3($inp),$T1 #
1137 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1149 vpunpcklqdq $T3,$T2,$T3 # 2:3
1161 vpand $MASK,$T3,$T3 # 3
1174 vpaddq $D0,$D1,$H1 # h0 -> h1
1179 vpsrlq \$26,$H1,$D1
1181 vpaddq $D1,$H2,$H2 # h1 -> h2
1203 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1225 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1231 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1240 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1244 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1257 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1271 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1282 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1295 vpunpcklqdq $H3,$H2,$H3 # 2:3
1304 vpand $MASK,$H3,$H3 # 3
1315 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1320 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1334 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1338 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1351 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1365 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1376 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1386 vpsrldq \$8,$D1,$T1
1392 vpaddq $T1,$D1,$D1
1404 vpaddq $H0,$D1,$D1 # h0 -> h1
1409 vpsrlq \$26,$D1,$H1
1410 vpand $MASK,$D1,$D1
1423 vpaddq $H0,$D1,$D1 # h0 -> h1
1430 vmovd $D1,`4*1-48-64`($ctx)
1432 vmovd $D3,`4*3-48-64`($ctx)
1460 &declare_function("poly1305_emit_avx", 32, 3);
1489 and \$3,%r10
1517 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1561 mov 0($ctx),$d1 # load hash value
1569 mov $d1#d,$h0#d
1570 and \$`-1*(1<<31)`,$d1
1575 shr \$6,$d1
1577 add $d1,$h0
1583 mov $h2,$d1
1584 shl \$40,$d1
1586 add $d1,$h1
1590 mov $h2,$d1
1592 shr \$2,$d1
1593 and \$3,$h2
1594 add $d2,$d1 # =*5
1595 add $d1,$h0
1635 and \$0x3ffffff,$h1 # h[3]
1734 mov $h1,$d1
1738 shl \$12,$d1
1741 or $d1,$h0
1745 and \$0x3ffffff,$h1 # h[3]
1792 vmovd 4*3($ctx),%x#$H3
1839 vmovdqu `16*3-64`($ctx),%x#$D0
1840 vmovdqu `16*4-64`($ctx),%x#$D1
1852 vpermd $D1,$T0,$D1
1857 vmovdqa $D1,0x80-0x90(%rax)
1871 vinserti128 \$1,16*3($inp),$T1,$T1
1877 vpunpcklqdq $T3,$T2,$T2 # 2:3
1887 vpand $MASK,$T3,$T3 # 3
1899 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1901 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1910 vmovdqa `32*3`(%rsp),$T2 # r2^4
1918 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1927 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1934 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1938 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1949 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1961 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1967 vinserti128 \$1,16*3($inp),$T1,$T1
1978 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1987 vpunpcklqdq $T3,$T2,$T3 # 2:3
2003 vpaddq $D0,$D1,$H1 # h0 -> h1
2010 vpsrlq \$26,$H1,$D1
2012 vpaddq $D1,$H2,$H2 # h1 -> h2
2040 vpand $MASK,$T3,$T3 # 3
2060 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2069 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2073 vpaddq $T4,$D1,$D1 # d1 += h0*r1
2084 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2093 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2106 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2122 vpsrldq \$8,$D1,$T1
2127 vpaddq $T1,$D1,$D1
2136 vpermq \$0x2,$D1,$T1
2141 vpaddq $T1,$D1,$D1
2153 vpaddq $D0,$D1,$H1 # h0 -> h1
2158 vpsrlq \$26,$H1,$D1
2160 vpaddq $D1,$H2,$H2 # h1 -> h2
2181 vmovd %x#$H3,`4*3-48-64`($ctx)
2213 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2251 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2254 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2262 vpermd $D1,$T2,$R1
2286 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2292 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2305 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2317 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2330 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2341 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2359 vpaddq $M0,$D1,$D1 # d0 -> d1
2364 vpsrlq \$26,$D1,$M1
2365 vpandq $MASK,$D1,$D1
2366 vpaddq $M1,$D2,$D2 # d1 -> d2
2378 vpaddq $M0,$D1,$D1 # d0 -> d1
2400 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2407 vpermd $D1,$M0,${R1}{%k1}
2432 #vpandq $MASK,$T3,$T3 # 3
2446 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2448 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2458 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2467 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2475 vpandq $MASK,$T3,$T3 # 3
2476 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2493 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2513 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2524 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2531 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2555 vpsrlq \$26,$H1,$D1
2557 vpaddq $D1,$H2,$H2 # h1 -> h2
2584 #vpandq $MASK,$T3,$T3 # 3
2617 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2618 vpandq $MASK,$T3,$T3 # 3
2633 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2652 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2655 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2664 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2671 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2681 vpermq \$0xb1,$H1,$D1
2686 vpaddq $D1,$H1,$H1
2693 vpermq \$0x2,$H1,$D1
2698 vpaddq $D1,$H1,$H1
2704 vextracti64x4 \$0x1,$H1,%y#$D1
2709 vpaddq $D1,$H1,${H1}{%k3}{z}
2713 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2727 vpunpcklqdq $T3,$T2,$T2 # 2:3
2734 vpsrlq \$26,$H1,$D1
2738 vpaddq $D1,$H2,$H2 # h1 -> h2
2760 vpand $MASK,$T3,$T3 # 3
2772 vmovd %x#$H3,`4*3-48-64`($ctx)
2827 if (!$kernel && $avx>3) {
2844 # unsigned __int64 h[3]; # current hash value base 2^44
2846 # unsigned __int64 r[3]; # key value base 2^44
2847 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2850 # # memory, R[3] is R[1]*20
2853 .type poly1305_init_base2_44,\@function,3
2914 # if powers of the key are not calculated yet, process up to 3
2919 mov \$3,%rax
3052 test \$3,$len # is length 4*n+2?
3076 # at this point 64-bit lanes are ordered as 3-1-2-0
3187 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3191 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3203 test \$3,$len # is length 4*n+2?
3493 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3832 .type poly1305_emit_base2_44,\@function,3