#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	these numbers are for -march=armv6, i.e. with a bunch of ldrb
#	instructions loading the data;
# (**)	these are trade-off results; they can be improved by ~8%, but at
#	the cost of a 15/12% regression on Cortex-A5/A7. It is even
#	possible to improve the Cortex-A9 result, but then A5/A7 lose
#	more than 20%;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
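# The two positional arguments consumed above are the perlasm "flavour"
# (a target name understood by arm-xlate.pl, or "void" for raw output)
# and the output file name.  A typical invocation presumably looks like
#
#	perl poly1305-armv4.pl linux32 poly1305-core.S
#
# (illustrative only; the flavour name and output path are assumptions,
# the exact command comes from the build system).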

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#ifndef	__KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init   poly1305_init_arm
# define poly1305_blocks poly1305_blocks_arm
# define poly1305_emit   poly1305_emit_arm
.globl	poly1305_blocks_neon
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ clear is_base2_26
	add	$ctx,$ctx,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	mov	r3,#-1
	str	r3,[$ctx,#28]		@ impossible key power value
# ifndef __KERNEL__
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
# endif
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	it	ne
	movne	r11,r9
	adr	r12,.Lpoly1305_emit
	orr	r11,r11,#1		@ thumb-ify addresses
	orr	r12,r12,#1
# else
	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	ite	eq
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
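	@ At this point r4-r7 hold the clamped key: the byte loads above
	@ assembled four little-endian 32-bit words, r10 (0x0fffffff)
	@ masked the top four bits of the first word and r3 (0x0ffffffc)
	@ additionally cleared the low two bits of the other three, i.e.
	@ the usual Poly1305 clamp r &= 0x0ffffffc0ffffffc0ffffffc0fffffff.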
#if	__ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

#if __ARM_ARCH__<7
	ldmia	$ctx,{$h0-$r3}		@ load context
	add	$ctx,$ctx,#20
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]
#else
	ldr	lr,[$ctx,#36]		@ is_base2_26
	ldmia	$ctx!,{$h0-$h4}		@ load hash value
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]

	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$r1,$h1,lsr#6
	adcs	$r1,$r1,$h2,lsl#20
	mov	$r2,$h2,lsr#12
	adcs	$r2,$r2,$h3,lsl#14
	mov	$r3,$h3,lsr#18
	adcs	$r3,$r3,$h4,lsl#8
	mov	$len,#0
	teq	lr,#0
	str	$len,[$ctx,#16]		@ clear is_base2_26
	adc	$len,$len,$h4,lsr#24

	itttt	ne
	movne	$h0,$r0			@ choose between radixes
	movne	$h1,$r1
	movne	$h2,$r2
	movne	$h3,$r3
	ldmia	$ctx,{$r0-$r3}		@ load key
	it	ne
	movne	$h4,$len
#endif
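	@ Either way $h0-$h4 now hold the accumulator in base 2^32 (with
	@ $h4 carrying only the few bits above 2^128) and $r0-$r3 hold the
	@ key; $r1-$r3 are offloaded to the stack below so that their
	@ registers can double as $s1-$s3 inside the loop.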

	mov	lr,$inp
	cmp	$padbit,#0
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.align	4
.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
	it	hi
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2

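	@ $s1-$s3 are 5*$r1-5*$r3 scaled down by 4: the clamped key words
	@ have their low two bits clear, so r+(r>>2) == 5*r/4.  Products
	@ whose weight would reach 2^128 are taken against these pre-scaled
	@ values instead, which folds the reduction modulo 2^130-5 straight
	@ into the umull/umlal chain below (d0 accumulates in r0:r1 and d1
	@ in r2:r3).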
	umull	r2,r3,$h1,$r0
	 adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	 mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	 eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

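	@ Partial reduction: the bits of $h4 above bit 1 represent q*4
	@ where q = floor(h/2^130), and 2^130 == 5 (mod 2^130-5), so
	@ q*4 + (q*4)>>2 == 5*q is folded back into the low word, leaving
	@ $h4 no larger than 3 plus a possible carry - the accumulator is
	@ kept only partially reduced between blocks.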
	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmdb	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$ctx;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}

	ldmia	$ctx,{$h0-$h4}

#if __ARM_ARCH__>=7
	ldr	ip,[$ctx,#36]		@ is_base2_26

	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$g1,$h1,lsr#6
	adcs	$g1,$g1,$h2,lsl#20
	mov	$g2,$h2,lsr#12
	adcs	$g2,$g2,$h3,lsl#14
	mov	$g3,$h3,lsr#18
	adcs	$g3,$g3,$h4,lsl#8
	mov	$g4,#0
	adc	$g4,$g4,$h4,lsr#24

	tst	ip,ip
	itttt	ne
	movne	$h0,$g0
	movne	$h1,$g1
	movne	$h2,$g2
	movne	$h3,$g3
	it	ne
	movne	$h4,$g4
#endif

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?
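	@ If h >= 2^130-5 then h+5 carries into bit 130, i.e. bit 2 of $g4
	@ is set; in that case the conditional moves below pick $g0-$g3
	@ (h+5 mod 2^128, which equals h - (2^130-5)), otherwise h is kept
	@ as is before the nonce is added.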

#ifdef	__thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if	__ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
.Lpoly1305_init_neon:
	ldr	r3,[$ctx,#48]		@ first table element
	cmp	r3,#-1			@ is value impossible?
	bne	.Lno_init_neon

	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff
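	@ Written in terms of the four key words k0-k3 just loaded, the
	@ five 26-bit limbs produced above are:
	@   limb0 =  k0 & 0x03ffffff
	@   limb1 = (k0>>26 | k1<<6)  & 0x03ffffff
	@   limb2 = (k1>>20 | k2<<12) & 0x03ffffff
	@   limb3 = (k2>>14 | k3<<18) & 0x03ffffff
	@   limb4 =  k3>>8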

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ is n+2, and so is the sum of four. The sum of 2^m (n-m)-bit
	@ numbers and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are bounded by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 52-bit addends and 5*H4 by 5*5 of them, or 57 bits. But
	@ when hashing the input, H0 is limited by (5*4+1)*3 addends, or
	@ 58 bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this
	@ relevant? The vmlal.u32 instruction accepts 2x32-bit input and
	@ writes a 2x64-bit result. This means that the result of the
	@ reduction has to be compressed upon loop wrap-around. This can
	@ be done in the process of reduction to minimize the number of
	@ instructions [as well as the number of 128-bit instructions,
	@ which benefits low-end processors], but one has to watch for H2
	@ (which is narrower than H0) and 5*H4 not being wider than 58
	@ bits, so that the result of the right shift by 26 bits fits in
	@ 32 bits. This is also useful on x86, because it allows paddd to
	@ be used in place of paddq, which benefits Atom, where paddq is
	@ ridiculously slow.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	 vshr.u64	$T1,$D0,#26
	 vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	 vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	 vshr.u64	$T1,$D1,#26
	 vmovn.i64	$D1#lo,$D1
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	 vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	 vshrn.u64	$T1#lo,$D2,#26
	 vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	 vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	 vshr.u32	$T1#lo,$D3#lo,#26
	 vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs		$zeros,$zeros,#1
	beq		.Lsquare_break_neon

	add		$tbl0,$ctx,#(48+0*9*4)
	add		$tbl1,$ctx,#(48+1*9*4)

	vtrn.32		$R0,$D0#lo		@ r^2:r^1
	vtrn.32		$R2,$D2#lo
	vtrn.32		$R3,$D3#lo
	vtrn.32		$R1,$D1#lo
	vtrn.32		$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32		{${S4}[0]},[$tbl0,:32]
	vst1.32		{${S4}[1]},[$tbl1,:32]

	b		.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add		$tbl0,$ctx,#(48+2*4*9)
	add		$tbl1,$ctx,#(48+3*4*9)

	vmov		$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov		$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov		$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov		$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov		$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32		{${S4}[0]},[$tbl0]
	vst1.32		{${S4}[1]},[$tbl1]
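	@ The key-power table at $ctx+48 now presumably holds four 9-word
	@ groups {r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4}, one each for r^1,
	@ r^2, r^3 and r^4: lane 0 of the d-registers was streamed out
	@ through $tbl0 and lane 1 through $tbl1 in the two passes above.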

.Lno_init_neon:
	ret				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	cmp	$len,#64
	blo	.Lpoly1305_blocks

	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	.Lpoly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	 veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	 veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	 veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	 veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	 veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ set is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lhash_loaded

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor		$D0#lo,$D0#lo,$D0#lo
	veor		$D1#lo,$D1#lo,$D1#lo
	veor		$D2#lo,$D2#lo,$D2#lo
	veor		$D3#lo,$D3#lo,$D3#lo
	veor		$D4#lo,$D4#lo,$D4#lo
	vld4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr		$zeros,.Lzeros
	vld1.32		{$D4#lo[0]},[$ctx]
	sub		$ctx,$ctx,#16		@ rewind

.Lhash_loaded:
	add		$in2,$inp,#32
	mov		$padbit,$padbit,lsl#24
	tst		$len,#31
	beq		.Leven

	vld4.32		{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32		$H4#lo[0],$padbit
	sub		$len,$len,#16
	add		$in2,$inp,#32

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov		$tbl1,$zeros
	add		$tbl0,$ctx,#48

	cmp		$len,$len
	b		.Long_tail

.align	4
.Leven:
	subs		$len,$len,#64
	it		lo
	movlo		$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add		$inp,$inp,#64
	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add		$in2,$in2,#64
	itt		hi
	addhi		$tbl1,$ctx,#(48+1*9*4)
	addhi		$tbl0,$ctx,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000

	bls		.Lskip_loop

	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b		.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs		$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it		lo
	movlo		$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32		{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add		$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32		${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]

	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32		{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add		$inp,$inp,#64
# ifdef	__ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	 vshr.u64	$T1,$D0,#26
	 vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	  vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	  vshl.u32	$H3,$H3,#18
	 vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	 vshr.u64	$T1,$D1,#26
	 vmovn.i64	$D1#lo,$D1
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	  vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	  vshl.u32	$H2,$H2,#12
	 vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	  vbic.i32	$H3,#0xfc000000
	 vshrn.u64	$T1#lo,$D2,#26
	 vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	  vsri.u32	$H2,$H1,#20
	 vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	  vshl.u32	$H1,$H1,#6
	 vbic.i32	$D2#lo,#0xfc000000
	  vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	  vsri.u32	$H1,$H0,#26
	  vbic.i32	$H0,#0xfc000000
	 vshr.u32	$T1#lo,$D3#lo,#26
	 vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	 vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	  vbic.i32	$H1,#0xfc000000

	bhi		.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add		$tbl1,$ctx,#(48+0*9*4)
	add		$tbl0,$ctx,#(48+1*9*4)
	adds		$len,$len,#32
	it		ne
	movne		$len,#0
	bne		.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32		${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	 it		ne
	 addne		$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	 it		ne
	 addne		$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	 vorn		$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	 vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq		.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32		{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32		{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32		{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32		{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32		${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32		${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	 vorn		$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	 vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	 vshr.u64	$T1,$D0,#26
	 vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	 vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	 vshr.u64	$T1,$D1,#26
	 vand.i64	$D1,$D1,$MASK
	 vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	 vshr.u64	$T1,$D2,#26
	 vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	 vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	 vshr.u64	$T1,$D3,#26
	 vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	 vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp		$len,#0
	bne		.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32		{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32		{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}			@ epilogue
	ldmia	sp!,{r4-r7}
	ret					@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#ifndef	__KERNEL__
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
#endif
___
}	}
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
.align	2
___

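# Post-process the generated code: evaluate any `...` constructs, map the
# synthetic q<N>#lo/q<N>#hi operands onto the underlying d registers
# (d(2N) and d(2N+1)), rewrite "ret" as "bx lr", and encode any literal
# "bx lr" as .word 0xe12fff1e so that the output still assembles with
# -march=armv4 (the last three substitutions are chained with "or", so
# at most one of them applies to a given line).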
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT; # enforce flush