xref: /OK3568_Linux_fs/kernel/arch/mips/crypto/poly1305-mips.pl (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun#!/usr/bin/env perl
2*4882a593Smuzhiyun# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
3*4882a593Smuzhiyun#
4*4882a593Smuzhiyun# ====================================================================
5*4882a593Smuzhiyun# Written by Andy Polyakov, @dot-asm, originally for the OpenSSL
6*4882a593Smuzhiyun# project.
7*4882a593Smuzhiyun# ====================================================================
8*4882a593Smuzhiyun
9*4882a593Smuzhiyun# Poly1305 hash for MIPS.
10*4882a593Smuzhiyun#
11*4882a593Smuzhiyun# May 2016
12*4882a593Smuzhiyun#
13*4882a593Smuzhiyun# Numbers are cycles per processed byte with poly1305_blocks alone.
14*4882a593Smuzhiyun#
15*4882a593Smuzhiyun#		IALU/gcc
16*4882a593Smuzhiyun# R1x000	~5.5/+130%	(big-endian)
17*4882a593Smuzhiyun# Octeon II	2.50/+70%	(little-endian)
18*4882a593Smuzhiyun#
19*4882a593Smuzhiyun# March 2019
20*4882a593Smuzhiyun#
21*4882a593Smuzhiyun# Add 32-bit code path.
22*4882a593Smuzhiyun#
23*4882a593Smuzhiyun# October 2019
24*4882a593Smuzhiyun#
25*4882a593Smuzhiyun# Modulo-scheduling the reduction makes it possible to omit the
26*4882a593Smuzhiyun# dependency chain at the end of the inner loop, improving performance.
27*4882a593Smuzhiyun# Also optimize MIPS32R2 code path for MIPS 1004K core, per René von Dorst's suggestions.
28*4882a593Smuzhiyun#
29*4882a593Smuzhiyun#		IALU/gcc
30*4882a593Smuzhiyun# R1x000	~9.8/?		(big-endian)
31*4882a593Smuzhiyun# Octeon II	3.65/+140%	(little-endian)
32*4882a593Smuzhiyun# MT7621/1004K	4.75/?		(little-endian)
33*4882a593Smuzhiyun#
34*4882a593Smuzhiyun######################################################################
35*4882a593Smuzhiyun# There are a number of MIPS ABIs in use; O32 and N32/64 are the most
36*4882a593Smuzhiyun# widely used. Then there is a new contender: NUBI. It appears that if
37*4882a593Smuzhiyun# one picks the latter, it's possible to arrange code in an ABI-neutral
38*4882a593Smuzhiyun# manner. Therefore let's stick to the NUBI register layout:
39*4882a593Smuzhiyun#
40*4882a593Smuzhiyun($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));	# NUBI name -> register-number string for interpolation into the assembly: $0,$1,$2,$24,$25
41*4882a593Smuzhiyun($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));	# $a0..$a7 = $4..$11
42*4882a593Smuzhiyun($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));	# $s0..$s11 = $12..$23
43*4882a593Smuzhiyun($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));	# $gp = $3; $tp,$sp,$fp,$ra = $28..$31
44*4882a593Smuzhiyun#
45*4882a593Smuzhiyun# The return value is placed in $a0. The following coding rules facilitate
46*4882a593Smuzhiyun# interoperability:
47*4882a593Smuzhiyun#
48*4882a593Smuzhiyun# - never ever touch $tp, "thread pointer", former $gp [o32 can be
49*4882a593Smuzhiyun#   excluded from the rule, because it's specified volatile];
50*4882a593Smuzhiyun# - copy return value to $t0, former $v0 [or to $a0 if you're adapting
51*4882a593Smuzhiyun#   old code];
52*4882a593Smuzhiyun# - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
53*4882a593Smuzhiyun#
54*4882a593Smuzhiyun# For reference here is register layout for N32/64 MIPS ABIs:
55*4882a593Smuzhiyun#
56*4882a593Smuzhiyun# ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
57*4882a593Smuzhiyun# ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
58*4882a593Smuzhiyun# ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
59*4882a593Smuzhiyun# ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
60*4882a593Smuzhiyun# ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
61*4882a593Smuzhiyun#
62*4882a593Smuzhiyun# <appro@openssl.org>
63*4882a593Smuzhiyun#
64*4882a593Smuzhiyun######################################################################
65*4882a593Smuzhiyun
66*4882a593Smuzhiyun$flavour = shift || "64"; # first command-line argument, defaulting to 64; supported flavours are o32,n32,64,nubi32,nubi64
67*4882a593Smuzhiyun
68*4882a593Smuzhiyun$v0 = ($flavour =~ /nubi/i) ? $a0 : $t0;	# return-value register: $a0 for NUBI flavours, otherwise $t0 (register $2, the former $v0)
69*4882a593Smuzhiyun
70*4882a593Smuzhiyunif ($flavour =~ /64|n32/i) {{{
71*4882a593Smuzhiyun######################################################################
72*4882a593Smuzhiyun# 64-bit code path
73*4882a593Smuzhiyun#
74*4882a593Smuzhiyun
75*4882a593Smuzhiyunmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);	# function arguments, taken from $a0..$a3
76*4882a593Smuzhiyunmy ($in0,$in1,$tmp0,$tmp1,$tmp2,$tmp3,$tmp4) = ($a4,$a5,$a6,$a7,$at,$t0,$t1);	# two 64-bit input words plus five scratch registers; no s-register is used here
77*4882a593Smuzhiyun
78*4882a593Smuzhiyun$code.=<<___;	# emit the cpp prologue (R6 multiply macros, kernel symbol names, MSB/LSB offsets) and poly1305_init: zero the hash at 0..16(ctx), then unless inp is NULL load and clamp the key, store r0,r1 at 24,32(ctx) and s1 = r1 + (r1 >> 2) at 40(ctx); returns 0
79*4882a593Smuzhiyun#if (defined(_MIPS_ARCH_MIPS64R3) || defined(_MIPS_ARCH_MIPS64R5) || \\
80*4882a593Smuzhiyun     defined(_MIPS_ARCH_MIPS64R6)) \\
81*4882a593Smuzhiyun     && !defined(_MIPS_ARCH_MIPS64R2)
82*4882a593Smuzhiyun# define _MIPS_ARCH_MIPS64R2
83*4882a593Smuzhiyun#endif
84*4882a593Smuzhiyun
85*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
86*4882a593Smuzhiyun# define dmultu(rs,rt)
87*4882a593Smuzhiyun# define mflo(rd,rs,rt)	dmulu	rd,rs,rt
88*4882a593Smuzhiyun# define mfhi(rd,rs,rt)	dmuhu	rd,rs,rt
89*4882a593Smuzhiyun#else
90*4882a593Smuzhiyun# define dmultu(rs,rt)		dmultu	rs,rt
91*4882a593Smuzhiyun# define mflo(rd,rs,rt)	mflo	rd
92*4882a593Smuzhiyun# define mfhi(rd,rs,rt)	mfhi	rd
93*4882a593Smuzhiyun#endif
94*4882a593Smuzhiyun
95*4882a593Smuzhiyun#ifdef	__KERNEL__
96*4882a593Smuzhiyun# define poly1305_init   poly1305_init_mips
97*4882a593Smuzhiyun# define poly1305_blocks poly1305_blocks_mips
98*4882a593Smuzhiyun# define poly1305_emit   poly1305_emit_mips
99*4882a593Smuzhiyun#endif
100*4882a593Smuzhiyun
101*4882a593Smuzhiyun#if defined(__MIPSEB__) && !defined(MIPSEB)
102*4882a593Smuzhiyun# define MIPSEB
103*4882a593Smuzhiyun#endif
104*4882a593Smuzhiyun
105*4882a593Smuzhiyun#ifdef MIPSEB
106*4882a593Smuzhiyun# define MSB 0
107*4882a593Smuzhiyun# define LSB 7
108*4882a593Smuzhiyun#else
109*4882a593Smuzhiyun# define MSB 7
110*4882a593Smuzhiyun# define LSB 0
111*4882a593Smuzhiyun#endif
112*4882a593Smuzhiyun
113*4882a593Smuzhiyun.text
114*4882a593Smuzhiyun.set	noat
115*4882a593Smuzhiyun.set	noreorder
116*4882a593Smuzhiyun
117*4882a593Smuzhiyun.align	5
118*4882a593Smuzhiyun.globl	poly1305_init
119*4882a593Smuzhiyun.ent	poly1305_init
120*4882a593Smuzhiyunpoly1305_init:
121*4882a593Smuzhiyun	.frame	$sp,0,$ra
122*4882a593Smuzhiyun	.set	reorder
123*4882a593Smuzhiyun
124*4882a593Smuzhiyun	sd	$zero,0($ctx)
125*4882a593Smuzhiyun	sd	$zero,8($ctx)
126*4882a593Smuzhiyun	sd	$zero,16($ctx)
127*4882a593Smuzhiyun
128*4882a593Smuzhiyun	beqz	$inp,.Lno_key
129*4882a593Smuzhiyun
130*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
131*4882a593Smuzhiyun	andi	$tmp0,$inp,7		# $inp % 8
132*4882a593Smuzhiyun	dsubu	$inp,$inp,$tmp0		# align $inp
133*4882a593Smuzhiyun	sll	$tmp0,$tmp0,3		# byte to bit offset
134*4882a593Smuzhiyun	ld	$in0,0($inp)
135*4882a593Smuzhiyun	ld	$in1,8($inp)
136*4882a593Smuzhiyun	beqz	$tmp0,.Laligned_key
137*4882a593Smuzhiyun	ld	$tmp2,16($inp)
138*4882a593Smuzhiyun
139*4882a593Smuzhiyun	subu	$tmp1,$zero,$tmp0
140*4882a593Smuzhiyun# ifdef	MIPSEB
141*4882a593Smuzhiyun	dsllv	$in0,$in0,$tmp0
142*4882a593Smuzhiyun	dsrlv	$tmp3,$in1,$tmp1
143*4882a593Smuzhiyun	dsllv	$in1,$in1,$tmp0
144*4882a593Smuzhiyun	dsrlv	$tmp2,$tmp2,$tmp1
145*4882a593Smuzhiyun# else
146*4882a593Smuzhiyun	dsrlv	$in0,$in0,$tmp0
147*4882a593Smuzhiyun	dsllv	$tmp3,$in1,$tmp1
148*4882a593Smuzhiyun	dsrlv	$in1,$in1,$tmp0
149*4882a593Smuzhiyun	dsllv	$tmp2,$tmp2,$tmp1
150*4882a593Smuzhiyun# endif
151*4882a593Smuzhiyun	or	$in0,$in0,$tmp3
152*4882a593Smuzhiyun	or	$in1,$in1,$tmp2
153*4882a593Smuzhiyun.Laligned_key:
154*4882a593Smuzhiyun#else
155*4882a593Smuzhiyun	ldl	$in0,0+MSB($inp)
156*4882a593Smuzhiyun	ldl	$in1,8+MSB($inp)
157*4882a593Smuzhiyun	ldr	$in0,0+LSB($inp)
158*4882a593Smuzhiyun	ldr	$in1,8+LSB($inp)
159*4882a593Smuzhiyun#endif
160*4882a593Smuzhiyun#ifdef	MIPSEB
161*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS64R2)
162*4882a593Smuzhiyun	dsbh	$in0,$in0		# byte swap
163*4882a593Smuzhiyun	 dsbh	$in1,$in1
164*4882a593Smuzhiyun	dshd	$in0,$in0
165*4882a593Smuzhiyun	 dshd	$in1,$in1
166*4882a593Smuzhiyun# else
167*4882a593Smuzhiyun	ori	$tmp0,$zero,0xFF
168*4882a593Smuzhiyun	dsll	$tmp2,$tmp0,32
169*4882a593Smuzhiyun	or	$tmp0,$tmp2		# 0x000000FF000000FF
170*4882a593Smuzhiyun
171*4882a593Smuzhiyun	and	$tmp1,$in0,$tmp0	# byte swap
172*4882a593Smuzhiyun	 and	$tmp3,$in1,$tmp0
173*4882a593Smuzhiyun	dsrl	$tmp2,$in0,24
174*4882a593Smuzhiyun	 dsrl	$tmp4,$in1,24
175*4882a593Smuzhiyun	dsll	$tmp1,24
176*4882a593Smuzhiyun	 dsll	$tmp3,24
177*4882a593Smuzhiyun	and	$tmp2,$tmp0
178*4882a593Smuzhiyun	 and	$tmp4,$tmp0
179*4882a593Smuzhiyun	dsll	$tmp0,8			# 0x0000FF000000FF00
180*4882a593Smuzhiyun	or	$tmp1,$tmp2
181*4882a593Smuzhiyun	 or	$tmp3,$tmp4
182*4882a593Smuzhiyun	and	$tmp2,$in0,$tmp0
183*4882a593Smuzhiyun	 and	$tmp4,$in1,$tmp0
184*4882a593Smuzhiyun	dsrl	$in0,8
185*4882a593Smuzhiyun	 dsrl	$in1,8
186*4882a593Smuzhiyun	dsll	$tmp2,8
187*4882a593Smuzhiyun	 dsll	$tmp4,8
188*4882a593Smuzhiyun	and	$in0,$tmp0
189*4882a593Smuzhiyun	 and	$in1,$tmp0
190*4882a593Smuzhiyun	or	$tmp1,$tmp2
191*4882a593Smuzhiyun	 or	$tmp3,$tmp4
192*4882a593Smuzhiyun	or	$in0,$tmp1
193*4882a593Smuzhiyun	 or	$in1,$tmp3
194*4882a593Smuzhiyun	dsrl	$tmp1,$in0,32
195*4882a593Smuzhiyun	 dsrl	$tmp3,$in1,32
196*4882a593Smuzhiyun	dsll	$in0,32
197*4882a593Smuzhiyun	 dsll	$in1,32
198*4882a593Smuzhiyun	or	$in0,$tmp1
199*4882a593Smuzhiyun	 or	$in1,$tmp3
200*4882a593Smuzhiyun# endif
201*4882a593Smuzhiyun#endif
202*4882a593Smuzhiyun	li	$tmp0,1
203*4882a593Smuzhiyun	dsll	$tmp0,32		# 0x0000000100000000
204*4882a593Smuzhiyun	daddiu	$tmp0,-63		# 0x00000000ffffffc1
205*4882a593Smuzhiyun	dsll	$tmp0,28		# 0x0ffffffc10000000
206*4882a593Smuzhiyun	daddiu	$tmp0,-1		# 0x0ffffffc0fffffff
207*4882a593Smuzhiyun
208*4882a593Smuzhiyun	and	$in0,$tmp0
209*4882a593Smuzhiyun	daddiu	$tmp0,-3		# 0x0ffffffc0ffffffc
210*4882a593Smuzhiyun	and	$in1,$tmp0
211*4882a593Smuzhiyun
212*4882a593Smuzhiyun	sd	$in0,24($ctx)
213*4882a593Smuzhiyun	dsrl	$tmp0,$in1,2
214*4882a593Smuzhiyun	sd	$in1,32($ctx)
215*4882a593Smuzhiyun	daddu	$tmp0,$in1		# s1 = r1 + (r1 >> 2)
216*4882a593Smuzhiyun	sd	$tmp0,40($ctx)
217*4882a593Smuzhiyun
218*4882a593Smuzhiyun.Lno_key:
219*4882a593Smuzhiyun	li	$v0,0			# return 0
220*4882a593Smuzhiyun	jr	$ra
221*4882a593Smuzhiyun.end	poly1305_init
222*4882a593Smuzhiyun___
223*4882a593Smuzhiyun{
224*4882a593Smuzhiyunmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x0003f000" : "0x00030000";	# .mask bits: registers 12-17 (s0-s5) for NUBI, otherwise 16-17 (s4,s5)
225*4882a593Smuzhiyun
226*4882a593Smuzhiyunmy ($h0,$h1,$h2,$r0,$r1,$rs1,$d0,$d1,$d2) =
227*4882a593Smuzhiyun   ($s0,$s1,$s2,$s3,$s4,$s5,$in0,$in1,$t2);	# hash h0-h2 and key r0,r1,rs1 live in s0-s5; d0,d1 reuse the input registers, d2 is $t2
228*4882a593Smuzhiyunmy ($shr,$shl) = ($s6,$s7);		# used on R6: bit-shift counts for realigning unaligned input
229*4882a593Smuzhiyun
230*4882a593Smuzhiyun$code.=<<___;	# emit poly1305_blocks (returns at once when len >> 4 is zero, else branches to poly1305_blocks_internal) and the start of the internal prologue
231*4882a593Smuzhiyun.align	5
232*4882a593Smuzhiyun.globl	poly1305_blocks
233*4882a593Smuzhiyun.ent	poly1305_blocks
234*4882a593Smuzhiyunpoly1305_blocks:
235*4882a593Smuzhiyun	.set	noreorder
236*4882a593Smuzhiyun	dsrl	$len,4			# number of complete blocks
237*4882a593Smuzhiyun	bnez	$len,poly1305_blocks_internal
238*4882a593Smuzhiyun	nop
239*4882a593Smuzhiyun	jr	$ra
240*4882a593Smuzhiyun	nop
241*4882a593Smuzhiyun.end	poly1305_blocks
242*4882a593Smuzhiyun
243*4882a593Smuzhiyun.align	5
244*4882a593Smuzhiyun.ent	poly1305_blocks_internal
245*4882a593Smuzhiyunpoly1305_blocks_internal:
246*4882a593Smuzhiyun	.set	noreorder
247*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
248*4882a593Smuzhiyun	.frame	$sp,8*8,$ra
249*4882a593Smuzhiyun	.mask	$SAVED_REGS_MASK|0x000c0000,-8
250*4882a593Smuzhiyun	dsubu	$sp,8*8
251*4882a593Smuzhiyun	sd	$s7,56($sp)
252*4882a593Smuzhiyun	sd	$s6,48($sp)
253*4882a593Smuzhiyun#else
254*4882a593Smuzhiyun	.frame	$sp,6*8,$ra
255*4882a593Smuzhiyun	.mask	$SAVED_REGS_MASK,-8
256*4882a593Smuzhiyun	dsubu	$sp,6*8
257*4882a593Smuzhiyun#endif
258*4882a593Smuzhiyun	sd	$s5,40($sp)
259*4882a593Smuzhiyun	sd	$s4,32($sp)
260*4882a593Smuzhiyun___
261*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i);	# NUBI flavours only: also save s0-s3; other flavours skip these stores (optimize non-nubi prologue)
262*4882a593Smuzhiyun	sd	$s3,24($sp)
263*4882a593Smuzhiyun	sd	$s2,16($sp)
264*4882a593Smuzhiyun	sd	$s1,8($sp)
265*4882a593Smuzhiyun	sd	$s0,0($sp)
266*4882a593Smuzhiyun___
267*4882a593Smuzhiyun$code.=<<___;	# emit the body of poly1305_blocks_internal: load hash and key from ctx, then per 16-byte block load (byte-swapping on big-endian), accumulate input and padbit, multiply by the key with the reduction modulo-scheduled into the next iteration; finally store the hash and begin the epilogue
268*4882a593Smuzhiyun	.set	reorder
269*4882a593Smuzhiyun
270*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
271*4882a593Smuzhiyun	andi	$shr,$inp,7
272*4882a593Smuzhiyun	dsubu	$inp,$inp,$shr		# align $inp
273*4882a593Smuzhiyun	sll	$shr,$shr,3		# byte to bit offset
274*4882a593Smuzhiyun	subu	$shl,$zero,$shr
275*4882a593Smuzhiyun#endif
276*4882a593Smuzhiyun
277*4882a593Smuzhiyun	ld	$h0,0($ctx)		# load hash value
278*4882a593Smuzhiyun	ld	$h1,8($ctx)
279*4882a593Smuzhiyun	ld	$h2,16($ctx)
280*4882a593Smuzhiyun
281*4882a593Smuzhiyun	ld	$r0,24($ctx)		# load key
282*4882a593Smuzhiyun	ld	$r1,32($ctx)
283*4882a593Smuzhiyun	ld	$rs1,40($ctx)
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun	dsll	$len,4
286*4882a593Smuzhiyun	daddu	$len,$inp		# end of buffer
287*4882a593Smuzhiyun	b	.Loop
288*4882a593Smuzhiyun
289*4882a593Smuzhiyun.align	4
290*4882a593Smuzhiyun.Loop:
291*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
292*4882a593Smuzhiyun	ld	$in0,0($inp)		# load input
293*4882a593Smuzhiyun	ld	$in1,8($inp)
294*4882a593Smuzhiyun	beqz	$shr,.Laligned_inp
295*4882a593Smuzhiyun
296*4882a593Smuzhiyun	ld	$tmp2,16($inp)
297*4882a593Smuzhiyun# ifdef	MIPSEB
298*4882a593Smuzhiyun	dsllv	$in0,$in0,$shr
299*4882a593Smuzhiyun	dsrlv	$tmp3,$in1,$shl
300*4882a593Smuzhiyun	dsllv	$in1,$in1,$shr
301*4882a593Smuzhiyun	dsrlv	$tmp2,$tmp2,$shl
302*4882a593Smuzhiyun# else
303*4882a593Smuzhiyun	dsrlv	$in0,$in0,$shr
304*4882a593Smuzhiyun	dsllv	$tmp3,$in1,$shl
305*4882a593Smuzhiyun	dsrlv	$in1,$in1,$shr
306*4882a593Smuzhiyun	dsllv	$tmp2,$tmp2,$shl
307*4882a593Smuzhiyun# endif
308*4882a593Smuzhiyun	or	$in0,$in0,$tmp3
309*4882a593Smuzhiyun	or	$in1,$in1,$tmp2
310*4882a593Smuzhiyun.Laligned_inp:
311*4882a593Smuzhiyun#else
312*4882a593Smuzhiyun	ldl	$in0,0+MSB($inp)	# load input
313*4882a593Smuzhiyun	ldl	$in1,8+MSB($inp)
314*4882a593Smuzhiyun	ldr	$in0,0+LSB($inp)
315*4882a593Smuzhiyun	ldr	$in1,8+LSB($inp)
316*4882a593Smuzhiyun#endif
317*4882a593Smuzhiyun	daddiu	$inp,16
318*4882a593Smuzhiyun#ifdef	MIPSEB
319*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS64R2)
320*4882a593Smuzhiyun	dsbh	$in0,$in0		# byte swap
321*4882a593Smuzhiyun	 dsbh	$in1,$in1
322*4882a593Smuzhiyun	dshd	$in0,$in0
323*4882a593Smuzhiyun	 dshd	$in1,$in1
324*4882a593Smuzhiyun# else
325*4882a593Smuzhiyun	ori	$tmp0,$zero,0xFF
326*4882a593Smuzhiyun	dsll	$tmp2,$tmp0,32
327*4882a593Smuzhiyun	or	$tmp0,$tmp2		# 0x000000FF000000FF
328*4882a593Smuzhiyun
329*4882a593Smuzhiyun	and	$tmp1,$in0,$tmp0	# byte swap
330*4882a593Smuzhiyun	 and	$tmp3,$in1,$tmp0
331*4882a593Smuzhiyun	dsrl	$tmp2,$in0,24
332*4882a593Smuzhiyun	 dsrl	$tmp4,$in1,24
333*4882a593Smuzhiyun	dsll	$tmp1,24
334*4882a593Smuzhiyun	 dsll	$tmp3,24
335*4882a593Smuzhiyun	and	$tmp2,$tmp0
336*4882a593Smuzhiyun	 and	$tmp4,$tmp0
337*4882a593Smuzhiyun	dsll	$tmp0,8			# 0x0000FF000000FF00
338*4882a593Smuzhiyun	or	$tmp1,$tmp2
339*4882a593Smuzhiyun	 or	$tmp3,$tmp4
340*4882a593Smuzhiyun	and	$tmp2,$in0,$tmp0
341*4882a593Smuzhiyun	 and	$tmp4,$in1,$tmp0
342*4882a593Smuzhiyun	dsrl	$in0,8
343*4882a593Smuzhiyun	 dsrl	$in1,8
344*4882a593Smuzhiyun	dsll	$tmp2,8
345*4882a593Smuzhiyun	 dsll	$tmp4,8
346*4882a593Smuzhiyun	and	$in0,$tmp0
347*4882a593Smuzhiyun	 and	$in1,$tmp0
348*4882a593Smuzhiyun	or	$tmp1,$tmp2
349*4882a593Smuzhiyun	 or	$tmp3,$tmp4
350*4882a593Smuzhiyun	or	$in0,$tmp1
351*4882a593Smuzhiyun	 or	$in1,$tmp3
352*4882a593Smuzhiyun	dsrl	$tmp1,$in0,32
353*4882a593Smuzhiyun	 dsrl	$tmp3,$in1,32
354*4882a593Smuzhiyun	dsll	$in0,32
355*4882a593Smuzhiyun	 dsll	$in1,32
356*4882a593Smuzhiyun	or	$in0,$tmp1
357*4882a593Smuzhiyun	 or	$in1,$tmp3
358*4882a593Smuzhiyun# endif
359*4882a593Smuzhiyun#endif
360*4882a593Smuzhiyun	dsrl	$tmp1,$h2,2		# modulo-scheduled reduction
361*4882a593Smuzhiyun	andi	$h2,$h2,3
362*4882a593Smuzhiyun	dsll	$tmp0,$tmp1,2
363*4882a593Smuzhiyun
364*4882a593Smuzhiyun	daddu	$d0,$h0,$in0		# accumulate input
365*4882a593Smuzhiyun	 daddu	$tmp1,$tmp0
366*4882a593Smuzhiyun	sltu	$tmp0,$d0,$h0
367*4882a593Smuzhiyun	daddu	$d0,$d0,$tmp1		# ... and residue
368*4882a593Smuzhiyun	sltu	$tmp1,$d0,$tmp1
369*4882a593Smuzhiyun	daddu	$d1,$h1,$in1
370*4882a593Smuzhiyun	daddu	$tmp0,$tmp1
371*4882a593Smuzhiyun	sltu	$tmp1,$d1,$h1
372*4882a593Smuzhiyun	daddu	$d1,$tmp0
373*4882a593Smuzhiyun
374*4882a593Smuzhiyun	dmultu	($r0,$d0)		# h0*r0
375*4882a593Smuzhiyun	 daddu	$d2,$h2,$padbit
376*4882a593Smuzhiyun	 sltu	$tmp0,$d1,$tmp0
377*4882a593Smuzhiyun	mflo	($h0,$r0,$d0)
378*4882a593Smuzhiyun	mfhi	($h1,$r0,$d0)
379*4882a593Smuzhiyun
380*4882a593Smuzhiyun	dmultu	($rs1,$d1)		# h1*5*r1
381*4882a593Smuzhiyun	 daddu	$d2,$tmp1
382*4882a593Smuzhiyun	 daddu	$d2,$tmp0
383*4882a593Smuzhiyun	mflo	($tmp0,$rs1,$d1)
384*4882a593Smuzhiyun	mfhi	($tmp1,$rs1,$d1)
385*4882a593Smuzhiyun
386*4882a593Smuzhiyun	dmultu	($r1,$d0)		# h0*r1
387*4882a593Smuzhiyun	mflo	($tmp2,$r1,$d0)
388*4882a593Smuzhiyun	mfhi	($h2,$r1,$d0)
389*4882a593Smuzhiyun	 daddu	$h0,$tmp0
390*4882a593Smuzhiyun	 daddu	$h1,$tmp1
391*4882a593Smuzhiyun	 sltu	$tmp0,$h0,$tmp0
392*4882a593Smuzhiyun
393*4882a593Smuzhiyun	dmultu	($r0,$d1)		# h1*r0
394*4882a593Smuzhiyun	 daddu	$h1,$tmp0
395*4882a593Smuzhiyun	 daddu	$h1,$tmp2
396*4882a593Smuzhiyun	mflo	($tmp0,$r0,$d1)
397*4882a593Smuzhiyun	mfhi	($tmp1,$r0,$d1)
398*4882a593Smuzhiyun
399*4882a593Smuzhiyun	dmultu	($rs1,$d2)		# h2*5*r1
400*4882a593Smuzhiyun	 sltu	$tmp2,$h1,$tmp2
401*4882a593Smuzhiyun	 daddu	$h2,$tmp2
402*4882a593Smuzhiyun	mflo	($tmp2,$rs1,$d2)
403*4882a593Smuzhiyun
404*4882a593Smuzhiyun	dmultu	($r0,$d2)		# h2*r0
405*4882a593Smuzhiyun	 daddu	$h1,$tmp0
406*4882a593Smuzhiyun	 daddu	$h2,$tmp1
407*4882a593Smuzhiyun	mflo	($tmp3,$r0,$d2)
408*4882a593Smuzhiyun	 sltu	$tmp0,$h1,$tmp0
409*4882a593Smuzhiyun	 daddu	$h2,$tmp0
410*4882a593Smuzhiyun
411*4882a593Smuzhiyun	daddu	$h1,$tmp2
412*4882a593Smuzhiyun	sltu	$tmp2,$h1,$tmp2
413*4882a593Smuzhiyun	daddu	$h2,$tmp2
414*4882a593Smuzhiyun	daddu	$h2,$tmp3
415*4882a593Smuzhiyun
416*4882a593Smuzhiyun	bne	$inp,$len,.Loop
417*4882a593Smuzhiyun
418*4882a593Smuzhiyun	sd	$h0,0($ctx)		# store hash value
419*4882a593Smuzhiyun	sd	$h1,8($ctx)
420*4882a593Smuzhiyun	sd	$h2,16($ctx)
421*4882a593Smuzhiyun
422*4882a593Smuzhiyun	.set	noreorder
423*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
424*4882a593Smuzhiyun	ld	$s7,56($sp)
425*4882a593Smuzhiyun	ld	$s6,48($sp)
426*4882a593Smuzhiyun#endif
427*4882a593Smuzhiyun	ld	$s5,40($sp)		# epilogue
428*4882a593Smuzhiyun	ld	$s4,32($sp)
429*4882a593Smuzhiyun___
430*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i);	# NUBI flavours only: restore s0-s3, mirroring the conditional prologue stores (optimize non-nubi epilogue)
431*4882a593Smuzhiyun	ld	$s3,24($sp)
432*4882a593Smuzhiyun	ld	$s2,16($sp)
433*4882a593Smuzhiyun	ld	$s1,8($sp)
434*4882a593Smuzhiyun	ld	$s0,0($sp)
435*4882a593Smuzhiyun___
436*4882a593Smuzhiyun$code.=<<___;	# return from poly1305_blocks_internal; noreorder is in effect, so the stack release (8*8 bytes on R6, 6*8 otherwise) sits in the jr delay slot
437*4882a593Smuzhiyun	jr	$ra
438*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS64R6)
439*4882a593Smuzhiyun	daddu	$sp,8*8
440*4882a593Smuzhiyun#else
441*4882a593Smuzhiyun	daddu	$sp,6*8
442*4882a593Smuzhiyun#endif
443*4882a593Smuzhiyun.end	poly1305_blocks_internal
444*4882a593Smuzhiyun___
445*4882a593Smuzhiyun}
446*4882a593Smuzhiyun{
447*4882a593Smuzhiyunmy ($ctx,$mac,$nonce) = ($a0,$a1,$a2);	# function arguments, taken from $a0..$a2
448*4882a593Smuzhiyun
449*4882a593Smuzhiyun$code.=<<___;	# emit poly1305_emit: load the hash from ctx, do the final reduction, select between h and h+5 with a mask derived from the carry, add the nonce (read as four 32-bit words) and store the 16-byte mac one byte at a time, least significant byte first
450*4882a593Smuzhiyun.align	5
451*4882a593Smuzhiyun.globl	poly1305_emit
452*4882a593Smuzhiyun.ent	poly1305_emit
453*4882a593Smuzhiyunpoly1305_emit:
454*4882a593Smuzhiyun	.frame	$sp,0,$ra
455*4882a593Smuzhiyun	.set	reorder
456*4882a593Smuzhiyun
457*4882a593Smuzhiyun	ld	$tmp2,16($ctx)
458*4882a593Smuzhiyun	ld	$tmp0,0($ctx)
459*4882a593Smuzhiyun	ld	$tmp1,8($ctx)
460*4882a593Smuzhiyun
461*4882a593Smuzhiyun	li	$in0,-4			# final reduction
462*4882a593Smuzhiyun	dsrl	$in1,$tmp2,2
463*4882a593Smuzhiyun	and	$in0,$tmp2
464*4882a593Smuzhiyun	andi	$tmp2,$tmp2,3
465*4882a593Smuzhiyun	daddu	$in0,$in1
466*4882a593Smuzhiyun
467*4882a593Smuzhiyun	daddu	$tmp0,$tmp0,$in0
468*4882a593Smuzhiyun	sltu	$in1,$tmp0,$in0
469*4882a593Smuzhiyun	 daddiu	$in0,$tmp0,5		# compare to modulus
470*4882a593Smuzhiyun	daddu	$tmp1,$tmp1,$in1
471*4882a593Smuzhiyun	 sltiu	$tmp3,$in0,5
472*4882a593Smuzhiyun	sltu	$tmp4,$tmp1,$in1
473*4882a593Smuzhiyun	 daddu	$in1,$tmp1,$tmp3
474*4882a593Smuzhiyun	daddu	$tmp2,$tmp2,$tmp4
475*4882a593Smuzhiyun	 sltu	$tmp3,$in1,$tmp3
476*4882a593Smuzhiyun	 daddu	$tmp2,$tmp2,$tmp3
477*4882a593Smuzhiyun
478*4882a593Smuzhiyun	dsrl	$tmp2,2			# see if it carried/borrowed
479*4882a593Smuzhiyun	dsubu	$tmp2,$zero,$tmp2
480*4882a593Smuzhiyun
481*4882a593Smuzhiyun	xor	$in0,$tmp0
482*4882a593Smuzhiyun	xor	$in1,$tmp1
483*4882a593Smuzhiyun	and	$in0,$tmp2
484*4882a593Smuzhiyun	and	$in1,$tmp2
485*4882a593Smuzhiyun	xor	$in0,$tmp0
486*4882a593Smuzhiyun	xor	$in1,$tmp1
487*4882a593Smuzhiyun
488*4882a593Smuzhiyun	lwu	$tmp0,0($nonce)		# load nonce
489*4882a593Smuzhiyun	lwu	$tmp1,4($nonce)
490*4882a593Smuzhiyun	lwu	$tmp2,8($nonce)
491*4882a593Smuzhiyun	lwu	$tmp3,12($nonce)
492*4882a593Smuzhiyun	dsll	$tmp1,32
493*4882a593Smuzhiyun	dsll	$tmp3,32
494*4882a593Smuzhiyun	or	$tmp0,$tmp1
495*4882a593Smuzhiyun	or	$tmp2,$tmp3
496*4882a593Smuzhiyun
497*4882a593Smuzhiyun	daddu	$in0,$tmp0		# accumulate nonce
498*4882a593Smuzhiyun	daddu	$in1,$tmp2
499*4882a593Smuzhiyun	sltu	$tmp0,$in0,$tmp0
500*4882a593Smuzhiyun	daddu	$in1,$tmp0
501*4882a593Smuzhiyun
502*4882a593Smuzhiyun	dsrl	$tmp0,$in0,8		# write mac value
503*4882a593Smuzhiyun	dsrl	$tmp1,$in0,16
504*4882a593Smuzhiyun	dsrl	$tmp2,$in0,24
505*4882a593Smuzhiyun	sb	$in0,0($mac)
506*4882a593Smuzhiyun	dsrl	$tmp3,$in0,32
507*4882a593Smuzhiyun	sb	$tmp0,1($mac)
508*4882a593Smuzhiyun	dsrl	$tmp0,$in0,40
509*4882a593Smuzhiyun	sb	$tmp1,2($mac)
510*4882a593Smuzhiyun	dsrl	$tmp1,$in0,48
511*4882a593Smuzhiyun	sb	$tmp2,3($mac)
512*4882a593Smuzhiyun	dsrl	$tmp2,$in0,56
513*4882a593Smuzhiyun	sb	$tmp3,4($mac)
514*4882a593Smuzhiyun	dsrl	$tmp3,$in1,8
515*4882a593Smuzhiyun	sb	$tmp0,5($mac)
516*4882a593Smuzhiyun	dsrl	$tmp0,$in1,16
517*4882a593Smuzhiyun	sb	$tmp1,6($mac)
518*4882a593Smuzhiyun	dsrl	$tmp1,$in1,24
519*4882a593Smuzhiyun	sb	$tmp2,7($mac)
520*4882a593Smuzhiyun
521*4882a593Smuzhiyun	sb	$in1,8($mac)
522*4882a593Smuzhiyun	dsrl	$tmp2,$in1,32
523*4882a593Smuzhiyun	sb	$tmp3,9($mac)
524*4882a593Smuzhiyun	dsrl	$tmp3,$in1,40
525*4882a593Smuzhiyun	sb	$tmp0,10($mac)
526*4882a593Smuzhiyun	dsrl	$tmp0,$in1,48
527*4882a593Smuzhiyun	sb	$tmp1,11($mac)
528*4882a593Smuzhiyun	dsrl	$tmp1,$in1,56
529*4882a593Smuzhiyun	sb	$tmp2,12($mac)
530*4882a593Smuzhiyun	sb	$tmp3,13($mac)
531*4882a593Smuzhiyun	sb	$tmp0,14($mac)
532*4882a593Smuzhiyun	sb	$tmp1,15($mac)
533*4882a593Smuzhiyun
534*4882a593Smuzhiyun	jr	$ra
535*4882a593Smuzhiyun.end	poly1305_emit
536*4882a593Smuzhiyun.rdata
537*4882a593Smuzhiyun.asciiz	"Poly1305 for MIPS64, CRYPTOGAMS by \@dot-asm"
538*4882a593Smuzhiyun.align	2
539*4882a593Smuzhiyun___
540*4882a593Smuzhiyun}
541*4882a593Smuzhiyun}}} else {{{
542*4882a593Smuzhiyun######################################################################
543*4882a593Smuzhiyun# 32-bit code path
544*4882a593Smuzhiyun#
545*4882a593Smuzhiyun
546*4882a593Smuzhiyunmy ($ctx,$inp,$len,$padbit) = ($a0,$a1,$a2,$a3);	# function arguments, taken from $a0..$a3
547*4882a593Smuzhiyunmy ($in0,$in1,$in2,$in3,$tmp0,$tmp1,$tmp2,$tmp3) =
548*4882a593Smuzhiyun   ($a4,$a5,$a6,$a7,$at,$t0,$t1,$t2);	# four 32-bit input words plus four scratch registers
549*4882a593Smuzhiyun
550*4882a593Smuzhiyun$code.=<<___;	# emit the cpp prologue (R6 multiply macros, kernel symbol names, MSB/LSB offsets) and the 32-bit poly1305_init: zero the hash at 0..16(ctx), then unless inp is NULL load and clamp the key, store r0-r3 at 20..32(ctx) and s1-s3 = r + (r >> 2) at 36..44(ctx); returns 0
551*4882a593Smuzhiyun#if (defined(_MIPS_ARCH_MIPS32R3) || defined(_MIPS_ARCH_MIPS32R5) || \\
552*4882a593Smuzhiyun     defined(_MIPS_ARCH_MIPS32R6)) \\
553*4882a593Smuzhiyun     && !defined(_MIPS_ARCH_MIPS32R2)
554*4882a593Smuzhiyun# define _MIPS_ARCH_MIPS32R2
555*4882a593Smuzhiyun#endif
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6)
558*4882a593Smuzhiyun# define multu(rs,rt)
559*4882a593Smuzhiyun# define mflo(rd,rs,rt)	mulu	rd,rs,rt
560*4882a593Smuzhiyun# define mfhi(rd,rs,rt)	muhu	rd,rs,rt
561*4882a593Smuzhiyun#else
562*4882a593Smuzhiyun# define multu(rs,rt)	multu	rs,rt
563*4882a593Smuzhiyun# define mflo(rd,rs,rt)	mflo	rd
564*4882a593Smuzhiyun# define mfhi(rd,rs,rt)	mfhi	rd
565*4882a593Smuzhiyun#endif
566*4882a593Smuzhiyun
567*4882a593Smuzhiyun#ifdef	__KERNEL__
568*4882a593Smuzhiyun# define poly1305_init   poly1305_init_mips
569*4882a593Smuzhiyun# define poly1305_blocks poly1305_blocks_mips
570*4882a593Smuzhiyun# define poly1305_emit   poly1305_emit_mips
571*4882a593Smuzhiyun#endif
572*4882a593Smuzhiyun
573*4882a593Smuzhiyun#if defined(__MIPSEB__) && !defined(MIPSEB)
574*4882a593Smuzhiyun# define MIPSEB
575*4882a593Smuzhiyun#endif
576*4882a593Smuzhiyun
577*4882a593Smuzhiyun#ifdef MIPSEB
578*4882a593Smuzhiyun# define MSB 0
579*4882a593Smuzhiyun# define LSB 3
580*4882a593Smuzhiyun#else
581*4882a593Smuzhiyun# define MSB 3
582*4882a593Smuzhiyun# define LSB 0
583*4882a593Smuzhiyun#endif
584*4882a593Smuzhiyun
585*4882a593Smuzhiyun.text
586*4882a593Smuzhiyun.set	noat
587*4882a593Smuzhiyun.set	noreorder
588*4882a593Smuzhiyun
589*4882a593Smuzhiyun.align	5
590*4882a593Smuzhiyun.globl	poly1305_init
591*4882a593Smuzhiyun.ent	poly1305_init
592*4882a593Smuzhiyunpoly1305_init:
593*4882a593Smuzhiyun	.frame	$sp,0,$ra
594*4882a593Smuzhiyun	.set	reorder
595*4882a593Smuzhiyun
596*4882a593Smuzhiyun	sw	$zero,0($ctx)
597*4882a593Smuzhiyun	sw	$zero,4($ctx)
598*4882a593Smuzhiyun	sw	$zero,8($ctx)
599*4882a593Smuzhiyun	sw	$zero,12($ctx)
600*4882a593Smuzhiyun	sw	$zero,16($ctx)
601*4882a593Smuzhiyun
602*4882a593Smuzhiyun	beqz	$inp,.Lno_key
603*4882a593Smuzhiyun
604*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6)
605*4882a593Smuzhiyun	andi	$tmp0,$inp,3		# $inp % 4
606*4882a593Smuzhiyun	subu	$inp,$inp,$tmp0		# align $inp
607*4882a593Smuzhiyun	sll	$tmp0,$tmp0,3		# byte to bit offset
608*4882a593Smuzhiyun	lw	$in0,0($inp)
609*4882a593Smuzhiyun	lw	$in1,4($inp)
610*4882a593Smuzhiyun	lw	$in2,8($inp)
611*4882a593Smuzhiyun	lw	$in3,12($inp)
612*4882a593Smuzhiyun	beqz	$tmp0,.Laligned_key
613*4882a593Smuzhiyun
614*4882a593Smuzhiyun	lw	$tmp2,16($inp)
615*4882a593Smuzhiyun	subu	$tmp1,$zero,$tmp0
616*4882a593Smuzhiyun# ifdef	MIPSEB
617*4882a593Smuzhiyun	sllv	$in0,$in0,$tmp0
618*4882a593Smuzhiyun	srlv	$tmp3,$in1,$tmp1
619*4882a593Smuzhiyun	sllv	$in1,$in1,$tmp0
620*4882a593Smuzhiyun	or	$in0,$in0,$tmp3
621*4882a593Smuzhiyun	srlv	$tmp3,$in2,$tmp1
622*4882a593Smuzhiyun	sllv	$in2,$in2,$tmp0
623*4882a593Smuzhiyun	or	$in1,$in1,$tmp3
624*4882a593Smuzhiyun	srlv	$tmp3,$in3,$tmp1
625*4882a593Smuzhiyun	sllv	$in3,$in3,$tmp0
626*4882a593Smuzhiyun	or	$in2,$in2,$tmp3
627*4882a593Smuzhiyun	srlv	$tmp2,$tmp2,$tmp1
628*4882a593Smuzhiyun	or	$in3,$in3,$tmp2
629*4882a593Smuzhiyun# else
630*4882a593Smuzhiyun	srlv	$in0,$in0,$tmp0
631*4882a593Smuzhiyun	sllv	$tmp3,$in1,$tmp1
632*4882a593Smuzhiyun	srlv	$in1,$in1,$tmp0
633*4882a593Smuzhiyun	or	$in0,$in0,$tmp3
634*4882a593Smuzhiyun	sllv	$tmp3,$in2,$tmp1
635*4882a593Smuzhiyun	srlv	$in2,$in2,$tmp0
636*4882a593Smuzhiyun	or	$in1,$in1,$tmp3
637*4882a593Smuzhiyun	sllv	$tmp3,$in3,$tmp1
638*4882a593Smuzhiyun	srlv	$in3,$in3,$tmp0
639*4882a593Smuzhiyun	or	$in2,$in2,$tmp3
640*4882a593Smuzhiyun	sllv	$tmp2,$tmp2,$tmp1
641*4882a593Smuzhiyun	or	$in3,$in3,$tmp2
642*4882a593Smuzhiyun# endif
643*4882a593Smuzhiyun.Laligned_key:
644*4882a593Smuzhiyun#else
645*4882a593Smuzhiyun	lwl	$in0,0+MSB($inp)
646*4882a593Smuzhiyun	lwl	$in1,4+MSB($inp)
647*4882a593Smuzhiyun	lwl	$in2,8+MSB($inp)
648*4882a593Smuzhiyun	lwl	$in3,12+MSB($inp)
649*4882a593Smuzhiyun	lwr	$in0,0+LSB($inp)
650*4882a593Smuzhiyun	lwr	$in1,4+LSB($inp)
651*4882a593Smuzhiyun	lwr	$in2,8+LSB($inp)
652*4882a593Smuzhiyun	lwr	$in3,12+LSB($inp)
653*4882a593Smuzhiyun#endif
654*4882a593Smuzhiyun#ifdef	MIPSEB
655*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS32R2)
656*4882a593Smuzhiyun	wsbh	$in0,$in0		# byte swap
657*4882a593Smuzhiyun	wsbh	$in1,$in1
658*4882a593Smuzhiyun	wsbh	$in2,$in2
659*4882a593Smuzhiyun	wsbh	$in3,$in3
660*4882a593Smuzhiyun	rotr	$in0,$in0,16
661*4882a593Smuzhiyun	rotr	$in1,$in1,16
662*4882a593Smuzhiyun	rotr	$in2,$in2,16
663*4882a593Smuzhiyun	rotr	$in3,$in3,16
664*4882a593Smuzhiyun# else
665*4882a593Smuzhiyun	srl	$tmp0,$in0,24		# byte swap
666*4882a593Smuzhiyun	srl	$tmp1,$in0,8
667*4882a593Smuzhiyun	andi	$tmp2,$in0,0xFF00
668*4882a593Smuzhiyun	sll	$in0,$in0,24
669*4882a593Smuzhiyun	andi	$tmp1,0xFF00
670*4882a593Smuzhiyun	sll	$tmp2,$tmp2,8
671*4882a593Smuzhiyun	or	$in0,$tmp0
672*4882a593Smuzhiyun	 srl	$tmp0,$in1,24
673*4882a593Smuzhiyun	or	$tmp1,$tmp2
674*4882a593Smuzhiyun	 srl	$tmp2,$in1,8
675*4882a593Smuzhiyun	or	$in0,$tmp1
676*4882a593Smuzhiyun	 andi	$tmp1,$in1,0xFF00
677*4882a593Smuzhiyun	 sll	$in1,$in1,24
678*4882a593Smuzhiyun	 andi	$tmp2,0xFF00
679*4882a593Smuzhiyun	 sll	$tmp1,$tmp1,8
680*4882a593Smuzhiyun	 or	$in1,$tmp0
681*4882a593Smuzhiyun	srl	$tmp0,$in2,24
682*4882a593Smuzhiyun	 or	$tmp2,$tmp1
683*4882a593Smuzhiyun	srl	$tmp1,$in2,8
684*4882a593Smuzhiyun	 or	$in1,$tmp2
685*4882a593Smuzhiyun	andi	$tmp2,$in2,0xFF00
686*4882a593Smuzhiyun	sll	$in2,$in2,24
687*4882a593Smuzhiyun	andi	$tmp1,0xFF00
688*4882a593Smuzhiyun	sll	$tmp2,$tmp2,8
689*4882a593Smuzhiyun	or	$in2,$tmp0
690*4882a593Smuzhiyun	 srl	$tmp0,$in3,24
691*4882a593Smuzhiyun	or	$tmp1,$tmp2
692*4882a593Smuzhiyun	 srl	$tmp2,$in3,8
693*4882a593Smuzhiyun	or	$in2,$tmp1
694*4882a593Smuzhiyun	 andi	$tmp1,$in3,0xFF00
695*4882a593Smuzhiyun	 sll	$in3,$in3,24
696*4882a593Smuzhiyun	 andi	$tmp2,0xFF00
697*4882a593Smuzhiyun	 sll	$tmp1,$tmp1,8
698*4882a593Smuzhiyun	 or	$in3,$tmp0
699*4882a593Smuzhiyun	 or	$tmp2,$tmp1
700*4882a593Smuzhiyun	 or	$in3,$tmp2
701*4882a593Smuzhiyun# endif
702*4882a593Smuzhiyun#endif
703*4882a593Smuzhiyun	lui	$tmp0,0x0fff
704*4882a593Smuzhiyun	ori	$tmp0,0xffff		# 0x0fffffff
705*4882a593Smuzhiyun	and	$in0,$in0,$tmp0
706*4882a593Smuzhiyun	subu	$tmp0,3			# 0x0ffffffc
707*4882a593Smuzhiyun	and	$in1,$in1,$tmp0
708*4882a593Smuzhiyun	and	$in2,$in2,$tmp0
709*4882a593Smuzhiyun	and	$in3,$in3,$tmp0
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun	sw	$in0,20($ctx)
712*4882a593Smuzhiyun	sw	$in1,24($ctx)
713*4882a593Smuzhiyun	sw	$in2,28($ctx)
714*4882a593Smuzhiyun	sw	$in3,32($ctx)
715*4882a593Smuzhiyun
716*4882a593Smuzhiyun	srl	$tmp1,$in1,2
717*4882a593Smuzhiyun	srl	$tmp2,$in2,2
718*4882a593Smuzhiyun	srl	$tmp3,$in3,2
719*4882a593Smuzhiyun	addu	$in1,$in1,$tmp1		# s1 = r1 + (r1 >> 2)
720*4882a593Smuzhiyun	addu	$in2,$in2,$tmp2
721*4882a593Smuzhiyun	addu	$in3,$in3,$tmp3
722*4882a593Smuzhiyun	sw	$in1,36($ctx)
723*4882a593Smuzhiyun	sw	$in2,40($ctx)
724*4882a593Smuzhiyun	sw	$in3,44($ctx)
725*4882a593Smuzhiyun.Lno_key:
726*4882a593Smuzhiyun	li	$v0,0
727*4882a593Smuzhiyun	jr	$ra
728*4882a593Smuzhiyun.end	poly1305_init
729*4882a593Smuzhiyun___
730*4882a593Smuzhiyun{
731*4882a593Smuzhiyunmy $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? "0x00fff000" : "0x00ff0000";	# .mask bits: registers 12-23 (s0-s11) for NUBI, otherwise 16-23 (s4-s11)
732*4882a593Smuzhiyun
733*4882a593Smuzhiyunmy ($h0,$h1,$h2,$h3,$h4, $r0,$r1,$r2,$r3, $rs1,$rs2,$rs3) =
734*4882a593Smuzhiyun   ($s0,$s1,$s2,$s3,$s4, $s5,$s6,$s7,$s8, $s9,$s10,$s11);	# hash h0-h4, key r0-r3 and rs1-rs3 occupy all twelve s-registers
735*4882a593Smuzhiyunmy ($d0,$d1,$d2,$d3) =
736*4882a593Smuzhiyun   ($a4,$a5,$a6,$a7);	# d0-d3 are the same registers the init code calls in0-in3
737*4882a593Smuzhiyunmy $shr = $t2;		# used on R6: bit-shift count for realigning unaligned input
738*4882a593Smuzhiyunmy $one = $t2;		# used on R2; deliberately the same register as $shr
739*4882a593Smuzhiyun
740*4882a593Smuzhiyun$code.=<<___;	# emit the 32-bit poly1305_blocks prologue: allocate 12 words of stack and save s4-s11 (s0-s3 are saved separately for NUBI)
741*4882a593Smuzhiyun.globl	poly1305_blocks
742*4882a593Smuzhiyun.align	5
743*4882a593Smuzhiyun.ent	poly1305_blocks
744*4882a593Smuzhiyunpoly1305_blocks:
745*4882a593Smuzhiyun	.frame	$sp,12*4,$ra		# frame size equals the 4*12 bytes allocated below
746*4882a593Smuzhiyun	.mask	$SAVED_REGS_MASK,-4
747*4882a593Smuzhiyun	.set	noreorder
748*4882a593Smuzhiyun	subu	$sp, $sp,4*12
749*4882a593Smuzhiyun	sw	$s11,4*11($sp)
750*4882a593Smuzhiyun	sw	$s10,4*10($sp)
751*4882a593Smuzhiyun	sw	$s9, 4*9($sp)
752*4882a593Smuzhiyun	sw	$s8, 4*8($sp)
753*4882a593Smuzhiyun	sw	$s7, 4*7($sp)
754*4882a593Smuzhiyun	sw	$s6, 4*6($sp)
755*4882a593Smuzhiyun	sw	$s5, 4*5($sp)
756*4882a593Smuzhiyun	sw	$s4, 4*4($sp)
757*4882a593Smuzhiyun___
758*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i);	# NUBI flavours only: also save s0-s3; other flavours skip these stores (optimize non-nubi prologue)
759*4882a593Smuzhiyun	sw	$s3, 4*3($sp)
760*4882a593Smuzhiyun	sw	$s2, 4*2($sp)
761*4882a593Smuzhiyun	sw	$s1, 4*1($sp)
762*4882a593Smuzhiyun	sw	$s0, 4*0($sp)
763*4882a593Smuzhiyun___
# Main body of poly1305_blocks.  It is emitted as a single here-document, so
# the text below is program output and is documented here rather than inline.
# The #if/#ifdef lines are emitted verbatim as part of that text.
#
#   1. $len is shifted right by 4 to get the number of complete 16-byte
#      blocks; with zero blocks control goes straight to .Labort.
#   2. On MIPS32R6, $inp is rounded down to a word boundary and $shr is set
#      to the misalignment in bits.
#   3. The hash value ($h0..$h4) and key words ($r0..$r3, $rs1..$rs3) are
#      loaded from $ctx, and $len is turned into the end-of-buffer address.
#   4. .Loop loads one block into $d0..$d3: aligned word loads merged with
#      variable shifts on MIPS32R6, lwl/lwr pairs otherwise.  On MIPSEB the
#      words are then byte-swapped (wsbh/rotr on MIPS32R2, shifts and masks
#      otherwise).
#   5. "Modulo-scheduled reduction": 5*($h4>>2) is added to the low word and
#      only the low two bits of $h4 are kept, after which the block is added
#      to the hash with explicit addu/sltu carry propagation and $padbit is
#      added to $h4.
#   6. The sum is multiplied by the key.  The MIPS32R2 (non-R6) path
#      accumulates in HI/LO with multu/maddu and feeds the previous high word
#      back in via "maddu $at,$one"; the other path issues separate
#      multu/mflo/mfhi steps and propagates carries with addu/sltu.
#   7. The loop repeats until $inp reaches $len, then the hash value is
#      stored back to $ctx.
#
# NOTE(review): the non-R2 path ends with "li $padbit,1"; presumably $padbit
# aliases $a3, which that path uses as scratch -- confirm against the register
# assignments earlier in the file.
764*4882a593Smuzhiyun$code.=<<___;
765*4882a593Smuzhiyun	.set	reorder
766*4882a593Smuzhiyun
767*4882a593Smuzhiyun	srl	$len,4			# number of complete blocks
768*4882a593Smuzhiyun	li	$one,1
769*4882a593Smuzhiyun	beqz	$len,.Labort
770*4882a593Smuzhiyun
771*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6)
772*4882a593Smuzhiyun	andi	$shr,$inp,3
773*4882a593Smuzhiyun	subu	$inp,$inp,$shr		# align $inp
774*4882a593Smuzhiyun	sll	$shr,$shr,3		# byte to bit offset
775*4882a593Smuzhiyun#endif
776*4882a593Smuzhiyun
777*4882a593Smuzhiyun	lw	$h0,0($ctx)		# load hash value
778*4882a593Smuzhiyun	lw	$h1,4($ctx)
779*4882a593Smuzhiyun	lw	$h2,8($ctx)
780*4882a593Smuzhiyun	lw	$h3,12($ctx)
781*4882a593Smuzhiyun	lw	$h4,16($ctx)
782*4882a593Smuzhiyun
783*4882a593Smuzhiyun	lw	$r0,20($ctx)		# load key
784*4882a593Smuzhiyun	lw	$r1,24($ctx)
785*4882a593Smuzhiyun	lw	$r2,28($ctx)
786*4882a593Smuzhiyun	lw	$r3,32($ctx)
787*4882a593Smuzhiyun	lw	$rs1,36($ctx)
788*4882a593Smuzhiyun	lw	$rs2,40($ctx)
789*4882a593Smuzhiyun	lw	$rs3,44($ctx)
790*4882a593Smuzhiyun
791*4882a593Smuzhiyun	sll	$len,4
792*4882a593Smuzhiyun	addu	$len,$len,$inp		# end of buffer
793*4882a593Smuzhiyun	b	.Loop
794*4882a593Smuzhiyun
795*4882a593Smuzhiyun.align	4
796*4882a593Smuzhiyun.Loop:
797*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R6)
798*4882a593Smuzhiyun	lw	$d0,0($inp)		# load input
799*4882a593Smuzhiyun	lw	$d1,4($inp)
800*4882a593Smuzhiyun	lw	$d2,8($inp)
801*4882a593Smuzhiyun	lw	$d3,12($inp)
802*4882a593Smuzhiyun	beqz	$shr,.Laligned_inp
803*4882a593Smuzhiyun
804*4882a593Smuzhiyun	lw	$t0,16($inp)
805*4882a593Smuzhiyun	subu	$t1,$zero,$shr
806*4882a593Smuzhiyun# ifdef	MIPSEB
807*4882a593Smuzhiyun	sllv	$d0,$d0,$shr
808*4882a593Smuzhiyun	srlv	$at,$d1,$t1
809*4882a593Smuzhiyun	sllv	$d1,$d1,$shr
810*4882a593Smuzhiyun	or	$d0,$d0,$at
811*4882a593Smuzhiyun	srlv	$at,$d2,$t1
812*4882a593Smuzhiyun	sllv	$d2,$d2,$shr
813*4882a593Smuzhiyun	or	$d1,$d1,$at
814*4882a593Smuzhiyun	srlv	$at,$d3,$t1
815*4882a593Smuzhiyun	sllv	$d3,$d3,$shr
816*4882a593Smuzhiyun	or	$d2,$d2,$at
817*4882a593Smuzhiyun	srlv	$t0,$t0,$t1
818*4882a593Smuzhiyun	or	$d3,$d3,$t0
819*4882a593Smuzhiyun# else
820*4882a593Smuzhiyun	srlv	$d0,$d0,$shr
821*4882a593Smuzhiyun	sllv	$at,$d1,$t1
822*4882a593Smuzhiyun	srlv	$d1,$d1,$shr
823*4882a593Smuzhiyun	or	$d0,$d0,$at
824*4882a593Smuzhiyun	sllv	$at,$d2,$t1
825*4882a593Smuzhiyun	srlv	$d2,$d2,$shr
826*4882a593Smuzhiyun	or	$d1,$d1,$at
827*4882a593Smuzhiyun	sllv	$at,$d3,$t1
828*4882a593Smuzhiyun	srlv	$d3,$d3,$shr
829*4882a593Smuzhiyun	or	$d2,$d2,$at
830*4882a593Smuzhiyun	sllv	$t0,$t0,$t1
831*4882a593Smuzhiyun	or	$d3,$d3,$t0
832*4882a593Smuzhiyun# endif
833*4882a593Smuzhiyun.Laligned_inp:
834*4882a593Smuzhiyun#else
835*4882a593Smuzhiyun	lwl	$d0,0+MSB($inp)		# load input
836*4882a593Smuzhiyun	lwl	$d1,4+MSB($inp)
837*4882a593Smuzhiyun	lwl	$d2,8+MSB($inp)
838*4882a593Smuzhiyun	lwl	$d3,12+MSB($inp)
839*4882a593Smuzhiyun	lwr	$d0,0+LSB($inp)
840*4882a593Smuzhiyun	lwr	$d1,4+LSB($inp)
841*4882a593Smuzhiyun	lwr	$d2,8+LSB($inp)
842*4882a593Smuzhiyun	lwr	$d3,12+LSB($inp)
843*4882a593Smuzhiyun#endif
844*4882a593Smuzhiyun#ifdef	MIPSEB
845*4882a593Smuzhiyun# if defined(_MIPS_ARCH_MIPS32R2)
846*4882a593Smuzhiyun	wsbh	$d0,$d0			# byte swap
847*4882a593Smuzhiyun	wsbh	$d1,$d1
848*4882a593Smuzhiyun	wsbh	$d2,$d2
849*4882a593Smuzhiyun	wsbh	$d3,$d3
850*4882a593Smuzhiyun	rotr	$d0,$d0,16
851*4882a593Smuzhiyun	rotr	$d1,$d1,16
852*4882a593Smuzhiyun	rotr	$d2,$d2,16
853*4882a593Smuzhiyun	rotr	$d3,$d3,16
854*4882a593Smuzhiyun# else
855*4882a593Smuzhiyun	srl	$at,$d0,24		# byte swap
856*4882a593Smuzhiyun	srl	$t0,$d0,8
857*4882a593Smuzhiyun	andi	$t1,$d0,0xFF00
858*4882a593Smuzhiyun	sll	$d0,$d0,24
859*4882a593Smuzhiyun	andi	$t0,0xFF00
860*4882a593Smuzhiyun	sll	$t1,$t1,8
861*4882a593Smuzhiyun	or	$d0,$at
862*4882a593Smuzhiyun	 srl	$at,$d1,24
863*4882a593Smuzhiyun	or	$t0,$t1
864*4882a593Smuzhiyun	 srl	$t1,$d1,8
865*4882a593Smuzhiyun	or	$d0,$t0
866*4882a593Smuzhiyun	 andi	$t0,$d1,0xFF00
867*4882a593Smuzhiyun	 sll	$d1,$d1,24
868*4882a593Smuzhiyun	 andi	$t1,0xFF00
869*4882a593Smuzhiyun	 sll	$t0,$t0,8
870*4882a593Smuzhiyun	 or	$d1,$at
871*4882a593Smuzhiyun	srl	$at,$d2,24
872*4882a593Smuzhiyun	 or	$t1,$t0
873*4882a593Smuzhiyun	srl	$t0,$d2,8
874*4882a593Smuzhiyun	 or	$d1,$t1
875*4882a593Smuzhiyun	andi	$t1,$d2,0xFF00
876*4882a593Smuzhiyun	sll	$d2,$d2,24
877*4882a593Smuzhiyun	andi	$t0,0xFF00
878*4882a593Smuzhiyun	sll	$t1,$t1,8
879*4882a593Smuzhiyun	or	$d2,$at
880*4882a593Smuzhiyun	 srl	$at,$d3,24
881*4882a593Smuzhiyun	or	$t0,$t1
882*4882a593Smuzhiyun	 srl	$t1,$d3,8
883*4882a593Smuzhiyun	or	$d2,$t0
884*4882a593Smuzhiyun	 andi	$t0,$d3,0xFF00
885*4882a593Smuzhiyun	 sll	$d3,$d3,24
886*4882a593Smuzhiyun	 andi	$t1,0xFF00
887*4882a593Smuzhiyun	 sll	$t0,$t0,8
888*4882a593Smuzhiyun	 or	$d3,$at
889*4882a593Smuzhiyun	 or	$t1,$t0
890*4882a593Smuzhiyun	 or	$d3,$t1
891*4882a593Smuzhiyun# endif
892*4882a593Smuzhiyun#endif
893*4882a593Smuzhiyun	srl	$t0,$h4,2		# modulo-scheduled reduction
894*4882a593Smuzhiyun	andi	$h4,$h4,3
895*4882a593Smuzhiyun	sll	$at,$t0,2
896*4882a593Smuzhiyun
897*4882a593Smuzhiyun	addu	$d0,$d0,$h0		# accumulate input
898*4882a593Smuzhiyun	 addu	$t0,$t0,$at
899*4882a593Smuzhiyun	sltu	$h0,$d0,$h0
900*4882a593Smuzhiyun	addu	$d0,$d0,$t0		# ... and residue
901*4882a593Smuzhiyun	sltu	$at,$d0,$t0
902*4882a593Smuzhiyun
903*4882a593Smuzhiyun	addu	$d1,$d1,$h1
904*4882a593Smuzhiyun	 addu	$h0,$h0,$at		# carry
905*4882a593Smuzhiyun	sltu	$h1,$d1,$h1
906*4882a593Smuzhiyun	addu	$d1,$d1,$h0
907*4882a593Smuzhiyun	sltu	$h0,$d1,$h0
908*4882a593Smuzhiyun
909*4882a593Smuzhiyun	addu	$d2,$d2,$h2
910*4882a593Smuzhiyun	 addu	$h1,$h1,$h0		# carry
911*4882a593Smuzhiyun	sltu	$h2,$d2,$h2
912*4882a593Smuzhiyun	addu	$d2,$d2,$h1
913*4882a593Smuzhiyun	sltu	$h1,$d2,$h1
914*4882a593Smuzhiyun
915*4882a593Smuzhiyun	addu	$d3,$d3,$h3
916*4882a593Smuzhiyun	 addu	$h2,$h2,$h1		# carry
917*4882a593Smuzhiyun	sltu	$h3,$d3,$h3
918*4882a593Smuzhiyun	addu	$d3,$d3,$h2
919*4882a593Smuzhiyun
920*4882a593Smuzhiyun#if defined(_MIPS_ARCH_MIPS32R2) && !defined(_MIPS_ARCH_MIPS32R6)
921*4882a593Smuzhiyun	multu	$r0,$d0			# d0*r0
922*4882a593Smuzhiyun	 sltu	$h2,$d3,$h2
923*4882a593Smuzhiyun	maddu	$rs3,$d1		# d1*s3
924*4882a593Smuzhiyun	 addu	$h3,$h3,$h2		# carry
925*4882a593Smuzhiyun	maddu	$rs2,$d2		# d2*s2
926*4882a593Smuzhiyun	 addu	$h4,$h4,$padbit
927*4882a593Smuzhiyun	maddu	$rs1,$d3		# d3*s1
928*4882a593Smuzhiyun	 addu	$h4,$h4,$h3
929*4882a593Smuzhiyun	mfhi	$at
930*4882a593Smuzhiyun	mflo	$h0
931*4882a593Smuzhiyun
932*4882a593Smuzhiyun	multu	$r1,$d0			# d0*r1
933*4882a593Smuzhiyun	maddu	$r0,$d1			# d1*r0
934*4882a593Smuzhiyun	maddu	$rs3,$d2		# d2*s3
935*4882a593Smuzhiyun	maddu	$rs2,$d3		# d3*s2
936*4882a593Smuzhiyun	maddu	$rs1,$h4		# h4*s1
937*4882a593Smuzhiyun	maddu	$at,$one		# hi*1
938*4882a593Smuzhiyun	mfhi	$at
939*4882a593Smuzhiyun	mflo	$h1
940*4882a593Smuzhiyun
941*4882a593Smuzhiyun	multu	$r2,$d0			# d0*r2
942*4882a593Smuzhiyun	maddu	$r1,$d1			# d1*r1
943*4882a593Smuzhiyun	maddu	$r0,$d2			# d2*r0
944*4882a593Smuzhiyun	maddu	$rs3,$d3		# d3*s3
945*4882a593Smuzhiyun	maddu	$rs2,$h4		# h4*s2
946*4882a593Smuzhiyun	maddu	$at,$one		# hi*1
947*4882a593Smuzhiyun	mfhi	$at
948*4882a593Smuzhiyun	mflo	$h2
949*4882a593Smuzhiyun
950*4882a593Smuzhiyun	mul	$t0,$r0,$h4		# h4*r0
951*4882a593Smuzhiyun
952*4882a593Smuzhiyun	multu	$r3,$d0			# d0*r3
953*4882a593Smuzhiyun	maddu	$r2,$d1			# d1*r2
954*4882a593Smuzhiyun	maddu	$r1,$d2			# d2*r1
955*4882a593Smuzhiyun	maddu	$r0,$d3			# d3*r0
956*4882a593Smuzhiyun	maddu	$rs3,$h4		# h4*s3
957*4882a593Smuzhiyun	maddu	$at,$one		# hi*1
958*4882a593Smuzhiyun	mfhi	$at
959*4882a593Smuzhiyun	mflo	$h3
960*4882a593Smuzhiyun
961*4882a593Smuzhiyun	 addiu	$inp,$inp,16
962*4882a593Smuzhiyun
963*4882a593Smuzhiyun	addu	$h4,$t0,$at
964*4882a593Smuzhiyun#else
965*4882a593Smuzhiyun	multu	($r0,$d0)		# d0*r0
966*4882a593Smuzhiyun	mflo	($h0,$r0,$d0)
967*4882a593Smuzhiyun	mfhi	($h1,$r0,$d0)
968*4882a593Smuzhiyun
969*4882a593Smuzhiyun	 sltu	$h2,$d3,$h2
970*4882a593Smuzhiyun	 addu	$h3,$h3,$h2		# carry
971*4882a593Smuzhiyun
972*4882a593Smuzhiyun	multu	($rs3,$d1)		# d1*s3
973*4882a593Smuzhiyun	mflo	($at,$rs3,$d1)
974*4882a593Smuzhiyun	mfhi	($t0,$rs3,$d1)
975*4882a593Smuzhiyun
976*4882a593Smuzhiyun	 addu	$h4,$h4,$padbit
977*4882a593Smuzhiyun	 addiu	$inp,$inp,16
978*4882a593Smuzhiyun	 addu	$h4,$h4,$h3
979*4882a593Smuzhiyun
980*4882a593Smuzhiyun	multu	($rs2,$d2)		# d2*s2
981*4882a593Smuzhiyun	mflo	($a3,$rs2,$d2)
982*4882a593Smuzhiyun	mfhi	($t1,$rs2,$d2)
983*4882a593Smuzhiyun	 addu	$h0,$h0,$at
984*4882a593Smuzhiyun	 addu	$h1,$h1,$t0
985*4882a593Smuzhiyun	multu	($rs1,$d3)		# d3*s1
986*4882a593Smuzhiyun	 sltu	$at,$h0,$at
987*4882a593Smuzhiyun	 addu	$h1,$h1,$at
988*4882a593Smuzhiyun
989*4882a593Smuzhiyun	mflo	($at,$rs1,$d3)
990*4882a593Smuzhiyun	mfhi	($t0,$rs1,$d3)
991*4882a593Smuzhiyun	 addu	$h0,$h0,$a3
992*4882a593Smuzhiyun	 addu	$h1,$h1,$t1
993*4882a593Smuzhiyun	multu	($r1,$d0)		# d0*r1
994*4882a593Smuzhiyun	 sltu	$a3,$h0,$a3
995*4882a593Smuzhiyun	 addu	$h1,$h1,$a3
996*4882a593Smuzhiyun
997*4882a593Smuzhiyun
998*4882a593Smuzhiyun	mflo	($a3,$r1,$d0)
999*4882a593Smuzhiyun	mfhi	($h2,$r1,$d0)
1000*4882a593Smuzhiyun	 addu	$h0,$h0,$at
1001*4882a593Smuzhiyun	 addu	$h1,$h1,$t0
1002*4882a593Smuzhiyun	multu	($r0,$d1)		# d1*r0
1003*4882a593Smuzhiyun	 sltu	$at,$h0,$at
1004*4882a593Smuzhiyun	 addu	$h1,$h1,$at
1005*4882a593Smuzhiyun
1006*4882a593Smuzhiyun	mflo	($at,$r0,$d1)
1007*4882a593Smuzhiyun	mfhi	($t0,$r0,$d1)
1008*4882a593Smuzhiyun	 addu	$h1,$h1,$a3
1009*4882a593Smuzhiyun	 sltu	$a3,$h1,$a3
1010*4882a593Smuzhiyun	multu	($rs3,$d2)		# d2*s3
1011*4882a593Smuzhiyun	 addu	$h2,$h2,$a3
1012*4882a593Smuzhiyun
1013*4882a593Smuzhiyun	mflo	($a3,$rs3,$d2)
1014*4882a593Smuzhiyun	mfhi	($t1,$rs3,$d2)
1015*4882a593Smuzhiyun	 addu	$h1,$h1,$at
1016*4882a593Smuzhiyun	 addu	$h2,$h2,$t0
1017*4882a593Smuzhiyun	multu	($rs2,$d3)		# d3*s2
1018*4882a593Smuzhiyun	 sltu	$at,$h1,$at
1019*4882a593Smuzhiyun	 addu	$h2,$h2,$at
1020*4882a593Smuzhiyun
1021*4882a593Smuzhiyun	mflo	($at,$rs2,$d3)
1022*4882a593Smuzhiyun	mfhi	($t0,$rs2,$d3)
1023*4882a593Smuzhiyun	 addu	$h1,$h1,$a3
1024*4882a593Smuzhiyun	 addu	$h2,$h2,$t1
1025*4882a593Smuzhiyun	multu	($rs1,$h4)		# h4*s1
1026*4882a593Smuzhiyun	 sltu	$a3,$h1,$a3
1027*4882a593Smuzhiyun	 addu	$h2,$h2,$a3
1028*4882a593Smuzhiyun
1029*4882a593Smuzhiyun	mflo	($a3,$rs1,$h4)
1030*4882a593Smuzhiyun	 addu	$h1,$h1,$at
1031*4882a593Smuzhiyun	 addu	$h2,$h2,$t0
1032*4882a593Smuzhiyun	multu	($r2,$d0)		# d0*r2
1033*4882a593Smuzhiyun	 sltu	$at,$h1,$at
1034*4882a593Smuzhiyun	 addu	$h2,$h2,$at
1035*4882a593Smuzhiyun
1036*4882a593Smuzhiyun
1037*4882a593Smuzhiyun	mflo	($at,$r2,$d0)
1038*4882a593Smuzhiyun	mfhi	($h3,$r2,$d0)
1039*4882a593Smuzhiyun	 addu	$h1,$h1,$a3
1040*4882a593Smuzhiyun	 sltu	$a3,$h1,$a3
1041*4882a593Smuzhiyun	multu	($r1,$d1)		# d1*r1
1042*4882a593Smuzhiyun	 addu	$h2,$h2,$a3
1043*4882a593Smuzhiyun
1044*4882a593Smuzhiyun	mflo	($a3,$r1,$d1)
1045*4882a593Smuzhiyun	mfhi	($t1,$r1,$d1)
1046*4882a593Smuzhiyun	 addu	$h2,$h2,$at
1047*4882a593Smuzhiyun	 sltu	$at,$h2,$at
1048*4882a593Smuzhiyun	multu	($r0,$d2)		# d2*r0
1049*4882a593Smuzhiyun	 addu	$h3,$h3,$at
1050*4882a593Smuzhiyun
1051*4882a593Smuzhiyun	mflo	($at,$r0,$d2)
1052*4882a593Smuzhiyun	mfhi	($t0,$r0,$d2)
1053*4882a593Smuzhiyun	 addu	$h2,$h2,$a3
1054*4882a593Smuzhiyun	 addu	$h3,$h3,$t1
1055*4882a593Smuzhiyun	multu	($rs3,$d3)		# d3*s3
1056*4882a593Smuzhiyun	 sltu	$a3,$h2,$a3
1057*4882a593Smuzhiyun	 addu	$h3,$h3,$a3
1058*4882a593Smuzhiyun
1059*4882a593Smuzhiyun	mflo	($a3,$rs3,$d3)
1060*4882a593Smuzhiyun	mfhi	($t1,$rs3,$d3)
1061*4882a593Smuzhiyun	 addu	$h2,$h2,$at
1062*4882a593Smuzhiyun	 addu	$h3,$h3,$t0
1063*4882a593Smuzhiyun	multu	($rs2,$h4)		# h4*s2
1064*4882a593Smuzhiyun	 sltu	$at,$h2,$at
1065*4882a593Smuzhiyun	 addu	$h3,$h3,$at
1066*4882a593Smuzhiyun
1067*4882a593Smuzhiyun	mflo	($at,$rs2,$h4)
1068*4882a593Smuzhiyun	 addu	$h2,$h2,$a3
1069*4882a593Smuzhiyun	 addu	$h3,$h3,$t1
1070*4882a593Smuzhiyun	multu	($r3,$d0)		# d0*r3
1071*4882a593Smuzhiyun	 sltu	$a3,$h2,$a3
1072*4882a593Smuzhiyun	 addu	$h3,$h3,$a3
1073*4882a593Smuzhiyun
1074*4882a593Smuzhiyun
1075*4882a593Smuzhiyun	mflo	($a3,$r3,$d0)
1076*4882a593Smuzhiyun	mfhi	($t1,$r3,$d0)
1077*4882a593Smuzhiyun	 addu	$h2,$h2,$at
1078*4882a593Smuzhiyun	 sltu	$at,$h2,$at
1079*4882a593Smuzhiyun	multu	($r2,$d1)		# d1*r2
1080*4882a593Smuzhiyun	 addu	$h3,$h3,$at
1081*4882a593Smuzhiyun
1082*4882a593Smuzhiyun	mflo	($at,$r2,$d1)
1083*4882a593Smuzhiyun	mfhi	($t0,$r2,$d1)
1084*4882a593Smuzhiyun	 addu	$h3,$h3,$a3
1085*4882a593Smuzhiyun	 sltu	$a3,$h3,$a3
1086*4882a593Smuzhiyun	multu	($r0,$d3)		# d3*r0
1087*4882a593Smuzhiyun	 addu	$t1,$t1,$a3
1088*4882a593Smuzhiyun
1089*4882a593Smuzhiyun	mflo	($a3,$r0,$d3)
1090*4882a593Smuzhiyun	mfhi	($d3,$r0,$d3)
1091*4882a593Smuzhiyun	 addu	$h3,$h3,$at
1092*4882a593Smuzhiyun	 addu	$t1,$t1,$t0
1093*4882a593Smuzhiyun	multu	($r1,$d2)		# d2*r1
1094*4882a593Smuzhiyun	 sltu	$at,$h3,$at
1095*4882a593Smuzhiyun	 addu	$t1,$t1,$at
1096*4882a593Smuzhiyun
1097*4882a593Smuzhiyun	mflo	($at,$r1,$d2)
1098*4882a593Smuzhiyun	mfhi	($t0,$r1,$d2)
1099*4882a593Smuzhiyun	 addu	$h3,$h3,$a3
1100*4882a593Smuzhiyun	 addu	$t1,$t1,$d3
1101*4882a593Smuzhiyun	multu	($rs3,$h4)		# h4*s3
1102*4882a593Smuzhiyun	 sltu	$a3,$h3,$a3
1103*4882a593Smuzhiyun	 addu	$t1,$t1,$a3
1104*4882a593Smuzhiyun
1105*4882a593Smuzhiyun	mflo	($a3,$rs3,$h4)
1106*4882a593Smuzhiyun	 addu	$h3,$h3,$at
1107*4882a593Smuzhiyun	 addu	$t1,$t1,$t0
1108*4882a593Smuzhiyun	multu	($r0,$h4)		# h4*r0
1109*4882a593Smuzhiyun	 sltu	$at,$h3,$at
1110*4882a593Smuzhiyun	 addu	$t1,$t1,$at
1111*4882a593Smuzhiyun
1112*4882a593Smuzhiyun
1113*4882a593Smuzhiyun	mflo	($h4,$r0,$h4)
1114*4882a593Smuzhiyun	 addu	$h3,$h3,$a3
1115*4882a593Smuzhiyun	 sltu	$a3,$h3,$a3
1116*4882a593Smuzhiyun	 addu	$t1,$t1,$a3
1117*4882a593Smuzhiyun	addu	$h4,$h4,$t1
1118*4882a593Smuzhiyun
1119*4882a593Smuzhiyun	li	$padbit,1		# if we loop, padbit is 1
1120*4882a593Smuzhiyun#endif
1121*4882a593Smuzhiyun	bne	$inp,$len,.Loop
1122*4882a593Smuzhiyun
1123*4882a593Smuzhiyun	sw	$h0,0($ctx)		# store hash value
1124*4882a593Smuzhiyun	sw	$h1,4($ctx)
1125*4882a593Smuzhiyun	sw	$h2,8($ctx)
1126*4882a593Smuzhiyun	sw	$h3,12($ctx)
1127*4882a593Smuzhiyun	sw	$h4,16($ctx)
1128*4882a593Smuzhiyun
1129*4882a593Smuzhiyun	.set	noreorder
1130*4882a593Smuzhiyun.Labort:
1131*4882a593Smuzhiyun	lw	$s11,4*11($sp)
1132*4882a593Smuzhiyun	lw	$s10,4*10($sp)
1133*4882a593Smuzhiyun	lw	$s9, 4*9($sp)
1134*4882a593Smuzhiyun	lw	$s8, 4*8($sp)
1135*4882a593Smuzhiyun	lw	$s7, 4*7($sp)
1136*4882a593Smuzhiyun	lw	$s6, 4*6($sp)
1137*4882a593Smuzhiyun	lw	$s5, 4*5($sp)
1138*4882a593Smuzhiyun	lw	$s4, 4*4($sp)
1139*4882a593Smuzhiyun___
# Remainder of the poly1305_blocks epilogue.  $s0..$s3 are reloaded only for
# the NUBI flavour, mirroring the conditional saves in the prologue.
1140*4882a593Smuzhiyun$code.=<<___ if ($flavour =~ /nubi/i);	# optimize non-nubi epilogue
1141*4882a593Smuzhiyun	lw	$s3, 4*3($sp)
1142*4882a593Smuzhiyun	lw	$s2, 4*2($sp)
1143*4882a593Smuzhiyun	lw	$s1, 4*1($sp)
1144*4882a593Smuzhiyun	lw	$s0, 4*0($sp)
1145*4882a593Smuzhiyun___
# Return to the caller.  The code is still under the ".set noreorder" issued
# just before .Labort, so the addu that releases the 4*12-byte frame is
# deliberately placed after "jr" and executes in its branch delay slot.
1146*4882a593Smuzhiyun$code.=<<___;
1147*4882a593Smuzhiyun	jr	$ra
1148*4882a593Smuzhiyun	addu	$sp,$sp,4*12
1149*4882a593Smuzhiyun.end	poly1305_blocks
1150*4882a593Smuzhiyun___
# End of the lexical scope holding the poly1305_blocks register aliases.
1151*4882a593Smuzhiyun}
# Generator for poly1305_emit, which produces the final 16-byte MAC.
1152*4882a593Smuzhiyun{
# Arguments of poly1305_emit: $ctx (context pointer), $mac (output buffer) and
# $nonce (pointer to four 32-bit nonce words).  $tmp4 is a scratch register
# that receives the fifth hash word.
1153*4882a593Smuzhiyunmy ($ctx,$mac,$nonce,$tmp4) = ($a0,$a1,$a2,$a3);
1154*4882a593Smuzhiyun
# The emitted routine is frameless (".frame $sp,0,$ra") and proceeds as
# follows:
#   1. Load the five hash words from $ctx.  After that $ctx is no longer
#      needed as a pointer and is reused as the carry/mask scratch register.
#   2. Final reduction: add 5*(h4>>2) to the low word, keep the low two bits
#      of h4 and propagate carries through the words.
#   3. Compute hash+5 into $in0..$in3 with carry propagation ("compare to
#      modulus"); the carry out of bit 2 of the top word is turned into an
#      all-zeros or all-ones mask.
#   4. Select between the hash ($tmp0..$tmp3) and hash+5 ($in0..$in3) without
#      branching, using the xor/and/xor idiom with that mask.
#   5. Add the four nonce words with carry propagation; a carry out of the
#      top word is discarded.
#   6. Store the result to $mac one byte at a time, least significant byte of
#      each word first.
# The trailing .rdata/.asciiz lines embed an identification string in the
# generated object.
1155*4882a593Smuzhiyun$code.=<<___;
1156*4882a593Smuzhiyun.align	5
1157*4882a593Smuzhiyun.globl	poly1305_emit
1158*4882a593Smuzhiyun.ent	poly1305_emit
1159*4882a593Smuzhiyunpoly1305_emit:
1160*4882a593Smuzhiyun	.frame	$sp,0,$ra
1161*4882a593Smuzhiyun	.set	reorder
1162*4882a593Smuzhiyun
1163*4882a593Smuzhiyun	lw	$tmp4,16($ctx)
1164*4882a593Smuzhiyun	lw	$tmp0,0($ctx)
1165*4882a593Smuzhiyun	lw	$tmp1,4($ctx)
1166*4882a593Smuzhiyun	lw	$tmp2,8($ctx)
1167*4882a593Smuzhiyun	lw	$tmp3,12($ctx)
1168*4882a593Smuzhiyun
1169*4882a593Smuzhiyun	li	$in0,-4			# final reduction
1170*4882a593Smuzhiyun	srl	$ctx,$tmp4,2
1171*4882a593Smuzhiyun	and	$in0,$in0,$tmp4
1172*4882a593Smuzhiyun	andi	$tmp4,$tmp4,3
1173*4882a593Smuzhiyun	addu	$ctx,$ctx,$in0
1174*4882a593Smuzhiyun
1175*4882a593Smuzhiyun	addu	$tmp0,$tmp0,$ctx
1176*4882a593Smuzhiyun	sltu	$ctx,$tmp0,$ctx
1177*4882a593Smuzhiyun	 addiu	$in0,$tmp0,5		# compare to modulus
1178*4882a593Smuzhiyun	addu	$tmp1,$tmp1,$ctx
1179*4882a593Smuzhiyun	 sltiu	$in1,$in0,5
1180*4882a593Smuzhiyun	sltu	$ctx,$tmp1,$ctx
1181*4882a593Smuzhiyun	 addu	$in1,$in1,$tmp1
1182*4882a593Smuzhiyun	addu	$tmp2,$tmp2,$ctx
1183*4882a593Smuzhiyun	 sltu	$in2,$in1,$tmp1
1184*4882a593Smuzhiyun	sltu	$ctx,$tmp2,$ctx
1185*4882a593Smuzhiyun	 addu	$in2,$in2,$tmp2
1186*4882a593Smuzhiyun	addu	$tmp3,$tmp3,$ctx
1187*4882a593Smuzhiyun	 sltu	$in3,$in2,$tmp2
1188*4882a593Smuzhiyun	sltu	$ctx,$tmp3,$ctx
1189*4882a593Smuzhiyun	 addu	$in3,$in3,$tmp3
1190*4882a593Smuzhiyun	addu	$tmp4,$tmp4,$ctx
1191*4882a593Smuzhiyun	 sltu	$ctx,$in3,$tmp3
1192*4882a593Smuzhiyun	 addu	$ctx,$tmp4
1193*4882a593Smuzhiyun
1194*4882a593Smuzhiyun	srl	$ctx,2			# see if it carried/borrowed
1195*4882a593Smuzhiyun	subu	$ctx,$zero,$ctx
1196*4882a593Smuzhiyun
1197*4882a593Smuzhiyun	xor	$in0,$tmp0
1198*4882a593Smuzhiyun	xor	$in1,$tmp1
1199*4882a593Smuzhiyun	xor	$in2,$tmp2
1200*4882a593Smuzhiyun	xor	$in3,$tmp3
1201*4882a593Smuzhiyun	and	$in0,$ctx
1202*4882a593Smuzhiyun	and	$in1,$ctx
1203*4882a593Smuzhiyun	and	$in2,$ctx
1204*4882a593Smuzhiyun	and	$in3,$ctx
1205*4882a593Smuzhiyun	xor	$in0,$tmp0
1206*4882a593Smuzhiyun	xor	$in1,$tmp1
1207*4882a593Smuzhiyun	xor	$in2,$tmp2
1208*4882a593Smuzhiyun	xor	$in3,$tmp3
1209*4882a593Smuzhiyun
1210*4882a593Smuzhiyun	lw	$tmp0,0($nonce)		# load nonce
1211*4882a593Smuzhiyun	lw	$tmp1,4($nonce)
1212*4882a593Smuzhiyun	lw	$tmp2,8($nonce)
1213*4882a593Smuzhiyun	lw	$tmp3,12($nonce)
1214*4882a593Smuzhiyun
1215*4882a593Smuzhiyun	addu	$in0,$tmp0		# accumulate nonce
1216*4882a593Smuzhiyun	sltu	$ctx,$in0,$tmp0
1217*4882a593Smuzhiyun
1218*4882a593Smuzhiyun	addu	$in1,$tmp1
1219*4882a593Smuzhiyun	sltu	$tmp1,$in1,$tmp1
1220*4882a593Smuzhiyun	addu	$in1,$ctx
1221*4882a593Smuzhiyun	sltu	$ctx,$in1,$ctx
1222*4882a593Smuzhiyun	addu	$ctx,$tmp1
1223*4882a593Smuzhiyun
1224*4882a593Smuzhiyun	addu	$in2,$tmp2
1225*4882a593Smuzhiyun	sltu	$tmp2,$in2,$tmp2
1226*4882a593Smuzhiyun	addu	$in2,$ctx
1227*4882a593Smuzhiyun	sltu	$ctx,$in2,$ctx
1228*4882a593Smuzhiyun	addu	$ctx,$tmp2
1229*4882a593Smuzhiyun
1230*4882a593Smuzhiyun	addu	$in3,$tmp3
1231*4882a593Smuzhiyun	addu	$in3,$ctx
1232*4882a593Smuzhiyun
1233*4882a593Smuzhiyun	srl	$tmp0,$in0,8		# write mac value
1234*4882a593Smuzhiyun	srl	$tmp1,$in0,16
1235*4882a593Smuzhiyun	srl	$tmp2,$in0,24
1236*4882a593Smuzhiyun	sb	$in0, 0($mac)
1237*4882a593Smuzhiyun	sb	$tmp0,1($mac)
1238*4882a593Smuzhiyun	srl	$tmp0,$in1,8
1239*4882a593Smuzhiyun	sb	$tmp1,2($mac)
1240*4882a593Smuzhiyun	srl	$tmp1,$in1,16
1241*4882a593Smuzhiyun	sb	$tmp2,3($mac)
1242*4882a593Smuzhiyun	srl	$tmp2,$in1,24
1243*4882a593Smuzhiyun	sb	$in1, 4($mac)
1244*4882a593Smuzhiyun	sb	$tmp0,5($mac)
1245*4882a593Smuzhiyun	srl	$tmp0,$in2,8
1246*4882a593Smuzhiyun	sb	$tmp1,6($mac)
1247*4882a593Smuzhiyun	srl	$tmp1,$in2,16
1248*4882a593Smuzhiyun	sb	$tmp2,7($mac)
1249*4882a593Smuzhiyun	srl	$tmp2,$in2,24
1250*4882a593Smuzhiyun	sb	$in2, 8($mac)
1251*4882a593Smuzhiyun	sb	$tmp0,9($mac)
1252*4882a593Smuzhiyun	srl	$tmp0,$in3,8
1253*4882a593Smuzhiyun	sb	$tmp1,10($mac)
1254*4882a593Smuzhiyun	srl	$tmp1,$in3,16
1255*4882a593Smuzhiyun	sb	$tmp2,11($mac)
1256*4882a593Smuzhiyun	srl	$tmp2,$in3,24
1257*4882a593Smuzhiyun	sb	$in3, 12($mac)
1258*4882a593Smuzhiyun	sb	$tmp0,13($mac)
1259*4882a593Smuzhiyun	sb	$tmp1,14($mac)
1260*4882a593Smuzhiyun	sb	$tmp2,15($mac)
1261*4882a593Smuzhiyun
1262*4882a593Smuzhiyun	jr	$ra
1263*4882a593Smuzhiyun.end	poly1305_emit
1264*4882a593Smuzhiyun.rdata
1265*4882a593Smuzhiyun.asciiz	"Poly1305 for MIPS32, CRYPTOGAMS by \@dot-asm"
1266*4882a593Smuzhiyun.align	2
1267*4882a593Smuzhiyun___
1268*4882a593Smuzhiyun}
1269*4882a593Smuzhiyun}}}
1270*4882a593Smuzhiyun
# Write the generated assembly.  The last command-line argument, when present
# and true, names the output file; otherwise the text goes to the inherited
# STDOUT, exactly as before.
#
# Both the open and the final close are checked: an unwritable path must not
# silently redirect the code to the caller's STDOUT, and buffered write errors
# (e.g. a full disk) only surface at close time.  Either failure now aborts
# with a diagnostic instead of leaving a missing or truncated file behind.
if ($output = pop) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";
1274