xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/aes_ctrby8_avx-x86_64.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/*
2*4882a593Smuzhiyun *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
3*4882a593Smuzhiyun *
4*4882a593Smuzhiyun * This is AES128/192/256 CTR mode optimization implementation. It requires
5*4882a593Smuzhiyun * the support of Intel(R) AESNI and AVX instructions.
6*4882a593Smuzhiyun *
7*4882a593Smuzhiyun * This work was inspired by the AES CTR mode optimization published
8*4882a593Smuzhiyun * in Intel Optimized IPSEC Cryptographic library.
9*4882a593Smuzhiyun * Additional information on it can be found at:
10*4882a593Smuzhiyun *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
11*4882a593Smuzhiyun *
12*4882a593Smuzhiyun * This file is provided under a dual BSD/GPLv2 license.  When using or
13*4882a593Smuzhiyun * redistributing this file, you may do so under either license.
14*4882a593Smuzhiyun *
15*4882a593Smuzhiyun * GPL LICENSE SUMMARY
16*4882a593Smuzhiyun *
17*4882a593Smuzhiyun * Copyright(c) 2014 Intel Corporation.
18*4882a593Smuzhiyun *
19*4882a593Smuzhiyun * This program is free software; you can redistribute it and/or modify
20*4882a593Smuzhiyun * it under the terms of version 2 of the GNU General Public License as
21*4882a593Smuzhiyun * published by the Free Software Foundation.
22*4882a593Smuzhiyun *
23*4882a593Smuzhiyun * This program is distributed in the hope that it will be useful, but
24*4882a593Smuzhiyun * WITHOUT ANY WARRANTY; without even the implied warranty of
25*4882a593Smuzhiyun * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
26*4882a593Smuzhiyun * General Public License for more details.
27*4882a593Smuzhiyun *
28*4882a593Smuzhiyun * Contact Information:
29*4882a593Smuzhiyun * James Guilford <james.guilford@intel.com>
30*4882a593Smuzhiyun * Sean Gulley <sean.m.gulley@intel.com>
31*4882a593Smuzhiyun * Chandramouli Narayanan <mouli@linux.intel.com>
32*4882a593Smuzhiyun *
33*4882a593Smuzhiyun * BSD LICENSE
34*4882a593Smuzhiyun *
35*4882a593Smuzhiyun * Copyright(c) 2014 Intel Corporation.
36*4882a593Smuzhiyun *
37*4882a593Smuzhiyun * Redistribution and use in source and binary forms, with or without
38*4882a593Smuzhiyun * modification, are permitted provided that the following conditions
39*4882a593Smuzhiyun * are met:
40*4882a593Smuzhiyun *
41*4882a593Smuzhiyun * Redistributions of source code must retain the above copyright
42*4882a593Smuzhiyun * notice, this list of conditions and the following disclaimer.
43*4882a593Smuzhiyun * Redistributions in binary form must reproduce the above copyright
44*4882a593Smuzhiyun * notice, this list of conditions and the following disclaimer in
45*4882a593Smuzhiyun * the documentation and/or other materials provided with the
46*4882a593Smuzhiyun * distribution.
47*4882a593Smuzhiyun * Neither the name of Intel Corporation nor the names of its
48*4882a593Smuzhiyun * contributors may be used to endorse or promote products derived
49*4882a593Smuzhiyun * from this software without specific prior written permission.
50*4882a593Smuzhiyun *
51*4882a593Smuzhiyun * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
52*4882a593Smuzhiyun * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
53*4882a593Smuzhiyun * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
54*4882a593Smuzhiyun * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
55*4882a593Smuzhiyun * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
56*4882a593Smuzhiyun * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
57*4882a593Smuzhiyun * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
58*4882a593Smuzhiyun * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
59*4882a593Smuzhiyun * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
60*4882a593Smuzhiyun * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
61*4882a593Smuzhiyun * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
62*4882a593Smuzhiyun *
63*4882a593Smuzhiyun */
64*4882a593Smuzhiyun
65*4882a593Smuzhiyun#include <linux/linkage.h>
66*4882a593Smuzhiyun
/* Unaligned 128-bit move; used for every access to the in/out data buffers. */
67*4882a593Smuzhiyun#define VMOVDQ		vmovdqu
68*4882a593Smuzhiyun
/*
 * xdata0..xdata7: up to eight counter blocks that are encrypted in parallel
 * and then XORed with the input (selected by index through the club macro).
 */
69*4882a593Smuzhiyun#define xdata0		%xmm0
70*4882a593Smuzhiyun#define xdata1		%xmm1
71*4882a593Smuzhiyun#define xdata2		%xmm2
72*4882a593Smuzhiyun#define xdata3		%xmm3
73*4882a593Smuzhiyun#define xdata4		%xmm4
74*4882a593Smuzhiyun#define xdata5		%xmm5
75*4882a593Smuzhiyun#define xdata6		%xmm6
76*4882a593Smuzhiyun#define xdata7		%xmm7
/* Running counter, kept byte-swapped relative to the IV stored at p_iv. */
77*4882a593Smuzhiyun#define xcounter	%xmm8
/* vpshufb mask loaded from byteswap_const. */
78*4882a593Smuzhiyun#define xbyteswap	%xmm9
/*
 * Round keys that stay resident across do_aes invocations when load_keys
 * is 0.  xkey0 is always round key 0; which round keys xkey4/xkey8/xkey12
 * hold depends on the key length (see do_aes).
 */
79*4882a593Smuzhiyun#define xkey0		%xmm10
80*4882a593Smuzhiyun#define xkey4		%xmm11
81*4882a593Smuzhiyun#define xkey8		%xmm12
82*4882a593Smuzhiyun#define xkey12		%xmm13
/* Scratch registers: reloaded round keys, later reused for input data. */
83*4882a593Smuzhiyun#define xkeyA		%xmm14
84*4882a593Smuzhiyun#define xkeyB		%xmm15
85*4882a593Smuzhiyun
/*
 * Argument registers, in the order given by the prototypes documented at
 * the entry points: (in, iv, keys, out, num_bytes).
 */
86*4882a593Smuzhiyun#define p_in		%rdi
87*4882a593Smuzhiyun#define p_iv		%rsi
88*4882a593Smuzhiyun#define p_keys		%rdx
89*4882a593Smuzhiyun#define p_out		%rcx
90*4882a593Smuzhiyun#define num_bytes	%r8
91*4882a593Smuzhiyun
/* General-purpose scratch register (holds the partial-block byte count). */
92*4882a593Smuzhiyun#define tmp		%r10
/* Selector values for the club macro; no use of DDQ_DATA is visible in this file. */
93*4882a593Smuzhiyun#define	DDQ_DATA	0
94*4882a593Smuzhiyun#define	XDATA		1
/* Key-length selectors passed to do_aes / do_aes_ctrmain. */
95*4882a593Smuzhiyun#define KEY_128		1
96*4882a593Smuzhiyun#define KEY_192		2
97*4882a593Smuzhiyun#define KEY_256		3
98*4882a593Smuzhiyun
/* 16-byte aligned constants; byteswap_const is loaded with vmovdqa. */
99*4882a593Smuzhiyun.section .rodata
100*4882a593Smuzhiyun.align 16
101*4882a593Smuzhiyun
/* vpshufb mask that reverses the order of the 16 bytes of a register. */
102*4882a593Smuzhiyunbyteswap_const:
103*4882a593Smuzhiyun	.octa 0x000102030405060708090A0B0C0D0E0F
/* Selects the low 64 bits; vptest against it detects a low qword of zero. */
104*4882a593Smuzhiyunddq_low_msk:
105*4882a593Smuzhiyun	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* Adds 1 to the high 64 bits (carry out of the low qword). */
106*4882a593Smuzhiyunddq_high_add_1:
107*4882a593Smuzhiyun	.octa 0x00000000000000010000000000000000
/*
 * Counter increments 1..8.  The entries must stay contiguous and in order:
 * do_aes addresses increment k as (ddq_add_1 + 16 * (k - 1)).
 */
108*4882a593Smuzhiyunddq_add_1:
109*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000001
110*4882a593Smuzhiyunddq_add_2:
111*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000002
112*4882a593Smuzhiyunddq_add_3:
113*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000003
114*4882a593Smuzhiyunddq_add_4:
115*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000004
116*4882a593Smuzhiyunddq_add_5:
117*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000005
118*4882a593Smuzhiyunddq_add_6:
119*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000006
120*4882a593Smuzhiyunddq_add_7:
121*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000007
122*4882a593Smuzhiyunddq_add_8:
123*4882a593Smuzhiyun	.octa 0x00000000000000000000000000000008
124*4882a593Smuzhiyun
125*4882a593Smuzhiyun.text
126*4882a593Smuzhiyun
127*4882a593Smuzhiyun/* generate a unique variable for ddq_add_x */
128*4882a593Smuzhiyun
129*4882a593Smuzhiyun/* generate a unique variable for xmm register */
/*
 * setxdata n
 * Binds the assembler symbol var_xdata to register %xmm<n>, where <n> is the
 * literal text of the argument.  Invoked by club, which passes an already
 * evaluated number.
 */
130*4882a593Smuzhiyun.macro setxdata n
131*4882a593Smuzhiyun	var_xdata = %xmm\n
132*4882a593Smuzhiyun.endm
133*4882a593Smuzhiyun
134*4882a593Smuzhiyun/* club the numeric 'id' to the symbol 'name' */
/*
 * club name, id
 * When name is XDATA, points var_xdata at %xmm<id>.  Any other name value
 * expands to nothing.
 *
 * .altmacro is switched on only for the duration of the expansion so that
 * the '%' prefix on the argument evaluates the expression 'id' (for example
 * the loop symbol i) and hands setxdata its numeric value rather than the
 * symbol name.
 */
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun.macro club name, id
137*4882a593Smuzhiyun.altmacro
138*4882a593Smuzhiyun	.if \name == XDATA
139*4882a593Smuzhiyun		setxdata %\id
140*4882a593Smuzhiyun	.endif
141*4882a593Smuzhiyun.noaltmacro
142*4882a593Smuzhiyun.endm
143*4882a593Smuzhiyun
144*4882a593Smuzhiyun/*
145*4882a593Smuzhiyun * do_aes num_in_par load_keys key_len
146*4882a593Smuzhiyun * This increments p_in, but not p_out
 *
 * Encrypts 'b' consecutive counter blocks and XORs them with 'b' input
 * blocks, writing the result to p_out.
 *
 *   b        number of 16-byte blocks handled by this expansion; the callers
 *            in this file pass 1..8 (xdata0..xdata7 are the only data
 *            registers available)
 *   k        non-zero: (re)load the resident round keys xkey0, xkey4, xkey8
 *            and xkey12 from p_keys; zero: assume they are already loaded
 *   key_len  KEY_128, KEY_192 or KEY_256; selects 10, 12 or 14 rounds
 *
 * Inputs:  xcounter (byte-swapped counter), xbyteswap, p_keys, p_in, p_out.
 * Outputs: xcounter advanced by 'b', p_in advanced by 16*b.
 * Clobbers xdata0..xdata<b-1>, xkeyA, xkeyB and the flags.
 *
 * Resident round keys:
 *   KEY_128:         xkey4 = key 3, xkey8 = key 6, xkey12 = key 9
 *   KEY_192/KEY_256: xkey4 = key 4, xkey8 = key 8, xkey12 = key 12
 * All other round keys are streamed through xkeyA/xkeyB, each load being
 * issued ahead of the round that consumes it.
147*4882a593Smuzhiyun */
148*4882a593Smuzhiyun.macro do_aes b, k, key_len
149*4882a593Smuzhiyun	.set by, \b
150*4882a593Smuzhiyun	.set load_keys, \k
151*4882a593Smuzhiyun	.set klen, \key_len
152*4882a593Smuzhiyun
153*4882a593Smuzhiyun	.if (load_keys)
154*4882a593Smuzhiyun		vmovdqa	0*16(p_keys), xkey0
155*4882a593Smuzhiyun	.endif
156*4882a593Smuzhiyun
	/* Block 0 uses the current counter value as is. */
157*4882a593Smuzhiyun	vpshufb	xbyteswap, xcounter, xdata0
158*4882a593Smuzhiyun
	/*
	 * Blocks 1..by-1: counter + i.  vpaddq adds the two 64-bit halves
	 * independently, so a wrap of the low half is detected by testing it
	 * for zero (vptest sets ZF when the masked value is all zeros) and the
	 * carry is then added to the high half of both this block and
	 * xcounter, which keeps the following blocks correct as well.
	 */
159*4882a593Smuzhiyun	.set i, 1
160*4882a593Smuzhiyun	.rept (by - 1)
161*4882a593Smuzhiyun		club XDATA, i
162*4882a593Smuzhiyun		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
163*4882a593Smuzhiyun		vptest	ddq_low_msk(%rip), var_xdata
164*4882a593Smuzhiyun		jnz 1f
165*4882a593Smuzhiyun		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
166*4882a593Smuzhiyun		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
167*4882a593Smuzhiyun		1:
168*4882a593Smuzhiyun		vpshufb	xbyteswap, var_xdata, var_xdata
169*4882a593Smuzhiyun		.set i, (i +1)
170*4882a593Smuzhiyun	.endr
171*4882a593Smuzhiyun
172*4882a593Smuzhiyun	vmovdqa	1*16(p_keys), xkeyA
173*4882a593Smuzhiyun
	/*
	 * Round 0 (key whitening) for block 0, then advance xcounter by 'by'
	 * for the next invocation, with the same low-half carry handling.
	 */
174*4882a593Smuzhiyun	vpxor	xkey0, xdata0, xdata0
175*4882a593Smuzhiyun	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
176*4882a593Smuzhiyun	vptest	ddq_low_msk(%rip), xcounter
177*4882a593Smuzhiyun	jnz	1f
178*4882a593Smuzhiyun	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
179*4882a593Smuzhiyun	1:
180*4882a593Smuzhiyun
	/* Round 0 for the remaining blocks. */
181*4882a593Smuzhiyun	.set i, 1
182*4882a593Smuzhiyun	.rept (by - 1)
183*4882a593Smuzhiyun		club XDATA, i
184*4882a593Smuzhiyun		vpxor	xkey0, var_xdata, var_xdata
185*4882a593Smuzhiyun		.set i, (i +1)
186*4882a593Smuzhiyun	.endr
187*4882a593Smuzhiyun
188*4882a593Smuzhiyun	vmovdqa	2*16(p_keys), xkeyB
189*4882a593Smuzhiyun
190*4882a593Smuzhiyun	.set i, 0
191*4882a593Smuzhiyun	.rept by
192*4882a593Smuzhiyun		club XDATA, i
193*4882a593Smuzhiyun		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
194*4882a593Smuzhiyun		.set i, (i +1)
195*4882a593Smuzhiyun	.endr
196*4882a593Smuzhiyun
	/* Key 3: resident in xkey4 for KEY_128, streamed via xkeyA otherwise. */
197*4882a593Smuzhiyun	.if (klen == KEY_128)
198*4882a593Smuzhiyun		.if (load_keys)
199*4882a593Smuzhiyun			vmovdqa	3*16(p_keys), xkey4
200*4882a593Smuzhiyun		.endif
201*4882a593Smuzhiyun	.else
202*4882a593Smuzhiyun		vmovdqa	3*16(p_keys), xkeyA
203*4882a593Smuzhiyun	.endif
204*4882a593Smuzhiyun
205*4882a593Smuzhiyun	.set i, 0
206*4882a593Smuzhiyun	.rept by
207*4882a593Smuzhiyun		club XDATA, i
208*4882a593Smuzhiyun		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
209*4882a593Smuzhiyun		.set i, (i +1)
210*4882a593Smuzhiyun	.endr
211*4882a593Smuzhiyun
	/*
	 * p_in is advanced here, between rounds; the input loads at the end
	 * of the macro therefore use negative offsets (k*16 - 16*by).
	 */
212*4882a593Smuzhiyun	add	$(16*by), p_in
213*4882a593Smuzhiyun
	/* Key 4: streamed via xkeyB for KEY_128, resident in xkey4 otherwise. */
214*4882a593Smuzhiyun	.if (klen == KEY_128)
215*4882a593Smuzhiyun		vmovdqa	4*16(p_keys), xkeyB
216*4882a593Smuzhiyun	.else
217*4882a593Smuzhiyun		.if (load_keys)
218*4882a593Smuzhiyun			vmovdqa	4*16(p_keys), xkey4
219*4882a593Smuzhiyun		.endif
220*4882a593Smuzhiyun	.endif
221*4882a593Smuzhiyun
222*4882a593Smuzhiyun	.set i, 0
223*4882a593Smuzhiyun	.rept by
224*4882a593Smuzhiyun		club XDATA, i
225*4882a593Smuzhiyun		/* key 3 */
226*4882a593Smuzhiyun		.if (klen == KEY_128)
227*4882a593Smuzhiyun			vaesenc	xkey4, var_xdata, var_xdata
228*4882a593Smuzhiyun		.else
229*4882a593Smuzhiyun			vaesenc	xkeyA, var_xdata, var_xdata
230*4882a593Smuzhiyun		.endif
231*4882a593Smuzhiyun		.set i, (i +1)
232*4882a593Smuzhiyun	.endr
233*4882a593Smuzhiyun
234*4882a593Smuzhiyun	vmovdqa	5*16(p_keys), xkeyA
235*4882a593Smuzhiyun
236*4882a593Smuzhiyun	.set i, 0
237*4882a593Smuzhiyun	.rept by
238*4882a593Smuzhiyun		club XDATA, i
239*4882a593Smuzhiyun		/* key 4 */
240*4882a593Smuzhiyun		.if (klen == KEY_128)
241*4882a593Smuzhiyun			vaesenc	xkeyB, var_xdata, var_xdata
242*4882a593Smuzhiyun		.else
243*4882a593Smuzhiyun			vaesenc	xkey4, var_xdata, var_xdata
244*4882a593Smuzhiyun		.endif
245*4882a593Smuzhiyun		.set i, (i +1)
246*4882a593Smuzhiyun	.endr
247*4882a593Smuzhiyun
	/* Key 6: resident in xkey8 for KEY_128, streamed via xkeyB otherwise. */
248*4882a593Smuzhiyun	.if (klen == KEY_128)
249*4882a593Smuzhiyun		.if (load_keys)
250*4882a593Smuzhiyun			vmovdqa	6*16(p_keys), xkey8
251*4882a593Smuzhiyun		.endif
252*4882a593Smuzhiyun	.else
253*4882a593Smuzhiyun		vmovdqa	6*16(p_keys), xkeyB
254*4882a593Smuzhiyun	.endif
255*4882a593Smuzhiyun
256*4882a593Smuzhiyun	.set i, 0
257*4882a593Smuzhiyun	.rept by
258*4882a593Smuzhiyun		club XDATA, i
259*4882a593Smuzhiyun		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
260*4882a593Smuzhiyun		.set i, (i +1)
261*4882a593Smuzhiyun	.endr
262*4882a593Smuzhiyun
263*4882a593Smuzhiyun	vmovdqa	7*16(p_keys), xkeyA
264*4882a593Smuzhiyun
265*4882a593Smuzhiyun	.set i, 0
266*4882a593Smuzhiyun	.rept by
267*4882a593Smuzhiyun		club XDATA, i
268*4882a593Smuzhiyun		/* key 6 */
269*4882a593Smuzhiyun		.if (klen == KEY_128)
270*4882a593Smuzhiyun			vaesenc	xkey8, var_xdata, var_xdata
271*4882a593Smuzhiyun		.else
272*4882a593Smuzhiyun			vaesenc	xkeyB, var_xdata, var_xdata
273*4882a593Smuzhiyun		.endif
274*4882a593Smuzhiyun		.set i, (i +1)
275*4882a593Smuzhiyun	.endr
276*4882a593Smuzhiyun
	/* Key 8: streamed via xkeyB for KEY_128, resident in xkey8 otherwise. */
277*4882a593Smuzhiyun	.if (klen == KEY_128)
278*4882a593Smuzhiyun		vmovdqa	8*16(p_keys), xkeyB
279*4882a593Smuzhiyun	.else
280*4882a593Smuzhiyun		.if (load_keys)
281*4882a593Smuzhiyun			vmovdqa	8*16(p_keys), xkey8
282*4882a593Smuzhiyun		.endif
283*4882a593Smuzhiyun	.endif
284*4882a593Smuzhiyun
285*4882a593Smuzhiyun	.set i, 0
286*4882a593Smuzhiyun	.rept by
287*4882a593Smuzhiyun		club XDATA, i
288*4882a593Smuzhiyun		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
289*4882a593Smuzhiyun		.set i, (i +1)
290*4882a593Smuzhiyun	.endr
291*4882a593Smuzhiyun
	/* Key 9: resident in xkey12 for KEY_128, streamed via xkeyA otherwise. */
292*4882a593Smuzhiyun	.if (klen == KEY_128)
293*4882a593Smuzhiyun		.if (load_keys)
294*4882a593Smuzhiyun			vmovdqa	9*16(p_keys), xkey12
295*4882a593Smuzhiyun		.endif
296*4882a593Smuzhiyun	.else
297*4882a593Smuzhiyun		vmovdqa	9*16(p_keys), xkeyA
298*4882a593Smuzhiyun	.endif
299*4882a593Smuzhiyun
300*4882a593Smuzhiyun	.set i, 0
301*4882a593Smuzhiyun	.rept by
302*4882a593Smuzhiyun		club XDATA, i
303*4882a593Smuzhiyun		/* key 8 */
304*4882a593Smuzhiyun		.if (klen == KEY_128)
305*4882a593Smuzhiyun			vaesenc	xkeyB, var_xdata, var_xdata
306*4882a593Smuzhiyun		.else
307*4882a593Smuzhiyun			vaesenc	xkey8, var_xdata, var_xdata
308*4882a593Smuzhiyun		.endif
309*4882a593Smuzhiyun		.set i, (i +1)
310*4882a593Smuzhiyun	.endr
311*4882a593Smuzhiyun
312*4882a593Smuzhiyun	vmovdqa	10*16(p_keys), xkeyB
313*4882a593Smuzhiyun
314*4882a593Smuzhiyun	.set i, 0
315*4882a593Smuzhiyun	.rept by
316*4882a593Smuzhiyun		club XDATA, i
317*4882a593Smuzhiyun		/* key 9 */
318*4882a593Smuzhiyun		.if (klen == KEY_128)
319*4882a593Smuzhiyun			vaesenc	xkey12, var_xdata, var_xdata
320*4882a593Smuzhiyun		.else
321*4882a593Smuzhiyun			vaesenc	xkeyA, var_xdata, var_xdata
322*4882a593Smuzhiyun		.endif
323*4882a593Smuzhiyun		.set i, (i +1)
324*4882a593Smuzhiyun	.endr
325*4882a593Smuzhiyun
326*4882a593Smuzhiyun	.if (klen != KEY_128)
327*4882a593Smuzhiyun		vmovdqa	11*16(p_keys), xkeyA
328*4882a593Smuzhiyun	.endif
329*4882a593Smuzhiyun
	/* Key 10 is the final round for KEY_128, an ordinary round otherwise. */
330*4882a593Smuzhiyun	.set i, 0
331*4882a593Smuzhiyun	.rept by
332*4882a593Smuzhiyun		club XDATA, i
333*4882a593Smuzhiyun		/* key 10 */
334*4882a593Smuzhiyun		.if (klen == KEY_128)
335*4882a593Smuzhiyun			vaesenclast	xkeyB, var_xdata, var_xdata
336*4882a593Smuzhiyun		.else
337*4882a593Smuzhiyun			vaesenc	xkeyB, var_xdata, var_xdata
338*4882a593Smuzhiyun		.endif
339*4882a593Smuzhiyun		.set i, (i +1)
340*4882a593Smuzhiyun	.endr
341*4882a593Smuzhiyun
	/* Extra rounds for the longer key sizes. */
342*4882a593Smuzhiyun	.if (klen != KEY_128)
343*4882a593Smuzhiyun		.if (load_keys)
344*4882a593Smuzhiyun			vmovdqa	12*16(p_keys), xkey12
345*4882a593Smuzhiyun		.endif
346*4882a593Smuzhiyun
347*4882a593Smuzhiyun		.set i, 0
348*4882a593Smuzhiyun		.rept by
349*4882a593Smuzhiyun			club XDATA, i
350*4882a593Smuzhiyun			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
351*4882a593Smuzhiyun			.set i, (i +1)
352*4882a593Smuzhiyun		.endr
353*4882a593Smuzhiyun
354*4882a593Smuzhiyun		.if (klen == KEY_256)
355*4882a593Smuzhiyun			vmovdqa	13*16(p_keys), xkeyA
356*4882a593Smuzhiyun		.endif
357*4882a593Smuzhiyun
		/* Key 12 is the final round for KEY_192. */
358*4882a593Smuzhiyun		.set i, 0
359*4882a593Smuzhiyun		.rept by
360*4882a593Smuzhiyun			club XDATA, i
361*4882a593Smuzhiyun			.if (klen == KEY_256)
362*4882a593Smuzhiyun				/* key 12 */
363*4882a593Smuzhiyun				vaesenc	xkey12, var_xdata, var_xdata
364*4882a593Smuzhiyun			.else
365*4882a593Smuzhiyun				vaesenclast xkey12, var_xdata, var_xdata
366*4882a593Smuzhiyun			.endif
367*4882a593Smuzhiyun			.set i, (i +1)
368*4882a593Smuzhiyun		.endr
369*4882a593Smuzhiyun
370*4882a593Smuzhiyun		.if (klen == KEY_256)
371*4882a593Smuzhiyun			vmovdqa	14*16(p_keys), xkeyB
372*4882a593Smuzhiyun
373*4882a593Smuzhiyun			.set i, 0
374*4882a593Smuzhiyun			.rept by
375*4882a593Smuzhiyun				club XDATA, i
376*4882a593Smuzhiyun				/* key 13 */
377*4882a593Smuzhiyun				vaesenc	xkeyA, var_xdata, var_xdata
378*4882a593Smuzhiyun				.set i, (i +1)
379*4882a593Smuzhiyun			.endr
380*4882a593Smuzhiyun
381*4882a593Smuzhiyun			.set i, 0
382*4882a593Smuzhiyun			.rept by
383*4882a593Smuzhiyun				club XDATA, i
384*4882a593Smuzhiyun				/* key 14 */
385*4882a593Smuzhiyun				vaesenclast	xkeyB, var_xdata, var_xdata
386*4882a593Smuzhiyun				.set i, (i +1)
387*4882a593Smuzhiyun			.endr
388*4882a593Smuzhiyun		.endif
389*4882a593Smuzhiyun	.endif
390*4882a593Smuzhiyun
	/*
	 * XOR the keystream with the input, two blocks per step; xkeyA and
	 * xkeyB are free now and serve as input staging registers.
	 */
391*4882a593Smuzhiyun	.set i, 0
392*4882a593Smuzhiyun	.rept (by / 2)
393*4882a593Smuzhiyun		.set j, (i+1)
394*4882a593Smuzhiyun		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
395*4882a593Smuzhiyun		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
396*4882a593Smuzhiyun		club XDATA, i
397*4882a593Smuzhiyun		vpxor	xkeyA, var_xdata, var_xdata
398*4882a593Smuzhiyun		club XDATA, j
399*4882a593Smuzhiyun		vpxor	xkeyB, var_xdata, var_xdata
400*4882a593Smuzhiyun		.set i, (i+2)
401*4882a593Smuzhiyun	.endr
402*4882a593Smuzhiyun
	/* Odd block count: one block is left over after the paired loop. */
403*4882a593Smuzhiyun	.if (i < by)
404*4882a593Smuzhiyun		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
405*4882a593Smuzhiyun		club XDATA, i
406*4882a593Smuzhiyun		vpxor	xkeyA, var_xdata, var_xdata
407*4882a593Smuzhiyun	.endif
408*4882a593Smuzhiyun
	/* Store the results; p_out itself is advanced by the caller. */
409*4882a593Smuzhiyun	.set i, 0
410*4882a593Smuzhiyun	.rept by
411*4882a593Smuzhiyun		club XDATA, i
412*4882a593Smuzhiyun		VMOVDQ	var_xdata, i*16(p_out)
413*4882a593Smuzhiyun		.set i, (i+1)
414*4882a593Smuzhiyun	.endr
415*4882a593Smuzhiyun.endm
416*4882a593Smuzhiyun
/*
 * do_aes_load val, key_len
 * Processes 'val' blocks via do_aes with load_keys = 1, i.e. the resident
 * round keys xkey0/xkey4/xkey8/xkey12 are loaded from p_keys on the way.
 */
417*4882a593Smuzhiyun.macro do_aes_load val, key_len
418*4882a593Smuzhiyun	do_aes \val, 1, \key_len
419*4882a593Smuzhiyun.endm
420*4882a593Smuzhiyun
/*
 * do_aes_noload val, key_len
 * Processes 'val' blocks via do_aes with load_keys = 0; xkey0, xkey4, xkey8
 * and xkey12 must already hold the round keys do_aes expects for key_len.
 */
421*4882a593Smuzhiyun.macro do_aes_noload val, key_len
422*4882a593Smuzhiyun	do_aes \val, 0, \key_len
423*4882a593Smuzhiyun.endm
424*4882a593Smuzhiyun
425*4882a593Smuzhiyun/* main body of aes ctr load */
/*
 * do_aes_ctrmain key_len
 *
 * Complete function body (it ends in RET) for one key size; key_len is
 * KEY_128, KEY_192 or KEY_256 and is also appended to every local label so
 * that the three expansions do not collide.
 *
 * Flow:
 *   - fewer than 16 bytes: return at once without touching the IV
 *   - (num_bytes / 16) % 8 leading blocks, if any, are handled by a single
 *     do_aes_load of that size, which also loads the resident round keys
 *   - the rest is handled eight blocks at a time by do_aes_noload
 *   - the advanced counter is byte-swapped back and stored to p_iv
 *
 * Only whole 16-byte blocks are processed; a trailing partial block in
 * num_bytes is ignored on every path.
 *
 * NOTE(review): the documented prototype declares num_bytes as unsigned int
 * while this code reads all 64 bits of %r8 - confirm that callers pass a
 * zero-extended value.
 */
426*4882a593Smuzhiyun
427*4882a593Smuzhiyun.macro do_aes_ctrmain key_len
428*4882a593Smuzhiyun	cmp	$16, num_bytes
	/*
	 * Nothing to do.  xbyteswap and xcounter have not been loaded yet, so
	 * this must not go through the IV write-back at .Ldo_return2.
	 */
429*4882a593Smuzhiyun	jb	.Ldo_return_noiv\key_len
430*4882a593Smuzhiyun
	/* Load the IV and keep it byte-swapped so vpaddq can increment it. */
431*4882a593Smuzhiyun	vmovdqa	byteswap_const(%rip), xbyteswap
432*4882a593Smuzhiyun	vmovdqu	(p_iv), xcounter
433*4882a593Smuzhiyun	vpshufb	xbyteswap, xcounter, xcounter
434*4882a593Smuzhiyun
	/* tmp = 16 * (number of whole blocks modulo 8) */
435*4882a593Smuzhiyun	mov	num_bytes, tmp
436*4882a593Smuzhiyun	and	$(7*16), tmp
437*4882a593Smuzhiyun	jz	.Lmult_of_8_blks\key_len
438*4882a593Smuzhiyun
	/* Binary dispatch on the leading block count. */
439*4882a593Smuzhiyun	/* 1 <= tmp <= 7 */
440*4882a593Smuzhiyun	cmp	$(4*16), tmp
441*4882a593Smuzhiyun	jg	.Lgt4\key_len
442*4882a593Smuzhiyun	je	.Leq4\key_len
443*4882a593Smuzhiyun
444*4882a593Smuzhiyun.Llt4\key_len:
445*4882a593Smuzhiyun	cmp	$(2*16), tmp
446*4882a593Smuzhiyun	jg	.Leq3\key_len
447*4882a593Smuzhiyun	je	.Leq2\key_len
448*4882a593Smuzhiyun
	/*
	 * Each .LeqN case below: process N blocks, advance p_out, then round
	 * num_bytes down to a multiple of 8 blocks.  The mask (~7*16) is
	 * (~7)*16 == -128, so it also discards any partial trailing block.
	 */
449*4882a593Smuzhiyun.Leq1\key_len:
450*4882a593Smuzhiyun	do_aes_load	1, \key_len
451*4882a593Smuzhiyun	add	$(1*16), p_out
452*4882a593Smuzhiyun	and	$(~7*16), num_bytes
453*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
454*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
455*4882a593Smuzhiyun
456*4882a593Smuzhiyun.Leq2\key_len:
457*4882a593Smuzhiyun	do_aes_load	2, \key_len
458*4882a593Smuzhiyun	add	$(2*16), p_out
459*4882a593Smuzhiyun	and	$(~7*16), num_bytes
460*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
461*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
462*4882a593Smuzhiyun
463*4882a593Smuzhiyun
464*4882a593Smuzhiyun.Leq3\key_len:
465*4882a593Smuzhiyun	do_aes_load	3, \key_len
466*4882a593Smuzhiyun	add	$(3*16), p_out
467*4882a593Smuzhiyun	and	$(~7*16), num_bytes
468*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
469*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
470*4882a593Smuzhiyun
471*4882a593Smuzhiyun.Leq4\key_len:
472*4882a593Smuzhiyun	do_aes_load	4, \key_len
473*4882a593Smuzhiyun	add	$(4*16), p_out
474*4882a593Smuzhiyun	and	$(~7*16), num_bytes
475*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
476*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
477*4882a593Smuzhiyun
478*4882a593Smuzhiyun.Lgt4\key_len:
479*4882a593Smuzhiyun	cmp	$(6*16), tmp
480*4882a593Smuzhiyun	jg	.Leq7\key_len
481*4882a593Smuzhiyun	je	.Leq6\key_len
482*4882a593Smuzhiyun
483*4882a593Smuzhiyun.Leq5\key_len:
484*4882a593Smuzhiyun	do_aes_load	5, \key_len
485*4882a593Smuzhiyun	add	$(5*16), p_out
486*4882a593Smuzhiyun	and	$(~7*16), num_bytes
487*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
488*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
489*4882a593Smuzhiyun
490*4882a593Smuzhiyun.Leq6\key_len:
491*4882a593Smuzhiyun	do_aes_load	6, \key_len
492*4882a593Smuzhiyun	add	$(6*16), p_out
493*4882a593Smuzhiyun	and	$(~7*16), num_bytes
494*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
495*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
496*4882a593Smuzhiyun
497*4882a593Smuzhiyun.Leq7\key_len:
498*4882a593Smuzhiyun	do_aes_load	7, \key_len
499*4882a593Smuzhiyun	add	$(7*16), p_out
500*4882a593Smuzhiyun	and	$(~7*16), num_bytes
501*4882a593Smuzhiyun	jz	.Ldo_return2\key_len
502*4882a593Smuzhiyun	jmp	.Lmain_loop2\key_len
503*4882a593Smuzhiyun
504*4882a593Smuzhiyun.Lmult_of_8_blks\key_len:
	/*
	 * Drop any partial trailing block here as well.  Without this a
	 * length such as 8*16 + 5 would never hit zero in the loop below and
	 * the loop would run past the end of the buffers.  num_bytes is at
	 * least 8*16 on this path, so the result is still non-zero.
	 */
	and	$(~7*16), num_bytes
	/*
	 * No do_aes_load ran on this path, so load the resident round keys
	 * that do_aes_noload relies on.
	 */
505*4882a593Smuzhiyun	.if (\key_len != KEY_128)
506*4882a593Smuzhiyun		vmovdqa	0*16(p_keys), xkey0
507*4882a593Smuzhiyun		vmovdqa	4*16(p_keys), xkey4
508*4882a593Smuzhiyun		vmovdqa	8*16(p_keys), xkey8
509*4882a593Smuzhiyun		vmovdqa	12*16(p_keys), xkey12
510*4882a593Smuzhiyun	.else
511*4882a593Smuzhiyun		vmovdqa	0*16(p_keys), xkey0
512*4882a593Smuzhiyun		vmovdqa	3*16(p_keys), xkey4
513*4882a593Smuzhiyun		vmovdqa	6*16(p_keys), xkey8
514*4882a593Smuzhiyun		vmovdqa	9*16(p_keys), xkey12
515*4882a593Smuzhiyun	.endif
516*4882a593Smuzhiyun.align 16
517*4882a593Smuzhiyun.Lmain_loop2\key_len:
518*4882a593Smuzhiyun	/* num_bytes is a multiple of 8 blocks (8*16 bytes) and >0 */
519*4882a593Smuzhiyun	do_aes_noload	8, \key_len
520*4882a593Smuzhiyun	add	$(8*16), p_out
521*4882a593Smuzhiyun	sub	$(8*16), num_bytes
522*4882a593Smuzhiyun	jne	.Lmain_loop2\key_len
523*4882a593Smuzhiyun
524*4882a593Smuzhiyun.Ldo_return2\key_len:
525*4882a593Smuzhiyun	/* return updated IV */
526*4882a593Smuzhiyun	vpshufb	xbyteswap, xcounter, xcounter
527*4882a593Smuzhiyun	vmovdqu	xcounter, (p_iv)
.Ldo_return_noiv\key_len:
528*4882a593Smuzhiyun	RET
529*4882a593Smuzhiyun.endm
530*4882a593Smuzhiyun
531*4882a593Smuzhiyun/*
532*4882a593Smuzhiyun * routine to do AES128 CTR enc/decrypt "by8"
533*4882a593Smuzhiyun * XMM registers are clobbered.
534*4882a593Smuzhiyun * Saving/restoring must be done at a higher level
535*4882a593Smuzhiyun * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
536*4882a593Smuzhiyun *			unsigned int num_bytes)
 *
 * Arguments are taken from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes).
 * keys is read with aligned loads (vmovdqa), round keys 0..10 at 16-byte
 * stride; in, out and iv are accessed with unaligned moves.
 * When num_bytes >= 16, iv is overwritten with the counter value that
 * follows the last processed block.
 * num_bytes is presumably a multiple of 16 - verify against the caller.
537*4882a593Smuzhiyun */
538*4882a593SmuzhiyunSYM_FUNC_START(aes_ctr_enc_128_avx_by8)
539*4882a593Smuzhiyun	/* call the aes main loop */
	/* The macro expands to the whole function body, including RET. */
540*4882a593Smuzhiyun	do_aes_ctrmain KEY_128
541*4882a593Smuzhiyun
542*4882a593SmuzhiyunSYM_FUNC_END(aes_ctr_enc_128_avx_by8)
543*4882a593Smuzhiyun
544*4882a593Smuzhiyun/*
545*4882a593Smuzhiyun * routine to do AES192 CTR enc/decrypt "by8"
546*4882a593Smuzhiyun * XMM registers are clobbered.
547*4882a593Smuzhiyun * Saving/restoring must be done at a higher level
548*4882a593Smuzhiyun * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
549*4882a593Smuzhiyun *			unsigned int num_bytes)
 *
 * Arguments are taken from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes).
 * keys is read with aligned loads (vmovdqa), round keys 0..12 at 16-byte
 * stride; in, out and iv are accessed with unaligned moves.
 * When num_bytes >= 16, iv is overwritten with the counter value that
 * follows the last processed block.
 * num_bytes is presumably a multiple of 16 - verify against the caller.
550*4882a593Smuzhiyun */
551*4882a593SmuzhiyunSYM_FUNC_START(aes_ctr_enc_192_avx_by8)
552*4882a593Smuzhiyun	/* call the aes main loop */
	/* The macro expands to the whole function body, including RET. */
553*4882a593Smuzhiyun	do_aes_ctrmain KEY_192
554*4882a593Smuzhiyun
555*4882a593SmuzhiyunSYM_FUNC_END(aes_ctr_enc_192_avx_by8)
556*4882a593Smuzhiyun
557*4882a593Smuzhiyun/*
558*4882a593Smuzhiyun * routine to do AES256 CTR enc/decrypt "by8"
559*4882a593Smuzhiyun * XMM registers are clobbered.
560*4882a593Smuzhiyun * Saving/restoring must be done at a higher level
561*4882a593Smuzhiyun * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
562*4882a593Smuzhiyun *			unsigned int num_bytes)
 *
 * Arguments are taken from %rdi (in), %rsi (iv), %rdx (keys), %rcx (out)
 * and %r8 (num_bytes).
 * keys is read with aligned loads (vmovdqa), round keys 0..14 at 16-byte
 * stride; in, out and iv are accessed with unaligned moves.
 * When num_bytes >= 16, iv is overwritten with the counter value that
 * follows the last processed block.
 * num_bytes is presumably a multiple of 16 - verify against the caller.
563*4882a593Smuzhiyun */
564*4882a593SmuzhiyunSYM_FUNC_START(aes_ctr_enc_256_avx_by8)
565*4882a593Smuzhiyun	/* call the aes main loop */
	/* The macro expands to the whole function body, including RET. */
566*4882a593Smuzhiyun	do_aes_ctrmain KEY_256
567*4882a593Smuzhiyun
568*4882a593SmuzhiyunSYM_FUNC_END(aes_ctr_enc_256_avx_by8)
569