xref: /OK3568_Linux_fs/kernel/arch/x86/crypto/aesni-intel_asm.S (revision 4882a59341e53eb6f0b4789bf948001014eff981)
1*4882a593Smuzhiyun/* SPDX-License-Identifier: GPL-2.0-or-later */
2*4882a593Smuzhiyun/*
3*4882a593Smuzhiyun * Implement AES algorithm in Intel AES-NI instructions.
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * The white paper of AES-NI instructions can be downloaded from:
6*4882a593Smuzhiyun *   http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
7*4882a593Smuzhiyun *
8*4882a593Smuzhiyun * Copyright (C) 2008, Intel Corp.
9*4882a593Smuzhiyun *    Author: Huang Ying <ying.huang@intel.com>
10*4882a593Smuzhiyun *            Vinodh Gopal <vinodh.gopal@intel.com>
11*4882a593Smuzhiyun *            Kahraman Akdemir
12*4882a593Smuzhiyun *
13*4882a593Smuzhiyun * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
14*4882a593Smuzhiyun * interface for 64-bit kernels.
15*4882a593Smuzhiyun *    Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
16*4882a593Smuzhiyun *             Aidan O'Mahony (aidan.o.mahony@intel.com)
17*4882a593Smuzhiyun *             Adrian Hoban <adrian.hoban@intel.com>
18*4882a593Smuzhiyun *             James Guilford (james.guilford@intel.com)
19*4882a593Smuzhiyun *             Gabriele Paoloni <gabriele.paoloni@intel.com>
20*4882a593Smuzhiyun *             Tadeusz Struk (tadeusz.struk@intel.com)
21*4882a593Smuzhiyun *             Wajdi Feghali (wajdi.k.feghali@intel.com)
22*4882a593Smuzhiyun *    Copyright (c) 2010, Intel Corporation.
23*4882a593Smuzhiyun *
24*4882a593Smuzhiyun * Ported x86_64 version to x86:
25*4882a593Smuzhiyun *    Author: Mathias Krause <minipli@googlemail.com>
26*4882a593Smuzhiyun */
27*4882a593Smuzhiyun
28*4882a593Smuzhiyun#include <linux/linkage.h>
29*4882a593Smuzhiyun#include <asm/frame.h>
30*4882a593Smuzhiyun#include <asm/nospec-branch.h>
31*4882a593Smuzhiyun
32*4882a593Smuzhiyun/*
33*4882a593Smuzhiyun * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values, for FP use
35*4882a593Smuzhiyun * movaps (move aligned packed single) or integer use movdqa (move double quad
36*4882a593Smuzhiyun * aligned).  It doesn't make a performance difference which instruction is used
37*4882a593Smuzhiyun * since Nehalem (original Core i7) was released.  However, the movaps is a byte
38*4882a593Smuzhiyun * shorter, so that is the one we'll use for now. (same for unaligned).
39*4882a593Smuzhiyun */
40*4882a593Smuzhiyun#define MOVADQ	movaps
41*4882a593Smuzhiyun#define MOVUDQ	movups
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun#ifdef __x86_64__
44*4882a593Smuzhiyun
45*4882a593Smuzhiyun# constants in mergeable sections, linker can reorder and merge
46*4882a593Smuzhiyun.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
47*4882a593Smuzhiyun.align 16
48*4882a593Smuzhiyun.Lgf128mul_x_ble_mask:
49*4882a593Smuzhiyun	.octa 0x00000000000000010000000000000087
50*4882a593Smuzhiyun.section	.rodata.cst16.POLY, "aM", @progbits, 16
51*4882a593Smuzhiyun.align 16
52*4882a593SmuzhiyunPOLY:   .octa 0xC2000000000000000000000000000001
53*4882a593Smuzhiyun.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
54*4882a593Smuzhiyun.align 16
55*4882a593SmuzhiyunTWOONE: .octa 0x00000001000000000000000000000001
56*4882a593Smuzhiyun
57*4882a593Smuzhiyun.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
58*4882a593Smuzhiyun.align 16
59*4882a593SmuzhiyunSHUF_MASK:  .octa 0x000102030405060708090A0B0C0D0E0F
60*4882a593Smuzhiyun.section	.rodata.cst16.MASK1, "aM", @progbits, 16
61*4882a593Smuzhiyun.align 16
62*4882a593SmuzhiyunMASK1:      .octa 0x0000000000000000ffffffffffffffff
63*4882a593Smuzhiyun.section	.rodata.cst16.MASK2, "aM", @progbits, 16
64*4882a593Smuzhiyun.align 16
65*4882a593SmuzhiyunMASK2:      .octa 0xffffffffffffffff0000000000000000
66*4882a593Smuzhiyun.section	.rodata.cst16.ONE, "aM", @progbits, 16
67*4882a593Smuzhiyun.align 16
68*4882a593SmuzhiyunONE:        .octa 0x00000000000000000000000000000001
69*4882a593Smuzhiyun.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
70*4882a593Smuzhiyun.align 16
71*4882a593SmuzhiyunF_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
72*4882a593Smuzhiyun.section	.rodata.cst16.dec, "aM", @progbits, 16
73*4882a593Smuzhiyun.align 16
74*4882a593Smuzhiyundec:        .octa 0x1
75*4882a593Smuzhiyun.section	.rodata.cst16.enc, "aM", @progbits, 16
76*4882a593Smuzhiyun.align 16
77*4882a593Smuzhiyunenc:        .octa 0x2
78*4882a593Smuzhiyun
79*4882a593Smuzhiyun# order of these constants should not change.
80*4882a593Smuzhiyun# more specifically, ALL_F should follow SHIFT_MASK,
81*4882a593Smuzhiyun# and zero should follow ALL_F
82*4882a593Smuzhiyun.section	.rodata, "a", @progbits
83*4882a593Smuzhiyun.align 16
84*4882a593SmuzhiyunSHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
85*4882a593SmuzhiyunALL_F:      .octa 0xffffffffffffffffffffffffffffffff
86*4882a593Smuzhiyun            .octa 0x00000000000000000000000000000000
87*4882a593Smuzhiyun
88*4882a593Smuzhiyun.text
89*4882a593Smuzhiyun
90*4882a593Smuzhiyun
91*4882a593Smuzhiyun#define	STACK_OFFSET    8*3
92*4882a593Smuzhiyun
93*4882a593Smuzhiyun#define AadHash 16*0
94*4882a593Smuzhiyun#define AadLen 16*1
95*4882a593Smuzhiyun#define InLen (16*1)+8
96*4882a593Smuzhiyun#define PBlockEncKey 16*2
97*4882a593Smuzhiyun#define OrigIV 16*3
98*4882a593Smuzhiyun#define CurCount 16*4
99*4882a593Smuzhiyun#define PBlockLen 16*5
100*4882a593Smuzhiyun#define	HashKey		16*6	// store HashKey <<1 mod poly here
101*4882a593Smuzhiyun#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
102*4882a593Smuzhiyun#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
103*4882a593Smuzhiyun#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
104*4882a593Smuzhiyun#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
105*4882a593Smuzhiyun				// bits of  HashKey <<1 mod poly here
106*4882a593Smuzhiyun				//(for Karatsuba purposes)
107*4882a593Smuzhiyun#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
108*4882a593Smuzhiyun				// bits of  HashKey^2 <<1 mod poly here
109*4882a593Smuzhiyun				// (for Karatsuba purposes)
110*4882a593Smuzhiyun#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
111*4882a593Smuzhiyun				// bits of  HashKey^3 <<1 mod poly here
112*4882a593Smuzhiyun				// (for Karatsuba purposes)
113*4882a593Smuzhiyun#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
114*4882a593Smuzhiyun				// bits of  HashKey^4 <<1 mod poly here
115*4882a593Smuzhiyun				// (for Karatsuba purposes)
116*4882a593Smuzhiyun
117*4882a593Smuzhiyun#define arg1 rdi
118*4882a593Smuzhiyun#define arg2 rsi
119*4882a593Smuzhiyun#define arg3 rdx
120*4882a593Smuzhiyun#define arg4 rcx
121*4882a593Smuzhiyun#define arg5 r8
122*4882a593Smuzhiyun#define arg6 r9
123*4882a593Smuzhiyun#define arg7 STACK_OFFSET+8(%rsp)
124*4882a593Smuzhiyun#define arg8 STACK_OFFSET+16(%rsp)
125*4882a593Smuzhiyun#define arg9 STACK_OFFSET+24(%rsp)
126*4882a593Smuzhiyun#define arg10 STACK_OFFSET+32(%rsp)
127*4882a593Smuzhiyun#define arg11 STACK_OFFSET+40(%rsp)
128*4882a593Smuzhiyun#define keysize 2*15*16(%arg1)
129*4882a593Smuzhiyun#endif
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun
132*4882a593Smuzhiyun#define STATE1	%xmm0
133*4882a593Smuzhiyun#define STATE2	%xmm4
134*4882a593Smuzhiyun#define STATE3	%xmm5
135*4882a593Smuzhiyun#define STATE4	%xmm6
136*4882a593Smuzhiyun#define STATE	STATE1
137*4882a593Smuzhiyun#define IN1	%xmm1
138*4882a593Smuzhiyun#define IN2	%xmm7
139*4882a593Smuzhiyun#define IN3	%xmm8
140*4882a593Smuzhiyun#define IN4	%xmm9
141*4882a593Smuzhiyun#define IN	IN1
142*4882a593Smuzhiyun#define KEY	%xmm2
143*4882a593Smuzhiyun#define IV	%xmm3
144*4882a593Smuzhiyun
145*4882a593Smuzhiyun#define BSWAP_MASK %xmm10
146*4882a593Smuzhiyun#define CTR	%xmm11
147*4882a593Smuzhiyun#define INC	%xmm12
148*4882a593Smuzhiyun
149*4882a593Smuzhiyun#define GF128MUL_MASK %xmm10
150*4882a593Smuzhiyun
151*4882a593Smuzhiyun#ifdef __x86_64__
152*4882a593Smuzhiyun#define AREG	%rax
153*4882a593Smuzhiyun#define KEYP	%rdi
154*4882a593Smuzhiyun#define OUTP	%rsi
155*4882a593Smuzhiyun#define UKEYP	OUTP
156*4882a593Smuzhiyun#define INP	%rdx
157*4882a593Smuzhiyun#define LEN	%rcx
158*4882a593Smuzhiyun#define IVP	%r8
159*4882a593Smuzhiyun#define KLEN	%r9d
160*4882a593Smuzhiyun#define T1	%r10
161*4882a593Smuzhiyun#define TKEYP	T1
162*4882a593Smuzhiyun#define T2	%r11
163*4882a593Smuzhiyun#define TCTR_LOW T2
164*4882a593Smuzhiyun#else
165*4882a593Smuzhiyun#define AREG	%eax
166*4882a593Smuzhiyun#define KEYP	%edi
167*4882a593Smuzhiyun#define OUTP	AREG
168*4882a593Smuzhiyun#define UKEYP	OUTP
169*4882a593Smuzhiyun#define INP	%edx
170*4882a593Smuzhiyun#define LEN	%esi
171*4882a593Smuzhiyun#define IVP	%ebp
172*4882a593Smuzhiyun#define KLEN	%ebx
173*4882a593Smuzhiyun#define T1	%ecx
174*4882a593Smuzhiyun#define TKEYP	T1
175*4882a593Smuzhiyun#endif
176*4882a593Smuzhiyun
# FUNC_SAVE: save the callee-saved GPRs (%r12-%r14) that the GCM code
# clobbers.  Must be paired with FUNC_RESTORE, which pops in the exact
# reverse order.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm
186*4882a593Smuzhiyun
187*4882a593Smuzhiyun
# FUNC_RESTORE: restore the callee-saved GPRs pushed by FUNC_SAVE,
# in reverse order of the pushes.
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm
193*4882a593Smuzhiyun
# Precompute hashkeys.
# Input: Hash subkey (pointer in \SUBKEY).
# Output: HashKey^1..^4 (<<1 mod poly) and their Karatsuba XOR terms,
# stored in gcm_context_data (%arg2).  Only needs to be called once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3		# byte-reflect the subkey for GHASH

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3		# shift each qword left by 1 ...
	psrlq	$63, \TMP2		# ... capturing each qword's carried-out bit
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2		# move low-qword carry up into the high qword
	psrldq	$8, \TMP1		# TMP1 = bit shifted out of the 128-bit value
	por	\TMP2, \TMP3		# TMP3 = HashKey<<1 (unreduced)

	# reduce HashKey<<1: if the top bit was shifted out, xor in POLY

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd TWOONE(%rip), \TMP2	# all-ones mask iff the carry-out occurred
	pand	POLY(%rip), \TMP2	# conditionally select the poly constant
	pxor	\TMP2, \TMP3		# TMP3 = HashKey<<1 mod poly
	movdqu	\TMP3, HashKey(%arg2)

	movdqa	   \TMP3, \TMP5
	pshufd	   $78, \TMP3, \TMP1	# swap the two qwords
	pxor	   \TMP3, \TMP1		# TMP1 = high64 ^ low64 (Karatsuba term)
	movdqu	   \TMP1, HashKey_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	   \TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_2_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	   \TMP5, HashKey_3(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_3_k(%arg2)

	GHASH_MUL  \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	   \TMP5, HashKey_4(%arg2)
	pshufd	   $78, \TMP5, \TMP1
	pxor	   \TMP5, \TMP1
	movdqu	   \TMP1, HashKey_4_k(%arg2)
.endm
250*4882a593Smuzhiyun
# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Inputs: \Iv = IV pointer, \SUBKEY = hash subkey pointer,
#         \AAD/\AADLEN = additional authenticated data and its length;
#         %arg2 = gcm_context_data pointer.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov \AADLEN, %r11
	mov %r11, AadLen(%arg2) # ctx_data.aad_length = aad_length
	xor %r11d, %r11d
	mov %r11, InLen(%arg2) # ctx_data.in_length = 0
	mov %r11, PBlockLen(%arg2) # ctx_data.partial_block_length = 0
	mov %r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov \Iv, %rax
	movdqu (%rax), %xmm0
	movdqu %xmm0, OrigIV(%arg2) # ctx_data.orig_IV = iv

	movdqa  SHUF_MASK(%rip), %xmm2
	pshufb %xmm2, %xmm0		# byte-swap IV into counter form
	movdqu %xmm0, CurCount(%arg2) # ctx_data.current_counter = iv

	# derive HashKey^1..^4 (<<1 mod poly) plus Karatsuba terms
	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu HashKey(%arg2), %xmm13

	# hash the AAD into AadHash(%arg2)
	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm
274*4882a593Smuzhiyun
# GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Inputs: %arg3 = output, %arg4 = input, %arg5 = length; \operation = enc|dec.
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu AadHash(%arg2), %xmm8	# xmm8 = running GHASH accumulator
	movdqu HashKey(%arg2), %xmm13	# xmm13 = HashKey<<1 mod poly
	add %arg5, InLen(%arg2)		# running total of bytes processed

	xor %r11d, %r11d # initialise the data pointer offset as zero
	# first fill any partial block left over from a previous update call
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub %r11, %arg5		# sub partial block data used
	mov %arg5, %r13		# save the number of bytes

	and $-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov %r13, %r12
	# Encrypt/Decrypt first few blocks so the remainder is a multiple of 4
	# blocks; bits 4-5 of r12 = (number of whole blocks) mod 4

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks, 4 at a time

	test	%r13, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu %xmm8, AadHash(%arg2)	# persist hash state for next update
	movdqu %xmm0, CurCount(%arg2)	# persist counter state

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov %r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd ONE(%rip), %xmm0                # INCR CNT to get Yn
	movdqu %xmm0, CurCount(%arg2)
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1        # Encrypt(K, Yn)
	movdqu %xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge _large_enough_update_\@

	# total input <16 bytes: must read byte-by-byte to avoid overrun
	lea (%arg4,%r11,1), %r10
	mov %r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp _data_read_\@

_large_enough_update_\@:
	# >=16 bytes total: safe to back up and do one unaligned 16B load
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb  %xmm2, %xmm1

_data_read_\@:
	lea ALL_F+16(%rip), %r12
	sub %r13, %r12

.ifc \operation, dec
	movdqa  %xmm1, %xmm2		# keep the raw ciphertext for GHASH
.endif
	pxor	%xmm1, %xmm0            # XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0            # mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand    %xmm1, %xmm2
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10 ,%xmm2

	pxor %xmm2, %xmm8		# hash the (masked) ciphertext
.else
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10,%xmm0

	pxor	%xmm0, %xmm8		# hash the (byte-reflected) ciphertext
.endif

	movdqu %xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb %xmm10, %xmm0
.endif

	# Output %r13 bytes: 8 at a time while possible, then byte-by-byte
	movq %xmm0, %rax
	cmp $8, %r13
	jle _less_than_8_bytes_left_\@
	mov %rax, (%arg3 , %r11, 1)
	add $8, %r11
	psrldq $8, %xmm0
	movq %xmm0, %rax
	sub $8, %r13
_less_than_8_bytes_left_\@:
	mov %al,  (%arg3, %r11, 1)
	add $1, %r11
	shr $8, %rax
	sub $1, %r13
	jne _less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm
430*4882a593Smuzhiyun
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG), \AUTHTAGLEN bytes (1..16) written
# to the buffer at \AUTHTAG.
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu AadHash(%arg2), %xmm8	# xmm8 = running GHASH accumulator
	movdqu HashKey(%arg2), %xmm13

	mov PBlockLen(%arg2), %r12

	test %r12, %r12
	je _partial_done\@

	# a buffered partial block exists: fold it into the hash first
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov AadLen(%arg2), %r12  # %r12 = aadLen (number of bytes)
	shl	$3, %r12		  # convert into number of bits
	movd	%r12d, %xmm15		  # len(A) in %xmm15
	mov InLen(%arg2), %r12
	shl     $3, %r12                  # len(C) in bits (*128)
	movq    %r12, %xmm1

	pslldq	$8, %xmm15		  # %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		  # %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL	%xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa SHUF_MASK(%rip), %xmm10
	pshufb %xmm10, %xmm8

	movdqu OrigIV(%arg2), %xmm0       # %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK	%xmm0,  %xmm1	  # E(K, Y0)
	pxor	%xmm8, %xmm0		  # tag = E(K, Y0) XOR GHASH
_return_T_\@:
	# write out min(auth_tag_len, 16) bytes of the tag in 8/4/2/1 chunks
	mov	\AUTHTAG, %r10                     # %r10 = authTag
	mov	\AUTHTAGLEN, %r11                    # %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm
503*4882a593Smuzhiyun
504*4882a593Smuzhiyun#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
*
*
* Input: A and B (128-bits each, bit-reflected)
* Output: C = A*B*x mod poly, (i.e. >>1 )
* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba: 3 carry-less multiplies instead of 4
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2            # TMP2 = a1+a0
	pxor	  \HK, \TMP3            # TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1     # TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH       # GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2   # TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2          # TMP2 = a0*b1 + a1*b0 (middle term)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3             # left shift TMP3 2 DWs
	psrldq	  $8, \TMP2             # right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1          # TMP1:GH holds the 256-bit result of GH*HK

        # first phase of the reduction

	movdqa    \GH, \TMP2
	movdqa    \GH, \TMP3
	movdqa    \GH, \TMP4            # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld     $31, \TMP2            # packed left shift <<31
	pslld     $30, \TMP3            # packed left shift <<30
	pslld     $25, \TMP4            # packed left shift <<25
	pxor      \TMP3, \TMP2          # xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP5
	psrldq    $4, \TMP5             # right shift TMP5 1 DW
	pslldq    $12, \TMP2            # left shift TMP2 3 DWs
	pxor      \TMP2, \GH

        # second phase of the reduction

	movdqa    \GH,\TMP2             # copy GH into TMP2, TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa    \GH,\TMP3
	movdqa    \GH,\TMP4
	psrld     $1,\TMP2              # packed right shift >>1
	psrld     $2,\TMP3              # packed right shift >>2
	psrld     $7,\TMP4              # packed right shift >>7
	pxor      \TMP3,\TMP2		# xor the shifted versions
	pxor      \TMP4,\TMP2
	pxor      \TMP5, \TMP2
	pxor      \TMP2, \GH
	pxor      \TMP1, \GH            # result is in GH
.endm
564*4882a593Smuzhiyun
# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16, without reading past DPTR+DLEN.
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax		# bulk-read the first full 8 bytes
        movq %rax, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
	xor %eax, %eax			# accumulate trailing bytes in rax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al	# read bytes DLEN+7 down to 8
        dec \DLEN
        jnz _read_next_byte_\@
        movq %rax, \XMM1
	pslldq $8, \XMM1		# place trailing bytes in the high qword
        por \XMM1, \XMMDst
	jmp _done_read_partial_block_\@
_read_lt8_\@:
	xor %eax, %eax			# fewer than 8 bytes: all byte-by-byte
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al	# read bytes DLEN-1 down to 0
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        movq %rax, \XMMDst
_done_read_partial_block_\@:
.endm
595*4882a593Smuzhiyun
# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# Processes the AAD in 16-byte blocks, then a final partial block, and
# stores the result in AadHash(%arg2).
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11		# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6		# TMP6 = running hash, starts at zero

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	test	   %r11, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7 # byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu \TMP7, \TMP6

_get_AAD_done\@:
	movdqu \TMP6, AadHash(%arg2)	# persist the hash for later updates
.endm
634*4882a593Smuzhiyun
635*4882a593Smuzhiyun# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
636*4882a593Smuzhiyun# between update calls.
637*4882a593Smuzhiyun# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
638*4882a593Smuzhiyun# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
639*4882a593Smuzhiyun# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
640*4882a593Smuzhiyun.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
641*4882a593Smuzhiyun	AAD_HASH operation
642*4882a593Smuzhiyun	mov 	PBlockLen(%arg2), %r13
643*4882a593Smuzhiyun	test	%r13, %r13
644*4882a593Smuzhiyun	je	_partial_block_done_\@	# Leave Macro if no partial blocks
645*4882a593Smuzhiyun	# Read in input data without over reading
646*4882a593Smuzhiyun	cmp	$16, \PLAIN_CYPH_LEN
647*4882a593Smuzhiyun	jl	_fewer_than_16_bytes_\@
648*4882a593Smuzhiyun	movups	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
649*4882a593Smuzhiyun	jmp	_data_read_\@
650*4882a593Smuzhiyun
651*4882a593Smuzhiyun_fewer_than_16_bytes_\@:
652*4882a593Smuzhiyun	lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
653*4882a593Smuzhiyun	mov	\PLAIN_CYPH_LEN, %r12
654*4882a593Smuzhiyun	READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1
655*4882a593Smuzhiyun
656*4882a593Smuzhiyun	mov PBlockLen(%arg2), %r13
657*4882a593Smuzhiyun
658*4882a593Smuzhiyun_data_read_\@:				# Finished reading in data
659*4882a593Smuzhiyun
660*4882a593Smuzhiyun	movdqu	PBlockEncKey(%arg2), %xmm9
661*4882a593Smuzhiyun	movdqu	HashKey(%arg2), %xmm13
662*4882a593Smuzhiyun
663*4882a593Smuzhiyun	lea	SHIFT_MASK(%rip), %r12
664*4882a593Smuzhiyun
665*4882a593Smuzhiyun	# adjust the shuffle mask pointer to be able to shift r13 bytes
666*4882a593Smuzhiyun	# r16-r13 is the number of bytes in plaintext mod 16)
667*4882a593Smuzhiyun	add	%r13, %r12
668*4882a593Smuzhiyun	movdqu	(%r12), %xmm2		# get the appropriate shuffle mask
669*4882a593Smuzhiyun	pshufb	%xmm2, %xmm9		# shift right r13 bytes
670*4882a593Smuzhiyun
671*4882a593Smuzhiyun.ifc \operation, dec
672*4882a593Smuzhiyun	movdqa	%xmm1, %xmm3
673*4882a593Smuzhiyun	pxor	%xmm1, %xmm9		# Cyphertext XOR E(K, Yn)
674*4882a593Smuzhiyun
675*4882a593Smuzhiyun	mov	\PLAIN_CYPH_LEN, %r10
676*4882a593Smuzhiyun	add	%r13, %r10
677*4882a593Smuzhiyun	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
678*4882a593Smuzhiyun	sub	$16, %r10
679*4882a593Smuzhiyun	# Determine if if partial block is not being filled and
680*4882a593Smuzhiyun	# shift mask accordingly
681*4882a593Smuzhiyun	jge	_no_extra_mask_1_\@
682*4882a593Smuzhiyun	sub	%r10, %r12
683*4882a593Smuzhiyun_no_extra_mask_1_\@:
684*4882a593Smuzhiyun
685*4882a593Smuzhiyun	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
686*4882a593Smuzhiyun	# get the appropriate mask to mask out bottom r13 bytes of xmm9
687*4882a593Smuzhiyun	pand	%xmm1, %xmm9		# mask out bottom r13 bytes of xmm9
688*4882a593Smuzhiyun
689*4882a593Smuzhiyun	pand	%xmm1, %xmm3
690*4882a593Smuzhiyun	movdqa	SHUF_MASK(%rip), %xmm10
691*4882a593Smuzhiyun	pshufb	%xmm10, %xmm3
692*4882a593Smuzhiyun	pshufb	%xmm2, %xmm3
693*4882a593Smuzhiyun	pxor	%xmm3, \AAD_HASH
694*4882a593Smuzhiyun
695*4882a593Smuzhiyun	test	%r10, %r10
696*4882a593Smuzhiyun	jl	_partial_incomplete_1_\@
697*4882a593Smuzhiyun
698*4882a593Smuzhiyun	# GHASH computation for the last <16 Byte block
699*4882a593Smuzhiyun	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
700*4882a593Smuzhiyun	xor	%eax, %eax
701*4882a593Smuzhiyun
702*4882a593Smuzhiyun	mov	%rax, PBlockLen(%arg2)
703*4882a593Smuzhiyun	jmp	_dec_done_\@
704*4882a593Smuzhiyun_partial_incomplete_1_\@:
705*4882a593Smuzhiyun	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
706*4882a593Smuzhiyun_dec_done_\@:
707*4882a593Smuzhiyun	movdqu	\AAD_HASH, AadHash(%arg2)
708*4882a593Smuzhiyun.else
709*4882a593Smuzhiyun	pxor	%xmm1, %xmm9			# Plaintext XOR E(K, Yn)
710*4882a593Smuzhiyun
711*4882a593Smuzhiyun	mov	\PLAIN_CYPH_LEN, %r10
712*4882a593Smuzhiyun	add	%r13, %r10
713*4882a593Smuzhiyun	# Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
714*4882a593Smuzhiyun	sub	$16, %r10
715*4882a593Smuzhiyun	# Determine if if partial block is not being filled and
716*4882a593Smuzhiyun	# shift mask accordingly
717*4882a593Smuzhiyun	jge	_no_extra_mask_2_\@
718*4882a593Smuzhiyun	sub	%r10, %r12
719*4882a593Smuzhiyun_no_extra_mask_2_\@:
720*4882a593Smuzhiyun
721*4882a593Smuzhiyun	movdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
722*4882a593Smuzhiyun	# get the appropriate mask to mask out bottom r13 bytes of xmm9
723*4882a593Smuzhiyun	pand	%xmm1, %xmm9
724*4882a593Smuzhiyun
725*4882a593Smuzhiyun	movdqa	SHUF_MASK(%rip), %xmm1
726*4882a593Smuzhiyun	pshufb	%xmm1, %xmm9
727*4882a593Smuzhiyun	pshufb	%xmm2, %xmm9
728*4882a593Smuzhiyun	pxor	%xmm9, \AAD_HASH
729*4882a593Smuzhiyun
730*4882a593Smuzhiyun	test	%r10, %r10
731*4882a593Smuzhiyun	jl	_partial_incomplete_2_\@
732*4882a593Smuzhiyun
733*4882a593Smuzhiyun	# GHASH computation for the last <16 Byte block
734*4882a593Smuzhiyun	GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
735*4882a593Smuzhiyun	xor	%eax, %eax
736*4882a593Smuzhiyun
737*4882a593Smuzhiyun	mov	%rax, PBlockLen(%arg2)
738*4882a593Smuzhiyun	jmp	_encode_done_\@
739*4882a593Smuzhiyun_partial_incomplete_2_\@:
740*4882a593Smuzhiyun	add	\PLAIN_CYPH_LEN, PBlockLen(%arg2)
741*4882a593Smuzhiyun_encode_done_\@:
742*4882a593Smuzhiyun	movdqu	\AAD_HASH, AadHash(%arg2)
743*4882a593Smuzhiyun
744*4882a593Smuzhiyun	movdqa	SHUF_MASK(%rip), %xmm10
745*4882a593Smuzhiyun	# shuffle xmm9 back to output as ciphertext
746*4882a593Smuzhiyun	pshufb	%xmm10, %xmm9
747*4882a593Smuzhiyun	pshufb	%xmm2, %xmm9
748*4882a593Smuzhiyun.endif
749*4882a593Smuzhiyun	# output encrypted Bytes
750*4882a593Smuzhiyun	test	%r10, %r10
751*4882a593Smuzhiyun	jl	_partial_fill_\@
752*4882a593Smuzhiyun	mov	%r13, %r12
753*4882a593Smuzhiyun	mov	$16, %r13
754*4882a593Smuzhiyun	# Set r13 to be the number of bytes to write out
755*4882a593Smuzhiyun	sub	%r12, %r13
756*4882a593Smuzhiyun	jmp	_count_set_\@
757*4882a593Smuzhiyun_partial_fill_\@:
758*4882a593Smuzhiyun	mov	\PLAIN_CYPH_LEN, %r13
759*4882a593Smuzhiyun_count_set_\@:
760*4882a593Smuzhiyun	movdqa	%xmm9, %xmm0
761*4882a593Smuzhiyun	movq	%xmm0, %rax
762*4882a593Smuzhiyun	cmp	$8, %r13
763*4882a593Smuzhiyun	jle	_less_than_8_bytes_left_\@
764*4882a593Smuzhiyun
765*4882a593Smuzhiyun	mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
766*4882a593Smuzhiyun	add	$8, \DATA_OFFSET
767*4882a593Smuzhiyun	psrldq	$8, %xmm0
768*4882a593Smuzhiyun	movq	%xmm0, %rax
769*4882a593Smuzhiyun	sub	$8, %r13
770*4882a593Smuzhiyun_less_than_8_bytes_left_\@:
771*4882a593Smuzhiyun	movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
772*4882a593Smuzhiyun	add	$1, \DATA_OFFSET
773*4882a593Smuzhiyun	shr	$8, %rax
774*4882a593Smuzhiyun	sub	$1, %r13
775*4882a593Smuzhiyun	jne	_less_than_8_bytes_left_\@
776*4882a593Smuzhiyun_partial_block_done_\@:
777*4882a593Smuzhiyun.endm # PARTIAL_BLOCK
778*4882a593Smuzhiyun
779*4882a593Smuzhiyun/*
780*4882a593Smuzhiyun* if a = number of total plaintext bytes
781*4882a593Smuzhiyun* b = floor(a/16)
782*4882a593Smuzhiyun* num_initial_blocks = b mod 4
783*4882a593Smuzhiyun* encrypt the initial num_initial_blocks blocks and apply ghash on
784*4882a593Smuzhiyun* the ciphertext
785*4882a593Smuzhiyun* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
786*4882a593Smuzhiyun* are clobbered
787*4882a593Smuzhiyun* %arg1, %arg2, %arg3 are used as pointers only, not modified
788*4882a593Smuzhiyun*/
789*4882a593Smuzhiyun
790*4882a593Smuzhiyun
791*4882a593Smuzhiyun.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
792*4882a593Smuzhiyun	XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
793*4882a593Smuzhiyun	MOVADQ		SHUF_MASK(%rip), %xmm14
794*4882a593Smuzhiyun
795*4882a593Smuzhiyun	movdqu AadHash(%arg2), %xmm\i		    # XMM0 = Y0
796*4882a593Smuzhiyun
797*4882a593Smuzhiyun	# start AES for num_initial_blocks blocks
798*4882a593Smuzhiyun
799*4882a593Smuzhiyun	movdqu CurCount(%arg2), \XMM0                # XMM0 = Y0
800*4882a593Smuzhiyun
801*4882a593Smuzhiyun.if (\i == 5) || (\i == 6) || (\i == 7)
802*4882a593Smuzhiyun
803*4882a593Smuzhiyun	MOVADQ		ONE(%RIP),\TMP1
804*4882a593Smuzhiyun	MOVADQ		0(%arg1),\TMP2
805*4882a593Smuzhiyun.irpc index, \i_seq
806*4882a593Smuzhiyun	paddd		\TMP1, \XMM0                 # INCR Y0
807*4882a593Smuzhiyun.ifc \operation, dec
808*4882a593Smuzhiyun        movdqa     \XMM0, %xmm\index
809*4882a593Smuzhiyun.else
810*4882a593Smuzhiyun	MOVADQ		\XMM0, %xmm\index
811*4882a593Smuzhiyun.endif
812*4882a593Smuzhiyun	pshufb	%xmm14, %xmm\index      # perform a 16 byte swap
813*4882a593Smuzhiyun	pxor		\TMP2, %xmm\index
814*4882a593Smuzhiyun.endr
815*4882a593Smuzhiyun	lea	0x10(%arg1),%r10
816*4882a593Smuzhiyun	mov	keysize,%eax
817*4882a593Smuzhiyun	shr	$2,%eax				# 128->4, 192->6, 256->8
818*4882a593Smuzhiyun	add	$5,%eax			      # 128->9, 192->11, 256->13
819*4882a593Smuzhiyun
820*4882a593Smuzhiyunaes_loop_initial_\@:
821*4882a593Smuzhiyun	MOVADQ	(%r10),\TMP1
822*4882a593Smuzhiyun.irpc	index, \i_seq
823*4882a593Smuzhiyun	aesenc	\TMP1, %xmm\index
824*4882a593Smuzhiyun.endr
825*4882a593Smuzhiyun	add	$16,%r10
826*4882a593Smuzhiyun	sub	$1,%eax
827*4882a593Smuzhiyun	jnz	aes_loop_initial_\@
828*4882a593Smuzhiyun
829*4882a593Smuzhiyun	MOVADQ	(%r10), \TMP1
830*4882a593Smuzhiyun.irpc index, \i_seq
831*4882a593Smuzhiyun	aesenclast \TMP1, %xmm\index         # Last Round
832*4882a593Smuzhiyun.endr
833*4882a593Smuzhiyun.irpc index, \i_seq
834*4882a593Smuzhiyun	movdqu	   (%arg4 , %r11, 1), \TMP1
835*4882a593Smuzhiyun	pxor	   \TMP1, %xmm\index
836*4882a593Smuzhiyun	movdqu	   %xmm\index, (%arg3 , %r11, 1)
837*4882a593Smuzhiyun	# write back plaintext/ciphertext for num_initial_blocks
838*4882a593Smuzhiyun	add	   $16, %r11
839*4882a593Smuzhiyun
840*4882a593Smuzhiyun.ifc \operation, dec
841*4882a593Smuzhiyun	movdqa     \TMP1, %xmm\index
842*4882a593Smuzhiyun.endif
843*4882a593Smuzhiyun	pshufb	   %xmm14, %xmm\index
844*4882a593Smuzhiyun
845*4882a593Smuzhiyun		# prepare plaintext/ciphertext for GHASH computation
846*4882a593Smuzhiyun.endr
847*4882a593Smuzhiyun.endif
848*4882a593Smuzhiyun
849*4882a593Smuzhiyun        # apply GHASH on num_initial_blocks blocks
850*4882a593Smuzhiyun
851*4882a593Smuzhiyun.if \i == 5
852*4882a593Smuzhiyun        pxor       %xmm5, %xmm6
853*4882a593Smuzhiyun	GHASH_MUL  %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
854*4882a593Smuzhiyun        pxor       %xmm6, %xmm7
855*4882a593Smuzhiyun	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
856*4882a593Smuzhiyun        pxor       %xmm7, %xmm8
857*4882a593Smuzhiyun	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
858*4882a593Smuzhiyun.elseif \i == 6
859*4882a593Smuzhiyun        pxor       %xmm6, %xmm7
860*4882a593Smuzhiyun	GHASH_MUL  %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
861*4882a593Smuzhiyun        pxor       %xmm7, %xmm8
862*4882a593Smuzhiyun	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
863*4882a593Smuzhiyun.elseif \i == 7
864*4882a593Smuzhiyun        pxor       %xmm7, %xmm8
865*4882a593Smuzhiyun	GHASH_MUL  %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
866*4882a593Smuzhiyun.endif
867*4882a593Smuzhiyun	cmp	   $64, %r13
868*4882a593Smuzhiyun	jl	_initial_blocks_done\@
869*4882a593Smuzhiyun	# no need for precomputed values
870*4882a593Smuzhiyun/*
871*4882a593Smuzhiyun*
872*4882a593Smuzhiyun* Precomputations for HashKey parallel with encryption of first 4 blocks.
873*4882a593Smuzhiyun* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i
874*4882a593Smuzhiyun*/
875*4882a593Smuzhiyun	MOVADQ	   ONE(%RIP),\TMP1
876*4882a593Smuzhiyun	paddd	   \TMP1, \XMM0              # INCR Y0
877*4882a593Smuzhiyun	MOVADQ	   \XMM0, \XMM1
878*4882a593Smuzhiyun	pshufb  %xmm14, \XMM1        # perform a 16 byte swap
879*4882a593Smuzhiyun
880*4882a593Smuzhiyun	paddd	   \TMP1, \XMM0              # INCR Y0
881*4882a593Smuzhiyun	MOVADQ	   \XMM0, \XMM2
882*4882a593Smuzhiyun	pshufb  %xmm14, \XMM2        # perform a 16 byte swap
883*4882a593Smuzhiyun
884*4882a593Smuzhiyun	paddd	   \TMP1, \XMM0              # INCR Y0
885*4882a593Smuzhiyun	MOVADQ	   \XMM0, \XMM3
886*4882a593Smuzhiyun	pshufb %xmm14, \XMM3        # perform a 16 byte swap
887*4882a593Smuzhiyun
888*4882a593Smuzhiyun	paddd	   \TMP1, \XMM0              # INCR Y0
889*4882a593Smuzhiyun	MOVADQ	   \XMM0, \XMM4
890*4882a593Smuzhiyun	pshufb %xmm14, \XMM4        # perform a 16 byte swap
891*4882a593Smuzhiyun
892*4882a593Smuzhiyun	MOVADQ	   0(%arg1),\TMP1
893*4882a593Smuzhiyun	pxor	   \TMP1, \XMM1
894*4882a593Smuzhiyun	pxor	   \TMP1, \XMM2
895*4882a593Smuzhiyun	pxor	   \TMP1, \XMM3
896*4882a593Smuzhiyun	pxor	   \TMP1, \XMM4
897*4882a593Smuzhiyun.irpc index, 1234 # do 4 rounds
898*4882a593Smuzhiyun	movaps 0x10*\index(%arg1), \TMP1
899*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM1
900*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM2
901*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM3
902*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM4
903*4882a593Smuzhiyun.endr
904*4882a593Smuzhiyun.irpc index, 56789 # do next 5 rounds
905*4882a593Smuzhiyun	movaps 0x10*\index(%arg1), \TMP1
906*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM1
907*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM2
908*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM3
909*4882a593Smuzhiyun	aesenc	   \TMP1, \XMM4
910*4882a593Smuzhiyun.endr
911*4882a593Smuzhiyun	lea	   0xa0(%arg1),%r10
912*4882a593Smuzhiyun	mov	   keysize,%eax
913*4882a593Smuzhiyun	shr	   $2,%eax			# 128->4, 192->6, 256->8
914*4882a593Smuzhiyun	sub	   $4,%eax			# 128->0, 192->2, 256->4
915*4882a593Smuzhiyun	jz	   aes_loop_pre_done\@
916*4882a593Smuzhiyun
917*4882a593Smuzhiyunaes_loop_pre_\@:
918*4882a593Smuzhiyun	MOVADQ	   (%r10),\TMP2
919*4882a593Smuzhiyun.irpc	index, 1234
920*4882a593Smuzhiyun	aesenc	   \TMP2, %xmm\index
921*4882a593Smuzhiyun.endr
922*4882a593Smuzhiyun	add	   $16,%r10
923*4882a593Smuzhiyun	sub	   $1,%eax
924*4882a593Smuzhiyun	jnz	   aes_loop_pre_\@
925*4882a593Smuzhiyun
926*4882a593Smuzhiyunaes_loop_pre_done\@:
927*4882a593Smuzhiyun	MOVADQ	   (%r10), \TMP2
928*4882a593Smuzhiyun	aesenclast \TMP2, \XMM1
929*4882a593Smuzhiyun	aesenclast \TMP2, \XMM2
930*4882a593Smuzhiyun	aesenclast \TMP2, \XMM3
931*4882a593Smuzhiyun	aesenclast \TMP2, \XMM4
932*4882a593Smuzhiyun	movdqu	   16*0(%arg4 , %r11 , 1), \TMP1
933*4882a593Smuzhiyun	pxor	   \TMP1, \XMM1
934*4882a593Smuzhiyun.ifc \operation, dec
935*4882a593Smuzhiyun	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
936*4882a593Smuzhiyun	movdqa     \TMP1, \XMM1
937*4882a593Smuzhiyun.endif
938*4882a593Smuzhiyun	movdqu	   16*1(%arg4 , %r11 , 1), \TMP1
939*4882a593Smuzhiyun	pxor	   \TMP1, \XMM2
940*4882a593Smuzhiyun.ifc \operation, dec
941*4882a593Smuzhiyun	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
942*4882a593Smuzhiyun	movdqa     \TMP1, \XMM2
943*4882a593Smuzhiyun.endif
944*4882a593Smuzhiyun	movdqu	   16*2(%arg4 , %r11 , 1), \TMP1
945*4882a593Smuzhiyun	pxor	   \TMP1, \XMM3
946*4882a593Smuzhiyun.ifc \operation, dec
947*4882a593Smuzhiyun	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
948*4882a593Smuzhiyun	movdqa     \TMP1, \XMM3
949*4882a593Smuzhiyun.endif
950*4882a593Smuzhiyun	movdqu	   16*3(%arg4 , %r11 , 1), \TMP1
951*4882a593Smuzhiyun	pxor	   \TMP1, \XMM4
952*4882a593Smuzhiyun.ifc \operation, dec
953*4882a593Smuzhiyun	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
954*4882a593Smuzhiyun	movdqa     \TMP1, \XMM4
955*4882a593Smuzhiyun.else
956*4882a593Smuzhiyun	movdqu     \XMM1, 16*0(%arg3 , %r11 , 1)
957*4882a593Smuzhiyun	movdqu     \XMM2, 16*1(%arg3 , %r11 , 1)
958*4882a593Smuzhiyun	movdqu     \XMM3, 16*2(%arg3 , %r11 , 1)
959*4882a593Smuzhiyun	movdqu     \XMM4, 16*3(%arg3 , %r11 , 1)
960*4882a593Smuzhiyun.endif
961*4882a593Smuzhiyun
962*4882a593Smuzhiyun	add	   $64, %r11
963*4882a593Smuzhiyun	pshufb %xmm14, \XMM1 # perform a 16 byte swap
964*4882a593Smuzhiyun	pxor	   \XMMDst, \XMM1
965*4882a593Smuzhiyun# combine GHASHed value with the corresponding ciphertext
966*4882a593Smuzhiyun	pshufb %xmm14, \XMM2 # perform a 16 byte swap
967*4882a593Smuzhiyun	pshufb %xmm14, \XMM3 # perform a 16 byte swap
968*4882a593Smuzhiyun	pshufb %xmm14, \XMM4 # perform a 16 byte swap
969*4882a593Smuzhiyun
970*4882a593Smuzhiyun_initial_blocks_done\@:
971*4882a593Smuzhiyun
972*4882a593Smuzhiyun.endm
973*4882a593Smuzhiyun
974*4882a593Smuzhiyun/*
975*4882a593Smuzhiyun* encrypt 4 blocks at a time
976*4882a593Smuzhiyun* ghash the 4 previously encrypted ciphertext blocks
977*4882a593Smuzhiyun* arg1, %arg3, %arg4 are used as pointers only, not modified
978*4882a593Smuzhiyun* %r11 is the data offset value
979*4882a593Smuzhiyun*/
980*4882a593Smuzhiyun.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \
981*4882a593SmuzhiyunTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
982*4882a593Smuzhiyun
983*4882a593Smuzhiyun	movdqa	  \XMM1, \XMM5
984*4882a593Smuzhiyun	movdqa	  \XMM2, \XMM6
985*4882a593Smuzhiyun	movdqa	  \XMM3, \XMM7
986*4882a593Smuzhiyun	movdqa	  \XMM4, \XMM8
987*4882a593Smuzhiyun
988*4882a593Smuzhiyun        movdqa    SHUF_MASK(%rip), %xmm15
989*4882a593Smuzhiyun        # multiply TMP5 * HashKey using karatsuba
990*4882a593Smuzhiyun
991*4882a593Smuzhiyun	movdqa	  \XMM5, \TMP4
992*4882a593Smuzhiyun	pshufd	  $78, \XMM5, \TMP6
993*4882a593Smuzhiyun	pxor	  \XMM5, \TMP6
994*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
995*4882a593Smuzhiyun	movdqu	  HashKey_4(%arg2), \TMP5
996*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
997*4882a593Smuzhiyun	movdqa    \XMM0, \XMM1
998*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
999*4882a593Smuzhiyun	movdqa    \XMM0, \XMM2
1000*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1001*4882a593Smuzhiyun	movdqa    \XMM0, \XMM3
1002*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1003*4882a593Smuzhiyun	movdqa    \XMM0, \XMM4
1004*4882a593Smuzhiyun	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1005*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1006*4882a593Smuzhiyun	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1007*4882a593Smuzhiyun	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1008*4882a593Smuzhiyun	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1009*4882a593Smuzhiyun
1010*4882a593Smuzhiyun	pxor	  (%arg1), \XMM1
1011*4882a593Smuzhiyun	pxor	  (%arg1), \XMM2
1012*4882a593Smuzhiyun	pxor	  (%arg1), \XMM3
1013*4882a593Smuzhiyun	pxor	  (%arg1), \XMM4
1014*4882a593Smuzhiyun	movdqu	  HashKey_4_k(%arg2), \TMP5
1015*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1016*4882a593Smuzhiyun	movaps 0x10(%arg1), \TMP1
1017*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM1              # Round 1
1018*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM2
1019*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM3
1020*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM4
1021*4882a593Smuzhiyun	movaps 0x20(%arg1), \TMP1
1022*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM1              # Round 2
1023*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM2
1024*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM3
1025*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM4
1026*4882a593Smuzhiyun	movdqa	  \XMM6, \TMP1
1027*4882a593Smuzhiyun	pshufd	  $78, \XMM6, \TMP2
1028*4882a593Smuzhiyun	pxor	  \XMM6, \TMP2
1029*4882a593Smuzhiyun	movdqu	  HashKey_3(%arg2), \TMP5
1030*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1031*4882a593Smuzhiyun	movaps 0x30(%arg1), \TMP3
1032*4882a593Smuzhiyun	aesenc    \TMP3, \XMM1              # Round 3
1033*4882a593Smuzhiyun	aesenc    \TMP3, \XMM2
1034*4882a593Smuzhiyun	aesenc    \TMP3, \XMM3
1035*4882a593Smuzhiyun	aesenc    \TMP3, \XMM4
1036*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1037*4882a593Smuzhiyun	movaps 0x40(%arg1), \TMP3
1038*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 4
1039*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1040*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1041*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1042*4882a593Smuzhiyun	movdqu	  HashKey_3_k(%arg2), \TMP5
1043*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1044*4882a593Smuzhiyun	movaps 0x50(%arg1), \TMP3
1045*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 5
1046*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1047*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1048*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1049*4882a593Smuzhiyun	pxor	  \TMP1, \TMP4
1050*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1051*4882a593Smuzhiyun	pxor	  \XMM6, \XMM5
1052*4882a593Smuzhiyun	pxor	  \TMP2, \TMP6
1053*4882a593Smuzhiyun	movdqa	  \XMM7, \TMP1
1054*4882a593Smuzhiyun	pshufd	  $78, \XMM7, \TMP2
1055*4882a593Smuzhiyun	pxor	  \XMM7, \TMP2
1056*4882a593Smuzhiyun	movdqu	  HashKey_2(%arg2), \TMP5
1057*4882a593Smuzhiyun
1058*4882a593Smuzhiyun        # Multiply TMP5 * HashKey using karatsuba
1059*4882a593Smuzhiyun
1060*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1061*4882a593Smuzhiyun	movaps 0x60(%arg1), \TMP3
1062*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 6
1063*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1064*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1065*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1066*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1067*4882a593Smuzhiyun	movaps 0x70(%arg1), \TMP3
1068*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 7
1069*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1070*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1071*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1072*4882a593Smuzhiyun	movdqu	  HashKey_2_k(%arg2), \TMP5
1073*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1074*4882a593Smuzhiyun	movaps 0x80(%arg1), \TMP3
1075*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 8
1076*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1077*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1078*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1079*4882a593Smuzhiyun	pxor	  \TMP1, \TMP4
1080*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1081*4882a593Smuzhiyun	pxor	  \XMM7, \XMM5
1082*4882a593Smuzhiyun	pxor	  \TMP2, \TMP6
1083*4882a593Smuzhiyun
1084*4882a593Smuzhiyun        # Multiply XMM8 * HashKey
1085*4882a593Smuzhiyun        # XMM8 and TMP5 hold the values for the two operands
1086*4882a593Smuzhiyun
1087*4882a593Smuzhiyun	movdqa	  \XMM8, \TMP1
1088*4882a593Smuzhiyun	pshufd	  $78, \XMM8, \TMP2
1089*4882a593Smuzhiyun	pxor	  \XMM8, \TMP2
1090*4882a593Smuzhiyun	movdqu	  HashKey(%arg2), \TMP5
1091*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1092*4882a593Smuzhiyun	movaps 0x90(%arg1), \TMP3
1093*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1             # Round 9
1094*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1095*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1096*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1097*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1098*4882a593Smuzhiyun	lea	  0xa0(%arg1),%r10
1099*4882a593Smuzhiyun	mov	  keysize,%eax
1100*4882a593Smuzhiyun	shr	  $2,%eax			# 128->4, 192->6, 256->8
1101*4882a593Smuzhiyun	sub	  $4,%eax			# 128->0, 192->2, 256->4
1102*4882a593Smuzhiyun	jz	  aes_loop_par_enc_done\@
1103*4882a593Smuzhiyun
1104*4882a593Smuzhiyunaes_loop_par_enc\@:
1105*4882a593Smuzhiyun	MOVADQ	  (%r10),\TMP3
1106*4882a593Smuzhiyun.irpc	index, 1234
1107*4882a593Smuzhiyun	aesenc	  \TMP3, %xmm\index
1108*4882a593Smuzhiyun.endr
1109*4882a593Smuzhiyun	add	  $16,%r10
1110*4882a593Smuzhiyun	sub	  $1,%eax
1111*4882a593Smuzhiyun	jnz	  aes_loop_par_enc\@
1112*4882a593Smuzhiyun
1113*4882a593Smuzhiyunaes_loop_par_enc_done\@:
1114*4882a593Smuzhiyun	MOVADQ	  (%r10), \TMP3
1115*4882a593Smuzhiyun	aesenclast \TMP3, \XMM1           # Round 10
1116*4882a593Smuzhiyun	aesenclast \TMP3, \XMM2
1117*4882a593Smuzhiyun	aesenclast \TMP3, \XMM3
1118*4882a593Smuzhiyun	aesenclast \TMP3, \XMM4
1119*4882a593Smuzhiyun	movdqu    HashKey_k(%arg2), \TMP5
1120*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1121*4882a593Smuzhiyun	movdqu	  (%arg4,%r11,1), \TMP3
1122*4882a593Smuzhiyun	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1123*4882a593Smuzhiyun	movdqu	  16(%arg4,%r11,1), \TMP3
1124*4882a593Smuzhiyun	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1125*4882a593Smuzhiyun	movdqu	  32(%arg4,%r11,1), \TMP3
1126*4882a593Smuzhiyun	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1127*4882a593Smuzhiyun	movdqu	  48(%arg4,%r11,1), \TMP3
1128*4882a593Smuzhiyun	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1129*4882a593Smuzhiyun        movdqu    \XMM1, (%arg3,%r11,1)        # Write to the ciphertext buffer
1130*4882a593Smuzhiyun        movdqu    \XMM2, 16(%arg3,%r11,1)      # Write to the ciphertext buffer
1131*4882a593Smuzhiyun        movdqu    \XMM3, 32(%arg3,%r11,1)      # Write to the ciphertext buffer
1132*4882a593Smuzhiyun        movdqu    \XMM4, 48(%arg3,%r11,1)      # Write to the ciphertext buffer
1133*4882a593Smuzhiyun	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1134*4882a593Smuzhiyun	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1135*4882a593Smuzhiyun	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1136*4882a593Smuzhiyun	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1137*4882a593Smuzhiyun
1138*4882a593Smuzhiyun	pxor	  \TMP4, \TMP1
1139*4882a593Smuzhiyun	pxor	  \XMM8, \XMM5
1140*4882a593Smuzhiyun	pxor	  \TMP6, \TMP2
1141*4882a593Smuzhiyun	pxor	  \TMP1, \TMP2
1142*4882a593Smuzhiyun	pxor	  \XMM5, \TMP2
1143*4882a593Smuzhiyun	movdqa	  \TMP2, \TMP3
1144*4882a593Smuzhiyun	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1145*4882a593Smuzhiyun	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1146*4882a593Smuzhiyun	pxor	  \TMP3, \XMM5
1147*4882a593Smuzhiyun	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1148*4882a593Smuzhiyun
1149*4882a593Smuzhiyun        # first phase of reduction
1150*4882a593Smuzhiyun
1151*4882a593Smuzhiyun	movdqa    \XMM5, \TMP2
1152*4882a593Smuzhiyun	movdqa    \XMM5, \TMP3
1153*4882a593Smuzhiyun	movdqa    \XMM5, \TMP4
1154*4882a593Smuzhiyun# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1155*4882a593Smuzhiyun	pslld     $31, \TMP2                   # packed right shift << 31
1156*4882a593Smuzhiyun	pslld     $30, \TMP3                   # packed right shift << 30
1157*4882a593Smuzhiyun	pslld     $25, \TMP4                   # packed right shift << 25
1158*4882a593Smuzhiyun	pxor      \TMP3, \TMP2	               # xor the shifted versions
1159*4882a593Smuzhiyun	pxor      \TMP4, \TMP2
1160*4882a593Smuzhiyun	movdqa    \TMP2, \TMP5
1161*4882a593Smuzhiyun	psrldq    $4, \TMP5                    # right shift T5 1 DW
1162*4882a593Smuzhiyun	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1163*4882a593Smuzhiyun	pxor      \TMP2, \XMM5
1164*4882a593Smuzhiyun
1165*4882a593Smuzhiyun        # second phase of reduction
1166*4882a593Smuzhiyun
1167*4882a593Smuzhiyun	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1168*4882a593Smuzhiyun	movdqa    \XMM5,\TMP3
1169*4882a593Smuzhiyun	movdqa    \XMM5,\TMP4
1170*4882a593Smuzhiyun	psrld     $1, \TMP2                    # packed left shift >>1
1171*4882a593Smuzhiyun	psrld     $2, \TMP3                    # packed left shift >>2
1172*4882a593Smuzhiyun	psrld     $7, \TMP4                    # packed left shift >>7
1173*4882a593Smuzhiyun	pxor      \TMP3,\TMP2		       # xor the shifted versions
1174*4882a593Smuzhiyun	pxor      \TMP4,\TMP2
1175*4882a593Smuzhiyun	pxor      \TMP5, \TMP2
1176*4882a593Smuzhiyun	pxor      \TMP2, \XMM5
1177*4882a593Smuzhiyun	pxor      \TMP1, \XMM5                 # result is in TMP1
1178*4882a593Smuzhiyun
1179*4882a593Smuzhiyun	pxor	  \XMM5, \XMM1
1180*4882a593Smuzhiyun.endm
1181*4882a593Smuzhiyun
1182*4882a593Smuzhiyun/*
1183*4882a593Smuzhiyun* decrypt 4 blocks at a time
1184*4882a593Smuzhiyun* ghash the 4 previously decrypted ciphertext blocks
1185*4882a593Smuzhiyun* arg1, %arg3, %arg4 are used as pointers only, not modified
1186*4882a593Smuzhiyun* %r11 is the data offset value
1187*4882a593Smuzhiyun*/
1188*4882a593Smuzhiyun.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \
1189*4882a593SmuzhiyunTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
1190*4882a593Smuzhiyun
1191*4882a593Smuzhiyun	movdqa	  \XMM1, \XMM5
1192*4882a593Smuzhiyun	movdqa	  \XMM2, \XMM6
1193*4882a593Smuzhiyun	movdqa	  \XMM3, \XMM7
1194*4882a593Smuzhiyun	movdqa	  \XMM4, \XMM8
1195*4882a593Smuzhiyun
1196*4882a593Smuzhiyun        movdqa    SHUF_MASK(%rip), %xmm15
1197*4882a593Smuzhiyun        # multiply TMP5 * HashKey using karatsuba
1198*4882a593Smuzhiyun
1199*4882a593Smuzhiyun	movdqa	  \XMM5, \TMP4
1200*4882a593Smuzhiyun	pshufd	  $78, \XMM5, \TMP6
1201*4882a593Smuzhiyun	pxor	  \XMM5, \TMP6
1202*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1203*4882a593Smuzhiyun	movdqu	  HashKey_4(%arg2), \TMP5
1204*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP4           # TMP4 = a1*b1
1205*4882a593Smuzhiyun	movdqa    \XMM0, \XMM1
1206*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1207*4882a593Smuzhiyun	movdqa    \XMM0, \XMM2
1208*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1209*4882a593Smuzhiyun	movdqa    \XMM0, \XMM3
1210*4882a593Smuzhiyun	paddd     ONE(%rip), \XMM0		# INCR CNT
1211*4882a593Smuzhiyun	movdqa    \XMM0, \XMM4
1212*4882a593Smuzhiyun	pshufb %xmm15, \XMM1	# perform a 16 byte swap
1213*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM5           # XMM5 = a0*b0
1214*4882a593Smuzhiyun	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1215*4882a593Smuzhiyun	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1216*4882a593Smuzhiyun	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1217*4882a593Smuzhiyun
1218*4882a593Smuzhiyun	pxor	  (%arg1), \XMM1
1219*4882a593Smuzhiyun	pxor	  (%arg1), \XMM2
1220*4882a593Smuzhiyun	pxor	  (%arg1), \XMM3
1221*4882a593Smuzhiyun	pxor	  (%arg1), \XMM4
1222*4882a593Smuzhiyun	movdqu	  HashKey_4_k(%arg2), \TMP5
1223*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP6       # TMP6 = (a1+a0)*(b1+b0)
1224*4882a593Smuzhiyun	movaps 0x10(%arg1), \TMP1
1225*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM1              # Round 1
1226*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM2
1227*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM3
1228*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM4
1229*4882a593Smuzhiyun	movaps 0x20(%arg1), \TMP1
1230*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM1              # Round 2
1231*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM2
1232*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM3
1233*4882a593Smuzhiyun	aesenc	  \TMP1, \XMM4
1234*4882a593Smuzhiyun	movdqa	  \XMM6, \TMP1
1235*4882a593Smuzhiyun	pshufd	  $78, \XMM6, \TMP2
1236*4882a593Smuzhiyun	pxor	  \XMM6, \TMP2
1237*4882a593Smuzhiyun	movdqu	  HashKey_3(%arg2), \TMP5
1238*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1 * b1
1239*4882a593Smuzhiyun	movaps 0x30(%arg1), \TMP3
1240*4882a593Smuzhiyun	aesenc    \TMP3, \XMM1              # Round 3
1241*4882a593Smuzhiyun	aesenc    \TMP3, \XMM2
1242*4882a593Smuzhiyun	aesenc    \TMP3, \XMM3
1243*4882a593Smuzhiyun	aesenc    \TMP3, \XMM4
1244*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM6       # XMM6 = a0*b0
1245*4882a593Smuzhiyun	movaps 0x40(%arg1), \TMP3
1246*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 4
1247*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1248*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1249*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1250*4882a593Smuzhiyun	movdqu	  HashKey_3_k(%arg2), \TMP5
1251*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1252*4882a593Smuzhiyun	movaps 0x50(%arg1), \TMP3
1253*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 5
1254*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1255*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1256*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1257*4882a593Smuzhiyun	pxor	  \TMP1, \TMP4
1258*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1259*4882a593Smuzhiyun	pxor	  \XMM6, \XMM5
1260*4882a593Smuzhiyun	pxor	  \TMP2, \TMP6
1261*4882a593Smuzhiyun	movdqa	  \XMM7, \TMP1
1262*4882a593Smuzhiyun	pshufd	  $78, \XMM7, \TMP2
1263*4882a593Smuzhiyun	pxor	  \XMM7, \TMP2
1264*4882a593Smuzhiyun	movdqu	  HashKey_2(%arg2), \TMP5
1265*4882a593Smuzhiyun
1266*4882a593Smuzhiyun        # Multiply TMP5 * HashKey using karatsuba
1267*4882a593Smuzhiyun
1268*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1269*4882a593Smuzhiyun	movaps 0x60(%arg1), \TMP3
1270*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 6
1271*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1272*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1273*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1274*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM7       # XMM7 = a0*b0
1275*4882a593Smuzhiyun	movaps 0x70(%arg1), \TMP3
1276*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 7
1277*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1278*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1279*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1280*4882a593Smuzhiyun	movdqu	  HashKey_2_k(%arg2), \TMP5
1281*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1282*4882a593Smuzhiyun	movaps 0x80(%arg1), \TMP3
1283*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1              # Round 8
1284*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1285*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1286*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1287*4882a593Smuzhiyun	pxor	  \TMP1, \TMP4
1288*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
1289*4882a593Smuzhiyun	pxor	  \XMM7, \XMM5
1290*4882a593Smuzhiyun	pxor	  \TMP2, \TMP6
1291*4882a593Smuzhiyun
1292*4882a593Smuzhiyun        # Multiply XMM8 * HashKey
1293*4882a593Smuzhiyun        # XMM8 and TMP5 hold the values for the two operands
1294*4882a593Smuzhiyun
1295*4882a593Smuzhiyun	movdqa	  \XMM8, \TMP1
1296*4882a593Smuzhiyun	pshufd	  $78, \XMM8, \TMP2
1297*4882a593Smuzhiyun	pxor	  \XMM8, \TMP2
1298*4882a593Smuzhiyun	movdqu	  HashKey(%arg2), \TMP5
1299*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1      # TMP1 = a1*b1
1300*4882a593Smuzhiyun	movaps 0x90(%arg1), \TMP3
1301*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM1             # Round 9
1302*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM2
1303*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM3
1304*4882a593Smuzhiyun	aesenc	  \TMP3, \XMM4
1305*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM8      # XMM8 = a0*b0
1306*4882a593Smuzhiyun	lea	  0xa0(%arg1),%r10
1307*4882a593Smuzhiyun	mov	  keysize,%eax
1308*4882a593Smuzhiyun	shr	  $2,%eax		        # 128->4, 192->6, 256->8
1309*4882a593Smuzhiyun	sub	  $4,%eax			# 128->0, 192->2, 256->4
1310*4882a593Smuzhiyun	jz	  aes_loop_par_dec_done\@
1311*4882a593Smuzhiyun
1312*4882a593Smuzhiyunaes_loop_par_dec\@:
1313*4882a593Smuzhiyun	MOVADQ	  (%r10),\TMP3
1314*4882a593Smuzhiyun.irpc	index, 1234
1315*4882a593Smuzhiyun	aesenc	  \TMP3, %xmm\index
1316*4882a593Smuzhiyun.endr
1317*4882a593Smuzhiyun	add	  $16,%r10
1318*4882a593Smuzhiyun	sub	  $1,%eax
1319*4882a593Smuzhiyun	jnz	  aes_loop_par_dec\@
1320*4882a593Smuzhiyun
1321*4882a593Smuzhiyunaes_loop_par_dec_done\@:
1322*4882a593Smuzhiyun	MOVADQ	  (%r10), \TMP3
1323*4882a593Smuzhiyun	aesenclast \TMP3, \XMM1           # last round
1324*4882a593Smuzhiyun	aesenclast \TMP3, \XMM2
1325*4882a593Smuzhiyun	aesenclast \TMP3, \XMM3
1326*4882a593Smuzhiyun	aesenclast \TMP3, \XMM4
1327*4882a593Smuzhiyun	movdqu    HashKey_k(%arg2), \TMP5
1328*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \TMP2          # TMP2 = (a1+a0)*(b1+b0)
1329*4882a593Smuzhiyun	movdqu	  (%arg4,%r11,1), \TMP3
1330*4882a593Smuzhiyun	pxor	  \TMP3, \XMM1                 # Ciphertext/Plaintext XOR EK
1331*4882a593Smuzhiyun	movdqu	  \XMM1, (%arg3,%r11,1)        # Write to plaintext buffer
1332*4882a593Smuzhiyun	movdqa    \TMP3, \XMM1
1333*4882a593Smuzhiyun	movdqu	  16(%arg4,%r11,1), \TMP3
1334*4882a593Smuzhiyun	pxor	  \TMP3, \XMM2                 # Ciphertext/Plaintext XOR EK
1335*4882a593Smuzhiyun	movdqu	  \XMM2, 16(%arg3,%r11,1)      # Write to plaintext buffer
1336*4882a593Smuzhiyun	movdqa    \TMP3, \XMM2
1337*4882a593Smuzhiyun	movdqu	  32(%arg4,%r11,1), \TMP3
1338*4882a593Smuzhiyun	pxor	  \TMP3, \XMM3                 # Ciphertext/Plaintext XOR EK
1339*4882a593Smuzhiyun	movdqu	  \XMM3, 32(%arg3,%r11,1)      # Write to plaintext buffer
1340*4882a593Smuzhiyun	movdqa    \TMP3, \XMM3
1341*4882a593Smuzhiyun	movdqu	  48(%arg4,%r11,1), \TMP3
1342*4882a593Smuzhiyun	pxor	  \TMP3, \XMM4                 # Ciphertext/Plaintext XOR EK
1343*4882a593Smuzhiyun	movdqu	  \XMM4, 48(%arg3,%r11,1)      # Write to plaintext buffer
1344*4882a593Smuzhiyun	movdqa    \TMP3, \XMM4
1345*4882a593Smuzhiyun	pshufb %xmm15, \XMM1        # perform a 16 byte swap
1346*4882a593Smuzhiyun	pshufb %xmm15, \XMM2	# perform a 16 byte swap
1347*4882a593Smuzhiyun	pshufb %xmm15, \XMM3	# perform a 16 byte swap
1348*4882a593Smuzhiyun	pshufb %xmm15, \XMM4	# perform a 16 byte swap
1349*4882a593Smuzhiyun
1350*4882a593Smuzhiyun	pxor	  \TMP4, \TMP1
1351*4882a593Smuzhiyun	pxor	  \XMM8, \XMM5
1352*4882a593Smuzhiyun	pxor	  \TMP6, \TMP2
1353*4882a593Smuzhiyun	pxor	  \TMP1, \TMP2
1354*4882a593Smuzhiyun	pxor	  \XMM5, \TMP2
1355*4882a593Smuzhiyun	movdqa	  \TMP2, \TMP3
1356*4882a593Smuzhiyun	pslldq	  $8, \TMP3                    # left shift TMP3 2 DWs
1357*4882a593Smuzhiyun	psrldq	  $8, \TMP2                    # right shift TMP2 2 DWs
1358*4882a593Smuzhiyun	pxor	  \TMP3, \XMM5
1359*4882a593Smuzhiyun	pxor	  \TMP2, \TMP1	  # accumulate the results in TMP1:XMM5
1360*4882a593Smuzhiyun
1361*4882a593Smuzhiyun        # first phase of reduction
1362*4882a593Smuzhiyun
1363*4882a593Smuzhiyun	movdqa    \XMM5, \TMP2
1364*4882a593Smuzhiyun	movdqa    \XMM5, \TMP3
1365*4882a593Smuzhiyun	movdqa    \XMM5, \TMP4
1366*4882a593Smuzhiyun# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1367*4882a593Smuzhiyun	pslld     $31, \TMP2                   # packed right shift << 31
1368*4882a593Smuzhiyun	pslld     $30, \TMP3                   # packed right shift << 30
1369*4882a593Smuzhiyun	pslld     $25, \TMP4                   # packed right shift << 25
1370*4882a593Smuzhiyun	pxor      \TMP3, \TMP2	               # xor the shifted versions
1371*4882a593Smuzhiyun	pxor      \TMP4, \TMP2
1372*4882a593Smuzhiyun	movdqa    \TMP2, \TMP5
1373*4882a593Smuzhiyun	psrldq    $4, \TMP5                    # right shift T5 1 DW
1374*4882a593Smuzhiyun	pslldq    $12, \TMP2                   # left shift T2 3 DWs
1375*4882a593Smuzhiyun	pxor      \TMP2, \XMM5
1376*4882a593Smuzhiyun
1377*4882a593Smuzhiyun        # second phase of reduction
1378*4882a593Smuzhiyun
1379*4882a593Smuzhiyun	movdqa    \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1380*4882a593Smuzhiyun	movdqa    \XMM5,\TMP3
1381*4882a593Smuzhiyun	movdqa    \XMM5,\TMP4
1382*4882a593Smuzhiyun	psrld     $1, \TMP2                    # packed left shift >>1
1383*4882a593Smuzhiyun	psrld     $2, \TMP3                    # packed left shift >>2
1384*4882a593Smuzhiyun	psrld     $7, \TMP4                    # packed left shift >>7
1385*4882a593Smuzhiyun	pxor      \TMP3,\TMP2		       # xor the shifted versions
1386*4882a593Smuzhiyun	pxor      \TMP4,\TMP2
1387*4882a593Smuzhiyun	pxor      \TMP5, \TMP2
1388*4882a593Smuzhiyun	pxor      \TMP2, \XMM5
1389*4882a593Smuzhiyun	pxor      \TMP1, \XMM5                 # result is in TMP1
1390*4882a593Smuzhiyun
1391*4882a593Smuzhiyun	pxor	  \XMM5, \XMM1
1392*4882a593Smuzhiyun.endm
1393*4882a593Smuzhiyun
1394*4882a593Smuzhiyun/* GHASH the last 4 ciphertext blocks. */
1395*4882a593Smuzhiyun.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1396*4882a593SmuzhiyunTMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1397*4882a593Smuzhiyun
1398*4882a593Smuzhiyun        # Multiply XMM1 * HashKey_4 (using Karatsuba)
1399*4882a593Smuzhiyun
1400*4882a593Smuzhiyun	movdqa	  \XMM1, \TMP6
1401*4882a593Smuzhiyun	pshufd	  $78, \XMM1, \TMP2         # swap hi/lo qwords of XMM1
1402*4882a593Smuzhiyun	pxor	  \XMM1, \TMP2              # TMP2 = a1+a0
1403*4882a593Smuzhiyun	movdqu	  HashKey_4(%arg2), \TMP5
1404*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP6       # TMP6 = a1*b1
1405*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM1       # XMM1 = a0*b0
1406*4882a593Smuzhiyun	movdqu	  HashKey_4_k(%arg2), \TMP4
1407*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1408*4882a593Smuzhiyun	movdqa	  \XMM1, \XMMDst
1409*4882a593Smuzhiyun	movdqa	  \TMP2, \XMM1              # result in TMP6, XMMDst, XMM1
1410*4882a593Smuzhiyun
1411*4882a593Smuzhiyun        # Multiply XMM2 * HashKey_3 (using Karatsuba)
1412*4882a593Smuzhiyun
1413*4882a593Smuzhiyun	movdqa	  \XMM2, \TMP1
1414*4882a593Smuzhiyun	pshufd	  $78, \XMM2, \TMP2
1415*4882a593Smuzhiyun	pxor	  \XMM2, \TMP2              # TMP2 = a1+a0
1416*4882a593Smuzhiyun	movdqu	  HashKey_3(%arg2), \TMP5
1417*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1418*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM2       # XMM2 = a0*b0
1419*4882a593Smuzhiyun	movdqu	  HashKey_3_k(%arg2), \TMP4
1420*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1421*4882a593Smuzhiyun	pxor	  \TMP1, \TMP6
1422*4882a593Smuzhiyun	pxor	  \XMM2, \XMMDst
1423*4882a593Smuzhiyun	pxor	  \TMP2, \XMM1
1424*4882a593Smuzhiyun# results accumulated in TMP6, XMMDst, XMM1
1425*4882a593Smuzhiyun
1426*4882a593Smuzhiyun        # Multiply XMM3 * HashKey_2 (using Karatsuba)
1427*4882a593Smuzhiyun
1428*4882a593Smuzhiyun	movdqa	  \XMM3, \TMP1
1429*4882a593Smuzhiyun	pshufd	  $78, \XMM3, \TMP2
1430*4882a593Smuzhiyun	pxor	  \XMM3, \TMP2              # TMP2 = a1+a0
1431*4882a593Smuzhiyun	movdqu	  HashKey_2(%arg2), \TMP5
1432*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1       # TMP1 = a1*b1
1433*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM3       # XMM3 = a0*b0
1434*4882a593Smuzhiyun	movdqu	  HashKey_2_k(%arg2), \TMP4
1435*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1436*4882a593Smuzhiyun	pxor	  \TMP1, \TMP6
1437*4882a593Smuzhiyun	pxor	  \XMM3, \XMMDst
1438*4882a593Smuzhiyun	pxor	  \TMP2, \XMM1   # results accumulated in TMP6, XMMDst, XMM1
1439*4882a593Smuzhiyun
1440*4882a593Smuzhiyun        # Multiply XMM4 * HashKey (using Karatsuba)
1441*4882a593Smuzhiyun	movdqa	  \XMM4, \TMP1
1442*4882a593Smuzhiyun	pshufd	  $78, \XMM4, \TMP2
1443*4882a593Smuzhiyun	pxor	  \XMM4, \TMP2              # TMP2 = a1+a0
1444*4882a593Smuzhiyun	movdqu	  HashKey(%arg2), \TMP5
1445*4882a593Smuzhiyun	pclmulqdq $0x11, \TMP5, \TMP1	    # TMP1 = a1*b1
1446*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP5, \XMM4       # XMM4 = a0*b0
1447*4882a593Smuzhiyun	movdqu	  HashKey_k(%arg2), \TMP4
1448*4882a593Smuzhiyun	pclmulqdq $0x00, \TMP4, \TMP2       # TMP2 = (a1+a0)*(b1+b0)
1449*4882a593Smuzhiyun	pxor	  \TMP1, \TMP6
1450*4882a593Smuzhiyun	pxor	  \XMM4, \XMMDst
1451*4882a593Smuzhiyun	pxor	  \XMM1, \TMP2
1452*4882a593Smuzhiyun	pxor	  \TMP6, \TMP2
1453*4882a593Smuzhiyun	pxor	  \XMMDst, \TMP2
1454*4882a593Smuzhiyun	# middle section of the temp results combined as in karatsuba algorithm
1455*4882a593Smuzhiyun	movdqa	  \TMP2, \TMP4
1456*4882a593Smuzhiyun	pslldq	  $8, \TMP4                 # left shift TMP4 2 DWs
1457*4882a593Smuzhiyun	psrldq	  $8, \TMP2                 # right shift TMP2 2 DWs
1458*4882a593Smuzhiyun	pxor	  \TMP4, \XMMDst
1459*4882a593Smuzhiyun	pxor	  \TMP2, \TMP6
1460*4882a593Smuzhiyun# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1461*4882a593Smuzhiyun	# first phase of the reduction
1462*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP2
1463*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP3
1464*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP4
1465*4882a593Smuzhiyun# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1466*4882a593Smuzhiyun	pslld     $31, \TMP2                # packed left shifting << 31
1467*4882a593Smuzhiyun	pslld     $30, \TMP3                # packed left shifting << 30
1468*4882a593Smuzhiyun	pslld     $25, \TMP4                # packed left shifting << 25
1469*4882a593Smuzhiyun	pxor      \TMP3, \TMP2              # xor the shifted versions
1470*4882a593Smuzhiyun	pxor      \TMP4, \TMP2
1471*4882a593Smuzhiyun	movdqa    \TMP2, \TMP7
1472*4882a593Smuzhiyun	psrldq    $4, \TMP7                 # right shift TMP7 1 DW
1473*4882a593Smuzhiyun	pslldq    $12, \TMP2                # left shift TMP2 3 DWs
1474*4882a593Smuzhiyun	pxor      \TMP2, \XMMDst
1475*4882a593Smuzhiyun
1476*4882a593Smuzhiyun        # second phase of the reduction
1477*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP2
1478*4882a593Smuzhiyun	# make 3 copies of XMMDst for doing 3 shift operations
1479*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP3
1480*4882a593Smuzhiyun	movdqa    \XMMDst, \TMP4
1481*4882a593Smuzhiyun	psrld     $1, \TMP2                 # packed right shift >> 1
1482*4882a593Smuzhiyun	psrld     $2, \TMP3                 # packed right shift >> 2
1483*4882a593Smuzhiyun	psrld     $7, \TMP4                 # packed right shift >> 7
1484*4882a593Smuzhiyun	pxor      \TMP3, \TMP2              # xor the shifted versions
1485*4882a593Smuzhiyun	pxor      \TMP4, \TMP2
1486*4882a593Smuzhiyun	pxor      \TMP7, \TMP2
1487*4882a593Smuzhiyun	pxor      \TMP2, \XMMDst
1488*4882a593Smuzhiyun	pxor      \TMP6, \XMMDst            # reduced result is in XMMDst
1489*4882a593Smuzhiyun.endm
1490*4882a593Smuzhiyun
1491*4882a593Smuzhiyun
1492*4882a593Smuzhiyun/* Encryption of a single block
1493*4882a593Smuzhiyun* XMM0 = block in/out; TMP1 = scratch xmm reg for round keys; uses eax & r10
1494*4882a593Smuzhiyun*/
1495*4882a593Smuzhiyun
1496*4882a593Smuzhiyun.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1497*4882a593Smuzhiyun
1498*4882a593Smuzhiyun	pxor		(%arg1), \XMM0		# round 0: XOR in first round key
1499*4882a593Smuzhiyun	mov		keysize,%eax
1500*4882a593Smuzhiyun	shr		$2,%eax			# 128->4, 192->6, 256->8
1501*4882a593Smuzhiyun	add		$5,%eax			# 128->9, 192->11, 256->13
1502*4882a593Smuzhiyun	lea		16(%arg1), %r10	  # get first expanded key address
1503*4882a593Smuzhiyun
1504*4882a593Smuzhiyun_esb_loop_\@:
1505*4882a593Smuzhiyun	MOVADQ		(%r10),\TMP1
1506*4882a593Smuzhiyun	aesenc		\TMP1,\XMM0		# one AES round per iteration
1507*4882a593Smuzhiyun	add		$16,%r10
1508*4882a593Smuzhiyun	sub		$1,%eax
1509*4882a593Smuzhiyun	jnz		_esb_loop_\@
1510*4882a593Smuzhiyun
1511*4882a593Smuzhiyun	MOVADQ		(%r10),\TMP1
1512*4882a593Smuzhiyun	aesenclast	\TMP1,\XMM0		# final round
1513*4882a593Smuzhiyun.endm
1514*4882a593Smuzhiyun/*****************************************************************************
1515*4882a593Smuzhiyun* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
1516*4882a593Smuzhiyun*                   struct gcm_context_data *data
1517*4882a593Smuzhiyun*                                      // Context data
1518*4882a593Smuzhiyun*                   u8 *out,           // Plaintext output. Decrypt in-place is allowed.
1519*4882a593Smuzhiyun*                   const u8 *in,      // Ciphertext input
1520*4882a593Smuzhiyun*                   u64 plaintext_len, // Length of data in bytes for decryption.
1521*4882a593Smuzhiyun*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
1522*4882a593Smuzhiyun*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1523*4882a593Smuzhiyun*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
1524*4882a593Smuzhiyun*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
1525*4882a593Smuzhiyun*                   const u8 *aad,     // Additional Authentication Data (AAD)
1526*4882a593Smuzhiyun*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1527*4882a593Smuzhiyun*                   u8  *auth_tag,     // Authenticated Tag output. The driver will compare this to the
1528*4882a593Smuzhiyun*                                      // given authentication tag and only return the plaintext if they match.
1529*4882a593Smuzhiyun*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1530*4882a593Smuzhiyun*                                      // (most likely), 12 or 8.
1531*4882a593Smuzhiyun*
1532*4882a593Smuzhiyun* Assumptions:
1533*4882a593Smuzhiyun*
1534*4882a593Smuzhiyun* keys:
1535*4882a593Smuzhiyun*       keys are pre-expanded and aligned to 16 bytes. we are using the first
1536*4882a593Smuzhiyun*       set of 11 keys in the data structure void *aes_ctx
1537*4882a593Smuzhiyun*
1538*4882a593Smuzhiyun* iv:
1539*4882a593Smuzhiyun*       0                   1                   2                   3
1540*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1541*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1542*4882a593Smuzhiyun*       |                             Salt  (From the SA)               |
1543*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1544*4882a593Smuzhiyun*       |                     Initialization Vector                     |
1545*4882a593Smuzhiyun*       |         (This is the sequence number from IPSec header)       |
1546*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1547*4882a593Smuzhiyun*       |                              0x1                              |
1548*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1549*4882a593Smuzhiyun*
1550*4882a593Smuzhiyun*
1551*4882a593Smuzhiyun*
1552*4882a593Smuzhiyun* AAD:
1553*4882a593Smuzhiyun*       AAD padded to 128 bits with 0
1554*4882a593Smuzhiyun*       for example, assume AAD is a u32 vector
1555*4882a593Smuzhiyun*
1556*4882a593Smuzhiyun*       if AAD is 8 bytes:
1557*4882a593Smuzhiyun*       AAD[3] = {A0, A1};
1558*4882a593Smuzhiyun*       padded AAD in xmm register = {A1 A0 0 0}
1559*4882a593Smuzhiyun*
1560*4882a593Smuzhiyun*       0                   1                   2                   3
1561*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1562*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1563*4882a593Smuzhiyun*       |                               SPI (A1)                        |
1564*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1565*4882a593Smuzhiyun*       |                     32-bit Sequence Number (A0)               |
1566*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1567*4882a593Smuzhiyun*       |                              0x0                              |
1568*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1569*4882a593Smuzhiyun*
1570*4882a593Smuzhiyun*                                       AAD Format with 32-bit Sequence Number
1571*4882a593Smuzhiyun*
1572*4882a593Smuzhiyun*       if AAD is 12 bytes:
1573*4882a593Smuzhiyun*       AAD[3] = {A0, A1, A2};
1574*4882a593Smuzhiyun*       padded AAD in xmm register = {A2 A1 A0 0}
1575*4882a593Smuzhiyun*
1576*4882a593Smuzhiyun*       0                   1                   2                   3
1577*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1578*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1579*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1580*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1581*4882a593Smuzhiyun*       |                               SPI (A2)                        |
1582*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1583*4882a593Smuzhiyun*       |                 64-bit Extended Sequence Number {A1,A0}       |
1584*4882a593Smuzhiyun*       |                                                               |
1585*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1586*4882a593Smuzhiyun*       |                              0x0                              |
1587*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1588*4882a593Smuzhiyun*
1589*4882a593Smuzhiyun*                        AAD Format with 64-bit Extended Sequence Number
1590*4882a593Smuzhiyun*
1591*4882a593Smuzhiyun* poly = x^128 + x^127 + x^126 + x^121 + 1
1592*4882a593Smuzhiyun*
1593*4882a593Smuzhiyun*****************************************************************************/
1594*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_dec)
1595*4882a593Smuzhiyun	FUNC_SAVE			# prologue (FUNC_SAVE macro defined earlier in this file)
1596*4882a593Smuzhiyun
1597*4882a593Smuzhiyun	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
1598*4882a593Smuzhiyun	GCM_ENC_DEC dec				# decrypt + GHASH the payload
1599*4882a593Smuzhiyun	GCM_COMPLETE arg10, arg11		# auth_tag, auth_tag_len
1600*4882a593Smuzhiyun	FUNC_RESTORE
1601*4882a593Smuzhiyun	RET
1602*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_dec)
1603*4882a593Smuzhiyun
1604*4882a593Smuzhiyun
1605*4882a593Smuzhiyun/*****************************************************************************
1606*4882a593Smuzhiyun* void aesni_gcm_enc(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1607*4882a593Smuzhiyun*                    struct gcm_context_data *data
1608*4882a593Smuzhiyun*                                        // Context data
1609*4882a593Smuzhiyun*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1610*4882a593Smuzhiyun*                    const u8 *in,       // Plaintext input
1611*4882a593Smuzhiyun*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1612*4882a593Smuzhiyun*                    u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1613*4882a593Smuzhiyun*                                        // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1614*4882a593Smuzhiyun*                                        // concatenated with 0x00000001. 16-byte aligned pointer.
1615*4882a593Smuzhiyun*                    u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1616*4882a593Smuzhiyun*                    const u8 *aad,      // Additional Authentication Data (AAD)
1617*4882a593Smuzhiyun*                    u64 aad_len,        // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1618*4882a593Smuzhiyun*                    u8 *auth_tag,       // Authenticated Tag output.
1619*4882a593Smuzhiyun*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1620*4882a593Smuzhiyun*                                        // 12 or 8.
1621*4882a593Smuzhiyun*
1622*4882a593Smuzhiyun* Assumptions:
1623*4882a593Smuzhiyun*
1624*4882a593Smuzhiyun* keys:
1625*4882a593Smuzhiyun*       keys are pre-expanded and aligned to 16 bytes. we are using the
1626*4882a593Smuzhiyun*       first set of 11 keys in the data structure void *aes_ctx
1627*4882a593Smuzhiyun*
1628*4882a593Smuzhiyun*
1629*4882a593Smuzhiyun* iv:
1630*4882a593Smuzhiyun*       0                   1                   2                   3
1631*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1632*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1633*4882a593Smuzhiyun*       |                             Salt  (From the SA)               |
1634*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1635*4882a593Smuzhiyun*       |                     Initialization Vector                     |
1636*4882a593Smuzhiyun*       |         (This is the sequence number from IPSec header)       |
1637*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1638*4882a593Smuzhiyun*       |                              0x1                              |
1639*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1640*4882a593Smuzhiyun*
1641*4882a593Smuzhiyun*
1642*4882a593Smuzhiyun*
1643*4882a593Smuzhiyun* AAD:
1644*4882a593Smuzhiyun*       AAD padded to 128 bits with 0
1645*4882a593Smuzhiyun*       for example, assume AAD is a u32 vector
1646*4882a593Smuzhiyun*
1647*4882a593Smuzhiyun*       if AAD is 8 bytes:
1648*4882a593Smuzhiyun*       AAD[3] = {A0, A1};
1649*4882a593Smuzhiyun*       padded AAD in xmm register = {A1 A0 0 0}
1650*4882a593Smuzhiyun*
1651*4882a593Smuzhiyun*       0                   1                   2                   3
1652*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1653*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1654*4882a593Smuzhiyun*       |                               SPI (A1)                        |
1655*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1656*4882a593Smuzhiyun*       |                     32-bit Sequence Number (A0)               |
1657*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1658*4882a593Smuzhiyun*       |                              0x0                              |
1659*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1660*4882a593Smuzhiyun*
1661*4882a593Smuzhiyun*                                 AAD Format with 32-bit Sequence Number
1662*4882a593Smuzhiyun*
1663*4882a593Smuzhiyun*       if AAD is 12 bytes:
1664*4882a593Smuzhiyun*       AAD[3] = {A0, A1, A2};
1665*4882a593Smuzhiyun*       padded AAD in xmm register = {A2 A1 A0 0}
1666*4882a593Smuzhiyun*
1667*4882a593Smuzhiyun*       0                   1                   2                   3
1668*4882a593Smuzhiyun*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1669*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1670*4882a593Smuzhiyun*       |                               SPI (A2)                        |
1671*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1672*4882a593Smuzhiyun*       |                 64-bit Extended Sequence Number {A1,A0}       |
1673*4882a593Smuzhiyun*       |                                                               |
1674*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1675*4882a593Smuzhiyun*       |                              0x0                              |
1676*4882a593Smuzhiyun*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1677*4882a593Smuzhiyun*
1678*4882a593Smuzhiyun*                         AAD Format with 64-bit Extended Sequence Number
1679*4882a593Smuzhiyun*
1680*4882a593Smuzhiyun* poly = x^128 + x^127 + x^126 + x^121 + 1
1681*4882a593Smuzhiyun***************************************************************************/
1682*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_enc)
1683*4882a593Smuzhiyun	FUNC_SAVE			# prologue (FUNC_SAVE macro defined earlier in this file)
1684*4882a593Smuzhiyun
1685*4882a593Smuzhiyun	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
1686*4882a593Smuzhiyun	GCM_ENC_DEC enc				# encrypt + GHASH the payload
1687*4882a593Smuzhiyun
1688*4882a593Smuzhiyun	GCM_COMPLETE arg10, arg11		# auth_tag, auth_tag_len
1689*4882a593Smuzhiyun	FUNC_RESTORE
1690*4882a593Smuzhiyun	RET
1691*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_enc)
1692*4882a593Smuzhiyun
1693*4882a593Smuzhiyun/*****************************************************************************
1694*4882a593Smuzhiyun* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1695*4882a593Smuzhiyun*                     struct gcm_context_data *data,
1696*4882a593Smuzhiyun*                                         // context data
1697*4882a593Smuzhiyun*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
1698*4882a593Smuzhiyun*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1699*4882a593Smuzhiyun*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
1700*4882a593Smuzhiyun*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
1701*4882a593Smuzhiyun*                     const u8 *aad,      // Additional Authentication Data (AAD)
1702*4882a593Smuzhiyun*                     u64 aad_len)        // Length of AAD in bytes.
1703*4882a593Smuzhiyun*/
1704*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_init)
1705*4882a593Smuzhiyun	FUNC_SAVE
1706*4882a593Smuzhiyun	GCM_INIT %arg3, %arg4,%arg5, %arg6	# iv, hash_subkey, aad, aad_len
1707*4882a593Smuzhiyun	FUNC_RESTORE
1708*4882a593Smuzhiyun	RET
1709*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_init)
1710*4882a593Smuzhiyun
1711*4882a593Smuzhiyun/*****************************************************************************
1712*4882a593Smuzhiyun* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1713*4882a593Smuzhiyun*                    struct gcm_context_data *data,
1714*4882a593Smuzhiyun*                                        // context data
1715*4882a593Smuzhiyun*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
1716*4882a593Smuzhiyun*                    const u8 *in,       // Plaintext input
1717*4882a593Smuzhiyun*                    u64 plaintext_len,  // Length of data in bytes for encryption.
1718*4882a593Smuzhiyun*/
1719*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_enc_update)
1720*4882a593Smuzhiyun	FUNC_SAVE
1721*4882a593Smuzhiyun	GCM_ENC_DEC enc			# encrypt + GHASH this chunk; state in gcm_context_data
1722*4882a593Smuzhiyun	FUNC_RESTORE
1723*4882a593Smuzhiyun	RET
1724*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_enc_update)
1725*4882a593Smuzhiyun
1726*4882a593Smuzhiyun/*****************************************************************************
1727*4882a593Smuzhiyun* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1728*4882a593Smuzhiyun*                    struct gcm_context_data *data,
1729*4882a593Smuzhiyun*                                        // context data
1730*4882a593Smuzhiyun*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
1731*4882a593Smuzhiyun*                    const u8 *in,       // Ciphertext input
1732*4882a593Smuzhiyun*                    u64 plaintext_len,  // Length of data in bytes for decryption.
1733*4882a593Smuzhiyun*/
1734*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_dec_update)
1735*4882a593Smuzhiyun	FUNC_SAVE
1736*4882a593Smuzhiyun	GCM_ENC_DEC dec			# decrypt + GHASH this chunk; state in gcm_context_data
1737*4882a593Smuzhiyun	FUNC_RESTORE
1738*4882a593Smuzhiyun	RET
1739*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_dec_update)
1740*4882a593Smuzhiyun
1741*4882a593Smuzhiyun/*****************************************************************************
1742*4882a593Smuzhiyun* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
1743*4882a593Smuzhiyun*                    struct gcm_context_data *data,
1744*4882a593Smuzhiyun*                                        // context data
1745*4882a593Smuzhiyun*                    u8 *auth_tag,       // Authenticated Tag output.
1746*4882a593Smuzhiyun*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1747*4882a593Smuzhiyun*                                        // 12 or 8.
1748*4882a593Smuzhiyun*/
1749*4882a593SmuzhiyunSYM_FUNC_START(aesni_gcm_finalize)
1750*4882a593Smuzhiyun	FUNC_SAVE
1751*4882a593Smuzhiyun	GCM_COMPLETE %arg3 %arg4	# auth_tag, auth_tag_len
1752*4882a593Smuzhiyun	FUNC_RESTORE
1753*4882a593Smuzhiyun	RET
1754*4882a593SmuzhiyunSYM_FUNC_END(aesni_gcm_finalize)
1755*4882a593Smuzhiyun
1756*4882a593Smuzhiyun#endif
1757*4882a593Smuzhiyun
1758*4882a593Smuzhiyun
1759*4882a593SmuzhiyunSYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
1760*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_key_expansion_256a)
	# Derive the next 128-bit round key from %xmm0 (previous key) and %xmm1.
	# NOTE(review): the shufps/pxor pairs use %xmm4 as scratch to fold each
	# earlier key word into the next one — assumes the caller prepared
	# %xmm1 (aeskeygenassist result) and %xmm4; confirm in aesni_set_key.
1761*4882a593Smuzhiyun	pshufd $0b11111111, %xmm1, %xmm1	# broadcast dword 3 of %xmm1 to all lanes
1762*4882a593Smuzhiyun	shufps $0b00010000, %xmm0, %xmm4
1763*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1764*4882a593Smuzhiyun	shufps $0b10001100, %xmm0, %xmm4
1765*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1766*4882a593Smuzhiyun	pxor %xmm1, %xmm0			# fold in the keygen-assist word
1767*4882a593Smuzhiyun	movaps %xmm0, (TKEYP)			# store new round key into schedule
1768*4882a593Smuzhiyun	add $0x10, TKEYP			# advance schedule pointer 16 bytes
1769*4882a593Smuzhiyun	RET
1770*4882a593SmuzhiyunSYM_FUNC_END(_key_expansion_256a)
1771*4882a593SmuzhiyunSYM_FUNC_END_ALIAS(_key_expansion_128)
1772*4882a593Smuzhiyun
1773*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_key_expansion_192a)
	# AES-192 key step: updates %xmm0 (low 128 bits) and %xmm2 (high 64
	# bits of the 192-bit key), then emits TWO 16-byte schedule entries.
1774*4882a593Smuzhiyun	pshufd $0b01010101, %xmm1, %xmm1	# broadcast dword 1 of %xmm1 to all lanes
1775*4882a593Smuzhiyun	shufps $0b00010000, %xmm0, %xmm4
1776*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1777*4882a593Smuzhiyun	shufps $0b10001100, %xmm0, %xmm4
1778*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1779*4882a593Smuzhiyun	pxor %xmm1, %xmm0			# fold in the keygen-assist word
1780*4882a593Smuzhiyun
1781*4882a593Smuzhiyun	movaps %xmm2, %xmm5
1782*4882a593Smuzhiyun	movaps %xmm2, %xmm6
1783*4882a593Smuzhiyun	pslldq $4, %xmm5			# %xmm5 = %xmm2 shifted left one dword
1784*4882a593Smuzhiyun	pshufd $0b11111111, %xmm0, %xmm3	# broadcast dword 3 of new %xmm0
1785*4882a593Smuzhiyun	pxor %xmm3, %xmm2
1786*4882a593Smuzhiyun	pxor %xmm5, %xmm2			# %xmm2 = updated high key words
1787*4882a593Smuzhiyun
1788*4882a593Smuzhiyun	movaps %xmm0, %xmm1
1789*4882a593Smuzhiyun	shufps $0b01000100, %xmm0, %xmm6	# pack previous high words + new low words
1790*4882a593Smuzhiyun	movaps %xmm6, (TKEYP)			# store first schedule entry
1791*4882a593Smuzhiyun	shufps $0b01001110, %xmm2, %xmm1	# pack remaining new key words
1792*4882a593Smuzhiyun	movaps %xmm1, 0x10(TKEYP)		# store second schedule entry
1793*4882a593Smuzhiyun	add $0x20, TKEYP			# advance schedule pointer 32 bytes
1794*4882a593Smuzhiyun	RET
1795*4882a593SmuzhiyunSYM_FUNC_END(_key_expansion_192a)
1796*4882a593Smuzhiyun
1797*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_key_expansion_192b)
	# AES-192 key-schedule step that emits ONE 16-byte round key
	# (the companion of _key_expansion_192a, used on alternating steps).
	# In:  %xmm0 = low 128 bits, %xmm2 = high 64 bits, %xmm1 =
	#      aeskeygenassist result (dword 1), %xmm4 = zero.
	# Out: %xmm0 stored at (TKEYP); TKEYP += 16.  Clobbers %xmm3, %xmm5.
1798*4882a593Smuzhiyun	pshufd $0b01010101, %xmm1, %xmm1
	# Prefix-XOR of the dwords of %xmm0 using zeroed %xmm4.
1799*4882a593Smuzhiyun	shufps $0b00010000, %xmm0, %xmm4
1800*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1801*4882a593Smuzhiyun	shufps $0b10001100, %xmm0, %xmm4
1802*4882a593Smuzhiyun	pxor %xmm4, %xmm0
1803*4882a593Smuzhiyun	pxor %xmm1, %xmm0
1804*4882a593Smuzhiyun
	# Update the high 64 bits as in _key_expansion_192a.
1805*4882a593Smuzhiyun	movaps %xmm2, %xmm5
1806*4882a593Smuzhiyun	pslldq $4, %xmm5
1807*4882a593Smuzhiyun	pshufd $0b11111111, %xmm0, %xmm3
1808*4882a593Smuzhiyun	pxor %xmm3, %xmm2
1809*4882a593Smuzhiyun	pxor %xmm5, %xmm2
1810*4882a593Smuzhiyun
1811*4882a593Smuzhiyun	movaps %xmm0, (TKEYP)
1812*4882a593Smuzhiyun	add $0x10, TKEYP
1813*4882a593Smuzhiyun	RET
1814*4882a593SmuzhiyunSYM_FUNC_END(_key_expansion_192b)
1815*4882a593Smuzhiyun
1816*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_key_expansion_256b)
	# AES-256 key-schedule step for the odd round keys (operates on the
	# upper key half held in %xmm2).
	# In:  %xmm2 = previous odd round key, %xmm1 = aeskeygenassist
	#      result (SubWord-only value in dword 2), %xmm4 = zero.
	# Out: %xmm2 = new round key, stored at (TKEYP); TKEYP += 16.
1817*4882a593Smuzhiyun	pshufd $0b10101010, %xmm1, %xmm1
	# Prefix-XOR of the dwords of %xmm2 using zeroed %xmm4.
1818*4882a593Smuzhiyun	shufps $0b00010000, %xmm2, %xmm4
1819*4882a593Smuzhiyun	pxor %xmm4, %xmm2
1820*4882a593Smuzhiyun	shufps $0b10001100, %xmm2, %xmm4
1821*4882a593Smuzhiyun	pxor %xmm4, %xmm2
1822*4882a593Smuzhiyun	pxor %xmm1, %xmm2
1823*4882a593Smuzhiyun	movaps %xmm2, (TKEYP)
1824*4882a593Smuzhiyun	add $0x10, TKEYP
1825*4882a593Smuzhiyun	RET
1826*4882a593SmuzhiyunSYM_FUNC_END(_key_expansion_256b)
1827*4882a593Smuzhiyun
1828*4882a593Smuzhiyun/*
1829*4882a593Smuzhiyun * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
1830*4882a593Smuzhiyun *                   unsigned int key_len)
1831*4882a593Smuzhiyun */
1832*4882a593SmuzhiyunSYM_FUNC_START(aesni_set_key)
	# Expand the user key into the context: encryption round keys at
	# ctx+0, decryption round keys at ctx+240 (built in the .Ldec_key
	# tail), key length (bytes) stored at ctx+480.  Returns 0 in AREG.
1833*4882a593Smuzhiyun	FRAME_BEGIN
1834*4882a593Smuzhiyun#ifndef __x86_64__
	# 32-bit build: arguments come from the stack, not registers.
1835*4882a593Smuzhiyun	pushl KEYP
1836*4882a593Smuzhiyun	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
1837*4882a593Smuzhiyun	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
1838*4882a593Smuzhiyun	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
1839*4882a593Smuzhiyun#endif
1840*4882a593Smuzhiyun	movups (UKEYP), %xmm0		# user key (first 16 bytes)
1841*4882a593Smuzhiyun	movaps %xmm0, (KEYP)
1842*4882a593Smuzhiyun	lea 0x10(KEYP), TKEYP		# key addr
1843*4882a593Smuzhiyun	movl %edx, 480(KEYP)
1844*4882a593Smuzhiyun	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	# Dispatch on key length in bytes: <24 = AES-128, ==24 = AES-192,
	# otherwise AES-256 (fall through).
1845*4882a593Smuzhiyun	cmp $24, %dl
1846*4882a593Smuzhiyun	jb .Lenc_key128
1847*4882a593Smuzhiyun	je .Lenc_key192
	# AES-256: 13 derived round keys, alternating 256a/256b helpers.
1848*4882a593Smuzhiyun	movups 0x10(UKEYP), %xmm2	# other user key
1849*4882a593Smuzhiyun	movaps %xmm2, (TKEYP)
1850*4882a593Smuzhiyun	add $0x10, TKEYP
1851*4882a593Smuzhiyun	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1852*4882a593Smuzhiyun	call _key_expansion_256a
1853*4882a593Smuzhiyun	aeskeygenassist $0x1, %xmm0, %xmm1
1854*4882a593Smuzhiyun	call _key_expansion_256b
1855*4882a593Smuzhiyun	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1856*4882a593Smuzhiyun	call _key_expansion_256a
1857*4882a593Smuzhiyun	aeskeygenassist $0x2, %xmm0, %xmm1
1858*4882a593Smuzhiyun	call _key_expansion_256b
1859*4882a593Smuzhiyun	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1860*4882a593Smuzhiyun	call _key_expansion_256a
1861*4882a593Smuzhiyun	aeskeygenassist $0x4, %xmm0, %xmm1
1862*4882a593Smuzhiyun	call _key_expansion_256b
1863*4882a593Smuzhiyun	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1864*4882a593Smuzhiyun	call _key_expansion_256a
1865*4882a593Smuzhiyun	aeskeygenassist $0x8, %xmm0, %xmm1
1866*4882a593Smuzhiyun	call _key_expansion_256b
1867*4882a593Smuzhiyun	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1868*4882a593Smuzhiyun	call _key_expansion_256a
1869*4882a593Smuzhiyun	aeskeygenassist $0x10, %xmm0, %xmm1
1870*4882a593Smuzhiyun	call _key_expansion_256b
1871*4882a593Smuzhiyun	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1872*4882a593Smuzhiyun	call _key_expansion_256a
1873*4882a593Smuzhiyun	aeskeygenassist $0x20, %xmm0, %xmm1
1874*4882a593Smuzhiyun	call _key_expansion_256b
1875*4882a593Smuzhiyun	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1876*4882a593Smuzhiyun	call _key_expansion_256a
1877*4882a593Smuzhiyun	jmp .Ldec_key
1878*4882a593Smuzhiyun.Lenc_key192:
	# AES-192: each 192a/192b pair consumes one aeskeygenassist result;
	# 192a stores two round keys, 192b stores one.
1879*4882a593Smuzhiyun	movq 0x10(UKEYP), %xmm2		# other user key
1880*4882a593Smuzhiyun	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
1881*4882a593Smuzhiyun	call _key_expansion_192a
1882*4882a593Smuzhiyun	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
1883*4882a593Smuzhiyun	call _key_expansion_192b
1884*4882a593Smuzhiyun	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
1885*4882a593Smuzhiyun	call _key_expansion_192a
1886*4882a593Smuzhiyun	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
1887*4882a593Smuzhiyun	call _key_expansion_192b
1888*4882a593Smuzhiyun	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
1889*4882a593Smuzhiyun	call _key_expansion_192a
1890*4882a593Smuzhiyun	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
1891*4882a593Smuzhiyun	call _key_expansion_192b
1892*4882a593Smuzhiyun	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
1893*4882a593Smuzhiyun	call _key_expansion_192a
1894*4882a593Smuzhiyun	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
1895*4882a593Smuzhiyun	call _key_expansion_192b
1896*4882a593Smuzhiyun	jmp .Ldec_key
1897*4882a593Smuzhiyun.Lenc_key128:
	# AES-128: 10 round keys, Rcon sequence 1,2,4,...,0x80,0x1b,0x36.
1898*4882a593Smuzhiyun	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
1899*4882a593Smuzhiyun	call _key_expansion_128
1900*4882a593Smuzhiyun	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
1901*4882a593Smuzhiyun	call _key_expansion_128
1902*4882a593Smuzhiyun	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
1903*4882a593Smuzhiyun	call _key_expansion_128
1904*4882a593Smuzhiyun	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
1905*4882a593Smuzhiyun	call _key_expansion_128
1906*4882a593Smuzhiyun	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
1907*4882a593Smuzhiyun	call _key_expansion_128
1908*4882a593Smuzhiyun	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
1909*4882a593Smuzhiyun	call _key_expansion_128
1910*4882a593Smuzhiyun	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
1911*4882a593Smuzhiyun	call _key_expansion_128
1912*4882a593Smuzhiyun	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
1913*4882a593Smuzhiyun	call _key_expansion_128
1914*4882a593Smuzhiyun	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
1915*4882a593Smuzhiyun	call _key_expansion_128
1916*4882a593Smuzhiyun	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
1917*4882a593Smuzhiyun	call _key_expansion_128
1918*4882a593Smuzhiyun.Ldec_key:
	# Build the decryption schedule at ctx+240: first and last round
	# keys are swapped between the two schedules; every middle key is
	# passed through aesimc (InvMixColumns) and stored in reverse order.
1919*4882a593Smuzhiyun	sub $0x10, TKEYP
1920*4882a593Smuzhiyun	movaps (KEYP), %xmm0
1921*4882a593Smuzhiyun	movaps (TKEYP), %xmm1
1922*4882a593Smuzhiyun	movaps %xmm0, 240(TKEYP)
1923*4882a593Smuzhiyun	movaps %xmm1, 240(KEYP)
1924*4882a593Smuzhiyun	add $0x10, KEYP
1925*4882a593Smuzhiyun	lea 240-16(TKEYP), UKEYP
1926*4882a593Smuzhiyun.align 4
1927*4882a593Smuzhiyun.Ldec_key_loop:
1928*4882a593Smuzhiyun	movaps (KEYP), %xmm0
1929*4882a593Smuzhiyun	aesimc %xmm0, %xmm1
1930*4882a593Smuzhiyun	movaps %xmm1, (UKEYP)
1931*4882a593Smuzhiyun	add $0x10, KEYP
1932*4882a593Smuzhiyun	sub $0x10, UKEYP
1933*4882a593Smuzhiyun	cmp TKEYP, KEYP
1934*4882a593Smuzhiyun	jb .Ldec_key_loop
1935*4882a593Smuzhiyun	xor AREG, AREG		# return 0 (success)
1936*4882a593Smuzhiyun#ifndef __x86_64__
1937*4882a593Smuzhiyun	popl KEYP
1938*4882a593Smuzhiyun#endif
1939*4882a593Smuzhiyun	FRAME_END
1940*4882a593Smuzhiyun	RET
1941*4882a593SmuzhiyunSYM_FUNC_END(aesni_set_key)
1942*4882a593Smuzhiyun
1943*4882a593Smuzhiyun/*
1944*4882a593Smuzhiyun * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
1945*4882a593Smuzhiyun */
1946*4882a593SmuzhiyunSYM_FUNC_START(aesni_enc)
	# Encrypt one 16-byte block: dst = AES-encrypt(ctx, src).
	# Loads the key length from ctx+480 and delegates to _aesni_enc1.
1947*4882a593Smuzhiyun	FRAME_BEGIN
1948*4882a593Smuzhiyun#ifndef __x86_64__
	# 32-bit build: fetch arguments from the stack and preserve the
	# registers the macros alias to callee-saved ones.
1949*4882a593Smuzhiyun	pushl KEYP
1950*4882a593Smuzhiyun	pushl KLEN
1951*4882a593Smuzhiyun	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
1952*4882a593Smuzhiyun	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
1953*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), INP	# src
1954*4882a593Smuzhiyun#endif
1955*4882a593Smuzhiyun	movl 480(KEYP), KLEN		# key length
1956*4882a593Smuzhiyun	movups (INP), STATE		# input
1957*4882a593Smuzhiyun	call _aesni_enc1
1958*4882a593Smuzhiyun	movups STATE, (OUTP)		# output
1959*4882a593Smuzhiyun#ifndef __x86_64__
1960*4882a593Smuzhiyun	popl KLEN
1961*4882a593Smuzhiyun	popl KEYP
1962*4882a593Smuzhiyun#endif
1963*4882a593Smuzhiyun	FRAME_END
1964*4882a593Smuzhiyun	RET
1965*4882a593SmuzhiyunSYM_FUNC_END(aesni_enc)
1966*4882a593Smuzhiyun
1967*4882a593Smuzhiyun/*
1968*4882a593Smuzhiyun * _aesni_enc1:		internal ABI
1969*4882a593Smuzhiyun * input:
1970*4882a593Smuzhiyun *	KEYP:		key struct pointer
1971*4882a593Smuzhiyun *	KLEN:		key length (in bytes: 16/24/32)
1972*4882a593Smuzhiyun *	STATE:		initial state (input)
1973*4882a593Smuzhiyun * output:
1974*4882a593Smuzhiyun *	STATE:		final state (output)
1975*4882a593Smuzhiyun * changed:
1976*4882a593Smuzhiyun *	KEY
1977*4882a593Smuzhiyun *	TKEYP (T1)
1978*4882a593Smuzhiyun */
1979*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_enc1)
	# Encrypt STATE in place using the schedule at (KEYP).
	# KLEN is the key length in bytes (16/24/32); TKEYP is biased so
	# the fixed -0x20..0x70 offsets below walk the right number of
	# round keys: +0x30 for 128-bit, +0x50 for 192, +0x70 for 256.
1980*4882a593Smuzhiyun	movaps (KEYP), KEY		# key
1981*4882a593Smuzhiyun	mov KEYP, TKEYP
1982*4882a593Smuzhiyun	pxor KEY, STATE		# round 0
1983*4882a593Smuzhiyun	add $0x30, TKEYP
1984*4882a593Smuzhiyun	cmp $24, KLEN
1985*4882a593Smuzhiyun	jb .Lenc128
1986*4882a593Smuzhiyun	lea 0x20(TKEYP), TKEYP
1987*4882a593Smuzhiyun	je .Lenc192
	# AES-256: two extra rounds before falling into the 192 path.
1988*4882a593Smuzhiyun	add $0x20, TKEYP
1989*4882a593Smuzhiyun	movaps -0x60(TKEYP), KEY
1990*4882a593Smuzhiyun	aesenc KEY, STATE
1991*4882a593Smuzhiyun	movaps -0x50(TKEYP), KEY
1992*4882a593Smuzhiyun	aesenc KEY, STATE
1993*4882a593Smuzhiyun.align 4
1994*4882a593Smuzhiyun.Lenc192:
	# AES-192: two extra rounds before falling into the 128 path.
1995*4882a593Smuzhiyun	movaps -0x40(TKEYP), KEY
1996*4882a593Smuzhiyun	aesenc KEY, STATE
1997*4882a593Smuzhiyun	movaps -0x30(TKEYP), KEY
1998*4882a593Smuzhiyun	aesenc KEY, STATE
1999*4882a593Smuzhiyun.align 4
2000*4882a593Smuzhiyun.Lenc128:
	# Common tail: nine full rounds plus the final aesenclast.
2001*4882a593Smuzhiyun	movaps -0x20(TKEYP), KEY
2002*4882a593Smuzhiyun	aesenc KEY, STATE
2003*4882a593Smuzhiyun	movaps -0x10(TKEYP), KEY
2004*4882a593Smuzhiyun	aesenc KEY, STATE
2005*4882a593Smuzhiyun	movaps (TKEYP), KEY
2006*4882a593Smuzhiyun	aesenc KEY, STATE
2007*4882a593Smuzhiyun	movaps 0x10(TKEYP), KEY
2008*4882a593Smuzhiyun	aesenc KEY, STATE
2009*4882a593Smuzhiyun	movaps 0x20(TKEYP), KEY
2010*4882a593Smuzhiyun	aesenc KEY, STATE
2011*4882a593Smuzhiyun	movaps 0x30(TKEYP), KEY
2012*4882a593Smuzhiyun	aesenc KEY, STATE
2013*4882a593Smuzhiyun	movaps 0x40(TKEYP), KEY
2014*4882a593Smuzhiyun	aesenc KEY, STATE
2015*4882a593Smuzhiyun	movaps 0x50(TKEYP), KEY
2016*4882a593Smuzhiyun	aesenc KEY, STATE
2017*4882a593Smuzhiyun	movaps 0x60(TKEYP), KEY
2018*4882a593Smuzhiyun	aesenc KEY, STATE
2019*4882a593Smuzhiyun	movaps 0x70(TKEYP), KEY
2020*4882a593Smuzhiyun	aesenclast KEY, STATE
2021*4882a593Smuzhiyun	RET
2022*4882a593SmuzhiyunSYM_FUNC_END(_aesni_enc1)
2023*4882a593Smuzhiyun
2024*4882a593Smuzhiyun/*
2025*4882a593Smuzhiyun * _aesni_enc4:	internal ABI
2026*4882a593Smuzhiyun * input:
2027*4882a593Smuzhiyun *	KEYP:		key struct pointer
2028*4882a593Smuzhiyun *	KLEN:		key length (in bytes: 16/24/32)
2029*4882a593Smuzhiyun *	STATE1:		initial state (input)
2030*4882a593Smuzhiyun *	STATE2
2031*4882a593Smuzhiyun *	STATE3
2032*4882a593Smuzhiyun *	STATE4
2033*4882a593Smuzhiyun * output:
2034*4882a593Smuzhiyun *	STATE1:		final state (output)
2035*4882a593Smuzhiyun *	STATE2
2036*4882a593Smuzhiyun *	STATE3
2037*4882a593Smuzhiyun *	STATE4
2038*4882a593Smuzhiyun * changed:
2039*4882a593Smuzhiyun *	KEY
2040*4882a593Smuzhiyun *	TKEYP (T1)
2041*4882a593Smuzhiyun */
2042*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_enc4)
	# Encrypt four independent states (STATE1..4) in place, walking the
	# key schedule exactly as _aesni_enc1 but interleaving the four
	# blocks so their aesenc latencies overlap.
2043*4882a593Smuzhiyun	movaps (KEYP), KEY		# key
2044*4882a593Smuzhiyun	mov KEYP, TKEYP
2045*4882a593Smuzhiyun	pxor KEY, STATE1		# round 0
2046*4882a593Smuzhiyun	pxor KEY, STATE2
2047*4882a593Smuzhiyun	pxor KEY, STATE3
2048*4882a593Smuzhiyun	pxor KEY, STATE4
	# TKEYP biasing per key size, as in _aesni_enc1.
2049*4882a593Smuzhiyun	add $0x30, TKEYP
2050*4882a593Smuzhiyun	cmp $24, KLEN
2051*4882a593Smuzhiyun	jb .L4enc128
2052*4882a593Smuzhiyun	lea 0x20(TKEYP), TKEYP
2053*4882a593Smuzhiyun	je .L4enc192
2054*4882a593Smuzhiyun	add $0x20, TKEYP
2055*4882a593Smuzhiyun	movaps -0x60(TKEYP), KEY
2056*4882a593Smuzhiyun	aesenc KEY, STATE1
2057*4882a593Smuzhiyun	aesenc KEY, STATE2
2058*4882a593Smuzhiyun	aesenc KEY, STATE3
2059*4882a593Smuzhiyun	aesenc KEY, STATE4
2060*4882a593Smuzhiyun	movaps -0x50(TKEYP), KEY
2061*4882a593Smuzhiyun	aesenc KEY, STATE1
2062*4882a593Smuzhiyun	aesenc KEY, STATE2
2063*4882a593Smuzhiyun	aesenc KEY, STATE3
2064*4882a593Smuzhiyun	aesenc KEY, STATE4
	# NOTE(review): alignment directive below is commented out here but
	# active in _aesni_dec4 -- harmless (alignment only), kept as-is.
2065*4882a593Smuzhiyun#.align 4
2066*4882a593Smuzhiyun.L4enc192:
2067*4882a593Smuzhiyun	movaps -0x40(TKEYP), KEY
2068*4882a593Smuzhiyun	aesenc KEY, STATE1
2069*4882a593Smuzhiyun	aesenc KEY, STATE2
2070*4882a593Smuzhiyun	aesenc KEY, STATE3
2071*4882a593Smuzhiyun	aesenc KEY, STATE4
2072*4882a593Smuzhiyun	movaps -0x30(TKEYP), KEY
2073*4882a593Smuzhiyun	aesenc KEY, STATE1
2074*4882a593Smuzhiyun	aesenc KEY, STATE2
2075*4882a593Smuzhiyun	aesenc KEY, STATE3
2076*4882a593Smuzhiyun	aesenc KEY, STATE4
2077*4882a593Smuzhiyun#.align 4
2078*4882a593Smuzhiyun.L4enc128:
2079*4882a593Smuzhiyun	movaps -0x20(TKEYP), KEY
2080*4882a593Smuzhiyun	aesenc KEY, STATE1
2081*4882a593Smuzhiyun	aesenc KEY, STATE2
2082*4882a593Smuzhiyun	aesenc KEY, STATE3
2083*4882a593Smuzhiyun	aesenc KEY, STATE4
2084*4882a593Smuzhiyun	movaps -0x10(TKEYP), KEY
2085*4882a593Smuzhiyun	aesenc KEY, STATE1
2086*4882a593Smuzhiyun	aesenc KEY, STATE2
2087*4882a593Smuzhiyun	aesenc KEY, STATE3
2088*4882a593Smuzhiyun	aesenc KEY, STATE4
2089*4882a593Smuzhiyun	movaps (TKEYP), KEY
2090*4882a593Smuzhiyun	aesenc KEY, STATE1
2091*4882a593Smuzhiyun	aesenc KEY, STATE2
2092*4882a593Smuzhiyun	aesenc KEY, STATE3
2093*4882a593Smuzhiyun	aesenc KEY, STATE4
2094*4882a593Smuzhiyun	movaps 0x10(TKEYP), KEY
2095*4882a593Smuzhiyun	aesenc KEY, STATE1
2096*4882a593Smuzhiyun	aesenc KEY, STATE2
2097*4882a593Smuzhiyun	aesenc KEY, STATE3
2098*4882a593Smuzhiyun	aesenc KEY, STATE4
2099*4882a593Smuzhiyun	movaps 0x20(TKEYP), KEY
2100*4882a593Smuzhiyun	aesenc KEY, STATE1
2101*4882a593Smuzhiyun	aesenc KEY, STATE2
2102*4882a593Smuzhiyun	aesenc KEY, STATE3
2103*4882a593Smuzhiyun	aesenc KEY, STATE4
2104*4882a593Smuzhiyun	movaps 0x30(TKEYP), KEY
2105*4882a593Smuzhiyun	aesenc KEY, STATE1
2106*4882a593Smuzhiyun	aesenc KEY, STATE2
2107*4882a593Smuzhiyun	aesenc KEY, STATE3
2108*4882a593Smuzhiyun	aesenc KEY, STATE4
2109*4882a593Smuzhiyun	movaps 0x40(TKEYP), KEY
2110*4882a593Smuzhiyun	aesenc KEY, STATE1
2111*4882a593Smuzhiyun	aesenc KEY, STATE2
2112*4882a593Smuzhiyun	aesenc KEY, STATE3
2113*4882a593Smuzhiyun	aesenc KEY, STATE4
2114*4882a593Smuzhiyun	movaps 0x50(TKEYP), KEY
2115*4882a593Smuzhiyun	aesenc KEY, STATE1
2116*4882a593Smuzhiyun	aesenc KEY, STATE2
2117*4882a593Smuzhiyun	aesenc KEY, STATE3
2118*4882a593Smuzhiyun	aesenc KEY, STATE4
2119*4882a593Smuzhiyun	movaps 0x60(TKEYP), KEY
2120*4882a593Smuzhiyun	aesenc KEY, STATE1
2121*4882a593Smuzhiyun	aesenc KEY, STATE2
2122*4882a593Smuzhiyun	aesenc KEY, STATE3
2123*4882a593Smuzhiyun	aesenc KEY, STATE4
2124*4882a593Smuzhiyun	movaps 0x70(TKEYP), KEY
2125*4882a593Smuzhiyun	aesenclast KEY, STATE1		# last round
2126*4882a593Smuzhiyun	aesenclast KEY, STATE2
2127*4882a593Smuzhiyun	aesenclast KEY, STATE3
2128*4882a593Smuzhiyun	aesenclast KEY, STATE4
2129*4882a593Smuzhiyun	RET
2130*4882a593SmuzhiyunSYM_FUNC_END(_aesni_enc4)
2131*4882a593Smuzhiyun
2132*4882a593Smuzhiyun/*
2133*4882a593Smuzhiyun * void aesni_dec (const void *ctx, u8 *dst, const u8 *src)
2134*4882a593Smuzhiyun */
2135*4882a593SmuzhiyunSYM_FUNC_START(aesni_dec)
	# Decrypt one 16-byte block: dst = AES-decrypt(ctx, src).
2136*4882a593Smuzhiyun	FRAME_BEGIN
2137*4882a593Smuzhiyun#ifndef __x86_64__
2138*4882a593Smuzhiyun	pushl KEYP
2139*4882a593Smuzhiyun	pushl KLEN
2140*4882a593Smuzhiyun	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
2141*4882a593Smuzhiyun	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
2142*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), INP	# src
2143*4882a593Smuzhiyun#endif
2144*4882a593Smuzhiyun	mov 480(KEYP), KLEN		# key length
	# Decryption round keys live at ctx+240 (built by aesni_set_key).
2145*4882a593Smuzhiyun	add $240, KEYP
2146*4882a593Smuzhiyun	movups (INP), STATE		# input
2147*4882a593Smuzhiyun	call _aesni_dec1
2148*4882a593Smuzhiyun	movups STATE, (OUTP)		# output
2149*4882a593Smuzhiyun#ifndef __x86_64__
2150*4882a593Smuzhiyun	popl KLEN
2151*4882a593Smuzhiyun	popl KEYP
2152*4882a593Smuzhiyun#endif
2153*4882a593Smuzhiyun	FRAME_END
2154*4882a593Smuzhiyun	RET
2155*4882a593SmuzhiyunSYM_FUNC_END(aesni_dec)
2156*4882a593Smuzhiyun
2157*4882a593Smuzhiyun/*
2158*4882a593Smuzhiyun * _aesni_dec1:		internal ABI
2159*4882a593Smuzhiyun * input:
2160*4882a593Smuzhiyun *	KEYP:		key struct pointer
2161*4882a593Smuzhiyun *	KLEN:		key length
2162*4882a593Smuzhiyun *	STATE:		initial state (input)
2163*4882a593Smuzhiyun * output:
2164*4882a593Smuzhiyun *	STATE:		final state (output)
2165*4882a593Smuzhiyun * changed:
2166*4882a593Smuzhiyun *	KEY
2167*4882a593Smuzhiyun *	TKEYP (T1)
2168*4882a593Smuzhiyun */
2169*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_dec1)
	# Decrypt STATE in place; mirror image of _aesni_enc1 using aesdec.
	# KEYP must already point at the decryption schedule (ctx+240);
	# TKEYP is biased per key size so fixed offsets walk the schedule.
2170*4882a593Smuzhiyun	movaps (KEYP), KEY		# key
2171*4882a593Smuzhiyun	mov KEYP, TKEYP
2172*4882a593Smuzhiyun	pxor KEY, STATE		# round 0
2173*4882a593Smuzhiyun	add $0x30, TKEYP
2174*4882a593Smuzhiyun	cmp $24, KLEN
2175*4882a593Smuzhiyun	jb .Ldec128
2176*4882a593Smuzhiyun	lea 0x20(TKEYP), TKEYP
2177*4882a593Smuzhiyun	je .Ldec192
	# AES-256: two extra rounds before falling into the 192 path.
2178*4882a593Smuzhiyun	add $0x20, TKEYP
2179*4882a593Smuzhiyun	movaps -0x60(TKEYP), KEY
2180*4882a593Smuzhiyun	aesdec KEY, STATE
2181*4882a593Smuzhiyun	movaps -0x50(TKEYP), KEY
2182*4882a593Smuzhiyun	aesdec KEY, STATE
2183*4882a593Smuzhiyun.align 4
2184*4882a593Smuzhiyun.Ldec192:
2185*4882a593Smuzhiyun	movaps -0x40(TKEYP), KEY
2186*4882a593Smuzhiyun	aesdec KEY, STATE
2187*4882a593Smuzhiyun	movaps -0x30(TKEYP), KEY
2188*4882a593Smuzhiyun	aesdec KEY, STATE
2189*4882a593Smuzhiyun.align 4
2190*4882a593Smuzhiyun.Ldec128:
	# Common tail: nine full rounds plus the final aesdeclast.
2191*4882a593Smuzhiyun	movaps -0x20(TKEYP), KEY
2192*4882a593Smuzhiyun	aesdec KEY, STATE
2193*4882a593Smuzhiyun	movaps -0x10(TKEYP), KEY
2194*4882a593Smuzhiyun	aesdec KEY, STATE
2195*4882a593Smuzhiyun	movaps (TKEYP), KEY
2196*4882a593Smuzhiyun	aesdec KEY, STATE
2197*4882a593Smuzhiyun	movaps 0x10(TKEYP), KEY
2198*4882a593Smuzhiyun	aesdec KEY, STATE
2199*4882a593Smuzhiyun	movaps 0x20(TKEYP), KEY
2200*4882a593Smuzhiyun	aesdec KEY, STATE
2201*4882a593Smuzhiyun	movaps 0x30(TKEYP), KEY
2202*4882a593Smuzhiyun	aesdec KEY, STATE
2203*4882a593Smuzhiyun	movaps 0x40(TKEYP), KEY
2204*4882a593Smuzhiyun	aesdec KEY, STATE
2205*4882a593Smuzhiyun	movaps 0x50(TKEYP), KEY
2206*4882a593Smuzhiyun	aesdec KEY, STATE
2207*4882a593Smuzhiyun	movaps 0x60(TKEYP), KEY
2208*4882a593Smuzhiyun	aesdec KEY, STATE
2209*4882a593Smuzhiyun	movaps 0x70(TKEYP), KEY
2210*4882a593Smuzhiyun	aesdeclast KEY, STATE
2211*4882a593Smuzhiyun	RET
2212*4882a593SmuzhiyunSYM_FUNC_END(_aesni_dec1)
2213*4882a593Smuzhiyun
2214*4882a593Smuzhiyun/*
2215*4882a593Smuzhiyun * _aesni_dec4:	internal ABI
2216*4882a593Smuzhiyun * input:
2217*4882a593Smuzhiyun *	KEYP:		key struct pointer
2218*4882a593Smuzhiyun *	KLEN:		key length
2219*4882a593Smuzhiyun *	STATE1:		initial state (input)
2220*4882a593Smuzhiyun *	STATE2
2221*4882a593Smuzhiyun *	STATE3
2222*4882a593Smuzhiyun *	STATE4
2223*4882a593Smuzhiyun * output:
2224*4882a593Smuzhiyun *	STATE1:		final state (output)
2225*4882a593Smuzhiyun *	STATE2
2226*4882a593Smuzhiyun *	STATE3
2227*4882a593Smuzhiyun *	STATE4
2228*4882a593Smuzhiyun * changed:
2229*4882a593Smuzhiyun *	KEY
2230*4882a593Smuzhiyun *	TKEYP (T1)
2231*4882a593Smuzhiyun */
2232*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_dec4)
	# Decrypt four independent states (STATE1..4) in place; same key
	# schedule walk as _aesni_dec1 with the four blocks interleaved.
2233*4882a593Smuzhiyun	movaps (KEYP), KEY		# key
2234*4882a593Smuzhiyun	mov KEYP, TKEYP
2235*4882a593Smuzhiyun	pxor KEY, STATE1		# round 0
2236*4882a593Smuzhiyun	pxor KEY, STATE2
2237*4882a593Smuzhiyun	pxor KEY, STATE3
2238*4882a593Smuzhiyun	pxor KEY, STATE4
	# TKEYP biasing per key size, as in _aesni_dec1.
2239*4882a593Smuzhiyun	add $0x30, TKEYP
2240*4882a593Smuzhiyun	cmp $24, KLEN
2241*4882a593Smuzhiyun	jb .L4dec128
2242*4882a593Smuzhiyun	lea 0x20(TKEYP), TKEYP
2243*4882a593Smuzhiyun	je .L4dec192
2244*4882a593Smuzhiyun	add $0x20, TKEYP
2245*4882a593Smuzhiyun	movaps -0x60(TKEYP), KEY
2246*4882a593Smuzhiyun	aesdec KEY, STATE1
2247*4882a593Smuzhiyun	aesdec KEY, STATE2
2248*4882a593Smuzhiyun	aesdec KEY, STATE3
2249*4882a593Smuzhiyun	aesdec KEY, STATE4
2250*4882a593Smuzhiyun	movaps -0x50(TKEYP), KEY
2251*4882a593Smuzhiyun	aesdec KEY, STATE1
2252*4882a593Smuzhiyun	aesdec KEY, STATE2
2253*4882a593Smuzhiyun	aesdec KEY, STATE3
2254*4882a593Smuzhiyun	aesdec KEY, STATE4
2255*4882a593Smuzhiyun.align 4
2256*4882a593Smuzhiyun.L4dec192:
2257*4882a593Smuzhiyun	movaps -0x40(TKEYP), KEY
2258*4882a593Smuzhiyun	aesdec KEY, STATE1
2259*4882a593Smuzhiyun	aesdec KEY, STATE2
2260*4882a593Smuzhiyun	aesdec KEY, STATE3
2261*4882a593Smuzhiyun	aesdec KEY, STATE4
2262*4882a593Smuzhiyun	movaps -0x30(TKEYP), KEY
2263*4882a593Smuzhiyun	aesdec KEY, STATE1
2264*4882a593Smuzhiyun	aesdec KEY, STATE2
2265*4882a593Smuzhiyun	aesdec KEY, STATE3
2266*4882a593Smuzhiyun	aesdec KEY, STATE4
2267*4882a593Smuzhiyun.align 4
2268*4882a593Smuzhiyun.L4dec128:
2269*4882a593Smuzhiyun	movaps -0x20(TKEYP), KEY
2270*4882a593Smuzhiyun	aesdec KEY, STATE1
2271*4882a593Smuzhiyun	aesdec KEY, STATE2
2272*4882a593Smuzhiyun	aesdec KEY, STATE3
2273*4882a593Smuzhiyun	aesdec KEY, STATE4
2274*4882a593Smuzhiyun	movaps -0x10(TKEYP), KEY
2275*4882a593Smuzhiyun	aesdec KEY, STATE1
2276*4882a593Smuzhiyun	aesdec KEY, STATE2
2277*4882a593Smuzhiyun	aesdec KEY, STATE3
2278*4882a593Smuzhiyun	aesdec KEY, STATE4
2279*4882a593Smuzhiyun	movaps (TKEYP), KEY
2280*4882a593Smuzhiyun	aesdec KEY, STATE1
2281*4882a593Smuzhiyun	aesdec KEY, STATE2
2282*4882a593Smuzhiyun	aesdec KEY, STATE3
2283*4882a593Smuzhiyun	aesdec KEY, STATE4
2284*4882a593Smuzhiyun	movaps 0x10(TKEYP), KEY
2285*4882a593Smuzhiyun	aesdec KEY, STATE1
2286*4882a593Smuzhiyun	aesdec KEY, STATE2
2287*4882a593Smuzhiyun	aesdec KEY, STATE3
2288*4882a593Smuzhiyun	aesdec KEY, STATE4
2289*4882a593Smuzhiyun	movaps 0x20(TKEYP), KEY
2290*4882a593Smuzhiyun	aesdec KEY, STATE1
2291*4882a593Smuzhiyun	aesdec KEY, STATE2
2292*4882a593Smuzhiyun	aesdec KEY, STATE3
2293*4882a593Smuzhiyun	aesdec KEY, STATE4
2294*4882a593Smuzhiyun	movaps 0x30(TKEYP), KEY
2295*4882a593Smuzhiyun	aesdec KEY, STATE1
2296*4882a593Smuzhiyun	aesdec KEY, STATE2
2297*4882a593Smuzhiyun	aesdec KEY, STATE3
2298*4882a593Smuzhiyun	aesdec KEY, STATE4
2299*4882a593Smuzhiyun	movaps 0x40(TKEYP), KEY
2300*4882a593Smuzhiyun	aesdec KEY, STATE1
2301*4882a593Smuzhiyun	aesdec KEY, STATE2
2302*4882a593Smuzhiyun	aesdec KEY, STATE3
2303*4882a593Smuzhiyun	aesdec KEY, STATE4
2304*4882a593Smuzhiyun	movaps 0x50(TKEYP), KEY
2305*4882a593Smuzhiyun	aesdec KEY, STATE1
2306*4882a593Smuzhiyun	aesdec KEY, STATE2
2307*4882a593Smuzhiyun	aesdec KEY, STATE3
2308*4882a593Smuzhiyun	aesdec KEY, STATE4
2309*4882a593Smuzhiyun	movaps 0x60(TKEYP), KEY
2310*4882a593Smuzhiyun	aesdec KEY, STATE1
2311*4882a593Smuzhiyun	aesdec KEY, STATE2
2312*4882a593Smuzhiyun	aesdec KEY, STATE3
2313*4882a593Smuzhiyun	aesdec KEY, STATE4
2314*4882a593Smuzhiyun	movaps 0x70(TKEYP), KEY
2315*4882a593Smuzhiyun	aesdeclast KEY, STATE1		# last round
2316*4882a593Smuzhiyun	aesdeclast KEY, STATE2
2317*4882a593Smuzhiyun	aesdeclast KEY, STATE3
2318*4882a593Smuzhiyun	aesdeclast KEY, STATE4
2319*4882a593Smuzhiyun	RET
2320*4882a593SmuzhiyunSYM_FUNC_END(_aesni_dec4)
2321*4882a593Smuzhiyun
2322*4882a593Smuzhiyun/*
2323*4882a593Smuzhiyun * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2324*4882a593Smuzhiyun *		      size_t len)
2325*4882a593Smuzhiyun */
2326*4882a593SmuzhiyunSYM_FUNC_START(aesni_ecb_enc)
	# ECB encrypt: processes len bytes in 64-byte chunks via _aesni_enc4,
	# then 16-byte blocks via _aesni_enc1.  A trailing partial block
	# (len % 16) is ignored.
2327*4882a593Smuzhiyun	FRAME_BEGIN
2328*4882a593Smuzhiyun#ifndef __x86_64__
2329*4882a593Smuzhiyun	pushl LEN
2330*4882a593Smuzhiyun	pushl KEYP
2331*4882a593Smuzhiyun	pushl KLEN
2332*4882a593Smuzhiyun	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2333*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2334*4882a593Smuzhiyun	movl (FRAME_OFFSET+24)(%esp), INP	# src
2335*4882a593Smuzhiyun	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2336*4882a593Smuzhiyun#endif
2337*4882a593Smuzhiyun	test LEN, LEN		# check length
2338*4882a593Smuzhiyun	jz .Lecb_enc_ret
2339*4882a593Smuzhiyun	mov 480(KEYP), KLEN
2340*4882a593Smuzhiyun	cmp $16, LEN
2341*4882a593Smuzhiyun	jb .Lecb_enc_ret
2342*4882a593Smuzhiyun	cmp $64, LEN
2343*4882a593Smuzhiyun	jb .Lecb_enc_loop1
2344*4882a593Smuzhiyun.align 4
2345*4882a593Smuzhiyun.Lecb_enc_loop4:
	# Four blocks at a time while at least 64 bytes remain.
2346*4882a593Smuzhiyun	movups (INP), STATE1
2347*4882a593Smuzhiyun	movups 0x10(INP), STATE2
2348*4882a593Smuzhiyun	movups 0x20(INP), STATE3
2349*4882a593Smuzhiyun	movups 0x30(INP), STATE4
2350*4882a593Smuzhiyun	call _aesni_enc4
2351*4882a593Smuzhiyun	movups STATE1, (OUTP)
2352*4882a593Smuzhiyun	movups STATE2, 0x10(OUTP)
2353*4882a593Smuzhiyun	movups STATE3, 0x20(OUTP)
2354*4882a593Smuzhiyun	movups STATE4, 0x30(OUTP)
2355*4882a593Smuzhiyun	sub $64, LEN
2356*4882a593Smuzhiyun	add $64, INP
2357*4882a593Smuzhiyun	add $64, OUTP
2358*4882a593Smuzhiyun	cmp $64, LEN
2359*4882a593Smuzhiyun	jge .Lecb_enc_loop4
2360*4882a593Smuzhiyun	cmp $16, LEN
2361*4882a593Smuzhiyun	jb .Lecb_enc_ret
2362*4882a593Smuzhiyun.align 4
2363*4882a593Smuzhiyun.Lecb_enc_loop1:
	# Remaining whole blocks one at a time.
2364*4882a593Smuzhiyun	movups (INP), STATE1
2365*4882a593Smuzhiyun	call _aesni_enc1
2366*4882a593Smuzhiyun	movups STATE1, (OUTP)
2367*4882a593Smuzhiyun	sub $16, LEN
2368*4882a593Smuzhiyun	add $16, INP
2369*4882a593Smuzhiyun	add $16, OUTP
2370*4882a593Smuzhiyun	cmp $16, LEN
2371*4882a593Smuzhiyun	jge .Lecb_enc_loop1
2372*4882a593Smuzhiyun.Lecb_enc_ret:
2373*4882a593Smuzhiyun#ifndef __x86_64__
2374*4882a593Smuzhiyun	popl KLEN
2375*4882a593Smuzhiyun	popl KEYP
2376*4882a593Smuzhiyun	popl LEN
2377*4882a593Smuzhiyun#endif
2378*4882a593Smuzhiyun	FRAME_END
2379*4882a593Smuzhiyun	RET
2380*4882a593SmuzhiyunSYM_FUNC_END(aesni_ecb_enc)
2381*4882a593Smuzhiyun
2382*4882a593Smuzhiyun/*
2383*4882a593Smuzhiyun * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2384*4882a593Smuzhiyun *		      size_t len);
2385*4882a593Smuzhiyun */
2386*4882a593SmuzhiyunSYM_FUNC_START(aesni_ecb_dec)
	# ECB decrypt; same chunking strategy as aesni_ecb_enc, but using
	# the decryption schedule at ctx+240 and the _aesni_dec* helpers.
2387*4882a593Smuzhiyun	FRAME_BEGIN
2388*4882a593Smuzhiyun#ifndef __x86_64__
2389*4882a593Smuzhiyun	pushl LEN
2390*4882a593Smuzhiyun	pushl KEYP
2391*4882a593Smuzhiyun	pushl KLEN
2392*4882a593Smuzhiyun	movl (FRAME_OFFSET+16)(%esp), KEYP	# ctx
2393*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), OUTP	# dst
2394*4882a593Smuzhiyun	movl (FRAME_OFFSET+24)(%esp), INP	# src
2395*4882a593Smuzhiyun	movl (FRAME_OFFSET+28)(%esp), LEN	# len
2396*4882a593Smuzhiyun#endif
2397*4882a593Smuzhiyun	test LEN, LEN
2398*4882a593Smuzhiyun	jz .Lecb_dec_ret
2399*4882a593Smuzhiyun	mov 480(KEYP), KLEN
2400*4882a593Smuzhiyun	add $240, KEYP		# decryption round keys live at ctx+240
2401*4882a593Smuzhiyun	cmp $16, LEN
2402*4882a593Smuzhiyun	jb .Lecb_dec_ret
2403*4882a593Smuzhiyun	cmp $64, LEN
2404*4882a593Smuzhiyun	jb .Lecb_dec_loop1
2405*4882a593Smuzhiyun.align 4
2406*4882a593Smuzhiyun.Lecb_dec_loop4:
	# Four blocks at a time while at least 64 bytes remain.
2407*4882a593Smuzhiyun	movups (INP), STATE1
2408*4882a593Smuzhiyun	movups 0x10(INP), STATE2
2409*4882a593Smuzhiyun	movups 0x20(INP), STATE3
2410*4882a593Smuzhiyun	movups 0x30(INP), STATE4
2411*4882a593Smuzhiyun	call _aesni_dec4
2412*4882a593Smuzhiyun	movups STATE1, (OUTP)
2413*4882a593Smuzhiyun	movups STATE2, 0x10(OUTP)
2414*4882a593Smuzhiyun	movups STATE3, 0x20(OUTP)
2415*4882a593Smuzhiyun	movups STATE4, 0x30(OUTP)
2416*4882a593Smuzhiyun	sub $64, LEN
2417*4882a593Smuzhiyun	add $64, INP
2418*4882a593Smuzhiyun	add $64, OUTP
2419*4882a593Smuzhiyun	cmp $64, LEN
2420*4882a593Smuzhiyun	jge .Lecb_dec_loop4
2421*4882a593Smuzhiyun	cmp $16, LEN
2422*4882a593Smuzhiyun	jb .Lecb_dec_ret
2423*4882a593Smuzhiyun.align 4
2424*4882a593Smuzhiyun.Lecb_dec_loop1:
	# Remaining whole blocks one at a time.
2425*4882a593Smuzhiyun	movups (INP), STATE1
2426*4882a593Smuzhiyun	call _aesni_dec1
2427*4882a593Smuzhiyun	movups STATE1, (OUTP)
2428*4882a593Smuzhiyun	sub $16, LEN
2429*4882a593Smuzhiyun	add $16, INP
2430*4882a593Smuzhiyun	add $16, OUTP
2431*4882a593Smuzhiyun	cmp $16, LEN
2432*4882a593Smuzhiyun	jge .Lecb_dec_loop1
2433*4882a593Smuzhiyun.Lecb_dec_ret:
2434*4882a593Smuzhiyun#ifndef __x86_64__
2435*4882a593Smuzhiyun	popl KLEN
2436*4882a593Smuzhiyun	popl KEYP
2437*4882a593Smuzhiyun	popl LEN
2438*4882a593Smuzhiyun#endif
2439*4882a593Smuzhiyun	FRAME_END
2440*4882a593Smuzhiyun	RET
2441*4882a593SmuzhiyunSYM_FUNC_END(aesni_ecb_dec)
2442*4882a593Smuzhiyun
2443*4882a593Smuzhiyun/*
2444*4882a593Smuzhiyun * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2445*4882a593Smuzhiyun *		      size_t len, u8 *iv)
2446*4882a593Smuzhiyun */
2447*4882a593SmuzhiyunSYM_FUNC_START(aesni_cbc_enc)
	# CBC encrypt.  Inherently serial (each block depends on the
	# previous ciphertext), so only the one-block helper is used.
	# The final ciphertext block is written back to *iv for chaining.
2448*4882a593Smuzhiyun	FRAME_BEGIN
2449*4882a593Smuzhiyun#ifndef __x86_64__
2450*4882a593Smuzhiyun	pushl IVP
2451*4882a593Smuzhiyun	pushl LEN
2452*4882a593Smuzhiyun	pushl KEYP
2453*4882a593Smuzhiyun	pushl KLEN
2454*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2455*4882a593Smuzhiyun	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2456*4882a593Smuzhiyun	movl (FRAME_OFFSET+28)(%esp), INP	# src
2457*4882a593Smuzhiyun	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2458*4882a593Smuzhiyun	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2459*4882a593Smuzhiyun#endif
2460*4882a593Smuzhiyun	cmp $16, LEN
2461*4882a593Smuzhiyun	jb .Lcbc_enc_ret
2462*4882a593Smuzhiyun	mov 480(KEYP), KLEN
2463*4882a593Smuzhiyun	movups (IVP), STATE	# load iv as initial state
2464*4882a593Smuzhiyun.align 4
2465*4882a593Smuzhiyun.Lcbc_enc_loop:
	# STATE = encrypt(plaintext ^ previous ciphertext/IV).
2466*4882a593Smuzhiyun	movups (INP), IN	# load input
2467*4882a593Smuzhiyun	pxor IN, STATE
2468*4882a593Smuzhiyun	call _aesni_enc1
2469*4882a593Smuzhiyun	movups STATE, (OUTP)	# store output
2470*4882a593Smuzhiyun	sub $16, LEN
2471*4882a593Smuzhiyun	add $16, INP
2472*4882a593Smuzhiyun	add $16, OUTP
2473*4882a593Smuzhiyun	cmp $16, LEN
2474*4882a593Smuzhiyun	jge .Lcbc_enc_loop
2475*4882a593Smuzhiyun	movups STATE, (IVP)	# save last ciphertext block as next IV
2476*4882a593Smuzhiyun.Lcbc_enc_ret:
2477*4882a593Smuzhiyun#ifndef __x86_64__
2478*4882a593Smuzhiyun	popl KLEN
2479*4882a593Smuzhiyun	popl KEYP
2480*4882a593Smuzhiyun	popl LEN
2481*4882a593Smuzhiyun	popl IVP
2482*4882a593Smuzhiyun#endif
2483*4882a593Smuzhiyun	FRAME_END
2484*4882a593Smuzhiyun	RET
2485*4882a593SmuzhiyunSYM_FUNC_END(aesni_cbc_enc)
2486*4882a593Smuzhiyun
2487*4882a593Smuzhiyun/*
2488*4882a593Smuzhiyun * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2489*4882a593Smuzhiyun *		      size_t len, u8 *iv)
2490*4882a593Smuzhiyun */
2491*4882a593SmuzhiyunSYM_FUNC_START(aesni_cbc_dec)
	# CBC decrypt.  Unlike encryption this parallelizes: four blocks
	# are decrypted at once, then each is XORed with the previous
	# ciphertext block (or the IV).  The last ciphertext block is
	# written back to *iv for chaining.
2492*4882a593Smuzhiyun	FRAME_BEGIN
2493*4882a593Smuzhiyun#ifndef __x86_64__
2494*4882a593Smuzhiyun	pushl IVP
2495*4882a593Smuzhiyun	pushl LEN
2496*4882a593Smuzhiyun	pushl KEYP
2497*4882a593Smuzhiyun	pushl KLEN
2498*4882a593Smuzhiyun	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
2499*4882a593Smuzhiyun	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
2500*4882a593Smuzhiyun	movl (FRAME_OFFSET+28)(%esp), INP	# src
2501*4882a593Smuzhiyun	movl (FRAME_OFFSET+32)(%esp), LEN	# len
2502*4882a593Smuzhiyun	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
2503*4882a593Smuzhiyun#endif
2504*4882a593Smuzhiyun	cmp $16, LEN
2505*4882a593Smuzhiyun	jb .Lcbc_dec_just_ret
2506*4882a593Smuzhiyun	mov 480(KEYP), KLEN
2507*4882a593Smuzhiyun	add $240, KEYP		# decryption round keys live at ctx+240
2508*4882a593Smuzhiyun	movups (IVP), IV
2509*4882a593Smuzhiyun	cmp $64, LEN
2510*4882a593Smuzhiyun	jb .Lcbc_dec_loop1
2511*4882a593Smuzhiyun.align 4
2512*4882a593Smuzhiyun.Lcbc_dec_loop4:
	# Keep copies of the ciphertext (IN1..IN4) for the chaining XORs.
	# The 32-bit build has too few XMM registers for four IN copies,
	# so it reuses IN1/IN2 and reloads from (INP) after _aesni_dec4.
2513*4882a593Smuzhiyun	movups (INP), IN1
2514*4882a593Smuzhiyun	movaps IN1, STATE1
2515*4882a593Smuzhiyun	movups 0x10(INP), IN2
2516*4882a593Smuzhiyun	movaps IN2, STATE2
2517*4882a593Smuzhiyun#ifdef __x86_64__
2518*4882a593Smuzhiyun	movups 0x20(INP), IN3
2519*4882a593Smuzhiyun	movaps IN3, STATE3
2520*4882a593Smuzhiyun	movups 0x30(INP), IN4
2521*4882a593Smuzhiyun	movaps IN4, STATE4
2522*4882a593Smuzhiyun#else
2523*4882a593Smuzhiyun	movups 0x20(INP), IN1
2524*4882a593Smuzhiyun	movaps IN1, STATE3
2525*4882a593Smuzhiyun	movups 0x30(INP), IN2
2526*4882a593Smuzhiyun	movaps IN2, STATE4
2527*4882a593Smuzhiyun#endif
2528*4882a593Smuzhiyun	call _aesni_dec4
	# plaintext[i] = decrypt(ciphertext[i]) ^ ciphertext[i-1] (IV for i=0).
2529*4882a593Smuzhiyun	pxor IV, STATE1
2530*4882a593Smuzhiyun#ifdef __x86_64__
2531*4882a593Smuzhiyun	pxor IN1, STATE2
2532*4882a593Smuzhiyun	pxor IN2, STATE3
2533*4882a593Smuzhiyun	pxor IN3, STATE4
2534*4882a593Smuzhiyun	movaps IN4, IV
2535*4882a593Smuzhiyun#else
2536*4882a593Smuzhiyun	pxor IN1, STATE4
2537*4882a593Smuzhiyun	movaps IN2, IV
2538*4882a593Smuzhiyun	movups (INP), IN1
2539*4882a593Smuzhiyun	pxor IN1, STATE2
2540*4882a593Smuzhiyun	movups 0x10(INP), IN2
2541*4882a593Smuzhiyun	pxor IN2, STATE3
2542*4882a593Smuzhiyun#endif
2543*4882a593Smuzhiyun	movups STATE1, (OUTP)
2544*4882a593Smuzhiyun	movups STATE2, 0x10(OUTP)
2545*4882a593Smuzhiyun	movups STATE3, 0x20(OUTP)
2546*4882a593Smuzhiyun	movups STATE4, 0x30(OUTP)
2547*4882a593Smuzhiyun	sub $64, LEN
2548*4882a593Smuzhiyun	add $64, INP
2549*4882a593Smuzhiyun	add $64, OUTP
2550*4882a593Smuzhiyun	cmp $64, LEN
2551*4882a593Smuzhiyun	jge .Lcbc_dec_loop4
2552*4882a593Smuzhiyun	cmp $16, LEN
2553*4882a593Smuzhiyun	jb .Lcbc_dec_ret
2554*4882a593Smuzhiyun.align 4
2555*4882a593Smuzhiyun.Lcbc_dec_loop1:
	# Remaining whole blocks one at a time.
2556*4882a593Smuzhiyun	movups (INP), IN
2557*4882a593Smuzhiyun	movaps IN, STATE
2558*4882a593Smuzhiyun	call _aesni_dec1
2559*4882a593Smuzhiyun	pxor IV, STATE
2560*4882a593Smuzhiyun	movups STATE, (OUTP)
2561*4882a593Smuzhiyun	movaps IN, IV
2562*4882a593Smuzhiyun	sub $16, LEN
2563*4882a593Smuzhiyun	add $16, INP
2564*4882a593Smuzhiyun	add $16, OUTP
2565*4882a593Smuzhiyun	cmp $16, LEN
2566*4882a593Smuzhiyun	jge .Lcbc_dec_loop1
2567*4882a593Smuzhiyun.Lcbc_dec_ret:
2568*4882a593Smuzhiyun	movups IV, (IVP)	# save last ciphertext block as next IV
2569*4882a593Smuzhiyun.Lcbc_dec_just_ret:
2570*4882a593Smuzhiyun#ifndef __x86_64__
2571*4882a593Smuzhiyun	popl KLEN
2572*4882a593Smuzhiyun	popl KEYP
2573*4882a593Smuzhiyun	popl LEN
2574*4882a593Smuzhiyun	popl IVP
2575*4882a593Smuzhiyun#endif
2576*4882a593Smuzhiyun	FRAME_END
2577*4882a593Smuzhiyun	RET
2578*4882a593SmuzhiyunSYM_FUNC_END(aesni_cbc_dec)
2579*4882a593Smuzhiyun
2580*4882a593Smuzhiyun#ifdef __x86_64__
2581*4882a593Smuzhiyun.pushsection .rodata
2582*4882a593Smuzhiyun.align 16
/* pshufb control that reverses all 16 bytes: big-endian <-> little-endian. */
2583*4882a593Smuzhiyun.Lbswap_mask:
2584*4882a593Smuzhiyun	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
2585*4882a593Smuzhiyun.popsection
2586*4882a593Smuzhiyun
2587*4882a593Smuzhiyun/*
2588*4882a593Smuzhiyun * _aesni_inc_init:	internal ABI
2589*4882a593Smuzhiyun *	setup registers used by _aesni_inc
2590*4882a593Smuzhiyun * input:
2591*4882a593Smuzhiyun *	IV
2592*4882a593Smuzhiyun * output:
2593*4882a593Smuzhiyun *	CTR:	== IV, in little endian
2594*4882a593Smuzhiyun *	TCTR_LOW: == lower qword of CTR
2595*4882a593Smuzhiyun *	INC:	== 1, in little endian
2596*4882a593Smuzhiyun *	BSWAP_MASK == endian swapping mask
2597*4882a593Smuzhiyun */
2598*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_inc_init)
2599*4882a593Smuzhiyun	movaps .Lbswap_mask, BSWAP_MASK
2600*4882a593Smuzhiyun	movaps IV, CTR
2601*4882a593Smuzhiyun	pshufb BSWAP_MASK, CTR		# CTR = little-endian view of the IV
2602*4882a593Smuzhiyun	mov $1, TCTR_LOW
2603*4882a593Smuzhiyun	movq TCTR_LOW, INC		# INC = 1 in the low qword, high qword 0
2604*4882a593Smuzhiyun	movq CTR, TCTR_LOW		# mirror CTR's low qword in a GPR so
					# _aesni_inc can detect the carry-out
2605*4882a593Smuzhiyun	RET
2606*4882a593SmuzhiyunSYM_FUNC_END(_aesni_inc_init)
2607*4882a593Smuzhiyun
2608*4882a593Smuzhiyun/*
2609*4882a593Smuzhiyun * _aesni_inc:		internal ABI
2610*4882a593Smuzhiyun *	Increase IV by 1, IV is in big endian
2611*4882a593Smuzhiyun * input:
2612*4882a593Smuzhiyun *	IV
2613*4882a593Smuzhiyun *	CTR:	== IV, in little endian
2614*4882a593Smuzhiyun *	TCTR_LOW: == lower qword of CTR
2615*4882a593Smuzhiyun *	INC:	== 1, in little endian
2616*4882a593Smuzhiyun *	BSWAP_MASK == endian swapping mask
2617*4882a593Smuzhiyun * output:
2618*4882a593Smuzhiyun *	IV:	Increase by 1
2619*4882a593Smuzhiyun * changed:
2620*4882a593Smuzhiyun *	CTR:	== output IV, in little endian
2621*4882a593Smuzhiyun *	TCTR_LOW: == lower qword of CTR
2622*4882a593Smuzhiyun */
2623*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_inc)
2624*4882a593Smuzhiyun	paddq INC, CTR			# INC's high qword is 0, so only the
					# low qword of CTR is incremented
2625*4882a593Smuzhiyun	add $1, TCTR_LOW		# shadow add in a GPR to expose the carry,
					# which SSE paddq cannot report
2626*4882a593Smuzhiyun	jnc .Linc_low
	/* Low qword wrapped: propagate the carry into the high qword. */
2627*4882a593Smuzhiyun	pslldq $8, INC
2628*4882a593Smuzhiyun	paddq INC, CTR
2629*4882a593Smuzhiyun	psrldq $8, INC			# restore INC == 1 for the next call
2630*4882a593Smuzhiyun.Linc_low:
2631*4882a593Smuzhiyun	movaps CTR, IV
2632*4882a593Smuzhiyun	pshufb BSWAP_MASK, IV		# convert back to big-endian for the cipher
2633*4882a593Smuzhiyun	RET
2634*4882a593SmuzhiyunSYM_FUNC_END(_aesni_inc)
2635*4882a593Smuzhiyun
2636*4882a593Smuzhiyun/*
2637*4882a593Smuzhiyun * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
2638*4882a593Smuzhiyun *		      size_t len, u8 *iv)
2639*4882a593Smuzhiyun */
/*
 * CTR mode (x86_64 only — this whole region is under #ifdef __x86_64__):
 * C[i] = P[i] ^ AES-enc(counter_i), with the big-endian 128-bit counter
 * incremented by _aesni_inc between blocks.  Encryption round keys live at
 * the start of the ctx, so no key-pointer adjustment is needed here.
 * A tail shorter than 16 bytes is left untouched; the advanced counter is
 * stored back through IVP for chaining.
 */
2640*4882a593SmuzhiyunSYM_FUNC_START(aesni_ctr_enc)
2641*4882a593Smuzhiyun	FRAME_BEGIN
2642*4882a593Smuzhiyun	cmp $16, LEN
2643*4882a593Smuzhiyun	jb .Lctr_enc_just_ret		# less than one full block: no-op
2644*4882a593Smuzhiyun	mov 480(KEYP), KLEN		# key length, stored at ctx offset 480
2645*4882a593Smuzhiyun	movups (IVP), IV
2646*4882a593Smuzhiyun	call _aesni_inc_init		# set up CTR/TCTR_LOW/INC/BSWAP_MASK
2647*4882a593Smuzhiyun	cmp $64, LEN
2648*4882a593Smuzhiyun	jb .Lctr_enc_loop1
2649*4882a593Smuzhiyun.align 4
2650*4882a593Smuzhiyun.Lctr_enc_loop4:
	/* Grab 4 counter values (incrementing after each) and 4 input blocks. */
2651*4882a593Smuzhiyun	movaps IV, STATE1
2652*4882a593Smuzhiyun	call _aesni_inc
2653*4882a593Smuzhiyun	movups (INP), IN1
2654*4882a593Smuzhiyun	movaps IV, STATE2
2655*4882a593Smuzhiyun	call _aesni_inc
2656*4882a593Smuzhiyun	movups 0x10(INP), IN2
2657*4882a593Smuzhiyun	movaps IV, STATE3
2658*4882a593Smuzhiyun	call _aesni_inc
2659*4882a593Smuzhiyun	movups 0x20(INP), IN3
2660*4882a593Smuzhiyun	movaps IV, STATE4
2661*4882a593Smuzhiyun	call _aesni_inc
2662*4882a593Smuzhiyun	movups 0x30(INP), IN4
2663*4882a593Smuzhiyun	call _aesni_enc4
	/* XOR the keystream into the plaintext and store. */
2664*4882a593Smuzhiyun	pxor IN1, STATE1
2665*4882a593Smuzhiyun	movups STATE1, (OUTP)
2666*4882a593Smuzhiyun	pxor IN2, STATE2
2667*4882a593Smuzhiyun	movups STATE2, 0x10(OUTP)
2668*4882a593Smuzhiyun	pxor IN3, STATE3
2669*4882a593Smuzhiyun	movups STATE3, 0x20(OUTP)
2670*4882a593Smuzhiyun	pxor IN4, STATE4
2671*4882a593Smuzhiyun	movups STATE4, 0x30(OUTP)
2672*4882a593Smuzhiyun	sub $64, LEN
2673*4882a593Smuzhiyun	add $64, INP
2674*4882a593Smuzhiyun	add $64, OUTP
2675*4882a593Smuzhiyun	cmp $64, LEN
2676*4882a593Smuzhiyun	jge .Lctr_enc_loop4
2677*4882a593Smuzhiyun	cmp $16, LEN
2678*4882a593Smuzhiyun	jb .Lctr_enc_ret
2679*4882a593Smuzhiyun.align 4
	/* Single-block loop for the remaining 16..63 bytes. */
2680*4882a593Smuzhiyun.Lctr_enc_loop1:
2681*4882a593Smuzhiyun	movaps IV, STATE
2682*4882a593Smuzhiyun	call _aesni_inc
2683*4882a593Smuzhiyun	movups (INP), IN
2684*4882a593Smuzhiyun	call _aesni_enc1
2685*4882a593Smuzhiyun	pxor IN, STATE
2686*4882a593Smuzhiyun	movups STATE, (OUTP)
2687*4882a593Smuzhiyun	sub $16, LEN
2688*4882a593Smuzhiyun	add $16, INP
2689*4882a593Smuzhiyun	add $16, OUTP
2690*4882a593Smuzhiyun	cmp $16, LEN
2691*4882a593Smuzhiyun	jge .Lctr_enc_loop1
2692*4882a593Smuzhiyun.Lctr_enc_ret:
2693*4882a593Smuzhiyun	movups IV, (IVP)		# return the advanced counter for chaining
2694*4882a593Smuzhiyun.Lctr_enc_just_ret:
2695*4882a593Smuzhiyun	FRAME_END
2696*4882a593Smuzhiyun	RET
2697*4882a593SmuzhiyunSYM_FUNC_END(aesni_ctr_enc)
2698*4882a593Smuzhiyun
2699*4882a593Smuzhiyun/*
2700*4882a593Smuzhiyun * _aesni_gf128mul_x_ble:		internal ABI
2701*4882a593Smuzhiyun *	Multiply in GF(2^128) for XTS IVs
2702*4882a593Smuzhiyun * input:
2703*4882a593Smuzhiyun *	IV:	current IV
2704*4882a593Smuzhiyun *	GF128MUL_MASK == mask with 0x87 and 0x01
2705*4882a593Smuzhiyun * output:
2706*4882a593Smuzhiyun *	IV:	next IV
2707*4882a593Smuzhiyun * changed:
2708*4882a593Smuzhiyun *	CTR:	== temporary value
 *
 * Doubles the tweak (multiply by x) branch-free: paddq shifts each
 * 64-bit half left by one independently; pshufd $0x13 + psrad $31 turn
 * the halves' old sign bits into full-lane masks, and the pand/pxor
 * then fold in 0x01 (the carry from the low into the high half) and
 * 0x87 (the GF(2^128) reduction polynomial, when bit 127 was set).
2709*4882a593Smuzhiyun */
2710*4882a593Smuzhiyun#define _aesni_gf128mul_x_ble() \
2711*4882a593Smuzhiyun	pshufd $0x13, IV, CTR; \
2712*4882a593Smuzhiyun	paddq IV, IV; \
2713*4882a593Smuzhiyun	psrad $31, CTR; \
2714*4882a593Smuzhiyun	pand GF128MUL_MASK, CTR; \
2715*4882a593Smuzhiyun	pxor CTR, IV;
2716*4882a593Smuzhiyun
2717*4882a593Smuzhiyun/*
2718*4882a593Smuzhiyun * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2719*4882a593Smuzhiyun *			  const u8 *src, unsigned int len, le128 *iv)
2720*4882a593Smuzhiyun */
/*
 * XTS encryption (XEX): C[i] = E(P[i] ^ T[i]) ^ T[i], where each tweak
 * T[i+1] = T[i] * x in GF(2^128).  To save an xmm register, the four
 * tweak values are parked in the output buffer before _aesni_enc4 and
 * read back for the post-whitening XOR.  Encryption round keys start at
 * ctx+0, so KEYP is not adjusted (contrast aesni_xts_decrypt).
 * NOTE(review): the sub/ja loop has no tail handling — this appears to
 * assume len is a nonzero multiple of 64; confirm against the glue code.
 */
2721*4882a593SmuzhiyunSYM_FUNC_START(aesni_xts_encrypt)
2722*4882a593Smuzhiyun	FRAME_BEGIN
2723*4882a593Smuzhiyun
2724*4882a593Smuzhiyun	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2725*4882a593Smuzhiyun	movups (IVP), IV
2726*4882a593Smuzhiyun
2727*4882a593Smuzhiyun	mov 480(KEYP), KLEN		# key length, stored at ctx offset 480
2728*4882a593Smuzhiyun
2729*4882a593Smuzhiyun.Lxts_enc_loop4:
	/* Pre-whiten each block with its tweak; stash the tweak in OUTP. */
2730*4882a593Smuzhiyun	movdqa IV, STATE1
2731*4882a593Smuzhiyun	movdqu 0x00(INP), INC
2732*4882a593Smuzhiyun	pxor INC, STATE1
2733*4882a593Smuzhiyun	movdqu IV, 0x00(OUTP)
2734*4882a593Smuzhiyun
2735*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2736*4882a593Smuzhiyun	movdqa IV, STATE2
2737*4882a593Smuzhiyun	movdqu 0x10(INP), INC
2738*4882a593Smuzhiyun	pxor INC, STATE2
2739*4882a593Smuzhiyun	movdqu IV, 0x10(OUTP)
2740*4882a593Smuzhiyun
2741*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2742*4882a593Smuzhiyun	movdqa IV, STATE3
2743*4882a593Smuzhiyun	movdqu 0x20(INP), INC
2744*4882a593Smuzhiyun	pxor INC, STATE3
2745*4882a593Smuzhiyun	movdqu IV, 0x20(OUTP)
2746*4882a593Smuzhiyun
2747*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2748*4882a593Smuzhiyun	movdqa IV, STATE4
2749*4882a593Smuzhiyun	movdqu 0x30(INP), INC
2750*4882a593Smuzhiyun	pxor INC, STATE4
2751*4882a593Smuzhiyun	movdqu IV, 0x30(OUTP)
2752*4882a593Smuzhiyun
2753*4882a593Smuzhiyun	call _aesni_enc4
2754*4882a593Smuzhiyun
	/* Post-whiten: re-read each parked tweak and XOR it into the result. */
2755*4882a593Smuzhiyun	movdqu 0x00(OUTP), INC
2756*4882a593Smuzhiyun	pxor INC, STATE1
2757*4882a593Smuzhiyun	movdqu STATE1, 0x00(OUTP)
2758*4882a593Smuzhiyun
2759*4882a593Smuzhiyun	movdqu 0x10(OUTP), INC
2760*4882a593Smuzhiyun	pxor INC, STATE2
2761*4882a593Smuzhiyun	movdqu STATE2, 0x10(OUTP)
2762*4882a593Smuzhiyun
2763*4882a593Smuzhiyun	movdqu 0x20(OUTP), INC
2764*4882a593Smuzhiyun	pxor INC, STATE3
2765*4882a593Smuzhiyun	movdqu STATE3, 0x20(OUTP)
2766*4882a593Smuzhiyun
2767*4882a593Smuzhiyun	movdqu 0x30(OUTP), INC
2768*4882a593Smuzhiyun	pxor INC, STATE4
2769*4882a593Smuzhiyun	movdqu STATE4, 0x30(OUTP)
2770*4882a593Smuzhiyun
2771*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()		# advance the tweak for the next group
2772*4882a593Smuzhiyun
2773*4882a593Smuzhiyun	add $64, INP
2774*4882a593Smuzhiyun	add $64, OUTP
2775*4882a593Smuzhiyun	sub $64, LEN
2776*4882a593Smuzhiyun	ja .Lxts_enc_loop4
2777*4882a593Smuzhiyun
2778*4882a593Smuzhiyun	movups IV, (IVP)		# return the next tweak for chaining
2779*4882a593Smuzhiyun
2780*4882a593Smuzhiyun	FRAME_END
2781*4882a593Smuzhiyun	RET
2782*4882a593SmuzhiyunSYM_FUNC_END(aesni_xts_encrypt)
2783*4882a593Smuzhiyun
2784*4882a593Smuzhiyun/*
2785*4882a593Smuzhiyun * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
2786*4882a593Smuzhiyun *			  const u8 *src, unsigned int len, le128 *iv)
2787*4882a593Smuzhiyun */
/*
 * XTS decryption (XEX): P[i] = D(C[i] ^ T[i]) ^ T[i] — the tweak schedule
 * is identical to encryption; only the cipher direction differs.  Tweaks
 * are parked in the output buffer around _aesni_dec4 to free an xmm
 * register.  Unlike the encrypt path, KEYP is advanced by 240 because the
 * decryption round keys live in the second half of the expanded key.
 * NOTE(review): the sub/ja loop has no tail handling — this appears to
 * assume len is a nonzero multiple of 64; confirm against the glue code.
 */
2788*4882a593SmuzhiyunSYM_FUNC_START(aesni_xts_decrypt)
2789*4882a593Smuzhiyun	FRAME_BEGIN
2790*4882a593Smuzhiyun
2791*4882a593Smuzhiyun	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
2792*4882a593Smuzhiyun	movups (IVP), IV
2793*4882a593Smuzhiyun
2794*4882a593Smuzhiyun	mov 480(KEYP), KLEN		# key length, stored at ctx offset 480
2795*4882a593Smuzhiyun	add $240, KEYP			# decryption round keys start at ctx+240
2796*4882a593Smuzhiyun
2797*4882a593Smuzhiyun.Lxts_dec_loop4:
	/* Pre-whiten each block with its tweak; stash the tweak in OUTP. */
2798*4882a593Smuzhiyun	movdqa IV, STATE1
2799*4882a593Smuzhiyun	movdqu 0x00(INP), INC
2800*4882a593Smuzhiyun	pxor INC, STATE1
2801*4882a593Smuzhiyun	movdqu IV, 0x00(OUTP)
2802*4882a593Smuzhiyun
2803*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2804*4882a593Smuzhiyun	movdqa IV, STATE2
2805*4882a593Smuzhiyun	movdqu 0x10(INP), INC
2806*4882a593Smuzhiyun	pxor INC, STATE2
2807*4882a593Smuzhiyun	movdqu IV, 0x10(OUTP)
2808*4882a593Smuzhiyun
2809*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2810*4882a593Smuzhiyun	movdqa IV, STATE3
2811*4882a593Smuzhiyun	movdqu 0x20(INP), INC
2812*4882a593Smuzhiyun	pxor INC, STATE3
2813*4882a593Smuzhiyun	movdqu IV, 0x20(OUTP)
2814*4882a593Smuzhiyun
2815*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()
2816*4882a593Smuzhiyun	movdqa IV, STATE4
2817*4882a593Smuzhiyun	movdqu 0x30(INP), INC
2818*4882a593Smuzhiyun	pxor INC, STATE4
2819*4882a593Smuzhiyun	movdqu IV, 0x30(OUTP)
2820*4882a593Smuzhiyun
2821*4882a593Smuzhiyun	call _aesni_dec4
2822*4882a593Smuzhiyun
	/* Post-whiten: re-read each parked tweak and XOR it into the result. */
2823*4882a593Smuzhiyun	movdqu 0x00(OUTP), INC
2824*4882a593Smuzhiyun	pxor INC, STATE1
2825*4882a593Smuzhiyun	movdqu STATE1, 0x00(OUTP)
2826*4882a593Smuzhiyun
2827*4882a593Smuzhiyun	movdqu 0x10(OUTP), INC
2828*4882a593Smuzhiyun	pxor INC, STATE2
2829*4882a593Smuzhiyun	movdqu STATE2, 0x10(OUTP)
2830*4882a593Smuzhiyun
2831*4882a593Smuzhiyun	movdqu 0x20(OUTP), INC
2832*4882a593Smuzhiyun	pxor INC, STATE3
2833*4882a593Smuzhiyun	movdqu STATE3, 0x20(OUTP)
2834*4882a593Smuzhiyun
2835*4882a593Smuzhiyun	movdqu 0x30(OUTP), INC
2836*4882a593Smuzhiyun	pxor INC, STATE4
2837*4882a593Smuzhiyun	movdqu STATE4, 0x30(OUTP)
2838*4882a593Smuzhiyun
2839*4882a593Smuzhiyun	_aesni_gf128mul_x_ble()		# advance the tweak for the next group
2840*4882a593Smuzhiyun
2841*4882a593Smuzhiyun	add $64, INP
2842*4882a593Smuzhiyun	add $64, OUTP
2843*4882a593Smuzhiyun	sub $64, LEN
2844*4882a593Smuzhiyun	ja .Lxts_dec_loop4
2845*4882a593Smuzhiyun
2846*4882a593Smuzhiyun	movups IV, (IVP)		# return the next tweak for chaining
2847*4882a593Smuzhiyun
2848*4882a593Smuzhiyun	FRAME_END
2849*4882a593Smuzhiyun	RET
2850*4882a593SmuzhiyunSYM_FUNC_END(aesni_xts_decrypt)
2851*4882a593Smuzhiyun
2852*4882a593Smuzhiyun#endif
2853