########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##	Erdinc Ozturk <erdinc.ozturk@intel.com>
##	Vinodh Gopal <vinodh.gopal@intel.com>
##	James Guilford <james.guilford@intel.com>
##	Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##       This code was derived and highly optimized from the code described in the paper:
##               Vinodh Gopal et al. Optimized Galois-Counter-Mode Implementation
##			on Intel Architecture Processors. August, 2010
##       The details of the implementation are explained in:
##               Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-Mode
##			on Intel Architecture Processors. October, 2012.
##
## Assumptions:
##
##
##
## iv:
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                             Salt  (From the SA)               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     Initialization Vector                     |
##       |         (This is the sequence number from IPSec header)       |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x1                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##       AAD padded to 128 bits with 0
##       for example, assume AAD is a u32 vector
##
##       if AAD is 8 bytes:
##       AAD[3] = {A0, A1};
##       padded AAD in xmm register = {A1 A0 0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                               SPI (A1)                        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                     32-bit Sequence Number (A0)               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##                                       AAD Format with 32-bit Sequence Number
##
##       if AAD is 12 bytes:
##       AAD[3] = {A0, A1, A2};
##       padded AAD in xmm register = {A2 A1 A0 0}
##
##       0                   1                   2                   3
##       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                               SPI (A2)                        |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                 64-bit Extended Sequence Number {A1,A0}       |
##       |                                                               |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##       |                              0x0                              |
##       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##        AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##       from the definition of the spec, aadLen can only be 8 or 12 bytes.
##	 The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##       from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
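## (this is the bit-reflected form of the GHASH field polynomial
## x^128 + x^7 + x^2 + x + 1; the POLY constant defined below is its
## 128-bit encoding, used by the reduction steps of the GHASH macros)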
## throughout the code, one tab and two tab indentations are used. one tab is
## for the GHASH part, two tabs are for the AES part.
##

#include <linux/linkage.h>

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001

.section	.rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000

.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001

.section	.rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
                 .octa     0x00000000000000000000000000000000
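# Note: the partial-block code below addresses masks as ALL_F-SHIFT_MASK(%r12),
# sliding a 16-byte window across SHIFT_MASK/ALL_F/zero, which is why the
# ordering above is load-bearing.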

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text


#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5

HashKey        = 16*6   # store HashKey <<1 mod poly here
HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*10  # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*11  # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*12  # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*13  # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*14  # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*15  # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*16  # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*17  # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*18  # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*19  # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*20  # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*21  # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)
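# Karatsuba in GF(2): with a = a1:a0 and b = b1:b0, the middle term
# a1*b0 + a0*b1 equals (a1+a0)*(b1+b0) + a1*b1 + a0*b0 (where + is XOR), so
# caching hi XOR lo of each HashKey^i saves one VPCLMULQDQ per multiply.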

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)

i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm
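# e.g. after "i = 3" followed by "setreg", reg_i expands to %xmm3:
# .altmacro lets %i pass the current numeric value of i into define_reg.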

# FUNC_SAVE pushes 4 registers onto the stack;
# STACK_OFFSET accounts for those 4 * 8 bytes
STACK_OFFSET = 8*4

TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

VARIABLE_OFFSET = 16*8

################################
# Utility Macros
################################

.macro FUNC_SAVE
        # the number of pushes must equal STACK_OFFSET/8
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                    # align rsp to 64 bytes
.endm

.macro FUNC_RESTORE
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm

# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
                vpxor    (arg1), \XMM0, \XMM0
                i = 1
                setreg
.rep \REP
                vaesenc  16*i(arg1), \XMM0, \XMM0
                i = (i+1)
                setreg
.endr
                vaesenclast 16*i(arg1), \XMM0, \XMM0
.endm
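# \REP is the number of middle AES rounds, so the macro consumes \REP+2 round
# keys starting at arg1: use REP=9 for AES-128, 11 for AES-192, 13 for AES-256.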

# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro  GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8
        vmovdqu HashKey(arg2), %xmm13       # xmm13 = HashKey
        add arg5, InLen(arg2)

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub %r11, arg5

        mov     arg5, %r13                  # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                  # r13 = r13 - (r13 mod 16)

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS  \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS  \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS  \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS  \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS  \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS  \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS  \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS  \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC

_initial_blocks_encrypted\@:
        test    %r13, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@

        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
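        # r15d tracks the low byte of the (byte-swapped) counter so the
        # 8-block loop can detect when adding 8 would carry out of that byte
        # and must take the slower in_order path instead.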

_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@

        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL      \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

_eight_cipher_left\@:
        \GHASH_LAST_8    %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8

_zero_cipher_left\@:
        vmovdqu %xmm14, AadHash(arg2)
        vmovdqu %xmm9, CurCount(arg2)

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                            # r13 = (arg5 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov %r13, PBlockLen(arg2)

        vpaddd  ONE(%rip), %xmm9, %xmm9              # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)

        cmp $16, arg5
        jge _large_enough_update\@

        lea (arg4,%r11,1), %r10
        mov %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                           # adjust the shuffle mask pointer to be
                                                     # able to shift 16-r13 bytes (r13 is the
                                                     # number of bytes in plaintext mod 16)

        jmp _final_ghash_mul\@

_large_enough_update\@:
        sub $16, %r11
        add %r13, %r11

        # receive the last <16 Byte block
        vmovdqu	(arg4, %r11, 1), %xmm1

        sub	%r13, %r11
        add	$16, %r11

        lea	SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub	%r13, %r12
        # get the appropriate shuffle mask
        vmovdqu	(%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb  %xmm2, %xmm1, %xmm1

_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9                  # Ciphertext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9                  # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1        # get the appropriate mask to
                                                     # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9                  # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9        # shuffle xmm9 back to output as ciphertext
        .endif

        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
.endm


# GCM_COMPLETE finishes the tag update, folding in any last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14
        vmovdqu HashKey(arg2), %xmm13

        mov PBlockLen(arg2), %r12
        test %r12, %r12
        je _partial_done\@

	# GHASH computation for the last <16 Byte block
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov AadLen(arg2), %r12                       # r12 = aadLen (number of bytes)
        shl     $3, %r12                             # convert into number of bits
        vmovd   %r12d, %xmm15                        # len(A) in xmm15

        mov InLen(arg2), %r12
        shl     $3, %r12                             # len(C) in bits (*8)
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15                   # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15                # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL       %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14      # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK    \REP, %xmm9          # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9

_return_T\@:
        mov     \AUTH_TAG, %r10                      # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11                  # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jl      _T_4\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq $4, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_123\@:
        vmovd   %xmm9, %eax
        cmp     $2, %r11
        jl      _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
.endm

.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

	mov     \AAD, %r10                      # r10 = AAD
	mov     \AADLEN, %r12                   # r12 = aadLen

	mov     %r12, %r11

	vpxor   \T8, \T8, \T8
	vpxor   \T7, \T7, \T7
	cmp     $16, %r11
	jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
	vmovdqu (%r10), \T7
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T7, \T8, \T8
	\GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
	add     $16, %r10
	sub     $16, %r12
	sub     $16, %r11
	cmp     $16, %r11
	jge     _get_AAD_blocks\@
	vmovdqu \T8, \T7
	test    %r11, %r11
	je      _get_AAD_done\@

	vpxor   \T7, \T7, \T7

	/* read the last <16B of AAD. since we have at least 4B of
	data right after the AAD (the ICV, and maybe some CT), we can
	read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
	cmp     $4, %r11
	jle     _get_AAD_rest4\@
	movq    (%r10), \T1
	add     $8, %r10
	sub     $8, %r11
	vpslldq $8, \T1, \T1
	vpsrldq $8, \T7, \T7
	vpxor   \T1, \T7, \T7
	jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
	test    %r11, %r11
	jle     _get_AAD_rest0\@
	mov     (%r10), %eax
	movq    %rax, \T1
	add     $4, %r10
	sub     $4, %r11
	vpslldq $12, \T1, \T1
	vpsrldq $4, \T7, \T7
	vpxor   \T1, \T7, \T7
_get_AAD_rest0\@:
	/* finalize: shift out the extra bytes we read, and align
	left. since pslldq can only shift by an immediate, we use
	vpshufb and an array of shuffle masks */
	movq    %r12, %r11
	salq    $4, %r11
	vmovdqu aad_shift_arr(%r11), \T1
	vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
	vpshufb SHUF_MASK(%rip), \T7, \T7
	vpxor   \T8, \T7, \T7
	\GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm

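# INIT initializes the gcm_context_data struct at arg2: it records aadLen,
# zeroes the running lengths, saves the original IV, converts the supplied
# HashKey (arg4) to HashKey<<1 mod poly, hashes the AAD, and precomputes
# the HashKey powers used by the parallel GHASH code.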
.macro INIT GHASH_MUL PRECOMPUTE
        mov arg6, %r11
        mov %r11, AadLen(arg2) # ctx_data.aad_length = aad_length
        xor %r11d, %r11d
        mov %r11, InLen(arg2) # ctx_data.in_length = 0

        mov %r11, PBlockLen(arg2) # ctx_data.partial_block_length = 0
        mov %r11, PBlockEncKey(arg2) # ctx_data.partial_block_enc_key = 0
        mov arg3, %rax
        movdqu (%rax), %xmm0
        movdqu %xmm0, OrigIV(arg2) # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        movdqu %xmm0, CurCount(arg2) # ctx_data.current_counter = iv

        vmovdqu  (arg4), %xmm6              # xmm6 = HashKey

        vpshufb  SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa  %xmm6, %xmm2
        vpsllq   $1, %xmm6, %xmm6
        vpsrlq   $63, %xmm2, %xmm2
        vmovdqa  %xmm2, %xmm1
        vpslldq  $8, %xmm2, %xmm2
        vpsrldq  $8, %xmm1, %xmm1
        vpor     %xmm2, %xmm6, %xmm6
        #reduction
        vpshufd  $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand    POLY(%rip), %xmm2, %xmm2
        vpxor    %xmm2, %xmm6, %xmm6        # xmm6 holds the HashKey<<1 mod poly
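        # (the vpcmpeqd against TWOONE yields an all-ones mask exactly when
        # the bit shifted out of bit 127 was set, so POLY is XORed in only
        # when the multiply-by-x overflowed: a branchless conditional reduction)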
        #######################################################################
        vmovdqu  %xmm6, HashKey(arg2)       # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE  %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
        vpxor \XMMDst, \XMMDst, \XMMDst

        cmp $8, \DLEN
        jl _read_lt8_\@
        mov (\DPTR), %rax
        vpinsrq $0, %rax, \XMMDst, \XMMDst
        sub $8, \DLEN
        jz _done_read_partial_block_\@
        xor %eax, %eax
_read_next_byte_\@:
        shl $8, %rax
        mov 7(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_\@
        vpinsrq $1, %rax, \XMMDst, \XMMDst
        jmp _done_read_partial_block_\@
_read_lt8_\@:
        xor %eax, %eax
_read_next_byte_lt8_\@:
        shl $8, %rax
        mov -1(\DPTR, \DLEN, 1), %al
        dec \DLEN
        jnz _read_next_byte_lt8_\@
        vpinsrq $0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
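# e.g. DLEN = 13: the first 8 bytes are loaded as one qword into lane 0, then
# bytes 8..12 are gathered last-byte-first through %rax so they land
# little-endian in lane 1, never reading past DPTR+DLEN.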

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov 	PBlockLen(arg2), %r13
        test	%r13, %r13
        je	_partial_block_done_\@	# Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp	$16, \PLAIN_CYPH_LEN
        jl	_fewer_than_16_bytes_\@
        vmovdqu	(\PLAIN_CYPH_IN), %xmm1	# If more than 16 bytes, just fill xmm
        jmp	_data_read_\@

_fewer_than_16_bytes_\@:
        lea	(\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov	\PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov PBlockLen(arg2), %r13

_data_read_\@:				# Finished reading in data

        vmovdqu	PBlockEncKey(arg2), %xmm9
        vmovdqu	HashKey(arg2), %xmm13

        lea	SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        add	%r13, %r12
        vmovdqu	(%r12), %xmm2		# get the appropriate shuffle mask
        vpshufb %xmm2, %xmm9, %xmm9	# shift right r13 bytes

.if  \ENC_DEC ==  DEC
        vmovdqa	%xmm1, %xmm3
        pxor	%xmm1, %xmm9		# Ciphertext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_1_\@
        sub	%r10, %r12
_no_extra_mask_1_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9	# mask out bottom r13 bytes of xmm9

        vpand	%xmm1, %xmm3, %xmm3
        vmovdqa	SHUF_MASK(%rip), %xmm10
        vpshufb	%xmm10, %xmm3, %xmm3
        vpshufb	%xmm2, %xmm3, %xmm3
        vpxor	%xmm3, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)
        jmp	_dec_done_\@
_partial_incomplete_1_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)
.else
        vpxor	%xmm1, %xmm9, %xmm9	# Plaintext XOR E(K, Yn)

        mov	\PLAIN_CYPH_LEN, %r10
        add	%r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub	$16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge	_no_extra_mask_2_\@
        sub	%r10, %r12
_no_extra_mask_2_\@:

        vmovdqu	ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand	%xmm1, %xmm9, %xmm9

        vmovdqa	SHUF_MASK(%rip), %xmm1
        vpshufb %xmm1, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
        vpxor	%xmm9, \AAD_HASH, \AAD_HASH

        test	%r10, %r10
        jl	_partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor	%eax,%eax

        mov	%rax, PBlockLen(arg2)
        jmp	_encode_done_\@
_partial_incomplete_2_\@:
        add	\PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
        vmovdqu	\AAD_HASH, AadHash(arg2)

        vmovdqa	SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        vpshufb	%xmm10, %xmm9, %xmm9
        vpshufb	%xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        test	%r10, %r10
        jl	_partial_fill_\@
        mov	%r13, %r12
        mov	$16, %r13
        # Set r13 to be the number of bytes to write out
        sub	%r12, %r13
        jmp	_count_set_\@
_partial_fill_\@:
        mov	\PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa	%xmm9, %xmm0
        vmovq	%xmm0, %rax
        cmp	$8, %r13
        jle	_less_than_8_bytes_left_\@

        mov	%rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$8, \DATA_OFFSET
        psrldq	$8, %xmm0
        vmovq	%xmm0, %rax
        sub	$8, %r13
_less_than_8_bytes_left_\@:
        movb	%al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add	$1, \DATA_OFFSET
        shr	$8, %rax
        sub	$1, %r13
        jne	_less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd         $0b01001110, \GH, \T2
        vpshufd         $0b01001110, \HK, \T3
        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2, \T2
        vpxor           \T1, \T2, \T2           # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2, \T3            # shift-L T3 2 DWs
        vpsrldq         $8, \T2, \T2            # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        # first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shifting << 31
        vpslld  $30, \GH, \T3                   # packed left shifting << 30
        vpslld  $25, \GH, \T4                   # packed left shifting << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        # second phase of the reduction

        vpsrld  $1, \GH, \T2                    # packed right shifting >> 1
        vpsrld  $2, \GH, \T3                    # packed right shifting >> 2
        vpsrld  $7, \GH, \T4                    # packed right shifting >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH

.endm
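# (The two reduction phases above fold the 256-bit product <T1:GH> back to
# 128 bits modulo the reflected polynomial: the <<31/<<30/<<25 and
# >>1/>>2/>>7 shift pairs correspond to its x^127, x^126 and x^121 terms.)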

.macro PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                    #  [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_2_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_3_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_4_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_5_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_6_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_7_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2  #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_8_k(arg2)

.endm
993*4882a593Smuzhiyun
994*4882a593Smuzhiyun## if a = number of total plaintext bytes
995*4882a593Smuzhiyun## b = floor(a/16)
996*4882a593Smuzhiyun## num_initial_blocks = b mod 4#
997*4882a593Smuzhiyun## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
998*4882a593Smuzhiyun## r10, r11, r12, rax are clobbered
999*4882a593Smuzhiyun## arg1, arg3, arg4, r14 are used as a pointer only, not modified
1000*4882a593Smuzhiyun
.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
	i = (8-\num_initial_blocks)
	setreg
        vmovdqu AadHash(arg2), reg_i

	# start AES for num_initial_blocks blocks
	vmovdqu CurCount(arg2), \CTR

	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpaddd  ONE(%rip), \CTR, \CTR		# INCR Y0
                vmovdqa \CTR, reg_i
                vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
	i = (i+1)
	setreg
.endr

	vmovdqa  (arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vpxor   \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

       j = 1
       setreg
.rep \REP
       vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

       j = (j+1)
       setreg
.endr

	vmovdqa  16*j(arg1), \T_key
	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
        vaesenclast      \T_key, reg_i, reg_i
	i = (i+1)
	setreg
.endr

	i = (9-\num_initial_blocks)
	setreg
.rep \num_initial_blocks
                vmovdqu (arg4, %r11), \T1
                vpxor   \T1, reg_i, reg_i
                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for num_initial_blocks blocks
                add     $16, %r11
.if  \ENC_DEC == DEC
                vmovdqa \T1, reg_i
.endif
                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
	i = (i+1)
	setreg
.endr


	i = (8-\num_initial_blocks)
	j = (9-\num_initial_blocks)
	setreg

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6 # apply GHASH on num_initial_blocks blocks
	i = (i+1)
	j = (j+1)
	setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@                  # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM1
                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM2
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM3
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM4
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM5
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM6
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM7
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM8
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

                vmovdqa  (arg1), \T_key
                vpxor    \T_key, \XMM1, \XMM1
                vpxor    \T_key, \XMM2, \XMM2
                vpxor    \T_key, \XMM3, \XMM3
                vpxor    \T_key, \XMM4, \XMM4
                vpxor    \T_key, \XMM5, \XMM5
                vpxor    \T_key, \XMM6, \XMM6
                vpxor    \T_key, \XMM7, \XMM7
                vpxor    \T_key, \XMM8, \XMM8

               i = 1
               setreg
.rep    \REP       # do REP rounds
                vmovdqa  16*i(arg1), \T_key
                vaesenc  \T_key, \XMM1, \XMM1
                vaesenc  \T_key, \XMM2, \XMM2
                vaesenc  \T_key, \XMM3, \XMM3
                vaesenc  \T_key, \XMM4, \XMM4
                vaesenc  \T_key, \XMM5, \XMM5
                vaesenc  \T_key, \XMM6, \XMM6
                vaesenc  \T_key, \XMM7, \XMM7
                vaesenc  \T_key, \XMM8, \XMM8
               i = (i+1)
               setreg
.endr

                vmovdqa  16*i(arg1), \T_key
                vaesenclast  \T_key, \XMM1, \XMM1
                vaesenclast  \T_key, \XMM2, \XMM2
                vaesenclast  \T_key, \XMM3, \XMM3
                vaesenclast  \T_key, \XMM4, \XMM4
                vaesenclast  \T_key, \XMM5, \XMM5
                vaesenclast  \T_key, \XMM6, \XMM6
                vaesenclast  \T_key, \XMM7, \XMM7
                vaesenclast  \T_key, \XMM8, \XMM8

                vmovdqu  (arg4, %r11), \T1
                vpxor    \T1, \XMM1, \XMM1
                vmovdqu  \XMM1, (arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM1
                .endif

                vmovdqu  16*1(arg4, %r11), \T1
                vpxor    \T1, \XMM2, \XMM2
                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM2
                .endif

                vmovdqu  16*2(arg4, %r11), \T1
                vpxor    \T1, \XMM3, \XMM3
                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM3
                .endif

                vmovdqu  16*3(arg4, %r11), \T1
                vpxor    \T1, \XMM4, \XMM4
                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM4
                .endif

                vmovdqu  16*4(arg4, %r11), \T1
                vpxor    \T1, \XMM5, \XMM5
                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM5
                .endif

                vmovdqu  16*5(arg4, %r11), \T1
                vpxor    \T1, \XMM6, \XMM6
                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM6
                .endif

                vmovdqu  16*6(arg4, %r11), \T1
                vpxor    \T1, \XMM7, \XMM7
                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM7
                .endif

                vmovdqu  16*7(arg4, %r11), \T1
                vpxor    \T1, \XMM8, \XMM8
                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM8
                .endif

                add     $128, %r11

                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with the corresponding ciphertext
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm

# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
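#
# (Added note, hedged:) the macro below software-pipelines two independent
# streams so the AES and carry-less-multiply units overlap; roughly, in C:
#
#       /* per 128-byte iteration, illustrative only: */
#       for (round = 1; round < nrounds; round++) {
#               aes_round_x8(ctr_blocks, round_key[round]); /* 8-wide AES  */
#               if (round_has_a_ghash_slot(round))
#                       ghash_partial(saved_cipher[slot++]); /* one product
#                                                       of the prior batch */
#       }
#       /* the GHASH reduction runs after the AES rounds finish */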
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
                vpaddd  ONE(%rip), \CTR, \XMM1           # INCR CNT
                vpaddd  ONE(%rip), \XMM1, \XMM2
                vpaddd  ONE(%rip), \XMM2, \XMM3
                vpaddd  ONE(%rip), \XMM3, \XMM4
                vpaddd  ONE(%rip), \XMM4, \XMM5
                vpaddd  ONE(%rip), \XMM5, \XMM6
                vpaddd  ONE(%rip), \XMM6, \XMM7
                vpaddd  ONE(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR

                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7    # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8    # perform a 16Byte swap
.else
                vpaddd  ONEf(%rip), \CTR, \XMM1           # INCR CNT
                vpaddd  ONEf(%rip), \XMM1, \XMM2
                vpaddd  ONEf(%rip), \XMM2, \XMM3
                vpaddd  ONEf(%rip), \XMM3, \XMM4
                vpaddd  ONEf(%rip), \XMM4, \XMM5
                vpaddd  ONEf(%rip), \XMM5, \XMM6
                vpaddd  ONEf(%rip), \XMM6, \XMM7
                vpaddd  ONEf(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR
.endif


        #######################################################################

                vmovdqu (arg1), \T1
                vpxor   \T1, \XMM1, \XMM1
                vpxor   \T1, \XMM2, \XMM2
                vpxor   \T1, \XMM3, \XMM3
                vpxor   \T1, \XMM4, \XMM4
                vpxor   \T1, \XMM5, \XMM5
                vpxor   \T1, \XMM6, \XMM6
                vpxor   \T1, \XMM7, \XMM7
                vpxor   \T1, \XMM8, \XMM8

        #######################################################################

                vmovdqu 16*1(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

                vmovdqu 16*2(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4             # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7             # T7 = a0*b0

        vpshufd         $0b01001110, \T2, \T6
        vpxor           \T2, \T6, \T6

        vmovdqu         HashKey_8_k(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6

                vmovdqu 16*3(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_7_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*4(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_6_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*5(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_5_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*6(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_4_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*7(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_3_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6


                vmovdqu 16*8(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_2_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################

                vmovdqu 16*9(arg1), \T5
                vaesenc \T5, \XMM1, \XMM1
                vaesenc \T5, \XMM2, \XMM2
                vaesenc \T5, \XMM3, \XMM3
                vaesenc \T5, \XMM4, \XMM4
                vaesenc \T5, \XMM5, \XMM5
                vaesenc \T5, \XMM6, \XMM6
                vaesenc \T5, \XMM7, \XMM7
                vaesenc \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpshufd         $0b01001110, \T1, \T3
        vpxor           \T1, \T3, \T3
        vmovdqu         HashKey_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor           \T3, \T6, \T6

        vpxor           \T4, \T6, \T6
        vpxor           \T7, \T6, \T6

                vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

	i = 0
	j = 1
	setreg
.rep 8
		vpxor	16*i(arg4, %r11), \T5, \T2
                .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
                .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu 16*i(arg4, %r11), reg_j
                vmovdqu \T3, 16*i(arg3, %r11)
                .endif
	i = (i+1)
	j = (j+1)
	setreg
.endr
	#######################################################################

	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
	vpxor	\T3, \T7, \T7
	vpxor	\T4, \T6, \T6				# accumulate the results in T6:T7


	#######################################################################
	#first phase of the reduction
	#######################################################################
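	# (Added note, hedged:) GHASH reduces modulo the bit-reflected
	# polynomial p(x) = x^128 + x^7 + x^2 + x + 1, so the low 128 bits
	# of the 256-bit product held in T6:T7 must be folded back in.
	# Roughly, the vpslld by 31/30/25 below forms the x^(128-1),
	# x^(128-2) and x^(128-7) contributions of that fold, and the
	# matching vpsrld by 1/2/7 in the second phase completes it.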
        vpslld  $31, \T7, \T2                           # packed left shifting << 31
        vpslld  $30, \T7, \T3                           # packed left shifting << 30
        vpslld  $25, \T7, \T4                           # packed left shifting << 25

        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1                            # shift-R T1 1 DW

        vpslldq $12, \T2, \T2                           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7                           # first phase of the reduction complete
	#######################################################################
                .if \ENC_DEC == ENC
		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
                .endif

	#######################################################################
	#second phase of the reduction
        vpsrld  $1, \T7, \T2                            # packed right shifting >> 1
        vpsrld  $2, \T7, \T3                            # packed right shifting >> 2
        vpsrld  $7, \T7, \T4                            # packed right shifting >> 7
        vpxor   \T3, \T2, \T2                           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6                           # the result is in T6
	#######################################################################

		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap


	vpxor	\T6, \XMM1, \XMM1



.endm

# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
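        ## (Added note, hedged:) Karatsuba here means each 128x128-bit
        ## carry-less product is built from three 64x64 products instead
        ## of four:
        ##     hi  = a1*b1
        ##     lo  = a0*b0
        ##     mid = (a1^a0)*(b1^b0) ^ hi ^ lo
        ## The HashKey_i_k entries precomputed earlier already hold
        ## (b1^b0) for each key power, which is why only one extra
        ## vpclmulqdq per block is needed below; hi and lo are XORed
        ## into the middle term once, at the end.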


        vpshufd         $0b01001110, \XMM1, \T2
        vpxor           \XMM1, \T2, \T2
        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vmovdqu         HashKey_8_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM2, \T2
        vpxor           \XMM2, \T2, \T2
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_7_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM3, \T2
        vpxor           \XMM3, \T2, \T2
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_6_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM4, \T2
        vpxor           \XMM4, \T2, \T2
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_5_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM5, \T2
        vpxor           \XMM5, \T2, \T2
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_4_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM6, \T2
        vpxor           \XMM6, \T2, \T2
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_3_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM7, \T2
        vpxor           \XMM7, \T2, \T2
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_2_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor           \T2, \XMM1, \XMM1

        ######################

        vpshufd         $0b01001110, \XMM8, \T2
        vpxor           \XMM8, \T2, \T2
        vmovdqu         HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vmovdqu         HashKey_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2




        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6   # <T6:T7> holds the result of
				# the accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vpslld  $31, \T7, \T2   # packed left shifting << 31
        vpslld  $30, \T7, \T3   # packed left shifting << 30
        vpslld  $25, \T7, \T4   # packed left shifting << 25

        vpxor   \T3, \T2, \T2   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1    # shift-R T1 1 DW

        vpslldq $12, \T2, \T2   # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7   # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpsrld  $1, \T7, \T2    # packed right shifting >> 1
        vpsrld  $2, \T7, \T3    # packed right shifting >> 2
        vpsrld  $7, \T7, \T4    # packed right shifting >> 7
        vpxor   \T3, \T2, \T2   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6   # the result is in T6

.endm

#############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#        u8     *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        const   u8 *aad, /* Additional Authentication Data (AAD)*/
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen2)
        FUNC_SAVE
        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed.  */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
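# (Added note, hedged:) the keysize dispatch below selects the number of
# full AES rounds passed to GCM_ENC_DEC as REP: 9 for AES-128 (10 rounds
# total), 11 for AES-192 (12 rounds), 13 for AES-256 (14 rounds).  The
# final round is always performed with vaesenclast inside the macros, so
# REP = rounds - 1.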
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update
        cmp     $16, %eax
        je      key_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        RET
key_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        RET
key_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed.  */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update
        cmp     $16, %eax
        je      key_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        RET
key_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        RET
key_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)

###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#				Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov	keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize
        cmp     $16, %eax
        je      key_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        RET
key_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        RET
key_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
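
# (Added usage sketch, hedged -- prototypes are taken from the comment
# blocks above; error handling and the surrounding kernel glue are omitted
# and the variable names are illustrative:)
#
#       aesni_gcm_init_avx_gen2(ctx, data, hash_subkey, iv, aad, aad_len);
#       aesni_gcm_enc_update_avx_gen2(ctx, data, out, in, plaintext_len);
#       aesni_gcm_finalize_avx_gen2(ctx, data, auth_tag, auth_tag_len);
#
# i.e. init once per message, call enc/dec_update for the payload (the
# update step may be invoked repeatedly for chunked data), then finalize
# to produce the authentication tag.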

###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
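# (Added note, hedged:) unlike GHASH_MUL_AVX earlier in this file, this
# variant performs both reduction phases with vpclmulqdq against the
# POLY2 constant rather than with the vpslld/vpsrld shift-and-xor ladder.
# It also forms the cross terms a1*b0 and a0*b1 directly instead of via
# the Karatsuba trick, so no precomputed HashKey_i_k values are required,
# which is why PRECOMPUTE_AVX2 below stores only the HashKey powers.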
.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
        vpxor           \T3, \GH, \GH


        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs

        vpxor           \T3, \T1, \T1
        vpxor           \T2, \GH, \GH

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs

        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
        #######################################################################
        vpxor           \T1, \GH, \GH          # the result is in GH


.endm

.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # Precompute HashKey^2 through HashKey^8; GHASH_MUL_AVX2 needs no
        # HashKey_i_k values, so only the key powers themselves are stored.
        vmovdqa  \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                       #  [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2    #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)

.endm

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as pointers only, not modified

1949*4882a593Smuzhiyun.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
1950*4882a593Smuzhiyun	i = (8-\num_initial_blocks)
1951*4882a593Smuzhiyun	setreg
1952*4882a593Smuzhiyun	vmovdqu AadHash(arg2), reg_i
1953*4882a593Smuzhiyun
1954*4882a593Smuzhiyun	# start AES for num_initial_blocks blocks
1955*4882a593Smuzhiyun	vmovdqu CurCount(arg2), \CTR
1956*4882a593Smuzhiyun
1957*4882a593Smuzhiyun	i = (9-\num_initial_blocks)
1958*4882a593Smuzhiyun	setreg
1959*4882a593Smuzhiyun.rep \num_initial_blocks
1960*4882a593Smuzhiyun                vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
1961*4882a593Smuzhiyun                vmovdqa \CTR, reg_i
1962*4882a593Smuzhiyun                vpshufb SHUF_MASK(%rip), reg_i, reg_i     # perform a 16Byte swap
1963*4882a593Smuzhiyun	i = (i+1)
1964*4882a593Smuzhiyun	setreg
1965*4882a593Smuzhiyun.endr
1966*4882a593Smuzhiyun
1967*4882a593Smuzhiyun	vmovdqa  (arg1), \T_key
1968*4882a593Smuzhiyun	i = (9-\num_initial_blocks)
1969*4882a593Smuzhiyun	setreg
1970*4882a593Smuzhiyun.rep \num_initial_blocks
1971*4882a593Smuzhiyun                vpxor   \T_key, reg_i, reg_i
1972*4882a593Smuzhiyun	i = (i+1)
1973*4882a593Smuzhiyun	setreg
1974*4882a593Smuzhiyun.endr
1975*4882a593Smuzhiyun
1976*4882a593Smuzhiyun	j = 1
1977*4882a593Smuzhiyun	setreg
1978*4882a593Smuzhiyun.rep \REP
1979*4882a593Smuzhiyun	vmovdqa  16*j(arg1), \T_key
1980*4882a593Smuzhiyun	i = (9-\num_initial_blocks)
1981*4882a593Smuzhiyun	setreg
1982*4882a593Smuzhiyun.rep \num_initial_blocks
1983*4882a593Smuzhiyun        vaesenc \T_key, reg_i, reg_i
1984*4882a593Smuzhiyun	i = (i+1)
1985*4882a593Smuzhiyun	setreg
1986*4882a593Smuzhiyun.endr
1987*4882a593Smuzhiyun
1988*4882a593Smuzhiyun	j = (j+1)
1989*4882a593Smuzhiyun	setreg
1990*4882a593Smuzhiyun.endr
1991*4882a593Smuzhiyun
1992*4882a593Smuzhiyun
1993*4882a593Smuzhiyun	vmovdqa  16*j(arg1), \T_key
1994*4882a593Smuzhiyun	i = (9-\num_initial_blocks)
1995*4882a593Smuzhiyun	setreg
1996*4882a593Smuzhiyun.rep \num_initial_blocks
1997*4882a593Smuzhiyun        vaesenclast      \T_key, reg_i, reg_i
1998*4882a593Smuzhiyun	i = (i+1)
1999*4882a593Smuzhiyun	setreg
2000*4882a593Smuzhiyun.endr
2001*4882a593Smuzhiyun
2002*4882a593Smuzhiyun	i = (9-\num_initial_blocks)
2003*4882a593Smuzhiyun	setreg
2004*4882a593Smuzhiyun.rep \num_initial_blocks
2005*4882a593Smuzhiyun                vmovdqu (arg4, %r11), \T1
2006*4882a593Smuzhiyun                vpxor   \T1, reg_i, reg_i
2007*4882a593Smuzhiyun                vmovdqu reg_i, (arg3 , %r11)           # write back ciphertext for
2008*4882a593Smuzhiyun						       # num_initial_blocks blocks
2009*4882a593Smuzhiyun                add     $16, %r11
2010*4882a593Smuzhiyun.if  \ENC_DEC == DEC
2011*4882a593Smuzhiyun                vmovdqa \T1, reg_i
2012*4882a593Smuzhiyun.endif
2013*4882a593Smuzhiyun                vpshufb SHUF_MASK(%rip), reg_i, reg_i  # prepare ciphertext for GHASH computations
2014*4882a593Smuzhiyun	i = (i+1)
2015*4882a593Smuzhiyun	setreg
2016*4882a593Smuzhiyun.endr
2017*4882a593Smuzhiyun
2018*4882a593Smuzhiyun
2019*4882a593Smuzhiyun	i = (8-\num_initial_blocks)
2020*4882a593Smuzhiyun	j = (9-\num_initial_blocks)
2021*4882a593Smuzhiyun	setreg
2022*4882a593Smuzhiyun
2023*4882a593Smuzhiyun.rep \num_initial_blocks
2024*4882a593Smuzhiyun        vpxor    reg_i, reg_j, reg_j
2025*4882a593Smuzhiyun        GHASH_MUL_AVX2       reg_j, \T2, \T1, \T3, \T4, \T5, \T6  # apply GHASH on num_initial_blocks blocks
2026*4882a593Smuzhiyun	i = (i+1)
2027*4882a593Smuzhiyun	j = (j+1)
2028*4882a593Smuzhiyun	setreg
2029*4882a593Smuzhiyun.endr
2030*4882a593Smuzhiyun        # XMM8 has the combined result here
2031*4882a593Smuzhiyun
2032*4882a593Smuzhiyun        vmovdqa  \XMM8, TMP1(%rsp)
2033*4882a593Smuzhiyun        vmovdqa  \XMM8, \T3
2034*4882a593Smuzhiyun
2035*4882a593Smuzhiyun        cmp     $128, %r13
2036*4882a593Smuzhiyun        jl      _initial_blocks_done\@                  # no need for precomputed constants
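        # 128 bytes = 8 AES blocks: the counter blocks prepared below feed
        # only the 8-way parallel loop, so the setup is skipped when less
        # than one full 8-block pass remains.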

###############################################################################
# HashKey_i_k holds the XORed values of the low and high parts of HashKey_i
                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM1
                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM2
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM3
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM4
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM5
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM6
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM7
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

                vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
                vmovdqa  \CTR, \XMM8
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

                vmovdqa  (arg1), \T_key
                vpxor    \T_key, \XMM1, \XMM1
                vpxor    \T_key, \XMM2, \XMM2
                vpxor    \T_key, \XMM3, \XMM3
                vpxor    \T_key, \XMM4, \XMM4
                vpxor    \T_key, \XMM5, \XMM5
                vpxor    \T_key, \XMM6, \XMM6
                vpxor    \T_key, \XMM7, \XMM7
                vpxor    \T_key, \XMM8, \XMM8

		i = 1
		setreg
.rep    \REP       # do REP rounds
                vmovdqa  16*i(arg1), \T_key
                vaesenc  \T_key, \XMM1, \XMM1
                vaesenc  \T_key, \XMM2, \XMM2
                vaesenc  \T_key, \XMM3, \XMM3
                vaesenc  \T_key, \XMM4, \XMM4
                vaesenc  \T_key, \XMM5, \XMM5
                vaesenc  \T_key, \XMM6, \XMM6
                vaesenc  \T_key, \XMM7, \XMM7
                vaesenc  \T_key, \XMM8, \XMM8
		i = (i+1)
		setreg
.endr


                vmovdqa  16*i(arg1), \T_key
                vaesenclast  \T_key, \XMM1, \XMM1
                vaesenclast  \T_key, \XMM2, \XMM2
                vaesenclast  \T_key, \XMM3, \XMM3
                vaesenclast  \T_key, \XMM4, \XMM4
                vaesenclast  \T_key, \XMM5, \XMM5
                vaesenclast  \T_key, \XMM6, \XMM6
                vaesenclast  \T_key, \XMM7, \XMM7
                vaesenclast  \T_key, \XMM8, \XMM8

                vmovdqu  (arg4, %r11), \T1
                vpxor    \T1, \XMM1, \XMM1
                vmovdqu  \XMM1, (arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM1
                .endif

                vmovdqu  16*1(arg4, %r11), \T1
                vpxor    \T1, \XMM2, \XMM2
                vmovdqu  \XMM2, 16*1(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM2
                .endif

                vmovdqu  16*2(arg4, %r11), \T1
                vpxor    \T1, \XMM3, \XMM3
                vmovdqu  \XMM3, 16*2(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM3
                .endif

                vmovdqu  16*3(arg4, %r11), \T1
                vpxor    \T1, \XMM4, \XMM4
                vmovdqu  \XMM4, 16*3(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM4
                .endif

                vmovdqu  16*4(arg4, %r11), \T1
                vpxor    \T1, \XMM5, \XMM5
                vmovdqu  \XMM5, 16*4(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM5
                .endif

                vmovdqu  16*5(arg4, %r11), \T1
                vpxor    \T1, \XMM6, \XMM6
                vmovdqu  \XMM6, 16*5(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM6
                .endif

                vmovdqu  16*6(arg4, %r11), \T1
                vpxor    \T1, \XMM7, \XMM7
                vmovdqu  \XMM7, 16*6(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM7
                .endif

                vmovdqu  16*7(arg4, %r11), \T1
                vpxor    \T1, \XMM8, \XMM8
                vmovdqu  \XMM8, 16*7(arg3 , %r11)
                .if   \ENC_DEC == DEC
                vmovdqa  \T1, \XMM8
                .endif

                add     $128, %r11

                vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
                vpxor    TMP1(%rsp), \XMM1, \XMM1          # combine GHASHed value with
							   # the corresponding ciphertext
                vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
                vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:


.endm



# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
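#
# Illustrative pseudocode of the software pipelining below (hypothetical
# names, one steady-state iteration):
#
#	for each 8-block chunk:
#	    C[0..7]  = AES-CTR(counter .. counter+7)      /* rounds are     */
#	    GHASH   ^= prev_C[0..7] * (H^8 .. H^1)        /* interleaved    */
#	    prev_C   = C
#
# Each vpclmulqdq group is slotted between AES round groups so the
# carry-less multiplier and the AES units are kept busy at the same time.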
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
                vpaddd  ONE(%rip), \CTR, \XMM1            # INCR CNT
                vpaddd  ONE(%rip), \XMM1, \XMM2
                vpaddd  ONE(%rip), \XMM2, \XMM3
                vpaddd  ONE(%rip), \XMM3, \XMM4
                vpaddd  ONE(%rip), \XMM4, \XMM5
                vpaddd  ONE(%rip), \XMM5, \XMM6
                vpaddd  ONE(%rip), \XMM6, \XMM7
                vpaddd  ONE(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR

                vpshufb SHUF_MASK(%rip), \XMM1, \XMM1     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM2, \XMM2     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM3, \XMM3     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM4, \XMM4     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM5, \XMM5     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM6, \XMM6     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM7, \XMM7     # perform a 16Byte swap
                vpshufb SHUF_MASK(%rip), \XMM8, \XMM8     # perform a 16Byte swap
.else
                vpaddd  ONEf(%rip), \CTR, \XMM1            # INCR CNT
                vpaddd  ONEf(%rip), \XMM1, \XMM2
                vpaddd  ONEf(%rip), \XMM2, \XMM3
                vpaddd  ONEf(%rip), \XMM3, \XMM4
                vpaddd  ONEf(%rip), \XMM4, \XMM5
                vpaddd  ONEf(%rip), \XMM5, \XMM6
                vpaddd  ONEf(%rip), \XMM6, \XMM7
                vpaddd  ONEf(%rip), \XMM7, \XMM8
                vmovdqa \XMM8, \CTR
.endif
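        # In the in_order path the counters are incremented in native byte
        # order and then byte-swapped for AES; in the other path the
        # ONEf(%rip) constant appears to be laid out so the increment works
        # directly on the already-swapped counters, saving eight vpshufb ops.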


        #######################################################################

                vmovdqu (arg1), \T1
                vpxor   \T1, \XMM1, \XMM1
                vpxor   \T1, \XMM2, \XMM2
                vpxor   \T1, \XMM3, \XMM3
                vpxor   \T1, \XMM4, \XMM4
                vpxor   \T1, \XMM5, \XMM5
                vpxor   \T1, \XMM6, \XMM6
                vpxor   \T1, \XMM7, \XMM7
                vpxor   \T1, \XMM8, \XMM8

        #######################################################################





                vmovdqu 16*1(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

                vmovdqu 16*2(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqu         HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4              # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7              # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6              # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5              # T5 = a0*b1
        vpxor           \T5, \T6, \T6

                vmovdqu 16*3(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP2(%rsp), \T1
        vmovdqu         HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*4(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa         TMP3(%rsp), \T1
        vmovdqu         HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*5(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP4(%rsp), \T1
        vmovdqu         HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*6(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8


        vmovdqa         TMP5(%rsp), \T1
        vmovdqu         HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*7(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu 16*8(arg1), \T1
                vaesenc \T1, \XMM1, \XMM1
                vaesenc \T1, \XMM2, \XMM2
                vaesenc \T1, \XMM3, \XMM3
                vaesenc \T1, \XMM4, \XMM4
                vaesenc \T1, \XMM5, \XMM5
                vaesenc \T1, \XMM6, \XMM6
                vaesenc \T1, \XMM7, \XMM7
                vaesenc \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6


        #######################################################################

                vmovdqu 16*9(arg1), \T5
                vaesenc \T5, \XMM1, \XMM1
                vaesenc \T5, \XMM2, \XMM2
                vaesenc \T5, \XMM3, \XMM3
                vaesenc \T5, \XMM4, \XMM4
                vaesenc \T5, \XMM5, \XMM5
                vaesenc \T5, \XMM6, \XMM6
                vaesenc \T5, \XMM7, \XMM7
                vaesenc \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1


                vmovdqu 16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
        i = i + 1
        setreg
.endr

	i = 0
	j = 1
	setreg
.rep 8
		vpxor	16*i(arg4, %r11), \T5, \T2
                .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
                .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu 16*i(arg4, %r11), reg_j
                vmovdqu \T3, 16*i(arg3, %r11)
                .endif
	i = (i+1)
	j = (j+1)
	setreg
.endr
	#######################################################################


	vpslldq	$8, \T6, \T3				# shift-L T3 2 DWs
	vpsrldq	$8, \T6, \T6				# shift-R T6 2 DWs
	vpxor	\T3, \T7, \T7
	vpxor	\T6, \T1, \T1				# accumulate the results in T1:T7



	#######################################################################
	#first phase of the reduction
	vmovdqa         POLY2(%rip), \T3

	vpclmulqdq	$0x01, \T7, \T3, \T2
	vpslldq		$8, \T2, \T2			# shift-L T2 2 DWs

	vpxor		\T2, \T7, \T7			# first phase of the reduction complete
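	# Reduction sketch: the 256-bit carry-less product accumulated in
	# T1:T7 is folded back to 128 bits modulo the GHASH polynomial
	# g(x) = x^128 + x^7 + x^2 + x + 1 (handled here in its bit-reflected
	# form via the POLY2 constant). Phase one above folds the low half
	# into the middle; phase two below folds the remainder into the high
	# half, replacing a full polynomial division with two multiplies.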
	#######################################################################
                .if \ENC_DEC == ENC
		vmovdqu	 \XMM1,	16*0(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM2,	16*1(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM3,	16*2(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM4,	16*3(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM5,	16*4(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM6,	16*5(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM7,	16*6(arg3,%r11)		# Write to the Ciphertext buffer
		vmovdqu	 \XMM8,	16*7(arg3,%r11)		# Write to the Ciphertext buffer
                .endif

	#######################################################################
	#second phase of the reduction
	vpclmulqdq	$0x00, \T7, \T3, \T2
	vpsrldq		$4, \T2, \T2			# shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

	vpclmulqdq	$0x10, \T7, \T3, \T4
	vpslldq		$4, \T4, \T4			# shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

	vpxor		\T2, \T4, \T4			# second phase of the reduction complete
	#######################################################################
	vpxor		\T4, \T1, \T1			# the result is in T1

		vpshufb	SHUF_MASK(%rip), \XMM1, \XMM1	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM2, \XMM2	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM3, \XMM3	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM4, \XMM4	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM5, \XMM5	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM6, \XMM6	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM7, \XMM7	# perform a 16Byte swap
		vpshufb	SHUF_MASK(%rip), \XMM8, \XMM8	# perform a 16Byte swap


	vpxor	\T1, \XMM1, \XMM1



.endm


# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method
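        ## Karatsuba sketch: with a = a1:a0 and b = b1:b0 (64-bit halves),
        ## three carry-less multiplies replace four:
        ##     hi  = a1*b1
        ##     lo  = a0*b0
        ##     mid = (a1^a0)*(b1^b0) ^ hi ^ lo
        ## Below, T6 and T7 accumulate the hi and lo products of all eight
        ## blocks, XMM1 accumulates the (a1^a0)*(b1^b0) products, and hi/lo
        ## are XORed out of XMM1 at the end to recover the middle term.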

        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2




        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6                      # <T6:T7> holds the result of the
						   # accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2               # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7              # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2               # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4               # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4              # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6              # the result is in T6
.endm


#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#        u8      *iv, /* Pre-counter block j0: 4 byte salt
#			(from Security Association) concatenated with 8 byte
#			Initialisation Vector (from IPSec ESP Payload)
#			concatenated with 0x00000001. 16-byte aligned pointer. */
#        u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#        const   u8 *aad, /* Additional Authentication Data (AAD) */
#        u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes. */
#############################################################
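#
# Caller-side sketch (simplified, for orientation only; key schedule setup
# and kernel_fpu_begin()/kernel_fpu_end() handling are omitted):
#
#	aesni_gcm_init_avx_gen4(ctx, data, iv, hash_subkey, aad, aad_len);
#	aesni_gcm_enc_update_avx_gen4(ctx, data, out, in, plaintext_len);
#	aesni_gcm_finalize_avx_gen4(ctx, data, auth_tag, auth_tag_len);
#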
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_enc_update4
        cmp     $16, %eax
        je      key_128_enc_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        RET
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        RET
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update4
        cmp     $16, %eax
        je      key_128_dec_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        RET
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        RET
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)

###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                              Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize4
        cmp     $16, %eax
        je      key_128_finalize4
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        RET
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        RET
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)