########################################################################
# Copyright (c) 2013, Intel Corporation
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
########################################################################
##
## Authors:
##	Erdinc Ozturk <erdinc.ozturk@intel.com>
##	Vinodh Gopal <vinodh.gopal@intel.com>
##	James Guilford <james.guilford@intel.com>
##	Tim Chen <tim.c.chen@linux.intel.com>
##
## References:
##	This code was derived and highly optimized from the code described in
##	the paper:
##		Vinodh Gopal et al. Optimized Galois-Counter-Mode
##		Implementation on Intel Architecture Processors. August, 2010.
##	The details of the implementation are explained in:
##		Erdinc Ozturk et al. Enabling High-Performance Galois-Counter-
##		Mode on Intel Architecture Processors. October, 2012.
##
## Assumptions:
##
##
##
## iv:
##	0                   1                   2                   3
##	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                        Salt  (From the SA)                    |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                     Initialization Vector                     |
##	|         (This is the sequence number from IPSec header)       |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                              0x1                              |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##
##
## AAD:
##	AAD padded to 128 bits with 0
##	for example, assume AAD is a u32 vector
##
##	if AAD is 8 bytes:
##	AAD[2] = {A0, A1};
##	padded AAD in xmm register = {A1 A0 0 0}
##
##	0                   1                   2                   3
##	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                              SPI (A1)                         |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                    32-bit Sequence Number (A0)                |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                              0x0                              |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##	AAD Format with 32-bit Sequence Number
##
##	if AAD is 12 bytes:
##	AAD[3] = {A0, A1, A2};
##	padded AAD in xmm register = {A2 A1 A0 0}
##
##	0                   1                   2                   3
##	0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                              SPI (A2)                         |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                 64-bit Extended Sequence Number {A1,A0}       |
##	|                                                               |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##	|                              0x0                              |
##	+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
##
##	AAD Format with 64-bit Extended Sequence Number
##
##
## aadLen:
##	from the definition of the spec, aadLen can only be 8 or 12 bytes.
##	The code additionally supports aadLen of length 16 bytes.
##
## TLen:
##	from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
##
## poly = x^128 + x^127 + x^126 + x^121 + 1
## throughout the code, one tab and two tab indentations are used. one tab is
## for GHASH part, two tabs is for AES part.
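##
## Added note (illustrative only, not part of the build): in C, the
## counter block laid out above could be assembled roughly as below.
## j0, salt and iv are hypothetical names, not kernel API:
##
##	u8 j0[16];
##	memcpy(j0, salt, 4);            /* Salt from the SA            */
##	memcpy(j0 + 4, iv, 8);          /* IPsec sequence number       */
##	j0[12] = 0; j0[13] = 0; j0[14] = 0;
##	j0[15] = 1;                     /* trailing 32-bit 0x1 counter */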
##

#include <linux/linkage.h>

# constants in mergeable sections, linker can reorder and merge
.section .rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:            .octa     0xC2000000000000000000000000000001

.section .rodata.cst16.POLY2, "aM", @progbits, 16
.align 16
POLY2:           .octa     0xC20000000000000000000001C2000000

.section .rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:          .octa     0x00000001000000000000000000000001

.section .rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:       .octa     0x000102030405060708090A0B0C0D0E0F

.section .rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:             .octa     0x00000000000000000000000000000001

.section .rodata.cst16.ONEf, "aM", @progbits, 16
.align 16
ONEf:            .octa     0x01000000000000000000000000000000

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK, and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
SHIFT_MASK:      .octa     0x0f0e0d0c0b0a09080706050403020100
ALL_F:           .octa     0xffffffffffffffffffffffffffffffff
                 .octa     0x00000000000000000000000000000000

.section .rodata
.align 16
.type aad_shift_arr, @object
.size aad_shift_arr, 272
aad_shift_arr:
        .octa     0xffffffffffffffffffffffffffffffff
        .octa     0xffffffffffffffffffffffffffffff0C
        .octa     0xffffffffffffffffffffffffffff0D0C
        .octa     0xffffffffffffffffffffffffff0E0D0C
        .octa     0xffffffffffffffffffffffff0F0E0D0C
        .octa     0xffffffffffffffffffffff0C0B0A0908
        .octa     0xffffffffffffffffffff0D0C0B0A0908
        .octa     0xffffffffffffffffff0E0D0C0B0A0908
        .octa     0xffffffffffffffff0F0E0D0C0B0A0908
        .octa     0xffffffffffffff0C0B0A090807060504
        .octa     0xffffffffffff0D0C0B0A090807060504
        .octa     0xffffffffff0E0D0C0B0A090807060504
        .octa     0xffffffff0F0E0D0C0B0A090807060504
        .octa     0xffffff0C0B0A09080706050403020100
        .octa     0xffff0D0C0B0A09080706050403020100
        .octa     0xff0E0D0C0B0A09080706050403020100
        .octa     0x0F0E0D0C0B0A09080706050403020100


.text

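# The offsets below index the gcm_context_data block passed in arg2 by
# the C glue code.  A rough equivalent as a C struct (a sketch; field
# names here are illustrative, the glue code owns the authoritative
# layout):
#
#	struct gcm_context_data {
#		u8  aad_hash[16];               /* AadHash,      16*0   */
#		u64 aad_length;                 /* AadLen,       16*1   */
#		u64 in_length;                  /* InLen,        16*1+8 */
#		u8  partial_block_enc_key[16];  /* PBlockEncKey, 16*2   */
#		u8  orig_iv[16];                /* OrigIV,       16*3   */
#		u8  current_counter[16];        /* CurCount,     16*4   */
#		u64 partial_block_length;       /* PBlockLen,    16*5   */
#		/* HashKey powers follow at 16*6 and up, see below */
#	};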
#define	AadHash 16*0
#define	AadLen 16*1
#define	InLen (16*1)+8
#define	PBlockEncKey 16*2
#define	OrigIV 16*3
#define	CurCount 16*4
#define	PBlockLen 16*5

HashKey        = 16*6   # store HashKey <<1 mod poly here
HashKey_2      = 16*7   # store HashKey^2 <<1 mod poly here
HashKey_3      = 16*8   # store HashKey^3 <<1 mod poly here
HashKey_4      = 16*9   # store HashKey^4 <<1 mod poly here
HashKey_5      = 16*10  # store HashKey^5 <<1 mod poly here
HashKey_6      = 16*11  # store HashKey^6 <<1 mod poly here
HashKey_7      = 16*12  # store HashKey^7 <<1 mod poly here
HashKey_8      = 16*13  # store HashKey^8 <<1 mod poly here
HashKey_k      = 16*14  # store XOR of HashKey <<1 mod poly here (for Karatsuba purposes)
HashKey_2_k    = 16*15  # store XOR of HashKey^2 <<1 mod poly here (for Karatsuba purposes)
HashKey_3_k    = 16*16  # store XOR of HashKey^3 <<1 mod poly here (for Karatsuba purposes)
HashKey_4_k    = 16*17  # store XOR of HashKey^4 <<1 mod poly here (for Karatsuba purposes)
HashKey_5_k    = 16*18  # store XOR of HashKey^5 <<1 mod poly here (for Karatsuba purposes)
HashKey_6_k    = 16*19  # store XOR of HashKey^6 <<1 mod poly here (for Karatsuba purposes)
HashKey_7_k    = 16*20  # store XOR of HashKey^7 <<1 mod poly here (for Karatsuba purposes)
HashKey_8_k    = 16*21  # store XOR of HashKey^8 <<1 mod poly here (for Karatsuba purposes)

#define arg1 %rdi
#define arg2 %rsi
#define arg3 %rdx
#define arg4 %rcx
#define arg5 %r8
#define arg6 %r9
#define arg7 STACK_OFFSET+8*1(%r14)
#define arg8 STACK_OFFSET+8*2(%r14)
#define arg9 STACK_OFFSET+8*3(%r14)
#define arg10 STACK_OFFSET+8*4(%r14)
#define keysize 2*15*16(arg1)

i = 0
j = 0

out_order = 0
in_order = 1
DEC = 0
ENC = 1

# define_reg/setreg: .altmacro trick that evaluates the assembler
# symbols i and j and binds reg_i/reg_j to the matching %xmm<n>
.macro define_reg r n
reg_\r = %xmm\n
.endm

.macro setreg
.altmacro
define_reg i %i
define_reg j %j
.noaltmacro
.endm

# need to push 4 registers onto the stack to maintain the frame;
# STACK_OFFSET must equal 8 * (number of pushes)
STACK_OFFSET = 8*4

TMP1 =   16*0    # Temporary storage for AAD
TMP2 =   16*1    # Temporary storage for AES State 2 (State 1 is stored in an XMM register)
TMP3 =   16*2    # Temporary storage for AES State 3
TMP4 =   16*3    # Temporary storage for AES State 4
TMP5 =   16*4    # Temporary storage for AES State 5
TMP6 =   16*5    # Temporary storage for AES State 6
TMP7 =   16*6    # Temporary storage for AES State 7
TMP8 =   16*7    # Temporary storage for AES State 8

VARIABLE_OFFSET = 16*8
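# Stack frame sketch (added note): FUNC_SAVE below pushes %r12-%r15,
# anchors the incoming stack in %r14, then carves out VARIABLE_OFFSET
# bytes and aligns %rsp down to 64 bytes.  Afterwards:
#
#	TMP1..TMP8(%rsp)        eight aligned 16-byte scratch slots
#	STACK_OFFSET(%r14)      return address (STACK_OFFSET = 8*4
#	                        matches the four pushes)
#	arg7..arg10             stack arguments, reached through %r14
#	                        because %rsp has been realigned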
################################
# Utility Macros
################################

.macro FUNC_SAVE
        #the number of pushes must equal STACK_OFFSET
        push    %r12
        push    %r13
        push    %r14
        push    %r15

        mov     %rsp, %r14

        sub     $VARIABLE_OFFSET, %rsp
        and     $~63, %rsp                      # align rsp to 64 bytes
.endm

.macro FUNC_RESTORE
        mov     %r14, %rsp

        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
.endm

# Encryption of a single block
.macro ENCRYPT_SINGLE_BLOCK REP XMM0
        vpxor          (arg1), \XMM0, \XMM0
        i = 1
        setreg
.rep \REP
        vaesenc        16*i(arg1), \XMM0, \XMM0
        i = (i+1)
        setreg
.endr
        vaesenclast    16*i(arg1), \XMM0, \XMM0
.endm
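# Usage sketch for ENCRYPT_SINGLE_BLOCK (added): REP is the number of
# middle AES rounds, typically 9/11/13 for 128/192/256-bit keys, and
# arg1 must point at the expanded key schedule, e.g.
#
#	ENCRYPT_SINGLE_BLOCK 9, %xmm9   # AES-128: whitening XOR, nine
#	                                # vaesenc rounds, one vaesenclast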
# combined for GCM encrypt and decrypt functions
# clobbering all xmm registers
# clobbering r10, r11, r12, r13, r14, r15
.macro GCM_ENC_DEC INITIAL_BLOCKS GHASH_8_ENCRYPT_8_PARALLEL GHASH_LAST_8 GHASH_MUL ENC_DEC REP
        vmovdqu AadHash(arg2), %xmm8
        vmovdqu HashKey(arg2), %xmm13           # xmm13 = HashKey
        add     arg5, InLen(arg2)

        # initialize the data pointer offset as zero
        xor     %r11d, %r11d

        PARTIAL_BLOCK \GHASH_MUL, arg3, arg4, arg5, %r11, %xmm8, \ENC_DEC
        sub     %r11, arg5

        mov     arg5, %r13                      # save the number of bytes of plaintext/ciphertext
        and     $-16, %r13                      # r13 = r13 - (r13 mod 16)

        mov     %r13, %r12
        shr     $4, %r12
        and     $7, %r12
        jz      _initial_num_blocks_is_0\@

        cmp     $7, %r12
        je      _initial_num_blocks_is_7\@
        cmp     $6, %r12
        je      _initial_num_blocks_is_6\@
        cmp     $5, %r12
        je      _initial_num_blocks_is_5\@
        cmp     $4, %r12
        je      _initial_num_blocks_is_4\@
        cmp     $3, %r12
        je      _initial_num_blocks_is_3\@
        cmp     $2, %r12
        je      _initial_num_blocks_is_2\@

        jmp     _initial_num_blocks_is_1\@

_initial_num_blocks_is_7\@:
        \INITIAL_BLOCKS \REP, 7, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*7, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_6\@:
        \INITIAL_BLOCKS \REP, 6, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*6, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_5\@:
        \INITIAL_BLOCKS \REP, 5, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*5, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_4\@:
        \INITIAL_BLOCKS \REP, 4, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*4, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_3\@:
        \INITIAL_BLOCKS \REP, 3, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*3, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_2\@:
        \INITIAL_BLOCKS \REP, 2, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*2, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_1\@:
        \INITIAL_BLOCKS \REP, 1, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC
        sub     $16*1, %r13
        jmp     _initial_blocks_encrypted\@

_initial_num_blocks_is_0\@:
        \INITIAL_BLOCKS \REP, 0, %xmm12, %xmm13, %xmm14, %xmm15, %xmm11, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm10, %xmm0, \ENC_DEC


_initial_blocks_encrypted\@:
        test    %r13, %r13
        je      _zero_cipher_left\@

        sub     $128, %r13
        je      _eight_cipher_left\@


        vmovd   %xmm9, %r15d
        and     $255, %r15d
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_encrypt_by_8_new\@:
        cmp     $(255-8), %r15d
        jg      _encrypt_by_8\@


        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, out_order, \ENC_DEC
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        jmp     _eight_cipher_left\@

_encrypt_by_8\@:
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $8, %r15b
        \GHASH_8_ENCRYPT_8_PARALLEL \REP, %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm9, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm15, in_order, \ENC_DEC
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        add     $128, %r11
        sub     $128, %r13
        jne     _encrypt_by_8_new\@

        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9


_eight_cipher_left\@:
        \GHASH_LAST_8 %xmm0, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8


_zero_cipher_left\@:
        vmovdqu %xmm14, AadHash(arg2)
        vmovdqu %xmm9, CurCount(arg2)

        # check for 0 length
        mov     arg5, %r13
        and     $15, %r13                       # r13 = (arg5 mod 16)

        je      _multiple_of_16_bytes\@

        # handle the last <16 Byte block separately

        mov     %r13, PBlockLen(arg2)

        vpaddd  ONE(%rip), %xmm9, %xmm9         # INCR CNT to get Yn
        vmovdqu %xmm9, CurCount(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9

        ENCRYPT_SINGLE_BLOCK \REP, %xmm9        # E(K, Yn)
        vmovdqu %xmm9, PBlockEncKey(arg2)

        cmp     $16, arg5
        jge     _large_enough_update\@

        lea     (arg4,%r11,1), %r10
        mov     %r13, %r12

        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        lea     SHIFT_MASK+16(%rip), %r12
        sub     %r13, %r12                      # adjust the shuffle mask pointer to be
                                                # able to shift 16-r13 bytes (r13 is the
                                                # number of bytes in plaintext mod 16)

        jmp     _final_ghash_mul\@

_large_enough_update\@:
        sub     $16, %r11
        add     %r13, %r11

        # receive the last <16 Byte block
        vmovdqu (arg4, %r11, 1), %xmm1

        sub     %r13, %r11
        add     $16, %r11

        lea     SHIFT_MASK+16(%rip), %r12
        # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
        # (r13 is the number of bytes in plaintext mod 16)
        sub     %r13, %r12
        # get the appropriate shuffle mask
        vmovdqu (%r12), %xmm2
        # shift right 16-r13 bytes
        vpshufb %xmm2, %xmm1, %xmm1

_final_ghash_mul\@:
        .if  \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm2
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm2, %xmm2
        vpshufb SHUF_MASK(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        .else
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)
        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1   # get the appropriate mask to
                                                # mask out top 16-r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out top 16-r13 bytes of xmm9
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9
        vpxor   %xmm9, %xmm14, %xmm14

        vmovdqu %xmm14, AadHash(arg2)
        vpshufb SHUF_MASK(%rip), %xmm9, %xmm9   # shuffle xmm9 back to output as ciphertext
        .endif


        #############################
        # output r13 Bytes
        vmovq   %xmm9, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left\@

        mov     %rax, (arg3 , %r11)
        add     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        vmovq   %xmm9, %rax
        sub     $8, %r13

_less_than_8_bytes_left\@:
        movb    %al, (arg3 , %r11)
        add     $1, %r11
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left\@
        #############################

_multiple_of_16_bytes\@:
.endm
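# Added note on the two loop flavors above: only the low byte of the
# counter changes within one 8-block iteration, so while that byte is
# <= 255-8 the eight counters are produced with ONEf adds directly in
# the stored byte order (out_order, no byte swaps).  When the low byte
# is about to wrap, the code falls back to in_order: byte-swap, eight
# ONE adds with proper carry, byte-swap back.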
# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authentication Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE GHASH_MUL REP AUTH_TAG AUTH_TAG_LEN
        vmovdqu AadHash(arg2), %xmm14
        vmovdqu HashKey(arg2), %xmm13

        mov     PBlockLen(arg2), %r12
        test    %r12, %r12
        je      _partial_done\@

        #GHASH computation for the last <16 Byte block
        \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
        mov     AadLen(arg2), %r12              # r12 = aadLen (number of bytes)
        shl     $3, %r12                        # convert into number of bits
        vmovd   %r12d, %xmm15                   # len(A) in xmm15

        mov     InLen(arg2), %r12
        shl     $3, %r12                        # len(C) in bits (*8)
        vmovq   %r12, %xmm1
        vpslldq $8, %xmm15, %xmm15              # xmm15 = len(A)|| 0x0000000000000000
        vpxor   %xmm1, %xmm15, %xmm15           # xmm15 = len(A)||len(C)

        vpxor   %xmm15, %xmm14, %xmm14
        \GHASH_MUL %xmm14, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6    # final GHASH computation
        vpshufb SHUF_MASK(%rip), %xmm14, %xmm14 # perform a 16Byte swap

        vmovdqu OrigIV(arg2), %xmm9

        ENCRYPT_SINGLE_BLOCK \REP, %xmm9        # E(K, Y0)

        vpxor   %xmm14, %xmm9, %xmm9



_return_T\@:
        mov     \AUTH_TAG, %r10                 # r10 = authTag
        mov     \AUTH_TAG_LEN, %r11             # r11 = auth_tag_len

        cmp     $16, %r11
        je      _T_16\@

        cmp     $8, %r11
        jl      _T_4\@

_T_8\@:
        vmovq   %xmm9, %rax
        mov     %rax, (%r10)
        add     $8, %r10
        sub     $8, %r11
        vpsrldq $8, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_4\@:
        vmovd   %xmm9, %eax
        mov     %eax, (%r10)
        add     $4, %r10
        sub     $4, %r11
        vpsrldq $4, %xmm9, %xmm9
        test    %r11, %r11
        je      _return_T_done\@
_T_123\@:
        vmovd   %xmm9, %eax
        cmp     $2, %r11
        jl      _T_1\@
        mov     %ax, (%r10)
        cmp     $2, %r11
        je      _return_T_done\@
        add     $2, %r10
        sar     $16, %eax
_T_1\@:
        mov     %al, (%r10)
        jmp     _return_T_done\@

_T_16\@:
        vmovdqu %xmm9, (%r10)

_return_T_done\@:
.endm
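# Tag-output sketch (added note): _return_T above stores the tag in
# 8-, 4-, 2- and 1-byte pieces, so the spec tag lengths 8, 12 and 16
# (and intermediate lengths down to 4) are written without storing
# past AUTH_TAG_LEN bytes.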
.macro CALC_AAD_HASH GHASH_MUL AAD AADLEN T1 T2 T3 T4 T5 T6 T7 T8

        mov     \AAD, %r10                      # r10 = AAD
        mov     \AADLEN, %r12                   # r12 = aadLen


        mov     %r12, %r11

        vpxor   \T8, \T8, \T8
        vpxor   \T7, \T7, \T7
        cmp     $16, %r11
        jl      _get_AAD_rest8\@
_get_AAD_blocks\@:
        vmovdqu (%r10), \T7
        vpshufb SHUF_MASK(%rip), \T7, \T7
        vpxor   \T7, \T8, \T8
        \GHASH_MUL       \T8, \T2, \T1, \T3, \T4, \T5, \T6
        add     $16, %r10
        sub     $16, %r12
        sub     $16, %r11
        cmp     $16, %r11
        jge     _get_AAD_blocks\@
        vmovdqu \T8, \T7
        test    %r11, %r11
        je      _get_AAD_done\@

        vpxor   \T7, \T7, \T7

        /* read the last <16B of AAD. since we have at least 4B of
        data right after the AAD (the ICV, and maybe some CT), we can
        read 4B/8B blocks safely, and then get rid of the extra stuff */
_get_AAD_rest8\@:
        cmp     $4, %r11
        jle     _get_AAD_rest4\@
        movq    (%r10), \T1
        add     $8, %r10
        sub     $8, %r11
        vpslldq $8, \T1, \T1
        vpsrldq $8, \T7, \T7
        vpxor   \T1, \T7, \T7
        jmp     _get_AAD_rest8\@
_get_AAD_rest4\@:
        test    %r11, %r11
        jle     _get_AAD_rest0\@
        mov     (%r10), %eax
        movq    %rax, \T1
        add     $4, %r10
        sub     $4, %r11
        vpslldq $12, \T1, \T1
        vpsrldq $4, \T7, \T7
        vpxor   \T1, \T7, \T7
_get_AAD_rest0\@:
        /* finalize: shift out the extra bytes we read, and align
        left. since pslldq can only shift by an immediate, we use
        vpshufb and an array of shuffle masks */
        movq    %r12, %r11
        salq    $4, %r11
        vmovdqu aad_shift_arr(%r11), \T1
        vpshufb \T1, \T7, \T7
_get_AAD_rest_final\@:
        vpshufb SHUF_MASK(%rip), \T7, \T7
        vpxor   \T8, \T7, \T7
        \GHASH_MUL       \T7, \T2, \T1, \T3, \T4, \T5, \T6

_get_AAD_done\@:
        vmovdqu \T7, AadHash(arg2)
.endm
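# Added note on the mask table: at _get_AAD_rest0 above, %r12 holds
# aadLen mod 16, so aad_shift_arr + (aadLen mod 16)*16 selects the
# vpshufb mask that aligns the bytes gathered by the 8B/4B reads and
# zeroes the over-read lanes (0xff mask bytes force a zero output).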
.macro INIT GHASH_MUL PRECOMPUTE
        mov     arg6, %r11
        mov     %r11, AadLen(arg2)              # ctx_data.aad_length = aad_length
        xor     %r11d, %r11d
        mov     %r11, InLen(arg2)               # ctx_data.in_length = 0

        mov     %r11, PBlockLen(arg2)           # ctx_data.partial_block_length = 0
        mov     %r11, PBlockEncKey(arg2)        # ctx_data.partial_block_enc_key = 0
        mov     arg3, %rax
        movdqu  (%rax), %xmm0
        movdqu  %xmm0, OrigIV(arg2)             # ctx_data.orig_IV = iv

        vpshufb SHUF_MASK(%rip), %xmm0, %xmm0
        movdqu  %xmm0, CurCount(arg2)           # ctx_data.current_counter = iv

        vmovdqu (arg4), %xmm6                   # xmm6 = HashKey

        vpshufb SHUF_MASK(%rip), %xmm6, %xmm6
        ###############  PRECOMPUTATION of HashKey<<1 mod poly from the HashKey
        vmovdqa %xmm6, %xmm2
        vpsllq  $1, %xmm6, %xmm6
        vpsrlq  $63, %xmm2, %xmm2
        vmovdqa %xmm2, %xmm1
        vpslldq $8, %xmm2, %xmm2
        vpsrldq $8, %xmm1, %xmm1
        vpor    %xmm2, %xmm6, %xmm6
        # reduction
        vpshufd $0b00100100, %xmm1, %xmm2
        vpcmpeqd TWOONE(%rip), %xmm2, %xmm2
        vpand   POLY(%rip), %xmm2, %xmm2
        vpxor   %xmm2, %xmm6, %xmm6             # xmm6 holds the HashKey<<1 mod poly
        #######################################################################
        vmovdqu %xmm6, HashKey(arg2)            # store HashKey<<1 mod poly

        CALC_AAD_HASH \GHASH_MUL, arg5, arg6, %xmm2, %xmm6, %xmm3, %xmm4, %xmm5, %xmm7, %xmm1, %xmm0

        \PRECOMPUTE %xmm6, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5
.endm


# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN
.macro READ_PARTIAL_BLOCK DPTR DLEN XMMDst
        vpxor   \XMMDst, \XMMDst, \XMMDst

        cmp     $8, \DLEN
        jl      _read_lt8_\@
        mov     (\DPTR), %rax
        vpinsrq $0, %rax, \XMMDst, \XMMDst
        sub     $8, \DLEN
        jz      _done_read_partial_block_\@
        xor     %eax, %eax
_read_next_byte_\@:
        shl     $8, %rax
        mov     7(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     _read_next_byte_\@
        vpinsrq $1, %rax, \XMMDst, \XMMDst
        jmp     _done_read_partial_block_\@
_read_lt8_\@:
        xor     %eax, %eax
_read_next_byte_lt8_\@:
        shl     $8, %rax
        mov     -1(\DPTR, \DLEN, 1), %al
        dec     \DLEN
        jnz     _read_next_byte_lt8_\@
        vpinsrq $0, %rax, \XMMDst, \XMMDst
_done_read_partial_block_\@:
.endm
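# READ_PARTIAL_BLOCK example (added, illustrative): for DLEN = 11 the
# macro loads bytes 0..7 with a single 8-byte mov into lane 0, then
# accumulates bytes 10, 9, 8 into %rax MSB-first and inserts them as
# lane 1, giving XMMDst = {in[0..10], 0, ...}.  Neither path ever
# dereferences past DPTR + DLEN - 1, which is why the caller only has
# to guarantee DLEN >= 1 and a buffer of DLEN bytes.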
# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK
# Outputs encrypted bytes, and updates hash and partial info in gcm_context_data
# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13
.macro PARTIAL_BLOCK GHASH_MUL CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \
        AAD_HASH ENC_DEC
        mov     PBlockLen(arg2), %r13
        test    %r13, %r13
        je      _partial_block_done_\@          # Leave Macro if no partial blocks
        # Read in input data without over reading
        cmp     $16, \PLAIN_CYPH_LEN
        jl      _fewer_than_16_bytes_\@
        vmovdqu (\PLAIN_CYPH_IN), %xmm1         # If more than 16 bytes, just fill xmm
        jmp     _data_read_\@

_fewer_than_16_bytes_\@:
        lea     (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10
        mov     \PLAIN_CYPH_LEN, %r12
        READ_PARTIAL_BLOCK %r10 %r12 %xmm1

        mov     PBlockLen(arg2), %r13

_data_read_\@:                                  # Finished reading in data

        vmovdqu PBlockEncKey(arg2), %xmm9
        vmovdqu HashKey(arg2), %xmm13

        lea     SHIFT_MASK(%rip), %r12

        # adjust the shuffle mask pointer to be able to shift r13 bytes
        # (16-r13 is the number of bytes in plaintext mod 16)
        add     %r13, %r12
        vmovdqu (%r12), %xmm2                   # get the appropriate shuffle mask
        vpshufb %xmm2, %xmm9, %xmm9             # shift right r13 bytes

.if \ENC_DEC ==  DEC
        vmovdqa %xmm1, %xmm3
        pxor    %xmm1, %xmm9                    # Cyphertext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_1_\@
        sub     %r10, %r12
_no_extra_mask_1_\@:

        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9             # mask out bottom r13 bytes of xmm9

        vpand   %xmm1, %xmm3, %xmm3
        vmovdqa SHUF_MASK(%rip), %xmm10
        vpshufb %xmm10, %xmm3, %xmm3
        vpshufb %xmm2, %xmm3, %xmm3
        vpxor   %xmm3, \AAD_HASH, \AAD_HASH

        test    %r10, %r10
        jl      _partial_incomplete_1_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax,%eax

        mov     %rax, PBlockLen(arg2)
        jmp     _dec_done_\@
_partial_incomplete_1_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
_dec_done_\@:
        vmovdqu \AAD_HASH, AadHash(arg2)
.else
        vpxor   %xmm1, %xmm9, %xmm9             # Plaintext XOR E(K, Yn)

        mov     \PLAIN_CYPH_LEN, %r10
        add     %r13, %r10
        # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling
        sub     $16, %r10
        # Determine if partial block is not being filled and
        # shift mask accordingly
        jge     _no_extra_mask_2_\@
        sub     %r10, %r12
_no_extra_mask_2_\@:

        vmovdqu ALL_F-SHIFT_MASK(%r12), %xmm1
        # get the appropriate mask to mask out bottom r13 bytes of xmm9
        vpand   %xmm1, %xmm9, %xmm9

        vmovdqa SHUF_MASK(%rip), %xmm1
        vpshufb %xmm1, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
        vpxor   %xmm9, \AAD_HASH, \AAD_HASH

        test    %r10, %r10
        jl      _partial_incomplete_2_\@

        # GHASH computation for the last <16 Byte block
        \GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6
        xor     %eax,%eax

        mov     %rax, PBlockLen(arg2)
        jmp     _encode_done_\@
_partial_incomplete_2_\@:
        add     \PLAIN_CYPH_LEN, PBlockLen(arg2)
_encode_done_\@:
        vmovdqu \AAD_HASH, AadHash(arg2)

        vmovdqa SHUF_MASK(%rip), %xmm10
        # shuffle xmm9 back to output as ciphertext
        vpshufb %xmm10, %xmm9, %xmm9
        vpshufb %xmm2, %xmm9, %xmm9
.endif
        # output encrypted Bytes
        test    %r10, %r10
        jl      _partial_fill_\@
        mov     %r13, %r12
        mov     $16, %r13
        # Set r13 to be the number of bytes to write out
        sub     %r12, %r13
        jmp     _count_set_\@
_partial_fill_\@:
        mov     \PLAIN_CYPH_LEN, %r13
_count_set_\@:
        vmovdqa %xmm9, %xmm0
        vmovq   %xmm0, %rax
        cmp     $8, %r13
        jle     _less_than_8_bytes_left_\@

        mov     %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $8, \DATA_OFFSET
        psrldq  $8, %xmm0
        vmovq   %xmm0, %rax
        sub     $8, %r13
_less_than_8_bytes_left_\@:
        movb    %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1)
        add     $1, \DATA_OFFSET
        shr     $8, %rax
        sub     $1, %r13
        jne     _less_than_8_bytes_left_\@
_partial_block_done_\@:
.endm # PARTIAL_BLOCK

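# Added note on the multiply below: the 128x128-bit carryless product
# is formed Karatsuba-style from three vpclmulqdq instructions,
#
#	a*b = a1*b1*x^128 + a0*b0
#	      + ((a1+a0)*(b1+b0) + a1*b1 + a0*b0)*x^64
#
# where + is XOR in GF(2); this is also why PRECOMPUTE caches a1^a0 of
# every HashKey power in the HashKey_i_k slots.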
###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX GH HK T1 T2 T3 T4 T5

        vpshufd         $0b01001110, \GH, \T2
        vpshufd         $0b01001110, \HK, \T3
        vpxor           \GH     , \T2, \T2      # T2 = (a1+a0)
        vpxor           \HK     , \T3, \T3      # T3 = (b1+b0)

        vpclmulqdq      $0x11, \HK, \GH, \T1    # T1 = a1*b1
        vpclmulqdq      $0x00, \HK, \GH, \GH    # GH = a0*b0
        vpclmulqdq      $0x00, \T3, \T2, \T2    # T2 = (a1+a0)*(b1+b0)
        vpxor           \GH, \T2,\T2
        vpxor           \T1, \T2,\T2            # T2 = a0*b1+a1*b0

        vpslldq         $8, \T2,\T3             # shift-L T3 2 DWs
        vpsrldq         $8, \T2,\T2             # shift-R T2 2 DWs
        vpxor           \T3, \GH, \GH
        vpxor           \T2, \T1, \T1           # <T1:GH> = GH x HK

        #first phase of the reduction
        vpslld  $31, \GH, \T2                   # packed left shift << 31
        vpslld  $30, \GH, \T3                   # packed left shift << 30
        vpslld  $25, \GH, \T4                   # packed left shift << 25

        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T5                    # shift-R T5 1 DW

        vpslldq $12, \T2, \T2                   # shift-L T2 3 DWs
        vpxor   \T2, \GH, \GH                   # first phase of the reduction complete

        #second phase of the reduction

        vpsrld  $1,\GH, \T2                     # packed right shift >> 1
        vpsrld  $2,\GH, \T3                     # packed right shift >> 2
        vpsrld  $7,\GH, \T4                     # packed right shift >> 7
        vpxor   \T3, \T2, \T2                   # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T5, \T2, \T2
        vpxor   \T2, \GH, \GH
        vpxor   \T1, \GH, \GH                   # the result is in GH

.endm
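# Reduction sketch (added note): the two "phases" above fold the
# 256-bit product <T1:GH> back to 128 bits modulo the bit-reflected
# GCM polynomial.  The >>1/>>2/>>7 shifts (and their 32-bit
# complements <<31/<<30/<<25) correspond to the x^127, x^126 and x^121
# terms of poly = x^128 + x^127 + x^126 + x^121 + 1 after reflection.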
.macro  PRECOMPUTE_AVX HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
        vmovdqa  \HK, \T5

        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^2<<1 mod poly
        vmovdqu  \T5, HashKey_2(arg2)                     #  [HashKey_2] = HashKey^2<<1 mod poly
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_2_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^3<<1 mod poly
        vmovdqu  \T5, HashKey_3(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_3_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^4<<1 mod poly
        vmovdqu  \T5, HashKey_4(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_4_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^5<<1 mod poly
        vmovdqu  \T5, HashKey_5(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_5_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^6<<1 mod poly
        vmovdqu  \T5, HashKey_6(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_6_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^7<<1 mod poly
        vmovdqu  \T5, HashKey_7(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_7_k(arg2)

        GHASH_MUL_AVX \T5, \HK, \T1, \T3, \T4, \T6, \T2   #  T5 = HashKey^8<<1 mod poly
        vmovdqu  \T5, HashKey_8(arg2)
        vpshufd  $0b01001110, \T5, \T1
        vpxor    \T5, \T1, \T1
        vmovdqu  \T1, HashKey_8_k(arg2)

.endm
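# PRECOMPUTE_AVX usage sketch (added): INIT invokes it once per key
# with HashKey<<1 mod poly in HK; every GHASH_MUL_AVX above multiplies
# the running power by HK again, so on return the context holds
# HashKey^1..HashKey^8 (each <<1 mod poly) plus their folded _k
# halves, ready for the 8-way parallel GHASH below.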
## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 8
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC
        i = (8-\num_initial_blocks)
        setreg
        vmovdqu AadHash(arg2), reg_i

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
        i = (i+1)
        setreg
.endr

        vmovdqa (arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = 1
        setreg
.rep \REP
        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        j = (j+1)
        setreg
.endr

        vmovdqa 16*j(arg1), \T_key
        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vaesenclast      \T_key, reg_i, reg_i
        i = (i+1)
        setreg
.endr

        i = (9-\num_initial_blocks)
        setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3 , %r11)            # write back ciphertext for num_initial_blocks blocks
        add     $16, %r11
.if  \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
        i = (i+1)
        setreg
.endr


        i = (8-\num_initial_blocks)
        j = (9-\num_initial_blocks)
        setreg

.rep \num_initial_blocks
        vpxor    reg_i, reg_j, reg_j
        GHASH_MUL_AVX       reg_j, \T2, \T1, \T3, \T4, \T5, \T6    # apply GHASH on num_initial_blocks blocks
        i = (i+1)
        j = (j+1)
        setreg
.endr
        # XMM8 has the combined result here

        vmovdqa  \XMM8, TMP1(%rsp)
        vmovdqa  \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@          # no need for precomputed constants

###############################################################################
# prepare 8 counter blocks and do the AES rounds for the next 8 ciphertexts
        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM1
        vpshufb  SHUF_MASK(%rip), \XMM1, \XMM1  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM2
        vpshufb  SHUF_MASK(%rip), \XMM2, \XMM2  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM3
        vpshufb  SHUF_MASK(%rip), \XMM3, \XMM3  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM4
        vpshufb  SHUF_MASK(%rip), \XMM4, \XMM4  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM5
        vpshufb  SHUF_MASK(%rip), \XMM5, \XMM5  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM6
        vpshufb  SHUF_MASK(%rip), \XMM6, \XMM6  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM7
        vpshufb  SHUF_MASK(%rip), \XMM7, \XMM7  # perform a 16Byte swap

        vpaddd   ONE(%rip), \CTR, \CTR          # INCR Y0
        vmovdqa  \CTR, \XMM8
        vpshufb  SHUF_MASK(%rip), \XMM8, \XMM8  # perform a 16Byte swap

        vmovdqa  (arg1), \T_key
        vpxor    \T_key, \XMM1, \XMM1
        vpxor    \T_key, \XMM2, \XMM2
        vpxor    \T_key, \XMM3, \XMM3
        vpxor    \T_key, \XMM4, \XMM4
        vpxor    \T_key, \XMM5, \XMM5
        vpxor    \T_key, \XMM6, \XMM6
        vpxor    \T_key, \XMM7, \XMM7
        vpxor    \T_key, \XMM8, \XMM8

        i = 1
        setreg
.rep    \REP                                    # do REP rounds
        vmovdqa  16*i(arg1), \T_key
        vaesenc  \T_key, \XMM1, \XMM1
        vaesenc  \T_key, \XMM2, \XMM2
        vaesenc  \T_key, \XMM3, \XMM3
        vaesenc  \T_key, \XMM4, \XMM4
        vaesenc  \T_key, \XMM5, \XMM5
        vaesenc  \T_key, \XMM6, \XMM6
        vaesenc  \T_key, \XMM7, \XMM7
        vaesenc  \T_key, \XMM8, \XMM8
        i = (i+1)
        setreg
.endr

        vmovdqa  16*i(arg1), \T_key
        vaesenclast  \T_key, \XMM1, \XMM1
        vaesenclast  \T_key, \XMM2, \XMM2
        vaesenclast  \T_key, \XMM3, \XMM3
        vaesenclast  \T_key, \XMM4, \XMM4
        vaesenclast  \T_key, \XMM5, \XMM5
        vaesenclast  \T_key, \XMM6, \XMM6
        vaesenclast  \T_key, \XMM7, \XMM7
        vaesenclast  \T_key, \XMM8, \XMM8

        vmovdqu  (arg4, %r11), \T1
        vpxor    \T1, \XMM1, \XMM1
        vmovdqu  \XMM1, (arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM1
        .endif

        vmovdqu  16*1(arg4, %r11), \T1
        vpxor    \T1, \XMM2, \XMM2
        vmovdqu  \XMM2, 16*1(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM2
        .endif

        vmovdqu  16*2(arg4, %r11), \T1
        vpxor    \T1, \XMM3, \XMM3
        vmovdqu  \XMM3, 16*2(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa  \T1, \XMM3
        .endif

        vmovdqu  16*3(arg4, %r11), \T1
        vpxor    \T1, \XMM4, \XMM4
        vmovdqu  \XMM4, 16*3(arg3 , %r11)
        .if   \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg4, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg4, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg4, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg4, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:

.endm
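
########################################################################
## For reference: the counter blocks above are kept byte-reversed in the
## xmm registers so that vpaddd ONE(%rip) on the low dword implements the
## standard GCM 32-bit counter increment, and vpshufb SHUF_MASK(%rip)
## converts to the wire (big-endian) layout before AES. A small Python
## sketch of that equivalence (helper names are illustrative, not from
## this file):
##
##      import struct
##
##      def inc32(block: bytes) -> bytes:
##          # standard GCM increment: only the last 32 bits (big-endian) wrap
##          ctr = struct.unpack(">I", block[12:])[0]
##          return block[:12] + struct.pack(">I", (ctr + 1) & 0xFFFFFFFF)
##
##      def inc32_via_swapped(block: bytes) -> bytes:
##          # what the assembly does: keep the block reversed, add 1 to the
##          # low little-endian dword (vpaddd ONE), reverse again for output
##          s = bytearray(block[::-1])                 # vpshufb SHUF_MASK
##          lo = struct.unpack("<I", bytes(s[:4]))[0]
##          s[:4] = struct.pack("<I", (lo + 1) & 0xFFFFFFFF)
##          return bytes(s)[::-1]                      # vpshufb SHUF_MASK
##
##      blk = bytes(range(12)) + b"\xff\xff\xff\xff"   # forces the 32-bit wrap
##      assert inc32(blk) == inc32_via_swapped(blk)
########################################################################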

# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################


        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0

        vpshufd $0b01001110, \T2, \T6
        vpxor   \T2, \T6, \T6

        vmovdqu HashKey_8_k(arg2), \T5
        vpclmulqdq      $0x00, \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_7_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_6_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_5_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_4_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_3_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6


        vmovdqu 16*8(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP7(%rsp), \T1
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_2_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        #######################################################################

        vmovdqu 16*9(arg1), \T5
        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqa TMP8(%rsp), \T1
        vmovdqu HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpshufd $0b01001110, \T1, \T3
        vpxor   \T1, \T3, \T3
        vmovdqu HashKey_k(arg2), \T5
        vpclmulqdq      $0x10, \T5, \T3, \T3
        vpxor   \T3, \T6, \T6

        vpxor   \T4, \T6, \T6
        vpxor   \T7, \T6, \T6

        vmovdqu 16*10(arg1), \T5

i = 11
setreg
.rep (\REP-9)

        vaesenc \T5, \XMM1, \XMM1
        vaesenc \T5, \XMM2, \XMM2
        vaesenc \T5, \XMM3, \XMM3
        vaesenc \T5, \XMM4, \XMM4
        vaesenc \T5, \XMM5, \XMM5
        vaesenc \T5, \XMM6, \XMM6
        vaesenc \T5, \XMM7, \XMM7
        vaesenc \T5, \XMM8, \XMM8

        vmovdqu 16*i(arg1), \T5
i = i + 1
setreg
.endr

i = 0
j = 1
setreg
.rep 8
        vpxor   16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
        vaesenclast     \T2, reg_j, reg_j
        .else
        vaesenclast     \T2, reg_j, \T3
        vmovdqu 16*i(arg4, %r11), reg_j
        vmovdqu \T3, 16*i(arg3, %r11)
        .endif
i = (i+1)
j = (j+1)
setreg
.endr
        #######################################################################


        vpslldq $8, \T6, \T3            # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6            # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T4, \T6, \T6           # accumulate the results in T6:T7



        #######################################################################
        #first phase of the reduction
        #######################################################################
        vpslld  $31, \T7, \T2           # packed right shifting << 31
        vpslld  $30, \T7, \T3           # packed right shifting << 30
        vpslld  $25, \T7, \T4           # packed right shifting << 25

        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1            # shift-R T1 1 DW

        vpslldq $12, \T2, \T2           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu \XMM1, 16*0(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM2, 16*1(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM3, 16*2(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM4, 16*3(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM5, 16*4(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM6, 16*5(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM7, 16*6(arg3,%r11)  # Write to the Ciphertext buffer
        vmovdqu \XMM8, 16*7(arg3,%r11)  # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpsrld  $1, \T7, \T2            # packed left shifting >> 1
        vpsrld  $2, \T7, \T3            # packed left shifting >> 2
        vpsrld  $7, \T7, \T4            # packed left shifting >> 7
        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6           # the result is in T6
        #######################################################################

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap


        vpxor   \T6, \XMM1, \XMM1


.endm
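
########################################################################
## For reference: taken together, the carry-less multiplies and the two
## reduction phases above compute a GF(2^128) multiplication in GCM's
## bit-reflected representation. A pure-Python sketch of the underlying
## field multiply, in the right-shift formulation of the GCM spec
## (128-bit integers, most significant bit first; the names are
## illustrative, not from this file):
##
##      R = 0xE1 << 120    # x^128 + x^127 + x^126 + x^121 folded into 128 bits
##
##      def gf128_mul(x: int, y: int) -> int:
##          z, v = 0, x
##          for i in range(127, -1, -1):
##              if (y >> i) & 1:
##                  z ^= v
##              v = (v >> 1) ^ (R if v & 1 else 0)
##          return z
########################################################################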

# GHASH the last 8 ciphertext blocks.
.macro GHASH_LAST_8_AVX T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method


        vpshufd $0b01001110, \XMM1, \T2
        vpxor   \XMM1, \T2, \T2
        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vmovdqu HashKey_8_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vpshufd $0b01001110, \XMM2, \T2
        vpxor   \XMM2, \T2, \T2
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_7_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM3, \T2
        vpxor   \XMM3, \T2, \T2
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_6_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM4, \T2
        vpxor   \XMM4, \T2, \T2
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_5_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM5, \T2
        vpxor   \XMM5, \T2, \T2
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_4_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM6, \T2
        vpxor   \XMM6, \T2, \T2
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_3_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM7, \T2
        vpxor   \XMM7, \T2, \T2
        vmovdqu HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_2_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2
        vpxor   \T2, \XMM1, \XMM1

        ######################

        vpshufd $0b01001110, \XMM8, \T2
        vpxor   \XMM8, \T2, \T2
        vmovdqu HashKey(arg2), \T5
        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor   \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor   \T4, \T7, \T7

        vmovdqu HashKey_k(arg2), \T3
        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor   \T2, \XMM1, \XMM1
        vpxor   \T6, \XMM1, \XMM1
        vpxor   \T7, \XMM1, \T2




        vpslldq $8, \T2, \T4
        vpsrldq $8, \T2, \T2

        vpxor   \T4, \T7, \T7
        vpxor   \T2, \T6, \T6           # <T6:T7> holds the result of
                                        # the accumulated carry-less multiplications

        #######################################################################
        #first phase of the reduction
        vpslld  $31, \T7, \T2           # packed right shifting << 31
        vpslld  $30, \T7, \T3           # packed right shifting << 30
        vpslld  $25, \T7, \T4           # packed right shifting << 25

        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpsrldq $4, \T2, \T1            # shift-R T1 1 DW

        vpslldq $12, \T2, \T2           # shift-L T2 3 DWs
        vpxor   \T2, \T7, \T7           # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpsrld  $1, \T7, \T2            # packed left shifting >> 1
        vpsrld  $2, \T7, \T3            # packed left shifting >> 2
        vpsrld  $7, \T7, \T4            # packed left shifting >> 7
        vpxor   \T3, \T2, \T2           # xor the shifted versions
        vpxor   \T4, \T2, \T2

        vpxor   \T1, \T2, \T2
        vpxor   \T2, \T7, \T7
        vpxor   \T7, \T6, \T6           # the result is in T6

.endm
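
########################################################################
## For reference: GHASH_LAST_8_AVX above uses the Karatsuba trick; the
## vpshufd/vpxor pair forms (a1 ^ a0) of each block and the HashKey_i_k
## tables hold (b1 ^ b0) of each key power, so every 128x128 carry-less
## product costs three 64x64 vpclmulqdq instead of four. A Python sketch
## (helper names are illustrative, not from this file):
##
##      MASK64 = (1 << 64) - 1
##
##      def clmul64(a: int, b: int) -> int:
##          # carry-less (polynomial) 64x64 -> 128-bit multiply
##          r = 0
##          for i in range(64):
##              if (b >> i) & 1:
##                  r ^= a << i
##          return r
##
##      def clmul128_karatsuba(a: int, b: int) -> int:
##          a1, a0 = a >> 64, a & MASK64
##          b1, b0 = b >> 64, b & MASK64
##          hi  = clmul64(a1, b1)                       # vpclmulqdq $0x11
##          lo  = clmul64(a0, b0)                       # vpclmulqdq $0x00
##          mid = clmul64(a1 ^ a0, b1 ^ b0) ^ hi ^ lo   # the HashKey_i_k term
##          return (hi << 128) ^ (mid << 64) ^ lo
########################################################################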

#############################################################
#void   aesni_gcm_init_avx_gen2
#        (gcm_data     *my_ctx_data,
#         gcm_context_data *data,
#         u8 *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         u8 *iv,          /* Pre-counter block j0: 4 byte salt
#                             (from Security Association) concatenated with 8 byte
#                             Initialisation Vector (from IPSec ESP Payload)
#                             concatenated with 0x00000001. 16-byte aligned pointer. */
#         const u8 *aad,   /* Additional Authentication Data (AAD)*/
#         u64 aad_len)     /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen2)
        FUNC_SAVE
        INIT GHASH_MUL_AVX, PRECOMPUTE_AVX
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init_avx_gen2)

###############################################################################
#void   aesni_gcm_enc_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const   u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen2)
        FUNC_SAVE
        mov     keysize, %eax
        cmp     $32, %eax
        je      key_256_enc_update
        cmp     $16, %eax
        je      key_128_enc_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 11
        FUNC_RESTORE
        RET
key_128_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 9
        FUNC_RESTORE
        RET
key_256_enc_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen2)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const   u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update
        cmp     $16, %eax
        je      key_128_dec_update
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 11
        FUNC_RESTORE
        RET
key_128_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 9
        FUNC_RESTORE
        RET
key_256_dec_update:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX, GHASH_8_ENCRYPT_8_PARALLEL_AVX, GHASH_LAST_8_AVX, GHASH_MUL_AVX, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen2)

###############################################################################
#void   aesni_gcm_finalize_avx_gen2(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen2)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize
        cmp     $16, %eax
        je      key_128_finalize
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX, 11, arg3, arg4
        FUNC_RESTORE
        RET
key_128_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 9, arg3, arg4
        FUNC_RESTORE
        RET
key_256_finalize:
        GCM_COMPLETE GHASH_MUL_AVX, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen2)
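
########################################################################
## For reference: the keysize dispatch above passes REP = 9, 11 or 13.
## That is the AES round count minus one, since the .rep loops run the
## middle vaesenc rounds and the final vaesenclast round is issued
## separately. A small sketch of the mapping (names are illustrative):
##
##      def aes_rounds(key_bytes: int) -> int:
##          # 10/12/14 rounds for AES-128/192/256
##          return {16: 10, 24: 12, 32: 14}[key_bytes]
##
##      def rep_for(key_bytes: int) -> int:
##          return aes_rounds(key_bytes) - 1
##
##      assert [rep_for(k) for k in (16, 24, 32)] == [9, 11, 13]
########################################################################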

###############################################################################
# GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
# Input: A and B (128-bits each, bit-reflected)
# Output: C = A*B*x mod poly, (i.e. >>1 )
# To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
# GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
###############################################################################
.macro  GHASH_MUL_AVX2 GH HK T1 T2 T3 T4 T5

        vpclmulqdq      $0x11,\HK,\GH,\T1      # T1 = a1*b1
        vpclmulqdq      $0x00,\HK,\GH,\T2      # T2 = a0*b0
        vpclmulqdq      $0x01,\HK,\GH,\T3      # T3 = a1*b0
        vpclmulqdq      $0x10,\HK,\GH,\GH      # GH = a0*b1
        vpxor           \T3, \GH, \GH


        vpsrldq         $8 , \GH, \T3          # shift-R GH 2 DWs
        vpslldq         $8 , \GH, \GH          # shift-L GH 2 DWs

        vpxor           \T3, \T1, \T1
        vpxor           \T2, \GH, \GH

        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \GH, \T3, \T2
        vpslldq         $8, \T2, \T2           # shift-L T2 2 DWs

        vpxor           \T2, \GH, \GH          # first phase of the reduction complete
        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \GH, \T3, \T2
        vpsrldq         $4, \T2, \T2           # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \GH, \T3, \GH
        vpslldq         $4, \GH, \GH           # shift-L GH 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \GH, \GH          # second phase of the reduction complete
        #######################################################################
        vpxor           \T1, \GH, \GH          # the result is in GH


.endm
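
########################################################################
## For reference: unlike the Karatsuba variants above, GHASH_MUL_AVX2
## forms the full schoolbook product with four vpclmulqdq (immediates
## $0x11, $0x00, $0x01, $0x10 select a1*b1, a0*b0, a1*b0, a0*b1) before
## the POLY2-based reduction. A Python sketch of that 256-bit product
## (helper names are illustrative, not from this file):
##
##      MASK64 = (1 << 64) - 1
##
##      def clmul64(a: int, b: int) -> int:
##          r = 0
##          for i in range(64):
##              if (b >> i) & 1:
##                  r ^= a << i
##          return r
##
##      def clmul128_schoolbook(a: int, b: int) -> int:
##          a1, a0 = a >> 64, a & MASK64
##          b1, b0 = b >> 64, b & MASK64
##          hi  = clmul64(a1, b1)                       # $0x11
##          lo  = clmul64(a0, b0)                       # $0x00
##          mid = clmul64(a1, b0) ^ clmul64(a0, b1)     # $0x01 ^ $0x10
##          return (hi << 128) ^ (mid << 64) ^ lo
########################################################################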

.macro PRECOMPUTE_AVX2 HK T1 T2 T3 T4 T5 T6

        # HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vmovdqa \HK, \T5
        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^2<<1 mod poly
        vmovdqu \T5, HashKey_2(arg2)                           # [HashKey_2] = HashKey^2<<1 mod poly

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^3<<1 mod poly
        vmovdqu \T5, HashKey_3(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^4<<1 mod poly
        vmovdqu \T5, HashKey_4(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^5<<1 mod poly
        vmovdqu \T5, HashKey_5(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^6<<1 mod poly
        vmovdqu \T5, HashKey_6(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^7<<1 mod poly
        vmovdqu \T5, HashKey_7(arg2)

        GHASH_MUL_AVX2 \T5, \HK, \T1, \T3, \T4, \T6, \T2       # T5 = HashKey^8<<1 mod poly
        vmovdqu \T5, HashKey_8(arg2)

.endm
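
########################################################################
## For reference: PRECOMPUTE_AVX2 fills HashKey_2..HashKey_8 with
## successive powers of the hash key so that eight ciphertext blocks can
## be folded per pass. The real code stores the <<1-mod-poly form; this
## sketch shows only the power schedule, using the spec-order multiply
## (names are illustrative, not from this file):
##
##      R = 0xE1 << 120
##
##      def gf128_mul(x: int, y: int) -> int:
##          z, v = 0, x
##          for i in range(127, -1, -1):
##              if (y >> i) & 1:
##                  z ^= v
##              v = (v >> 1) ^ (R if v & 1 else 0)
##          return z
##
##      def hashkey_table(h: int, n: int = 8) -> list:
##          # table[0] = H ("HashKey"), table[i] = H^(i+1) ("HashKey_<i+1>")
##          table = [h]
##          for _ in range(n - 1):
##              table.append(gf128_mul(table[-1], h))
##          return table
########################################################################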

## if a = number of total plaintext bytes
## b = floor(a/16)
## num_initial_blocks = b mod 4
## encrypt the initial num_initial_blocks blocks and apply ghash on the ciphertext
## r10, r11, r12, rax are clobbered
## arg1, arg3, arg4, r14 are used as a pointer only, not modified

.macro INITIAL_BLOCKS_AVX2 REP num_initial_blocks T1 T2 T3 T4 T5 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T6 T_key ENC_DEC VER
i = (8-\num_initial_blocks)
setreg
        vmovdqu AadHash(arg2), reg_i

        # start AES for num_initial_blocks blocks
        vmovdqu CurCount(arg2), \CTR

i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
        vpaddd  ONE(%rip), \CTR, \CTR   # INCR Y0
        vmovdqa \CTR, reg_i
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # perform a 16Byte swap
i = (i+1)
setreg
.endr

        vmovdqa (arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
        vpxor   \T_key, reg_i, reg_i
i = (i+1)
setreg
.endr

j = 1
setreg
.rep \REP
        vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
        vaesenc \T_key, reg_i, reg_i
i = (i+1)
setreg
.endr

j = (j+1)
setreg
.endr


        vmovdqa 16*j(arg1), \T_key
i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
        vaesenclast     \T_key, reg_i, reg_i
i = (i+1)
setreg
.endr

i = (9-\num_initial_blocks)
setreg
.rep \num_initial_blocks
        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, reg_i, reg_i
        vmovdqu reg_i, (arg3, %r11)     # write back ciphertext for num_initial_blocks blocks
        add     $16, %r11
.if \ENC_DEC == DEC
        vmovdqa \T1, reg_i
.endif
        vpshufb SHUF_MASK(%rip), reg_i, reg_i   # prepare ciphertext for GHASH computations
i = (i+1)
setreg
.endr


i = (8-\num_initial_blocks)
j = (9-\num_initial_blocks)
setreg

.rep \num_initial_blocks
        vpxor   reg_i, reg_j, reg_j
        GHASH_MUL_AVX2  reg_j, \T2, \T1, \T3, \T4, \T5, \T6    # apply GHASH on num_initial_blocks blocks
i = (i+1)
j = (j+1)
setreg
.endr
        # XMM8 has the combined result here

        vmovdqa \XMM8, TMP1(%rsp)
        vmovdqa \XMM8, \T3

        cmp     $128, %r13
        jl      _initial_blocks_done\@  # no need for precomputed constants

###############################################################################
# HashKey_i_k holds XORed values of the low and high parts of HashKey_i
        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM1
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM2
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM3
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM4
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM5
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM6
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM7
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap

        vpaddd  ONE(%rip), \CTR, \CTR           # INCR Y0
        vmovdqa \CTR, \XMM8
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

        vmovdqa (arg1), \T_key
        vpxor   \T_key, \XMM1, \XMM1
        vpxor   \T_key, \XMM2, \XMM2
        vpxor   \T_key, \XMM3, \XMM3
        vpxor   \T_key, \XMM4, \XMM4
        vpxor   \T_key, \XMM5, \XMM5
        vpxor   \T_key, \XMM6, \XMM6
        vpxor   \T_key, \XMM7, \XMM7
        vpxor   \T_key, \XMM8, \XMM8

i = 1
setreg
.rep \REP       # do REP rounds
        vmovdqa 16*i(arg1), \T_key
        vaesenc \T_key, \XMM1, \XMM1
        vaesenc \T_key, \XMM2, \XMM2
        vaesenc \T_key, \XMM3, \XMM3
        vaesenc \T_key, \XMM4, \XMM4
        vaesenc \T_key, \XMM5, \XMM5
        vaesenc \T_key, \XMM6, \XMM6
        vaesenc \T_key, \XMM7, \XMM7
        vaesenc \T_key, \XMM8, \XMM8
i = (i+1)
setreg
.endr


        vmovdqa 16*i(arg1), \T_key
        vaesenclast \T_key, \XMM1, \XMM1
        vaesenclast \T_key, \XMM2, \XMM2
        vaesenclast \T_key, \XMM3, \XMM3
        vaesenclast \T_key, \XMM4, \XMM4
        vaesenclast \T_key, \XMM5, \XMM5
        vaesenclast \T_key, \XMM6, \XMM6
        vaesenclast \T_key, \XMM7, \XMM7
        vaesenclast \T_key, \XMM8, \XMM8

        vmovdqu (arg4, %r11), \T1
        vpxor   \T1, \XMM1, \XMM1
        vmovdqu \XMM1, (arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM1
        .endif

        vmovdqu 16*1(arg4, %r11), \T1
        vpxor   \T1, \XMM2, \XMM2
        vmovdqu \XMM2, 16*1(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM2
        .endif

        vmovdqu 16*2(arg4, %r11), \T1
        vpxor   \T1, \XMM3, \XMM3
        vmovdqu \XMM3, 16*2(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM3
        .endif

        vmovdqu 16*3(arg4, %r11), \T1
        vpxor   \T1, \XMM4, \XMM4
        vmovdqu \XMM4, 16*3(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM4
        .endif

        vmovdqu 16*4(arg4, %r11), \T1
        vpxor   \T1, \XMM5, \XMM5
        vmovdqu \XMM5, 16*4(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM5
        .endif

        vmovdqu 16*5(arg4, %r11), \T1
        vpxor   \T1, \XMM6, \XMM6
        vmovdqu \XMM6, 16*5(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM6
        .endif

        vmovdqu 16*6(arg4, %r11), \T1
        vpxor   \T1, \XMM7, \XMM7
        vmovdqu \XMM7, 16*6(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM7
        .endif

        vmovdqu 16*7(arg4, %r11), \T1
        vpxor   \T1, \XMM8, \XMM8
        vmovdqu \XMM8, 16*7(arg3, %r11)
        .if \ENC_DEC == DEC
        vmovdqa \T1, \XMM8
        .endif

        add     $128, %r11

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpxor   TMP1(%rsp), \XMM1, \XMM1        # combine GHASHed value with the corresponding ciphertext
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap

###############################################################################

_initial_blocks_done\@:


.endm
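
########################################################################
## For reference: the vpxor reg_i, reg_j / GHASH_MUL_AVX2 chain in the
## macro above is the usual GHASH folding, X_i = (X_{i-1} XOR C_i) * H.
## A minimal Python sketch, with the multiply passed in (e.g. a
## gf128_mul reference; names are illustrative, not from this file):
##
##      def ghash(h: int, blocks: list, mul) -> int:
##          x = 0
##          for c in blocks:
##              x = mul(x ^ c, h)
##          return x
########################################################################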


# encrypt 8 blocks at a time
# ghash the 8 previously encrypted ciphertext blocks
# arg1, arg3, arg4 are used as pointers only, not modified
# r11 is the data offset value
.macro GHASH_8_ENCRYPT_8_PARALLEL_AVX2 REP T1 T2 T3 T4 T5 T6 CTR XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 T7 loop_idx ENC_DEC

        vmovdqa \XMM1, \T2
        vmovdqa \XMM2, TMP2(%rsp)
        vmovdqa \XMM3, TMP3(%rsp)
        vmovdqa \XMM4, TMP4(%rsp)
        vmovdqa \XMM5, TMP5(%rsp)
        vmovdqa \XMM6, TMP6(%rsp)
        vmovdqa \XMM7, TMP7(%rsp)
        vmovdqa \XMM8, TMP8(%rsp)

.if \loop_idx == in_order
        vpaddd  ONE(%rip), \CTR, \XMM1          # INCR CNT
        vpaddd  ONE(%rip), \XMM1, \XMM2
        vpaddd  ONE(%rip), \XMM2, \XMM3
        vpaddd  ONE(%rip), \XMM3, \XMM4
        vpaddd  ONE(%rip), \XMM4, \XMM5
        vpaddd  ONE(%rip), \XMM5, \XMM6
        vpaddd  ONE(%rip), \XMM6, \XMM7
        vpaddd  ONE(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR

        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7   # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8   # perform a 16Byte swap
.else
        vpaddd  ONEf(%rip), \CTR, \XMM1         # INCR CNT
        vpaddd  ONEf(%rip), \XMM1, \XMM2
        vpaddd  ONEf(%rip), \XMM2, \XMM3
        vpaddd  ONEf(%rip), \XMM3, \XMM4
        vpaddd  ONEf(%rip), \XMM4, \XMM5
        vpaddd  ONEf(%rip), \XMM5, \XMM6
        vpaddd  ONEf(%rip), \XMM6, \XMM7
        vpaddd  ONEf(%rip), \XMM7, \XMM8
        vmovdqa \XMM8, \CTR
.endif


        #######################################################################

        vmovdqu (arg1), \T1
        vpxor   \T1, \XMM1, \XMM1
        vpxor   \T1, \XMM2, \XMM2
        vpxor   \T1, \XMM3, \XMM3
        vpxor   \T1, \XMM4, \XMM4
        vpxor   \T1, \XMM5, \XMM5
        vpxor   \T1, \XMM6, \XMM6
        vpxor   \T1, \XMM7, \XMM7
        vpxor   \T1, \XMM8, \XMM8

        #######################################################################


        vmovdqu 16*1(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqu 16*2(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        #######################################################################

        vmovdqu HashKey_8(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T2, \T4    # T4 = a1*b1
        vpclmulqdq      $0x00, \T5, \T2, \T7    # T7 = a0*b0
        vpclmulqdq      $0x01, \T5, \T2, \T6    # T6 = a1*b0
        vpclmulqdq      $0x10, \T5, \T2, \T5    # T5 = a0*b1
        vpxor   \T5, \T6, \T6

        vmovdqu 16*3(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP2(%rsp), \T1
        vmovdqu HashKey_7(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*4(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        #######################################################################

        vmovdqa TMP3(%rsp), \T1
        vmovdqu HashKey_6(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*5(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP4(%rsp), \T1
        vmovdqu HashKey_5(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*6(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8


        vmovdqa TMP5(%rsp), \T1
        vmovdqu HashKey_4(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor   \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor   \T3, \T6, \T6

        vmovdqu 16*7(arg1), \T1
        vaesenc \T1, \XMM1, \XMM1
        vaesenc \T1, \XMM2, \XMM2
        vaesenc \T1, \XMM3, \XMM3
        vaesenc \T1, \XMM4, \XMM4
        vaesenc \T1, \XMM5, \XMM5
        vaesenc \T1, \XMM6, \XMM6
        vaesenc \T1, \XMM7, \XMM7
        vaesenc \T1, \XMM8, \XMM8

        vmovdqa TMP6(%rsp), \T1
        vmovdqu HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor   \T3, \T4, \T4
        vmovdqa         TMP6(%rsp), \T1
        vmovdqu         HashKey_3(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

                vmovdqu  16*8(arg1), \T1
                vaesenc  \T1, \XMM1, \XMM1
                vaesenc  \T1, \XMM2, \XMM2
                vaesenc  \T1, \XMM3, \XMM3
                vaesenc  \T1, \XMM4, \XMM4
                vaesenc  \T1, \XMM5, \XMM5
                vaesenc  \T1, \XMM6, \XMM6
                vaesenc  \T1, \XMM7, \XMM7
                vaesenc  \T1, \XMM8, \XMM8

        vmovdqa         TMP7(%rsp), \T1
        vmovdqu         HashKey_2(arg2), \T5
        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T4

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        #######################################################################

                vmovdqu  16*9(arg1), \T5
                vaesenc  \T5, \XMM1, \XMM1
                vaesenc  \T5, \XMM2, \XMM2
                vaesenc  \T5, \XMM3, \XMM3
                vaesenc  \T5, \XMM4, \XMM4
                vaesenc  \T5, \XMM5, \XMM5
                vaesenc  \T5, \XMM6, \XMM6
                vaesenc  \T5, \XMM7, \XMM7
                vaesenc  \T5, \XMM8, \XMM8

        vmovdqa         TMP8(%rsp), \T1
        vmovdqu         HashKey(arg2), \T5

        vpclmulqdq      $0x00, \T5, \T1, \T3
        vpxor           \T3, \T7, \T7

        vpclmulqdq      $0x01, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x10, \T5, \T1, \T3
        vpxor           \T3, \T6, \T6

        vpclmulqdq      $0x11, \T5, \T1, \T3
        vpxor           \T3, \T4, \T1

                vmovdqu  16*10(arg1), \T5

        i = 11
        setreg
.rep (\REP-9)
                vaesenc  \T5, \XMM1, \XMM1
                vaesenc  \T5, \XMM2, \XMM2
                vaesenc  \T5, \XMM3, \XMM3
                vaesenc  \T5, \XMM4, \XMM4
                vaesenc  \T5, \XMM5, \XMM5
                vaesenc  \T5, \XMM6, \XMM6
                vaesenc  \T5, \XMM7, \XMM7
                vaesenc  \T5, \XMM8, \XMM8

                vmovdqu  16*i(arg1), \T5
        i = i + 1
        setreg
.endr
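        # NOTE (editor): \REP is the number of full AES rounds before the
        # final round: 9 for AES-128, 11 for AES-192, 13 for AES-256 (see
        # the keysize dispatch in the entry points at the end of this
        # file).  Rounds 1..9 were issued explicitly above; the
        # .rep (\REP-9) loop issues the extra rounds needed for 192/256-bit
        # keys and leaves the final-round key in \T5 for the vaesenclast
        # loop below.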
        i = 0
        j = 1
        setreg
.rep 8
                vpxor    16*i(arg4, %r11), \T5, \T2
        .if \ENC_DEC == ENC
                vaesenclast     \T2, reg_j, reg_j
        .else
                vaesenclast     \T2, reg_j, \T3
                vmovdqu  16*i(arg4, %r11), reg_j
                vmovdqu  \T3, 16*i(arg3, %r11)
        .endif
        i = (i+1)
        j = (j+1)
        setreg
.endr
        #######################################################################


        vpslldq $8, \T6, \T3                            # shift-L T3 2 DWs
        vpsrldq $8, \T6, \T6                            # shift-R T6 2 DWs
        vpxor   \T3, \T7, \T7
        vpxor   \T6, \T1, \T1                           # accumulate the results in T1:T7


        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################
        .if \ENC_DEC == ENC
        vmovdqu  \XMM1, 16*0(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM2, 16*1(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM3, 16*2(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM4, 16*3(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM5, 16*4(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM6, 16*5(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM7, 16*6(arg3,%r11)                 # Write to the Ciphertext buffer
        vmovdqu  \XMM8, 16*7(arg3,%r11)                 # Write to the Ciphertext buffer
        .endif

        #######################################################################
        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T1, \T1                   # the result is in T1
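        # NOTE (editor): the two phases above reduce the 256-bit product
        # <T1:T7> modulo the GHASH polynomial g(x) = x^128 + x^127 + x^126
        # + x^121 + 1 given in the file header.  Conceptually, writing the
        # product as Ph*x^128 ^ Pl, the congruence
        # x^128 = x^127 + x^126 + x^121 + 1 (mod g) gives
        #
        #       P mod g(x) = Pl ^ Ph*(x^127 + x^126 + x^121 + 1)
        #
        # which the code evaluates with two vpclmulqdq against the
        # precomputed POLY2 constant, split across two phases because each
        # vpclmulqdq multiplies only 64-bit halves; the shift counts also
        # account for GHASH's bit-reflected bit ordering.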
        vpshufb SHUF_MASK(%rip), \XMM1, \XMM1           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM2, \XMM2           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM3, \XMM3           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM4, \XMM4           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM5, \XMM5           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM6, \XMM6           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM7, \XMM7           # perform a 16Byte swap
        vpshufb SHUF_MASK(%rip), \XMM8, \XMM8           # perform a 16Byte swap


        vpxor   \T1, \XMM1, \XMM1

.endm


# GHASH the last 8 ciphertext blocks.
.macro  GHASH_LAST_8_AVX2 T1 T2 T3 T4 T5 T6 T7 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8

        ## Karatsuba Method

        vmovdqu         HashKey_8(arg2), \T5

        vpshufd         $0b01001110, \XMM1, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM1, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM1, \T6
        vpclmulqdq      $0x00, \T5, \XMM1, \T7

        vpclmulqdq      $0x00, \T3, \T2, \XMM1

        ######################

        vmovdqu         HashKey_7(arg2), \T5
        vpshufd         $0b01001110, \XMM2, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM2, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM2, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM2, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_6(arg2), \T5
        vpshufd         $0b01001110, \XMM3, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM3, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM3, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM3, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_5(arg2), \T5
        vpshufd         $0b01001110, \XMM4, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM4, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM4, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM4, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################
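        # NOTE (editor): each per-key block in this macro (above and
        # below) uses the Karatsuba trick: three carry-less multiplies per
        # 128x128 product instead of the four used in the parallel
        # encrypt/hash macro earlier in this file.  In pseudocode:
        #
        #       hi  = clmul(a1, b1)                 # accumulated in T6
        #       lo  = clmul(a0, b0)                 # accumulated in T7
        #       mid = clmul(a1^a0, b1^b0)           # accumulated in XMM1
        #
        # The middle term is recovered afterwards as mid ^ hi ^ lo, since
        # over GF(2): (a1^a0)(b1^b0) = a1b1 ^ a0b0 ^ a1b0 ^ a0b1.  The
        # vpshufd $0b01001110 swaps the two quadwords so the following
        # vpxor forms a1^a0 (and b1^b0) in one register.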
        vmovdqu         HashKey_4(arg2), \T5
        vpshufd         $0b01001110, \XMM5, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM5, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM5, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM5, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_3(arg2), \T5
        vpshufd         $0b01001110, \XMM6, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM6, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM6, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM6, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey_2(arg2), \T5
        vpshufd         $0b01001110, \XMM7, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM7, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM7, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM7, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1

        ######################

        vmovdqu         HashKey(arg2), \T5
        vpshufd         $0b01001110, \XMM8, \T2
        vpshufd         $0b01001110, \T5, \T3
        vpxor           \XMM8, \T2, \T2
        vpxor           \T5, \T3, \T3

        vpclmulqdq      $0x11, \T5, \XMM8, \T4
        vpxor           \T4, \T6, \T6

        vpclmulqdq      $0x00, \T5, \XMM8, \T4
        vpxor           \T4, \T7, \T7

        vpclmulqdq      $0x00, \T3, \T2, \T2

        vpxor           \T2, \XMM1, \XMM1
        vpxor           \T6, \XMM1, \XMM1
        vpxor           \T7, \XMM1, \T2




        vpslldq         $8, \T2, \T4
        vpsrldq         $8, \T2, \T2

        vpxor           \T4, \T7, \T7
        vpxor           \T2, \T6, \T6                   # <T6:T7> holds the result of the
                                                        # accumulated carry-less multiplications
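        # NOTE (editor): just above, XMM1 held the XOR of all Karatsuba
        # middle products; XORing in T6 (hi) and T7 (lo) turns it into the
        # true 128-bit middle term (mid ^ hi ^ lo), placed in T2.  The
        # vpslldq/vpsrldq pair then splits T2 at the 64-bit offset and adds
        # its halves into the low (T7) and high (T6) halves of the 256-bit
        # product, which is now ready for reduction.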
        #######################################################################
        #first phase of the reduction
        vmovdqa         POLY2(%rip), \T3

        vpclmulqdq      $0x01, \T7, \T3, \T2
        vpslldq         $8, \T2, \T2                    # shift-L T2 2 DWs

        vpxor           \T2, \T7, \T7                   # first phase of the reduction complete
        #######################################################################


        #second phase of the reduction
        vpclmulqdq      $0x00, \T7, \T3, \T2
        vpsrldq         $4, \T2, \T2                    # shift-R T2 1 DW (Shift-R only 1-DW to obtain 2-DWs shift-R)

        vpclmulqdq      $0x10, \T7, \T3, \T4
        vpslldq         $4, \T4, \T4                    # shift-L T4 1 DW (Shift-L 1-DW to obtain result with no shifts)

        vpxor           \T2, \T4, \T4                   # second phase of the reduction complete
        #######################################################################
        vpxor           \T4, \T6, \T6                   # the result is in T6
.endm



#############################################################
#void   aesni_gcm_init_avx_gen4
#        (gcm_data        *my_ctx_data,
#         gcm_context_data *data,
#         u8      *iv, /* Pre-counter block j0: 4 byte salt
#                         (from Security Association) concatenated with 8 byte
#                         Initialisation Vector (from IPSec ESP Payload)
#                         concatenated with 0x00000001. 16-byte aligned pointer. */
#         u8      *hash_subkey, /* H, the Hash sub key input. Data starts on a 16-byte boundary. */
#         const u8 *aad, /* Additional Authentication Data (AAD) */
#         u64     aad_len) /* Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 Bytes */
#############################################################
SYM_FUNC_START(aesni_gcm_init_avx_gen4)
        FUNC_SAVE
        INIT GHASH_MUL_AVX2, PRECOMPUTE_AVX2
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_init_avx_gen4)
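###############################################################################
# NOTE (editor): a sketch of how the three entry points in this file
# compose, based on the prototype comments above and below (argument names
# come from those comments; this is not a compilable declaration set):
#
#       aesni_gcm_init_avx_gen4(my_ctx_data, data, iv, hash_subkey,
#                               aad, aad_len);
#       aesni_gcm_enc_update_avx_gen4(my_ctx_data, data, out, in,
#                                     plaintext_len);
#       aesni_gcm_finalize_avx_gen4(my_ctx_data, data, auth_tag,
#                                   auth_tag_len);
#
# The update step may be called repeatedly on successive chunks of the
# message before finalize produces the authentication tag.
###############################################################################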
###############################################################################
#void   aesni_gcm_enc_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Ciphertext output. Encrypt in-place is allowed. */
#        const u8 *in, /* Plaintext input */
#        u64     plaintext_len) /* Length of data in Bytes for encryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_enc_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_enc_update4
        cmp     $16, %eax
        je      key_128_enc_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 11
        FUNC_RESTORE
        RET
key_128_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 9
        FUNC_RESTORE
        RET
key_256_enc_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, ENC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_enc_update_avx_gen4)

###############################################################################
#void   aesni_gcm_dec_update_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *out, /* Plaintext output. Decrypt in-place is allowed. */
#        const u8 *in, /* Ciphertext input */
#        u64     plaintext_len) /* Length of data in Bytes for decryption. */
###############################################################################
SYM_FUNC_START(aesni_gcm_dec_update_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_dec_update4
        cmp     $16, %eax
        je      key_128_dec_update4
        # must be 192
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 11
        FUNC_RESTORE
        RET
key_128_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 9
        FUNC_RESTORE
        RET
key_256_dec_update4:
        GCM_ENC_DEC INITIAL_BLOCKS_AVX2, GHASH_8_ENCRYPT_8_PARALLEL_AVX2, GHASH_LAST_8_AVX2, GHASH_MUL_AVX2, DEC, 13
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_dec_update_avx_gen4)
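###############################################################################
# NOTE (editor): in the dispatch above, keysize selects the last
# GCM_ENC_DEC (or GCM_COMPLETE) argument: 9, 11 or 13, i.e. the number of
# full vaesenc rounds before vaesenclast.  AES-128 has 10 rounds
# (9 + last), AES-192 has 12 (11 + last), and AES-256 has 14 (13 + last).
###############################################################################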
###############################################################################
#void   aesni_gcm_finalize_avx_gen4(
#        gcm_data        *my_ctx_data,     /* aligned to 16 Bytes */
#        gcm_context_data *data,
#        u8      *auth_tag, /* Authenticated Tag output. */
#        u64     auth_tag_len) /* Authenticated Tag Length in bytes.
#                                 Valid values are 16 (most likely), 12 or 8. */
###############################################################################
SYM_FUNC_START(aesni_gcm_finalize_avx_gen4)
        FUNC_SAVE
        mov     keysize,%eax
        cmp     $32, %eax
        je      key_256_finalize4
        cmp     $16, %eax
        je      key_128_finalize4
        # must be 192
        GCM_COMPLETE GHASH_MUL_AVX2, 11, arg3, arg4
        FUNC_RESTORE
        RET
key_128_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 9, arg3, arg4
        FUNC_RESTORE
        RET
key_256_finalize4:
        GCM_COMPLETE GHASH_MUL_AVX2, 13, arg3, arg4
        FUNC_RESTORE
        RET
SYM_FUNC_END(aesni_gcm_finalize_avx_gen4)