/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Implement AES algorithm in Intel AES-NI instructions.
 *
 * The white paper of AES-NI instructions can be downloaded from:
 * http://softwarecommunity.intel.com/isn/downloads/intelavx/AES-Instructions-Set_WP.pdf
 *
 * Copyright (C) 2008, Intel Corp.
 * Author: Huang Ying <ying.huang@intel.com>
 *         Vinodh Gopal <vinodh.gopal@intel.com>
 *         Kahraman Akdemir
 *
 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
 * interface for 64-bit kernels.
 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
 *          Aidan O'Mahony (aidan.o.mahony@intel.com)
 *          Adrian Hoban <adrian.hoban@intel.com>
 *          James Guilford (james.guilford@intel.com)
 *          Gabriele Paoloni <gabriele.paoloni@intel.com>
 *          Tadeusz Struk (tadeusz.struk@intel.com)
 *          Wajdi Feghali (wajdi.k.feghali@intel.com)
 * Copyright (c) 2010, Intel Corporation.
 *
 * Ported x86_64 version to x86:
 * Author: Mathias Krause <minipli@googlemail.com>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

/*
 * The following macros are used to move an (un)aligned 16 byte value to/from
 * an XMM register.  This can be done for either FP or integer values: for FP
 * use movaps (move aligned packed single), for integer use movdqa (move
 * double quad aligned).  It doesn't make a performance difference which
 * instruction is used since Nehalem (original Core i7) was released.
 * However, movaps is a byte shorter, so that is the one we'll use for now.
 * (same for unaligned).
 */
#define MOVADQ	movaps
#define MOVUDQ	movups

#ifdef __x86_64__

# constants in mergeable sections, linker can reorder and merge
.section	.rodata.cst16.gf128mul_x_ble_mask, "aM", @progbits, 16
.align 16
.Lgf128mul_x_ble_mask:
	.octa 0x00000000000000010000000000000087
.section	.rodata.cst16.POLY, "aM", @progbits, 16
.align 16
POLY:	.octa 0xC2000000000000000000000000000001
.section	.rodata.cst16.TWOONE, "aM", @progbits, 16
.align 16
TWOONE:	.octa 0x00000001000000000000000000000001

.section	.rodata.cst16.SHUF_MASK, "aM", @progbits, 16
.align 16
SHUF_MASK:	.octa 0x000102030405060708090A0B0C0D0E0F
.section	.rodata.cst16.MASK1, "aM", @progbits, 16
.align 16
MASK1:		.octa 0x0000000000000000ffffffffffffffff
.section	.rodata.cst16.MASK2, "aM", @progbits, 16
.align 16
MASK2:		.octa 0xffffffffffffffff0000000000000000
.section	.rodata.cst16.ONE, "aM", @progbits, 16
.align 16
ONE:		.octa 0x00000000000000000000000000000001
.section	.rodata.cst16.F_MIN_MASK, "aM", @progbits, 16
.align 16
F_MIN_MASK:	.octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
.section	.rodata.cst16.dec, "aM", @progbits, 16
.align 16
dec:		.octa 0x1
.section	.rodata.cst16.enc, "aM", @progbits, 16
.align 16
enc:		.octa 0x2

# order of these constants should not change.
# more specifically, ALL_F should follow SHIFT_MASK,
# and zero should follow ALL_F
.section	.rodata, "a", @progbits
.align 16
SHIFT_MASK:	.octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F:		.octa 0xffffffffffffffffffffffffffffffff
		.octa 0x00000000000000000000000000000000

.text


# FUNC_SAVE pushes three callee-saved GPRs (%r12-%r14), so stack
# arguments sit 8*3 bytes above %rsp inside a function body.
#define	STACK_OFFSET    8*3

# Byte offsets of the fields of struct gcm_context_data (%arg2),
# laid out in 16-byte slots.
#define AadHash 16*0
#define AadLen 16*1
#define InLen (16*1)+8
#define PBlockEncKey 16*2
#define OrigIV 16*3
#define CurCount 16*4
#define PBlockLen 16*5
#define	HashKey		16*6	// store HashKey <<1 mod poly here
#define	HashKey_2	16*7	// store HashKey^2 <<1 mod poly here
#define	HashKey_3	16*8	// store HashKey^3 <<1 mod poly here
#define	HashKey_4	16*9	// store HashKey^4 <<1 mod poly here
#define	HashKey_k	16*10	// store XOR of High 64 bits and Low 64
				// bits of  HashKey <<1 mod poly here
				//(for Karatsuba purposes)
#define	HashKey_2_k	16*11	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^2 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_3_k	16*12	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^3 <<1 mod poly here
				// (for Karatsuba purposes)
#define	HashKey_4_k	16*13	// store XOR of High 64 bits and Low 64
				// bits of  HashKey^4 <<1 mod poly here
				// (for Karatsuba purposes)

# SysV AMD64 integer argument registers, plus stack slots for args 7..11
# (reached through STACK_OFFSET because of the FUNC_SAVE pushes).
#define arg1 rdi
#define arg2 rsi
#define arg3 rdx
#define arg4 rcx
#define arg5 r8
#define arg6 r9
#define arg7 STACK_OFFSET+8(%rsp)
#define arg8 STACK_OFFSET+16(%rsp)
#define arg9 STACK_OFFSET+24(%rsp)
#define arg10 STACK_OFFSET+32(%rsp)
#define arg11 STACK_OFFSET+40(%rsp)
#define keysize 2*15*16(%arg1)
#endif


#define STATE1	%xmm0
#define STATE2	%xmm4
#define STATE3	%xmm5
#define STATE4	%xmm6
#define STATE	STATE1
#define IN1	%xmm1
#define IN2	%xmm7
#define IN3	%xmm8
#define IN4	%xmm9
#define IN	IN1
#define KEY	%xmm2
#define IV	%xmm3

#define BSWAP_MASK %xmm10
#define CTR	%xmm11
#define INC	%xmm12

#define GF128MUL_MASK %xmm10

#ifdef __x86_64__
#define AREG	%rax
#define KEYP	%rdi
#define OUTP	%rsi
#define UKEYP	OUTP
#define INP	%rdx
#define LEN	%rcx
#define IVP	%r8
#define KLEN	%r9d
#define T1	%r10
#define TKEYP	T1
#define T2	%r11
#define TCTR_LOW T2
#else
#define AREG	%eax
#define KEYP	%edi
#define OUTP	AREG
#define UKEYP	OUTP
#define INP	%edx
#define LEN	%esi
#define IVP	%ebp
#define KLEN	%ebx
#define T1	%ecx
#define TKEYP	T1
#endif

# Save the callee-saved GPRs clobbered by the GCM macros.
.macro FUNC_SAVE
	push	%r12
	push	%r13
	push	%r14
#
# states of %xmm registers %xmm6:%xmm15 not saved
# all %xmm registers are clobbered
#
.endm


# Restore the GPRs saved by FUNC_SAVE (reverse order).
.macro FUNC_RESTORE
	pop	%r14
	pop	%r13
	pop	%r12
.endm

# Precompute hashkeys.
# Input: Hash subkey.
# Output: HashKeys stored in gcm_context_data.  Only needs to be called
# once per key.
# clobbers r12, and tmp xmm registers.
.macro PRECOMPUTE SUBKEY TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 TMP7
	mov	\SUBKEY, %r12
	movdqu	(%r12), \TMP3
	movdqa	SHUF_MASK(%rip), \TMP2
	pshufb	\TMP2, \TMP3

	# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)

	movdqa	\TMP3, \TMP2
	psllq	$1, \TMP3
	psrlq	$63, \TMP2
	movdqa	\TMP2, \TMP1
	pslldq	$8, \TMP2
	psrldq	$8, \TMP1
	por	\TMP2, \TMP3

	# reduce HashKey<<1

	pshufd	$0x24, \TMP1, \TMP2
	pcmpeqd	TWOONE(%rip), \TMP2
	pand	POLY(%rip), \TMP2
	pxor	\TMP2, \TMP3
	movdqu	\TMP3, HashKey(%arg2)

	# Karatsuba half: XOR of high and low 64 bits of HashKey<<1
	movdqa	\TMP3, \TMP5
	pshufd	$78, \TMP3, \TMP1
	pxor	\TMP3, \TMP1
	movdqu	\TMP1, HashKey_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^2<<1 (mod poly)
	movdqu	\TMP5, HashKey_2(%arg2)
# HashKey_2 = HashKey^2<<1 (mod poly)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_2_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^3<<1 (mod poly)
	movdqu	\TMP5, HashKey_3(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_3_k(%arg2)

	GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
# TMP5 = HashKey^4<<1 (mod poly)
	movdqu	\TMP5, HashKey_4(%arg2)
	pshufd	$78, \TMP5, \TMP1
	pxor	\TMP5, \TMP1
	movdqu	\TMP1, HashKey_4_k(%arg2)
.endm

# GCM_INIT initializes a gcm_context struct to prepare for encoding/decoding.
# Clobbers rax, r10-r13 and xmm0-xmm6, %xmm13
.macro GCM_INIT Iv SUBKEY AAD AADLEN
	mov	\AADLEN, %r11
	mov	%r11, AadLen(%arg2)	# ctx_data.aad_length = aad_length
	xor	%r11d, %r11d
	mov	%r11, InLen(%arg2)	# ctx_data.in_length = 0
	mov	%r11, PBlockLen(%arg2)	# ctx_data.partial_block_length = 0
	mov	%r11, PBlockEncKey(%arg2) # ctx_data.partial_block_enc_key = 0
	mov	\Iv, %rax
	movdqu	(%rax), %xmm0
	movdqu	%xmm0, OrigIV(%arg2)	# ctx_data.orig_IV = iv

	movdqa	SHUF_MASK(%rip), %xmm2
	pshufb	%xmm2, %xmm0
	movdqu	%xmm0, CurCount(%arg2)	# ctx_data.current_counter = iv

	PRECOMPUTE \SUBKEY, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7
	movdqu	HashKey(%arg2), %xmm13

	CALC_AAD_HASH %xmm13, \AAD, \AADLEN, %xmm0, %xmm1, %xmm2, %xmm3, \
	%xmm4, %xmm5, %xmm6
.endm

# GCM_ENC_DEC Encodes/Decodes given data.  Assumes that the passed gcm_context
# struct has been initialized by GCM_INIT.
# Requires the input data be at least 1 byte long because of READ_PARTIAL_BLOCK
# Clobbers rax, r10-r13, and xmm0-xmm15
.macro GCM_ENC_DEC operation
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13
	add	%arg5, InLen(%arg2)

	xor	%r11d, %r11d	# initialise the data pointer offset as zero
	PARTIAL_BLOCK %arg3 %arg4 %arg5 %r11 %xmm8 \operation

	sub	%r11, %arg5		# sub partial block data used
	mov	%arg5, %r13		# save the number of bytes

	and	$-16, %r13		# %r13 = %r13 - (%r13 mod 16)
	mov	%r13, %r12
	# Encrypt/Decrypt first few blocks

	and	$(3<<4), %r12
	jz	_initial_num_blocks_is_0_\@
	cmp	$(2<<4), %r12
	jb	_initial_num_blocks_is_1_\@
	je	_initial_num_blocks_is_2_\@
_initial_num_blocks_is_3_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, \operation
	sub	$48, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_2_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, \operation
	sub	$32, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_1_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, \operation
	sub	$16, %r13
	jmp	_initial_blocks_\@
_initial_num_blocks_is_0_\@:
	INITIAL_BLOCKS_ENC_DEC	%xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, \operation
_initial_blocks_\@:

	# Main loop - Encrypt/Decrypt remaining blocks

	test	%r13, %r13
	je	_zero_cipher_left_\@
	sub	$64, %r13
	je	_four_cipher_left_\@
_crypt_by_4_\@:
	GHASH_4_ENCRYPT_4_PARALLEL_\operation	%xmm9, %xmm10, %xmm11, %xmm12, \
	%xmm13, %xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, \
	%xmm7, %xmm8, enc
	add	$64, %r11
	sub	$64, %r13
	jne	_crypt_by_4_\@
_four_cipher_left_\@:
	GHASH_LAST_4	%xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
_zero_cipher_left_\@:
	movdqu	%xmm8, AadHash(%arg2)
	movdqu	%xmm0, CurCount(%arg2)

	mov	%arg5, %r13
	and	$15, %r13			# %r13 = arg5 (mod 16)
	je	_multiple_of_16_bytes_\@

	mov	%r13, PBlockLen(%arg2)

	# Handle the last <16 Byte block separately
	paddd	ONE(%rip), %xmm0		# INCR CNT to get Yn
	movdqu	%xmm0, CurCount(%arg2)
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	ENCRYPT_SINGLE_BLOCK	%xmm0, %xmm1	# Encrypt(K, Yn)
	movdqu	%xmm0, PBlockEncKey(%arg2)

	cmp	$16, %arg5
	jge	_large_enough_update_\@

	lea	(%arg4,%r11,1), %r10
	mov	%r13, %r12
	READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm1
	jmp	_data_read_\@

_large_enough_update_\@:
	sub	$16, %r11
	add	%r13, %r11

	# receive the last <16 Byte block
	movdqu	(%arg4, %r11, 1), %xmm1

	sub	%r13, %r11
	add	$16, %r11

	lea	SHIFT_MASK+16(%rip), %r12
	# adjust the shuffle mask pointer to be able to shift 16-r13 bytes
	# (r13 is the number of bytes in plaintext mod 16)
	sub	%r13, %r12
	# get the appropriate shuffle mask
	movdqu	(%r12), %xmm2
	# shift right 16-r13 bytes
	pshufb	%xmm2, %xmm1

_data_read_\@:
	lea	ALL_F+16(%rip), %r12
	sub	%r13, %r12

.ifc \operation, dec
	movdqa	%xmm1, %xmm2
.endif
	pxor	%xmm1, %xmm0		# XOR Encrypt(K, Yn)
	movdqu	(%r12), %xmm1
	# get the appropriate mask to mask out top 16-r13 bytes of xmm0
	pand	%xmm1, %xmm0		# mask out top 16-r13 bytes of xmm0
.ifc \operation, dec
	pand	%xmm1, %xmm2
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm2

	pxor	%xmm2, %xmm8
.else
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm0

	pxor	%xmm0, %xmm8
.endif

	movdqu	%xmm8, AadHash(%arg2)
.ifc \operation, enc
	# GHASH computation for the last <16 byte block
	movdqa	SHUF_MASK(%rip), %xmm10
	# shuffle xmm0 back to output as ciphertext
	pshufb	%xmm10, %xmm0
.endif

	# Output %r13 bytes
	movq	%xmm0, %rax
	cmp	$8, %r13
	jle	_less_than_8_bytes_left_\@
	mov	%rax, (%arg3, %r11, 1)
	add	$8, %r11
	psrldq	$8, %xmm0
	movq	%xmm0, %rax
	sub	$8, %r13
_less_than_8_bytes_left_\@:
	mov	%al, (%arg3, %r11, 1)
	add	$1, %r11
	shr	$8, %rax
	sub	$1, %r13
	jne	_less_than_8_bytes_left_\@
_multiple_of_16_bytes_\@:
.endm

# GCM_COMPLETE Finishes update of tag of last partial block
# Output: Authorization Tag (AUTH_TAG)
# Clobbers rax, r10-r12, and xmm0, xmm1, xmm5-xmm15
.macro GCM_COMPLETE AUTHTAG AUTHTAGLEN
	movdqu	AadHash(%arg2), %xmm8
	movdqu	HashKey(%arg2), %xmm13

	mov	PBlockLen(%arg2), %r12

	test	%r12, %r12
	je	_partial_done\@

	# fold the outstanding partial block into the hash
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6

_partial_done\@:
	mov	AadLen(%arg2), %r12	# %r12 = aadLen (number of bytes)
	shl	$3, %r12		# convert into number of bits
	movd	%r12d, %xmm15		# len(A) in %xmm15
	mov	InLen(%arg2), %r12
	shl	$3, %r12		# len(C) in bits (*128)
	movq	%r12, %xmm1

	pslldq	$8, %xmm15		# %xmm15 = len(A)||0x0000000000000000
	pxor	%xmm1, %xmm15		# %xmm15 = len(A)||len(C)
	pxor	%xmm15, %xmm8
	GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
	# final GHASH computation
	movdqa	SHUF_MASK(%rip), %xmm10
	pshufb	%xmm10, %xmm8

	movdqu	OrigIV(%arg2), %xmm0	# %xmm0 = Y0
	ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1	# E(K, Y0)
	pxor	%xmm8, %xmm0
_return_T_\@:
	mov	\AUTHTAG, %r10		# %r10 = authTag
	mov	\AUTHTAGLEN, %r11	# %r11 = auth_tag_len
	cmp	$16, %r11
	je	_T_16_\@
	cmp	$8, %r11
	jl	_T_4_\@
_T_8_\@:
	movq	%xmm0, %rax
	mov	%rax, (%r10)
	add	$8, %r10
	sub	$8, %r11
	psrldq	$8, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_4_\@:
	movd	%xmm0, %eax
	mov	%eax, (%r10)
	add	$4, %r10
	sub	$4, %r11
	psrldq	$4, %xmm0
	test	%r11, %r11
	je	_return_T_done_\@
_T_123_\@:
	movd	%xmm0, %eax
	cmp	$2, %r11
	jl	_T_1_\@
	mov	%ax, (%r10)
	cmp	$2, %r11
	je	_return_T_done_\@
	add	$2, %r10
	sar	$16, %eax
_T_1_\@:
	mov	%al, (%r10)
	jmp	_return_T_done_\@
_T_16_\@:
	movdqu	%xmm0, (%r10)
_return_T_done_\@:
.endm

#ifdef __x86_64__
/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
 *
 *
 * Input: A and B (128-bits each, bit-reflected)
 * Output: C = A*B*x mod poly, (i.e. >>1 )
 * To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
 * GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
*/
.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
	# Karatsuba multiplication: split each 128-bit operand into
	# 64-bit halves and use three pclmulqdq products.
	movdqa	  \GH, \TMP1
	pshufd	  $78, \GH, \TMP2
	pshufd	  $78, \HK, \TMP3
	pxor	  \GH, \TMP2		# TMP2 = a1+a0
	pxor	  \HK, \TMP3		# TMP3 = b1+b0
	pclmulqdq $0x11, \HK, \TMP1	# TMP1 = a1*b1
	pclmulqdq $0x00, \HK, \GH	# GH = a0*b0
	pclmulqdq $0x00, \TMP3, \TMP2	# TMP2 = (a0+a1)*(b1+b0)
	pxor	  \GH, \TMP2
	pxor	  \TMP1, \TMP2		# TMP2 = (a0*b0)+(a1*b0)
	movdqa	  \TMP2, \TMP3
	pslldq	  $8, \TMP3		# left shift TMP3 2 DWs
	psrldq	  $8, \TMP2		# right shift TMP2 2 DWs
	pxor	  \TMP3, \GH
	pxor	  \TMP2, \TMP1		# TMP1:GH holds the result of GH*HK

	# first phase of the reduction

	movdqa	  \GH, \TMP2
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	pslld	  $31, \TMP2		# packed left shift <<31
	pslld	  $30, \TMP3		# packed left shift <<30
	pslld	  $25, \TMP4		# packed left shift <<25
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	movdqa	  \TMP2, \TMP5
	psrldq	  $4, \TMP5		# right shift TMP5 1 DW
	pslldq	  $12, \TMP2		# left shift TMP2 3 DWs
	pxor	  \TMP2, \GH

	# second phase of the reduction

	movdqa	  \GH, \TMP2		# copy GH into TMP2,TMP3 and TMP4
					# in order to perform
					# independent shifts
	movdqa	  \GH, \TMP3
	movdqa	  \GH, \TMP4
	psrld	  $1, \TMP2		# packed right shift >>1
	psrld	  $2, \TMP3		# packed right shift >>2
	psrld	  $7, \TMP4		# packed right shift >>7
	pxor	  \TMP3, \TMP2		# xor the shifted versions
	pxor	  \TMP4, \TMP2
	pxor	  \TMP5, \TMP2
	pxor	  \TMP2, \GH
	pxor	  \TMP1, \GH		# result is in GH
.endm

# Reads DLEN bytes starting at DPTR and stores in XMMDst
# where 0 < DLEN < 16
# Clobbers %rax, DLEN and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN XMM1 XMMDst
	cmp	$8, \DLEN
	jl	_read_lt8_\@
	mov	(\DPTR), %rax
	movq	%rax, \XMMDst
	sub	$8, \DLEN
	jz	_done_read_partial_block_\@
	xor	%eax, %eax
_read_next_byte_\@:
	shl	$8, %rax
	mov	7(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_\@
	movq	%rax, \XMM1
	pslldq	$8, \XMM1
	por	\XMM1, \XMMDst
	jmp	_done_read_partial_block_\@
_read_lt8_\@:
	xor	%eax, %eax
_read_next_byte_lt8_\@:
	shl	$8, %rax
	mov	-1(\DPTR, \DLEN, 1), %al
	dec	\DLEN
	jnz	_read_next_byte_lt8_\@
	movq	%rax, \XMMDst
_done_read_partial_block_\@:
.endm

# CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
# clobbers r10-11, xmm14
.macro CALC_AAD_HASH HASHKEY AAD AADLEN TMP1 TMP2 TMP3 TMP4 TMP5 \
	TMP6 TMP7
	MOVADQ	   SHUF_MASK(%rip), %xmm14
	mov	   \AAD, %r10		# %r10 = AAD
	mov	   \AADLEN, %r11	# %r11 = aadLen
	pxor	   \TMP7, \TMP7
	pxor	   \TMP6, \TMP6

	cmp	   $16, %r11
	jl	   _get_AAD_rest\@
_get_AAD_blocks\@:
	movdqu	   (%r10), \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP7, \TMP6
	GHASH_MUL  \TMP6, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	add	   $16, %r10
	sub	   $16, %r11
	cmp	   $16, %r11
	jge	   _get_AAD_blocks\@

	movdqu	   \TMP6, \TMP7

	/* read the last <16B of AAD */
_get_AAD_rest\@:
	test	   %r11, %r11
	je	   _get_AAD_done\@

	READ_PARTIAL_BLOCK %r10, %r11, \TMP1, \TMP7
	pshufb	   %xmm14, \TMP7	# byte-reflect the AAD data
	pxor	   \TMP6, \TMP7
	GHASH_MUL  \TMP7, \HASHKEY, \TMP1, \TMP2, \TMP3, \TMP4, \TMP5
	movdqu	   \TMP7, \TMP6

_get_AAD_done\@:
	movdqu	   \TMP6, AadHash(%arg2)
.endm

# PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks
# between update calls.
637*4882a593Smuzhiyun# Requires the input data be at least 1 byte long due to READ_PARTIAL_BLOCK 638*4882a593Smuzhiyun# Outputs encrypted bytes, and updates hash and partial info in gcm_data_context 639*4882a593Smuzhiyun# Clobbers rax, r10, r12, r13, xmm0-6, xmm9-13 640*4882a593Smuzhiyun.macro PARTIAL_BLOCK CYPH_PLAIN_OUT PLAIN_CYPH_IN PLAIN_CYPH_LEN DATA_OFFSET \ 641*4882a593Smuzhiyun AAD_HASH operation 642*4882a593Smuzhiyun mov PBlockLen(%arg2), %r13 643*4882a593Smuzhiyun test %r13, %r13 644*4882a593Smuzhiyun je _partial_block_done_\@ # Leave Macro if no partial blocks 645*4882a593Smuzhiyun # Read in input data without over reading 646*4882a593Smuzhiyun cmp $16, \PLAIN_CYPH_LEN 647*4882a593Smuzhiyun jl _fewer_than_16_bytes_\@ 648*4882a593Smuzhiyun movups (\PLAIN_CYPH_IN), %xmm1 # If more than 16 bytes, just fill xmm 649*4882a593Smuzhiyun jmp _data_read_\@ 650*4882a593Smuzhiyun 651*4882a593Smuzhiyun_fewer_than_16_bytes_\@: 652*4882a593Smuzhiyun lea (\PLAIN_CYPH_IN, \DATA_OFFSET, 1), %r10 653*4882a593Smuzhiyun mov \PLAIN_CYPH_LEN, %r12 654*4882a593Smuzhiyun READ_PARTIAL_BLOCK %r10 %r12 %xmm0 %xmm1 655*4882a593Smuzhiyun 656*4882a593Smuzhiyun mov PBlockLen(%arg2), %r13 657*4882a593Smuzhiyun 658*4882a593Smuzhiyun_data_read_\@: # Finished reading in data 659*4882a593Smuzhiyun 660*4882a593Smuzhiyun movdqu PBlockEncKey(%arg2), %xmm9 661*4882a593Smuzhiyun movdqu HashKey(%arg2), %xmm13 662*4882a593Smuzhiyun 663*4882a593Smuzhiyun lea SHIFT_MASK(%rip), %r12 664*4882a593Smuzhiyun 665*4882a593Smuzhiyun # adjust the shuffle mask pointer to be able to shift r13 bytes 666*4882a593Smuzhiyun # r16-r13 is the number of bytes in plaintext mod 16) 667*4882a593Smuzhiyun add %r13, %r12 668*4882a593Smuzhiyun movdqu (%r12), %xmm2 # get the appropriate shuffle mask 669*4882a593Smuzhiyun pshufb %xmm2, %xmm9 # shift right r13 bytes 670*4882a593Smuzhiyun 671*4882a593Smuzhiyun.ifc \operation, dec 672*4882a593Smuzhiyun movdqa %xmm1, %xmm3 673*4882a593Smuzhiyun pxor %xmm1, %xmm9 # Cyphertext XOR 
E(K, Yn) 674*4882a593Smuzhiyun 675*4882a593Smuzhiyun mov \PLAIN_CYPH_LEN, %r10 676*4882a593Smuzhiyun add %r13, %r10 677*4882a593Smuzhiyun # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 678*4882a593Smuzhiyun sub $16, %r10 679*4882a593Smuzhiyun # Determine if if partial block is not being filled and 680*4882a593Smuzhiyun # shift mask accordingly 681*4882a593Smuzhiyun jge _no_extra_mask_1_\@ 682*4882a593Smuzhiyun sub %r10, %r12 683*4882a593Smuzhiyun_no_extra_mask_1_\@: 684*4882a593Smuzhiyun 685*4882a593Smuzhiyun movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 686*4882a593Smuzhiyun # get the appropriate mask to mask out bottom r13 bytes of xmm9 687*4882a593Smuzhiyun pand %xmm1, %xmm9 # mask out bottom r13 bytes of xmm9 688*4882a593Smuzhiyun 689*4882a593Smuzhiyun pand %xmm1, %xmm3 690*4882a593Smuzhiyun movdqa SHUF_MASK(%rip), %xmm10 691*4882a593Smuzhiyun pshufb %xmm10, %xmm3 692*4882a593Smuzhiyun pshufb %xmm2, %xmm3 693*4882a593Smuzhiyun pxor %xmm3, \AAD_HASH 694*4882a593Smuzhiyun 695*4882a593Smuzhiyun test %r10, %r10 696*4882a593Smuzhiyun jl _partial_incomplete_1_\@ 697*4882a593Smuzhiyun 698*4882a593Smuzhiyun # GHASH computation for the last <16 Byte block 699*4882a593Smuzhiyun GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 700*4882a593Smuzhiyun xor %eax, %eax 701*4882a593Smuzhiyun 702*4882a593Smuzhiyun mov %rax, PBlockLen(%arg2) 703*4882a593Smuzhiyun jmp _dec_done_\@ 704*4882a593Smuzhiyun_partial_incomplete_1_\@: 705*4882a593Smuzhiyun add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 706*4882a593Smuzhiyun_dec_done_\@: 707*4882a593Smuzhiyun movdqu \AAD_HASH, AadHash(%arg2) 708*4882a593Smuzhiyun.else 709*4882a593Smuzhiyun pxor %xmm1, %xmm9 # Plaintext XOR E(K, Yn) 710*4882a593Smuzhiyun 711*4882a593Smuzhiyun mov \PLAIN_CYPH_LEN, %r10 712*4882a593Smuzhiyun add %r13, %r10 713*4882a593Smuzhiyun # Set r10 to be the amount of data left in CYPH_PLAIN_IN after filling 714*4882a593Smuzhiyun sub $16, %r10 715*4882a593Smuzhiyun # Determine if if partial block is 
not being filled and 716*4882a593Smuzhiyun # shift mask accordingly 717*4882a593Smuzhiyun jge _no_extra_mask_2_\@ 718*4882a593Smuzhiyun sub %r10, %r12 719*4882a593Smuzhiyun_no_extra_mask_2_\@: 720*4882a593Smuzhiyun 721*4882a593Smuzhiyun movdqu ALL_F-SHIFT_MASK(%r12), %xmm1 722*4882a593Smuzhiyun # get the appropriate mask to mask out bottom r13 bytes of xmm9 723*4882a593Smuzhiyun pand %xmm1, %xmm9 724*4882a593Smuzhiyun 725*4882a593Smuzhiyun movdqa SHUF_MASK(%rip), %xmm1 726*4882a593Smuzhiyun pshufb %xmm1, %xmm9 727*4882a593Smuzhiyun pshufb %xmm2, %xmm9 728*4882a593Smuzhiyun pxor %xmm9, \AAD_HASH 729*4882a593Smuzhiyun 730*4882a593Smuzhiyun test %r10, %r10 731*4882a593Smuzhiyun jl _partial_incomplete_2_\@ 732*4882a593Smuzhiyun 733*4882a593Smuzhiyun # GHASH computation for the last <16 Byte block 734*4882a593Smuzhiyun GHASH_MUL \AAD_HASH, %xmm13, %xmm0, %xmm10, %xmm11, %xmm5, %xmm6 735*4882a593Smuzhiyun xor %eax, %eax 736*4882a593Smuzhiyun 737*4882a593Smuzhiyun mov %rax, PBlockLen(%arg2) 738*4882a593Smuzhiyun jmp _encode_done_\@ 739*4882a593Smuzhiyun_partial_incomplete_2_\@: 740*4882a593Smuzhiyun add \PLAIN_CYPH_LEN, PBlockLen(%arg2) 741*4882a593Smuzhiyun_encode_done_\@: 742*4882a593Smuzhiyun movdqu \AAD_HASH, AadHash(%arg2) 743*4882a593Smuzhiyun 744*4882a593Smuzhiyun movdqa SHUF_MASK(%rip), %xmm10 745*4882a593Smuzhiyun # shuffle xmm9 back to output as ciphertext 746*4882a593Smuzhiyun pshufb %xmm10, %xmm9 747*4882a593Smuzhiyun pshufb %xmm2, %xmm9 748*4882a593Smuzhiyun.endif 749*4882a593Smuzhiyun # output encrypted Bytes 750*4882a593Smuzhiyun test %r10, %r10 751*4882a593Smuzhiyun jl _partial_fill_\@ 752*4882a593Smuzhiyun mov %r13, %r12 753*4882a593Smuzhiyun mov $16, %r13 754*4882a593Smuzhiyun # Set r13 to be the number of bytes to write out 755*4882a593Smuzhiyun sub %r12, %r13 756*4882a593Smuzhiyun jmp _count_set_\@ 757*4882a593Smuzhiyun_partial_fill_\@: 758*4882a593Smuzhiyun mov \PLAIN_CYPH_LEN, %r13 759*4882a593Smuzhiyun_count_set_\@: 760*4882a593Smuzhiyun movdqa 
%xmm9, %xmm0 761*4882a593Smuzhiyun movq %xmm0, %rax 762*4882a593Smuzhiyun cmp $8, %r13 763*4882a593Smuzhiyun jle _less_than_8_bytes_left_\@ 764*4882a593Smuzhiyun 765*4882a593Smuzhiyun mov %rax, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 766*4882a593Smuzhiyun add $8, \DATA_OFFSET 767*4882a593Smuzhiyun psrldq $8, %xmm0 768*4882a593Smuzhiyun movq %xmm0, %rax 769*4882a593Smuzhiyun sub $8, %r13 770*4882a593Smuzhiyun_less_than_8_bytes_left_\@: 771*4882a593Smuzhiyun movb %al, (\CYPH_PLAIN_OUT, \DATA_OFFSET, 1) 772*4882a593Smuzhiyun add $1, \DATA_OFFSET 773*4882a593Smuzhiyun shr $8, %rax 774*4882a593Smuzhiyun sub $1, %r13 775*4882a593Smuzhiyun jne _less_than_8_bytes_left_\@ 776*4882a593Smuzhiyun_partial_block_done_\@: 777*4882a593Smuzhiyun.endm # PARTIAL_BLOCK 778*4882a593Smuzhiyun 779*4882a593Smuzhiyun/* 780*4882a593Smuzhiyun* if a = number of total plaintext bytes 781*4882a593Smuzhiyun* b = floor(a/16) 782*4882a593Smuzhiyun* num_initial_blocks = b mod 4 783*4882a593Smuzhiyun* encrypt the initial num_initial_blocks blocks and apply ghash on 784*4882a593Smuzhiyun* the ciphertext 785*4882a593Smuzhiyun* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers 786*4882a593Smuzhiyun* are clobbered 787*4882a593Smuzhiyun* arg1, %arg2, %arg3 are used as a pointer only, not modified 788*4882a593Smuzhiyun*/ 789*4882a593Smuzhiyun 790*4882a593Smuzhiyun 791*4882a593Smuzhiyun.macro INITIAL_BLOCKS_ENC_DEC TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \ 792*4882a593Smuzhiyun XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation 793*4882a593Smuzhiyun MOVADQ SHUF_MASK(%rip), %xmm14 794*4882a593Smuzhiyun 795*4882a593Smuzhiyun movdqu AadHash(%arg2), %xmm\i # XMM0 = Y0 796*4882a593Smuzhiyun 797*4882a593Smuzhiyun # start AES for num_initial_blocks blocks 798*4882a593Smuzhiyun 799*4882a593Smuzhiyun movdqu CurCount(%arg2), \XMM0 # XMM0 = Y0 800*4882a593Smuzhiyun 801*4882a593Smuzhiyun.if (\i == 5) || (\i == 6) || (\i == 7) 802*4882a593Smuzhiyun 803*4882a593Smuzhiyun MOVADQ ONE(%RIP),\TMP1 
804*4882a593Smuzhiyun MOVADQ 0(%arg1),\TMP2 805*4882a593Smuzhiyun.irpc index, \i_seq 806*4882a593Smuzhiyun paddd \TMP1, \XMM0 # INCR Y0 807*4882a593Smuzhiyun.ifc \operation, dec 808*4882a593Smuzhiyun movdqa \XMM0, %xmm\index 809*4882a593Smuzhiyun.else 810*4882a593Smuzhiyun MOVADQ \XMM0, %xmm\index 811*4882a593Smuzhiyun.endif 812*4882a593Smuzhiyun pshufb %xmm14, %xmm\index # perform a 16 byte swap 813*4882a593Smuzhiyun pxor \TMP2, %xmm\index 814*4882a593Smuzhiyun.endr 815*4882a593Smuzhiyun lea 0x10(%arg1),%r10 816*4882a593Smuzhiyun mov keysize,%eax 817*4882a593Smuzhiyun shr $2,%eax # 128->4, 192->6, 256->8 818*4882a593Smuzhiyun add $5,%eax # 128->9, 192->11, 256->13 819*4882a593Smuzhiyun 820*4882a593Smuzhiyunaes_loop_initial_\@: 821*4882a593Smuzhiyun MOVADQ (%r10),\TMP1 822*4882a593Smuzhiyun.irpc index, \i_seq 823*4882a593Smuzhiyun aesenc \TMP1, %xmm\index 824*4882a593Smuzhiyun.endr 825*4882a593Smuzhiyun add $16,%r10 826*4882a593Smuzhiyun sub $1,%eax 827*4882a593Smuzhiyun jnz aes_loop_initial_\@ 828*4882a593Smuzhiyun 829*4882a593Smuzhiyun MOVADQ (%r10), \TMP1 830*4882a593Smuzhiyun.irpc index, \i_seq 831*4882a593Smuzhiyun aesenclast \TMP1, %xmm\index # Last Round 832*4882a593Smuzhiyun.endr 833*4882a593Smuzhiyun.irpc index, \i_seq 834*4882a593Smuzhiyun movdqu (%arg4 , %r11, 1), \TMP1 835*4882a593Smuzhiyun pxor \TMP1, %xmm\index 836*4882a593Smuzhiyun movdqu %xmm\index, (%arg3 , %r11, 1) 837*4882a593Smuzhiyun # write back plaintext/ciphertext for num_initial_blocks 838*4882a593Smuzhiyun add $16, %r11 839*4882a593Smuzhiyun 840*4882a593Smuzhiyun.ifc \operation, dec 841*4882a593Smuzhiyun movdqa \TMP1, %xmm\index 842*4882a593Smuzhiyun.endif 843*4882a593Smuzhiyun pshufb %xmm14, %xmm\index 844*4882a593Smuzhiyun 845*4882a593Smuzhiyun # prepare plaintext/ciphertext for GHASH computation 846*4882a593Smuzhiyun.endr 847*4882a593Smuzhiyun.endif 848*4882a593Smuzhiyun 849*4882a593Smuzhiyun # apply GHASH on num_initial_blocks blocks 850*4882a593Smuzhiyun 851*4882a593Smuzhiyun.if \i == 
5 852*4882a593Smuzhiyun pxor %xmm5, %xmm6 853*4882a593Smuzhiyun GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 854*4882a593Smuzhiyun pxor %xmm6, %xmm7 855*4882a593Smuzhiyun GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 856*4882a593Smuzhiyun pxor %xmm7, %xmm8 857*4882a593Smuzhiyun GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 858*4882a593Smuzhiyun.elseif \i == 6 859*4882a593Smuzhiyun pxor %xmm6, %xmm7 860*4882a593Smuzhiyun GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 861*4882a593Smuzhiyun pxor %xmm7, %xmm8 862*4882a593Smuzhiyun GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 863*4882a593Smuzhiyun.elseif \i == 7 864*4882a593Smuzhiyun pxor %xmm7, %xmm8 865*4882a593Smuzhiyun GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1 866*4882a593Smuzhiyun.endif 867*4882a593Smuzhiyun cmp $64, %r13 868*4882a593Smuzhiyun jl _initial_blocks_done\@ 869*4882a593Smuzhiyun # no need for precomputed values 870*4882a593Smuzhiyun/* 871*4882a593Smuzhiyun* 872*4882a593Smuzhiyun* Precomputations for HashKey parallel with encryption of first 4 blocks. 
873*4882a593Smuzhiyun* Haskey_i_k holds XORed values of the low and high parts of the Haskey_i 874*4882a593Smuzhiyun*/ 875*4882a593Smuzhiyun MOVADQ ONE(%RIP),\TMP1 876*4882a593Smuzhiyun paddd \TMP1, \XMM0 # INCR Y0 877*4882a593Smuzhiyun MOVADQ \XMM0, \XMM1 878*4882a593Smuzhiyun pshufb %xmm14, \XMM1 # perform a 16 byte swap 879*4882a593Smuzhiyun 880*4882a593Smuzhiyun paddd \TMP1, \XMM0 # INCR Y0 881*4882a593Smuzhiyun MOVADQ \XMM0, \XMM2 882*4882a593Smuzhiyun pshufb %xmm14, \XMM2 # perform a 16 byte swap 883*4882a593Smuzhiyun 884*4882a593Smuzhiyun paddd \TMP1, \XMM0 # INCR Y0 885*4882a593Smuzhiyun MOVADQ \XMM0, \XMM3 886*4882a593Smuzhiyun pshufb %xmm14, \XMM3 # perform a 16 byte swap 887*4882a593Smuzhiyun 888*4882a593Smuzhiyun paddd \TMP1, \XMM0 # INCR Y0 889*4882a593Smuzhiyun MOVADQ \XMM0, \XMM4 890*4882a593Smuzhiyun pshufb %xmm14, \XMM4 # perform a 16 byte swap 891*4882a593Smuzhiyun 892*4882a593Smuzhiyun MOVADQ 0(%arg1),\TMP1 893*4882a593Smuzhiyun pxor \TMP1, \XMM1 894*4882a593Smuzhiyun pxor \TMP1, \XMM2 895*4882a593Smuzhiyun pxor \TMP1, \XMM3 896*4882a593Smuzhiyun pxor \TMP1, \XMM4 897*4882a593Smuzhiyun.irpc index, 1234 # do 4 rounds 898*4882a593Smuzhiyun movaps 0x10*\index(%arg1), \TMP1 899*4882a593Smuzhiyun aesenc \TMP1, \XMM1 900*4882a593Smuzhiyun aesenc \TMP1, \XMM2 901*4882a593Smuzhiyun aesenc \TMP1, \XMM3 902*4882a593Smuzhiyun aesenc \TMP1, \XMM4 903*4882a593Smuzhiyun.endr 904*4882a593Smuzhiyun.irpc index, 56789 # do next 5 rounds 905*4882a593Smuzhiyun movaps 0x10*\index(%arg1), \TMP1 906*4882a593Smuzhiyun aesenc \TMP1, \XMM1 907*4882a593Smuzhiyun aesenc \TMP1, \XMM2 908*4882a593Smuzhiyun aesenc \TMP1, \XMM3 909*4882a593Smuzhiyun aesenc \TMP1, \XMM4 910*4882a593Smuzhiyun.endr 911*4882a593Smuzhiyun lea 0xa0(%arg1),%r10 912*4882a593Smuzhiyun mov keysize,%eax 913*4882a593Smuzhiyun shr $2,%eax # 128->4, 192->6, 256->8 914*4882a593Smuzhiyun sub $4,%eax # 128->0, 192->2, 256->4 915*4882a593Smuzhiyun jz aes_loop_pre_done\@ 916*4882a593Smuzhiyun 
917*4882a593Smuzhiyunaes_loop_pre_\@: 918*4882a593Smuzhiyun MOVADQ (%r10),\TMP2 919*4882a593Smuzhiyun.irpc index, 1234 920*4882a593Smuzhiyun aesenc \TMP2, %xmm\index 921*4882a593Smuzhiyun.endr 922*4882a593Smuzhiyun add $16,%r10 923*4882a593Smuzhiyun sub $1,%eax 924*4882a593Smuzhiyun jnz aes_loop_pre_\@ 925*4882a593Smuzhiyun 926*4882a593Smuzhiyunaes_loop_pre_done\@: 927*4882a593Smuzhiyun MOVADQ (%r10), \TMP2 928*4882a593Smuzhiyun aesenclast \TMP2, \XMM1 929*4882a593Smuzhiyun aesenclast \TMP2, \XMM2 930*4882a593Smuzhiyun aesenclast \TMP2, \XMM3 931*4882a593Smuzhiyun aesenclast \TMP2, \XMM4 932*4882a593Smuzhiyun movdqu 16*0(%arg4 , %r11 , 1), \TMP1 933*4882a593Smuzhiyun pxor \TMP1, \XMM1 934*4882a593Smuzhiyun.ifc \operation, dec 935*4882a593Smuzhiyun movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 936*4882a593Smuzhiyun movdqa \TMP1, \XMM1 937*4882a593Smuzhiyun.endif 938*4882a593Smuzhiyun movdqu 16*1(%arg4 , %r11 , 1), \TMP1 939*4882a593Smuzhiyun pxor \TMP1, \XMM2 940*4882a593Smuzhiyun.ifc \operation, dec 941*4882a593Smuzhiyun movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 942*4882a593Smuzhiyun movdqa \TMP1, \XMM2 943*4882a593Smuzhiyun.endif 944*4882a593Smuzhiyun movdqu 16*2(%arg4 , %r11 , 1), \TMP1 945*4882a593Smuzhiyun pxor \TMP1, \XMM3 946*4882a593Smuzhiyun.ifc \operation, dec 947*4882a593Smuzhiyun movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 948*4882a593Smuzhiyun movdqa \TMP1, \XMM3 949*4882a593Smuzhiyun.endif 950*4882a593Smuzhiyun movdqu 16*3(%arg4 , %r11 , 1), \TMP1 951*4882a593Smuzhiyun pxor \TMP1, \XMM4 952*4882a593Smuzhiyun.ifc \operation, dec 953*4882a593Smuzhiyun movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 954*4882a593Smuzhiyun movdqa \TMP1, \XMM4 955*4882a593Smuzhiyun.else 956*4882a593Smuzhiyun movdqu \XMM1, 16*0(%arg3 , %r11 , 1) 957*4882a593Smuzhiyun movdqu \XMM2, 16*1(%arg3 , %r11 , 1) 958*4882a593Smuzhiyun movdqu \XMM3, 16*2(%arg3 , %r11 , 1) 959*4882a593Smuzhiyun movdqu \XMM4, 16*3(%arg3 , %r11 , 1) 960*4882a593Smuzhiyun.endif 961*4882a593Smuzhiyun 962*4882a593Smuzhiyun add $64, %r11 
963*4882a593Smuzhiyun pshufb %xmm14, \XMM1 # perform a 16 byte swap 964*4882a593Smuzhiyun pxor \XMMDst, \XMM1 965*4882a593Smuzhiyun# combine GHASHed value with the corresponding ciphertext 966*4882a593Smuzhiyun pshufb %xmm14, \XMM2 # perform a 16 byte swap 967*4882a593Smuzhiyun pshufb %xmm14, \XMM3 # perform a 16 byte swap 968*4882a593Smuzhiyun pshufb %xmm14, \XMM4 # perform a 16 byte swap 969*4882a593Smuzhiyun 970*4882a593Smuzhiyun_initial_blocks_done\@: 971*4882a593Smuzhiyun 972*4882a593Smuzhiyun.endm 973*4882a593Smuzhiyun 974*4882a593Smuzhiyun/* 975*4882a593Smuzhiyun* encrypt 4 blocks at a time 976*4882a593Smuzhiyun* ghash the 4 previously encrypted ciphertext blocks 977*4882a593Smuzhiyun* arg1, %arg3, %arg4 are used as pointers only, not modified 978*4882a593Smuzhiyun* %r11 is the data offset value 979*4882a593Smuzhiyun*/ 980*4882a593Smuzhiyun.macro GHASH_4_ENCRYPT_4_PARALLEL_enc TMP1 TMP2 TMP3 TMP4 TMP5 \ 981*4882a593SmuzhiyunTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 982*4882a593Smuzhiyun 983*4882a593Smuzhiyun movdqa \XMM1, \XMM5 984*4882a593Smuzhiyun movdqa \XMM2, \XMM6 985*4882a593Smuzhiyun movdqa \XMM3, \XMM7 986*4882a593Smuzhiyun movdqa \XMM4, \XMM8 987*4882a593Smuzhiyun 988*4882a593Smuzhiyun movdqa SHUF_MASK(%rip), %xmm15 989*4882a593Smuzhiyun # multiply TMP5 * HashKey using karatsuba 990*4882a593Smuzhiyun 991*4882a593Smuzhiyun movdqa \XMM5, \TMP4 992*4882a593Smuzhiyun pshufd $78, \XMM5, \TMP6 993*4882a593Smuzhiyun pxor \XMM5, \TMP6 994*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 995*4882a593Smuzhiyun movdqu HashKey_4(%arg2), \TMP5 996*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 997*4882a593Smuzhiyun movdqa \XMM0, \XMM1 998*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 999*4882a593Smuzhiyun movdqa \XMM0, \XMM2 1000*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1001*4882a593Smuzhiyun movdqa \XMM0, \XMM3 1002*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1003*4882a593Smuzhiyun movdqa \XMM0, 
\XMM4 1004*4882a593Smuzhiyun pshufb %xmm15, \XMM1 # perform a 16 byte swap 1005*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1006*4882a593Smuzhiyun pshufb %xmm15, \XMM2 # perform a 16 byte swap 1007*4882a593Smuzhiyun pshufb %xmm15, \XMM3 # perform a 16 byte swap 1008*4882a593Smuzhiyun pshufb %xmm15, \XMM4 # perform a 16 byte swap 1009*4882a593Smuzhiyun 1010*4882a593Smuzhiyun pxor (%arg1), \XMM1 1011*4882a593Smuzhiyun pxor (%arg1), \XMM2 1012*4882a593Smuzhiyun pxor (%arg1), \XMM3 1013*4882a593Smuzhiyun pxor (%arg1), \XMM4 1014*4882a593Smuzhiyun movdqu HashKey_4_k(%arg2), \TMP5 1015*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1016*4882a593Smuzhiyun movaps 0x10(%arg1), \TMP1 1017*4882a593Smuzhiyun aesenc \TMP1, \XMM1 # Round 1 1018*4882a593Smuzhiyun aesenc \TMP1, \XMM2 1019*4882a593Smuzhiyun aesenc \TMP1, \XMM3 1020*4882a593Smuzhiyun aesenc \TMP1, \XMM4 1021*4882a593Smuzhiyun movaps 0x20(%arg1), \TMP1 1022*4882a593Smuzhiyun aesenc \TMP1, \XMM1 # Round 2 1023*4882a593Smuzhiyun aesenc \TMP1, \XMM2 1024*4882a593Smuzhiyun aesenc \TMP1, \XMM3 1025*4882a593Smuzhiyun aesenc \TMP1, \XMM4 1026*4882a593Smuzhiyun movdqa \XMM6, \TMP1 1027*4882a593Smuzhiyun pshufd $78, \XMM6, \TMP2 1028*4882a593Smuzhiyun pxor \XMM6, \TMP2 1029*4882a593Smuzhiyun movdqu HashKey_3(%arg2), \TMP5 1030*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 1031*4882a593Smuzhiyun movaps 0x30(%arg1), \TMP3 1032*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 3 1033*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1034*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1035*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1036*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1037*4882a593Smuzhiyun movaps 0x40(%arg1), \TMP3 1038*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 4 1039*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1040*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1041*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1042*4882a593Smuzhiyun movdqu HashKey_3_k(%arg2), \TMP5 
1043*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1044*4882a593Smuzhiyun movaps 0x50(%arg1), \TMP3 1045*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 5 1046*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1047*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1048*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1049*4882a593Smuzhiyun pxor \TMP1, \TMP4 1050*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1051*4882a593Smuzhiyun pxor \XMM6, \XMM5 1052*4882a593Smuzhiyun pxor \TMP2, \TMP6 1053*4882a593Smuzhiyun movdqa \XMM7, \TMP1 1054*4882a593Smuzhiyun pshufd $78, \XMM7, \TMP2 1055*4882a593Smuzhiyun pxor \XMM7, \TMP2 1056*4882a593Smuzhiyun movdqu HashKey_2(%arg2), \TMP5 1057*4882a593Smuzhiyun 1058*4882a593Smuzhiyun # Multiply TMP5 * HashKey using karatsuba 1059*4882a593Smuzhiyun 1060*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1061*4882a593Smuzhiyun movaps 0x60(%arg1), \TMP3 1062*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 6 1063*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1064*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1065*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1066*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1067*4882a593Smuzhiyun movaps 0x70(%arg1), \TMP3 1068*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 7 1069*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1070*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1071*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1072*4882a593Smuzhiyun movdqu HashKey_2_k(%arg2), \TMP5 1073*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1074*4882a593Smuzhiyun movaps 0x80(%arg1), \TMP3 1075*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 8 1076*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1077*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1078*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1079*4882a593Smuzhiyun pxor \TMP1, \TMP4 1080*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1081*4882a593Smuzhiyun pxor \XMM7, \XMM5 1082*4882a593Smuzhiyun pxor \TMP2, \TMP6 
1083*4882a593Smuzhiyun 1084*4882a593Smuzhiyun # Multiply XMM8 * HashKey 1085*4882a593Smuzhiyun # XMM8 and TMP5 hold the values for the two operands 1086*4882a593Smuzhiyun 1087*4882a593Smuzhiyun movdqa \XMM8, \TMP1 1088*4882a593Smuzhiyun pshufd $78, \XMM8, \TMP2 1089*4882a593Smuzhiyun pxor \XMM8, \TMP2 1090*4882a593Smuzhiyun movdqu HashKey(%arg2), \TMP5 1091*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1092*4882a593Smuzhiyun movaps 0x90(%arg1), \TMP3 1093*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 9 1094*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1095*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1096*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1097*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1098*4882a593Smuzhiyun lea 0xa0(%arg1),%r10 1099*4882a593Smuzhiyun mov keysize,%eax 1100*4882a593Smuzhiyun shr $2,%eax # 128->4, 192->6, 256->8 1101*4882a593Smuzhiyun sub $4,%eax # 128->0, 192->2, 256->4 1102*4882a593Smuzhiyun jz aes_loop_par_enc_done\@ 1103*4882a593Smuzhiyun 1104*4882a593Smuzhiyunaes_loop_par_enc\@: 1105*4882a593Smuzhiyun MOVADQ (%r10),\TMP3 1106*4882a593Smuzhiyun.irpc index, 1234 1107*4882a593Smuzhiyun aesenc \TMP3, %xmm\index 1108*4882a593Smuzhiyun.endr 1109*4882a593Smuzhiyun add $16,%r10 1110*4882a593Smuzhiyun sub $1,%eax 1111*4882a593Smuzhiyun jnz aes_loop_par_enc\@ 1112*4882a593Smuzhiyun 1113*4882a593Smuzhiyunaes_loop_par_enc_done\@: 1114*4882a593Smuzhiyun MOVADQ (%r10), \TMP3 1115*4882a593Smuzhiyun aesenclast \TMP3, \XMM1 # Round 10 1116*4882a593Smuzhiyun aesenclast \TMP3, \XMM2 1117*4882a593Smuzhiyun aesenclast \TMP3, \XMM3 1118*4882a593Smuzhiyun aesenclast \TMP3, \XMM4 1119*4882a593Smuzhiyun movdqu HashKey_k(%arg2), \TMP5 1120*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1121*4882a593Smuzhiyun movdqu (%arg4,%r11,1), \TMP3 1122*4882a593Smuzhiyun pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1123*4882a593Smuzhiyun movdqu 16(%arg4,%r11,1), \TMP3 1124*4882a593Smuzhiyun pxor \TMP3, \XMM2 # 
Ciphertext/Plaintext XOR EK 1125*4882a593Smuzhiyun movdqu 32(%arg4,%r11,1), \TMP3 1126*4882a593Smuzhiyun pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1127*4882a593Smuzhiyun movdqu 48(%arg4,%r11,1), \TMP3 1128*4882a593Smuzhiyun pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1129*4882a593Smuzhiyun movdqu \XMM1, (%arg3,%r11,1) # Write to the ciphertext buffer 1130*4882a593Smuzhiyun movdqu \XMM2, 16(%arg3,%r11,1) # Write to the ciphertext buffer 1131*4882a593Smuzhiyun movdqu \XMM3, 32(%arg3,%r11,1) # Write to the ciphertext buffer 1132*4882a593Smuzhiyun movdqu \XMM4, 48(%arg3,%r11,1) # Write to the ciphertext buffer 1133*4882a593Smuzhiyun pshufb %xmm15, \XMM1 # perform a 16 byte swap 1134*4882a593Smuzhiyun pshufb %xmm15, \XMM2 # perform a 16 byte swap 1135*4882a593Smuzhiyun pshufb %xmm15, \XMM3 # perform a 16 byte swap 1136*4882a593Smuzhiyun pshufb %xmm15, \XMM4 # perform a 16 byte swap 1137*4882a593Smuzhiyun 1138*4882a593Smuzhiyun pxor \TMP4, \TMP1 1139*4882a593Smuzhiyun pxor \XMM8, \XMM5 1140*4882a593Smuzhiyun pxor \TMP6, \TMP2 1141*4882a593Smuzhiyun pxor \TMP1, \TMP2 1142*4882a593Smuzhiyun pxor \XMM5, \TMP2 1143*4882a593Smuzhiyun movdqa \TMP2, \TMP3 1144*4882a593Smuzhiyun pslldq $8, \TMP3 # left shift TMP3 2 DWs 1145*4882a593Smuzhiyun psrldq $8, \TMP2 # right shift TMP2 2 DWs 1146*4882a593Smuzhiyun pxor \TMP3, \XMM5 1147*4882a593Smuzhiyun pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1148*4882a593Smuzhiyun 1149*4882a593Smuzhiyun # first phase of reduction 1150*4882a593Smuzhiyun 1151*4882a593Smuzhiyun movdqa \XMM5, \TMP2 1152*4882a593Smuzhiyun movdqa \XMM5, \TMP3 1153*4882a593Smuzhiyun movdqa \XMM5, \TMP4 1154*4882a593Smuzhiyun# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1155*4882a593Smuzhiyun pslld $31, \TMP2 # packed right shift << 31 1156*4882a593Smuzhiyun pslld $30, \TMP3 # packed right shift << 30 1157*4882a593Smuzhiyun pslld $25, \TMP4 # packed right shift << 25 1158*4882a593Smuzhiyun pxor \TMP3, \TMP2 # xor the 
shifted versions 1159*4882a593Smuzhiyun pxor \TMP4, \TMP2 1160*4882a593Smuzhiyun movdqa \TMP2, \TMP5 1161*4882a593Smuzhiyun psrldq $4, \TMP5 # right shift T5 1 DW 1162*4882a593Smuzhiyun pslldq $12, \TMP2 # left shift T2 3 DWs 1163*4882a593Smuzhiyun pxor \TMP2, \XMM5 1164*4882a593Smuzhiyun 1165*4882a593Smuzhiyun # second phase of reduction 1166*4882a593Smuzhiyun 1167*4882a593Smuzhiyun movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1168*4882a593Smuzhiyun movdqa \XMM5,\TMP3 1169*4882a593Smuzhiyun movdqa \XMM5,\TMP4 1170*4882a593Smuzhiyun psrld $1, \TMP2 # packed left shift >>1 1171*4882a593Smuzhiyun psrld $2, \TMP3 # packed left shift >>2 1172*4882a593Smuzhiyun psrld $7, \TMP4 # packed left shift >>7 1173*4882a593Smuzhiyun pxor \TMP3,\TMP2 # xor the shifted versions 1174*4882a593Smuzhiyun pxor \TMP4,\TMP2 1175*4882a593Smuzhiyun pxor \TMP5, \TMP2 1176*4882a593Smuzhiyun pxor \TMP2, \XMM5 1177*4882a593Smuzhiyun pxor \TMP1, \XMM5 # result is in TMP1 1178*4882a593Smuzhiyun 1179*4882a593Smuzhiyun pxor \XMM5, \XMM1 1180*4882a593Smuzhiyun.endm 1181*4882a593Smuzhiyun 1182*4882a593Smuzhiyun/* 1183*4882a593Smuzhiyun* decrypt 4 blocks at a time 1184*4882a593Smuzhiyun* ghash the 4 previously decrypted ciphertext blocks 1185*4882a593Smuzhiyun* arg1, %arg3, %arg4 are used as pointers only, not modified 1186*4882a593Smuzhiyun* %r11 is the data offset value 1187*4882a593Smuzhiyun*/ 1188*4882a593Smuzhiyun.macro GHASH_4_ENCRYPT_4_PARALLEL_dec TMP1 TMP2 TMP3 TMP4 TMP5 \ 1189*4882a593SmuzhiyunTMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation 1190*4882a593Smuzhiyun 1191*4882a593Smuzhiyun movdqa \XMM1, \XMM5 1192*4882a593Smuzhiyun movdqa \XMM2, \XMM6 1193*4882a593Smuzhiyun movdqa \XMM3, \XMM7 1194*4882a593Smuzhiyun movdqa \XMM4, \XMM8 1195*4882a593Smuzhiyun 1196*4882a593Smuzhiyun movdqa SHUF_MASK(%rip), %xmm15 1197*4882a593Smuzhiyun # multiply TMP5 * HashKey using karatsuba 1198*4882a593Smuzhiyun 1199*4882a593Smuzhiyun movdqa \XMM5, \TMP4 1200*4882a593Smuzhiyun 
pshufd $78, \XMM5, \TMP6 1201*4882a593Smuzhiyun pxor \XMM5, \TMP6 1202*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1203*4882a593Smuzhiyun movdqu HashKey_4(%arg2), \TMP5 1204*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP4 # TMP4 = a1*b1 1205*4882a593Smuzhiyun movdqa \XMM0, \XMM1 1206*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1207*4882a593Smuzhiyun movdqa \XMM0, \XMM2 1208*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1209*4882a593Smuzhiyun movdqa \XMM0, \XMM3 1210*4882a593Smuzhiyun paddd ONE(%rip), \XMM0 # INCR CNT 1211*4882a593Smuzhiyun movdqa \XMM0, \XMM4 1212*4882a593Smuzhiyun pshufb %xmm15, \XMM1 # perform a 16 byte swap 1213*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM5 # XMM5 = a0*b0 1214*4882a593Smuzhiyun pshufb %xmm15, \XMM2 # perform a 16 byte swap 1215*4882a593Smuzhiyun pshufb %xmm15, \XMM3 # perform a 16 byte swap 1216*4882a593Smuzhiyun pshufb %xmm15, \XMM4 # perform a 16 byte swap 1217*4882a593Smuzhiyun 1218*4882a593Smuzhiyun pxor (%arg1), \XMM1 1219*4882a593Smuzhiyun pxor (%arg1), \XMM2 1220*4882a593Smuzhiyun pxor (%arg1), \XMM3 1221*4882a593Smuzhiyun pxor (%arg1), \XMM4 1222*4882a593Smuzhiyun movdqu HashKey_4_k(%arg2), \TMP5 1223*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0) 1224*4882a593Smuzhiyun movaps 0x10(%arg1), \TMP1 1225*4882a593Smuzhiyun aesenc \TMP1, \XMM1 # Round 1 1226*4882a593Smuzhiyun aesenc \TMP1, \XMM2 1227*4882a593Smuzhiyun aesenc \TMP1, \XMM3 1228*4882a593Smuzhiyun aesenc \TMP1, \XMM4 1229*4882a593Smuzhiyun movaps 0x20(%arg1), \TMP1 1230*4882a593Smuzhiyun aesenc \TMP1, \XMM1 # Round 2 1231*4882a593Smuzhiyun aesenc \TMP1, \XMM2 1232*4882a593Smuzhiyun aesenc \TMP1, \XMM3 1233*4882a593Smuzhiyun aesenc \TMP1, \XMM4 1234*4882a593Smuzhiyun movdqa \XMM6, \TMP1 1235*4882a593Smuzhiyun pshufd $78, \XMM6, \TMP2 1236*4882a593Smuzhiyun pxor \XMM6, \TMP2 1237*4882a593Smuzhiyun movdqu HashKey_3(%arg2), \TMP5 1238*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1 * b1 
1239*4882a593Smuzhiyun movaps 0x30(%arg1), \TMP3 1240*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 3 1241*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1242*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1243*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1244*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM6 # XMM6 = a0*b0 1245*4882a593Smuzhiyun movaps 0x40(%arg1), \TMP3 1246*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 4 1247*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1248*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1249*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1250*4882a593Smuzhiyun movdqu HashKey_3_k(%arg2), \TMP5 1251*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1252*4882a593Smuzhiyun movaps 0x50(%arg1), \TMP3 1253*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 5 1254*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1255*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1256*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1257*4882a593Smuzhiyun pxor \TMP1, \TMP4 1258*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1259*4882a593Smuzhiyun pxor \XMM6, \XMM5 1260*4882a593Smuzhiyun pxor \TMP2, \TMP6 1261*4882a593Smuzhiyun movdqa \XMM7, \TMP1 1262*4882a593Smuzhiyun pshufd $78, \XMM7, \TMP2 1263*4882a593Smuzhiyun pxor \XMM7, \TMP2 1264*4882a593Smuzhiyun movdqu HashKey_2(%arg2), \TMP5 1265*4882a593Smuzhiyun 1266*4882a593Smuzhiyun # Multiply TMP5 * HashKey using karatsuba 1267*4882a593Smuzhiyun 1268*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1269*4882a593Smuzhiyun movaps 0x60(%arg1), \TMP3 1270*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 6 1271*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1272*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1273*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1274*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM7 # XMM7 = a0*b0 1275*4882a593Smuzhiyun movaps 0x70(%arg1), \TMP3 1276*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 7 1277*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1278*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1279*4882a593Smuzhiyun aesenc \TMP3, 
\XMM4 1280*4882a593Smuzhiyun movdqu HashKey_2_k(%arg2), \TMP5 1281*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1282*4882a593Smuzhiyun movaps 0x80(%arg1), \TMP3 1283*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 8 1284*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1285*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1286*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1287*4882a593Smuzhiyun pxor \TMP1, \TMP4 1288*4882a593Smuzhiyun# accumulate the results in TMP4:XMM5, TMP6 holds the middle part 1289*4882a593Smuzhiyun pxor \XMM7, \XMM5 1290*4882a593Smuzhiyun pxor \TMP2, \TMP6 1291*4882a593Smuzhiyun 1292*4882a593Smuzhiyun # Multiply XMM8 * HashKey 1293*4882a593Smuzhiyun # XMM8 and TMP5 hold the values for the two operands 1294*4882a593Smuzhiyun 1295*4882a593Smuzhiyun movdqa \XMM8, \TMP1 1296*4882a593Smuzhiyun pshufd $78, \XMM8, \TMP2 1297*4882a593Smuzhiyun pxor \XMM8, \TMP2 1298*4882a593Smuzhiyun movdqu HashKey(%arg2), \TMP5 1299*4882a593Smuzhiyun pclmulqdq $0x11, \TMP5, \TMP1 # TMP1 = a1*b1 1300*4882a593Smuzhiyun movaps 0x90(%arg1), \TMP3 1301*4882a593Smuzhiyun aesenc \TMP3, \XMM1 # Round 9 1302*4882a593Smuzhiyun aesenc \TMP3, \XMM2 1303*4882a593Smuzhiyun aesenc \TMP3, \XMM3 1304*4882a593Smuzhiyun aesenc \TMP3, \XMM4 1305*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \XMM8 # XMM8 = a0*b0 1306*4882a593Smuzhiyun lea 0xa0(%arg1),%r10 1307*4882a593Smuzhiyun mov keysize,%eax 1308*4882a593Smuzhiyun shr $2,%eax # 128->4, 192->6, 256->8 1309*4882a593Smuzhiyun sub $4,%eax # 128->0, 192->2, 256->4 1310*4882a593Smuzhiyun jz aes_loop_par_dec_done\@ 1311*4882a593Smuzhiyun 1312*4882a593Smuzhiyunaes_loop_par_dec\@: 1313*4882a593Smuzhiyun MOVADQ (%r10),\TMP3 1314*4882a593Smuzhiyun.irpc index, 1234 1315*4882a593Smuzhiyun aesenc \TMP3, %xmm\index 1316*4882a593Smuzhiyun.endr 1317*4882a593Smuzhiyun add $16,%r10 1318*4882a593Smuzhiyun sub $1,%eax 1319*4882a593Smuzhiyun jnz aes_loop_par_dec\@ 1320*4882a593Smuzhiyun 1321*4882a593Smuzhiyunaes_loop_par_dec_done\@: 1322*4882a593Smuzhiyun MOVADQ 
(%r10), \TMP3 1323*4882a593Smuzhiyun aesenclast \TMP3, \XMM1 # last round 1324*4882a593Smuzhiyun aesenclast \TMP3, \XMM2 1325*4882a593Smuzhiyun aesenclast \TMP3, \XMM3 1326*4882a593Smuzhiyun aesenclast \TMP3, \XMM4 1327*4882a593Smuzhiyun movdqu HashKey_k(%arg2), \TMP5 1328*4882a593Smuzhiyun pclmulqdq $0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0) 1329*4882a593Smuzhiyun movdqu (%arg4,%r11,1), \TMP3 1330*4882a593Smuzhiyun pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK 1331*4882a593Smuzhiyun movdqu \XMM1, (%arg3,%r11,1) # Write to plaintext buffer 1332*4882a593Smuzhiyun movdqa \TMP3, \XMM1 1333*4882a593Smuzhiyun movdqu 16(%arg4,%r11,1), \TMP3 1334*4882a593Smuzhiyun pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK 1335*4882a593Smuzhiyun movdqu \XMM2, 16(%arg3,%r11,1) # Write to plaintext buffer 1336*4882a593Smuzhiyun movdqa \TMP3, \XMM2 1337*4882a593Smuzhiyun movdqu 32(%arg4,%r11,1), \TMP3 1338*4882a593Smuzhiyun pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK 1339*4882a593Smuzhiyun movdqu \XMM3, 32(%arg3,%r11,1) # Write to plaintext buffer 1340*4882a593Smuzhiyun movdqa \TMP3, \XMM3 1341*4882a593Smuzhiyun movdqu 48(%arg4,%r11,1), \TMP3 1342*4882a593Smuzhiyun pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK 1343*4882a593Smuzhiyun movdqu \XMM4, 48(%arg3,%r11,1) # Write to plaintext buffer 1344*4882a593Smuzhiyun movdqa \TMP3, \XMM4 1345*4882a593Smuzhiyun pshufb %xmm15, \XMM1 # perform a 16 byte swap 1346*4882a593Smuzhiyun pshufb %xmm15, \XMM2 # perform a 16 byte swap 1347*4882a593Smuzhiyun pshufb %xmm15, \XMM3 # perform a 16 byte swap 1348*4882a593Smuzhiyun pshufb %xmm15, \XMM4 # perform a 16 byte swap 1349*4882a593Smuzhiyun 1350*4882a593Smuzhiyun pxor \TMP4, \TMP1 1351*4882a593Smuzhiyun pxor \XMM8, \XMM5 1352*4882a593Smuzhiyun pxor \TMP6, \TMP2 1353*4882a593Smuzhiyun pxor \TMP1, \TMP2 1354*4882a593Smuzhiyun pxor \XMM5, \TMP2 1355*4882a593Smuzhiyun movdqa \TMP2, \TMP3 1356*4882a593Smuzhiyun pslldq $8, \TMP3 # left shift TMP3 2 DWs 1357*4882a593Smuzhiyun psrldq $8, \TMP2 # 
right shift TMP2 2 DWs 1358*4882a593Smuzhiyun pxor \TMP3, \XMM5 1359*4882a593Smuzhiyun pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5 1360*4882a593Smuzhiyun 1361*4882a593Smuzhiyun # first phase of reduction 1362*4882a593Smuzhiyun 1363*4882a593Smuzhiyun movdqa \XMM5, \TMP2 1364*4882a593Smuzhiyun movdqa \XMM5, \TMP3 1365*4882a593Smuzhiyun movdqa \XMM5, \TMP4 1366*4882a593Smuzhiyun# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently 1367*4882a593Smuzhiyun pslld $31, \TMP2 # packed right shift << 31 1368*4882a593Smuzhiyun pslld $30, \TMP3 # packed right shift << 30 1369*4882a593Smuzhiyun pslld $25, \TMP4 # packed right shift << 25 1370*4882a593Smuzhiyun pxor \TMP3, \TMP2 # xor the shifted versions 1371*4882a593Smuzhiyun pxor \TMP4, \TMP2 1372*4882a593Smuzhiyun movdqa \TMP2, \TMP5 1373*4882a593Smuzhiyun psrldq $4, \TMP5 # right shift T5 1 DW 1374*4882a593Smuzhiyun pslldq $12, \TMP2 # left shift T2 3 DWs 1375*4882a593Smuzhiyun pxor \TMP2, \XMM5 1376*4882a593Smuzhiyun 1377*4882a593Smuzhiyun # second phase of reduction 1378*4882a593Smuzhiyun 1379*4882a593Smuzhiyun movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4 1380*4882a593Smuzhiyun movdqa \XMM5,\TMP3 1381*4882a593Smuzhiyun movdqa \XMM5,\TMP4 1382*4882a593Smuzhiyun psrld $1, \TMP2 # packed left shift >>1 1383*4882a593Smuzhiyun psrld $2, \TMP3 # packed left shift >>2 1384*4882a593Smuzhiyun psrld $7, \TMP4 # packed left shift >>7 1385*4882a593Smuzhiyun pxor \TMP3,\TMP2 # xor the shifted versions 1386*4882a593Smuzhiyun pxor \TMP4,\TMP2 1387*4882a593Smuzhiyun pxor \TMP5, \TMP2 1388*4882a593Smuzhiyun pxor \TMP2, \XMM5 1389*4882a593Smuzhiyun pxor \TMP1, \XMM5 # result is in TMP1 1390*4882a593Smuzhiyun 1391*4882a593Smuzhiyun pxor \XMM5, \XMM1 1392*4882a593Smuzhiyun.endm 1393*4882a593Smuzhiyun 1394*4882a593Smuzhiyun/* GHASH the last 4 ciphertext blocks. 
*/
.macro	GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst

        # Karatsuba multiply XMM1 * HashKey^4
        # high half -> TMP6, low half -> XMMDst, middle term -> XMM1

	movdqa	  \XMM1, \TMP6
	pshufd	  $78, \XMM1, \TMP2
	pxor	  \XMM1, \TMP2			# TMP2 = a1+a0
	movdqu	  HashKey_4(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP6		# TMP6 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM1		# XMM1 = a0*b0
	movdqu	  HashKey_4_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	movdqa	  \XMM1, \XMMDst
	movdqa	  \TMP2, \XMM1			# result in TMP6, XMMDst, XMM1

        # Karatsuba multiply XMM2 * HashKey^3 and accumulate

	movdqa	  \XMM2, \TMP1
	pshufd	  $78, \XMM2, \TMP2
	pxor	  \XMM2, \TMP2			# TMP2 = a1+a0
	movdqu	  HashKey_3(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM2		# XMM2 = a0*b0
	movdqu	  HashKey_3_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM2, \XMMDst
	pxor	  \TMP2, \XMM1
# results accumulated in TMP6, XMMDst, XMM1

        # Karatsuba multiply XMM3 * HashKey^2 and accumulate

	movdqa	  \XMM3, \TMP1
	pshufd	  $78, \XMM3, \TMP2
	pxor	  \XMM3, \TMP2			# TMP2 = a1+a0
	movdqu	  HashKey_2(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM3		# XMM3 = a0*b0
	movdqu	  HashKey_2_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM3, \XMMDst
	pxor	  \TMP2, \XMM1	# results accumulated in TMP6, XMMDst, XMM1

        # Karatsuba multiply XMM4 * HashKey and accumulate
	movdqa	  \XMM4, \TMP1
	pshufd	  $78, \XMM4, \TMP2
	pxor	  \XMM4, \TMP2			# TMP2 = a1+a0
	movdqu	  HashKey(%arg2), \TMP5
	pclmulqdq $0x11, \TMP5, \TMP1		# TMP1 = a1*b1
	pclmulqdq $0x00, \TMP5, \XMM4		# XMM4 = a0*b0
	movdqu	  HashKey_k(%arg2), \TMP4
	pclmulqdq $0x00, \TMP4, \TMP2		# TMP2 = (a1+a0)*(b1+b0)
	pxor	  \TMP1, \TMP6
	pxor	  \XMM4, \XMMDst
	pxor	  \XMM1, \TMP2
	pxor	  \TMP6, \TMP2
	pxor	  \XMMDst, \TMP2
        # middle section of the temp results combined as in karatsuba algorithm
	movdqa	  \TMP2, \TMP4
	pslldq	  $8, \TMP4			# left shift TMP4 2 DWs
	psrldq	  $8, \TMP2			# right shift TMP2 2 DWs
	pxor	  \TMP4, \XMMDst
	pxor	  \TMP2, \TMP6
# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
	# first phase of the reduction
	movdqa    \XMMDst, \TMP2
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
	pslld     $31, \TMP2		# packed left shift << 31
	pslld     $30, \TMP3		# packed left shift << 30
	pslld     $25, \TMP4		# packed left shift << 25
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	movdqa    \TMP2, \TMP7
	psrldq    $4, \TMP7		# right shift TMP7 1 DW
	pslldq    $12, \TMP2		# left shift TMP2 3 DWs
	pxor      \TMP2, \XMMDst

        # second phase of the reduction
	movdqa    \XMMDst, \TMP2
	# make 3 copies of XMMDst for doing 3 shift operations
	movdqa    \XMMDst, \TMP3
	movdqa    \XMMDst, \TMP4
	psrld     $1, \TMP2		# packed right shift >> 1
	psrld     $2, \TMP3		# packed right shift >> 2
	psrld     $7, \TMP4		# packed right shift >> 7
	pxor      \TMP3, \TMP2		# xor the shifted versions
	pxor      \TMP4, \TMP2
	pxor      \TMP7, \TMP2
	pxor      \TMP2, \XMMDst
	pxor      \TMP6, \XMMDst	# reduced result is in XMMDst
.endm


/* Encryption of a single block.
 * Runs the full AES round sequence on \XMM0 with the expanded key at %arg1;
 * round count derived from keysize. Clobbers eax, r10 and \TMP1.
 */

.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1

	pxor		(%arg1), \XMM0		# round 0 (whitening) key
	mov		keysize,%eax
	shr		$2,%eax			# 128->4, 192->6, 256->8
	add		$5,%eax			# 128->9, 192->11, 256->13
	lea		16(%arg1), %r10		# get first expanded key address

_esb_loop_\@:
	MOVADQ		(%r10),\TMP1
	aesenc		\TMP1,\XMM0
	add		$16,%r10
	sub		$1,%eax
	jnz		_esb_loop_\@

	MOVADQ		(%r10),\TMP1
	aesenclast	\TMP1,\XMM0		# final round
.endm

/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                   struct gcm_context_data *data
*                                      // Context data
*                   u8 *out,           // Plaintext output. Encrypt in-place is allowed.
*                   const u8 *in,      // Ciphertext input
*                   u64 plaintext_len, // Length of data in bytes for decryption.
*                   u8 *iv,            // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                   u8 *hash_subkey,   // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                   const u8 *aad,     // Additional Authentication Data (AAD)
*                   u64 aad_len,       // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                   u8 *auth_tag,      // Authenticated Tag output. The driver will compare this to the
*                                      // given authentication tag and only return the plaintext if they match.
*                   u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
*                                      // (most likely), 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the first
*       set of 11 keys in the data structure void *aes_ctx
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                       AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                        AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
SYM_FUNC_START(aesni_gcm_dec)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
	GCM_ENC_DEC dec
	GCM_COMPLETE arg10, arg11		# auth_tag, auth_tag_len
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_dec)


/*****************************************************************************
* void aesni_gcm_enc(void *aes_ctx,    // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data
*                                      // Context data
*                    u8 *out,          // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,     // Plaintext input
*                    u64 plaintext_len, // Length of data in bytes for encryption.
*                    u8 *iv,           // Pre-counter block j0: 4 byte salt (from Security Association)
*                                      // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                      // concatenated with 0x00000001. 16-byte aligned pointer.
*                    u8 *hash_subkey,  // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                    const u8 *aad,    // Additional Authentication Data (AAD)
*                    u64 aad_len,      // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
*                    u8 *auth_tag,     // Authenticated Tag output.
*                    u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                      // 12 or 8.
*
* Assumptions:
*
* keys:
*       keys are pre-expanded and aligned to 16 bytes. we are using the
*       first set of 11 keys in the data structure void *aes_ctx
*
*
* iv:
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                             Salt  (From the SA)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     Initialization Vector                     |
*       |         (This is the sequence number from IPSec header)       |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x1                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*
*
* AAD:
*       AAD padded to 128 bits with 0
*       for example, assume AAD is a u32 vector
*
*       if AAD is 8 bytes:
*       AAD[3] = {A0, A1};
*       padded AAD in xmm register = {A1 A0 0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A1)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                     32-bit Sequence Number (A0)               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                                 AAD Format with 32-bit Sequence Number
*
*       if AAD is 12 bytes:
*       AAD[3] = {A0, A1, A2};
*       padded AAD in xmm register = {A2 A1 A0 0}
*
*       0                   1                   2                   3
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                               SPI (A2)                        |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                 64-bit Extended Sequence Number {A1,A0}       |
*       |                                                               |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*       |                              0x0                              |
*       +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
*
*                         AAD Format with 64-bit Extended Sequence Number
*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
SYM_FUNC_START(aesni_gcm_enc)
	FUNC_SAVE

	GCM_INIT %arg6, arg7, arg8, arg9	# iv, hash_subkey, aad, aad_len
	GCM_ENC_DEC enc

	GCM_COMPLETE arg10, arg11		# auth_tag, auth_tag_len
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_enc)

/*****************************************************************************
* void aesni_gcm_init(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                     struct gcm_context_data *data,
*                                         // context data
*                     u8 *iv,             // Pre-counter block j0: 4 byte salt (from Security Association)
*                                         // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
*                                         // concatenated with 0x00000001. 16-byte aligned pointer.
*                     u8 *hash_subkey,    // H, the Hash sub key input. Data starts on a 16-byte boundary.
*                     const u8 *aad,      // Additional Authentication Data (AAD)
*                     u64 aad_len)        // Length of AAD in bytes.
*/
SYM_FUNC_START(aesni_gcm_init)
	FUNC_SAVE
	GCM_INIT %arg3, %arg4,%arg5, %arg6	# iv, hash_subkey, aad, aad_len
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_init)

/*****************************************************************************
* void aesni_gcm_enc_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Ciphertext output. Encrypt in-place is allowed.
*                    const u8 *in,       // Plaintext input
*                    u64 plaintext_len,  // Length of data in bytes for encryption.
*/
SYM_FUNC_START(aesni_gcm_enc_update)
	FUNC_SAVE
	GCM_ENC_DEC enc
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_enc_update)

/*****************************************************************************
* void aesni_gcm_dec_update(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *out,            // Plaintext output. Decrypt in-place is allowed.
*                    const u8 *in,       // Ciphertext input
*                    u64 plaintext_len,  // Length of data in bytes for decryption.
*/
SYM_FUNC_START(aesni_gcm_dec_update)
	FUNC_SAVE
	GCM_ENC_DEC dec
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_dec_update)

/*****************************************************************************
* void aesni_gcm_finalize(void *aes_ctx,      // AES Key schedule. Starts on a 16 byte boundary.
*                    struct gcm_context_data *data,
*                                        // context data
*                    u8 *auth_tag,       // Authenticated Tag output.
*                    u64 auth_tag_len);  // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
*                                        // 12 or 8.
*/
SYM_FUNC_START(aesni_gcm_finalize)
	FUNC_SAVE
	GCM_COMPLETE %arg3 %arg4	# auth_tag, auth_tag_len
	FUNC_RESTORE
	RET
SYM_FUNC_END(aesni_gcm_finalize)

#endif


/*
 * Key-expansion helpers, internal ABI (called only from aesni_set_key):
 *   in:  xmm1 = aeskeygenassist output for this round
 *        xmm0 (and xmm2 for 192/256-bit) = previous round key material
 *        xmm4 = 0 (caller guarantees; used as shuffle scratch)
 *        TKEYP = destination for the next expanded round key(s)
 * each helper stores the new round key(s) at (TKEYP) and advances TKEYP.
 * _key_expansion_128 is an alias: the 128-bit schedule uses the same
 * computation as the first half of each 256-bit round.
 */
SYM_FUNC_START_LOCAL_ALIAS(_key_expansion_128)
SYM_FUNC_START_LOCAL(_key_expansion_256a)
	pshufd $0b11111111, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0
	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256a)
SYM_FUNC_END_ALIAS(_key_expansion_128)

SYM_FUNC_START_LOCAL(_key_expansion_192a)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	movaps %xmm2, %xmm6
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	# 192-bit schedule produces 1.5 round keys per step; this variant
	# repacks and emits two 16-byte key blocks
	movaps %xmm0, %xmm1
	shufps $0b01000100, %xmm0, %xmm6
	movaps %xmm6, (TKEYP)
	shufps $0b01001110, %xmm2, %xmm1
	movaps %xmm1, 0x10(TKEYP)
	add $0x20, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192a)

SYM_FUNC_START_LOCAL(_key_expansion_192b)
	pshufd $0b01010101, %xmm1, %xmm1
	shufps $0b00010000, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	shufps $0b10001100, %xmm0, %xmm4
	pxor %xmm4, %xmm0
	pxor %xmm1, %xmm0

	movaps %xmm2, %xmm5
	pslldq $4, %xmm5
	pshufd $0b11111111, %xmm0, %xmm3
	pxor %xmm3, %xmm2
	pxor %xmm5, %xmm2

	movaps %xmm0, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_192b)

SYM_FUNC_START_LOCAL(_key_expansion_256b)
	pshufd $0b10101010, %xmm1, %xmm1
	shufps $0b00010000, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	shufps $0b10001100, %xmm2, %xmm4
	pxor %xmm4, %xmm2
	pxor %xmm1, %xmm2
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	RET
SYM_FUNC_END(_key_expansion_256b)

/*
 * int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
 *                   unsigned int key_len)
 *
 * Expands in_key into the encryption key schedule at (KEYP), stores
 * key_len at 480(KEYP), then derives the decryption schedule (via
 * aesimc) in reverse order. Returns 0 in AREG.
 */
SYM_FUNC_START(aesni_set_key)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	movl (FRAME_OFFSET+8)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+12)(%esp), UKEYP	# in_key
	movl (FRAME_OFFSET+16)(%esp), %edx	# key_len
#endif
	movups (UKEYP), %xmm0		# user key (first 16 bytes)
	movaps %xmm0, (KEYP)
	lea 0x10(KEYP), TKEYP		# key addr
	movl %edx, 480(KEYP)
	pxor %xmm4, %xmm4		# xmm4 is assumed 0 in _key_expansion_x
	cmp $24, %dl
	jb .Lenc_key128
	je .Lenc_key192
	movups 0x10(UKEYP), %xmm2	# other user key
	movaps %xmm2, (TKEYP)
	add $0x10, TKEYP
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_256a
	aeskeygenassist $0x1, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_256a
	aeskeygenassist $0x2, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_256a
	aeskeygenassist $0x4, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_256a
	aeskeygenassist $0x8, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_256a
	aeskeygenassist $0x10, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_256a
	aeskeygenassist $0x20, %xmm0, %xmm1
	call _key_expansion_256b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_256a
	jmp .Ldec_key
.Lenc_key192:
	movq 0x10(UKEYP), %xmm2		# other user key
	aeskeygenassist $0x1, %xmm2, %xmm1	# round 1
	call _key_expansion_192a
	aeskeygenassist $0x2, %xmm2, %xmm1	# round 2
	call _key_expansion_192b
	aeskeygenassist $0x4, %xmm2, %xmm1	# round 3
	call _key_expansion_192a
	aeskeygenassist $0x8, %xmm2, %xmm1	# round 4
	call _key_expansion_192b
	aeskeygenassist $0x10, %xmm2, %xmm1	# round 5
	call _key_expansion_192a
	aeskeygenassist $0x20, %xmm2, %xmm1	# round 6
	call _key_expansion_192b
	aeskeygenassist $0x40, %xmm2, %xmm1	# round 7
	call _key_expansion_192a
	aeskeygenassist $0x80, %xmm2, %xmm1	# round 8
	call _key_expansion_192b
	jmp .Ldec_key
.Lenc_key128:
	aeskeygenassist $0x1, %xmm0, %xmm1	# round 1
	call _key_expansion_128
	aeskeygenassist $0x2, %xmm0, %xmm1	# round 2
	call _key_expansion_128
	aeskeygenassist $0x4, %xmm0, %xmm1	# round 3
	call _key_expansion_128
	aeskeygenassist $0x8, %xmm0, %xmm1	# round 4
	call _key_expansion_128
	aeskeygenassist $0x10, %xmm0, %xmm1	# round 5
	call _key_expansion_128
	aeskeygenassist $0x20, %xmm0, %xmm1	# round 6
	call _key_expansion_128
	aeskeygenassist $0x40, %xmm0, %xmm1	# round 7
	call _key_expansion_128
	aeskeygenassist $0x80, %xmm0, %xmm1	# round 8
	call _key_expansion_128
	aeskeygenassist $0x1b, %xmm0, %xmm1	# round 9
	call _key_expansion_128
	aeskeygenassist $0x36, %xmm0, %xmm1	# round 10
	call _key_expansion_128
.Ldec_key:
	# derive the decryption schedule: swap first/last round keys, then
	# aesimc each middle key into the mirror slot
	sub $0x10, TKEYP
	movaps (KEYP), %xmm0
	movaps (TKEYP), %xmm1
	movaps %xmm0, 240(TKEYP)
	movaps %xmm1, 240(KEYP)
	add $0x10, KEYP
	lea 240-16(TKEYP), UKEYP
.align 4
.Ldec_key_loop:
	movaps (KEYP), %xmm0
	aesimc %xmm0, %xmm1
	movaps %xmm1, (UKEYP)
	add $0x10, KEYP
	sub $0x10, UKEYP
	cmp TKEYP, KEYP
	jb .Ldec_key_loop
	xor AREG, AREG			# return 0 (success)
#ifndef __x86_64__
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_set_key)

/*
 * void aesni_enc(const void *ctx, u8 *dst, const u8 *src)
 *
 * Encrypts one 16-byte block from src into dst using the expanded key
 * schedule in ctx (round count read from 480(ctx)).
 */
SYM_FUNC_START(aesni_enc)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+12)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+16)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+20)(%esp), INP	# src
#endif
	movl 480(KEYP), KLEN		# key length
	movups (INP), STATE		# input
	call _aesni_enc1
	movups STATE, (OUTP)		# output
#ifndef __x86_64__
	popl KLEN
	popl KEYP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_enc)

/*
 * _aesni_enc1:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		round count
 *	STATE:		initial state (input)
 * output:
 *	STATE:		final state (output)
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc1)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE			# round 0
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .Lenc128
	lea 0x20(TKEYP), TKEYP
	je .Lenc192
	# 256-bit key: two extra rounds before falling through
	add $0x20, TKEYP
	movaps -0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x50(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc192:
	# 192-bit key: two extra rounds before falling through
	movaps -0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x30(TKEYP), KEY
	aesenc KEY, STATE
.align 4
.Lenc128:
	movaps -0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps -0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps (TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x10(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x20(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x30(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x40(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x50(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x60(TKEYP), KEY
	aesenc KEY, STATE
	movaps 0x70(TKEYP), KEY
	aesenclast KEY, STATE
	RET
SYM_FUNC_END(_aesni_enc1)

/*
 * _aesni_enc4:	internal ABI
 * input:
 *	KEYP:		key struct pointer
 *	KLEN:		round count
 *	STATE1:		initial state (input)
 *	STATE2
 *	STATE3
 *	STATE4
 * output:
 *	STATE1:		final state (output)
 *	STATE2
 *	STATE3
 *	STATE4
 * changed:
 *	KEY
 *	TKEYP (T1)
 */
SYM_FUNC_START_LOCAL(_aesni_enc4)
	movaps (KEYP), KEY		# key
	mov KEYP, TKEYP
	pxor KEY, STATE1		# round 0
	pxor KEY, STATE2
	pxor KEY, STATE3
	pxor KEY, STATE4
	add $0x30, TKEYP
	cmp $24, KLEN
	jb .L4enc128
	lea 0x20(TKEYP), TKEYP
	je .L4enc192
	add $0x20, TKEYP
	movaps
-0x60(TKEYP), KEY 2056*4882a593Smuzhiyun aesenc KEY, STATE1 2057*4882a593Smuzhiyun aesenc KEY, STATE2 2058*4882a593Smuzhiyun aesenc KEY, STATE3 2059*4882a593Smuzhiyun aesenc KEY, STATE4 2060*4882a593Smuzhiyun movaps -0x50(TKEYP), KEY 2061*4882a593Smuzhiyun aesenc KEY, STATE1 2062*4882a593Smuzhiyun aesenc KEY, STATE2 2063*4882a593Smuzhiyun aesenc KEY, STATE3 2064*4882a593Smuzhiyun aesenc KEY, STATE4 2065*4882a593Smuzhiyun#.align 4 2066*4882a593Smuzhiyun.L4enc192: 2067*4882a593Smuzhiyun movaps -0x40(TKEYP), KEY 2068*4882a593Smuzhiyun aesenc KEY, STATE1 2069*4882a593Smuzhiyun aesenc KEY, STATE2 2070*4882a593Smuzhiyun aesenc KEY, STATE3 2071*4882a593Smuzhiyun aesenc KEY, STATE4 2072*4882a593Smuzhiyun movaps -0x30(TKEYP), KEY 2073*4882a593Smuzhiyun aesenc KEY, STATE1 2074*4882a593Smuzhiyun aesenc KEY, STATE2 2075*4882a593Smuzhiyun aesenc KEY, STATE3 2076*4882a593Smuzhiyun aesenc KEY, STATE4 2077*4882a593Smuzhiyun#.align 4 2078*4882a593Smuzhiyun.L4enc128: 2079*4882a593Smuzhiyun movaps -0x20(TKEYP), KEY 2080*4882a593Smuzhiyun aesenc KEY, STATE1 2081*4882a593Smuzhiyun aesenc KEY, STATE2 2082*4882a593Smuzhiyun aesenc KEY, STATE3 2083*4882a593Smuzhiyun aesenc KEY, STATE4 2084*4882a593Smuzhiyun movaps -0x10(TKEYP), KEY 2085*4882a593Smuzhiyun aesenc KEY, STATE1 2086*4882a593Smuzhiyun aesenc KEY, STATE2 2087*4882a593Smuzhiyun aesenc KEY, STATE3 2088*4882a593Smuzhiyun aesenc KEY, STATE4 2089*4882a593Smuzhiyun movaps (TKEYP), KEY 2090*4882a593Smuzhiyun aesenc KEY, STATE1 2091*4882a593Smuzhiyun aesenc KEY, STATE2 2092*4882a593Smuzhiyun aesenc KEY, STATE3 2093*4882a593Smuzhiyun aesenc KEY, STATE4 2094*4882a593Smuzhiyun movaps 0x10(TKEYP), KEY 2095*4882a593Smuzhiyun aesenc KEY, STATE1 2096*4882a593Smuzhiyun aesenc KEY, STATE2 2097*4882a593Smuzhiyun aesenc KEY, STATE3 2098*4882a593Smuzhiyun aesenc KEY, STATE4 2099*4882a593Smuzhiyun movaps 0x20(TKEYP), KEY 2100*4882a593Smuzhiyun aesenc KEY, STATE1 2101*4882a593Smuzhiyun aesenc KEY, STATE2 2102*4882a593Smuzhiyun aesenc KEY, STATE3 
2103*4882a593Smuzhiyun aesenc KEY, STATE4 2104*4882a593Smuzhiyun movaps 0x30(TKEYP), KEY 2105*4882a593Smuzhiyun aesenc KEY, STATE1 2106*4882a593Smuzhiyun aesenc KEY, STATE2 2107*4882a593Smuzhiyun aesenc KEY, STATE3 2108*4882a593Smuzhiyun aesenc KEY, STATE4 2109*4882a593Smuzhiyun movaps 0x40(TKEYP), KEY 2110*4882a593Smuzhiyun aesenc KEY, STATE1 2111*4882a593Smuzhiyun aesenc KEY, STATE2 2112*4882a593Smuzhiyun aesenc KEY, STATE3 2113*4882a593Smuzhiyun aesenc KEY, STATE4 2114*4882a593Smuzhiyun movaps 0x50(TKEYP), KEY 2115*4882a593Smuzhiyun aesenc KEY, STATE1 2116*4882a593Smuzhiyun aesenc KEY, STATE2 2117*4882a593Smuzhiyun aesenc KEY, STATE3 2118*4882a593Smuzhiyun aesenc KEY, STATE4 2119*4882a593Smuzhiyun movaps 0x60(TKEYP), KEY 2120*4882a593Smuzhiyun aesenc KEY, STATE1 2121*4882a593Smuzhiyun aesenc KEY, STATE2 2122*4882a593Smuzhiyun aesenc KEY, STATE3 2123*4882a593Smuzhiyun aesenc KEY, STATE4 2124*4882a593Smuzhiyun movaps 0x70(TKEYP), KEY 2125*4882a593Smuzhiyun aesenclast KEY, STATE1 # last round 2126*4882a593Smuzhiyun aesenclast KEY, STATE2 2127*4882a593Smuzhiyun aesenclast KEY, STATE3 2128*4882a593Smuzhiyun aesenclast KEY, STATE4 2129*4882a593Smuzhiyun RET 2130*4882a593SmuzhiyunSYM_FUNC_END(_aesni_enc4) 2131*4882a593Smuzhiyun 2132*4882a593Smuzhiyun/* 2133*4882a593Smuzhiyun * void aesni_dec (const void *ctx, u8 *dst, const u8 *src) 2134*4882a593Smuzhiyun */ 2135*4882a593SmuzhiyunSYM_FUNC_START(aesni_dec) 2136*4882a593Smuzhiyun FRAME_BEGIN 2137*4882a593Smuzhiyun#ifndef __x86_64__ 2138*4882a593Smuzhiyun pushl KEYP 2139*4882a593Smuzhiyun pushl KLEN 2140*4882a593Smuzhiyun movl (FRAME_OFFSET+12)(%esp), KEYP # ctx 2141*4882a593Smuzhiyun movl (FRAME_OFFSET+16)(%esp), OUTP # dst 2142*4882a593Smuzhiyun movl (FRAME_OFFSET+20)(%esp), INP # src 2143*4882a593Smuzhiyun#endif 2144*4882a593Smuzhiyun mov 480(KEYP), KLEN # key length 2145*4882a593Smuzhiyun add $240, KEYP 2146*4882a593Smuzhiyun movups (INP), STATE # input 2147*4882a593Smuzhiyun call _aesni_dec1 2148*4882a593Smuzhiyun 
movups STATE, (OUTP) #output 2149*4882a593Smuzhiyun#ifndef __x86_64__ 2150*4882a593Smuzhiyun popl KLEN 2151*4882a593Smuzhiyun popl KEYP 2152*4882a593Smuzhiyun#endif 2153*4882a593Smuzhiyun FRAME_END 2154*4882a593Smuzhiyun RET 2155*4882a593SmuzhiyunSYM_FUNC_END(aesni_dec) 2156*4882a593Smuzhiyun 2157*4882a593Smuzhiyun/* 2158*4882a593Smuzhiyun * _aesni_dec1: internal ABI 2159*4882a593Smuzhiyun * input: 2160*4882a593Smuzhiyun * KEYP: key struct pointer 2161*4882a593Smuzhiyun * KLEN: key length 2162*4882a593Smuzhiyun * STATE: initial state (input) 2163*4882a593Smuzhiyun * output: 2164*4882a593Smuzhiyun * STATE: finial state (output) 2165*4882a593Smuzhiyun * changed: 2166*4882a593Smuzhiyun * KEY 2167*4882a593Smuzhiyun * TKEYP (T1) 2168*4882a593Smuzhiyun */ 2169*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_dec1) 2170*4882a593Smuzhiyun movaps (KEYP), KEY # key 2171*4882a593Smuzhiyun mov KEYP, TKEYP 2172*4882a593Smuzhiyun pxor KEY, STATE # round 0 2173*4882a593Smuzhiyun add $0x30, TKEYP 2174*4882a593Smuzhiyun cmp $24, KLEN 2175*4882a593Smuzhiyun jb .Ldec128 2176*4882a593Smuzhiyun lea 0x20(TKEYP), TKEYP 2177*4882a593Smuzhiyun je .Ldec192 2178*4882a593Smuzhiyun add $0x20, TKEYP 2179*4882a593Smuzhiyun movaps -0x60(TKEYP), KEY 2180*4882a593Smuzhiyun aesdec KEY, STATE 2181*4882a593Smuzhiyun movaps -0x50(TKEYP), KEY 2182*4882a593Smuzhiyun aesdec KEY, STATE 2183*4882a593Smuzhiyun.align 4 2184*4882a593Smuzhiyun.Ldec192: 2185*4882a593Smuzhiyun movaps -0x40(TKEYP), KEY 2186*4882a593Smuzhiyun aesdec KEY, STATE 2187*4882a593Smuzhiyun movaps -0x30(TKEYP), KEY 2188*4882a593Smuzhiyun aesdec KEY, STATE 2189*4882a593Smuzhiyun.align 4 2190*4882a593Smuzhiyun.Ldec128: 2191*4882a593Smuzhiyun movaps -0x20(TKEYP), KEY 2192*4882a593Smuzhiyun aesdec KEY, STATE 2193*4882a593Smuzhiyun movaps -0x10(TKEYP), KEY 2194*4882a593Smuzhiyun aesdec KEY, STATE 2195*4882a593Smuzhiyun movaps (TKEYP), KEY 2196*4882a593Smuzhiyun aesdec KEY, STATE 2197*4882a593Smuzhiyun movaps 0x10(TKEYP), KEY 2198*4882a593Smuzhiyun 
aesdec KEY, STATE 2199*4882a593Smuzhiyun movaps 0x20(TKEYP), KEY 2200*4882a593Smuzhiyun aesdec KEY, STATE 2201*4882a593Smuzhiyun movaps 0x30(TKEYP), KEY 2202*4882a593Smuzhiyun aesdec KEY, STATE 2203*4882a593Smuzhiyun movaps 0x40(TKEYP), KEY 2204*4882a593Smuzhiyun aesdec KEY, STATE 2205*4882a593Smuzhiyun movaps 0x50(TKEYP), KEY 2206*4882a593Smuzhiyun aesdec KEY, STATE 2207*4882a593Smuzhiyun movaps 0x60(TKEYP), KEY 2208*4882a593Smuzhiyun aesdec KEY, STATE 2209*4882a593Smuzhiyun movaps 0x70(TKEYP), KEY 2210*4882a593Smuzhiyun aesdeclast KEY, STATE 2211*4882a593Smuzhiyun RET 2212*4882a593SmuzhiyunSYM_FUNC_END(_aesni_dec1) 2213*4882a593Smuzhiyun 2214*4882a593Smuzhiyun/* 2215*4882a593Smuzhiyun * _aesni_dec4: internal ABI 2216*4882a593Smuzhiyun * input: 2217*4882a593Smuzhiyun * KEYP: key struct pointer 2218*4882a593Smuzhiyun * KLEN: key length 2219*4882a593Smuzhiyun * STATE1: initial state (input) 2220*4882a593Smuzhiyun * STATE2 2221*4882a593Smuzhiyun * STATE3 2222*4882a593Smuzhiyun * STATE4 2223*4882a593Smuzhiyun * output: 2224*4882a593Smuzhiyun * STATE1: finial state (output) 2225*4882a593Smuzhiyun * STATE2 2226*4882a593Smuzhiyun * STATE3 2227*4882a593Smuzhiyun * STATE4 2228*4882a593Smuzhiyun * changed: 2229*4882a593Smuzhiyun * KEY 2230*4882a593Smuzhiyun * TKEYP (T1) 2231*4882a593Smuzhiyun */ 2232*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(_aesni_dec4) 2233*4882a593Smuzhiyun movaps (KEYP), KEY # key 2234*4882a593Smuzhiyun mov KEYP, TKEYP 2235*4882a593Smuzhiyun pxor KEY, STATE1 # round 0 2236*4882a593Smuzhiyun pxor KEY, STATE2 2237*4882a593Smuzhiyun pxor KEY, STATE3 2238*4882a593Smuzhiyun pxor KEY, STATE4 2239*4882a593Smuzhiyun add $0x30, TKEYP 2240*4882a593Smuzhiyun cmp $24, KLEN 2241*4882a593Smuzhiyun jb .L4dec128 2242*4882a593Smuzhiyun lea 0x20(TKEYP), TKEYP 2243*4882a593Smuzhiyun je .L4dec192 2244*4882a593Smuzhiyun add $0x20, TKEYP 2245*4882a593Smuzhiyun movaps -0x60(TKEYP), KEY 2246*4882a593Smuzhiyun aesdec KEY, STATE1 2247*4882a593Smuzhiyun aesdec KEY, STATE2 
2248*4882a593Smuzhiyun aesdec KEY, STATE3 2249*4882a593Smuzhiyun aesdec KEY, STATE4 2250*4882a593Smuzhiyun movaps -0x50(TKEYP), KEY 2251*4882a593Smuzhiyun aesdec KEY, STATE1 2252*4882a593Smuzhiyun aesdec KEY, STATE2 2253*4882a593Smuzhiyun aesdec KEY, STATE3 2254*4882a593Smuzhiyun aesdec KEY, STATE4 2255*4882a593Smuzhiyun.align 4 2256*4882a593Smuzhiyun.L4dec192: 2257*4882a593Smuzhiyun movaps -0x40(TKEYP), KEY 2258*4882a593Smuzhiyun aesdec KEY, STATE1 2259*4882a593Smuzhiyun aesdec KEY, STATE2 2260*4882a593Smuzhiyun aesdec KEY, STATE3 2261*4882a593Smuzhiyun aesdec KEY, STATE4 2262*4882a593Smuzhiyun movaps -0x30(TKEYP), KEY 2263*4882a593Smuzhiyun aesdec KEY, STATE1 2264*4882a593Smuzhiyun aesdec KEY, STATE2 2265*4882a593Smuzhiyun aesdec KEY, STATE3 2266*4882a593Smuzhiyun aesdec KEY, STATE4 2267*4882a593Smuzhiyun.align 4 2268*4882a593Smuzhiyun.L4dec128: 2269*4882a593Smuzhiyun movaps -0x20(TKEYP), KEY 2270*4882a593Smuzhiyun aesdec KEY, STATE1 2271*4882a593Smuzhiyun aesdec KEY, STATE2 2272*4882a593Smuzhiyun aesdec KEY, STATE3 2273*4882a593Smuzhiyun aesdec KEY, STATE4 2274*4882a593Smuzhiyun movaps -0x10(TKEYP), KEY 2275*4882a593Smuzhiyun aesdec KEY, STATE1 2276*4882a593Smuzhiyun aesdec KEY, STATE2 2277*4882a593Smuzhiyun aesdec KEY, STATE3 2278*4882a593Smuzhiyun aesdec KEY, STATE4 2279*4882a593Smuzhiyun movaps (TKEYP), KEY 2280*4882a593Smuzhiyun aesdec KEY, STATE1 2281*4882a593Smuzhiyun aesdec KEY, STATE2 2282*4882a593Smuzhiyun aesdec KEY, STATE3 2283*4882a593Smuzhiyun aesdec KEY, STATE4 2284*4882a593Smuzhiyun movaps 0x10(TKEYP), KEY 2285*4882a593Smuzhiyun aesdec KEY, STATE1 2286*4882a593Smuzhiyun aesdec KEY, STATE2 2287*4882a593Smuzhiyun aesdec KEY, STATE3 2288*4882a593Smuzhiyun aesdec KEY, STATE4 2289*4882a593Smuzhiyun movaps 0x20(TKEYP), KEY 2290*4882a593Smuzhiyun aesdec KEY, STATE1 2291*4882a593Smuzhiyun aesdec KEY, STATE2 2292*4882a593Smuzhiyun aesdec KEY, STATE3 2293*4882a593Smuzhiyun aesdec KEY, STATE4 2294*4882a593Smuzhiyun movaps 0x30(TKEYP), KEY 
2295*4882a593Smuzhiyun aesdec KEY, STATE1 2296*4882a593Smuzhiyun aesdec KEY, STATE2 2297*4882a593Smuzhiyun aesdec KEY, STATE3 2298*4882a593Smuzhiyun aesdec KEY, STATE4 2299*4882a593Smuzhiyun movaps 0x40(TKEYP), KEY 2300*4882a593Smuzhiyun aesdec KEY, STATE1 2301*4882a593Smuzhiyun aesdec KEY, STATE2 2302*4882a593Smuzhiyun aesdec KEY, STATE3 2303*4882a593Smuzhiyun aesdec KEY, STATE4 2304*4882a593Smuzhiyun movaps 0x50(TKEYP), KEY 2305*4882a593Smuzhiyun aesdec KEY, STATE1 2306*4882a593Smuzhiyun aesdec KEY, STATE2 2307*4882a593Smuzhiyun aesdec KEY, STATE3 2308*4882a593Smuzhiyun aesdec KEY, STATE4 2309*4882a593Smuzhiyun movaps 0x60(TKEYP), KEY 2310*4882a593Smuzhiyun aesdec KEY, STATE1 2311*4882a593Smuzhiyun aesdec KEY, STATE2 2312*4882a593Smuzhiyun aesdec KEY, STATE3 2313*4882a593Smuzhiyun aesdec KEY, STATE4 2314*4882a593Smuzhiyun movaps 0x70(TKEYP), KEY 2315*4882a593Smuzhiyun aesdeclast KEY, STATE1 # last round 2316*4882a593Smuzhiyun aesdeclast KEY, STATE2 2317*4882a593Smuzhiyun aesdeclast KEY, STATE3 2318*4882a593Smuzhiyun aesdeclast KEY, STATE4 2319*4882a593Smuzhiyun RET 2320*4882a593SmuzhiyunSYM_FUNC_END(_aesni_dec4) 2321*4882a593Smuzhiyun 2322*4882a593Smuzhiyun/* 2323*4882a593Smuzhiyun * void aesni_ecb_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2324*4882a593Smuzhiyun * size_t len) 2325*4882a593Smuzhiyun */ 2326*4882a593SmuzhiyunSYM_FUNC_START(aesni_ecb_enc) 2327*4882a593Smuzhiyun FRAME_BEGIN 2328*4882a593Smuzhiyun#ifndef __x86_64__ 2329*4882a593Smuzhiyun pushl LEN 2330*4882a593Smuzhiyun pushl KEYP 2331*4882a593Smuzhiyun pushl KLEN 2332*4882a593Smuzhiyun movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2333*4882a593Smuzhiyun movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2334*4882a593Smuzhiyun movl (FRAME_OFFSET+24)(%esp), INP # src 2335*4882a593Smuzhiyun movl (FRAME_OFFSET+28)(%esp), LEN # len 2336*4882a593Smuzhiyun#endif 2337*4882a593Smuzhiyun test LEN, LEN # check length 2338*4882a593Smuzhiyun jz .Lecb_enc_ret 2339*4882a593Smuzhiyun mov 480(KEYP), KLEN 
2340*4882a593Smuzhiyun cmp $16, LEN 2341*4882a593Smuzhiyun jb .Lecb_enc_ret 2342*4882a593Smuzhiyun cmp $64, LEN 2343*4882a593Smuzhiyun jb .Lecb_enc_loop1 2344*4882a593Smuzhiyun.align 4 2345*4882a593Smuzhiyun.Lecb_enc_loop4: 2346*4882a593Smuzhiyun movups (INP), STATE1 2347*4882a593Smuzhiyun movups 0x10(INP), STATE2 2348*4882a593Smuzhiyun movups 0x20(INP), STATE3 2349*4882a593Smuzhiyun movups 0x30(INP), STATE4 2350*4882a593Smuzhiyun call _aesni_enc4 2351*4882a593Smuzhiyun movups STATE1, (OUTP) 2352*4882a593Smuzhiyun movups STATE2, 0x10(OUTP) 2353*4882a593Smuzhiyun movups STATE3, 0x20(OUTP) 2354*4882a593Smuzhiyun movups STATE4, 0x30(OUTP) 2355*4882a593Smuzhiyun sub $64, LEN 2356*4882a593Smuzhiyun add $64, INP 2357*4882a593Smuzhiyun add $64, OUTP 2358*4882a593Smuzhiyun cmp $64, LEN 2359*4882a593Smuzhiyun jge .Lecb_enc_loop4 2360*4882a593Smuzhiyun cmp $16, LEN 2361*4882a593Smuzhiyun jb .Lecb_enc_ret 2362*4882a593Smuzhiyun.align 4 2363*4882a593Smuzhiyun.Lecb_enc_loop1: 2364*4882a593Smuzhiyun movups (INP), STATE1 2365*4882a593Smuzhiyun call _aesni_enc1 2366*4882a593Smuzhiyun movups STATE1, (OUTP) 2367*4882a593Smuzhiyun sub $16, LEN 2368*4882a593Smuzhiyun add $16, INP 2369*4882a593Smuzhiyun add $16, OUTP 2370*4882a593Smuzhiyun cmp $16, LEN 2371*4882a593Smuzhiyun jge .Lecb_enc_loop1 2372*4882a593Smuzhiyun.Lecb_enc_ret: 2373*4882a593Smuzhiyun#ifndef __x86_64__ 2374*4882a593Smuzhiyun popl KLEN 2375*4882a593Smuzhiyun popl KEYP 2376*4882a593Smuzhiyun popl LEN 2377*4882a593Smuzhiyun#endif 2378*4882a593Smuzhiyun FRAME_END 2379*4882a593Smuzhiyun RET 2380*4882a593SmuzhiyunSYM_FUNC_END(aesni_ecb_enc) 2381*4882a593Smuzhiyun 2382*4882a593Smuzhiyun/* 2383*4882a593Smuzhiyun * void aesni_ecb_dec(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2384*4882a593Smuzhiyun * size_t len); 2385*4882a593Smuzhiyun */ 2386*4882a593SmuzhiyunSYM_FUNC_START(aesni_ecb_dec) 2387*4882a593Smuzhiyun FRAME_BEGIN 2388*4882a593Smuzhiyun#ifndef __x86_64__ 2389*4882a593Smuzhiyun pushl LEN 
2390*4882a593Smuzhiyun pushl KEYP 2391*4882a593Smuzhiyun pushl KLEN 2392*4882a593Smuzhiyun movl (FRAME_OFFSET+16)(%esp), KEYP # ctx 2393*4882a593Smuzhiyun movl (FRAME_OFFSET+20)(%esp), OUTP # dst 2394*4882a593Smuzhiyun movl (FRAME_OFFSET+24)(%esp), INP # src 2395*4882a593Smuzhiyun movl (FRAME_OFFSET+28)(%esp), LEN # len 2396*4882a593Smuzhiyun#endif 2397*4882a593Smuzhiyun test LEN, LEN 2398*4882a593Smuzhiyun jz .Lecb_dec_ret 2399*4882a593Smuzhiyun mov 480(KEYP), KLEN 2400*4882a593Smuzhiyun add $240, KEYP 2401*4882a593Smuzhiyun cmp $16, LEN 2402*4882a593Smuzhiyun jb .Lecb_dec_ret 2403*4882a593Smuzhiyun cmp $64, LEN 2404*4882a593Smuzhiyun jb .Lecb_dec_loop1 2405*4882a593Smuzhiyun.align 4 2406*4882a593Smuzhiyun.Lecb_dec_loop4: 2407*4882a593Smuzhiyun movups (INP), STATE1 2408*4882a593Smuzhiyun movups 0x10(INP), STATE2 2409*4882a593Smuzhiyun movups 0x20(INP), STATE3 2410*4882a593Smuzhiyun movups 0x30(INP), STATE4 2411*4882a593Smuzhiyun call _aesni_dec4 2412*4882a593Smuzhiyun movups STATE1, (OUTP) 2413*4882a593Smuzhiyun movups STATE2, 0x10(OUTP) 2414*4882a593Smuzhiyun movups STATE3, 0x20(OUTP) 2415*4882a593Smuzhiyun movups STATE4, 0x30(OUTP) 2416*4882a593Smuzhiyun sub $64, LEN 2417*4882a593Smuzhiyun add $64, INP 2418*4882a593Smuzhiyun add $64, OUTP 2419*4882a593Smuzhiyun cmp $64, LEN 2420*4882a593Smuzhiyun jge .Lecb_dec_loop4 2421*4882a593Smuzhiyun cmp $16, LEN 2422*4882a593Smuzhiyun jb .Lecb_dec_ret 2423*4882a593Smuzhiyun.align 4 2424*4882a593Smuzhiyun.Lecb_dec_loop1: 2425*4882a593Smuzhiyun movups (INP), STATE1 2426*4882a593Smuzhiyun call _aesni_dec1 2427*4882a593Smuzhiyun movups STATE1, (OUTP) 2428*4882a593Smuzhiyun sub $16, LEN 2429*4882a593Smuzhiyun add $16, INP 2430*4882a593Smuzhiyun add $16, OUTP 2431*4882a593Smuzhiyun cmp $16, LEN 2432*4882a593Smuzhiyun jge .Lecb_dec_loop1 2433*4882a593Smuzhiyun.Lecb_dec_ret: 2434*4882a593Smuzhiyun#ifndef __x86_64__ 2435*4882a593Smuzhiyun popl KLEN 2436*4882a593Smuzhiyun popl KEYP 2437*4882a593Smuzhiyun popl LEN 
2438*4882a593Smuzhiyun#endif 2439*4882a593Smuzhiyun FRAME_END 2440*4882a593Smuzhiyun RET 2441*4882a593SmuzhiyunSYM_FUNC_END(aesni_ecb_dec) 2442*4882a593Smuzhiyun 2443*4882a593Smuzhiyun/* 2444*4882a593Smuzhiyun * void aesni_cbc_enc(struct crypto_aes_ctx *ctx, const u8 *dst, u8 *src, 2445*4882a593Smuzhiyun * size_t len, u8 *iv) 2446*4882a593Smuzhiyun */ 2447*4882a593SmuzhiyunSYM_FUNC_START(aesni_cbc_enc) 2448*4882a593Smuzhiyun FRAME_BEGIN 2449*4882a593Smuzhiyun#ifndef __x86_64__ 2450*4882a593Smuzhiyun pushl IVP 2451*4882a593Smuzhiyun pushl LEN 2452*4882a593Smuzhiyun pushl KEYP 2453*4882a593Smuzhiyun pushl KLEN 2454*4882a593Smuzhiyun movl (FRAME_OFFSET+20)(%esp), KEYP # ctx 2455*4882a593Smuzhiyun movl (FRAME_OFFSET+24)(%esp), OUTP # dst 2456*4882a593Smuzhiyun movl (FRAME_OFFSET+28)(%esp), INP # src 2457*4882a593Smuzhiyun movl (FRAME_OFFSET+32)(%esp), LEN # len 2458*4882a593Smuzhiyun movl (FRAME_OFFSET+36)(%esp), IVP # iv 2459*4882a593Smuzhiyun#endif 2460*4882a593Smuzhiyun cmp $16, LEN 2461*4882a593Smuzhiyun jb .Lcbc_enc_ret 2462*4882a593Smuzhiyun mov 480(KEYP), KLEN 2463*4882a593Smuzhiyun movups (IVP), STATE # load iv as initial state 2464*4882a593Smuzhiyun.align 4 2465*4882a593Smuzhiyun.Lcbc_enc_loop: 2466*4882a593Smuzhiyun movups (INP), IN # load input 2467*4882a593Smuzhiyun pxor IN, STATE 2468*4882a593Smuzhiyun call _aesni_enc1 2469*4882a593Smuzhiyun movups STATE, (OUTP) # store output 2470*4882a593Smuzhiyun sub $16, LEN 2471*4882a593Smuzhiyun add $16, INP 2472*4882a593Smuzhiyun add $16, OUTP 2473*4882a593Smuzhiyun cmp $16, LEN 2474*4882a593Smuzhiyun jge .Lcbc_enc_loop 2475*4882a593Smuzhiyun movups STATE, (IVP) 2476*4882a593Smuzhiyun.Lcbc_enc_ret: 2477*4882a593Smuzhiyun#ifndef __x86_64__ 2478*4882a593Smuzhiyun popl KLEN 2479*4882a593Smuzhiyun popl KEYP 2480*4882a593Smuzhiyun popl LEN 2481*4882a593Smuzhiyun popl IVP 2482*4882a593Smuzhiyun#endif 2483*4882a593Smuzhiyun FRAME_END 2484*4882a593Smuzhiyun RET 2485*4882a593SmuzhiyunSYM_FUNC_END(aesni_cbc_enc) 
/*
 * void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CBC-decrypt len bytes.  Unlike encryption, decryption parallelizes:
 * four blocks are decrypted at once and then XORed with the preceding
 * ciphertext blocks.  The last full ciphertext block is written back to
 * *iv for chaining.
 */
SYM_FUNC_START(aesni_cbc_dec)
	FRAME_BEGIN
#ifndef __x86_64__
	pushl IVP
	pushl LEN
	pushl KEYP
	pushl KLEN
	movl (FRAME_OFFSET+20)(%esp), KEYP	# ctx
	movl (FRAME_OFFSET+24)(%esp), OUTP	# dst
	movl (FRAME_OFFSET+28)(%esp), INP	# src
	movl (FRAME_OFFSET+32)(%esp), LEN	# len
	movl (FRAME_OFFSET+36)(%esp), IVP	# iv
#endif
	cmp $16, LEN
	jb .Lcbc_dec_just_ret		# less than one block: nothing to do
	mov 480(KEYP), KLEN
	add $240, KEYP			# decryption round keys start at 240
	movups (IVP), IV
	cmp $64, LEN
	jb .Lcbc_dec_loop1
.align 4
.Lcbc_dec_loop4:
	movups (INP), IN1
	movaps IN1, STATE1
	movups 0x10(INP), IN2
	movaps IN2, STATE2
#ifdef __x86_64__
	movups 0x20(INP), IN3
	movaps IN3, STATE3
	movups 0x30(INP), IN4
	movaps IN4, STATE4
#else
	/*
	 * i386 has only 8 XMM registers, so IN1/IN2 are reused for blocks
	 * 3 and 4; blocks 1 and 2 are re-loaded from memory after the
	 * decryption call below.
	 */
	movups 0x20(INP), IN1
	movaps IN1, STATE3
	movups 0x30(INP), IN2
	movaps IN2, STATE4
#endif
	call _aesni_dec4
	pxor IV, STATE1			# first block chains off the IV
#ifdef __x86_64__
	pxor IN1, STATE2		# remaining blocks chain off the
	pxor IN2, STATE3		# preceding ciphertext block
	pxor IN3, STATE4
	movaps IN4, IV			# last ciphertext becomes next IV
#else
	pxor IN1, STATE4
	movaps IN2, IV
	movups (INP), IN1		# re-load ciphertext blocks 1 and 2
	pxor IN1, STATE2
	movups 0x10(INP), IN2
	pxor IN2, STATE3
#endif
	movups STATE1, (OUTP)
	movups STATE2, 0x10(OUTP)
	movups STATE3, 0x20(OUTP)
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lcbc_dec_loop4
	cmp $16, LEN
	jb .Lcbc_dec_ret
.align 4
.Lcbc_dec_loop1:
	movups (INP), IN
	movaps IN, STATE
	call _aesni_dec1
	pxor IV, STATE
	movups STATE, (OUTP)
	movaps IN, IV			# this ciphertext chains the next block
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lcbc_dec_loop1
.Lcbc_dec_ret:
	movups IV, (IVP)		# save chaining value
.Lcbc_dec_just_ret:
#ifndef __x86_64__
	popl KLEN
	popl KEYP
	popl LEN
	popl IVP
#endif
	FRAME_END
	RET
SYM_FUNC_END(aesni_cbc_dec)

#ifdef __x86_64__
.pushsection .rodata
.align 16
.Lbswap_mask:
	# pshufb control reversing all 16 bytes (big <-> little endian)
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
.popsection

/*
 * _aesni_inc_init:	internal ABI
 *	setup registers used by _aesni_inc
 * input:
 *	IV
 * output:
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 */
SYM_FUNC_START_LOCAL(_aesni_inc_init)
	movaps .Lbswap_mask, BSWAP_MASK
	movaps IV, CTR
	pshufb BSWAP_MASK, CTR		# big-endian IV -> little-endian CTR
	mov $1, TCTR_LOW
	movq TCTR_LOW, INC		# INC = 1 in the low qword
	movq CTR, TCTR_LOW		# scalar copy of CTR's low qword,
					# used for carry detection
	RET
SYM_FUNC_END(_aesni_inc_init)

/*
 * _aesni_inc:		internal ABI
 *	Increase IV by 1, IV is in big endian
 * input:
 *	IV
 *	CTR:	== IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 *	INC:	== 1, in little endian
 *	BSWAP_MASK == endian swapping mask
 * output:
 *	IV:	Increased by 1
 * changed:
 *	CTR:	== output IV, in little endian
 *	TCTR_LOW: == lower qword of CTR
 */
SYM_FUNC_START_LOCAL(_aesni_inc)
	paddq INC, CTR			# bump low qword of the counter
	add $1, TCTR_LOW		# mirror in scalar reg to see the carry
	jnc .Linc_low			# no carry: low qword didn't wrap
	pslldq $8, INC			# carry: shift INC into the high qword,
	paddq INC, CTR			# propagate the carry,
	psrldq $8, INC			# then restore INC
.Linc_low:
	movaps CTR, IV
	pshufb BSWAP_MASK, IV		# back to big endian
	RET
SYM_FUNC_END(_aesni_inc)

/*
 * void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src,
 *		      size_t len, u8 *iv)
 *
 * CTR-mode encryption (x86_64 only): encrypt incrementing counter
 * blocks and XOR the keystream with the input.  The advanced counter
 * is stored back to *iv.
 */
SYM_FUNC_START(aesni_ctr_enc)
	FRAME_BEGIN
	cmp $16, LEN
	jb .Lctr_enc_just_ret
	mov 480(KEYP), KLEN
	movups (IVP), IV
	call _aesni_inc_init
	cmp $64, LEN
	jb .Lctr_enc_loop1
.align 4
.Lctr_enc_loop4:
	movaps IV, STATE1
	call _aesni_inc
	movups (INP), IN1
	movaps IV, STATE2
	call _aesni_inc
	movups 0x10(INP), IN2
	movaps IV, STATE3
	call _aesni_inc
	movups 0x20(INP), IN3
	movaps IV, STATE4
	call _aesni_inc
	movups 0x30(INP), IN4
	call _aesni_enc4
	pxor IN1, STATE1		# keystream XOR plaintext
	movups STATE1, (OUTP)
	pxor IN2, STATE2
	movups STATE2, 0x10(OUTP)
	pxor IN3, STATE3
	movups STATE3, 0x20(OUTP)
	pxor IN4, STATE4
	movups STATE4, 0x30(OUTP)
	sub $64, LEN
	add $64, INP
	add $64, OUTP
	cmp $64, LEN
	jge .Lctr_enc_loop4
	cmp $16, LEN
	jb .Lctr_enc_ret
.align 4
.Lctr_enc_loop1:
	movaps IV, STATE
	call _aesni_inc
	movups (INP), IN
	call _aesni_enc1
	pxor IN, STATE
	movups STATE, (OUTP)
	sub $16, LEN
	add $16, INP
	add $16, OUTP
	cmp $16, LEN
	jge .Lctr_enc_loop1
.Lctr_enc_ret:
	movups IV, (IVP)		# save advanced counter
.Lctr_enc_just_ret:
	FRAME_END
	RET
SYM_FUNC_END(aesni_ctr_enc)

/*
 * _aesni_gf128mul_x_ble:	internal ABI
 *	Multiply in GF(2^128) for XTS IVs: tweak = tweak * x, i.e. a
 *	left shift of the 128-bit value with conditional reduction by
 *	the polynomial 0x87 (the pshufd/psrad pair broadcasts each
 *	qword's sign bit to build the reduction mask).
 * input:
 *	IV:	current IV
 *	GF128MUL_MASK == mask with 0x87 and 0x01
 * output:
 *	IV:	next IV
 * changed:
 *	CTR:	== temporary value
 */
#define _aesni_gf128mul_x_ble() \
	pshufd $0x13, IV, CTR; \
	paddq IV, IV; \
	psrad $31, CTR; \
	pand GF128MUL_MASK, CTR; \
	pxor CTR, IV;

/*
 * void aesni_xts_encrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-encrypt len bytes.  The per-block tweaks are staged in the output
 * buffer, the four blocks are encrypted, then the tweaks are read back
 * and XORed in — this frees registers for _aesni_enc4.
 * NOTE(review): the loop only handles whole 64-byte chunks (ja after
 * sub $64); the caller must guarantee len is a non-zero multiple of
 * 64 — confirm against the glue code.
 */
SYM_FUNC_START(aesni_xts_encrypt)
	FRAME_BEGIN

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV		# initial tweak

	mov 480(KEYP), KLEN

.Lxts_enc_loop4:
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1		# plaintext XOR tweak
	movdqu IV, 0x00(OUTP)		# stash tweak in the output buffer

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_enc4

	movdqu 0x00(OUTP), INC		# read the stashed tweak back
	pxor INC, STATE1		# ciphertext XOR tweak
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		# advance tweak for the next chunk

	add $64, INP
	add $64, OUTP
	sub $64, LEN
	ja .Lxts_enc_loop4

	movups IV, (IVP)		# save next tweak

	FRAME_END
	RET
SYM_FUNC_END(aesni_xts_encrypt)

/*
 * void aesni_xts_decrypt(const struct crypto_aes_ctx *ctx, u8 *dst,
 *			  const u8 *src, unsigned int len, le128 *iv)
 *
 * XTS-decrypt len bytes; mirrors aesni_xts_encrypt but uses the inverse
 * key schedule (offset 240) and _aesni_dec4.  Same 64-byte-multiple
 * length assumption as the encrypt side.
 */
SYM_FUNC_START(aesni_xts_decrypt)
	FRAME_BEGIN

	movdqa .Lgf128mul_x_ble_mask, GF128MUL_MASK
	movups (IVP), IV		# initial tweak

	mov 480(KEYP), KLEN
	add $240, KEYP			# decryption round keys start at 240

.Lxts_dec_loop4:
	movdqa IV, STATE1
	movdqu 0x00(INP), INC
	pxor INC, STATE1		# ciphertext XOR tweak
	movdqu IV, 0x00(OUTP)		# stash tweak in the output buffer

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE2
	movdqu 0x10(INP), INC
	pxor INC, STATE2
	movdqu IV, 0x10(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE3
	movdqu 0x20(INP), INC
	pxor INC, STATE3
	movdqu IV, 0x20(OUTP)

	_aesni_gf128mul_x_ble()
	movdqa IV, STATE4
	movdqu 0x30(INP), INC
	pxor INC, STATE4
	movdqu IV, 0x30(OUTP)

	call _aesni_dec4

	movdqu 0x00(OUTP), INC		# read the stashed tweak back
	pxor INC, STATE1		# plaintext XOR tweak
	movdqu STATE1, 0x00(OUTP)

	movdqu 0x10(OUTP), INC
	pxor INC, STATE2
	movdqu STATE2, 0x10(OUTP)

	movdqu 0x20(OUTP), INC
	pxor INC, STATE3
	movdqu STATE3, 0x20(OUTP)

	movdqu 0x30(OUTP), INC
	pxor INC, STATE4
	movdqu STATE4, 0x30(OUTP)

	_aesni_gf128mul_x_ble()		# advance tweak for the next chunk

	add $64, INP
	add $64, OUTP
	sub $64, LEN
	ja .Lxts_dec_loop4

	movups IV, (IVP)		# save next tweak

	FRAME_END
	RET
SYM_FUNC_END(aesni_xts_decrypt)

#endif