/*
 *	Implement AES CTR mode by8 optimization with AVX instructions. (x86_64)
 *
 * This is AES128/192/256 CTR mode optimization implementation. It requires
 * the support of Intel(R) AESNI and AVX instructions.
 *
 * This work was inspired by the AES CTR mode optimization published
 * in Intel Optimized IPSEC Cryptographic library.
 * Additional information on it can be found at:
 *    http://downloadcenter.intel.com/Detail_Desc.aspx?agr=Y&DwnldID=22972
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * James Guilford <james.guilford@intel.com>
 * Sean Gulley <sean.m.gulley@intel.com>
 * Chandramouli Narayanan <mouli@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2014 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in
 * the documentation and/or other materials provided with the
 * distribution.
 * Neither the name of Intel Corporation nor the names of its
 * contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Syntax: GNU as, AT&T operand order (source, ..., destination), run through
 * the C preprocessor.  The entry points take their arguments in
 * %rdi, %rsi, %rdx, %rcx, %r8 (see the p_xxx and num_bytes defines below).
 */

#include <linux/linkage.h>

#define VMOVDQ		vmovdqu

/*
 * Register roles
 *
 * xdata0..xdata7  one working block each: counter value, then AES state,
 *                 then output data
 * xcounter        counter of the next block to generate; kept byte-reversed
 *                 relative to the IV in memory so that it can be advanced
 *                 with vpaddq
 * xbyteswap       copy of byteswap_const
 * xkey0/4/8/12    round keys kept live across main-loop iterations; which
 *                 round each one holds depends on the key length (see the
 *                 loads at .Lmult_of_8_blks)
 * xkeyA, xkeyB    scratch: round keys that are reloaded on every pass, and
 *                 the input data for the final XOR
 */
#define xdata0		%xmm0
#define xdata1		%xmm1
#define xdata2		%xmm2
#define xdata3		%xmm3
#define xdata4		%xmm4
#define xdata5		%xmm5
#define xdata6		%xmm6
#define xdata7		%xmm7
#define xcounter	%xmm8
#define xbyteswap	%xmm9
#define xkey0		%xmm10
#define xkey4		%xmm11
#define xkey8		%xmm12
#define xkey12		%xmm13
#define xkeyA		%xmm14
#define xkeyB		%xmm15

#define p_in		%rdi
#define p_iv		%rsi
#define p_keys		%rdx
#define p_out		%rcx
#define num_bytes	%r8
#define num_bytes32	%r8d	/* 32-bit view of num_bytes */

#define tmp		%r10
/* selector values for the 'club' macro; only XDATA has an effect there */
#define	DDQ_DATA	0
#define	XDATA		1
/* key_len selectors; they also become the label suffix in do_aes_ctrmain */
#define KEY_128		1
#define KEY_192		2
#define KEY_256		3

.section .rodata
/* byteswap_const is loaded with vmovdqa, so the table must stay 16-byte aligned */
.align 16

/* vpshufb control that reverses the order of all 16 bytes */
byteswap_const:
	.octa 0x000102030405060708090A0B0C0D0E0F
/* selects the low quadword; used with vptest to detect a wrap to zero */
ddq_low_msk:
	.octa 0x0000000000000000FFFFFFFFFFFFFFFF
/* adds 1 to the high quadword, i.e. the carry out of the low quadword */
ddq_high_add_1:
	.octa 0x00000000000000010000000000000000
/*
 * ddq_add_1 .. ddq_add_8 must stay contiguous and in this order: do_aes
 * addresses them as ddq_add_1 + 16 * (n - 1).
 */
ddq_add_1:
	.octa 0x00000000000000000000000000000001
ddq_add_2:
	.octa 0x00000000000000000000000000000002
ddq_add_3:
	.octa 0x00000000000000000000000000000003
ddq_add_4:
	.octa 0x00000000000000000000000000000004
ddq_add_5:
	.octa 0x00000000000000000000000000000005
ddq_add_6:
	.octa 0x00000000000000000000000000000006
ddq_add_7:
	.octa 0x00000000000000000000000000000007
ddq_add_8:
	.octa 0x00000000000000000000000000000008

.text

/* generate a unique variable for xmm register: var_xdata = %xmm<n> */
.macro setxdata n
	var_xdata = %xmm\n
.endm

/*
 * club the numeric 'id' to the symbol 'name'
 *
 * With name == XDATA this makes var_xdata refer to %xmm<id>.  .altmacro is
 * switched on so that %\id passes the current value of the symbol 'id'
 * rather than its name.  Any other 'name' is accepted and ignored.
 */

.macro club name, id
.altmacro
	.if \name == XDATA
		setxdata %\id
	.endif
.noaltmacro
.endm

/*
 * do_aes num_in_par load_keys key_len
 * This increments p_in, but not p_out
 *
 * Encrypts 'b' (1..8) consecutive counter blocks, XORs the result with 'b'
 * blocks read from p_in and writes 'b' blocks to p_out.
 *
 *   b        number of blocks handled in parallel (xdata0 .. xdata<b-1>)
 *   k        non-zero: also (re)load the cached round keys xkey0, xkey4,
 *            xkey8 and xkey12; zero: they must already be loaded
 *   key_len  KEY_128, KEY_192 or KEY_256
 *
 * On exit xcounter has been advanced by 'b' and p_in by 16 * b.
 * Clobbers xdata0..xdata<b-1>, xkeyA, xkeyB and the flags.
 * Round keys are read with vmovdqa, so p_keys has to be 16-byte aligned.
 * All loads from p_in are issued before the first store to p_out.
 */
.macro do_aes b, k, key_len
	.set by, \b
	.set load_keys, \k
	.set klen, \key_len

	.if (load_keys)
		vmovdqa	0*16(p_keys), xkey0
	.endif

	/* block 0 uses the current counter, converted back to IV byte order */
	vpshufb	xbyteswap, xcounter, xdata0

	/*
	 * Blocks 1 .. by-1 use xcounter + i.  vpaddq adds the two quadwords
	 * independently, so the carry is propagated by hand: if the low
	 * quadword of the sum is zero it has wrapped, and the high quadword
	 * of both this block and of xcounter is bumped.  Bumping xcounter
	 * makes the carry visible to all later blocks; their low quadword is
	 * then non-zero, so the carry is applied only once.
	 */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpaddq	(ddq_add_1 + 16 * (i - 1))(%rip), xcounter, var_xdata
		vptest	ddq_low_msk(%rip), var_xdata
		jnz 1f
		vpaddq	ddq_high_add_1(%rip), var_xdata, var_xdata
		vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
		1:
		vpshufb	xbyteswap, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	vmovdqa	1*16(p_keys), xkeyA

	vpxor	xkey0, xdata0, xdata0
	/* advance the counter past this batch, with the same manual carry */
	vpaddq	(ddq_add_1 + 16 * (by - 1))(%rip), xcounter, xcounter
	vptest	ddq_low_msk(%rip), xcounter
	jnz	1f
	vpaddq	ddq_high_add_1(%rip), xcounter, xcounter
	1:

	/* round 0 (key whitening) for the remaining blocks */
	.set i, 1
	.rept (by - 1)
		club XDATA, i
		vpxor	xkey0, var_xdata, var_xdata
		.set i, (i +1)
	.endr

	/*
	 * From here on each round key is loaded one round ahead of the
	 * vaesenc sequence that consumes it.
	 */
	vmovdqa	2*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 1 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	3*16(p_keys), xkey4
		.endif
	.else
		vmovdqa	3*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyB, var_xdata, var_xdata		/* key 2 */
		.set i, (i +1)
	.endr

	/* the input is addressed at negative offsets from p_in further down */
	add	$(16*by), p_in

	.if (klen == KEY_128)
		vmovdqa	4*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	4*16(p_keys), xkey4
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 3 */
		.if (klen == KEY_128)
			vaesenc	xkey4, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	5*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 4 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey4, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	6*16(p_keys), xkey8
		.endif
	.else
		vmovdqa	6*16(p_keys), xkeyB
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 5 */
		.set i, (i +1)
	.endr

	vmovdqa	7*16(p_keys), xkeyA

	.set i, 0
	.rept by
		club XDATA, i
		/* key 6 */
		.if (klen == KEY_128)
			vaesenc	xkey8, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		vmovdqa	8*16(p_keys), xkeyB
	.else
		.if (load_keys)
			vmovdqa	8*16(p_keys), xkey8
		.endif
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		vaesenc	xkeyA, var_xdata, var_xdata		/* key 7 */
		.set i, (i +1)
	.endr

	.if (klen == KEY_128)
		.if (load_keys)
			vmovdqa	9*16(p_keys), xkey12
		.endif
	.else
		vmovdqa	9*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 8 */
		.if (klen == KEY_128)
			vaesenc	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkey8, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	vmovdqa	10*16(p_keys), xkeyB

	.set i, 0
	.rept by
		club XDATA, i
		/* key 9 */
		.if (klen == KEY_128)
			vaesenc	xkey12, var_xdata, var_xdata
		.else
			vaesenc	xkeyA, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		vmovdqa	11*16(p_keys), xkeyA
	.endif

	.set i, 0
	.rept by
		club XDATA, i
		/* key 10: last round for KEY_128 */
		.if (klen == KEY_128)
			vaesenclast	xkeyB, var_xdata, var_xdata
		.else
			vaesenc	xkeyB, var_xdata, var_xdata
		.endif
		.set i, (i +1)
	.endr

	.if (klen != KEY_128)
		.if (load_keys)
			vmovdqa	12*16(p_keys), xkey12
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			vaesenc	xkeyA, var_xdata, var_xdata	/* key 11 */
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	13*16(p_keys), xkeyA
		.endif

		.set i, 0
		.rept by
			club XDATA, i
			.if (klen == KEY_256)
				/* key 12 */
				vaesenc	xkey12, var_xdata, var_xdata
			.else
				/* key 12: last round for KEY_192 */
				vaesenclast xkey12, var_xdata, var_xdata
			.endif
			.set i, (i +1)
		.endr

		.if (klen == KEY_256)
			vmovdqa	14*16(p_keys), xkeyB

			.set i, 0
			.rept by
				club XDATA, i
				/* key 13 */
				vaesenc	xkeyA, var_xdata, var_xdata
				.set i, (i +1)
			.endr

			.set i, 0
			.rept by
				club XDATA, i
				/* key 14: last round for KEY_256 */
				vaesenclast	xkeyB, var_xdata, var_xdata
				.set i, (i +1)
			.endr
		.endif
	.endif

	/*
	 * XOR the encrypted counters with the input, two blocks per step.
	 * p_in already points past the batch, hence the "- 16*by" offsets.
	 * xkeyA/xkeyB are free again and serve as load targets.
	 */
	.set i, 0
	.rept (by / 2)
		.set j, (i+1)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		VMOVDQ	(j*16 - 16*by)(p_in), xkeyB
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
		club XDATA, j
		vpxor	xkeyB, var_xdata, var_xdata
		.set i, (i+2)
	.endr

	/* odd 'by': one block is left over */
	.if (i < by)
		VMOVDQ	(i*16 - 16*by)(p_in), xkeyA
		club XDATA, i
		vpxor	xkeyA, var_xdata, var_xdata
	.endif

	/* p_out is not advanced here; the caller of the macro does that */
	.set i, 0
	.rept by
		club XDATA, i
		VMOVDQ	var_xdata, i*16(p_out)
		.set i, (i+1)
	.endr
.endm

/* do_aes for 'val' blocks that also loads the cached round keys */
.macro do_aes_load val, key_len
	do_aes \val, 1, \key_len
.endm

/* do_aes for 'val' blocks that relies on the cached round keys being loaded */
.macro do_aes_noload val, key_len
	do_aes \val, 0, \key_len
.endm

/*
 * main body of aes ctr load
 *
 * do_aes_ctrmain key_len
 *
 * Complete function body (it ends in RET) for one key length.  key_len is
 * pasted into every label, so the macro can be expanded once per key length
 * in the same file.
 *
 * Only whole 16-byte blocks are processed; a trailing partial block is
 * ignored.  The (num_bytes / 16) % 8 leading blocks are handled first in one
 * batch, after which the remainder is processed eight blocks at a time.
 *
 * If at least one block was processed, the IV at p_iv is replaced by the
 * counter value that follows the last processed block.  Otherwise the IV is
 * left untouched.
 *
 * Clobbers p_in, p_out, num_bytes, tmp, %xmm0-%xmm15 and the flags.
 */

.macro do_aes_ctrmain key_len
	/*
	 * num_bytes is declared as a 32-bit 'unsigned int', and the calling
	 * convention does not promise that bits 63:32 of its register are
	 * zero.  The 32-bit 'and' zero-extends the value and also drops a
	 * trailing partial block.  Without the latter, a length that is not a
	 * multiple of 16 could enter the main loop below, whose sub/jne pair
	 * would then never reach zero.
	 */
	and	$-16, num_bytes32
	/*
	 * Nothing to do: leave without touching the IV.  xcounter and
	 * xbyteswap are not loaded yet, so the IV write-back must be skipped.
	 */
	jz	.Ldo_return_noiv\key_len

	/* load the IV and reverse its bytes so that vpaddq can advance it */
	vmovdqa	byteswap_const(%rip), xbyteswap
	vmovdqu	(p_iv), xcounter
	vpshufb	xbyteswap, xcounter, xcounter

	/* tmp = 16 * (number of blocks modulo 8) */
	mov	num_bytes, tmp
	and	$(7*16), tmp
	jz	.Lmult_of_8_blks\key_len

	/* 1 <= tmp <= 7 */
	cmp	$(4*16), tmp
	jg	.Lgt4\key_len
	je	.Leq4\key_len

.Llt4\key_len:
	cmp	$(2*16), tmp
	jg	.Leq3\key_len
	je	.Leq2\key_len

	/*
	 * Each .LeqN handler processes N blocks and loads the cached round
	 * keys on the way.  It then reduces num_bytes to the bytes that are
	 * left in whole groups of eight blocks.
	 */
.Leq1\key_len:
	do_aes_load	1, \key_len
	add	$(1*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq2\key_len:
	do_aes_load	2, \key_len
	add	$(2*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len


.Leq3\key_len:
	do_aes_load	3, \key_len
	add	$(3*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq4\key_len:
	do_aes_load	4, \key_len
	add	$(4*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Lgt4\key_len:
	cmp	$(6*16), tmp
	jg	.Leq7\key_len
	je	.Leq6\key_len

.Leq5\key_len:
	do_aes_load	5, \key_len
	add	$(5*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq6\key_len:
	do_aes_load	6, \key_len
	add	$(6*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

.Leq7\key_len:
	do_aes_load	7, \key_len
	add	$(7*16), p_out
	and	$(~7*16), num_bytes
	jz	.Ldo_return2\key_len
	jmp	.Lmain_loop2\key_len

	/*
	 * No leading batch ran, so the cached round keys are loaded here.
	 * These are the same key slots that do_aes loads when load_keys is
	 * set.
	 */
.Lmult_of_8_blks\key_len:
	.if (\key_len != KEY_128)
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	4*16(p_keys), xkey4
		vmovdqa	8*16(p_keys), xkey8
		vmovdqa	12*16(p_keys), xkey12
	.else
		vmovdqa	0*16(p_keys), xkey0
		vmovdqa	3*16(p_keys), xkey4
		vmovdqa	6*16(p_keys), xkey8
		vmovdqa	9*16(p_keys), xkey12
	.endif
.align 16
.Lmain_loop2\key_len:
	/* num_bytes is a non-zero multiple of 8 blocks (8*16 bytes) */
	do_aes_noload	8, \key_len
	add	$(8*16), p_out
	sub	$(8*16), num_bytes
	jne	.Lmain_loop2\key_len

.Ldo_return2\key_len:
	/* return updated IV */
	vpshufb	xbyteswap, xcounter, xcounter
	vmovdqu	xcounter, (p_iv)
.Ldo_return_noiv\key_len:
	RET
.endm

/*
 * routine to do AES128 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_128_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * in        %rdi  input data
 * iv        %rsi  16-byte counter block, updated on return
 * keys      %rdx  round keys 0..10, 16 bytes each, 16-byte aligned
 * out       %rcx  output data
 * num_bytes %r8d  length in bytes; only whole 16-byte blocks are processed
 */
SYM_FUNC_START(aes_ctr_enc_128_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_128

SYM_FUNC_END(aes_ctr_enc_128_avx_by8)

/*
 * routine to do AES192 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_192_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Same arguments as aes_ctr_enc_128_avx_by8; keys holds round keys 0..12.
 */
SYM_FUNC_START(aes_ctr_enc_192_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_192

SYM_FUNC_END(aes_ctr_enc_192_avx_by8)

/*
 * routine to do AES256 CTR enc/decrypt "by8"
 * XMM registers are clobbered.
 * Saving/restoring must be done at a higher level
 * aes_ctr_enc_256_avx_by8(void *in, void *iv, void *keys, void *out,
 *			unsigned int num_bytes)
 *
 * Same arguments as aes_ctr_enc_128_avx_by8; keys holds round keys 0..14.
 */
SYM_FUNC_START(aes_ctr_enc_256_avx_by8)
	/* call the aes main loop */
	do_aes_ctrmain KEY_256

SYM_FUNC_END(aes_ctr_enc_256_avx_by8)