/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2/AES-NI assembler implementation of Camellia
 *
 * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/nospec-branch.h>

#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
#define CTX %rdi
#define RIO %r8

/**********************************************************************
  helper macros
 **********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

/**********************************************************************
  32-way camellia
 **********************************************************************/
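/*
 * Data layout: after byteslicing, the sixteen ymm registers hold the
 * 32 blocks in transposed form (32 blocks x 16 bytes = 512 bytes =
 * 16 registers x 32 bytes), so each register carries one byte position
 * of the Camellia state taken from every block. This is what lets the
 * per-byte s-box work below run on all 32 blocks at once.
 */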
/*
 * IN:
 *   x0..x7: byte-sliced AB state
 *   mem_cd: register pointer storing CD state
 *   key: index for key material
 * OUT:
 *   x0..x7: new byte-sliced CD state
 */
#define roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, t0, t1, t2, t3, t4, t5, t6, \
                  t7, mem_cd, key) \
        /* \
         * S-function with AES subbytes \
         */ \
        vbroadcasti128 .Linv_shift_row, t4; \
        vpbroadcastd .L0f0f0f0f, t7; \
        vbroadcasti128 .Lpre_tf_lo_s1, t5; \
        vbroadcasti128 .Lpre_tf_hi_s1, t6; \
        vbroadcasti128 .Lpre_tf_lo_s4, t2; \
        vbroadcasti128 .Lpre_tf_hi_s4, t3; \
        \
        /* AES inverse shift rows */ \
        vpshufb t4, x0, x0; \
        vpshufb t4, x7, x7; \
        vpshufb t4, x3, x3; \
        vpshufb t4, x6, x6; \
        vpshufb t4, x2, x2; \
        vpshufb t4, x5, x5; \
        vpshufb t4, x1, x1; \
        vpshufb t4, x4, x4; \
        \
        /* prefilter sboxes 1, 2 and 3 */ \
        /* prefilter sbox 4 */ \
        filter_8bit(x0, t5, t6, t7, t4); \
        filter_8bit(x7, t5, t6, t7, t4); \
        vextracti128 $1, x0, t0##_x; \
        vextracti128 $1, x7, t1##_x; \
        filter_8bit(x3, t2, t3, t7, t4); \
        filter_8bit(x6, t2, t3, t7, t4); \
        vextracti128 $1, x3, t3##_x; \
        vextracti128 $1, x6, t2##_x; \
        filter_8bit(x2, t5, t6, t7, t4); \
        filter_8bit(x5, t5, t6, t7, t4); \
        filter_8bit(x1, t5, t6, t7, t4); \
        filter_8bit(x4, t5, t6, t7, t4); \
        \
        vpxor t4##_x, t4##_x, t4##_x; \
        \
        /* AES subbytes + AES shift rows */ \
        vextracti128 $1, x2, t6##_x; \
        vextracti128 $1, x5, t5##_x; \
        vaesenclast t4##_x, x0##_x, x0##_x; \
        vaesenclast t4##_x, t0##_x, t0##_x; \
        vinserti128 $1, t0##_x, x0, x0; \
        vaesenclast t4##_x, x7##_x, x7##_x; \
        vaesenclast t4##_x, t1##_x, t1##_x; \
        vinserti128 $1, t1##_x, x7, x7; \
        vaesenclast t4##_x, x3##_x, x3##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x3, x3; \
        vaesenclast t4##_x, x6##_x, x6##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x6, x6; \
        vextracti128 $1, x1, t3##_x; \
        vextracti128 $1, x4, t2##_x; \
        vbroadcasti128 .Lpost_tf_lo_s1, t0; \
        vbroadcasti128 .Lpost_tf_hi_s1, t1; \
        vaesenclast t4##_x, x2##_x, x2##_x; \
        vaesenclast t4##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x2, x2; \
        vaesenclast t4##_x, x5##_x, x5##_x; \
        vaesenclast t4##_x, t5##_x, t5##_x; \
        vinserti128 $1, t5##_x, x5, x5; \
        vaesenclast t4##_x, x1##_x, x1##_x; \
        vaesenclast t4##_x, t3##_x, t3##_x; \
        vinserti128 $1, t3##_x, x1, x1; \
        vaesenclast t4##_x, x4##_x, x4##_x; \
        vaesenclast t4##_x, t2##_x, t2##_x; \
        vinserti128 $1, t2##_x, x4, x4; \
        \
        /* postfilter sboxes 1 and 4 */ \
        vbroadcasti128 .Lpost_tf_lo_s3, t2; \
        vbroadcasti128 .Lpost_tf_hi_s3, t3; \
        filter_8bit(x0, t0, t1, t7, t6); \
        filter_8bit(x7, t0, t1, t7, t6); \
        filter_8bit(x3, t0, t1, t7, t6); \
        filter_8bit(x6, t0, t1, t7, t6); \
        \
        /* postfilter sbox 3 */ \
        vbroadcasti128 .Lpost_tf_lo_s2, t4; \
        vbroadcasti128 .Lpost_tf_hi_s2, t5; \
        filter_8bit(x2, t2, t3, t7, t6); \
        filter_8bit(x5, t2, t3, t7, t6); \
        \
        vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
        \
        /* postfilter sbox 2 */ \
        filter_8bit(x1, t4, t5, t7, t2); \
        filter_8bit(x4, t4, t5, t7, t2); \
        vpxor t7, t7, t7; \
        \
        vpsrldq $1, t0, t1; \
        vpsrldq $2, t0, t2; \
        vpshufb t7, t1, t1; \
        vpsrldq $3, t0, t3; \
        \
        /* P-function */ \
        vpxor x5, x0, x0; \
        vpxor x6, x1, x1; \
        vpxor x7, x2, x2; \
        vpxor x4, x3, x3; \
        \
        vpshufb t7, t2, t2; \
        vpsrldq $4, t0, t4; \
        vpshufb t7, t3, t3; \
        vpsrldq $5, t0, t5; \
        vpshufb t7, t4, t4; \
        \
        vpxor x2, x4, x4; \
        vpxor x3, x5, x5; \
        vpxor x0, x6, x6; \
        vpxor x1, x7, x7; \
        \
        vpsrldq $6, t0, t6; \
        vpshufb t7, t5, t5; \
        vpshufb t7, t6, t6; \
        \
        vpxor x7, x0, x0; \
        vpxor x4, x1, x1; \
        vpxor x5, x2, x2; \
        vpxor x6, x3, x3; \
        \
        vpxor x3, x4, x4; \
        vpxor x0, x5, x5; \
        vpxor x1, x6, x6; \
        vpxor x2, x7, x7; /* note: high and low parts swapped */ \
        \
        /* Add key material and result to CD (x becomes new CD) */ \
        \
        vpxor t6, x1, x1; \
        vpxor 5 * 32(mem_cd), x1, x1; \
        \
        vpsrldq $7, t0, t6; \
        vpshufb t7, t0, t0; \
        vpshufb t7, t6, t7; \
        \
        vpxor t7, x0, x0; \
        vpxor 4 * 32(mem_cd), x0, x0; \
        \
        vpxor t5, x2, x2; \
        vpxor 6 * 32(mem_cd), x2, x2; \
        \
        vpxor t4, x3, x3; \
        vpxor 7 * 32(mem_cd), x3, x3; \
        \
        vpxor t3, x4, x4; \
        vpxor 0 * 32(mem_cd), x4, x4; \
        \
        vpxor t2, x5, x5; \
        vpxor 1 * 32(mem_cd), x5, x5; \
        \
        vpxor t1, x6, x6; \
        vpxor 2 * 32(mem_cd), x6, x6; \
        \
        vpxor t0, x7, x7; \
        vpxor 3 * 32(mem_cd), x7, x7;
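/*
 * Note on the S-function above: AESENCLAST computes
 * AddRoundKey(SubBytes(ShiftRows(state))), so with the all-zero round
 * key (t4 is cleared with vpxor beforehand) it reduces to
 * SubBytes+ShiftRows. Shuffling the input with .Linv_shift_row first
 * cancels the ShiftRows step, leaving pure AES SubBytes, which the
 * pre-/post-filter tables map from/to the Camellia s-boxes. The legacy
 * vaesenclast used here operates on 128-bit registers only, hence the
 * vextracti128/vinserti128 dance around each pair of them.
 */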
/*
 * Size optimization... with inlined roundsm32, the binary would be over
 * 5 times larger and only marginally faster.
 */
224*4882a593Smuzhiyun */ 225*4882a593Smuzhiyun.align 8 226*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) 227*4882a593Smuzhiyun roundsm32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7, 228*4882a593Smuzhiyun %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, 229*4882a593Smuzhiyun %rcx, (%r9)); 230*4882a593Smuzhiyun RET; 231*4882a593SmuzhiyunSYM_FUNC_END(roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd) 232*4882a593Smuzhiyun 233*4882a593Smuzhiyun.align 8 234*4882a593SmuzhiyunSYM_FUNC_START_LOCAL(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) 235*4882a593Smuzhiyun roundsm32(%ymm4, %ymm5, %ymm6, %ymm7, %ymm0, %ymm1, %ymm2, %ymm3, 236*4882a593Smuzhiyun %ymm12, %ymm13, %ymm14, %ymm15, %ymm8, %ymm9, %ymm10, %ymm11, 237*4882a593Smuzhiyun %rax, (%r9)); 238*4882a593Smuzhiyun RET; 239*4882a593SmuzhiyunSYM_FUNC_END(roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab) 240*4882a593Smuzhiyun 241*4882a593Smuzhiyun/* 242*4882a593Smuzhiyun * IN/OUT: 243*4882a593Smuzhiyun * x0..x7: byte-sliced AB state preloaded 244*4882a593Smuzhiyun * mem_ab: byte-sliced AB state in memory 245*4882a593Smuzhiyun * mem_cb: byte-sliced CD state in memory 246*4882a593Smuzhiyun */ 247*4882a593Smuzhiyun#define two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 248*4882a593Smuzhiyun y6, y7, mem_ab, mem_cd, i, dir, store_ab) \ 249*4882a593Smuzhiyun leaq (key_table + (i) * 8)(CTX), %r9; \ 250*4882a593Smuzhiyun call roundsm32_x0_x1_x2_x3_x4_x5_x6_x7_y0_y1_y2_y3_y4_y5_y6_y7_cd; \ 251*4882a593Smuzhiyun \ 252*4882a593Smuzhiyun vmovdqu x0, 4 * 32(mem_cd); \ 253*4882a593Smuzhiyun vmovdqu x1, 5 * 32(mem_cd); \ 254*4882a593Smuzhiyun vmovdqu x2, 6 * 32(mem_cd); \ 255*4882a593Smuzhiyun vmovdqu x3, 7 * 32(mem_cd); \ 256*4882a593Smuzhiyun vmovdqu x4, 0 * 32(mem_cd); \ 257*4882a593Smuzhiyun vmovdqu x5, 1 * 32(mem_cd); \ 258*4882a593Smuzhiyun vmovdqu x6, 2 * 32(mem_cd); \ 259*4882a593Smuzhiyun vmovdqu x7, 3 * 32(mem_cd); \ 260*4882a593Smuzhiyun \ 261*4882a593Smuzhiyun leaq (key_table + ((i) + (dir)) * 8)(CTX), %r9; \ 262*4882a593Smuzhiyun call roundsm32_x4_x5_x6_x7_x0_x1_x2_x3_y4_y5_y6_y7_y0_y1_y2_y3_ab; \ 263*4882a593Smuzhiyun \ 264*4882a593Smuzhiyun store_ab(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab); 265*4882a593Smuzhiyun 266*4882a593Smuzhiyun#define dummy_store(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) /* do nothing */ 267*4882a593Smuzhiyun 268*4882a593Smuzhiyun#define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \ 269*4882a593Smuzhiyun /* Store new AB state */ \ 270*4882a593Smuzhiyun vmovdqu x4, 4 * 32(mem_ab); \ 271*4882a593Smuzhiyun vmovdqu x5, 5 * 32(mem_ab); \ 272*4882a593Smuzhiyun vmovdqu x6, 6 * 32(mem_ab); \ 273*4882a593Smuzhiyun vmovdqu x7, 7 * 32(mem_ab); \ 274*4882a593Smuzhiyun vmovdqu x0, 0 * 32(mem_ab); \ 275*4882a593Smuzhiyun vmovdqu x1, 1 * 32(mem_ab); \ 276*4882a593Smuzhiyun vmovdqu x2, 2 * 32(mem_ab); \ 277*4882a593Smuzhiyun vmovdqu x3, 3 * 32(mem_ab); 278*4882a593Smuzhiyun 279*4882a593Smuzhiyun#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 280*4882a593Smuzhiyun y6, y7, mem_ab, mem_cd, i) \ 281*4882a593Smuzhiyun two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 282*4882a593Smuzhiyun y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \ 283*4882a593Smuzhiyun two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ 284*4882a593Smuzhiyun y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \ 285*4882a593Smuzhiyun 
#define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 2, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 4, 1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 6, 1, dummy_store);

#define dec_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, mem_ab, mem_cd, i) \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
        two_roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);

/*
 * IN:
 *   v0..3: byte-sliced 32-bit integers
 * OUT:
 *   v0..3: (IN <<< 1)
 */
#define rol32_1_32(v0, v1, v2, v3, t0, t1, t2, zero) \
        vpcmpgtb v0, zero, t0; \
        vpaddb v0, v0, v0; \
        vpabsb t0, t0; \
        \
        vpcmpgtb v1, zero, t1; \
        vpaddb v1, v1, v1; \
        vpabsb t1, t1; \
        \
        vpcmpgtb v2, zero, t2; \
        vpaddb v2, v2, v2; \
        vpabsb t2, t2; \
        \
        vpor t0, v1, v1; \
        \
        vpcmpgtb v3, zero, t0; \
        vpaddb v3, v3, v3; \
        vpabsb t0, t0; \
        \
        vpor t1, v2, v2; \
        vpor t2, v3, v3; \
        vpor t0, v0, v0;
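/*
 * How rol32_1_32 rotates without a byte-granular shift instruction:
 * vpcmpgtb vN, zero, tN yields 0xff in every byte whose top bit is set
 * (the byte compares below zero when treated as signed), vpabsb turns
 * that 0xff into 0x01, and vpaddb vN, vN doubles each byte, i.e. shifts
 * it left by one. The vpor steps then feed each slice's saved carry bit
 * into the next slice (v0 -> v1 -> v2 -> v3 -> v0), closing the 32-bit
 * rotation. Example for one byte: 0xb4 doubles to 0x68 and contributes
 * carry 0x01 to the neighbouring slice.
 */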
/*
 * IN:
 *   r: byte-sliced AB state in memory
 *   l: byte-sliced CD state in memory
 * OUT:
 *   l0..l7: new byte-sliced CD state
 */
#define fls32(l, l0, l1, l2, l3, l4, l5, l6, l7, r, t0, t1, t2, t3, tt0, \
              tt1, tt2, tt3, kll, klr, krl, krr) \
        /* \
         * t0 = kll; \
         * t0 &= ll; \
         * lr ^= rol32(t0, 1); \
         */ \
        vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
        vpxor tt0, tt0, tt0; \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand l0, t0, t0; \
        vpand l1, t1, t1; \
        vpand l2, t2, t2; \
        vpand l3, t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor l4, t0, l4; \
        vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
        vmovdqu l4, 4 * 32(l); \
        vpxor l5, t1, l5; \
        vmovdqu l5, 5 * 32(l); \
        vpxor l6, t2, l6; \
        vmovdqu l6, 6 * 32(l); \
        vpxor l7, t3, l7; \
        vmovdqu l7, 7 * 32(l); \
        \
        /* \
         * t2 = krr; \
         * t2 |= rr; \
         * rl ^= t2; \
         */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor 4 * 32(r), t0, t0; \
        vpor 5 * 32(r), t1, t1; \
        vpor 6 * 32(r), t2, t2; \
        vpor 7 * 32(r), t3, t3; \
        \
        vpxor 0 * 32(r), t0, t0; \
        vpxor 1 * 32(r), t1, t1; \
        vpxor 2 * 32(r), t2, t2; \
        vpxor 3 * 32(r), t3, t3; \
        vmovdqu t0, 0 * 32(r); \
        vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 1 * 32(r); \
        vmovdqu t2, 2 * 32(r); \
        vmovdqu t3, 3 * 32(r); \
        \
        /* \
         * t2 = krl; \
         * t2 &= rl; \
         * rr ^= rol32(t2, 1); \
         */ \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpand 0 * 32(r), t0, t0; \
        vpand 1 * 32(r), t1, t1; \
        vpand 2 * 32(r), t2, t2; \
        vpand 3 * 32(r), t3, t3; \
        \
        rol32_1_32(t3, t2, t1, t0, tt1, tt2, tt3, tt0); \
        \
        vpxor 4 * 32(r), t0, t0; \
        vpxor 5 * 32(r), t1, t1; \
        vpxor 6 * 32(r), t2, t2; \
        vpxor 7 * 32(r), t3, t3; \
        vmovdqu t0, 4 * 32(r); \
        vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
        vmovdqu t1, 5 * 32(r); \
        vmovdqu t2, 6 * 32(r); \
        vmovdqu t3, 7 * 32(r); \
        \
        /* \
         * t0 = klr; \
         * t0 |= lr; \
         * ll ^= t0; \
         */ \
        \
        vpshufb tt0, t0, t3; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t2; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t1; \
        vpsrldq $1, t0, t0; \
        vpshufb tt0, t0, t0; \
        \
        vpor l4, t0, t0; \
        vpor l5, t1, t1; \
        vpor l6, t2, t2; \
        vpor l7, t3, t3; \
        \
        vpxor l0, t0, l0; \
        vmovdqu l0, 0 * 32(l); \
        vpxor l1, t1, l1; \
        vmovdqu l1, 1 * 32(l); \
        vpxor l2, t2, l2; \
        vmovdqu l2, 2 * 32(l); \
        vpxor l3, t3, l3; \
        vmovdqu l3, 3 * 32(l);
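/*
 * transpose_4x4 transposes a 4x4 matrix of 32-bit elements held in
 * x0..x3 using two unpack levels: vpunpckl/hdq interleaves dwords,
 * vpunpckl/hqdq then interleaves qwords, which together move element
 * (row, col) to (col, row) within each 128-bit lane.
 */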
#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
                              a3, b3, c3, d3, st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b, a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */
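/*
 * byteslice_16x16b_fast is in effect a 16x16 byte-matrix transpose
 * built from the 4x4 dword transposes above plus one vpshufb pass:
 * .Lshufb_16x16b gathers bytes 0,4,8,12 / 1,5,9,13 / 2,6,10,14 /
 * 3,7,11,15 of each lane so that the second round of dword transposes
 * finishes the job at byte granularity. st0/st1 are memory spill slots
 * because all sixteen vector registers are live.
 */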
/* load blocks to registers and apply pre-whitening */
#define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio, key) \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor 0 * 32(rio), x0, y7; \
        vpxor 1 * 32(rio), x0, y6; \
        vpxor 2 * 32(rio), x0, y5; \
        vpxor 3 * 32(rio), x0, y4; \
        vpxor 4 * 32(rio), x0, y3; \
        vpxor 5 * 32(rio), x0, y2; \
        vpxor 6 * 32(rio), x0, y1; \
        vpxor 7 * 32(rio), x0, y0; \
        vpxor 8 * 32(rio), x0, x7; \
        vpxor 9 * 32(rio), x0, x6; \
        vpxor 10 * 32(rio), x0, x5; \
        vpxor 11 * 32(rio), x0, x4; \
        vpxor 12 * 32(rio), x0, x3; \
        vpxor 13 * 32(rio), x0, x2; \
        vpxor 14 * 32(rio), x0, x1; \
        vpxor 15 * 32(rio), x0, x0;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack32_post(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                      y6, y7, mem_ab, mem_cd) \
        byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
                              y4, y5, y6, y7, (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab); \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu y0, 0 * 32(mem_cd); \
        vmovdqu y1, 1 * 32(mem_cd); \
        vmovdqu y2, 2 * 32(mem_cd); \
        vmovdqu y3, 3 * 32(mem_cd); \
        vmovdqu y4, 4 * 32(mem_cd); \
        vmovdqu y5, 5 * 32(mem_cd); \
        vmovdqu y6, 6 * 32(mem_cd); \
        vmovdqu y7, 7 * 32(mem_cd);
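/*
 * Whitening-key trick used by inpack32_pre/outunpack32: vpbroadcastq
 * replicates the 64-bit whitening subkey into every lane, and the
 * .Lpack_bswap shuffle byte-reverses each of its 32-bit words while
 * zeroing the other half of every 128-bit block (vpshufb writes zero
 * wherever the selector byte has its top bit set, and half of
 * .Lpack_bswap is 0x80 bytes). One register thus carries the whitening
 * pattern for all 32 blocks, presumably matching Camellia's big-endian
 * key layout (assumption).
 */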
/* de-byteslice, apply post-whitening and store blocks */
#define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
                    y5, y6, y7, key, stack_tmp0, stack_tmp1) \
        byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
                              y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
        \
        vmovdqu x0, stack_tmp0; \
        \
        vpbroadcastq key, x0; \
        vpshufb .Lpack_bswap, x0, x0; \
        \
        vpxor x0, y7, y7; \
        vpxor x0, y6, y6; \
        vpxor x0, y5, y5; \
        vpxor x0, y4, y4; \
        vpxor x0, y3, y3; \
        vpxor x0, y2, y2; \
        vpxor x0, y1, y1; \
        vpxor x0, y0, y0; \
        vpxor x0, x7, x7; \
        vpxor x0, x6, x6; \
        vpxor x0, x5, x5; \
        vpxor x0, x4, x4; \
        vpxor x0, x3, x3; \
        vpxor x0, x2, x2; \
        vpxor x0, x1, x1; \
        vpxor stack_tmp0, x0, x0;

#define write_output(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
                     y6, y7, rio) \
        vmovdqu x0, 0 * 32(rio); \
        vmovdqu x1, 1 * 32(rio); \
        vmovdqu x2, 2 * 32(rio); \
        vmovdqu x3, 3 * 32(rio); \
        vmovdqu x4, 4 * 32(rio); \
        vmovdqu x5, 5 * 32(rio); \
        vmovdqu x6, 6 * 32(rio); \
        vmovdqu x7, 7 * 32(rio); \
        vmovdqu y0, 8 * 32(rio); \
        vmovdqu y1, 9 * 32(rio); \
        vmovdqu y2, 10 * 32(rio); \
        vmovdqu y3, 11 * 32(rio); \
        vmovdqu y4, 12 * 32(rio); \
        vmovdqu y5, 13 * 32(rio); \
        vmovdqu y6, 14 * 32(rio); \
        vmovdqu y7, 15 * 32(rio);


.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst32.pack_bswap, "aM", @progbits, 32
.align 32
.Lpack_bswap:
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
        .long 0x00010203, 0x04050607, 0x80808080, 0x80808080

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* For XTS mode */
.Lxts_gf128mul_and_shl1_mask_0:
        .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
.Lxts_gf128mul_and_shl1_mask_1:
        .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
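/*
 * The two XTS masks feed the gf128mul_x_ble/gf128mul_x2_ble macros at
 * the bottom of this file: 0x87 is the low byte of the GF(2^128)
 * reduction polynomial x^128 + x^7 + x^2 + x + 1, and the 1 in the high
 * qword is the bit carried from the low into the high qword on a single
 * doubling. mask_1 holds the same pair shifted left once (0x10e and 2),
 * used by gf128mul_x2_ble for the bit shifted out on the first of its
 * two doublings, whose reduction term is itself doubled once more.
 */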
/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox1, sbox2, sbox3:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s1:
        .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
        .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
.Lpre_tf_hi_s1:
        .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
        .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23

/*
 * pre-SubByte transform
 *
 * pre-lookup for sbox4:
 *   swap_bitendianness(
 *       isom_map_camellia_to_aes(
 *           camellia_f(
 *               swap_bitendianness(in <<< 1)
 *           )
 *       )
 *   )
 *
 * (note: '⊕ 0xc5' inside camellia_f())
 */
.Lpre_tf_lo_s4:
        .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
        .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
.Lpre_tf_hi_s4:
        .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
        .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf

/*
 * post-SubByte transform
 *
 * post-lookup for sbox1, sbox4:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  )
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s1:
        .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
        .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
.Lpost_tf_hi_s1:
        .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
        .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c

/*
 * post-SubByte transform
 *
 * post-lookup for sbox2:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) <<< 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s2:
        .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
        .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
.Lpost_tf_hi_s2:
        .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
        .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18

/*
 * post-SubByte transform
 *
 * post-lookup for sbox3:
 *  swap_bitendianness(
 *      camellia_h(
 *          isom_map_aes_to_camellia(
 *              swap_bitendianness(
 *                  aes_inverse_affine_transform(in)
 *              )
 *          )
 *      )
 *  ) >>> 1
 *
 * (note: '⊕ 0x6e' inside camellia_h())
 */
.Lpost_tf_lo_s3:
        .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
        .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
.Lpost_tf_hi_s3:
        .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
        .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
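/*
 * Each lo/hi table pair above is consumed by filter_8bit, which splits
 * every input byte x into nibbles and computes
 * lo_table[x & 0x0f] ^ hi_table[x >> 4]; a pair of 16-byte tables is
 * thus enough to evaluate a full 8-bit affine transform with two
 * vpshufb lookups.
 */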
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03

.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
/* 4-bit mask */
.L0f0f0f0f:
        .long 0x0f0f0f0f

.text

.align 8
SYM_FUNC_START_LOCAL(__camellia_enc_blk32)
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %ymm0..%ymm15: 32 plaintext blocks
         * output:
         *      %ymm0..%ymm15: 32 encrypted blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX),
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX),
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        movl $24, %r8d;
        cmpl $16, key_length(CTX);
        jne .Lenc_max32;
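        /*
         * %r8d now holds the index of the final whitening subkey row:
         * 24 for 128-bit keys, which finish here, or 32 for 192/256-bit
         * keys, which take the .Lenc_max32 detour below for one more
         * FL/FL⁻¹ layer and six more rounds.
         */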
.Lenc_done:
        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));

        FRAME_END
        RET;

.align 8
.Lenc_max32:
        movl $32, %r8d;

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX),
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX));

        enc_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        jmp .Lenc_done;
SYM_FUNC_END(__camellia_enc_blk32)
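/*
 * Decryption below reuses the exact same round machinery; it differs
 * only in consuming the subkey table in reverse (dec_rounds32 walks the
 * indices downwards and fls32 is called with the kl/kr subkey halves
 * swapped), which is Camellia's standard encrypt/decrypt symmetry.
 */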
.align 8
SYM_FUNC_START_LOCAL(__camellia_dec_blk32)
        /* input:
         *      %rdi: ctx, CTX
         *      %rax: temporary storage, 512 bytes
         *      %r8d: 24 for 16-byte key, 32 for larger
         *      %ymm0..%ymm15: 32 encrypted blocks
         * output:
         *      %ymm0..%ymm15: 32 plaintext blocks, order swapped:
         *       7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
         */
        FRAME_BEGIN

        leaq 8 * 32(%rax), %rcx;

        inpack32_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %rcx);

        cmpl $32, %r8d;
        je .Ldec_max32;

.Ldec_max24:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 16);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (16) * 8) + 8)(CTX),
              ((key_table + (16) * 8) + 12)(CTX),
              ((key_table + (16) * 8) + 0)(CTX),
              ((key_table + (16) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 8);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (8) * 8) + 8)(CTX),
              ((key_table + (8) * 8) + 12)(CTX),
              ((key_table + (8) * 8) + 0)(CTX),
              ((key_table + (8) * 8) + 4)(CTX));

        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 0);

        /* load CD for output */
        vmovdqu 0 * 32(%rcx), %ymm8;
        vmovdqu 1 * 32(%rcx), %ymm9;
        vmovdqu 2 * 32(%rcx), %ymm10;
        vmovdqu 3 * 32(%rcx), %ymm11;
        vmovdqu 4 * 32(%rcx), %ymm12;
        vmovdqu 5 * 32(%rcx), %ymm13;
        vmovdqu 6 * 32(%rcx), %ymm14;
        vmovdqu 7 * 32(%rcx), %ymm15;

        outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                    %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));

        FRAME_END
        RET;

.align 8
.Ldec_max32:
        dec_rounds32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %rcx, 24);

        fls32(%rax, %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
              %rcx, %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
              %ymm15,
              ((key_table + (24) * 8) + 8)(CTX),
              ((key_table + (24) * 8) + 12)(CTX),
              ((key_table + (24) * 8) + 0)(CTX),
              ((key_table + (24) * 8) + 4)(CTX));

        jmp .Ldec_max24;
SYM_FUNC_END(__camellia_dec_blk32)
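/*
 * The exported entry points below follow the usual SysV convention
 * (%rdi ctx, %rsi dst, %rdx src; 32 blocks = 512 bytes per buffer).
 * Each brackets its vector work with vzeroupper so that dirty upper
 * ymm halves cannot trigger AVX/SSE transition penalties in callers.
 */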
SYM_FUNC_START(camellia_ecb_enc_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_enc_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(camellia_ecb_enc_32way)

SYM_FUNC_START(camellia_ecb_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        /* now dst can be used as temporary buffer (even in src == dst case) */
        movq %rsi, %rax;

        call __camellia_dec_blk32;

        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(camellia_ecb_dec_32way)
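/*
 * CBC decryption: after the block decryption each plaintext block still
 * has to be XORed with the previous ciphertext block. The code below
 * does that for blocks 1..31 straight from src (at a 16-byte offset);
 * block 0, whose chaining value is the IV, is deliberately XORed with
 * zero here, the IV XOR being applied by the C glue code (assumption
 * inferred from the zeroed low lane of %ymm7).
 */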
SYM_FUNC_START(camellia_cbc_dec_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         */
        FRAME_BEGIN

        vzeroupper;

        cmpl $16, key_length(CTX);
        movl $32, %r8d;
        movl $24, %eax;
        cmovel %eax, %r8d; /* max */

        inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx, (key_table)(CTX, %r8, 8));

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lcbc_dec_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lcbc_dec_continue;

.Lcbc_dec_use_stack:
        /*
         * dst still in-use (because dst == src), so use stack for temporary
         * storage.
         */
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lcbc_dec_continue:
        call __camellia_dec_blk32;

        vmovdqu %ymm7, (%rax);
        vpxor %ymm7, %ymm7, %ymm7;
        vinserti128 $1, (%rdx), %ymm7, %ymm7;
        vpxor (%rax), %ymm7, %ymm7;
        movq %r10, %rsp;
        vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
        vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
        vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
        vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
        vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
        vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
        vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
        vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
        vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
        vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
        vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
        vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
        vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
        vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
        vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(camellia_cbc_dec_32way)

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;

#define add2_le128(x, minus_one, minus_two, tmp1, tmp2) \
        vpcmpeqq minus_one, x, tmp1; \
        vpcmpeqq minus_two, x, tmp2; \
        vpsubq minus_two, x, x; \
        vpor tmp2, tmp1, tmp1; \
        vpslldq $8, tmp1, tmp1; \
        vpsubq tmp1, x, x;
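/*
 * inc_le128 adds 1 to a 128-bit little-endian counter without a 128-bit
 * add: minus_one holds -1 in the low qword only, so vpsubq adds 1 to
 * the low qword; vpcmpeqq flags a low qword that was all-ones (about to
 * wrap), vpslldq $8 moves that flag into the high-qword slot, and the
 * final vpsubq subtracts -1 there, propagating the carry. add2_le128
 * does the same for a step of 2, flagging both wrap cases (-1 and -2).
 */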
SYM_FUNC_START(camellia_ctr_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst (32 blocks)
         *      %rdx: src (32 blocks)
         *      %rcx: iv (little endian, 128bit)
         */
        FRAME_BEGIN

        vzeroupper;

        movq %rsp, %r10;
        cmpq %rsi, %rdx;
        je .Lctr_use_stack;

        /* dst can be used as temporary storage, src is not overwritten. */
        movq %rsi, %rax;
        jmp .Lctr_continue;

.Lctr_use_stack:
        subq $(16 * 32), %rsp;
        movq %rsp, %rax;

.Lctr_continue:
        vpcmpeqd %ymm15, %ymm15, %ymm15;
        vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
        vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */

        /* load IV and byteswap */
        vmovdqu (%rcx), %xmm0;
        vmovdqa %xmm0, %xmm1;
        inc_le128(%xmm0, %xmm15, %xmm14);
        vbroadcasti128 .Lbswap128_mask, %ymm14;
        vinserti128 $1, %xmm0, %ymm1, %ymm0;
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 15 * 32(%rax);

        /* construct IVs */
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13); /* ab:le2 ; cd:le3 */
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 14 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 13 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 12 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm13;
        vmovdqu %ymm13, 11 * 32(%rax);
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm10;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm9;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm8;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm7;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm6;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm5;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm4;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm3;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm2;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vpshufb %ymm14, %ymm0, %ymm1;
        add2_le128(%ymm0, %ymm15, %ymm12, %ymm11, %ymm13);
        vextracti128 $1, %ymm0, %xmm13;
        vpshufb %ymm14, %ymm0, %ymm0;
        inc_le128(%xmm13, %xmm15, %xmm14);
        vmovdqu %xmm13, (%rcx);
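        /*
         * Register budget: %ymm15/%ymm12 hold the -1/-2 increment
         * constants, %ymm14 the byteswap mask and %ymm11/%ymm13 are
         * scratch, so only eleven counter vectors stay in registers;
         * the first five constructed counter pairs were parked at
         * 11..15 * 32(%rax) above and are pulled back in during the
         * inlined inpack32_pre below.
         */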
        /* inpack32_pre: */
        vpbroadcastq (key_table)(CTX), %ymm15;
        vpshufb .Lpack_bswap, %ymm15, %ymm15;
        vpxor %ymm0, %ymm15, %ymm0;
        vpxor %ymm1, %ymm15, %ymm1;
        vpxor %ymm2, %ymm15, %ymm2;
        vpxor %ymm3, %ymm15, %ymm3;
        vpxor %ymm4, %ymm15, %ymm4;
        vpxor %ymm5, %ymm15, %ymm5;
        vpxor %ymm6, %ymm15, %ymm6;
        vpxor %ymm7, %ymm15, %ymm7;
        vpxor %ymm8, %ymm15, %ymm8;
        vpxor %ymm9, %ymm15, %ymm9;
        vpxor %ymm10, %ymm15, %ymm10;
        vpxor 11 * 32(%rax), %ymm15, %ymm11;
        vpxor 12 * 32(%rax), %ymm15, %ymm12;
        vpxor 13 * 32(%rax), %ymm15, %ymm13;
        vpxor 14 * 32(%rax), %ymm15, %ymm14;
        vpxor 15 * 32(%rax), %ymm15, %ymm15;

        call __camellia_enc_blk32;

        movq %r10, %rsp;

        vpxor 0 * 32(%rdx), %ymm7, %ymm7;
        vpxor 1 * 32(%rdx), %ymm6, %ymm6;
        vpxor 2 * 32(%rdx), %ymm5, %ymm5;
        vpxor 3 * 32(%rdx), %ymm4, %ymm4;
        vpxor 4 * 32(%rdx), %ymm3, %ymm3;
        vpxor 5 * 32(%rdx), %ymm2, %ymm2;
        vpxor 6 * 32(%rdx), %ymm1, %ymm1;
        vpxor 7 * 32(%rdx), %ymm0, %ymm0;
        vpxor 8 * 32(%rdx), %ymm15, %ymm15;
        vpxor 9 * 32(%rdx), %ymm14, %ymm14;
        vpxor 10 * 32(%rdx), %ymm13, %ymm13;
        vpxor 11 * 32(%rdx), %ymm12, %ymm12;
        vpxor 12 * 32(%rdx), %ymm11, %ymm11;
        vpxor 13 * 32(%rdx), %ymm10, %ymm10;
        vpxor 14 * 32(%rdx), %ymm9, %ymm9;
        vpxor 15 * 32(%rdx), %ymm8, %ymm8;
        write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
                     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
                     %ymm8, %rsi);

        vzeroupper;

        FRAME_END
        RET;
SYM_FUNC_END(camellia_ctr_32way)

#define gf128mul_x_ble(iv, mask, tmp) \
        vpsrad $31, iv, tmp; \
        vpaddq iv, iv, iv; \
        vpshufd $0x13, tmp, tmp; \
        vpand mask, tmp, tmp; \
        vpxor tmp, iv, iv;

#define gf128mul_x2_ble(iv, mask1, mask2, tmp0, tmp1) \
        vpsrad $31, iv, tmp0; \
        vpaddq iv, iv, tmp1; \
        vpsllq $2, iv, iv; \
        vpshufd $0x13, tmp0, tmp0; \
        vpsrad $31, tmp1, tmp1; \
        vpand mask2, tmp0, tmp0; \
        vpshufd $0x13, tmp1, tmp1; \
        vpxor tmp0, iv, iv; \
        vpand mask1, tmp1, tmp1; \
        vpxor tmp1, iv, iv;
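/*
 * gf128mul_x_ble doubles a 128-bit tweak in the little-endian ("ble")
 * representation: vpaddq shifts both qwords left by one (the
 * inter-qword carry is lost), vpsrad $31 captures the sign of every
 * dword, and vpshufd $0x13 routes the saved bit 127 to the low dword
 * and bit 63 to the high qword; ANDing with
 * .Lxts_gf128mul_and_shl1_mask_0 turns those into the 0x87 reduction
 * term and the lost inter-qword carry bit, which the final vpxor folds
 * back in. gf128mul_x2_ble fuses two such doublings: vpsllq $2 shifts
 * by two, the original value's sign bits pick up mask_1's once-shifted
 * constants and the once-doubled value's sign bits pick up mask_0's.
 */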
.align 8
SYM_FUNC_START_LOCAL(camellia_xts_crypt_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 *	%r8: index for input whitening key
	 *	%r9: pointer to __camellia_enc_blk32 or __camellia_dec_blk32
	 */
	FRAME_BEGIN

	vzeroupper;

	subq $(16 * 32), %rsp;
	movq %rsp, %rax;

	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_0, %ymm12;

	/* load IV and construct second IV */
	vmovdqu (%rcx), %xmm0;
	vmovdqa %xmm0, %xmm15;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm13);
	vbroadcasti128 .Lxts_gf128mul_and_shl1_mask_1, %ymm13;
	vinserti128 $1, %xmm0, %ymm15, %ymm0;
	vpxor 0 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 15 * 32(%rax);
	vmovdqu %ymm0, 0 * 32(%rsi);

	/* construct IVs */
	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 1 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 14 * 32(%rax);
	vmovdqu %ymm0, 1 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 2 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 13 * 32(%rax);
	vmovdqu %ymm0, 2 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 3 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 12 * 32(%rax);
	vmovdqu %ymm0, 3 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 4 * 32(%rdx), %ymm0, %ymm11;
	vmovdqu %ymm0, 4 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 5 * 32(%rdx), %ymm0, %ymm10;
	vmovdqu %ymm0, 5 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 6 * 32(%rdx), %ymm0, %ymm9;
	vmovdqu %ymm0, 6 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 7 * 32(%rdx), %ymm0, %ymm8;
	vmovdqu %ymm0, 7 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 8 * 32(%rdx), %ymm0, %ymm7;
	vmovdqu %ymm0, 8 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 9 * 32(%rdx), %ymm0, %ymm6;
	vmovdqu %ymm0, 9 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 10 * 32(%rdx), %ymm0, %ymm5;
	vmovdqu %ymm0, 10 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 11 * 32(%rdx), %ymm0, %ymm4;
	vmovdqu %ymm0, 11 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 12 * 32(%rdx), %ymm0, %ymm3;
	vmovdqu %ymm0, 12 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 13 * 32(%rdx), %ymm0, %ymm2;
	vmovdqu %ymm0, 13 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 14 * 32(%rdx), %ymm0, %ymm1;
	vmovdqu %ymm0, 14 * 32(%rsi);

	gf128mul_x2_ble(%ymm0, %ymm12, %ymm13, %ymm14, %ymm15);
	vpxor 15 * 32(%rdx), %ymm0, %ymm15;
	vmovdqu %ymm15, 0 * 32(%rax);
	vmovdqu %ymm0, 15 * 32(%rsi);

	/* write back the tweak for the next 32-block call */
	vextracti128 $1, %ymm0, %xmm0;
	gf128mul_x_ble(%xmm0, %xmm12, %xmm15);
	vmovdqu %xmm0, (%rcx);
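/*
 * At this point dst holds all 32 tweaks T_i (they are still needed for the
 * output whitening), %rcx has already been advanced to the first tweak of
 * the next 32-block chunk, and the whitened inputs src_i ⊕ T_i sit in
 * %ymm1..%ymm11 (block pairs 14..4) and in the stack scratch area: pairs
 * 0..3 in slots 15..12 and pair 15 in slot 0, since %ymm15 is about to be
 * clobbered by the key broadcast below.  This reversed staging matches the
 * lane order of the blk32 core, which hands the results back as
 * %ymm7..%ymm0, %ymm15..%ymm8 for block pairs 0..15, so the output xors
 * can walk dst in plain 0..15 order.  Per 16-byte block the staging is
 * simply (illustrative C; store_block, xor_block and perm are hypothetical
 * helpers standing in for the register/stack shuffling):
 *
 *	for (i = 0; i < 32; i++) {
 *		store_block(&dst[i], T);		// park tweak for phase 2
 *		lane[perm(i)] = xor_block(src[i], T);	// whiten cipher input
 *		gf128mul_x_ble(T);			// advance tweak by α
 *	}
 */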
	/* inpack32_pre: xor the input whitening key into all blocks */
	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
	vpshufb .Lpack_bswap, %ymm15, %ymm15;
	vpxor 0 * 32(%rax), %ymm15, %ymm0;
	vpxor %ymm1, %ymm15, %ymm1;
	vpxor %ymm2, %ymm15, %ymm2;
	vpxor %ymm3, %ymm15, %ymm3;
	vpxor %ymm4, %ymm15, %ymm4;
	vpxor %ymm5, %ymm15, %ymm5;
	vpxor %ymm6, %ymm15, %ymm6;
	vpxor %ymm7, %ymm15, %ymm7;
	vpxor %ymm8, %ymm15, %ymm8;
	vpxor %ymm9, %ymm15, %ymm9;
	vpxor %ymm10, %ymm15, %ymm10;
	vpxor %ymm11, %ymm15, %ymm11;
	vpxor 12 * 32(%rax), %ymm15, %ymm12;
	vpxor 13 * 32(%rax), %ymm15, %ymm13;
	vpxor 14 * 32(%rax), %ymm15, %ymm14;
	vpxor 15 * 32(%rax), %ymm15, %ymm15;

	CALL_NOSPEC r9;

	addq $(16 * 32), %rsp;

	/* xor the block outputs with the tweaks parked in dst */
	vpxor 0 * 32(%rsi), %ymm7, %ymm7;
	vpxor 1 * 32(%rsi), %ymm6, %ymm6;
	vpxor 2 * 32(%rsi), %ymm5, %ymm5;
	vpxor 3 * 32(%rsi), %ymm4, %ymm4;
	vpxor 4 * 32(%rsi), %ymm3, %ymm3;
	vpxor 5 * 32(%rsi), %ymm2, %ymm2;
	vpxor 6 * 32(%rsi), %ymm1, %ymm1;
	vpxor 7 * 32(%rsi), %ymm0, %ymm0;
	vpxor 8 * 32(%rsi), %ymm15, %ymm15;
	vpxor 9 * 32(%rsi), %ymm14, %ymm14;
	vpxor 10 * 32(%rsi), %ymm13, %ymm13;
	vpxor 11 * 32(%rsi), %ymm12, %ymm12;
	vpxor 12 * 32(%rsi), %ymm11, %ymm11;
	vpxor 13 * 32(%rsi), %ymm10, %ymm10;
	vpxor 14 * 32(%rsi), %ymm9, %ymm9;
	vpxor 15 * 32(%rsi), %ymm8, %ymm8;
	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
		     %ymm8, %rsi);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(camellia_xts_crypt_32way)
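/*
 * Taken together, camellia_xts_crypt_32way implements the standard XTS
 * dataflow out_i = E(src_i ⊕ T_i) ⊕ T_i, with dst doubling as the tweak
 * buffer between the two whitening passes.  A minimal C sketch of the
 * structure (illustrative only: 16-byte blocks as u64 pairs, crypt32
 * stands in for the 32-block core reached through %r9, and gf128mul_x_ble
 * is the scalar helper sketched next to the gf128mul macros above):
 *
 *	static void xts_crypt_32way(unsigned long long (*dst)[2],
 *				    const unsigned long long (*src)[2],
 *				    unsigned long long t[2],
 *				    void (*crypt32)(unsigned long long (*)[2]))
 *	{
 *		unsigned long long buf[32][2];
 *		int i;
 *
 *		for (i = 0; i < 32; i++) {
 *			dst[i][0] = t[0];		// park tweak in dst
 *			dst[i][1] = t[1];
 *			buf[i][0] = src[i][0] ^ t[0];	// input whitening
 *			buf[i][1] = src[i][1] ^ t[1];
 *			gf128mul_x_ble(t);		// next tweak
 *		}
 *		crypt32(buf);				// en/decrypt 32 blocks
 *		for (i = 0; i < 32; i++) {
 *			dst[i][0] ^= buf[i][0];		// output whitening
 *			dst[i][1] ^= buf[i][1];
 *		}
 *	}
 */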
SYM_FUNC_START(camellia_xts_enc_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	xorl %r8d, %r8d; /* input whitening key, 0 for enc */

	leaq __camellia_enc_blk32, %r9;

	jmp camellia_xts_crypt_32way;
SYM_FUNC_END(camellia_xts_enc_32way)

SYM_FUNC_START(camellia_xts_dec_32way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (32 blocks)
	 *	%rdx: src (32 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */

	cmpl $16, key_length(CTX);
	movl $32, %r8d;
	movl $24, %eax;
	cmovel %eax, %r8d; /* input whitening key, last for dec */

	leaq __camellia_dec_blk32, %r9;

	jmp camellia_xts_crypt_32way;
SYM_FUNC_END(camellia_xts_dec_32way)
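/*
 * The only differences between the two entry points above are which blk32
 * routine %r9 selects and where the input whitening key sits: qword index 0
 * of the key table for encryption, and the far end of the key schedule for
 * decryption, which depends on the key size.  In C terms (illustrative
 * sketch; key_length is the byte length stored in struct camellia_ctx):
 *
 *	static int xts_dec_whitening_index(int key_length)
 *	{
 *		// 128-bit keys use a shorter schedule than 192/256-bit keys
 *		return key_length == 16 ? 24 : 32;
 *	}
 */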