/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

/* 16-byte constant handed to load_ctr_16way by serpent_ctr_16way. */
.section .rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/* 16-byte constants handed to load_xts_16way by the XTS entry points. */
.section .rodata.cst16.xts_gf128mul_and_shl1_mask_0, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_0:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.section .rodata.cst16.xts_gf128mul_and_shl1_mask_1, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask_1:
	.byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0

.text

/* First argument register: base address used by get_key. */
#define CTX %rdi

/*
 * RNOT is set to all-ones (vpcmpeqd RNOT, RNOT, RNOT) at the top of the
 * 16-block routines, so "vpxor RNOT, a, b" is a bitwise NOT.
 * tp is a temporary that the _1 half of an S-box hands to its _2 half.
 */
#define RNOT %ymm0
#define tp %ymm1

/*
 * Working registers. The trailing 1/2 selects one of two register sets
 * that are processed side by side; the round macros paste the suffix on
 * with ##. Five names (A..E) exist per set: four carry state and the
 * fifth is scratch, with the roles rotating from round to round.
 */
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

/* Round-key registers, filled by get_key. */
#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

/* 128-bit views of the same registers, passed to the CTR/XTS load helpers. */
#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

/*
 * Forward S-boxes S0..S7, each split into two halves (_1 then _2) so the
 * caller can interleave other work between them. x0..x4 are five
 * registers of one set; the result ends up in a permutation of them, which
 * the callers follow by reordering the register arguments of the next step.
 * The _2 half must run directly on the state left by the _1 half, since
 * several of them read tp.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

/*
 * Inverse S-boxes SI0..SI7, used by the decryption path. Same two-half
 * layout and register conventions as the forward S-boxes.
 */
#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

/*
 * Broadcast the 32-bit word at byte offset (4*i + j)*4 from CTX into every
 * dword lane of t, i.e. word j of the i-th group of four key words.
 */
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

/*
 * Load the four key words of group i into RK0..RK3 and XOR them into
 * x0..x3 of both register sets. x4 is accepted for signature symmetry
 * with the other round macros but is not referenced.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

/*
 * Encryption-side linear mixing of x0..x3 followed by XOR with key group i,
 * applied to both register sets. x4 is scratch. Each 32-bit rotate-left by
 * n is built from vpslld n / vpsrld (32 - n) / vpor. The get_key loads are
 * spread through the sequence rather than grouped at the top.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

/*
 * Decryption-side counterpart of LK2: XOR with the key words that are
 * already in RK0..RK3, then the mixing with rotates in the opposite
 * direction (right by 5, 22, 1, 7, 13, 3). This macro loads no keys itself
 * and does not reference its i argument; in this file the preceding SP()
 * invocation performs the loads. x4 is scratch.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

/*
 * Run both halves of S-box SBOX on register set 1, then on register set 2.
 * Set 1 is finished before set 2 starts because both use the single tp.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * Same as S(), with the four get_key loads for key group i interleaved
 * between the S-box halves so RK0..RK3 are ready for the KL2 that follows.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * 4x4 transpose of 32-bit words across x0..x3, done by interleaving dwords
 * and then qwords (the vpunpck* instructions work within each 128-bit
 * lane). t0..t2 are scratch; x3 doubles as the fourth temporary.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

/* Both directions use the same transpose. */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 */

	/* All-ones constant used by the S-boxes for bitwise NOT. */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Key group 0, then 32 S-box steps cycling S0..S7. Each of the
	 * first 31 is followed by LK2 with the next key group; the last is
	 * followed by a plain key XOR with group 32. The register order
	 * changes from step to step to follow where each S-box leaves its
	 * outputs.
	 */
	K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE); LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE); LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB); LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA); LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE); LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC); LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB); LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC); LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB); LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB); LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA); LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC); LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB); LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE); LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA); LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE); LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA); LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA); LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC); LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE); LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA); LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB); LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC); LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB); LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC); LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC); LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE); LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB); LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC); LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA); LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE); LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA); K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk16)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 */

	/* All-ones constant used by the S-boxes for bitwise NOT. */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Mirror image of the encryption schedule: key group 32 first, then
	 * inverse S-boxes SI7..SI0 cycling downwards. SP() preloads the
	 * keys that the following KL2 consumes. Note that the result is
	 * left in RC, RD, RB, RE, not RA..RD.
	 */
	K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31); KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30); KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29); KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28); KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27); KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26); KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25); KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24); KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23); KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22); KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21); KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20); KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19); KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18); KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17); KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16); KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15); KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14); KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13); KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12); KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11); KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10); KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9); KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8); KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7); KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6); KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5); KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4); KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3); KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2); KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1); KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD); K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_dec_blk16)

/*
 * Exported entry points. load_16way / store_16way and the *_cbc_, *_ctr_
 * and *_xts_ variants are not defined in this file; presumably they come
 * from the included glue_helper-asm-avx2.S.
 */
SYM_FUNC_START(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_enc_16way)

SYM_FUNC_START(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	/* Decryption leaves its result in RC, RD, RB, RE. */
	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_dec_16way)

SYM_FUNC_START(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	/* The store helper is given src as well as dst, plus RK0. */
	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_cbc_dec_16way)

SYM_FUNC_START(serpent_ctr_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	vzeroupper;

	load_ctr_16way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       tp);

	call __serpent_enc_blk16;

	store_ctr_16way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ctr_16way)

SYM_FUNC_START(serpent_xts_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	vzeroupper;

	load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		       RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT,
		       .Lxts_gf128mul_and_shl1_mask_0,
		       .Lxts_gf128mul_and_shl1_mask_1);

	call __serpent_enc_blk16;

	store_xts_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_xts_enc_16way)

SYM_FUNC_START(serpent_xts_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst (16 blocks)
	 *	%rdx: src (16 blocks)
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN
797*4882a593Smuzhiyun 798*4882a593Smuzhiyun vzeroupper; 799*4882a593Smuzhiyun 800*4882a593Smuzhiyun load_xts_16way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, 801*4882a593Smuzhiyun RD2, RK0, RK0x, RK1, RK1x, RK2, RK2x, RK3, RK3x, RNOT, 802*4882a593Smuzhiyun .Lxts_gf128mul_and_shl1_mask_0, 803*4882a593Smuzhiyun .Lxts_gf128mul_and_shl1_mask_1); 804*4882a593Smuzhiyun 805*4882a593Smuzhiyun call __serpent_dec_blk16; 806*4882a593Smuzhiyun 807*4882a593Smuzhiyun store_xts_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2); 808*4882a593Smuzhiyun 809*4882a593Smuzhiyun vzeroupper; 810*4882a593Smuzhiyun 811*4882a593Smuzhiyun FRAME_END 812*4882a593Smuzhiyun RET; 813*4882a593SmuzhiyunSYM_FUNC_END(serpent_xts_dec_16way) 814