/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2011-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

/*
 * 16-byte constant holding the byte indices 15..0.  It is handed to
 * load_ctr_8way() by serpent_ctr_8way_avx.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

/*
 * 16-byte constant handed to load_xts_8way() by the XTS entry points:
 * byte 0 is 0x87 and byte 8 is 1, all other bytes are zero.
 */
.section	.rodata.cst16.xts_gf128mul_and_shl1_mask, "aM", @progbits, 16
.align 16
.Lxts_gf128mul_and_shl1_mask:
	.byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0

.text

/* First argument register: pointer to the context the round keys are read from. */
#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/

/*
 * Register allocation.  There are two groups of five working registers
 * (suffix 1 and suffix 2); the round macros take the un-suffixed names
 * (RA..RE) and paste the suffix on, so every step is applied to both
 * groups.  tp is a scratch register shared by the two halves of each
 * S-box macro.
 */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10
/*
 * RNOT is filled with all-ones by "vpcmpeqd RNOT, RNOT, RNOT" at the top of
 * the blk8 routines, so "vpxor RNOT, x, y" computes the bitwise NOT of x.
 */
#define RNOT %xmm11

/* Round-key words, each broadcast to all four 32-bit lanes by get_key(). */
#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15


/*
 * S-box macros S0..S6, used by the encryption routine.
 *
 * Each S-box is split into a _1 and a _2 half so that callers can interleave
 * other instructions between them.  Both halves receive the same five
 * register arguments.  x4 and tp are written by the macros; several _2
 * halves read values that their _1 half left in tp or x4, so a _2 half must
 * run after the matching _1 half on the same registers with tp untouched in
 * between.  Results are left in a permuted subset of x0..x4, which is why the
 * callers rename the registers from round to round.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;
/*
 * S-box macro S7 (used by the encryption routine) and SI0..SI7 (used by the
 * decryption routine).  Same conventions as the other S-box macros in this
 * file: two halves (_1, _2) taking the same five registers, tp and x4 are
 * written, and a _2 half must follow its _1 half with tp left untouched.
 */
#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

/*
 * Broadcast the 32-bit word at byte offset (4*i + j)*4 from CTX into all
 * four lanes of t, i.e. word j of round key i.
 */
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

/*
 * Load round key i into RK0..RK3 and xor it into x0..x3 of both register
 * groups.  x4 is accepted for signature symmetry but not used.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

/*
 * Linear mixing step followed by a key xor, applied to both register groups
 * (x4 is scratch).  Per 32-bit lane:
 *
 *   x0 = rol(x0, 13);  x2 = rol(x2, 3);
 *   x1 ^= x0 ^ x2;     x3 ^= x2 ^ (x0 << 3);
 *   x1 = rol(x1, 1);   x3 = rol(x3, 7);
 *   x0 ^= x1 ^ x3;     x2 ^= x3 ^ (x1 << 7);
 *   x0 = rol(x0, 5);   x2 = rol(x2, 22);
 *   x0..x3 ^= round key i
 *
 * Each rotate is built from a shift-left into x4, a shift-right in place and
 * a vpor.  The get_key() loads for round key i are interleaved with the
 * arithmetic rather than issued up front.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

/*
 * Key xor followed by the rotate/xor mixing of LK2 run backwards (right
 * rotates by 5, 22, 1, 7, 13, 3), applied to both register groups with x4
 * as scratch.
 *
 * This macro does NOT load the key: RK0..RK3 must already hold the round key
 * to apply.  In this file that is arranged by the SP() invocation placed
 * immediately before each KL2().  The i parameter is not referenced.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

/* Apply both halves of SBOX to register group 1, then to register group 2. */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * Same as S(), but additionally loads round key i into RK0..RK3, with one
 * get_key() ahead of each S-box half.  The S-box halves never touch
 * RK0..RK3, so the key is intact for the KL2() that follows.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * Transpose the 4x4 matrix of 32-bit words held in x0..x3 (one row per
 * register) in place.  t0, t1 and t2 are clobbered.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

/* Convert four loaded blocks into the word-sliced layout used by the rounds. */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* Convert word-sliced registers back into four blocks (same transpose). */
#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
SYM_FUNC_START_LOCAL(__serpent_enc_blk8_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 *
	 * Clobbers RE1, RE2, tp, RNOT and RK0..RK3.
	 */

	/* RNOT = all ones; the S-box macros use it for bitwise NOT. */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Key 0, then for each round an S-box (S0..S7, cycling) followed by
	 * the mixing step plus next key; the last S-box is followed by a
	 * plain key xor with key 32.  The register names are permuted from
	 * line to line to follow where each S-box leaves its outputs.
	 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk8_avx)

.align 8
SYM_FUNC_START_LOCAL(__serpent_dec_blk8_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 *
	 * Note that the results come back in a different register order
	 * than the inputs; callers must store from RC, RD, RB, RE.
	 * Clobbers RA1, RA2, tp, RNOT and RK0..RK3.
	 */

	/* RNOT = all ones; the S-box macros use it for bitwise NOT. */
	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Key 32, then for keys 31 down to 1: SP() runs the inverse S-box
	 * while loading key i, and KL2() xors that key in and undoes the
	 * mixing step.  The final inverse S-box is followed by key 0.
	 */
						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_dec_blk8_avx)

/* ECB encryption of eight blocks: load from src, encrypt, store to dst. */
SYM_FUNC_START(serpent_ecb_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_enc_8way_avx)

/*
 * ECB decryption of eight blocks.  The store uses the RC, RD, RB, RE order
 * in which __serpent_dec_blk8_avx returns its results.
 */
SYM_FUNC_START(serpent_ecb_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_dec_8way_avx)

/*
 * CBC decryption of eight blocks.  store_cbc_8way() is given both src and
 * dst along with the decrypted registers.
 */
SYM_FUNC_START(serpent_cbc_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_cbc_dec_8way_avx)

/*
 * CTR mode for eight blocks: the registers are filled from the iv by
 * load_ctr_8way(), encrypted, and handed to store_ctr_8way() with src and dst.
 */
SYM_FUNC_START(serpent_ctr_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */
	FRAME_BEGIN

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ctr_8way_avx)

/* XTS encryption of eight blocks. */
SYM_FUNC_START(serpent_xts_enc_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_enc_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_xts_enc_8way_avx)

/*
 * XTS decryption of eight blocks.  The store uses the RC, RD, RB, RE order
 * in which __serpent_dec_blk8_avx returns its results.
 */
SYM_FUNC_START(serpent_xts_dec_8way_avx)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (t ⊕ αⁿ ∈ GF(2¹²⁸))
	 */
	FRAME_BEGIN

	/* regs <= src, dst <= IVs, regs <= regs xor IVs */
	load_xts_8way(%rcx, %rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2,
		      RK0, RK1, RK2, .Lxts_gf128mul_and_shl1_mask);

	call __serpent_dec_blk8_avx;

	/* dst <= regs xor IVs(in dst) */
	store_xts_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	FRAME_END
	RET;
SYM_FUNC_END(serpent_xts_dec_8way_avx)