/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 */

#include <linux/linkage.h>

.file "serpent-sse2-i586-asm_32.S"
.text

/*
 * Byte offsets of the stack-passed arguments, relative to %esp at function
 * entry. Neither function below pushes anything, so these offsets stay valid
 * for the whole function body.
 */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/

/* Pointer to the expanded key; loaded from arg_ctx(%esp) by both functions. */
#define CTX %edx

/*
 * Five state/scratch registers. The round macros take them in a different
 * order on every round instead of moving data around, so which of RA..RE is
 * the scratch register changes from round to round.
 */
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

/* Temporaries used by the key-mixing and block load/store macros. */
#define RT0 %xmm5
#define RT1 %xmm6

/*
 * All-ones constant (set up with pcmpeqd at function entry).
 * "pxor RNOT, x" is therefore a bitwise NOT of x.
 */
#define RNOT %xmm7

/*
 * get_key(i, j, t): load the 32-bit word at byte offset (4*i + j)*4 from CTX
 * and broadcast it to all four dword lanes of t. Round key i thus occupies
 * four consecutive 32-bit words starting at CTX + 16*i.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K(x0, x1, x2, x3, x4, i): XOR the four words of round key i into x0..x3.
 * Clobbers x4, RT0 and RT1.
 */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4, x0; \
	pxor RT0, x1; \
	pxor RT1, x2; \
	get_key(i, 3, x4); \
	pxor x4, x3;

/*
 * LK(x0, x1, x2, x3, x4, i): linear mixing step followed by a XOR with round
 * key i (encryption direction). SSE2 has no packed rotate, so every rotate is
 * built from a shift-left, a shift-right and an OR, using x4 as scratch.
 * Per 32-bit lane this computes:
 *
 *	x0 = rol(x0, 13);
 *	x2 = rol(x2, 3);
 *	x1 = rol(x1 ^ x0 ^ x2, 1);
 *	x3 = rol(x3 ^ x2 ^ (x0 << 3), 7);
 *	x0 = rol(x0 ^ x1 ^ x3, 5);
 *	x2 = rol(x2 ^ x3 ^ (x1 << 7), 22);
 *	x0..x3 ^= key[i][0..3];
 *
 * The key XORs for x1/x3 are interleaved with the last two rotates.
 * Clobbers x4 and RT0.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0, x4; \
	pslld $13, x0; \
	psrld $(32 - 13), x4; \
	por x4, x0; \
	pxor x0, x1; \
	movdqa x2, x4; \
	pslld $3, x2; \
	psrld $(32 - 3), x4; \
	por x4, x2; \
	pxor x2, x1; \
	movdqa x1, x4; \
	pslld $1, x1; \
	psrld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x2, x3; \
	pxor x4, x3; \
	movdqa x3, x4; \
	pslld $7, x3; \
	psrld $(32 - 7), x4; \
	por x4, x3; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x3, x0; \
	pxor x3, x2; \
	pxor x4, x2; \
	movdqa x0, x4; \
	get_key(i, 1, RT0); \
	pxor RT0, x1; \
	get_key(i, 3, RT0); \
	pxor RT0, x3; \
	pslld $5, x0; \
	psrld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	pslld $22, x2; \
	psrld $(32 - 22), x4; \
	por x4, x2; \
	get_key(i, 0, RT0); \
	pxor RT0, x0; \
	get_key(i, 2, RT0); \
	pxor RT0, x2;

/*
 * KL(x0, x1, x2, x3, x4, i): XOR with round key i, then the mixing step of LK
 * run backwards (decryption direction): the same rotate amounts applied as
 * right rotates, in the opposite order. Per 32-bit lane:
 *
 *	x0..x3 ^= key[i][0..3];
 *	x0 = ror(x0, 5);
 *	x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);
 *	x0 ^= x3 ^ x1;
 *	x1 = ror(x1, 1);
 *	x3 = ror(x3, 7);
 *	x1 ^= x0 ^ x2;
 *	x3 ^= (x0 << 3) ^ x2;
 *	x0 = ror(x0, 13);
 *	x2 = ror(x2, 3);
 *
 * Clobbers x4, RT0 and RT1 (the latter two via K).
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0, x4; \
	psrld $5, x0; \
	pslld $(32 - 5), x4; \
	por x4, x0; \
	movdqa x2, x4; \
	psrld $22, x2; \
	pslld $(32 - 22), x4; \
	por x4, x2; \
	pxor x3, x2; \
	pxor x3, x0; \
	movdqa x1, x4; \
	pslld $7, x4; \
	pxor x1, x0; \
	pxor x4, x2; \
	movdqa x1, x4; \
	psrld $1, x1; \
	pslld $(32 - 1), x4; \
	por x4, x1; \
	movdqa x3, x4; \
	psrld $7, x3; \
	pslld $(32 - 7), x4; \
	por x4, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pslld $3, x4; \
	pxor x4, x3; \
	movdqa x0, x4; \
	psrld $13, x0; \
	pslld $(32 - 13), x4; \
	por x4, x0; \
	pxor x2, x1; \
	pxor x2, x3; \
	movdqa x2, x4; \
	psrld $3, x2; \
	pslld $(32 - 3), x4; \
	por x4, x2;

/*
 * S0..S7: the eight substitution steps used by the encryption rounds, written
 * as straight-line sequences of pand/por/pxor (plus pxor with RNOT for NOT)
 * over whole registers.
 *
 * x0..x3 are the inputs; x4 is overwritten before it is read in every macro,
 * so its value on entry does not matter. The results are left in four of the
 * five registers in a macro-specific order; the argument order of the LK/K
 * invocation that follows each S-box at the call site shows where they end up.
 */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	por x0, x3; \
	pxor x4, x0; \
	pxor x2, x4; \
	pxor RNOT, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x4, x1; \
	pxor x0, x2; \
	pxor x3, x0; \
	por x0, x4; \
	pxor x2, x0; \
	pand x1, x2; \
	pxor x2, x3; \
	pxor RNOT, x1; \
	pxor x4, x2; \
	pxor x2, x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x1; \
	pxor x3, x0; \
	pxor RNOT, x3; \
	pand x1, x4; \
	por x1, x0; \
	pxor x2, x3; \
	pxor x3, x0; \
	pxor x3, x1; \
	pxor x4, x3; \
	por x4, x1; \
	pxor x2, x4; \
	pand x0, x2; \
	pxor x1, x2; \
	por x0, x1; \
	pxor RNOT, x0; \
	pxor x2, x0; \
	pxor x1, x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT, x3; \
	pxor x0, x1; \
	movdqa x0, x4; \
	pand x2, x0; \
	pxor x3, x0; \
	por x4, x3; \
	pxor x1, x2; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x2, x0; \
	pand x3, x2; \
	por x1, x3; \
	pxor RNOT, x0; \
	pxor x0, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	por x2, x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x3, x1; \
	por x0, x3; \
	pand x0, x4; \
	pxor x2, x0; \
	pxor x1, x2; \
	pand x3, x1; \
	pxor x3, x2; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x0, x1; \
	pand x3, x0; \
	pand x4, x3; \
	pxor x2, x3; \
	por x1, x4; \
	pand x1, x2; \
	pxor x3, x4; \
	pxor x3, x0; \
	pxor x2, x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x4, x0; \
	pxor x2, x3; \
	por x4, x2; \
	pxor x1, x0; \
	pxor x3, x4; \
	por x0, x2; \
	pxor x1, x2; \
	pand x0, x1; \
	pxor x4, x1; \
	pand x2, x4; \
	pxor x3, x2; \
	pxor x0, x4; \
	por x1, x3; \
	pxor RNOT, x1; \
	pxor x0, x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x0, x1; \
	pxor x1, x2; \
	pxor RNOT, x3; \
	pxor x0, x4; \
	pxor x2, x0; \
	pand x4, x1; \
	por x3, x4; \
	pxor x0, x4; \
	pand x3, x0; \
	pxor x3, x1; \
	pxor x2, x3; \
	pxor x1, x0; \
	pand x4, x2; \
	pxor x2, x1; \
	pand x0, x2; \
	pxor x2, x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	pxor x0, x3; \
	pxor x2, x1; \
	pxor x0, x2; \
	pand x3, x0; \
	por x3, x1; \
	pxor RNOT, x4; \
	pxor x1, x0; \
	pxor x2, x1; \
	pxor x4, x3; \
	pxor x0, x4; \
	pand x0, x2; \
	pxor x1, x4; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x3; \
	pxor x2, x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT, x1; \
	movdqa x1, x4; \
	pxor RNOT, x0; \
	pand x2, x1; \
	pxor x3, x1; \
	por x4, x3; \
	pxor x2, x4; \
	pxor x3, x2; \
	pxor x0, x3; \
	por x1, x0; \
	pand x0, x2; \
	pxor x4, x0; \
	pxor x3, x4; \
	pand x0, x3; \
	pxor x1, x4; \
	pxor x4, x2; \
	pxor x1, x3; \
	por x0, x4; \
	pxor x1, x4;

/*
 * SI0..SI7: the substitution steps used by the decryption rounds. Same
 * conventions as S0..S7: x0..x3 in, x4 written before it is read, results
 * left in a macro-specific permutation of the five registers (see the KL/K
 * invocation following each one at the call site).
 */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pxor x0, x1; \
	por x1, x3; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	pxor x3, x2; \
	pxor x0, x3; \
	pand x1, x0; \
	pxor x2, x0; \
	pand x3, x2; \
	pxor x4, x3; \
	pxor x3, x2; \
	pxor x3, x1; \
	pand x0, x3; \
	pxor x0, x1; \
	pxor x2, x0; \
	pxor x3, x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3, x1; \
	movdqa x0, x4; \
	pxor x2, x0; \
	pxor RNOT, x2; \
	por x1, x4; \
	pxor x3, x4; \
	pand x1, x3; \
	pxor x2, x1; \
	pand x4, x2; \
	pxor x1, x4; \
	por x3, x1; \
	pxor x0, x3; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x4, x2; \
	pxor x0, x1; \
	pxor x1, x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x3, x4; \
	pxor RNOT, x3; \
	por x2, x3; \
	pxor x4, x2; \
	pxor x0, x4; \
	pxor x1, x3; \
	por x2, x1; \
	pxor x0, x2; \
	pxor x4, x1; \
	por x3, x4; \
	pxor x3, x2; \
	pxor x2, x4; \
	pand x1, x2; \
	pxor x3, x2; \
	pxor x4, x3; \
	pxor x0, x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1, x2; \
	movdqa x1, x4; \
	pand x2, x1; \
	pxor x0, x1; \
	por x4, x0; \
	pxor x3, x4; \
	pxor x3, x0; \
	por x1, x3; \
	pxor x2, x1; \
	pxor x3, x1; \
	pxor x2, x0; \
	pxor x3, x2; \
	pand x1, x3; \
	pxor x0, x1; \
	pand x2, x0; \
	pxor x3, x4; \
	pxor x0, x3; \
	pxor x1, x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3, x2; \
	movdqa x0, x4; \
	pand x1, x0; \
	pxor x2, x0; \
	por x3, x2; \
	pxor RNOT, x4; \
	pxor x0, x1; \
	pxor x2, x0; \
	pand x4, x2; \
	pxor x0, x2; \
	por x4, x0; \
	pxor x3, x0; \
	pand x2, x3; \
	pxor x3, x4; \
	pxor x1, x3; \
	pand x0, x1; \
	pxor x1, x4; \
	pxor x3, x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1, x4; \
	por x2, x1; \
	pxor x4, x2; \
	pxor x3, x1; \
	pand x4, x3; \
	pxor x3, x2; \
	por x0, x3; \
	pxor RNOT, x0; \
	pxor x2, x3; \
	por x0, x2; \
	pxor x1, x4; \
	pxor x4, x2; \
	pand x0, x4; \
	pxor x1, x0; \
	pxor x3, x1; \
	pand x2, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x4, x2; \
	pxor x3, x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2, x0; \
	movdqa x0, x4; \
	pand x3, x0; \
	pxor x3, x2; \
	pxor x2, x0; \
	pxor x1, x3; \
	por x4, x2; \
	pxor x3, x2; \
	pand x0, x3; \
	pxor RNOT, x0; \
	pxor x1, x3; \
	pand x2, x1; \
	pxor x0, x4; \
	pxor x4, x3; \
	pxor x2, x4; \
	pxor x1, x0; \
	pxor x0, x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3, x4; \
	pand x0, x3; \
	pxor x2, x0; \
	por x4, x2; \
	pxor x1, x4; \
	pxor RNOT, x0; \
	por x3, x1; \
	pxor x0, x4; \
	pand x2, x0; \
	pxor x1, x0; \
	pand x2, x1; \
	pxor x2, x3; \
	pxor x3, x4; \
	pand x3, x2; \
	por x0, x3; \
	pxor x4, x1; \
	pxor x4, x3; \
	pand x0, x4; \
	pxor x2, x4;

/*
 * transpose_4x4(x0, x1, x2, x3, t0, t1, t2): transpose the 4x4 matrix of
 * 32-bit words held in x0..x3 in place, so that afterwards register k holds
 * word k of each of the four original registers.
 * Clobbers t1 and t2; t0 is accepted but not used.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	movdqa x0, t2; \
	punpckldq x1, x0; \
	punpckhdq x1, t2; \
	movdqa x2, t1; \
	punpckhdq x3, x2; \
	punpckldq x3, t1; \
	movdqa x0, x1; \
	punpcklqdq t1, x0; \
	punpckhqdq t1, x1; \
	movdqa t2, x3; \
	punpcklqdq x2, t2; \
	punpckhqdq x2, x3; \
	movdqa t2, x2;

/*
 * read_blocks(in, ...): load four consecutive 16-byte blocks from (in) with
 * unaligned loads and transpose them, so that x0..x3 each hold one 32-bit
 * word position of all four blocks.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in), x0; \
	movdqu (1*4*4)(in), x1; \
	movdqu (2*4*4)(in), x2; \
	movdqu (3*4*4)(in), x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, ...): transpose x0..x3 back to one block per register
 * and store the four 16-byte blocks to (out) with unaligned stores.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0, (0*4*4)(out); \
	movdqu x1, (1*4*4)(out); \
	movdqu x2, (2*4*4)(out); \
	movdqu x3, (3*4*4)(out);

/*
 * xor_blocks(out, ...): like write_blocks, but each block is XORed with the
 * 16 bytes already present at its destination before being stored.
 * Uses t0 as the load temporary in addition to the transpose scratch.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out), t0; \
	pxor t0, x0; \
	movdqu x0, (0*4*4)(out); \
	movdqu (1*4*4)(out), t0; \
	pxor t0, x1; \
	movdqu x1, (1*4*4)(out); \
	movdqu (2*4*4)(out), t0; \
	pxor t0, x2; \
	movdqu x2, (2*4*4)(out); \
	movdqu (3*4*4)(out), t0; \
	pxor t0, x3; \
	movdqu x3, (3*4*4)(out);

/*
 * __serpent_enc_blk_4way: encrypt four 16-byte blocks (64 bytes) in parallel.
 *
 * Applies round key 0, then 31 rounds of S-box + LK with round keys 1..31
 * (S-boxes cycling S0..S7), then a final S7 followed by round key 32.
 * Uses %eax, %edx and %xmm0-%xmm7 without saving them.
 */
SYM_FUNC_START(__serpent_enc_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	/* RNOT = all ones, used by the S-box macros as the NOT mask */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

					 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	/* only the low byte of the xor argument is tested */
	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;

.L__enc_xor4:
	/* dst ^= result instead of dst = result */
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	RET;
SYM_FUNC_END(__serpent_enc_blk_4way)

/*
 * serpent_dec_blk_4way: decrypt four 16-byte blocks (64 bytes) in parallel.
 *
 * Walks the round keys in the opposite direction to the encryption routine:
 * round key 32, then 31 rounds of inverse S-box + KL with round keys 31..1
 * (inverse S-boxes cycling SI7..SI0), then a final SI0 followed by round
 * key 0. The result ends up in RC, RD, RB, RE (in that order).
 * Uses %eax, %edx and %xmm0-%xmm7 without saving them.
 */
SYM_FUNC_START(serpent_dec_blk_4way)
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	/* RNOT = all ones, used by the S-box macros as the NOT mask */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

					 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	RET;
SYM_FUNC_END(serpent_dec_blk_4way)