/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Twofish block encryption/decryption for 32-bit x86.
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * with the C preprocessor.
 */

.file "twofish-i586-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/*
 * Stack offsets of the arguments at function entry
 * (the return address is at offset 0).
 */

#define in_blk		12	/* input byte array address parameter */
#define out_blk		8	/* output byte array address parameter */
#define ctx		4	/* Twofish context structure */

/* Byte offsets of the four 32-bit words inside a 16-byte block */

#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct (byte offsets) */

#define s0		0	/* S0 array, 256 words each */
#define s1		1024	/* S1 array */
#define s2		2048	/* S2 array */
#define s3		3072	/* S3 array */
#define w		4096	/* 8 whitening keys (words): 4 input, 4 output */
#define k		4128	/* round keys (words), two per round */

/*
 * Register aliases to allow macro substitution: the round macros receive
 * R0..R3 and paste a D (32-bit), B (low byte) or H (second byte) suffix.
 */

#define R0D		%eax
#define R0B		%al
#define R0H		%ah

#define R1D		%ebx
#define R1B		%bl
#define R1H		%bh

#define R2D		%ecx
#define R2B		%cl
#define R2H		%ch

#define R3D		%edx
#define R3B		%dl
#define R3H		%dh


/* performs input whitening: src ^= whitening key word at w + offset */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening: src ^= whitening key word at w + 16 + offset */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;

/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's key pair inside k (8 bytes per round)
 * operations on a and b are interleaved to increase performance
 *
 * %ebp must hold the context address.
 * Clobbers %esi and %edi; the incoming d is parked on the stack (one
 * push/pop pair) while the d register accumulates the lookups indexed by
 * b, and %esi accumulates the lookups indexed by a.
 * On exit a is rotated back, b is left rotated by 31 (ror 16 + ror 15) and
 * c by 17 (rol 15), which is the form the next round expects once the
 * register pairs (a,b) and (c,d) are swapped by the caller.
 */
#define encrypt_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * round byte offset of this round's key pair inside k (8 bytes per round)
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation:
 * b is rotated by 16 twice (i.e. restored) and c is only rotated right
 * by 1, so all four registers leave in plain, unrotated form.
 *
 * %ebp must hold the context address.
 * Clobbers %esi and %edi and uses one stack slot (push/pop pair).
 */
#define encrypt_last_round(a,b,c,d,round)\
	push	d ## D;\
	movzx	b ## B,		%edi;\
	mov	s1(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	mov	s2(%ebp,%edi,4),%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%ebp,%edi,4),d ## D;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),%esi;\
	movzx	b ## B,		%edi;\
	xor	s3(%ebp,%edi,4),d ## D;\
	movzx	a ## B,		%edi;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	d ## D;\
	movzx	a ## H,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	d ## D,		%esi;\
	add	%esi,		d ## D;\
	add	k+round(%ebp),	%esi;\
	xor	%esi,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%ebp),d ## D;\
	xor	%edi,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's key pair inside k (8 bytes per round)
 * operations on a and b are interleaved to increase performance
 *
 * %ebp must hold the context address.
 * Clobbers %esi and %edi; the incoming c is parked on the stack (one
 * push/pop pair) while the c register accumulates the lookups indexed by
 * a, and %esi accumulates the lookups indexed by b.
 * On exit b is rotated back, a is left rotated by 31 (ror 16 + ror 15) and
 * d by 17 (rol 15), which is the form the next round expects once the
 * register pairs (a,b) and (c,d) are swapped by the caller.
 */
#define decrypt_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	rol	$15,		d ## D;

/*
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * round byte offset of this round's key pair inside k (8 bytes per round)
 * operations on a and b are interleaved to increase performance
 * last round has different rotations for the output preparation:
 * a is rotated by 16 twice (i.e. restored) and d is only rotated right
 * by 1, so all four registers leave in plain, unrotated form.
 *
 * %ebp must hold the context address.
 * Clobbers %esi and %edi and uses one stack slot (push/pop pair).
 */
#define decrypt_last_round(a,b,c,d,round)\
	push	c ## D;\
	movzx	a ## B,		%edi;\
	mov	(%ebp,%edi,4),	c ## D;\
	movzx	b ## B,		%edi;\
	mov	s3(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%ebp,%edi,4),	%esi;\
	movzx	a ## B,		%edi;\
	xor	s2(%ebp,%edi,4),c ## D;\
	movzx	b ## B,		%edi;\
	xor	s1(%ebp,%edi,4),%esi;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%ebp,%edi,4),c ## D;\
	movzx	b ## H,		%edi;\
	xor	s2(%ebp,%edi,4),%esi;\
	pop	%edi;\
	add	%esi,		c ## D;\
	add	c ## D,		%esi;\
	add	k+round(%ebp),	c ## D;\
	xor	%edi,		c ## D;\
	add	k+4+round(%ebp),%esi;\
	xor	%esi,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk - encrypt one 16-byte block.
 *
 * Stack arguments at entry: ctx at 4(%esp), out_blk at 8(%esp),
 * in_blk at 12(%esp).
 * Saves and restores %ebp, %ebx, %esi and %edi; %ecx and %edx are
 * overwritten. Returns with %eax = 1.
 * While the rounds run, %ebp holds the context address and
 * %eax/%ebx/%ecx/%edx hold the four state words.
 */
SYM_FUNC_START(twofish_enc_blk)
	push	%ebp			/* save registers according to calling convention*/
	push	%ebx
	push	%esi
	push	%edi

	/* the four pushes above moved the arguments 16 bytes further away */
	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	input_whitening(%eax,%ebp,a_offset)
	ror	$16,	%eax		/* encrypt_round expects a rotated by 16 */
	input_whitening(%ebx,%ebp,b_offset)
	input_whitening(%ecx,%ebp,c_offset)
	input_whitening(%edx,%ebp,d_offset)
	rol	$1,	%edx		/* encrypt_round expects d already rol $1 */

	/* the (a,b) and (c,d) register pairs swap roles every round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);

	/*
	 * %eax/%ebx are whitened and stored as words c/d, %ecx/%edx as
	 * words a/b of the output block.
	 */
	output_whitening(%eax,%ebp,c_offset)
	output_whitening(%ebx,%ebp,d_offset)
	output_whitening(%ecx,%ebp,a_offset)
	output_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk - decrypt one 16-byte block.
 *
 * Stack arguments at entry: ctx at 4(%esp), out_blk at 8(%esp),
 * in_blk at 12(%esp).
 * Saves and restores %ebp, %ebx, %esi and %edi; %ecx and %edx are
 * overwritten. Returns with %eax = 1.
 * Applies the whitening keys and round keys in the reverse order of
 * twofish_enc_blk: output whitening keys first, round key offsets from
 * 15*8 down to 0, input whitening keys last.
 */
SYM_FUNC_START(twofish_dec_blk)
	push	%ebp			/* save registers according to calling convention*/
	push	%ebx
	push	%esi
	push	%edi


	/* the four pushes above moved the arguments 16 bytes further away */
	mov	ctx + 16(%esp),	%ebp	/* abuse the base pointer: set new base
					 * pointer to the ctx address */
	mov	in_blk+16(%esp),%edi	/* input address in edi */

	mov	(%edi),		%eax
	mov	b_offset(%edi),	%ebx
	mov	c_offset(%edi),	%ecx
	mov	d_offset(%edi),	%edx
	output_whitening(%eax,%ebp,a_offset)
	output_whitening(%ebx,%ebp,b_offset)
	ror	$16,	%ebx		/* decrypt_round expects b rotated by 16 */
	output_whitening(%ecx,%ebp,c_offset)
	output_whitening(%edx,%ebp,d_offset)
	rol	$1,	%ecx		/* decrypt_round expects c already rol $1 */

	/* the (a,b) and (c,d) register pairs swap roles every round */
	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/*
	 * %eax/%ebx are whitened and stored as words c/d, %ecx/%edx as
	 * words a/b of the output block.
	 */
	input_whitening(%eax,%ebp,c_offset)
	input_whitening(%ebx,%ebp,d_offset)
	input_whitening(%ecx,%ebp,a_offset)
	input_whitening(%edx,%ebp,b_offset)
	mov	out_blk+16(%esp),%edi;
	mov	%eax,		c_offset(%edi)
	mov	%ebx,		d_offset(%edi)
	mov	%ecx,		(%edi)
	mov	%edx,		b_offset(%edi)

	pop	%edi
	pop	%esi
	pop	%ebx
	pop	%ebp
	mov	$1,	%eax
	RET
SYM_FUNC_END(twofish_dec_blk)