/* SPDX-License-Identifier: GPL-2.0-or-later */
/***************************************************************************
*   Copyright (C) 2006 by Joachim Fritschi, <jfritschi@freenet.de>        *
*                                                                         *
***************************************************************************/

/*
 * Twofish block cipher, x86-64 assembly implementation (AT&T syntax).
 * One 128-bit block per call; the key-dependent S-boxes and round keys
 * live in the crypto context whose layout is described by the offsets
 * below.
 */

.file "twofish-x86_64-asm.S"
.text

#include <linux/linkage.h>
#include <asm/asm-offsets.h>

/* Byte offsets of the four 32-bit words (a,b,c,d) within a 16-byte block;
   also used to index the whitening-key words via the macros below. */
#define a_offset	0
#define b_offset	4
#define c_offset	8
#define d_offset	12

/* Structure of the crypto context struct */

#define s0	0	/* S0 Array 256 Words each */
#define s1	1024	/* S1 Array */
#define s2	2048	/* S2 Array */
#define s3	3072	/* S3 Array */
#define w	4096	/* 8 whitening keys (word) */
#define k	4128	/* key 1-32 ( word ) */

/* define a few register aliases to allow macro substitution */
/* Each cipher word gets a 64-bit alias plus its 32-bit (D), low-byte (B)
   and high-byte (H) views; the H views require non-REX registers, which
   is why the ctx pointer is moved out of %rdi by the entry code. */

#define R0 %rax
#define R0D %eax
#define R0B %al
#define R0H %ah

#define R1 %rbx
#define R1D %ebx
#define R1B %bl
#define R1H %bh

#define R2 %rcx
#define R2D %ecx
#define R2B %cl
#define R2H %ch

#define R3 %rdx
#define R3D %edx
#define R3B %dl
#define R3H %dh
/* performs input whitening: src ^= ctx->w[offset/4] */
#define input_whitening(src,context,offset)\
	xor	w+offset(context),	src;

/* performs output whitening: src ^= ctx->w[4 + offset/4]
   (the output whitening keys start 16 bytes into the w array) */
#define output_whitening(src,context,offset)\
	xor	w+16+offset(context),	src;


/*
 * One Twofish encryption round.
 *
 * a input register containing a (rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 *
 * %r11 holds the ctx base (S-box arrays and round keys), %edi is byte-index
 * scratch, and %r8d/%r9d accumulate the two g-function results before the
 * PHT (the add/add pair) and round-key addition. The ror $16/$15 pairs
 * step through the four bytes of a and b while leaving them pre-rotated
 * for the next round.
 */
#define encrypt_round(a,b,c,d,round)\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	movzx	b ## H,		%edi;\
	ror	$15,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	rol	$15,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;
/*
 * Final Twofish encryption round; identical g-function/PHT structure to
 * encrypt_round, but it also packs b:a into %r10 ready for output
 * whitening, and uses ror $1 on c instead of rol $15 (no pre-rotation is
 * needed for a following round).
 *
 * a input register containing a(rotated 16)
 * b input register containing b
 * c input register containing c
 * d input register containing d (already rol $1)
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define encrypt_last_round(a,b,c,d,round)\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	movzx	b ## B,		%edi;\
	mov	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	mov	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	(%r11,%rdi,4),	%r9d;\
	xor	a,		%r10;\
	movzx	b ## H,		%edi;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	xor	s1(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	ror	$1,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D

/*
 * One Twofish decryption round — the inverse of encrypt_round: same
 * S-box lookups and PHT, but the post-xor rotation moves to d (rol $15)
 * and c arrives already rotated instead.
 *
 * a input register containing a
 * b input register containing b (rotated 16)
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 */
#define decrypt_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	ror	$15,		a ## D;\
	xor	s3(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	rol	$15,		d ## D;

/*
 * Final Twofish decryption round; like decrypt_round but additionally
 * packs b:a into %r10 for the whitening step, and rotates d by ror $1
 * (no pre-rotation needed after the last round).
 *
 * a input register containing a
 * b input register containing b
 * c input register containing c (already rol $1)
 * d input register containing d
 * operations on a and b are interleaved to increase performance
 * during the round a and b are prepared for the output whitening
 */
#define decrypt_last_round(a,b,c,d,round)\
	movzx	a ## B,		%edi;\
	mov	(%r11,%rdi,4),	%r9d;\
	movzx	b ## B,		%edi;\
	mov	s3(%r11,%rdi,4),%r8d;\
	movzx	b ## H,		%edi;\
	ror	$16,		b ## D;\
	xor	(%r11,%rdi,4),	%r8d;\
	movzx	a ## H,		%edi;\
	mov	b ## D,		%r10d;\
	shl	$32,		%r10;\
	xor	a,		%r10;\
	ror	$16,		a ## D;\
	xor	s1(%r11,%rdi,4),%r9d;\
	movzx	b ## B,		%edi;\
	xor	s1(%r11,%rdi,4),%r8d;\
	movzx	a ## B,		%edi;\
	xor	s2(%r11,%rdi,4),%r9d;\
	movzx	b ## H,		%edi;\
	xor	s2(%r11,%rdi,4),%r8d;\
	movzx	a ## H,		%edi;\
	xor	s3(%r11,%rdi,4),%r9d;\
	add	%r8d,		%r9d;\
	add	%r9d,		%r8d;\
	add	k+round(%r11),	%r9d;\
	xor	%r9d,		c ## D;\
	add	k+4+round(%r11),%r8d;\
	xor	%r8d,		d ## D;\
	ror	$1,		d ## D;

/*
 * twofish_enc_blk(ctx, out, in) — encrypt one 16-byte block.
 *
 * In:    %rdi = ctx (S-boxes + whitening/round keys, see offsets above)
 *        %rsi = output block address
 *        %rdx = input block address
 * Out:   %eax = 1 (unconditional)
 * Saved: %rbx (R1) is callee-saved and pushed/popped here; %r8-%r11 and
 *        %rdi are clobbered as scratch.
 */
SYM_FUNC_START(twofish_enc_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,	%r11

	/* load the 128-bit block as two qwords (R3 aliases %rdx = in) */
	movq	(R3),	R1
	movq	8(R3),	R3
	input_whitening(R1,%r11,a_offset)
	input_whitening(R3,%r11,c_offset)
	/* split into a/b/c/d words, pre-rotating a ($16) and d ($1)
	   as the round macros expect */
	mov	R1D,	R0D
	rol	$16,	R0D
	shr	$32,	R1
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R3D

	/* 16 rounds; word roles swap every round, round-key offset is 8
	   bytes (two dwords) per round */
	encrypt_round(R0,R1,R2,R3,0);
	encrypt_round(R2,R3,R0,R1,8);
	encrypt_round(R0,R1,R2,R3,2*8);
	encrypt_round(R2,R3,R0,R1,3*8);
	encrypt_round(R0,R1,R2,R3,4*8);
	encrypt_round(R2,R3,R0,R1,5*8);
	encrypt_round(R0,R1,R2,R3,6*8);
	encrypt_round(R2,R3,R0,R1,7*8);
	encrypt_round(R0,R1,R2,R3,8*8);
	encrypt_round(R2,R3,R0,R1,9*8);
	encrypt_round(R0,R1,R2,R3,10*8);
	encrypt_round(R2,R3,R0,R1,11*8);
	encrypt_round(R0,R1,R2,R3,12*8);
	encrypt_round(R2,R3,R0,R1,13*8);
	encrypt_round(R0,R1,R2,R3,14*8);
	encrypt_last_round(R2,R3,R0,R1,15*8);


	/* %r10 was packed by encrypt_last_round; whiten and store */
	output_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	/* pack the other two words into R1, whiten and store */
	shl	$32,	R1
	xor	R0,	R1

	output_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_enc_blk)

/*
 * twofish_dec_blk(ctx, out, in) — decrypt one 16-byte block.
 *
 * Mirror of twofish_enc_blk: output whitening is applied on the way in,
 * the round keys are consumed in reverse order (15*8 down to 0), and
 * input whitening is applied on the way out.
 *
 * In:    %rdi = ctx, %rsi = output address, %rdx = input address
 * Out:   %eax = 1 (unconditional)
 * Saved: %rbx (R1); %r8-%r11 and %rdi are clobbered as scratch.
 */
SYM_FUNC_START(twofish_dec_blk)
	pushq    R1

	/* %rdi contains the ctx address */
	/* %rsi contains the output address */
	/* %rdx contains the input address */
	/* ctx address is moved to free one non-rex register
	as target for the 8bit high operations */
	mov	%rdi,	%r11

	movq	(R3),	R1
	movq	8(R3),	R3
	output_whitening(R1,%r11,a_offset)
	output_whitening(R3,%r11,c_offset)
	/* split into words; here b is pre-rotated $16 and c $1, matching
	   the decrypt_round input convention */
	mov	R1D,	R0D
	shr	$32,	R1
	rol	$16,	R1D
	mov	R3D,	R2D
	shr	$32,	R3
	rol	$1,	R2D

	decrypt_round(R0,R1,R2,R3,15*8);
	decrypt_round(R2,R3,R0,R1,14*8);
	decrypt_round(R0,R1,R2,R3,13*8);
	decrypt_round(R2,R3,R0,R1,12*8);
	decrypt_round(R0,R1,R2,R3,11*8);
	decrypt_round(R2,R3,R0,R1,10*8);
	decrypt_round(R0,R1,R2,R3,9*8);
	decrypt_round(R2,R3,R0,R1,8*8);
	decrypt_round(R0,R1,R2,R3,7*8);
	decrypt_round(R2,R3,R0,R1,6*8);
	decrypt_round(R0,R1,R2,R3,5*8);
	decrypt_round(R2,R3,R0,R1,4*8);
	decrypt_round(R0,R1,R2,R3,3*8);
	decrypt_round(R2,R3,R0,R1,2*8);
	decrypt_round(R0,R1,R2,R3,1*8);
	decrypt_last_round(R2,R3,R0,R1,0);

	/* %r10 was packed by decrypt_last_round; whiten and store */
	input_whitening(%r10,%r11,a_offset)
	movq	%r10,	(%rsi)

	shl	$32,	R1
	xor	R0,	R1

	input_whitening(R1,%r11,c_offset)
	movq	R1,	8(%rsi)

	popq	R1
	movl	$1,%eax
	RET
SYM_FUNC_END(twofish_dec_blk)