/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Twofish Cipher 3-way parallel algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "twofish-x86_64-asm-3way.S"
.text

/*
 * Structure of crypto context: byte offsets relative to CTX.
 *
 * s0..s3 are lookup tables indexed by one byte with 32-bit entries
 *        (see do16bit_ror: index scale 4), hence the 1024-byte spacing.
 * w      holds the 32-bit words XORed into the blocks before the first and
 *        after the last round (see inpack3/outunpack3).
 * k      holds the round key words, two 32-bit words per round
 *        (see enc_round_end/dec_round_end).
 */
#define s0	0
#define s1	1024
#define s2	2048
#define s3	3072
#define w	4096
#define k	4128

/**********************************************************************
  3-way twofish
 **********************************************************************/

/*
 * Register allocation.  The numeric suffix 0/1/2 selects one of the three
 * blocks processed in parallel.  The d/bh/bl variants name the 32-bit,
 * second-byte and low-byte views of the same register; the macros below
 * build these names by token pasting.
 */
#define CTX %rdi
#define RIO %rdx

/* one 64-bit register per block holds two 32-bit words of cipher state */
#define RAB0 %rax
#define RAB1 %rbx
#define RAB2 %rcx

#define RAB0d %eax
#define RAB1d %ebx
#define RAB2d %ecx

#define RAB0bh %ah
#define RAB1bh %bh
#define RAB2bh %ch

#define RAB0bl %al
#define RAB1bl %bl
#define RAB2bl %cl

/* stack slots holding the other two state words while rounds run (push_cd) */
#define CD0 0x0(%rsp)
#define CD1 0x8(%rsp)
#define CD2 0x10(%rsp)

/* used only before/after all rounds */
#define RCD0 %r8
#define RCD1 %r9
#define RCD2 %r10

/* used only during rounds (same registers as RCD0..RCD2) */
#define RX0 %r8
#define RX1 %r9
#define RX2 %r10

#define RX0d %r8d
#define RX1d %r9d
#define RX2d %r10d

#define RY0 %r11
#define RY1 %r12
#define RY2 %r13

#define RY0d %r11d
#define RY1d %r12d
#define RY2d %r13d

/* scratch; RT0 is the same register as RIO, so RIO is not live during rounds */
#define RT0 %rdx
#define RT1 %rsi

#define RT0d %edx
#define RT1d %esi

#define RT1bl %sil

/*
 * One pair of table lookups driven by the low 16 bits of `ab`:
 *
 *   tmp2 = byte 0 of ab;  tmp1 = byte 1 of ab
 *   ab   = ab rotated right by `rot` bits (64-bit rotate)
 *   dst  = dst <op1> T0[tmp2];  dst = dst <op2> T1[tmp1]   (32-bit ops)
 *
 * T0/T1 are byte offsets of tables inside CTX.  tmp2 may name the same
 * register as dst when op1 is `mov`: the index is consumed by the address
 * calculation of that very load.
 */
#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl,		tmp2 ## d; \
	movzbl ab ## bh,		tmp1 ## d; \
	rorq $(rot),			ab; \
	op1##l T0(CTX, tmp2, 4),	dst ## d; \
	op2##l T1(CTX, tmp1, 4),	dst ## d;

/* exchange register `ab` with memory operand `cd`, using `tmp` as scratch */
#define swap_ab_with_cd(ab, cd, tmp) \
	movq cd,			tmp; \
	movq ab,			cd; \
	movq tmp,			ab;

/*
 * Combined G1 & G2 function. Reordered with help of rotates to have moves
 * at beginning.
 *
 * For each of the three blocks this accumulates four table lookups into
 * x ## N and four into y ## N.  The rotates applied to ab ## N add up to
 * 32 + 48 + 32 + 16 = 128 bits, i.e. the 64-bit register is back at its
 * original value before it is exchanged with its stack slot cd ## N.
 * On exit ab ## N therefore holds what was in cd ## N, and vice versa.
 */
#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \
	/* G1,1 && G2,1 */ \
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \
	\
	do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \
	do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \
	\
	/* G1,2 && G2,2 */ \
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \
	swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \
	swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \
	\
	do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \
	do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \
	swap_ab_with_cd(ab ## 2, cd ## 2, RT0);

/*
 * Finish encryption round `n` for one block.  `ab` is the register that
 * g1g2_3 has just loaded with the other state half; x/y are the G results.
 *
 *   x = x + y + k[2n];      y = x + 2y + k[2n+1]   (x, y as on entry)
 *   low  word of ab = ror32(low word of ab ^ x, 1)
 *   high word of ab = rol32(high word of ab, 1) ^ y
 *
 * The 32-bit writes to x clear its upper half, so the final orq merges
 * cleanly into the shifted ab.
 */
#define enc_round_end(ab, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	xorl ab ## d,			x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	shrq $32,			ab; \
	roll $1,			ab ## d; \
	xorl y ## d,			ab ## d; \
	shlq $32,			ab; \
	rorl $1,			x ## d; \
	orq x,				ab;

/*
 * Finish decryption round `n` for one block; mirror image of enc_round_end
 * with the roles of x and y exchanged:
 *
 *   x = x + y + k[2n];      y = x + 2y + k[2n+1]   (x, y as on entry)
 *   low  word of ba = ror32(low word of ba ^ y, 1)
 *   high word of ba = rol32(high word of ba, 1) ^ x
 */
#define dec_round_end(ba, x, y, n) \
	addl y ## d,			x ## d; \
	addl x ## d,			y ## d; \
	addl k+4*(2*(n))(CTX),		x ## d; \
	addl k+4*(2*(n)+1)(CTX),	y ## d; \
	xorl ba ## d,			y ## d; \
	shrq $32,			ba; \
	roll $1,			ba ## d; \
	xorl x ## d,			ba ## d; \
	shlq $32,			ba; \
	rorl $1,			y ## d; \
	orq y,				ba;

/* one encryption round (index n) applied to all three blocks */
#define encrypt_round3(ab, cd, n) \
	g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \
	\
	enc_round_end(ab ## 0, RX0, RY0, n); \
	enc_round_end(ab ## 1, RX1, RY1, n); \
	enc_round_end(ab ## 2, RX2, RY2, n);

/*
 * One decryption round (index n) applied to all three blocks.  Compared
 * with encrypt_round3 the table order is shifted and RX/RY are passed to
 * g1g2_3 in the opposite order.
 */
#define decrypt_round3(ba, dc, n) \
	g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \
	\
	dec_round_end(ba ## 0, RX0, RY0, n); \
	dec_round_end(ba ## 1, RX1, RY1, n); \
	dec_round_end(ba ## 2, RX2, RY2, n);

/* cycle n = rounds 2n and 2n+1, in that order */
#define encrypt_cycle3(ab, cd, n) \
	encrypt_round3(ab, cd, (n)*2); \
	encrypt_round3(ab, cd, ((n)*2)+1);

/* cycle n = rounds 2n+1 and 2n, i.e. the encryption order reversed */
#define decrypt_cycle3(ba, dc, n) \
	decrypt_round3(ba, dc, ((n)*2)+1); \
	decrypt_round3(ba, dc, ((n)*2));

/* spill RCD0..RCD2 so that they land in stack slots CD0..CD2 */
#define push_cd() \
	pushq RCD2; \
	pushq RCD1; \
	pushq RCD0;

/* reload RCD0..RCD2 from the slots written by push_cd() */
#define pop_cd() \
	popq RCD0; \
	popq RCD1; \
	popq RCD2;

/*
 * Load 64 bits from each of three consecutive 16-byte blocks at `in`
 * (32-bit word index n within the block) into xy ## 0..2 and XOR each with
 * the 64 bits stored at w + 4*m in the context.
 */
#define inpack3(in, n, xy, m) \
	movq 4*(n)(in),			xy ## 0; \
	xorq w+4*(m)(CTX),		xy ## 0; \
	\
	movq 4*(4+(n))(in),		xy ## 1; \
	xorq w+4*(m)(CTX),		xy ## 1; \
	\
	movq 4*(8+(n))(in),		xy ## 2; \
	xorq w+4*(m)(CTX),		xy ## 2;

/*
 * XOR xy ## 0..2 with the 64 bits at w + 4*m in the context, then combine
 * them into three consecutive 16-byte blocks at `out` (32-bit word index n
 * within the block) using `op`: mov stores, xor XORs into the destination.
 */
#define outunpack3(op, out, n, xy, m) \
	xorq w+4*(m)(CTX),		xy ## 0; \
	op ## q xy ## 0,		4*(n)(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 1; \
	op ## q xy ## 1,		4*(4+(n))(out); \
	\
	xorq w+4*(m)(CTX),		xy ## 2; \
	op ## q xy ## 2,		4*(8+(n))(out);

/* encryption input: first 8 bytes of each block -> RAB, last 8 -> RCD */
#define inpack_enc3() \
	inpack3(RIO, 0, RAB, 0); \
	inpack3(RIO, 2, RCD, 2);

/* encryption output: RCD goes to the first 8 bytes, RAB to the last 8 */
#define outunpack_enc3(op) \
	outunpack3(op, RIO, 2, RAB, 6); \
	outunpack3(op, RIO, 0, RCD, 4);

/*
 * Decryption input: as inpack_enc3 but using context words 4..7, and with
 * the two 32-bit words inside every register exchanged (rotate by 32).
 */
#define inpack_dec3() \
	inpack3(RIO, 0, RAB, 4); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	inpack3(RIO, 2, RCD, 6); \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2;

/*
 * Decryption output: undo the 32-bit word exchange, XOR with context
 * words 0..3 and store; RCD goes to the first 8 bytes, RAB to the last 8.
 */
#define outunpack_dec3() \
	rorq $32,			RCD0; \
	rorq $32,			RCD1; \
	rorq $32,			RCD2; \
	outunpack3(mov, RIO, 0, RCD, 0); \
	rorq $32,			RAB0; \
	rorq $32,			RAB1; \
	rorq $32,			RAB2; \
	outunpack3(mov, RIO, 2, RAB, 2);

/*
 * Encrypt three consecutive 16-byte blocks in parallel.
 *
 * The result is either stored to dst or XORed into the data already at
 * dst, depending on the bool in %rcx.
 */
SYM_FUNC_START(__twofish_enc_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 *	%rcx: bool, if true: xor output
	 */
	/* preserved for the caller; used here as RY2, RY1 and RAB1 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rcx doubles as RAB2 and %rsi as RT1, so park both arguments */
	pushq %rcx; /* bool xor */
	pushq %rsi; /* dst */

	inpack_enc3();

	/* from here until pop_cd(), CD0..CD2 are the top three stack slots */
	push_cd();
	encrypt_cycle3(RAB, CD, 0);
	encrypt_cycle3(RAB, CD, 1);
	encrypt_cycle3(RAB, CD, 2);
	encrypt_cycle3(RAB, CD, 3);
	encrypt_cycle3(RAB, CD, 4);
	encrypt_cycle3(RAB, CD, 5);
	encrypt_cycle3(RAB, CD, 6);
	encrypt_cycle3(RAB, CD, 7);
	pop_cd();

	/* src is no longer needed: RIO now addresses the destination */
	popq RIO; /* dst */
	popq RT1; /* bool xor */

	/* only the low byte of the bool argument is examined */
	testb RT1bl, RT1bl;
	jnz .L__enc_xor3;

	outunpack_enc3(mov);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;

.L__enc_xor3:
	outunpack_enc3(xor);

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(__twofish_enc_blk_3way)

/*
 * Decrypt three consecutive 16-byte blocks in parallel and store the
 * result to dst.  Cycles and rounds run in the reverse of the encryption
 * order.
 */
SYM_FUNC_START(twofish_dec_blk_3way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src, RIO
	 */
	/* preserved for the caller; used here as RY2, RY1 and RAB1 */
	pushq %r13;
	pushq %r12;
	pushq %rbx;

	/* %rsi doubles as RT1 during the rounds */
	pushq %rsi; /* dst */

	inpack_dec3();

	/* from here until pop_cd(), CD0..CD2 are the top three stack slots */
	push_cd();
	decrypt_cycle3(RAB, CD, 7);
	decrypt_cycle3(RAB, CD, 6);
	decrypt_cycle3(RAB, CD, 5);
	decrypt_cycle3(RAB, CD, 4);
	decrypt_cycle3(RAB, CD, 3);
	decrypt_cycle3(RAB, CD, 2);
	decrypt_cycle3(RAB, CD, 1);
	decrypt_cycle3(RAB, CD, 0);
	pop_cd();

	/* src is no longer needed: RIO now addresses the destination */
	popq RIO; /* dst */

	outunpack_dec3();

	popq %rbx;
	popq %r12;
	popq %r13;
	RET;
SYM_FUNC_END(twofish_dec_blk_3way)