/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * structure of crypto context
 *
 * Byte offsets relative to CTX.  p starts at offset 0 and is followed by
 * s0..s3; s0 begins (16 + 2) 4-byte entries in, and each subsequent table
 * begins 256 4-byte entries after the previous one.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * register macros
 *
 * Several of these overlap each other or the argument registers used by the
 * functions in this file (%rdi, %rsi, %rdx, %rcx):
 *  - RIO and RT1 are both %rsi, so RIO does not survive F()/F4() and has to
 *    be reloaded before the result is stored;
 *  - RT0 is %rdi, so the ctx argument is copied to CTX before any round runs;
 *  - RX2 is %rcx and RX3 is %rdx, so the 4-way code overwrites those
 *    arguments when it loads its blocks.
 */
#define CTX %r12
#define RIO %rsi

/* block registers: 64-bit, 32-bit, low-byte and second-byte views */
#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

/* temporaries: 64-bit and 32-bit views */
#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

/* preloaded round key, used by the 4-way code */
#define RKEY %r10
/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * F(): one round on RX0.
 *
 * The four bytes of the low 32 bits of RX0 index the tables: bits 24-31
 * select from s0, bits 16-23 from s1, bits 8-15 from s2 and bits 0-7 from
 * s3.  RT0d = ((s0[] + s1[]) ^ s2[]) + s3[].  The rorq/rolq $16 pair only
 * exposes the upper two bytes through the bh/bl sub-registers; rolq $32 then
 * swaps the 32-bit halves of RX0 and the result is xored into the new low
 * half.  Clobbers RT0, RT1 and RT2.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/* xor the 8 bytes at p + 4*n (entries n and n+1) into RX0 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* key addition for entries n and n+1, followed by two F() rounds */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 8 bytes at p + 4*(n-1) (entries n-1 and n), swap the 32-bit
 * halves so entry n ends up in the low half, and xor into RX0.
 * Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* key addition for entries n and n-1, followed by two F() rounds */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO) as two big-endian 32-bit words: after the
 * rorq/bswapq pair the first word in memory is in the low half of RX0 and
 * the second word is in the high half.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/*
 * Byte-swap RX0 and store it to (RIO).  There is no rorq here, so the two
 * halves land in memory in the opposite order to how read_block() loaded
 * them.
 */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* same as write_block(), but xor into the 8 bytes at (RIO) instead of storing */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);
SYM_FUNC_START(__blowfish_enc_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Processes one 8-byte block read from src; the result is either
	 * stored to dst or xored into the 8 bytes already at dst.
	 */

	/* CTX is %r12: park the caller's value in %r11 while we use it */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* RIO (%rsi) is reused for src and later clobbered by F(): keep dst */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	/* last use of CTX is above; give %r12 back to the caller */
	movq %r11, %r12;

	movq %r10, RIO;
	/* %rcx is not touched by the 1-way macros, so the flag is still live */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)

SYM_FUNC_START(blowfish_dec_blk)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Processes one 8-byte block read from src with the round keys
	 * applied in descending order and stores the result to dst.
	 */

	/* CTX is %r12: park the caller's value in %r11 while we use it */
	movq %r12, %r11;

	movq %rdi, CTX;
	/* RIO (%rsi) is reused for src and later clobbered by F(): keep dst */
	movq %rsi, %r10;
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;
	write_block();

	movq %r11, %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/* F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * x must be one of RX0..RX3: "x ## bh" / "x ## bl" paste to the matching
 * RXnbh / RXnbl byte-register macro.  Same table lookups as F(); the two
 * rorq $16 together rotate x by 32, i.e. swap its halves, before the result
 * is xored into the low half.  Clobbers RT0..RT3.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* xor the key held in RKEY into all four block registers */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = the 8 bytes at p + 4*n (entries n and n+1) */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* apply the key already in RKEY, then fetch the one for the next round pair */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc(n + 2);

/* key addition plus two F4() rounds on each of the four blocks */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* RKEY = the 8 bytes at p + 4*(n-1) with the 32-bit halves swapped */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* apply the key already in RKEY, then fetch the one for the next round pair */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec(n - 2);

/* key addition plus two F4() rounds on each of the four blocks */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3); \
	\
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, each
 * converted with the same rorq/bswapq sequence as read_block().  RX3 is
 * %rdx and RX2 is %rcx, so this overwrites the src and flag arguments.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* byte-swap RX0..RX3 and store them to the 32 bytes at (RIO) */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* byte-swap RX0..RX3 and xor them into the 32 bytes at (RIO) */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

SYM_FUNC_START(__blowfish_enc_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool, if true: xor output
	 *
	 * Processes four consecutive 8-byte blocks (32 bytes) from src; the
	 * results are either stored to dst or xored into the 32 bytes
	 * already at dst.
	 */
	pushq %r12;
	pushq %rbx;
	/* RX2 is %rcx: the flag is overwritten by read_block4(), keep a copy */
	pushq %rcx;

	movq %rdi, CTX;
	/* %r10 is RKEY here, so dst is kept in %r11 */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	/*
	 * CTX is no longer needed, so %r12 is reused to receive the saved
	 * %rcx flag (top of stack); the caller's %r12 is popped further down.
	 */
	popq %r12;
	movq %r11, RIO;

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)

SYM_FUNC_START(blowfish_dec_blk_4way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Processes four consecutive 8-byte blocks (32 bytes) from src with
	 * the round keys applied in descending order and stores the results
	 * to dst.
	 */
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	/* %r10 is RKEY here, so dst is kept in %r11 */
	movq %rsi, %r11;
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)