/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables defined outside this file.  Each one is indexed by a single
 * byte value and read as 8-byte entries (see xor2ror16 below).
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the table symbols, used by the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

#define CAMELLIA_TABLE_BYTE_LEN 272

/*
 * struct camellia_ctx:
 * byte offsets of the fields this file reads through CTX.  The subkey table
 * starts at offset 0 and the key length follows it directly.
 */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * CTX  - context pointer (first argument, never modified here).
 * RIO  - pointer currently used for block loads/stores.  It is the same
 *        register as RT0, so it is only valid until the first round/FL macro
 *        runs and must be reloaded from RDST before the output stage.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/*
 * Block state.  Suffix 0 is the first block, suffix 1 the second block of the
 * 2-way code.  RAB1 is %rbx, so the 2-way functions save and restore %rbx.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers.  RT1 is %r12; every function in this file stashes the
 * incoming %r12 in RR12 and restores it before returning.
 */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR - "bool xor" argument in the encrypt functions; in the decrypt
 *        functions it is a temporary (and, in the 2-way decrypt, the save
 *        slot for %rbx).
 * RR12 - saved copy of %r12.
 * RDST - saved copy of the dst pointer.
 */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16 - consume the two lowest bytes of ab.
 *
 * tmp2 receives byte 0 (the "bl" register) and tmp1 byte 1 (the "bh"
 * register) of ab, then ab is rotated right by 16 bits so the next two bytes
 * move into place.  The 8-byte entries T0[byte 0] and T1[byte 1] are XORed
 * into dst.  ab must be one of the registers that has bl/bh/d name macros.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	movzbl ab ## bl, tmp2 ## d; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq T0(, tmp2, 8), dst; \
	xorq T1(, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm - one round on block 0.
 *
 * Loads the 64-bit subkey number `subkey` (byte offset subkey * 8 in the key
 * table) into RT2, then runs four xor2ror16 steps over ab0.  The four 16-bit
 * rotations add up to 64 bits, so ab0 holds its original value again at the
 * end.  Table lookups are accumulated alternately into cd0 and RT2, and RT2
 * (subkey plus half of the lookups) is folded into cd0 last.
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls - the layer applied between groups of six rounds on block 0
 * (presumably the Camellia FL / FL^-1 functions, going by the name).
 *
 * With kl and kr being 64-bit subkey indices, the four steps are, in order:
 *   l0 ^= rol32(lo32(key[kl]) & lo32(l0), 1) << 32
 *   r0 ^= (key[kr] | r0) >> 32
 *   l0 ^= (key[kl] | l0) >> 32
 *   r0 ^= rol32(lo32(key[kr]) & lo32(r0), 1) << 32
 * Each step reads the value produced by the previous ones, so the statement
 * order matters.  Clobbers RT0, RT1, RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/* Six encryption rounds using subkeys i + 2 .. i + 7, alternating halves. */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* fls with subkey i for the AB half and subkey i + 1 for the CD half. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * Load one 16-byte block from RIO: each 8-byte half is byte-swapped and has
 * its two 32-bit words exchanged (a 64-bit rotate by 32 is the same in either
 * direction), then subkey 0 is XORed into the first half.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * Store the encrypted block to RIO.  `max` is a register holding the index of
 * the last 64-bit subkey, which is XORed into RCD0 first.  The halves are
 * written in swapped order (RCD0 to offset 0, RAB0 to offset 8).  `op` is
 * either mov (overwrite dst) or xor (XOR the result into dst).
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/* Six decryption rounds: same rounds as enc_rounds, subkeys in reverse. */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* fls with the two subkeys exchanged relative to enc_fls. */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/* Like enc_inpack, but the subkey at index `max` is XORed into RAB0. */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/* Like enc_outunpack(mov, ...), but subkey 0 is XORed into RCD0. */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

SYM_FUNC_START(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* RT1 is %r12, so keep the incoming value for the epilogue. */
	movq %r12, RR12;

	/* %rcx and %rsi are reused (RCD0 and RT0/RIO); move the args away. */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	/* RT1 is scratch inside the rounds, so max is only set afterwards. */
	movl $24, RT1d; /* max */

	/*
	 * NOTE(review): this compares one byte of key_length while the decrypt
	 * functions compare 32 bits (cmpl).  Both agree for small values such
	 * as 16; confirm the field width against the struct definition.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	/* Any other key length runs one more fls + six rounds. */
	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	/* movq leaves the flags from testb untouched for the jnz below. */
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)

SYM_FUNC_START(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* max = (key_length == 16) ? 24 : 32; RXOR is only a temporary here. */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 is clobbered by the rounds, so test max before running any. */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2 - one round on two blocks at once.
 *
 * Block 1: the subkey is XORed into cd1 up front and all four lookup steps
 * accumulate directly into cd1.
 * Block 0: handled as in roundsm, with RT2 carrying the subkey plus half of
 * the lookups.  RT2 is folded into cd0 after the first block-1 step; none of
 * the block-1 steps write RT2, so the result does not depend on where that
 * fold is placed.
 * Clobbers RT0, RT1, RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
	xorq RT2, cd ## 0; \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2 - the fls layer on two blocks.
 *
 * Performs the same four steps as fls on each block.  The steps of the two
 * blocks are interleaved in pairs (first two steps of block 0, first two of
 * block 1, last two of block 0, last two of block 1) and the temporaries
 * rotate through RT0, RT1 and RT2.  Clobbers RT0, RT1, RT2.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
	andl l ## 1d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, l ## 1; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
	orq r ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
	orq l ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, l ## 1; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
	andl r ## 1d, RT1d; \
	roll $1, RT1d; \
	shlq $32, RT1; \
	xorq RT1, r ## 1;

/* Six 2-way encryption rounds using subkeys i + 2 .. i + 7. */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* fls2 with subkey i for the AB halves and subkey i + 1 for the CD halves. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * Load two consecutive 16-byte blocks from RIO (block 0 at byte offsets 0 and
 * 8, block 1 at 16 and 24), converting each half as in enc_inpack, and XOR
 * subkey 0 into the first half of each block.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX), RAB1;

/*
 * Store two encrypted blocks to RIO; per block this is the same sequence as
 * enc_outunpack.  `op` is mov or xor, `max` a register holding the index of
 * the last 64-bit subkey.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX, max, 8), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	op ## q RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	op ## q RAB1, 12*2(RIO);

/* Six 2-way decryption rounds: subkeys i + 7 down to i + 2. */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* fls2 with the two subkeys exchanged relative to enc_fls2. */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/* Like enc_inpack2, but the subkey at index `max` is XORed into RAB0/RAB1. */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX, max, 8), RAB1;

/* Like enc_outunpack2(mov, ...), but subkey 0 is XORed into RCD0/RCD1. */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	movq RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	movq RAB1, 12*2(RIO);

SYM_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* RAB1 is %rbx; RXOR is taken by the xor flag, so use the stack. */
	pushq %rbx;

	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* RT2 is scratch inside the rounds, so max is only set afterwards. */
	movl $24, RT2d; /* max */

	/*
	 * NOTE(review): byte compare here versus cmpl in the decrypt
	 * functions; confirm the width of key_length.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/* Explicit byte suffix, matching __camellia_enc_blk. */
	testb RXORbl, RXORbl;
	/* movq leaves the flags from testb untouched for the jnz below. */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)

SYM_FUNC_START(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/*
	 * max = (key_length == 16) ? 24 : 32.  RXOR is used as a temporary for
	 * the cmov, which therefore has to come before %rbx is parked in RXOR.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* No xor flag in this function, so RXOR is free to hold %rbx. */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 is clobbered by the rounds, so test max before running any. */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)