Lines Matching +full:0 +full:- +full:32

1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * x86_64/AVX2/AES-NI assembler implementation of Camellia
10 #include <asm/nospec-branch.h>
15 #define key_table 0
52 32-way camellia
57 * x0..x7: byte-sliced AB state
61 * x0..x7: new byte-sliced CD state
66 * S-function with AES subbytes \
148 vpbroadcastq key, t0; /* higher 64-bit duplicate ignored */ \
160 /* P-function */ \
194 vpxor 5 * 32(mem_cd), x1, x1; \
201 vpxor 4 * 32(mem_cd), x0, x0; \
204 vpxor 6 * 32(mem_cd), x2, x2; \
207 vpxor 7 * 32(mem_cd), x3, x3; \
210 vpxor 0 * 32(mem_cd), x4, x4; \
213 vpxor 1 * 32(mem_cd), x5, x5; \
216 vpxor 2 * 32(mem_cd), x6, x6; \
219 vpxor 3 * 32(mem_cd), x7, x7;
243 * x0..x7: byte-sliced AB state preloaded
244 * mem_ab: byte-sliced AB state in memory
245 * mem_cd: byte-sliced CD state in memory
252 vmovdqu x0, 4 * 32(mem_cd); \
253 vmovdqu x1, 5 * 32(mem_cd); \
254 vmovdqu x2, 6 * 32(mem_cd); \
255 vmovdqu x3, 7 * 32(mem_cd); \
256 vmovdqu x4, 0 * 32(mem_cd); \
257 vmovdqu x5, 1 * 32(mem_cd); \
258 vmovdqu x6, 2 * 32(mem_cd); \
259 vmovdqu x7, 3 * 32(mem_cd); \
270 vmovdqu x4, 4 * 32(mem_ab); \
271 vmovdqu x5, 5 * 32(mem_ab); \
272 vmovdqu x6, 6 * 32(mem_ab); \
273 vmovdqu x7, 7 * 32(mem_ab); \
274 vmovdqu x0, 0 * 32(mem_ab); \
275 vmovdqu x1, 1 * 32(mem_ab); \
276 vmovdqu x2, 2 * 32(mem_ab); \
277 vmovdqu x3, 3 * 32(mem_ab);
291 y6, y7, mem_ab, mem_cd, (i) + 7, -1, store_ab_state); \
293 y6, y7, mem_ab, mem_cd, (i) + 5, -1, store_ab_state); \
295 y6, y7, mem_ab, mem_cd, (i) + 3, -1, dummy_store);
299 * v0..3: byte-sliced 32-bit integers
328 * r: byte-sliced AB state in memory
329 * l: byte-sliced CD state in memory
331 * x0..x7: new byte-sliced CD state
340 vpbroadcastd kll, t0; /* only lowest 32-bit used */ \
358 vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
359 vmovdqu l4, 4 * 32(l); \
361 vmovdqu l5, 5 * 32(l); \
363 vmovdqu l6, 6 * 32(l); \
365 vmovdqu l7, 7 * 32(l); \
381 vpor 4 * 32(r), t0, t0; \
382 vpor 5 * 32(r), t1, t1; \
383 vpor 6 * 32(r), t2, t2; \
384 vpor 7 * 32(r), t3, t3; \
386 vpxor 0 * 32(r), t0, t0; \
387 vpxor 1 * 32(r), t1, t1; \
388 vpxor 2 * 32(r), t2, t2; \
389 vpxor 3 * 32(r), t3, t3; \
390 vmovdqu t0, 0 * 32(r); \
391 vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
392 vmovdqu t1, 1 * 32(r); \
393 vmovdqu t2, 2 * 32(r); \
394 vmovdqu t3, 3 * 32(r); \
409 vpand 0 * 32(r), t0, t0; \
410 vpand 1 * 32(r), t1, t1; \
411 vpand 2 * 32(r), t2, t2; \
412 vpand 3 * 32(r), t3, t3; \
416 vpxor 4 * 32(r), t0, t0; \
417 vpxor 5 * 32(r), t1, t1; \
418 vpxor 6 * 32(r), t2, t2; \
419 vpxor 7 * 32(r), t3, t3; \
420 vmovdqu t0, 4 * 32(r); \
421 vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
422 vmovdqu t1, 5 * 32(r); \
423 vmovdqu t2, 6 * 32(r); \
424 vmovdqu t3, 7 * 32(r); \
446 vmovdqu l0, 0 * 32(l); \
448 vmovdqu l1, 1 * 32(l); \
450 vmovdqu l2, 2 * 32(l); \
452 vmovdqu l3, 3 * 32(l);
516 /* load blocks to registers and apply pre-whitening */
522 vpxor 0 * 32(rio), x0, y7; \
523 vpxor 1 * 32(rio), x0, y6; \
524 vpxor 2 * 32(rio), x0, y5; \
525 vpxor 3 * 32(rio), x0, y4; \
526 vpxor 4 * 32(rio), x0, y3; \
527 vpxor 5 * 32(rio), x0, y2; \
528 vpxor 6 * 32(rio), x0, y1; \
529 vpxor 7 * 32(rio), x0, y0; \
530 vpxor 8 * 32(rio), x0, x7; \
531 vpxor 9 * 32(rio), x0, x6; \
532 vpxor 10 * 32(rio), x0, x5; \
533 vpxor 11 * 32(rio), x0, x4; \
534 vpxor 12 * 32(rio), x0, x3; \
535 vpxor 13 * 32(rio), x0, x2; \
536 vpxor 14 * 32(rio), x0, x1; \
537 vpxor 15 * 32(rio), x0, x0;
539 /* byteslice pre-whitened blocks and store to temporary memory */
545 vmovdqu x0, 0 * 32(mem_ab); \
546 vmovdqu x1, 1 * 32(mem_ab); \
547 vmovdqu x2, 2 * 32(mem_ab); \
548 vmovdqu x3, 3 * 32(mem_ab); \
549 vmovdqu x4, 4 * 32(mem_ab); \
550 vmovdqu x5, 5 * 32(mem_ab); \
551 vmovdqu x6, 6 * 32(mem_ab); \
552 vmovdqu x7, 7 * 32(mem_ab); \
553 vmovdqu y0, 0 * 32(mem_cd); \
554 vmovdqu y1, 1 * 32(mem_cd); \
555 vmovdqu y2, 2 * 32(mem_cd); \
556 vmovdqu y3, 3 * 32(mem_cd); \
557 vmovdqu y4, 4 * 32(mem_cd); \
558 vmovdqu y5, 5 * 32(mem_cd); \
559 vmovdqu y6, 6 * 32(mem_cd); \
560 vmovdqu y7, 7 * 32(mem_cd);
562 /* de-byteslice, apply post-whitening and store blocks */
592 vmovdqu x0, 0 * 32(rio); \
593 vmovdqu x1, 1 * 32(rio); \
594 vmovdqu x2, 2 * 32(rio); \
595 vmovdqu x3, 3 * 32(rio); \
596 vmovdqu x4, 4 * 32(rio); \
597 vmovdqu x5, 5 * 32(rio); \
598 vmovdqu x6, 6 * 32(rio); \
599 vmovdqu x7, 7 * 32(rio); \
600 vmovdqu y0, 8 * 32(rio); \
601 vmovdqu y1, 9 * 32(rio); \
602 vmovdqu y2, 10 * 32(rio); \
603 vmovdqu y3, 11 * 32(rio); \
604 vmovdqu y4, 12 * 32(rio); \
605 vmovdqu y5, 13 * 32(rio); \
606 vmovdqu y6, 14 * 32(rio); \
607 vmovdqu y7, 15 * 32(rio);
610 .section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
611 .align 32
613 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
615 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
616 .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
618 .section .rodata.cst32.pack_bswap, "aM", @progbits, 32
619 .align 32
621 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
622 .long 0x00010203, 0x04050607, 0x80808080, 0x80808080
624 /* NB: section is mergeable, all elements must be aligned 16-byte blocks */
628 /* For CTR-mode IV byteswap */
630 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
634 .byte 0x87, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0
636 .byte 0x0e, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0
639 * pre-SubByte transform
641 * pre-lookup for sbox1, sbox2, sbox3:
650 * (note: '⊕ 0xc5' inside camellia_f())
653 .byte 0x45, 0xe8, 0x40, 0xed, 0x2e, 0x83, 0x2b, 0x86
654 .byte 0x4b, 0xe6, 0x4e, 0xe3, 0x20, 0x8d, 0x25, 0x88
656 .byte 0x00, 0x51, 0xf1, 0xa0, 0x8a, 0xdb, 0x7b, 0x2a
657 .byte 0x09, 0x58, 0xf8, 0xa9, 0x83, 0xd2, 0x72, 0x23
660 * pre-SubByte transform
662 * pre-lookup for sbox4:
671 * (note: '⊕ 0xc5' inside camellia_f())
674 .byte 0x45, 0x40, 0x2e, 0x2b, 0x4b, 0x4e, 0x20, 0x25
675 .byte 0x14, 0x11, 0x7f, 0x7a, 0x1a, 0x1f, 0x71, 0x74
677 .byte 0x00, 0xf1, 0x8a, 0x7b, 0x09, 0xf8, 0x83, 0x72
678 .byte 0xad, 0x5c, 0x27, 0xd6, 0xa4, 0x55, 0x2e, 0xdf
681 * post-SubByte transform
683 * post-lookup for sbox1, sbox4:
694 * (note: '⊕ 0x6e' inside camellia_h())
697 .byte 0x3c, 0xcc, 0xcf, 0x3f, 0x32, 0xc2, 0xc1, 0x31
698 .byte 0xdc, 0x2c, 0x2f, 0xdf, 0xd2, 0x22, 0x21, 0xd1
700 .byte 0x00, 0xf9, 0x86, 0x7f, 0xd7, 0x2e, 0x51, 0xa8
701 .byte 0xa4, 0x5d, 0x22, 0xdb, 0x73, 0x8a, 0xf5, 0x0c
704 * post-SubByte transform
706 * post-lookup for sbox2:
717 * (note: '⊕ 0x6e' inside camellia_h())
720 .byte 0x78, 0x99, 0x9f, 0x7e, 0x64, 0x85, 0x83, 0x62
721 .byte 0xb9, 0x58, 0x5e, 0xbf, 0xa5, 0x44, 0x42, 0xa3
723 .byte 0x00, 0xf3, 0x0d, 0xfe, 0xaf, 0x5c, 0xa2, 0x51
724 .byte 0x49, 0xba, 0x44, 0xb7, 0xe6, 0x15, 0xeb, 0x18
727 * post-SubByte transform
729 * post-lookup for sbox3:
740 * (note: '⊕ 0x6e' inside camellia_h())
743 .byte 0x1e, 0x66, 0xe7, 0x9f, 0x19, 0x61, 0xe0, 0x98
744 .byte 0x6e, 0x16, 0x97, 0xef, 0x69, 0x11, 0x90, 0xe8
746 .byte 0x00, 0xfc, 0x43, 0xbf, 0xeb, 0x17, 0xa8, 0x54
747 .byte 0x52, 0xae, 0x11, 0xed, 0xb9, 0x45, 0xfa, 0x06
751 .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
752 .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
756 /* 4-bit mask */
758 .long 0x0f0f0f0f
767 * %ymm0..%ymm15: 32 plaintext blocks
769 * %ymm0..%ymm15: 32 encrypted blocks, order swapped:
770 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
774 leaq 8 * 32(%rax), %rcx;
782 %ymm15, %rax, %rcx, 0);
787 ((key_table + (8) * 8) + 0)(CTX),
799 ((key_table + (16) * 8) + 0)(CTX),
814 vmovdqu 0 * 32(%rcx), %ymm8;
815 vmovdqu 1 * 32(%rcx), %ymm9;
816 vmovdqu 2 * 32(%rcx), %ymm10;
817 vmovdqu 3 * 32(%rcx), %ymm11;
818 vmovdqu 4 * 32(%rcx), %ymm12;
819 vmovdqu 5 * 32(%rcx), %ymm13;
820 vmovdqu 6 * 32(%rcx), %ymm14;
821 vmovdqu 7 * 32(%rcx), %ymm15;
825 %ymm15, (key_table)(CTX, %r8, 8), (%rax), 1 * 32(%rax));
832 movl $32, %r8d;
837 ((key_table + (24) * 8) + 0)(CTX),
854 * %r8d: 24 for 16 byte key, 32 for larger
858 * 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8
862 leaq 8 * 32(%rax), %rcx;
868 cmpl $32, %r8d;
881 ((key_table + (16) * 8) + 0)(CTX),
893 ((key_table + (8) * 8) + 0)(CTX),
898 %ymm15, %rax, %rcx, 0);
901 vmovdqu 0 * 32(%rcx), %ymm8;
902 vmovdqu 1 * 32(%rcx), %ymm9;
903 vmovdqu 2 * 32(%rcx), %ymm10;
904 vmovdqu 3 * 32(%rcx), %ymm11;
905 vmovdqu 4 * 32(%rcx), %ymm12;
906 vmovdqu 5 * 32(%rcx), %ymm13;
907 vmovdqu 6 * 32(%rcx), %ymm14;
908 vmovdqu 7 * 32(%rcx), %ymm15;
912 %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax));
928 ((key_table + (24) * 8) + 0)(CTX),
937 * %rsi: dst (32 blocks)
938 * %rdx: src (32 blocks)
966 * %rsi: dst (32 blocks)
967 * %rdx: src (32 blocks)
974 movl $32, %r8d;
1000 * %rsi: dst (32 blocks)
1001 * %rdx: src (32 blocks)
1008 movl $32, %r8d;
1026 * dst still in-use (because dst == src), so use stack for temporary
1029 subq $(16 * 32), %rsp;
1040 vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
1041 vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
1042 vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
1043 vpxor (3 * 32 + 16)(%rdx), %ymm3, %ymm3;
1044 vpxor (4 * 32 + 16)(%rdx), %ymm2, %ymm2;
1045 vpxor (5 * 32 + 16)(%rdx), %ymm1, %ymm1;
1046 vpxor (6 * 32 + 16)(%rdx), %ymm0, %ymm0;
1047 vpxor (7 * 32 + 16)(%rdx), %ymm15, %ymm15;
1048 vpxor (8 * 32 + 16)(%rdx), %ymm14, %ymm14;
1049 vpxor (9 * 32 + 16)(%rdx), %ymm13, %ymm13;
1050 vpxor (10 * 32 + 16)(%rdx), %ymm12, %ymm12;
1051 vpxor (11 * 32 + 16)(%rdx), %ymm11, %ymm11;
1052 vpxor (12 * 32 + 16)(%rdx), %ymm10, %ymm10;
1053 vpxor (13 * 32 + 16)(%rdx), %ymm9, %ymm9;
1054 vpxor (14 * 32 + 16)(%rdx), %ymm8, %ymm8;
1082 * %rsi: dst (32 blocks)
1083 * %rdx: src (32 blocks)
1099 subq $(16 * 32), %rsp;
1104 vpsrldq $8, %ymm15, %ymm15; /* ab: -1:0 ; cd: -1:0 */
1105 vpaddq %ymm15, %ymm15, %ymm12; /* ab: -2:0 ; cd: -2:0 */
1114 vmovdqu %ymm13, 15 * 32(%rax);
1119 vmovdqu %ymm13, 14 * 32(%rax);
1122 vmovdqu %ymm13, 13 * 32(%rax);
1125 vmovdqu %ymm13, 12 * 32(%rax);
1128 vmovdqu %ymm13, 11 * 32(%rax);
1169 vpxor 11 * 32(%rax), %ymm15, %ymm11;
1170 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1171 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1172 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1173 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1179 vpxor 0 * 32(%rdx), %ymm7, %ymm7;
1180 vpxor 1 * 32(%rdx), %ymm6, %ymm6;
1181 vpxor 2 * 32(%rdx), %ymm5, %ymm5;
1182 vpxor 3 * 32(%rdx), %ymm4, %ymm4;
1183 vpxor 4 * 32(%rdx), %ymm3, %ymm3;
1184 vpxor 5 * 32(%rdx), %ymm2, %ymm2;
1185 vpxor 6 * 32(%rdx), %ymm1, %ymm1;
1186 vpxor 7 * 32(%rdx), %ymm0, %ymm0;
1187 vpxor 8 * 32(%rdx), %ymm15, %ymm15;
1188 vpxor 9 * 32(%rdx), %ymm14, %ymm14;
1189 vpxor 10 * 32(%rdx), %ymm13, %ymm13;
1190 vpxor 11 * 32(%rdx), %ymm12, %ymm12;
1191 vpxor 12 * 32(%rdx), %ymm11, %ymm11;
1192 vpxor 13 * 32(%rdx), %ymm10, %ymm10;
1193 vpxor 14 * 32(%rdx), %ymm9, %ymm9;
1194 vpxor 15 * 32(%rdx), %ymm8, %ymm8;
1208 vpshufd $0x13, tmp, tmp; \
1216 vpshufd $0x13, tmp0, tmp0; \
1219 vpshufd $0x13, tmp1, tmp1; \
1228 * %rsi: dst (32 blocks)
1229 * %rdx: src (32 blocks)
1238 subq $(16 * 32), %rsp;
1249 vpxor 0 * 32(%rdx), %ymm0, %ymm15;
1250 vmovdqu %ymm15, 15 * 32(%rax);
1251 vmovdqu %ymm0, 0 * 32(%rsi);
1255 vpxor 1 * 32(%rdx), %ymm0, %ymm15;
1256 vmovdqu %ymm15, 14 * 32(%rax);
1257 vmovdqu %ymm0, 1 * 32(%rsi);
1260 vpxor 2 * 32(%rdx), %ymm0, %ymm15;
1261 vmovdqu %ymm15, 13 * 32(%rax);
1262 vmovdqu %ymm0, 2 * 32(%rsi);
1265 vpxor 3 * 32(%rdx), %ymm0, %ymm15;
1266 vmovdqu %ymm15, 12 * 32(%rax);
1267 vmovdqu %ymm0, 3 * 32(%rsi);
1270 vpxor 4 * 32(%rdx), %ymm0, %ymm11;
1271 vmovdqu %ymm0, 4 * 32(%rsi);
1274 vpxor 5 * 32(%rdx), %ymm0, %ymm10;
1275 vmovdqu %ymm0, 5 * 32(%rsi);
1278 vpxor 6 * 32(%rdx), %ymm0, %ymm9;
1279 vmovdqu %ymm0, 6 * 32(%rsi);
1282 vpxor 7 * 32(%rdx), %ymm0, %ymm8;
1283 vmovdqu %ymm0, 7 * 32(%rsi);
1286 vpxor 8 * 32(%rdx), %ymm0, %ymm7;
1287 vmovdqu %ymm0, 8 * 32(%rsi);
1290 vpxor 9 * 32(%rdx), %ymm0, %ymm6;
1291 vmovdqu %ymm0, 9 * 32(%rsi);
1294 vpxor 10 * 32(%rdx), %ymm0, %ymm5;
1295 vmovdqu %ymm0, 10 * 32(%rsi);
1298 vpxor 11 * 32(%rdx), %ymm0, %ymm4;
1299 vmovdqu %ymm0, 11 * 32(%rsi);
1302 vpxor 12 * 32(%rdx), %ymm0, %ymm3;
1303 vmovdqu %ymm0, 12 * 32(%rsi);
1306 vpxor 13 * 32(%rdx), %ymm0, %ymm2;
1307 vmovdqu %ymm0, 13 * 32(%rsi);
1310 vpxor 14 * 32(%rdx), %ymm0, %ymm1;
1311 vmovdqu %ymm0, 14 * 32(%rsi);
1314 vpxor 15 * 32(%rdx), %ymm0, %ymm15;
1315 vmovdqu %ymm15, 0 * 32(%rax);
1316 vmovdqu %ymm0, 15 * 32(%rsi);
1325 vpxor 0 * 32(%rax), %ymm15, %ymm0;
1337 vpxor 12 * 32(%rax), %ymm15, %ymm12;
1338 vpxor 13 * 32(%rax), %ymm15, %ymm13;
1339 vpxor 14 * 32(%rax), %ymm15, %ymm14;
1340 vpxor 15 * 32(%rax), %ymm15, %ymm15;
1344 addq $(16 * 32), %rsp;
1346 vpxor 0 * 32(%rsi), %ymm7, %ymm7;
1347 vpxor 1 * 32(%rsi), %ymm6, %ymm6;
1348 vpxor 2 * 32(%rsi), %ymm5, %ymm5;
1349 vpxor 3 * 32(%rsi), %ymm4, %ymm4;
1350 vpxor 4 * 32(%rsi), %ymm3, %ymm3;
1351 vpxor 5 * 32(%rsi), %ymm2, %ymm2;
1352 vpxor 6 * 32(%rsi), %ymm1, %ymm1;
1353 vpxor 7 * 32(%rsi), %ymm0, %ymm0;
1354 vpxor 8 * 32(%rsi), %ymm15, %ymm15;
1355 vpxor 9 * 32(%rsi), %ymm14, %ymm14;
1356 vpxor 10 * 32(%rsi), %ymm13, %ymm13;
1357 vpxor 11 * 32(%rsi), %ymm12, %ymm12;
1358 vpxor 12 * 32(%rsi), %ymm11, %ymm11;
1359 vpxor 13 * 32(%rsi), %ymm10, %ymm10;
1360 vpxor 14 * 32(%rsi), %ymm9, %ymm9;
1361 vpxor 15 * 32(%rsi), %ymm8, %ymm8;
1375 * %rsi: dst (32 blocks)
1376 * %rdx: src (32 blocks)
1380 xorl %r8d, %r8d; /* input whitening key, 0 for enc */
1390 * %rsi: dst (32 blocks)
1391 * %rdx: src (32 blocks)
1396 movl $32, %r8d;