/* SPDX-License-Identifier: BSD-2-Clause */
/*
 * Copyright (c) Hisilicon Technologies Co., Ltd. 2023. All rights reserved.
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 *
 * SM4 optimization for ARMv8 by SM4 HW instruction, which is an optional
 * Cryptographic Extension for ARMv8.2-A.
 *
 * The CE implementation refers to Linux kernel (sm4-ce-core.S contributed
 * by Tianjia Zhang <tianjia.zhang@linux.alibaba.com>).
 */

#include <asm.S>

.arch	armv8.2-a+crypto+sm4

/*
 * General-purpose register aliases.
 * twNl/twNh: low/high 64-bit halves of the eight XTS tweaks tw0..tw7.
 * tmpw*, tmpx*: scratch registers (w/x views of x23, x24 and w25).
 */
#define tw0l	x7
#define tw0h	x8
#define tw1l	x9
#define tw1h	x10
#define tw2l	x11
#define tw2h	x12
#define tw3l	x13
#define tw3h	x14
#define tw4l	x15
#define tw4h	x16
#define tw5l	x17
#define tw5h	x18
#define tw6l	x19
#define tw6h	x20
#define tw7l	x21
#define tw7h	x22
#define tmpw0	w23
#define tmpx0	x23
#define tmpw1	w24
#define tmpx1	x24
#define tmpw2	w25

/* round keys: v0-v7 */
#define RK0	v0
#define RK1	v1
#define RK2	v2
#define RK3	v3
#define RK4	v4
#define RK5	v5
#define RK6	v6
#define RK7	v7

/* plain blocks: v8-v15 */
#define BLK0	v8
#define BLK1	v9
#define BLK2	v10
#define BLK3	v11
#define BLK4	v12
#define BLK5	v13
#define BLK6	v14
#define BLK7	v15

/* temporaries: v16-v24, chaining value / tweak: v25 */
#define TMP0	v16
#define TMP1	v17
#define TMP2	v18
#define TMP3	v19
#define TMP4	v20
#define TMP5	v21
#define TMP6	v22
#define TMP7	v23
#define TMP8	v24
#define IV	v25

/*
 * Save x15-x30 and d8-d15 in a 192-byte (0xc0) stack frame.
 * Layout, from the new sp upwards: d14/d15, d12/d13, d10/d11, d8/d9,
 * x29/x30, x27/x28, x25/x26, x23/x24, x21/x22, x19/x20, x17/x18, x15/x16.
 */
.macro frame_push
	stp	d14, d15, [sp, #-0xc0]!
	stp	d12, d13, [sp, #0x10]
	stp	d10, d11, [sp, #0x20]
	stp	d8, d9, [sp, #0x30]
	stp	x29, x30, [sp, #0x40]
	stp	x27, x28, [sp, #0x50]
	stp	x25, x26, [sp, #0x60]
	stp	x23, x24, [sp, #0x70]
	stp	x21, x22, [sp, #0x80]
	stp	x19, x20, [sp, #0x90]
	stp	x17, x18, [sp, #0xa0]
	stp	x15, x16, [sp, #0xb0]
.endm

/* Reload everything stored by frame_push and release the 192-byte frame */
.macro frame_pop
	ldp	x15, x16, [sp, #0xb0]
	ldp	x17, x18, [sp, #0xa0]
	ldp	x19, x20, [sp, #0x90]
	ldp	x21, x22, [sp, #0x80]
	ldp	x23, x24, [sp, #0x70]
	ldp	x25, x26, [sp, #0x60]
	ldp	x27, x28, [sp, #0x50]
	ldp	x29, x30, [sp, #0x40]
	ldp	d8, d9, [sp, #0x30]
	ldp	d10, d11, [sp, #0x20]
	ldp	d12, d13, [sp, #0x10]
	ldp	d14, d15, [sp], #0xc0
.endm

/* Helper: apply SM4E to \in with RK0, RK1, ..., RK7 in that order */
.macro sm4e_rk0_to_rk7, in
	sm4e	\in\().4s, RK0.4s
	sm4e	\in\().4s, RK1.4s
	sm4e	\in\().4s, RK2.4s
	sm4e	\in\().4s, RK3.4s
	sm4e	\in\().4s, RK4.4s
	sm4e	\in\().4s, RK5.4s
	sm4e	\in\().4s, RK6.4s
	sm4e	\in\().4s, RK7.4s
.endm

/* Helper: reverse the order of the four 32-bit words of \rk, in place */
.macro rk_reverse_words, rk
	rev64	\rk\().4s, \rk\().4s
	ext	\rk\().16b, \rk\().16b, \rk\().16b, #8
.endm

/*
 * Run the SM4E sequence on \in without any byte swapping: the caller
 * supplies \in as 32-bit words, and the result only has its word order
 * reversed (rev64 on .4s lanes followed by swapping the 64-bit halves).
 */
.macro encrypt_block_no_rev, in
	sm4e_rk0_to_rk7	\in
	rev64	\in\().4s, \in\().4s
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm

/*
 * Process one block held in \in in memory byte order: byte-swap every
 * 32-bit word, run the SM4E sequence with RK0..RK7, then reverse all
 * 16 bytes (rev64 on .16b lanes followed by swapping the 64-bit halves).
 */
.macro encrypt_block, in
	rev32	\in\().16b, \in\().16b
	sm4e_rk0_to_rk7	\in
	rev64	\in\().16b, \in\().16b
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm

/*
 * Inverse of encrypt_block for the same key schedule: each round-key
 * register has its word order reversed and the registers are applied
 * from RK7 down to RK0.
 * Note: RK0-RK7 are modified in place and left word-reversed.
 */
.macro decrypt_block, in
	rev32	\in\().16b, \in\().16b
	rk_reverse_words	RK7
	rk_reverse_words	RK6
	rk_reverse_words	RK5
	rk_reverse_words	RK4
	sm4e	\in\().4s, RK7.4s
	sm4e	\in\().4s, RK6.4s
	sm4e	\in\().4s, RK5.4s
	sm4e	\in\().4s, RK4.4s
	rk_reverse_words	RK3
	rk_reverse_words	RK2
	rk_reverse_words	RK1
	rk_reverse_words	RK0
	sm4e	\in\().4s, RK3.4s
	sm4e	\in\().4s, RK2.4s
	sm4e	\in\().4s, RK1.4s
	sm4e	\in\().4s, RK0.4s
	rev64	\in\().16b, \in\().16b
	ext	\in\().16b, \in\().16b, \in\().16b, #8
.endm

/*
 * Single-block helpers.
 * In/out: BLK0. Round keys must already be loaded in RK0-RK7.
 * sm4_decrypt_block1x leaves RK0-RK7 word-reversed (see decrypt_block).
 */
LOCAL_FUNC sm4_encrypt_block1x , :
	encrypt_block	BLK0
	ret
END_FUNC sm4_encrypt_block1x

LOCAL_FUNC sm4_decrypt_block1x , :
	decrypt_block	BLK0
	ret
END_FUNC sm4_decrypt_block1x

/* Helper: one SM4E step with round-key register \rk on BLK0-BLK3 */
.macro sm4e_blk4x, rk
	sm4e	BLK0.4s, \rk\().4s
	sm4e	BLK1.4s, \rk\().4s
	sm4e	BLK2.4s, \rk\().4s
	sm4e	BLK3.4s, \rk\().4s
.endm

/* Helper: one SM4E step with round-key register \rk on BLK0-BLK7 */
.macro sm4e_blk8x, rk
	sm4e	BLK0.4s, \rk\().4s
	sm4e	BLK1.4s, \rk\().4s
	sm4e	BLK2.4s, \rk\().4s
	sm4e	BLK3.4s, \rk\().4s
	sm4e	BLK4.4s, \rk\().4s
	sm4e	BLK5.4s, \rk\().4s
	sm4e	BLK6.4s, \rk\().4s
	sm4e	BLK7.4s, \rk\().4s
.endm

/*
 * Process BLK0-BLK3 in parallel with the round keys in RK0-RK7.
 * Same per-block transformation as encrypt_block; the steps of the four
 * blocks are interleaved. Only BLK0-BLK3 are written.
 */
LOCAL_FUNC sm4_encrypt_block4x , :
	rev32	BLK0.16b, BLK0.16b
	rev32	BLK1.16b, BLK1.16b
	rev32	BLK2.16b, BLK2.16b
	rev32	BLK3.16b, BLK3.16b

	sm4e_blk4x	RK0
	sm4e_blk4x	RK1
	sm4e_blk4x	RK2
	sm4e_blk4x	RK3
	sm4e_blk4x	RK4
	sm4e_blk4x	RK5
	sm4e_blk4x	RK6
	sm4e_blk4x	RK7

	rev64	BLK0.16b, BLK0.16b
	rev64	BLK1.16b, BLK1.16b
	rev64	BLK2.16b, BLK2.16b
	rev64	BLK3.16b, BLK3.16b

	ext	BLK0.16b, BLK0.16b, BLK0.16b, #8
	ext	BLK1.16b, BLK1.16b, BLK1.16b, #8
	ext	BLK2.16b, BLK2.16b, BLK2.16b, #8
	ext	BLK3.16b, BLK3.16b, BLK3.16b, #8
	ret
END_FUNC sm4_encrypt_block4x

/*
 * Process BLK0-BLK7 in parallel with the round keys in RK0-RK7.
 * Same per-block transformation as encrypt_block; only BLK0-BLK7 are
 * written.
 */
LOCAL_FUNC sm4_encrypt_block8x , :
	rev32	BLK0.16b, BLK0.16b
	rev32	BLK1.16b, BLK1.16b
	rev32	BLK2.16b, BLK2.16b
	rev32	BLK3.16b, BLK3.16b
	rev32	BLK4.16b, BLK4.16b
	rev32	BLK5.16b, BLK5.16b
	rev32	BLK6.16b, BLK6.16b
	rev32	BLK7.16b, BLK7.16b

	sm4e_blk8x	RK0
	sm4e_blk8x	RK1
	sm4e_blk8x	RK2
	sm4e_blk8x	RK3
	sm4e_blk8x	RK4
	sm4e_blk8x	RK5
	sm4e_blk8x	RK6
	sm4e_blk8x	RK7

	rev64	BLK0.16b, BLK0.16b
	rev64	BLK1.16b, BLK1.16b
	rev64	BLK2.16b, BLK2.16b
	rev64	BLK3.16b, BLK3.16b
	rev64	BLK4.16b, BLK4.16b
	rev64	BLK5.16b, BLK5.16b
	rev64	BLK6.16b, BLK6.16b
	rev64	BLK7.16b, BLK7.16b

	ext	BLK0.16b, BLK0.16b, BLK0.16b, #8
	ext	BLK1.16b, BLK1.16b, BLK1.16b, #8
	ext	BLK2.16b, BLK2.16b, BLK2.16b, #8
	ext	BLK3.16b, BLK3.16b, BLK3.16b, #8
	ext	BLK4.16b, BLK4.16b, BLK4.16b, #8
	ext	BLK5.16b, BLK5.16b, BLK5.16b, #8
	ext	BLK6.16b, BLK6.16b, BLK6.16b, #8
	ext	BLK7.16b, BLK7.16b, BLK7.16b, #8
	ret
END_FUNC sm4_encrypt_block8x

/*
 * Emit the current 128-bit counter into \vctr and post-increment it.
 * \low goes to lane d[0] and \high to lane d[1]; rev64 then byte-swaps
 * each 64-bit lane. Despite the parameter names, \high is the least
 * significant half of the increment (adds) and \low receives the carry
 * (adc). Clobbers the condition flags.
 */
.macro inc_le128, vctr, low, high
	mov	\vctr\().d[1], \high
	mov	\vctr\().d[0], \low
	adds	\high, \high, #1
	adc	\low, \low, xzr
	rev64	\vctr\().16b, \vctr\().16b
.endm

/* \desv = \src1:\src0 (\src0 in lane d[0], \src1 in lane d[1]) */
.macro mov_reg_to_vec, desv, src0, src1
	mov	\desv\().d[0], \src0
	mov	\desv\().d[1], \src1
.endm

/*
 * \des1:\des0 = (\src1:\src0 << 1), with 0x87 XORed into the low byte
 * when bit 127 of the source is set (tweak doubling).
 * \des1 is written before \src0 is read for \des0, so \des1 must not be
 * the same register as \src0.
 * Clobbers tmpx0, tmpx1 and tmpw2; does not modify the condition flags.
 */
.macro next_tweak, des0, des1, src0, src1
	mov	tmpw2, 0x87
	/* tmpw0 = upper 32 bits of \src1 */
	extr	tmpx0, \src1, \src1, #32
	extr	\des1, \src1, \src0, #63
	/* tmpw1 = 0x87 if the top bit of \src1 is set, else 0 */
	and	tmpw1, tmpw2, tmpw0, asr #31
	eor	\des0, tmpx1, \src0, lsl #1
.endm

/*
 * Vector form of next_tweak: \desv = next tweak after \srcv.
 * Clobbers tw0l, tw0h, tw1l, tw1h and the next_tweak scratch registers;
 * does not modify the condition flags.
 */
.macro next_tweak_vec, desv, srcv
	mov	tw0l, \srcv\().d[0]
	mov	tw0h, \srcv\().d[1]
	next_tweak	tw1l, tw1h, tw0l, tw0h
	mov	\desv\().d[0], tw1l
	mov	\desv\().d[1], tw1h
.endm

/* Constants fed to sm4ekey by the key-setup routines (32 words) */
LOCAL_DATA .Lck , :
	.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
	.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
	.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
	.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
	.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
	.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
	.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
	.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
END_DATA .Lck

/* Constant XORed into the byte-swapped user key before key expansion */
LOCAL_DATA .Lfk , :
	.long	0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
END_DATA .Lfk

/*
 * void ce_sm4_setkey_enc(uint32_t sk[32], uint8_t const key[16]);
 * x0: round key
 * x1: user key
 *
 * Writes the 32 round-key words to sk in generation order.
 * Clobbers x0, x2, v0-v7 and v16-v24.
 */
FUNC ce_sm4_setkey_enc , :
	ld1	{RK0.4s}, [x1]
	adr	x2, .Lfk
	ld1	{TMP8.4s}, [x2]
	adr	x2, .Lck
	ld1	{TMP0.4s, TMP1.4s, TMP2.4s, TMP3.4s}, [x2], #64
	rev32	RK0.16b, RK0.16b
	ld1	{TMP4.4s, TMP5.4s, TMP6.4s, TMP7.4s}, [x2]
	eor	RK0.16b, RK0.16b, TMP8.16b
	sm4ekey	RK0.4s, RK0.4s, TMP0.4s
	sm4ekey	RK1.4s, RK0.4s, TMP1.4s
	sm4ekey	RK2.4s, RK1.4s, TMP2.4s
	sm4ekey	RK3.4s, RK2.4s, TMP3.4s
	sm4ekey	RK4.4s, RK3.4s, TMP4.4s
	st1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x0], #64
	sm4ekey	RK5.4s, RK4.4s, TMP5.4s
	sm4ekey	RK6.4s, RK5.4s, TMP6.4s
	sm4ekey	RK7.4s, RK6.4s, TMP7.4s
	st1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x0]
	ret
END_FUNC ce_sm4_setkey_enc

/*
 * void ce_sm4_setkey_dec(uint32_t sk[32], uint8_t const key[16]);
 * x0: round key
 * x1: user key
 *
 * Generates the same key words as ce_sm4_setkey_enc but stores them in
 * fully reversed order: the registers are filled from RK7 down to RK0
 * and each one has its four words reversed before the store.
 * Clobbers x0, x2, v0-v7 and v16-v24.
 */
FUNC ce_sm4_setkey_dec , :
	ld1	{RK7.4s}, [x1]
	adr	x2, .Lfk
	ld1	{TMP8.4s}, [x2]
	adr	x2, .Lck
	ld1	{TMP0.4s, TMP1.4s, TMP2.4s, TMP3.4s}, [x2], #64
	rev32	RK7.16b, RK7.16b
	ld1	{TMP4.4s, TMP5.4s, TMP6.4s, TMP7.4s}, [x2]
	eor	RK7.16b, RK7.16b, TMP8.16b
	sm4ekey	RK7.4s, RK7.4s, TMP0.4s
	sm4ekey	RK6.4s, RK7.4s, TMP1.4s
	sm4ekey	RK5.4s, RK6.4s, TMP2.4s
	rev64	RK7.4s, RK7.4s
	rev64	RK6.4s, RK6.4s
	ext	RK7.16b, RK7.16b, RK7.16b, #8
	ext	RK6.16b, RK6.16b, RK6.16b, #8
	sm4ekey	RK4.4s, RK5.4s, TMP3.4s
	sm4ekey	RK3.4s, RK4.4s, TMP4.4s
	rev64	RK5.4s, RK5.4s
	rev64	RK4.4s, RK4.4s
	ext	RK5.16b, RK5.16b, RK5.16b, #8
	ext	RK4.16b, RK4.16b, RK4.16b, #8
	sm4ekey	RK2.4s, RK3.4s, TMP5.4s
	sm4ekey	RK1.4s, RK2.4s, TMP6.4s
	rev64	RK3.4s, RK3.4s
	rev64	RK2.4s, RK2.4s
	ext	RK3.16b, RK3.16b, RK3.16b, #8
	ext	RK2.16b, RK2.16b, RK2.16b, #8
	sm4ekey	RK0.4s, RK1.4s, TMP7.4s
	rev64	RK1.4s, RK1.4s
	rev64	RK0.4s, RK0.4s
	ext	RK1.16b, RK1.16b, RK1.16b, #8
	ext	RK0.16b, RK0.16b, RK0.16b, #8
	st1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x0], #64
	st1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x0]
	ret
END_FUNC ce_sm4_setkey_dec

/*
 * void ce_sm4_ecb_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len);
 * x0: output
 * x1: input
 * x2: round key
 * w3: length
 *
 * Only the low 32 bits of len are read, and only whole 16-byte blocks
 * (len / 16) are processed. Blocks are handled 8 at a time, then 4, then
 * the remaining 1-3.
 */
FUNC ce_sm4_ecb_encrypt , :
	frame_push

	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	/* w3 = number of whole blocks */
	lsr	w3, w3, 4

.Lecbloop8x:
	cmp	w3, 8
	b.lt	.Lecb4x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	bl	sm4_encrypt_block8x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lecbloop8x

.Lecb4x:
	/* dispatch on the number of blocks left: 0, 1, 2, 3 or >= 4 */
	cmp	w3, 1
	b.lt	.Lecbout
	cmp	w3, 2
	b.lt	.Lecb1x
	cmp	w3, 3
	b.lt	.Lecb2x
	cmp	w3, 4
	b.lt	.Lecb3x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lecb4x

.Lecb3x:
	/* the 4x routine is reused; the unused BLK3 result is discarded */
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lecbout

.Lecb2x:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	bl	sm4_encrypt_block4x
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lecbout

.Lecb1x:
	ld1	{BLK0.16b}, [x1], #16
	bl	sm4_encrypt_block1x
	st1	{BLK0.16b}, [x0], #16

.Lecbout:
	frame_pop
	ret

END_FUNC ce_sm4_ecb_encrypt

/*
 * void ce_sm4_cbc_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 * x0: output
 * x1: input
 * x2: round key
 * w3: length
 * x4: iv
 *
 * Only the low 32 bits of len are read, and only whole 16-byte blocks
 * are processed. The last ciphertext block (or the unchanged IV when no
 * block was processed) is written back to iv.
 */
FUNC ce_sm4_cbc_encrypt , :
	frame_push

	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	lsr	w3, w3, 4
	ld1	{IV.16b}, [x4]

.Lcbcencloop4x:
	cmp	w3, 4
	b.lt	.Lcbcenc1x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, IV.16b
	/*
	 * Byte-swap all four blocks once, then chain them in word form:
	 * each encrypt_block_no_rev result is XORed into the next block
	 * before being swapped back to memory byte order for the store.
	 */
	rev32	BLK0.16b, BLK0.16b
	rev32	BLK1.16b, BLK1.16b
	rev32	BLK2.16b, BLK2.16b
	rev32	BLK3.16b, BLK3.16b
	encrypt_block_no_rev	BLK0
	eor	BLK1.16b, BLK1.16b, BLK0.16b
	encrypt_block_no_rev	BLK1
	rev32	BLK0.16b, BLK0.16b
	eor	BLK2.16b, BLK2.16b, BLK1.16b
	encrypt_block_no_rev	BLK2
	rev32	BLK1.16b, BLK1.16b
	eor	BLK3.16b, BLK3.16b, BLK2.16b
	encrypt_block_no_rev	BLK3
	rev32	BLK2.16b, BLK2.16b
	rev32	BLK3.16b, BLK3.16b
	mov	IV.16b, BLK3.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	subs	w3, w3, #4
	b	.Lcbcencloop4x
.Lcbcenc1x:
	cmp	w3, 1
	b.lt	.Lcbcencout
.Lcbcencloop:
	ld1	{BLK0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	mov	IV.16b, BLK0.16b
	st1	{BLK0.16b}, [x0], #16
	subs	w3, w3, #1
	bne	.Lcbcencloop
.Lcbcencout:
	st1	{IV.16b}, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_cbc_encrypt

/*
 * void ce_sm4_cbc_decrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 * x0: output
 * x1: input
 * x2: round key
 * w3: length
 * x4: iv
 *
 * Only the low 32 bits of len are read, and only whole 16-byte blocks
 * are processed. The blocks go through the sm4_encrypt_block* routines,
 * so rk is presumably the reversed schedule written by
 * ce_sm4_setkey_dec - confirm against the caller.
 * After the cipher call the ciphertext is re-read from the input buffer
 * through x5 to form the chaining XOR; every such read happens before
 * the corresponding output store. The last ciphertext block is written
 * back to iv.
 */
FUNC ce_sm4_cbc_decrypt , :
	frame_push

	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	lsr	w3, w3, 4
	ld1	{IV.16b}, [x4]

.Lcbcdecloop8x:
	cmp	w3, 8
	b.lt	.Lcbcdec4x

	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	bl	sm4_encrypt_block8x
	/* x5 = start of the eight ciphertext blocks just consumed */
	sub	x5, x1, #128
	eor	BLK0.16b, BLK0.16b, IV.16b
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	eor	BLK3.16b, BLK3.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x5], #64
	eor	BLK4.16b, BLK4.16b, TMP3.16b
	eor	BLK5.16b, BLK5.16b, TMP4.16b
	mov	IV.16b, TMP7.16b
	eor	BLK6.16b, BLK6.16b, TMP5.16b
	eor	BLK7.16b, BLK7.16b, TMP6.16b
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lcbcdecloop8x

.Lcbcdec4x:
	cmp	w3, 1
	b.lt	.Lcbcdecout
	cmp	w3, 2
	b.lt	.Lcbcdec1x
	cmp	w3, 3
	b.lt	.Lcbcdec2x
	cmp	w3, 4
	b.lt	.Lcbcdec3x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	bl	sm4_encrypt_block4x
	sub	x5, x1, 64
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x5], #64
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	eor	BLK3.16b, BLK3.16b, TMP2.16b
	mov	IV.16b, TMP3.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lcbcdec4x

.Lcbcdec3x:
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	bl	sm4_encrypt_block4x
	sub	x5, x1, 48
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x5], #48
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	eor	BLK2.16b, BLK2.16b, TMP1.16b
	mov	IV.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lcbcdecout

.Lcbcdec2x:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	bl	sm4_encrypt_block4x
	sub	x5, x1, 32
	ld1	{TMP0.16b, TMP1.16b}, [x5], #32
	eor	BLK0.16b, BLK0.16b, IV.16b
	eor	BLK1.16b, BLK1.16b, TMP0.16b
	mov	IV.16b, TMP1.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lcbcdecout

.Lcbcdec1x:
	ld1	{BLK0.16b}, [x1], #16
	bl	sm4_encrypt_block1x
	sub	x5, x1, 16
	ld1	{TMP0.16b}, [x5], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	mov	IV.16b, TMP0.16b
	st1	{BLK0.16b}, [x0], #16

.Lcbcdecout:
	st1	{IV.16b}, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_cbc_decrypt

/*
 * void ce_sm4_ctr_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk[], size_t len,
 *			   uint8_t iv[]);
 * x0: output
 * x1: input
 * x2: round key
 * w3: length
 * x4: iv
 *
 * Only the low 32 bits of len are read, and only whole 16-byte blocks
 * are processed. The 16-byte iv is treated as a 128-bit big-endian
 * counter kept in x7 (most significant half) and x8 (least significant
 * half); it is incremented once per block and written back to iv.
 */
FUNC ce_sm4_ctr_encrypt , :
	frame_push

	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	lsr	w3, w3, 4
	ldp	x7, x8, [x4]
	rev	x7, x7
	rev	x8, x8

.Lctrloop8x:
	cmp	w3, 8
	b.lt	.Lctr4x

	/* construct CTRs */
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	inc_le128	BLK3, x7, x8
	inc_le128	BLK4, x7, x8
	inc_le128	BLK5, x7, x8
	inc_le128	BLK6, x7, x8
	inc_le128	BLK7, x7, x8
	bl	sm4_encrypt_block8x
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
	ld1	{TMP4.16b, TMP5.16b, TMP6.16b, TMP7.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, TMP7.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w3, w3, #8
	b.gt	.Lctrloop8x

.Lctr4x:
	cmp	w3, 1
	b.lt	.Lctrout
	cmp	w3, 2
	b.lt	.Lctr1x
	cmp	w3, 3
	b.lt	.Lctr2x
	cmp	w3, 4
	b.lt	.Lctr3x
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	inc_le128	BLK3, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b, TMP3.16b}, [x1], #64
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w3, w3, #4
	b	.Lctr4x

.Lctr3x:
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	inc_le128	BLK2, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b, TMP2.16b}, [x1], #48
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w3, w3, #3
	b.le	.Lctrout

.Lctr2x:
	inc_le128	BLK0, x7, x8
	inc_le128	BLK1, x7, x8
	bl	sm4_encrypt_block4x
	ld1	{TMP0.16b, TMP1.16b}, [x1], #32
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w3, w3, #2
	b.le	.Lctrout

.Lctr1x:
	inc_le128	BLK0, x7, x8
	bl	sm4_encrypt_block1x
	ld1	{TMP0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	st1	{BLK0.16b}, [x0], #16

.Lctrout:
	/* store the updated counter back in its original byte order */
	rev	x7, x7
	rev	x8, x8
	stp	x7, x8, [x4]
	frame_pop
	ret
END_FUNC ce_sm4_ctr_encrypt

/*
 * Common XTS body for ce_sm4_xts_encrypt/ce_sm4_xts_decrypt.
 * x0: output
 * x1: input
 * x2: round key1 (block cipher)
 * x3: round key2 (tweak)
 * w4: length in bytes (split below into whole blocks and a remainder)
 * x5: iv (tweak), updated on return
 * x26: 1 for encryption, any other value for decryption
 *
 * The caller must have saved the callee-saved registers used here
 * (frame_push); this routine only saves x29/x30 itself.
 * A trailing partial block (len % 16 != 0 with at least one whole
 * block) is handled with ciphertext stealing. If len < 16 nothing is
 * processed.
 */
LOCAL_FUNC xts_do_cipher , :
	stp	x29, x30, [sp, #-16]!
	mov	x29, sp

	ld1	{IV.16b}, [x5]
	/* load round key2 for first tweak */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
	encrypt_block	IV
	/* load round key1 for block cipher */
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x2], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x2], #64
	/* w6: remain */
	and	w6, w4, #0x0F
	/* w4: blocks */
	lsr	w4, w4, 4
	/* blocks == 0: ret */
	cmp	w4, #1
	b.lt	.Lxtsout
	cmp	w6, 0
	b.eq	.Lxtsblks
	/* with a remainder, the last whole block is left for .Lxtstail */
	subs	w4, w4, #1
	b.eq	.Lxtstail
.Lxtsblks:
	/* precompute the first eight tweaks in tw0..tw7 */
	mov	tw0l, IV.d[0]
	mov	tw0h, IV.d[1]
	next_tweak	tw1l, tw1h, tw0l, tw0h
	next_tweak	tw2l, tw2h, tw1l, tw1h
	next_tweak	tw3l, tw3h, tw2l, tw2h
	next_tweak	tw4l, tw4h, tw3l, tw3h
	next_tweak	tw5l, tw5h, tw4l, tw4h
	next_tweak	tw6l, tw6h, tw5l, tw5h
	next_tweak	tw7l, tw7h, tw6l, tw6h
.Lxtsloop8x:
	cmp	w4, 8
	b.lt	.Lxts4x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	mov_reg_to_vec	TMP0, tw0l, tw0h
	mov_reg_to_vec	TMP1, tw1l, tw1h
	mov_reg_to_vec	TMP2, tw2l, tw2h
	mov_reg_to_vec	TMP3, tw3l, tw3h
	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	ld1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x1], #64
	mov_reg_to_vec	TMP4, tw4l, tw4h
	mov_reg_to_vec	TMP5, tw5l, tw5h
	mov_reg_to_vec	TMP6, tw6l, tw6h
	mov_reg_to_vec	IV, tw7l, tw7h
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, IV.16b

	bl	sm4_encrypt_block8x

	/*
	 * Rebuild the tweak vectors for the output XOR while advancing
	 * tw0..tw7 to the tweaks of the next eight blocks. IV ends up
	 * holding the tweak of the last block just processed.
	 */
	mov_reg_to_vec	TMP0, tw0l, tw0h
	next_tweak	tw0l, tw0h, tw7l, tw7h
	mov_reg_to_vec	TMP1, tw1l, tw1h
	next_tweak	tw1l, tw1h, tw0l, tw0h
	mov_reg_to_vec	TMP2, tw2l, tw2h
	next_tweak	tw2l, tw2h, tw1l, tw1h
	mov_reg_to_vec	TMP3, tw3l, tw3h
	next_tweak	tw3l, tw3h, tw2l, tw2h
	mov_reg_to_vec	TMP4, tw4l, tw4h
	next_tweak	tw4l, tw4h, tw3l, tw3h
	mov_reg_to_vec	TMP5, tw5l, tw5h
	next_tweak	tw5l, tw5h, tw4l, tw4h
	mov_reg_to_vec	TMP6, tw6l, tw6h
	next_tweak	tw6l, tw6h, tw5l, tw5h
	mov_reg_to_vec	IV, tw7l, tw7h
	next_tweak	tw7l, tw7h, tw6l, tw6h

	eor	BLK0.16b, BLK0.16b, TMP0.16b
	eor	BLK1.16b, BLK1.16b, TMP1.16b
	eor	BLK2.16b, BLK2.16b, TMP2.16b
	eor	BLK3.16b, BLK3.16b, TMP3.16b
	eor	BLK4.16b, BLK4.16b, TMP4.16b
	eor	BLK5.16b, BLK5.16b, TMP5.16b
	eor	BLK6.16b, BLK6.16b, TMP6.16b
	eor	BLK7.16b, BLK7.16b, IV.16b

	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	st1	{BLK4.16b, BLK5.16b, BLK6.16b, BLK7.16b}, [x0], #64
	subs	w4, w4, #8
	b.gt	.Lxtsloop8x

.Lxts4x:
	cmp	w4, 1
	b.lt	.Lxtsblksout
	cmp	w4, 2
	b.lt	.Lxts1x
	cmp	w4, 3
	b.lt	.Lxts2x
	cmp	w4, 4
	b.lt	.Lxts3x
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x1], #64
	/* BLK4-BLK6 hold tweaks here; the 4x routine leaves them intact */
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	BLK5, tw1l, tw1h
	mov_reg_to_vec	BLK6, tw2l, tw2h
	mov_reg_to_vec	IV, tw3l, tw3h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, BLK6.16b
	eor	BLK3.16b, BLK3.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, BLK6.16b
	eor	BLK3.16b, BLK3.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b, BLK3.16b}, [x0], #64
	sub	w4, w4, #4

	/* tweaks 4-6 become tweaks 0-2 for the remaining (< 4) blocks */
	mov	tw0l, tw4l
	mov	tw0h, tw4h
	mov	tw1l, tw5l
	mov	tw1h, tw5h
	mov	tw2l, tw6l
	mov	tw2h, tw6h
	b	.Lxts4x

.Lxts3x:
	ld1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x1], #48
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	BLK5, tw1l, tw1h
	mov_reg_to_vec	IV, tw2l, tw2h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, BLK5.16b
	eor	BLK2.16b, BLK2.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b, BLK2.16b}, [x0], #48
	subs	w4, w4, #3
	b.le	.Lxtsblksout

.Lxts2x:
	ld1	{BLK0.16b, BLK1.16b}, [x1], #32
	mov_reg_to_vec	BLK4, tw0l, tw0h
	mov_reg_to_vec	IV, tw1l, tw1h
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, IV.16b
	bl	sm4_encrypt_block4x
	eor	BLK0.16b, BLK0.16b, BLK4.16b
	eor	BLK1.16b, BLK1.16b, IV.16b
	st1	{BLK0.16b, BLK1.16b}, [x0], #32
	subs	w4, w4, #2
	b.le	.Lxtsblksout

.Lxts1x:
	ld1	{BLK0.16b}, [x1], #16
	mov_reg_to_vec	IV, tw0l, tw0h
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, IV.16b
	st1	{BLK0.16b}, [x0], #16
.Lxtsblksout:
	/*
	 * IV holds the tweak of the last block processed; advance it.
	 * next_tweak_vec does not touch the flags, so the b.eq below still
	 * tests the remainder: no partial block means we are done.
	 */
	cmp	w6, 0
	next_tweak_vec	IV, IV
	b.eq	.Lxtsout
.Lxtstail:
	/* last whole block followed by a partial block */
	next_tweak_vec	TMP7, IV
	cmp	x26, 1
	b.eq	1f
	/* The last two tweaks IV, TMP7 need to be swapped for decryption */
	mov	TMP8.16b, IV.16b
	mov	IV.16b, TMP7.16b
	mov	TMP7.16b, TMP8.16b
1:
	ld1	{BLK0.16b}, [x1], #16
	eor	BLK0.16b, BLK0.16b, IV.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, IV.16b
	st1	{BLK0.16b}, [x0], #16
	/* x7 = output block just written; x0/x1 = partial output/input */
	sub	x7, x0, 16
10:
	/*
	 * For each of the w6 remainder bytes (highest index first): move
	 * the byte of the block at x7 to the partial output and replace it
	 * with the partial input byte. Both bytes are loaded before either
	 * store so the exchange is also correct when out and in are the
	 * same buffer (then x0 == x1 here). tmpw1 is free at this point.
	 */
	subs	x6, x6, 1
	ldrb	tmpw0, [x7, x6]
	ldrb	tmpw1, [x1, x6]
	strb	tmpw0, [x0, x6]
	strb	tmpw1, [x7, x6]
	b.gt	10b
	ld1	{BLK0.16b}, [x7]
	eor	BLK0.16b, BLK0.16b, TMP7.16b
	bl	sm4_encrypt_block1x
	eor	BLK0.16b, BLK0.16b, TMP7.16b
	st1	{BLK0.16b}, [x7]

.Lxtsout:
	/* load round key2 for last tweak */
	sub	x3, x3, #128
	ld1	{RK0.4s, RK1.4s, RK2.4s, RK3.4s}, [x3], #64
	ld1	{RK4.4s, RK5.4s, RK6.4s, RK7.4s}, [x3], #64
	/* decrypt last tweak for next update */
	decrypt_block	IV
	st1	{IV.16b}, [x5]
	ldp	x29, x30, [sp], #16
	ret
END_FUNC xts_do_cipher

/*
 * void ce_sm4_xts_encrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk1[], uint8_t const rk2[],
 *			   size_t len, uint8_t iv[])
 * x0: output
 * x1: input
 * x2: round key1
 * x3: round key2
 * w4: len
 * x5: iv
 */
FUNC ce_sm4_xts_encrypt , :
	frame_push
	/* x26 = 1 selects encryption in xts_do_cipher */
	mov	x26, 1
	bl	xts_do_cipher
	frame_pop
	ret

END_FUNC ce_sm4_xts_encrypt

/*
 * void ce_sm4_xts_decrypt(uint8_t out[], uint8_t const in[],
 *			   uint8_t const rk1[], uint8_t const rk2[],
 *			   size_t len, uint8_t iv[])
 * x0: output
 * x1: input
 * x2: round key1
 * x3: round key2
 * w4: len
 * x5: iv
 */
FUNC ce_sm4_xts_decrypt , :
	frame_push
	/* x26 = 0 selects decryption in xts_do_cipher */
	mov	x26, 0
	bl	xts_do_cipher
	frame_pop
	ret
END_FUNC ce_sm4_xts_decrypt

BTI(emit_aarch64_feature_1_and GNU_PROPERTY_AARCH64_FEATURE_1_BTI)