/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sm3-ce-core.S - SM3 secure hash using ARMv8.2 Crypto Extensions
 *
 * Copyright (C) 2018 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	/*
	 * Map each NEON register name vN.4s to its register number N so the
	 * hand-encoded .inst macros below can splice register fields into
	 * the raw opcode words (the assembler may predate SM3 mnemonics).
	 */
	.irp		b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12
	.set		.Lv\b\().4s, \b
	.endr

	/* SM3PARTW1 Vd.4S, Vn.4S, Vm.4S - message expansion, part 1 */
	.macro		sm3partw1, rd, rn, rm
	.inst		0xce60c000 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
	.endm

	/* SM3PARTW2 Vd.4S, Vn.4S, Vm.4S - message expansion, part 2 */
	.macro		sm3partw2, rd, rn, rm
	.inst		0xce60c400 | .L\rd | (.L\rn << 5) | (.L\rm << 16)
	.endm

	/* SM3SS1 Vd.4S, Vn.4S, Vm.4S, Va.4S - intermediate SS1 value */
	.macro		sm3ss1, rd, rn, rm, ra
	.inst		0xce400000 | .L\rd | (.L\rn << 5) | (.L\ra << 10) | (.L\rm << 16)
	.endm

	/* SM3TT1A Vd.4S, Vn.4S, Vm.S[imm2] - TT1 update, rounds 0-15 */
	.macro		sm3tt1a, rd, rn, rm, imm2
	.inst		0xce408000 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
	.endm

	/* SM3TT1B Vd.4S, Vn.4S, Vm.S[imm2] - TT1 update, rounds 16-63 */
	.macro		sm3tt1b, rd, rn, rm, imm2
	.inst		0xce408400 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
	.endm

	/* SM3TT2A Vd.4S, Vn.4S, Vm.S[imm2] - TT2 update, rounds 0-15 */
	.macro		sm3tt2a, rd, rn, rm, imm2
	.inst		0xce408800 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
	.endm

	/* SM3TT2B Vd.4S, Vn.4S, Vm.S[imm2] - TT2 update, rounds 16-63 */
	.macro		sm3tt2b, rd, rn, rm, imm2
	.inst		0xce408c00 | .L\rd | (.L\rn << 5) | ((\imm2) << 12) | (.L\rm << 16)
	.endm

	/*
	 * One SM3 round: v8/v9 hold the state, \t0 the round constant
	 * (rotated into \t1 for the next round), v10 the W' schedule word.
	 * \ab selects the A (rounds 0-15) or B (rounds 16-63) variant.
	 */
	.macro		round, ab, s0, t0, t1, i
	sm3ss1		v5.4s, v8.4s, \t0\().4s, v9.4s
	shl		\t1\().4s, \t0\().4s, #1
	sri		\t1\().4s, \t0\().4s, #31
	sm3tt1\ab	v8.4s, v5.4s, v10.4s, \i
	sm3tt2\ab	v9.4s, v5.4s, \s0\().4s, \i
	.endm

	/*
	 * Four SM3 rounds plus (when \s4 is given) expansion of the next
	 * four message schedule words into \s4. The last three qrounds of
	 * a block omit \s4 since no further schedule words are needed.
	 */
	.macro		qround, ab, s0, s1, s2, s3, s4
	.ifnb		\s4
	ext		\s4\().16b, \s1\().16b, \s2\().16b, #12
	ext		v6.16b, \s0\().16b, \s1\().16b, #12
	ext		v7.16b, \s2\().16b, \s3\().16b, #8
	sm3partw1	\s4\().4s, \s0\().4s, \s3\().4s
	.endif

	eor		v10.16b, \s0\().16b, \s1\().16b

	round		\ab, \s0, v11, v12, 0
	round		\ab, \s0, v12, v11, 1
	round		\ab, \s0, v11, v12, 2
	round		\ab, \s0, v12, v11, 3

	.ifnb		\s4
	sm3partw2	\s4\().4s, v7.4s, v6.4s
	.endif
	.endm

	/*
	 * void sm3_ce_transform(struct sm3_state *sst, u8 const *src,
	 *                       int blocks)
	 */
	.text
SYM_FUNC_START(sm3_ce_transform)
	/* load state */
	ld1		{v8.4s-v9.4s}, [x0]
	rev64		v8.4s, v8.4s
	rev64		v9.4s, v9.4s
	ext		v8.16b, v8.16b, v8.16b, #8
	ext		v9.16b, v9.16b, v9.16b, #8

	adr_l		x8, .Lt
	ldp		s13, s14, [x8]

	/* load input */
0:	ld1		{v0.16b-v3.16b}, [x1], #64
	sub		w2, w2, #1

	/* keep a copy of the state for the final feed-forward xor */
	mov		v15.16b, v8.16b
	mov		v16.16b, v9.16b

CPU_LE(	rev32		v0.16b, v0.16b	)
CPU_LE(	rev32		v1.16b, v1.16b	)
CPU_LE(	rev32		v2.16b, v2.16b	)
CPU_LE(	rev32		v3.16b, v3.16b	)

	ext		v11.16b, v13.16b, v13.16b, #4

	qround		a, v0, v1, v2, v3, v4
	qround		a, v1, v2, v3, v4, v0
	qround		a, v2, v3, v4, v0, v1
	qround		a, v3, v4, v0, v1, v2

	ext		v11.16b, v14.16b, v14.16b, #4

	qround		b, v4, v0, v1, v2, v3
	qround		b, v0, v1, v2, v3, v4
	qround		b, v1, v2, v3, v4, v0
	qround		b, v2, v3, v4, v0, v1
	qround		b, v3, v4, v0, v1, v2
	qround		b, v4, v0, v1, v2, v3
	qround		b, v0, v1, v2, v3, v4
	qround		b, v1, v2, v3, v4, v0
	qround		b, v2, v3, v4, v0, v1
	qround		b, v3, v4
	qround		b, v4, v0
	qround		b, v0, v1

	eor		v8.16b, v8.16b, v15.16b
	eor		v9.16b, v9.16b, v16.16b

	/* handled all input blocks? */
	cbnz		w2, 0b

	/* save state */
	rev64		v8.4s, v8.4s
	rev64		v9.4s, v9.4s
	ext		v8.16b, v8.16b, v8.16b, #8
	ext		v9.16b, v9.16b, v9.16b, #8
	st1		{v8.4s-v9.4s}, [x0]
	ret
SYM_FUNC_END(sm3_ce_transform)

	.section	".rodata", "a"
	.align		3
.Lt:	.word		0x79cc4519, 0x9d8a7a87