/* xref: /OK3568_Linux_fs/kernel/arch/arm/crypto/blake2s-core.S (revision 4882a59341e53eb6f0b4789bf948001014eff981) */
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * BLAKE2s digest algorithm, ARM scalar implementation
 *
 * Copyright 2020 Google LLC
 *
 * Author: Eric Biggers <ebiggers@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	// Scratch registers for message words.  The 64-byte message block
	// doesn't fit in the ARM register file alongside the state, so words
	// are fetched on demand from the stack copy via these two registers.
	M_0		.req	r12
	M_1		.req	r14

// The BLAKE2s initialization vector (the same constants as SHA-256's IV)
.Lblake2s_IV:
	.word	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
	.word	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19

// Load two consecutive 32-bit words from [\src + \offset] into \a and \b.
// Uses a single ldrd where the architecture provides it (ARMv6+), otherwise
// falls back to two plain loads.
.macro __ldrd		a, b, src, offset
#if __LINUX_ARM_ARCH__ >= 6
	ldrd		\a, \b, [\src, #\offset]
#else
	ldr		\a, [\src, #\offset]
	ldr		\b, [\src, #\offset + 4]
#endif
.endm

// Store \a and \b to two consecutive 32-bit words at [\dst + \offset].
// Counterpart of __ldrd: a single strd on ARMv6+, two stores otherwise.
.macro __strd		a, b, dst, offset
#if __LINUX_ARM_ARCH__ >= 6
	strd		\a, \b, [\dst, #\offset]
#else
	str		\a, [\dst, #\offset]
	str		\b, [\dst, #\offset + 4]
#endif
.endm

// Convert the little-endian word in \a to CPU byte order.  This is a no-op
// on little-endian builds; big-endian builds byte-swap using \tmp as scratch.
.macro _le32_bswap	a, tmp
#ifdef __ARMEB__
	rev_l		\a, \tmp
#endif
.endm

// Apply _le32_bswap to eight registers at once (used on a half message block).
.macro _le32_bswap_8x	a, b, c, d, e, f, g, h,  tmp
	_le32_bswap	\a, \tmp
	_le32_bswap	\b, \tmp
	_le32_bswap	\c, \tmp
	_le32_bswap	\d, \tmp
	_le32_bswap	\e, \tmp
	_le32_bswap	\f, \tmp
	_le32_bswap	\g, \tmp
	_le32_bswap	\h, \tmp
.endm

// One BLAKE2s quarter-round step applied to two columns or two diagonals at
// once.  (a0, b0, c0, d0) and (a1, b1, c1, d1) name the registers holding the
// two columns/diagonals.  s0-s1 are the message-word indices needed by the
// first column/diagonal, and s2-s3 those for the second.  M_0 and M_1 may be
// clobbered; the stacked message block lives at sp + 32.
//
// To save instructions, rotations are not performed where the reference
// pseudocode says; they are deferred until each value is next consumed (via
// the shifter operand of 'add'/'eor').  See the comment above _blake2s_round().
.macro _blake2s_quarterround  a0, b0, c0, d0,  a1, b1, c1, d1,  s0, s1, s2, s3

	ldr		M_0, [sp, #32 + 4 * \s0]
	ldr		M_1, [sp, #32 + 4 * \s2]

	// a += b + m[blake2s_sigma[r][2*i + 0]];
	add		\a0, \a0, \b0, ror #brot
	add		\a1, \a1, \b1, ror #brot
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 16);
	eor		\d0, \a0, \d0, ror #drot
	eor		\d1, \a1, \d1, ror #drot

	// c += d;
	add		\c0, \c0, \d0, ror #16
	add		\c1, \c1, \d1, ror #16

	// b = ror32(b ^ c, 12);
	eor		\b0, \c0, \b0, ror #brot
	eor		\b1, \c1, \b1, ror #brot

	ldr		M_0, [sp, #32 + 4 * \s1]
	ldr		M_1, [sp, #32 + 4 * \s3]

	// a += b + m[blake2s_sigma[r][2*i + 1]];
	add		\a0, \a0, \b0, ror #12
	add		\a1, \a1, \b1, ror #12
	add		\a0, \a0, M_0
	add		\a1, \a1, M_1

	// d = ror32(d ^ a, 8);
	eor		\d0, \a0, \d0, ror #16
	eor		\d1, \a1, \d1, ror #16

	// c += d;
	add		\c0, \c0, \d0, ror #8
	add		\c1, \c1, \d1, ror #8

	// b = ror32(b ^ c, 7);
	eor		\b0, \c0, \b0, ror #12
	eor		\b1, \c1, \b1, ror #12
.endm

// Execute one round of BLAKE2s by updating the state matrix v[0..15].
// v[0..9] are kept in r0-r9.  The stack pointer points to 8 bytes of scratch
// space for spilling v[8..9], then to v[10..15], then to the message block.
// r10-r12 and r14 are free to use.  Arguments s0-s15 give the order in which
// this round consumes the message words.
//
// All rotates are done with the implicit rotate (shifter) operand of the
// 'add' and 'eor' instructions, which is faster than explicit rotate
// instructions.  To make this work, values in the second and last rows of
// the state matrix (rows 'b' and 'd') are allowed to temporarily carry the
// wrong rotation; the pending rotation is applied lazily when each value is
// next read.  'brot' is the right-rotate amount still owed by row 'b', and
// 'drot' likewise for row 'd'.  (brot, drot) start at (0, 0) and are left at
// (7, 8) by the end of every round.
.macro	_blake2s_round	s0, s1, s2, s3, s4, s5, s6, s7, \
			s8, s9, s10, s11, s12, s13, s14, s15

	// Mix first two columns:
	// (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
	__ldrd		r10, r11, sp, 16	// load v[12] and v[13]
	_blake2s_quarterround	r0, r4, r8, r10,  r1, r5, r9, r11, \
				\s0, \s1, \s2, \s3
	__strd		r8, r9, sp, 0
	__strd		r10, r11, sp, 16

	// Mix second two columns:
	// (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
	__ldrd		r8, r9, sp, 8		// load v[10] and v[11]
	__ldrd		r10, r11, sp, 24	// load v[14] and v[15]
	_blake2s_quarterround	r2, r6, r8, r10,  r3, r7, r9, r11, \
				\s4, \s5, \s6, \s7
	str		r10, [sp, #24]		// store v[14]
	// v[10], v[11], and v[15] are needed right below, so keep them in
	// registers instead of storing them yet.

	.set brot, 7
	.set drot, 8

	// Mix first two diagonals:
	// (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
	ldr		r10, [sp, #16]		// load v[12]
	_blake2s_quarterround	r0, r5, r8, r11,  r1, r6, r9, r10, \
				\s8, \s9, \s10, \s11
	__strd		r8, r9, sp, 8
	str		r11, [sp, #28]
	str		r10, [sp, #16]

	// Mix second two diagonals:
	// (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
	__ldrd		r8, r9, sp, 0		// load v[8] and v[9]
	__ldrd		r10, r11, sp, 20	// load v[13] and v[14]
	_blake2s_quarterround	r2, r7, r8, r10,  r3, r4, r9, r11, \
				\s12, \s13, \s14, \s15
	__strd		r10, r11, sp, 20
.endm

//
// void blake2s_compress(struct blake2s_state *state,
//			 const u8 *block, size_t nblocks, u32 inc);
//
// Only the first three fields of struct blake2s_state are used:
//	u32 h[8];	(inout)
//	u32 t[2];	(inout)
//	u32 f[2];	(in)
//
	.align		5
ENTRY(blake2s_compress)
	push		{r0-r2,r4-r11,lr}	// even register count keeps sp 8-byte aligned

.Lnext_block:
	// r0 is 'state'
	// r1 is 'block'
	// r3 is 'inc'

	// 64-bit counter update: t[0..1] += inc.
	__ldrd		r10, r11, r0, 32
	adds		r10, r10, r3
	adc		r11, r11, #0
	__strd		r10, r11, r0, 32

	// _blake2s_round is very short on registers, so copy the message block
	// to the stack to free one during the rounds.  A side benefit is that
	// misalignment of 'block' only has to be handled in this one place.
	sub		sp, sp, #64
	mov		r12, sp
	tst		r1, #3
	bne		.Lcopy_block_misaligned
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12!, {r2-r9}
	ldmia		r1!, {r2-r9}
	_le32_bswap_8x	r2, r3, r4, r5, r6, r7, r8, r9,  r14
	stmia		r12, {r2-r9}
.Lcopy_block_done:
	str		r1, [sp, #68]		// save the advanced message pointer

	// Compute v[8..15].  Push v[10..15] onto the stack below an 8-byte
	// area reserved for spilling v[8..9]; v[8..9] themselves stay in r8-r9.
	mov		r14, r0			// r14 = state
	adr		r12, .Lblake2s_IV
	ldmia		r12!, {r8-r9}		// load IV[0..1]
	__ldrd		r0, r1, r14, 40		// load f[0..1]
	ldm		r12, {r2-r7}		// load IV[2..7]
	eor		r4, r4, r10		// v[12] = IV[4] ^ t[0]
	eor		r5, r5, r11		// v[13] = IV[5] ^ t[1]
	eor		r6, r6, r0		// v[14] = IV[6] ^ f[0]
	eor		r7, r7, r1		// v[15] = IV[7] ^ f[1]
	push		{r2-r7}			// push v[10..15]
	sub		sp, sp, #8		// reserve spill space for v[8..9]

	// Load h[0..7], which serve as v[0..7].
	ldm		r14, {r0-r7}

	// The ten rounds; each is handed the order in which it consumes the
	// message words.
	.set brot, 0
	.set drot, 0
	_blake2s_round	0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
	_blake2s_round	14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
	_blake2s_round	11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
	_blake2s_round	7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
	_blake2s_round	9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
	_blake2s_round	2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
	_blake2s_round	12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
	_blake2s_round	13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
	_blake2s_round	6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
	_blake2s_round	10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0

	// Fold the final state matrix into the hash chaining value:
	//
	//	for (i = 0; i < 8; i++)
	//		h[i] ^= v[i] ^ v[i + 8];
	//
	ldr		r14, [sp, #96]		// r14 = &h[0]
	add		sp, sp, #8		// v[8..9] are already in r8-r9
	pop		{r10-r11}		// load v[10..11]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	ldm		r14, {r8-r11}		// load h[0..3]
	eor		r0, r0, r8
	eor		r1, r1, r9
	eor		r2, r2, r10
	eor		r3, r3, r11
	stmia		r14!, {r0-r3}		// store new h[0..3]
	ldm		r14, {r0-r3}		// load old h[4..7]
	pop		{r8-r11}		// load v[12..15]
	eor		r0, r0, r4, ror #brot
	eor		r1, r1, r5, ror #brot
	eor		r2, r2, r6, ror #brot
	eor		r3, r3, r7, ror #brot
	eor		r0, r0, r8, ror #drot
	eor		r1, r1, r9, ror #drot
	eor		r2, r2, r10, ror #drot
	eor		r3, r3, r11, ror #drot
	  add		sp, sp, #64		// discard the stacked message block
	stm		r14, {r0-r3}		// store new h[4..7]

	// Advance to the next block, if any.  When there are multiple blocks,
	// 'inc' (the counter increment) must be 64, so it can simply be
	// materialized rather than re-loaded.
	ldm		sp, {r0, r1, r2}	// load (state, block, nblocks)
	mov		r3, #64			// set 'inc'
	subs		r2, r2, #1		// nblocks--
	str		r2, [sp, #8]
	bne		.Lnext_block		// nblocks != 0?

	pop		{r0-r2,r4-r11,pc}

	// The next message block (pointed to by r1) isn't 4-byte aligned, so
	// ldmia can't be used.  Copy it word-by-word into the stack buffer
	// (pointed to by r12) instead.  r2-r9 are free to use here.
.Lcopy_block_misaligned:
	mov		r2, #64
1:
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
	ldr		r3, [r1], #4
	_le32_bswap	r3, r4
#else
	ldrb		r3, [r1, #0]
	ldrb		r4, [r1, #1]
	ldrb		r5, [r1, #2]
	ldrb		r6, [r1, #3]
	add		r1, r1, #4
	orr		r3, r3, r4, lsl #8
	orr		r3, r3, r5, lsl #16
	orr		r3, r3, r6, lsl #24
#endif
	subs		r2, r2, #4
	str		r3, [r12], #4
	bne		1b
	b		.Lcopy_block_done
ENDPROC(blake2s_compress)