1*4882a593Smuzhiyun // SPDX-License-Identifier: GPL-2.0
2*4882a593Smuzhiyun /*
3*4882a593Smuzhiyun * OpenSSL/Cryptogams accelerated Poly1305 transform for arm64
4*4882a593Smuzhiyun *
5*4882a593Smuzhiyun * Copyright (C) 2019 Linaro Ltd. <ard.biesheuvel@linaro.org>
6*4882a593Smuzhiyun */
7*4882a593Smuzhiyun
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>
#include <asm/unaligned.h>
#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include <crypto/internal/poly1305.h>
#include <crypto/internal/simd.h>
#include <linux/cpufeature.h>
#include <linux/crypto.h>
#include <linux/jump_label.h>
#include <linux/module.h>
#include <linux/string.h>
20*4882a593Smuzhiyun
21*4882a593Smuzhiyun asmlinkage void poly1305_init_arm64(void *state, const u8 *key);
22*4882a593Smuzhiyun asmlinkage void poly1305_blocks(void *state, const u8 *src, u32 len, u32 hibit);
23*4882a593Smuzhiyun asmlinkage void poly1305_blocks_neon(void *state, const u8 *src, u32 len, u32 hibit);
24*4882a593Smuzhiyun asmlinkage void poly1305_emit(void *state, u8 *digest, const u32 *nonce);
25*4882a593Smuzhiyun
26*4882a593Smuzhiyun static __ro_after_init DEFINE_STATIC_KEY_FALSE(have_neon);
27*4882a593Smuzhiyun
poly1305_init_arch(struct poly1305_desc_ctx * dctx,const u8 key[POLY1305_KEY_SIZE])28*4882a593Smuzhiyun void poly1305_init_arch(struct poly1305_desc_ctx *dctx, const u8 key[POLY1305_KEY_SIZE])
29*4882a593Smuzhiyun {
30*4882a593Smuzhiyun poly1305_init_arm64(&dctx->h, key);
31*4882a593Smuzhiyun dctx->s[0] = get_unaligned_le32(key + 16);
32*4882a593Smuzhiyun dctx->s[1] = get_unaligned_le32(key + 20);
33*4882a593Smuzhiyun dctx->s[2] = get_unaligned_le32(key + 24);
34*4882a593Smuzhiyun dctx->s[3] = get_unaligned_le32(key + 28);
35*4882a593Smuzhiyun dctx->buflen = 0;
36*4882a593Smuzhiyun }
37*4882a593Smuzhiyun EXPORT_SYMBOL(poly1305_init_arch);
38*4882a593Smuzhiyun
neon_poly1305_init(struct shash_desc * desc)39*4882a593Smuzhiyun static int neon_poly1305_init(struct shash_desc *desc)
40*4882a593Smuzhiyun {
41*4882a593Smuzhiyun struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
42*4882a593Smuzhiyun
43*4882a593Smuzhiyun dctx->buflen = 0;
44*4882a593Smuzhiyun dctx->rset = 0;
45*4882a593Smuzhiyun dctx->sset = false;
46*4882a593Smuzhiyun
47*4882a593Smuzhiyun return 0;
48*4882a593Smuzhiyun }
49*4882a593Smuzhiyun
/*
 * Feed whole blocks from @src into the hash state, consuming leading
 * blocks as key material while the key is still incomplete.
 *
 * While !sset:
 *   - if !rset, the first block is passed to poly1305_init_arm64() and
 *     rset is raised;
 *   - if another full block follows, it is loaded into s[0..3] as four
 *     little-endian words and sset is raised.
 * Whatever remains is truncated to a multiple of POLY1305_BLOCK_SIZE and
 * handed to the NEON or scalar block routine with @hibit passed through.
 *
 * The callers in this file always pass at least one full block in @len.
 * The NEON routine is used only when the have_neon key is enabled and
 * @do_neon is true.
 */
static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src,
				 u32 len, u32 hibit, bool do_neon)
{
	if (unlikely(!dctx->sset)) {
		int i;

		if (!dctx->rset) {
			poly1305_init_arm64(&dctx->h, src);
			src += POLY1305_BLOCK_SIZE;
			len -= POLY1305_BLOCK_SIZE;
			dctx->rset = 1;
		}

		/* Not enough input left to supply the "s" half of the key. */
		if (len < POLY1305_BLOCK_SIZE)
			return;

		for (i = 0; i < 4; i++)
			dctx->s[i] = get_unaligned_le32(src + 4 * i);
		src += POLY1305_BLOCK_SIZE;
		len -= POLY1305_BLOCK_SIZE;
		dctx->sset = true;

		/* Key complete, but no full data block left to process. */
		if (len < POLY1305_BLOCK_SIZE)
			return;
	}

	/* Drop any trailing partial block; the caller buffers it. */
	len -= len % POLY1305_BLOCK_SIZE;

	if (static_branch_likely(&have_neon) && likely(do_neon)) {
		poly1305_blocks_neon(&dctx->h, src, len, hibit);
		return;
	}

	poly1305_blocks(&dctx->h, src, len, hibit);
}
80*4882a593Smuzhiyun
/*
 * Absorb @len bytes from @src into @dctx for the shash interface.
 *
 * Three phases:
 *   1. top up a previously buffered partial block; once it is full it is
 *      processed on its own, always with do_neon == false;
 *   2. pass all remaining input to neon_poly1305_blocks(), which handles
 *      the whole blocks (honouring @do_neon);
 *   3. stash the trailing partial block, if any, in dctx->buf.
 */
static void neon_poly1305_do_update(struct poly1305_desc_ctx *dctx,
				    const u8 *src, u32 len, bool do_neon)
{
	if (unlikely(dctx->buflen)) {
		u32 take = min(len, POLY1305_BLOCK_SIZE - dctx->buflen);

		memcpy(&dctx->buf[dctx->buflen], src, take);
		dctx->buflen += take;
		src += take;
		len -= take;

		if (dctx->buflen == POLY1305_BLOCK_SIZE) {
			neon_poly1305_blocks(dctx, dctx->buf,
					     POLY1305_BLOCK_SIZE, 1, false);
			dctx->buflen = 0;
		}
	}

	if (likely(len >= POLY1305_BLOCK_SIZE)) {
		u32 tail = len % POLY1305_BLOCK_SIZE;

		neon_poly1305_blocks(dctx, src, len, 1, do_neon);
		src += len - tail;
		len = tail;
	}

	if (unlikely(len)) {
		memcpy(dctx->buf, src, len);
		dctx->buflen = len;
	}
}
110*4882a593Smuzhiyun
neon_poly1305_update(struct shash_desc * desc,const u8 * src,unsigned int srclen)111*4882a593Smuzhiyun static int neon_poly1305_update(struct shash_desc *desc,
112*4882a593Smuzhiyun const u8 *src, unsigned int srclen)
113*4882a593Smuzhiyun {
114*4882a593Smuzhiyun bool do_neon = crypto_simd_usable() && srclen > 128;
115*4882a593Smuzhiyun struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);
116*4882a593Smuzhiyun
117*4882a593Smuzhiyun if (static_branch_likely(&have_neon) && do_neon)
118*4882a593Smuzhiyun kernel_neon_begin();
119*4882a593Smuzhiyun neon_poly1305_do_update(dctx, src, srclen, do_neon);
120*4882a593Smuzhiyun if (static_branch_likely(&have_neon) && do_neon)
121*4882a593Smuzhiyun kernel_neon_end();
122*4882a593Smuzhiyun return 0;
123*4882a593Smuzhiyun }
124*4882a593Smuzhiyun
poly1305_update_arch(struct poly1305_desc_ctx * dctx,const u8 * src,unsigned int nbytes)125*4882a593Smuzhiyun void poly1305_update_arch(struct poly1305_desc_ctx *dctx, const u8 *src,
126*4882a593Smuzhiyun unsigned int nbytes)
127*4882a593Smuzhiyun {
128*4882a593Smuzhiyun if (unlikely(dctx->buflen)) {
129*4882a593Smuzhiyun u32 bytes = min(nbytes, POLY1305_BLOCK_SIZE - dctx->buflen);
130*4882a593Smuzhiyun
131*4882a593Smuzhiyun memcpy(dctx->buf + dctx->buflen, src, bytes);
132*4882a593Smuzhiyun src += bytes;
133*4882a593Smuzhiyun nbytes -= bytes;
134*4882a593Smuzhiyun dctx->buflen += bytes;
135*4882a593Smuzhiyun
136*4882a593Smuzhiyun if (dctx->buflen == POLY1305_BLOCK_SIZE) {
137*4882a593Smuzhiyun poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 1);
138*4882a593Smuzhiyun dctx->buflen = 0;
139*4882a593Smuzhiyun }
140*4882a593Smuzhiyun }
141*4882a593Smuzhiyun
142*4882a593Smuzhiyun if (likely(nbytes >= POLY1305_BLOCK_SIZE)) {
143*4882a593Smuzhiyun unsigned int len = round_down(nbytes, POLY1305_BLOCK_SIZE);
144*4882a593Smuzhiyun
145*4882a593Smuzhiyun if (static_branch_likely(&have_neon) && crypto_simd_usable()) {
146*4882a593Smuzhiyun do {
147*4882a593Smuzhiyun unsigned int todo = min_t(unsigned int, len, SZ_4K);
148*4882a593Smuzhiyun
149*4882a593Smuzhiyun kernel_neon_begin();
150*4882a593Smuzhiyun poly1305_blocks_neon(&dctx->h, src, todo, 1);
151*4882a593Smuzhiyun kernel_neon_end();
152*4882a593Smuzhiyun
153*4882a593Smuzhiyun len -= todo;
154*4882a593Smuzhiyun src += todo;
155*4882a593Smuzhiyun } while (len);
156*4882a593Smuzhiyun } else {
157*4882a593Smuzhiyun poly1305_blocks(&dctx->h, src, len, 1);
158*4882a593Smuzhiyun src += len;
159*4882a593Smuzhiyun }
160*4882a593Smuzhiyun nbytes %= POLY1305_BLOCK_SIZE;
161*4882a593Smuzhiyun }
162*4882a593Smuzhiyun
163*4882a593Smuzhiyun if (unlikely(nbytes)) {
164*4882a593Smuzhiyun dctx->buflen = nbytes;
165*4882a593Smuzhiyun memcpy(dctx->buf, src, nbytes);
166*4882a593Smuzhiyun }
167*4882a593Smuzhiyun }
168*4882a593Smuzhiyun EXPORT_SYMBOL(poly1305_update_arch);
169*4882a593Smuzhiyun
/*
 * poly1305_final_arch() - finish the computation and write the digest.
 * @dctx: context holding the accumulated state
 * @dst:  output buffer for the digest
 *
 * If a partial block is still buffered, it is terminated with a single
 * 0x01 byte, zero-padded to POLY1305_BLOCK_SIZE and processed with
 * hibit == 0. The digest is then produced by poly1305_emit() from the
 * state and dctx->s.
 *
 * The whole context is wiped afterwards, so it must be re-initialised
 * before it is used again.
 */
void poly1305_final_arch(struct poly1305_desc_ctx *dctx, u8 *dst)
{
	if (unlikely(dctx->buflen)) {
		dctx->buf[dctx->buflen++] = 1;
		memset(dctx->buf + dctx->buflen, 0,
		       POLY1305_BLOCK_SIZE - dctx->buflen);
		poly1305_blocks(&dctx->h, dctx->buf, POLY1305_BLOCK_SIZE, 0);
	}

	poly1305_emit(&dctx->h, dst, dctx->s);

	/*
	 * The context holds key material. Use memzero_explicit() rather than
	 * a struct assignment so the wipe covers every byte (padding
	 * included) and cannot be dropped by the compiler as a dead store.
	 */
	memzero_explicit(dctx, sizeof(*dctx));
}
EXPORT_SYMBOL(poly1305_final_arch);
183*4882a593Smuzhiyun
/*
 * shash .final hook: emit the digest via poly1305_final_arch().
 *
 * Returns -ENOKEY if the "s" half of the key was never supplied (sset is
 * still false), otherwise 0.
 */
static int neon_poly1305_final(struct shash_desc *desc, u8 *dst)
{
	struct poly1305_desc_ctx *dctx = shash_desc_ctx(desc);

	if (likely(dctx->sset)) {
		poly1305_final_arch(dctx, dst);
		return 0;
	}

	return -ENOKEY;
}
194*4882a593Smuzhiyun
195*4882a593Smuzhiyun static struct shash_alg neon_poly1305_alg = {
196*4882a593Smuzhiyun .init = neon_poly1305_init,
197*4882a593Smuzhiyun .update = neon_poly1305_update,
198*4882a593Smuzhiyun .final = neon_poly1305_final,
199*4882a593Smuzhiyun .digestsize = POLY1305_DIGEST_SIZE,
200*4882a593Smuzhiyun .descsize = sizeof(struct poly1305_desc_ctx),
201*4882a593Smuzhiyun
202*4882a593Smuzhiyun .base.cra_name = "poly1305",
203*4882a593Smuzhiyun .base.cra_driver_name = "poly1305-neon",
204*4882a593Smuzhiyun .base.cra_priority = 200,
205*4882a593Smuzhiyun .base.cra_blocksize = POLY1305_BLOCK_SIZE,
206*4882a593Smuzhiyun .base.cra_module = THIS_MODULE,
207*4882a593Smuzhiyun };
208*4882a593Smuzhiyun
neon_poly1305_mod_init(void)209*4882a593Smuzhiyun static int __init neon_poly1305_mod_init(void)
210*4882a593Smuzhiyun {
211*4882a593Smuzhiyun if (!cpu_have_named_feature(ASIMD))
212*4882a593Smuzhiyun return 0;
213*4882a593Smuzhiyun
214*4882a593Smuzhiyun static_branch_enable(&have_neon);
215*4882a593Smuzhiyun
216*4882a593Smuzhiyun return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
217*4882a593Smuzhiyun crypto_register_shash(&neon_poly1305_alg) : 0;
218*4882a593Smuzhiyun }
219*4882a593Smuzhiyun
neon_poly1305_mod_exit(void)220*4882a593Smuzhiyun static void __exit neon_poly1305_mod_exit(void)
221*4882a593Smuzhiyun {
222*4882a593Smuzhiyun if (IS_REACHABLE(CONFIG_CRYPTO_HASH) && cpu_have_named_feature(ASIMD))
223*4882a593Smuzhiyun crypto_unregister_shash(&neon_poly1305_alg);
224*4882a593Smuzhiyun }
225*4882a593Smuzhiyun
226*4882a593Smuzhiyun module_init(neon_poly1305_mod_init);
227*4882a593Smuzhiyun module_exit(neon_poly1305_mod_exit);
228*4882a593Smuzhiyun
229*4882a593Smuzhiyun MODULE_LICENSE("GPL v2");
230*4882a593Smuzhiyun MODULE_ALIAS_CRYPTO("poly1305");
231*4882a593Smuzhiyun MODULE_ALIAS_CRYPTO("poly1305-neon");
232