// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */
12*4882a593Smuzhiyun
13*4882a593Smuzhiyun #include <asm/unaligned.h>
14*4882a593Smuzhiyun #include <crypto/curve25519.h>
15*4882a593Smuzhiyun #include <linux/string.h>
16*4882a593Smuzhiyun
/* GCC/Clang 128-bit integer, used as a double-width multiply accumulator. */
typedef __uint128_t u128;
18*4882a593Smuzhiyun
u64_eq_mask(u64 a,u64 b)19*4882a593Smuzhiyun static __always_inline u64 u64_eq_mask(u64 a, u64 b)
20*4882a593Smuzhiyun {
21*4882a593Smuzhiyun u64 x = a ^ b;
22*4882a593Smuzhiyun u64 minus_x = ~x + (u64)1U;
23*4882a593Smuzhiyun u64 x_or_minus_x = x | minus_x;
24*4882a593Smuzhiyun u64 xnx = x_or_minus_x >> (u32)63U;
25*4882a593Smuzhiyun u64 c = xnx - (u64)1U;
26*4882a593Smuzhiyun return c;
27*4882a593Smuzhiyun }
28*4882a593Smuzhiyun
u64_gte_mask(u64 a,u64 b)29*4882a593Smuzhiyun static __always_inline u64 u64_gte_mask(u64 a, u64 b)
30*4882a593Smuzhiyun {
31*4882a593Smuzhiyun u64 x = a;
32*4882a593Smuzhiyun u64 y = b;
33*4882a593Smuzhiyun u64 x_xor_y = x ^ y;
34*4882a593Smuzhiyun u64 x_sub_y = x - y;
35*4882a593Smuzhiyun u64 x_sub_y_xor_y = x_sub_y ^ y;
36*4882a593Smuzhiyun u64 q = x_xor_y | x_sub_y_xor_y;
37*4882a593Smuzhiyun u64 x_xor_q = x ^ q;
38*4882a593Smuzhiyun u64 x_xor_q_ = x_xor_q >> (u32)63U;
39*4882a593Smuzhiyun u64 c = x_xor_q_ - (u64)1U;
40*4882a593Smuzhiyun return c;
41*4882a593Smuzhiyun }
42*4882a593Smuzhiyun
/*
 * Fold the overflow above bit 51 of the top limb back into limb 0.
 *
 * Elements of GF(2^255 - 19) are held in five 51-bit limbs, so
 * 2^255 == 19 (mod p): the bits of b[4] from position 51 upward are
 * worth 19 times their value when added into b[0].
 */
static __always_inline void modulo_carry_top(u64 *b)
{
        u64 b4 = b[4];
        u64 b0 = b[0];
        u64 b4_ = b4 & 0x7ffffffffffffLLU;      /* keep the low 51 bits */
        u64 b0_ = b0 + 19 * (b4 >> 51);         /* fold carry * 19 into limb 0 */
        b[4] = b4_;
        b[0] = b0_;
}
52*4882a593Smuzhiyun
fproduct_copy_from_wide_(u64 * output,u128 * input)53*4882a593Smuzhiyun static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
54*4882a593Smuzhiyun {
55*4882a593Smuzhiyun {
56*4882a593Smuzhiyun u128 xi = input[0];
57*4882a593Smuzhiyun output[0] = ((u64)(xi));
58*4882a593Smuzhiyun }
59*4882a593Smuzhiyun {
60*4882a593Smuzhiyun u128 xi = input[1];
61*4882a593Smuzhiyun output[1] = ((u64)(xi));
62*4882a593Smuzhiyun }
63*4882a593Smuzhiyun {
64*4882a593Smuzhiyun u128 xi = input[2];
65*4882a593Smuzhiyun output[2] = ((u64)(xi));
66*4882a593Smuzhiyun }
67*4882a593Smuzhiyun {
68*4882a593Smuzhiyun u128 xi = input[3];
69*4882a593Smuzhiyun output[3] = ((u64)(xi));
70*4882a593Smuzhiyun }
71*4882a593Smuzhiyun {
72*4882a593Smuzhiyun u128 xi = input[4];
73*4882a593Smuzhiyun output[4] = ((u64)(xi));
74*4882a593Smuzhiyun }
75*4882a593Smuzhiyun }
76*4882a593Smuzhiyun
77*4882a593Smuzhiyun static __always_inline void
fproduct_sum_scalar_multiplication_(u128 * output,u64 * input,u64 s)78*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
79*4882a593Smuzhiyun {
80*4882a593Smuzhiyun output[0] += (u128)input[0] * s;
81*4882a593Smuzhiyun output[1] += (u128)input[1] * s;
82*4882a593Smuzhiyun output[2] += (u128)input[2] * s;
83*4882a593Smuzhiyun output[3] += (u128)input[3] * s;
84*4882a593Smuzhiyun output[4] += (u128)input[4] * s;
85*4882a593Smuzhiyun }
86*4882a593Smuzhiyun
fproduct_carry_wide_(u128 * tmp)87*4882a593Smuzhiyun static __always_inline void fproduct_carry_wide_(u128 *tmp)
88*4882a593Smuzhiyun {
89*4882a593Smuzhiyun {
90*4882a593Smuzhiyun u32 ctr = 0;
91*4882a593Smuzhiyun u128 tctr = tmp[ctr];
92*4882a593Smuzhiyun u128 tctrp1 = tmp[ctr + 1];
93*4882a593Smuzhiyun u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
94*4882a593Smuzhiyun u128 c = ((tctr) >> (51));
95*4882a593Smuzhiyun tmp[ctr] = ((u128)(r0));
96*4882a593Smuzhiyun tmp[ctr + 1] = ((tctrp1) + (c));
97*4882a593Smuzhiyun }
98*4882a593Smuzhiyun {
99*4882a593Smuzhiyun u32 ctr = 1;
100*4882a593Smuzhiyun u128 tctr = tmp[ctr];
101*4882a593Smuzhiyun u128 tctrp1 = tmp[ctr + 1];
102*4882a593Smuzhiyun u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
103*4882a593Smuzhiyun u128 c = ((tctr) >> (51));
104*4882a593Smuzhiyun tmp[ctr] = ((u128)(r0));
105*4882a593Smuzhiyun tmp[ctr + 1] = ((tctrp1) + (c));
106*4882a593Smuzhiyun }
107*4882a593Smuzhiyun
108*4882a593Smuzhiyun {
109*4882a593Smuzhiyun u32 ctr = 2;
110*4882a593Smuzhiyun u128 tctr = tmp[ctr];
111*4882a593Smuzhiyun u128 tctrp1 = tmp[ctr + 1];
112*4882a593Smuzhiyun u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
113*4882a593Smuzhiyun u128 c = ((tctr) >> (51));
114*4882a593Smuzhiyun tmp[ctr] = ((u128)(r0));
115*4882a593Smuzhiyun tmp[ctr + 1] = ((tctrp1) + (c));
116*4882a593Smuzhiyun }
117*4882a593Smuzhiyun {
118*4882a593Smuzhiyun u32 ctr = 3;
119*4882a593Smuzhiyun u128 tctr = tmp[ctr];
120*4882a593Smuzhiyun u128 tctrp1 = tmp[ctr + 1];
121*4882a593Smuzhiyun u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
122*4882a593Smuzhiyun u128 c = ((tctr) >> (51));
123*4882a593Smuzhiyun tmp[ctr] = ((u128)(r0));
124*4882a593Smuzhiyun tmp[ctr + 1] = ((tctrp1) + (c));
125*4882a593Smuzhiyun }
126*4882a593Smuzhiyun }
127*4882a593Smuzhiyun
fmul_shift_reduce(u64 * output)128*4882a593Smuzhiyun static __always_inline void fmul_shift_reduce(u64 *output)
129*4882a593Smuzhiyun {
130*4882a593Smuzhiyun u64 tmp = output[4];
131*4882a593Smuzhiyun u64 b0;
132*4882a593Smuzhiyun {
133*4882a593Smuzhiyun u32 ctr = 5 - 0 - 1;
134*4882a593Smuzhiyun u64 z = output[ctr - 1];
135*4882a593Smuzhiyun output[ctr] = z;
136*4882a593Smuzhiyun }
137*4882a593Smuzhiyun {
138*4882a593Smuzhiyun u32 ctr = 5 - 1 - 1;
139*4882a593Smuzhiyun u64 z = output[ctr - 1];
140*4882a593Smuzhiyun output[ctr] = z;
141*4882a593Smuzhiyun }
142*4882a593Smuzhiyun {
143*4882a593Smuzhiyun u32 ctr = 5 - 2 - 1;
144*4882a593Smuzhiyun u64 z = output[ctr - 1];
145*4882a593Smuzhiyun output[ctr] = z;
146*4882a593Smuzhiyun }
147*4882a593Smuzhiyun {
148*4882a593Smuzhiyun u32 ctr = 5 - 3 - 1;
149*4882a593Smuzhiyun u64 z = output[ctr - 1];
150*4882a593Smuzhiyun output[ctr] = z;
151*4882a593Smuzhiyun }
152*4882a593Smuzhiyun output[0] = tmp;
153*4882a593Smuzhiyun b0 = output[0];
154*4882a593Smuzhiyun output[0] = 19 * b0;
155*4882a593Smuzhiyun }
156*4882a593Smuzhiyun
fmul_mul_shift_reduce_(u128 * output,u64 * input,u64 * input21)157*4882a593Smuzhiyun static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
158*4882a593Smuzhiyun u64 *input21)
159*4882a593Smuzhiyun {
160*4882a593Smuzhiyun u32 i;
161*4882a593Smuzhiyun u64 input2i;
162*4882a593Smuzhiyun {
163*4882a593Smuzhiyun u64 input2i = input21[0];
164*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(output, input, input2i);
165*4882a593Smuzhiyun fmul_shift_reduce(input);
166*4882a593Smuzhiyun }
167*4882a593Smuzhiyun {
168*4882a593Smuzhiyun u64 input2i = input21[1];
169*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(output, input, input2i);
170*4882a593Smuzhiyun fmul_shift_reduce(input);
171*4882a593Smuzhiyun }
172*4882a593Smuzhiyun {
173*4882a593Smuzhiyun u64 input2i = input21[2];
174*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(output, input, input2i);
175*4882a593Smuzhiyun fmul_shift_reduce(input);
176*4882a593Smuzhiyun }
177*4882a593Smuzhiyun {
178*4882a593Smuzhiyun u64 input2i = input21[3];
179*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(output, input, input2i);
180*4882a593Smuzhiyun fmul_shift_reduce(input);
181*4882a593Smuzhiyun }
182*4882a593Smuzhiyun i = 4;
183*4882a593Smuzhiyun input2i = input21[i];
184*4882a593Smuzhiyun fproduct_sum_scalar_multiplication_(output, input, input2i);
185*4882a593Smuzhiyun }
186*4882a593Smuzhiyun
/*
 * Field multiplication: output = input * input21 in GF(2^255 - 19).
 *
 * 'input' is copied to a scratch array first because
 * fmul_mul_shift_reduce_() destroys its second argument; this also
 * lets 'output' alias either operand.  After the wide multiply the
 * limbs are carried, the overflow above bit 51 of the top limb is
 * folded back into limb 0 (times 19, as 2^255 == 19 mod p), and one
 * extra limb0 -> limb1 carry keeps every limb within bounds.
 */
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
        u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
        {
                u128 b4;
                u128 b0;
                u128 b4_;
                u128 b0_;
                u64 i0;
                u64 i1;
                u64 i0_;
                u64 i1_;
                u128 t[5] = { 0 };
                fmul_mul_shift_reduce_(t, tmp, input21);
                fproduct_carry_wide_(t);
                b4 = t[4];
                b0 = t[0];
                /* Fold bits >= 51 of the top limb back into limb 0 (* 19). */
                b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
                b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
                t[4] = b4_;
                t[0] = b0_;
                fproduct_copy_from_wide_(output, t);
                /* One final carry from limb 0 into limb 1. */
                i0 = output[0];
                i1 = output[1];
                i0_ = i0 & 0x7ffffffffffffLLU;
                i1_ = i1 + (i0 >> 51);
                output[0] = i0_;
                output[1] = i1_;
        }
}
217*4882a593Smuzhiyun
/*
 * Raw field squaring into wide limbs: tmp = output^2, not yet carried.
 *
 * Standard 5x51-bit squaring: symmetric cross terms are doubled (d0,
 * d1, d4), and products that wrap past 2^255 carry a pre-multiplied
 * factor of 19 (d2 = 2*19*r2, d419 = 19*r4), since 2^255 == 19 mod p.
 */
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
        u64 r0 = output[0];
        u64 r1 = output[1];
        u64 r2 = output[2];
        u64 r3 = output[3];
        u64 r4 = output[4];
        u64 d0 = r0 * 2;
        u64 d1 = r1 * 2;
        u64 d2 = r2 * 2 * 19;
        u64 d419 = r4 * 19;
        u64 d4 = d419 * 2;
        u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
                   (((u128)(d2) * (r3))));
        u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
                   (((u128)(r3 * 19) * (r3))));
        u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
                   (((u128)(d4) * (r3))));
        u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
                   (((u128)(r4) * (d419))));
        u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
                   (((u128)(r2) * (r2))));
        tmp[0] = s0;
        tmp[1] = s1;
        tmp[2] = s2;
        tmp[3] = s3;
        tmp[4] = s4;
}
246*4882a593Smuzhiyun
/*
 * One full field squaring: output = output^2 (mod 2^255 - 19), using
 * 'tmp' as the wide scratch accumulator.  Same carry/fold tail as
 * fmul_fmul(): carry the wide limbs, fold the top-limb overflow back
 * into limb 0 times 19, then one extra limb0 -> limb1 carry.
 */
static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
        u128 b4;
        u128 b0;
        u128 b4_;
        u128 b0_;
        u64 i0;
        u64 i1;
        u64 i0_;
        u64 i1_;
        fsquare_fsquare__(tmp, output);
        fproduct_carry_wide_(tmp);
        b4 = tmp[4];
        b0 = tmp[0];
        /* Fold bits >= 51 of the top limb back into limb 0 (* 19). */
        b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
        b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
        tmp[4] = b4_;
        tmp[0] = b0_;
        fproduct_copy_from_wide_(output, tmp);
        /* One final carry from limb 0 into limb 1. */
        i0 = output[0];
        i1 = output[1];
        i0_ = i0 & 0x7ffffffffffffLLU;
        i1_ = i1 + (i0 >> 51);
        output[0] = i0_;
        output[1] = i1_;
}
273*4882a593Smuzhiyun
/*
 * Square 'output' in place count1 times, reusing 'tmp' as wide
 * scratch.  Note: squares once even if count1 == 0; every caller in
 * this file passes count1 >= 1.
 */
static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
                                                   u32 count1)
{
        u32 i;
        fsquare_fsquare_(tmp, output);
        for (i = 1; i < count1; ++i)
                fsquare_fsquare_(tmp, output);
}
282*4882a593Smuzhiyun
/*
 * output = input^(2^count1): copy input, then square repeatedly.
 * 'input' is preserved; 't' is uninitialized scratch, fully written by
 * the first squaring before it is read.
 */
static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
                                                  u32 count1)
{
        u128 t[5];
        memcpy(output, input, 5 * sizeof(*input));
        fsquare_fsquare_times_(output, t, count1);
}
290*4882a593Smuzhiyun
/* output = output^(2^count1), squaring in place (no copy of the input). */
static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
                                                          u32 count1)
{
        u128 t[5];
        fsquare_fsquare_times_(output, t, count1);
}
297*4882a593Smuzhiyun
/*
 * Field inversion out = z^-1 = z^(p - 2) mod p, p = 2^255 - 19, by
 * Fermat's little theorem with the usual fixed addition chain for the
 * exponent 2^255 - 21 (constant time, no secret-dependent branches).
 * buf holds four field elements of five limbs each:
 * a = buf, t0 = buf + 5, b = buf + 10, c = buf + 15.
 */
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
        u64 buf[20] = { 0 };
        u64 *a0 = buf;
        u64 *t00 = buf + 5;
        u64 *b0 = buf + 10;
        u64 *t01;
        u64 *b1;
        u64 *c0;
        u64 *a;
        u64 *t0;
        u64 *b;
        u64 *c;
        fsquare_fsquare_times(a0, z, 1);        /* a = z^2 */
        fsquare_fsquare_times(t00, a0, 2);      /* t0 = z^8 */
        fmul_fmul(b0, t00, z);                  /* b = z^9 */
        fmul_fmul(a0, b0, a0);                  /* a = z^11 */
        fsquare_fsquare_times(t00, a0, 1);      /* t0 = z^22 */
        fmul_fmul(b0, t00, b0);                 /* b = z^31 = z^(2^5 - 1) */
        fsquare_fsquare_times(t00, b0, 5);      /* t0 = z^(2^10 - 2^5) */
        t01 = buf + 5;
        b1 = buf + 10;
        c0 = buf + 15;
        fmul_fmul(b1, t01, b1);                 /* b = z^(2^10 - 1) */
        fsquare_fsquare_times(t01, b1, 10);
        fmul_fmul(c0, t01, b1);                 /* c = z^(2^20 - 1) */
        fsquare_fsquare_times(t01, c0, 20);
        fmul_fmul(t01, t01, c0);                /* t0 = z^(2^40 - 1) */
        fsquare_fsquare_times_inplace(t01, 10);
        fmul_fmul(b1, t01, b1);                 /* b = z^(2^50 - 1) */
        fsquare_fsquare_times(t01, b1, 50);
        a = buf;
        t0 = buf + 5;
        b = buf + 10;
        c = buf + 15;
        fmul_fmul(c, t0, b);                    /* c = z^(2^100 - 1) */
        fsquare_fsquare_times(t0, c, 100);
        fmul_fmul(t0, t0, c);                   /* t0 = z^(2^200 - 1) */
        fsquare_fsquare_times_inplace(t0, 50);
        fmul_fmul(t0, t0, b);                   /* t0 = z^(2^250 - 1) */
        fsquare_fsquare_times_inplace(t0, 5);
        fmul_fmul(out, t0, a);                  /* out = z^(2^255 - 21) */
}
341*4882a593Smuzhiyun
fsum(u64 * a,u64 * b)342*4882a593Smuzhiyun static __always_inline void fsum(u64 *a, u64 *b)
343*4882a593Smuzhiyun {
344*4882a593Smuzhiyun a[0] += b[0];
345*4882a593Smuzhiyun a[1] += b[1];
346*4882a593Smuzhiyun a[2] += b[2];
347*4882a593Smuzhiyun a[3] += b[3];
348*4882a593Smuzhiyun a[4] += b[4];
349*4882a593Smuzhiyun }
350*4882a593Smuzhiyun
/*
 * Field subtraction a = b - a (mod p).  Note the operand order: the
 * result overwrites the *subtrahend*.  8*p is added to b first so the
 * per-limb subtraction can never underflow; the constants below are
 * the limbs of 8 * (2^255 - 19): 8*(2^51 - 19) for limb 0 and
 * 8*(2^51 - 1) for limbs 1..4.
 */
static __always_inline void fdifference(u64 *a, u64 *b)
{
        u64 tmp[5] = { 0 };
        u64 b0;
        u64 b1;
        u64 b2;
        u64 b3;
        u64 b4;
        memcpy(tmp, b, 5 * sizeof(*b));
        b0 = tmp[0];
        b1 = tmp[1];
        b2 = tmp[2];
        b3 = tmp[3];
        b4 = tmp[4];
        /* tmp = b + 8*p, so tmp[i] - a[i] cannot underflow below. */
        tmp[0] = b0 + 0x3fffffffffff68LLU;
        tmp[1] = b1 + 0x3ffffffffffff8LLU;
        tmp[2] = b2 + 0x3ffffffffffff8LLU;
        tmp[3] = b3 + 0x3ffffffffffff8LLU;
        tmp[4] = b4 + 0x3ffffffffffff8LLU;
        {
                u64 xi = a[0];
                u64 yi = tmp[0];
                a[0] = yi - xi;
        }
        {
                u64 xi = a[1];
                u64 yi = tmp[1];
                a[1] = yi - xi;
        }
        {
                u64 xi = a[2];
                u64 yi = tmp[2];
                a[2] = yi - xi;
        }
        {
                u64 xi = a[3];
                u64 yi = tmp[3];
                a[3] = yi - xi;
        }
        {
                u64 xi = a[4];
                u64 yi = tmp[4];
                a[4] = yi - xi;
        }
}
396*4882a593Smuzhiyun
/*
 * output = b * s (mod p) for a 64-bit scalar s.  Each limb is
 * multiplied into a 128-bit temporary, the wide limbs are carried, and
 * the overflow above bit 51 of the top limb is folded back into limb 0
 * times 19 (2^255 == 19 mod p) before narrowing to 64-bit limbs.
 */
static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
        u128 tmp[5];
        u128 b4;
        u128 b0;
        u128 b4_;
        u128 b0_;
        {
                u64 xi = b[0];
                tmp[0] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[1];
                tmp[1] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[2];
                tmp[2] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[3];
                tmp[3] = ((u128)(xi) * (s));
        }
        {
                u64 xi = b[4];
                tmp[4] = ((u128)(xi) * (s));
        }
        fproduct_carry_wide_(tmp);
        b4 = tmp[4];
        b0 = tmp[0];
        /* Fold bits >= 51 of the top limb back into limb 0 (* 19). */
        b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
        b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
        tmp[4] = b4_;
        tmp[0] = b0_;
        fproduct_copy_from_wide_(output, tmp);
}
433*4882a593Smuzhiyun
/*
 * output = a * b in GF(2^255 - 19).  fmul_fmul() copies 'a' internally,
 * so 'output' may alias either operand.
 */
static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
        fmul_fmul(output, a, b);
}
438*4882a593Smuzhiyun
/* output = input^-1 in GF(2^255 - 19); thin wrapper around crecip_crecip(). */
static __always_inline void crecip(u64 *output, u64 *input)
{
        crecip_crecip(output, input);
}
443*4882a593Smuzhiyun
point_swap_conditional_step(u64 * a,u64 * b,u64 swap1,u32 ctr)444*4882a593Smuzhiyun static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
445*4882a593Smuzhiyun u64 swap1, u32 ctr)
446*4882a593Smuzhiyun {
447*4882a593Smuzhiyun u32 i = ctr - 1;
448*4882a593Smuzhiyun u64 ai = a[i];
449*4882a593Smuzhiyun u64 bi = b[i];
450*4882a593Smuzhiyun u64 x = swap1 & (ai ^ bi);
451*4882a593Smuzhiyun u64 ai1 = ai ^ x;
452*4882a593Smuzhiyun u64 bi1 = bi ^ x;
453*4882a593Smuzhiyun a[i] = ai1;
454*4882a593Smuzhiyun b[i] = bi1;
455*4882a593Smuzhiyun }
456*4882a593Smuzhiyun
point_swap_conditional5(u64 * a,u64 * b,u64 swap1)457*4882a593Smuzhiyun static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
458*4882a593Smuzhiyun {
459*4882a593Smuzhiyun point_swap_conditional_step(a, b, swap1, 5);
460*4882a593Smuzhiyun point_swap_conditional_step(a, b, swap1, 4);
461*4882a593Smuzhiyun point_swap_conditional_step(a, b, swap1, 3);
462*4882a593Smuzhiyun point_swap_conditional_step(a, b, swap1, 2);
463*4882a593Smuzhiyun point_swap_conditional_step(a, b, swap1, 1);
464*4882a593Smuzhiyun }
465*4882a593Smuzhiyun
/*
 * Swap the two 10-limb projective points a and b iff iswap == 1, in
 * constant time: 0 - iswap expands the bit into an all-ones/all-zeroes
 * mask covering both five-limb coordinates.
 */
static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
        u64 swap1 = 0 - iswap;
        point_swap_conditional5(a, b, swap1);
        point_swap_conditional5(a + 5, b + 5, swap1);
}
472*4882a593Smuzhiyun
/*
 * Copy a projective point: two five-limb coordinates stored
 * contiguously, 10 limbs in total.
 */
static __always_inline void point_copy(u64 *output, u64 *input)
{
        memcpy(output, input, 10 * sizeof(*input));
}
478*4882a593Smuzhiyun
/*
 * Combined Montgomery x-only "add and double" ladder step:
 * given p = (x, z), pq = (xprime, zprime) and qx, the affine
 * x-coordinate of the fixed difference point, compute
 *   pp  = (x2, z2) = doubling of p
 *   ppq = (x3, z3) = differential addition p + pq.
 * 121665 = (A - 2) / 4 for curve25519's A = 486662.  buf holds eight
 * five-limb temporaries: origx, origxprime, zzz, xx, zz, xxprime,
 * zzprime, zzzprime.  Constant time throughout.
 */
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
                                                u64 *pq, u64 *qmqp)
{
        u64 *qx = qmqp;
        u64 *x2 = pp;
        u64 *z2 = pp + 5;
        u64 *x3 = ppq;
        u64 *z3 = ppq + 5;
        u64 *x = p;
        u64 *z = p + 5;
        u64 *xprime = pq;
        u64 *zprime = pq + 5;
        u64 buf[40] = { 0 };
        u64 *origx = buf;
        u64 *origxprime0 = buf + 5;
        u64 *xxprime0;
        u64 *zzprime0;
        u64 *origxprime;
        xxprime0 = buf + 25;
        zzprime0 = buf + 30;
        memcpy(origx, x, 5 * sizeof(*x));
        fsum(x, z);                             /* x <- x + z */
        fdifference(z, origx);                  /* z <- x_orig - z */
        memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
        fsum(xprime, zprime);                   /* xprime <- xprime + zprime */
        fdifference(zprime, origxprime0);       /* zprime <- xprime_orig - zprime */
        fmul(xxprime0, xprime, z);
        fmul(zzprime0, x, zprime);
        origxprime = buf + 5;
        {
                u64 *xx0;
                u64 *zz0;
                u64 *xxprime;
                u64 *zzprime;
                u64 *zzzprime;
                xx0 = buf + 15;
                zz0 = buf + 20;
                xxprime = buf + 25;
                zzprime = buf + 30;
                zzzprime = buf + 35;
                memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
                fsum(xxprime, zzprime);
                fdifference(zzprime, origxprime);
                /* Addition outputs: x3 = (sum)^2, z3 = qx * (diff)^2. */
                fsquare_fsquare_times(x3, xxprime, 1);
                fsquare_fsquare_times(zzzprime, zzprime, 1);
                fmul(z3, zzzprime, qx);
                fsquare_fsquare_times(xx0, x, 1);
                fsquare_fsquare_times(zz0, z, 1);
                {
                        u64 *zzz;
                        u64 *xx;
                        u64 *zz;
                        u64 scalar;
                        zzz = buf + 10;
                        xx = buf + 15;
                        zz = buf + 20;
                        /* Doubling outputs: x2 = xx*zz,
                         * z2 = (xx - zz) * (xx + 121665*(xx - zz)).
                         */
                        fmul(x2, xx, zz);
                        fdifference(zz, xx);
                        scalar = 121665;
                        fscalar(zzz, zz, scalar);
                        fsum(zzz, xx);
                        fmul(z2, zzz, zz);
                }
        }
}
544*4882a593Smuzhiyun
/*
 * One ladder step driven by the most significant bit of 'byt':
 * conditionally swap (nq, nqpq) on the bit, run the add-and-double
 * into (nq2, nqpq2), then conditionally swap the results back on the
 * same bit.  Constant time: the bit only feeds masked swaps.
 */
static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
                                       u64 *q, u8 byt)
{
        u64 bit0 = (u64)(byt >> 7);
        u64 bit;
        point_swap_conditional(nq, nqpq, bit0);
        addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
        bit = (u64)(byt >> 7);
        point_swap_conditional(nq2, nqpq2, bit);
}
556*4882a593Smuzhiyun
/*
 * Process the top two bits of 'byt': one step with the roles of
 * (nq, nqpq) and (nq2, nqpq2) swapped for the second bit, so the
 * state ends up back in (nq, nqpq).
 */
static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
                                              u64 *nqpq2, u64 *q, u8 byt)
{
        u8 byt1;
        ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
        byt1 = byt << 1;        /* expose the next bit at position 7 */
        ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}
566*4882a593Smuzhiyun
/*
 * Run i double-steps, consuming two bits of 'byt' per iteration, most
 * significant bits first (callers pass i = 4 for a whole byte).
 */
static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
                                  u64 *q, u8 byt, u32 i)
{
        while (i--) {
                ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
                                                              nqpq2, q, byt);
                byt <<= 2;      /* advance past the two bits just used */
        }
}
577*4882a593Smuzhiyun
/*
 * Consume the i scalar bytes of n1 from the highest index down, each
 * byte driving four double-steps (eight ladder bits, MSB first).
 */
static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
                                                          u64 *nqpq, u64 *nq2,
                                                          u64 *nqpq2, u64 *q,
                                                          u32 i)
{
        while (i--) {
                u8 byte = n1[i];
                ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
                                                  byte, 4);
        }
}
589*4882a593Smuzhiyun
/*
 * Montgomery-ladder scalar multiplication: result = n1 * q, where q
 * and result are x-only projective points (x, z) of five 51-bit limbs
 * each.  nq starts as the identity representation (x = 1, z = 0 via
 * the zeroed buffer) and nqpq as a copy of q; 32 bytes of scalar are
 * processed.
 */
static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
        u64 point_buf[40] = { 0 };
        u64 *nq = point_buf;
        u64 *nqpq = point_buf + 10;
        u64 *nq2 = point_buf + 20;
        u64 *nqpq2 = point_buf + 30;
        point_copy(nqpq, q);
        nq[0] = 1;
        ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
        point_copy(result, nq);
}
602*4882a593Smuzhiyun
/*
 * Unpack a 32-byte little-endian field element into five 51-bit limbs.
 * The five 64-bit loads at byte offsets 0, 6, 12, 19 and 24 overlap;
 * the right shifts of 0, 3, 6, 1 and 12 bits align each load on a
 * 51-bit limb boundary (bits 0, 51, 102, 153, 204), and the final
 * mask also discards bit 255 of the input.
 */
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
        const u8 *x00 = input + 6;
        const u8 *x01 = input + 12;
        const u8 *x02 = input + 19;
        const u8 *x0 = input + 24;
        u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
        i0 = get_unaligned_le64(input);
        i1 = get_unaligned_le64(x00);
        i2 = get_unaligned_le64(x01);
        i3 = get_unaligned_le64(x02);
        i4 = get_unaligned_le64(x0);
        output0 = i0 & 0x7ffffffffffffLLU;
        output1 = i1 >> 3 & 0x7ffffffffffffLLU;
        output2 = i2 >> 6 & 0x7ffffffffffffLLU;
        output3 = i3 >> 1 & 0x7ffffffffffffLLU;
        output4 = i4 >> 12 & 0x7ffffffffffffLLU;
        output[0] = output0;
        output[1] = output1;
        output[2] = output2;
        output[3] = output3;
        output[4] = output4;
}
626*4882a593Smuzhiyun
/*
 * One sequential carry pass limb0 -> limb1 -> ... -> limb4, masking
 * limbs 0..3 back to 51 bits.  The top limb keeps its carry; that is
 * folded modulo p by modulo_carry_top() in the *_full() wrapper.
 */
static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
        u64 t0 = input[0];
        u64 t1 = input[1];
        u64 t2 = input[2];
        u64 t3 = input[3];
        u64 t4 = input[4];
        u64 t1_ = t1 + (t0 >> 51);
        u64 t0_ = t0 & 0x7ffffffffffffLLU;
        u64 t2_ = t2 + (t1_ >> 51);
        u64 t1__ = t1_ & 0x7ffffffffffffLLU;
        u64 t3_ = t3 + (t2_ >> 51);
        u64 t2__ = t2_ & 0x7ffffffffffffLLU;
        u64 t4_ = t4 + (t3_ >> 51);
        u64 t3__ = t3_ & 0x7ffffffffffffLLU;
        input[0] = t0_;
        input[1] = t1__;
        input[2] = t2__;
        input[3] = t3__;
        input[4] = t4_;
}
648*4882a593Smuzhiyun
/* First carry pass followed by folding the top-limb overflow mod p. */
static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
        format_fcontract_first_carry_pass(input);
        modulo_carry_top(input);
}
654*4882a593Smuzhiyun
/*
 * Second sequential carry pass, identical in structure to the first:
 * carry limb0 -> ... -> limb4 and mask limbs 0..3 to 51 bits; the top
 * limb's overflow is again handled by modulo_carry_top() afterwards.
 */
static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
        u64 t0 = input[0];
        u64 t1 = input[1];
        u64 t2 = input[2];
        u64 t3 = input[3];
        u64 t4 = input[4];
        u64 t1_ = t1 + (t0 >> 51);
        u64 t0_ = t0 & 0x7ffffffffffffLLU;
        u64 t2_ = t2 + (t1_ >> 51);
        u64 t1__ = t1_ & 0x7ffffffffffffLLU;
        u64 t3_ = t3 + (t2_ >> 51);
        u64 t2__ = t2_ & 0x7ffffffffffffLLU;
        u64 t4_ = t4 + (t3_ >> 51);
        u64 t3__ = t3_ & 0x7ffffffffffffLLU;
        input[0] = t0_;
        input[1] = t1__;
        input[2] = t2__;
        input[3] = t3__;
        input[4] = t4_;
}
676*4882a593Smuzhiyun
/*
 * Second carry pass plus top-limb fold, then one extra limb0 -> limb1
 * carry so every limb is strictly below 2^51 ahead of the final trim.
 */
static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
        u64 i0;
        u64 i1;
        u64 i0_;
        u64 i1_;
        format_fcontract_second_carry_pass(input);
        modulo_carry_top(input);
        i0 = input[0];
        i1 = input[1];
        i0_ = i0 & 0x7ffffffffffffLLU;
        i1_ = i1 + (i0 >> 51);
        input[0] = i0_;
        input[1] = i1_;
}
692*4882a593Smuzhiyun
/*
 * Final constant-time reduction: subtract p = 2^255 - 19 exactly when
 * the value is >= p.  'mask' is all-ones iff a0 >= 2^51 - 19 and
 * a1..a4 all equal 2^51 - 1; the masked constants subtracted below are
 * the limbs of p (2^51 - 19, then 2^51 - 1 four times).
 */
static __always_inline void format_fcontract_trim(u64 *input)
{
        u64 a0 = input[0];
        u64 a1 = input[1];
        u64 a2 = input[2];
        u64 a3 = input[3];
        u64 a4 = input[4];
        u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
        u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
        u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
        u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
        u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
        u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
        u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
        u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
        u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
        u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
        u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
        input[0] = a0_;
        input[1] = a1_;
        input[2] = a2_;
        input[3] = a3_;
        input[4] = a4_;
}
717*4882a593Smuzhiyun
format_fcontract_store(u8 * output,u64 * input)718*4882a593Smuzhiyun static __always_inline void format_fcontract_store(u8 *output, u64 *input)
719*4882a593Smuzhiyun {
720*4882a593Smuzhiyun u64 t0 = input[0];
721*4882a593Smuzhiyun u64 t1 = input[1];
722*4882a593Smuzhiyun u64 t2 = input[2];
723*4882a593Smuzhiyun u64 t3 = input[3];
724*4882a593Smuzhiyun u64 t4 = input[4];
725*4882a593Smuzhiyun u64 o0 = t1 << 51 | t0;
726*4882a593Smuzhiyun u64 o1 = t2 << 38 | t1 >> 13;
727*4882a593Smuzhiyun u64 o2 = t3 << 25 | t2 >> 26;
728*4882a593Smuzhiyun u64 o3 = t4 << 12 | t3 >> 39;
729*4882a593Smuzhiyun u8 *b0 = output;
730*4882a593Smuzhiyun u8 *b1 = output + 8;
731*4882a593Smuzhiyun u8 *b2 = output + 16;
732*4882a593Smuzhiyun u8 *b3 = output + 24;
733*4882a593Smuzhiyun put_unaligned_le64(o0, b0);
734*4882a593Smuzhiyun put_unaligned_le64(o1, b1);
735*4882a593Smuzhiyun put_unaligned_le64(o2, b2);
736*4882a593Smuzhiyun put_unaligned_le64(o3, b3);
737*4882a593Smuzhiyun }
738*4882a593Smuzhiyun
/*
 * Contract a field element to canonical 32-byte form: two full carry
 * rounds bring every limb within range, the trim step performs the
 * final constant-time subtraction of p if needed, and the store step
 * serializes the result little-endian.  The call order is essential:
 * trimming assumes the limbs are already carried, and storing assumes
 * the value is fully reduced.  Destroys the limb contents of @input.
 */
static __always_inline void format_fcontract(u8 *output, u64 *input)
{
	format_fcontract_first_carry_full(input);
	format_fcontract_second_carry_full(input);
	format_fcontract_trim(input);
	format_fcontract_store(output, input);
}
746*4882a593Smuzhiyun
format_scalar_of_point(u8 * scalar,u64 * point)747*4882a593Smuzhiyun static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
748*4882a593Smuzhiyun {
749*4882a593Smuzhiyun u64 *x = point;
750*4882a593Smuzhiyun u64 *z = point + 5;
751*4882a593Smuzhiyun u64 buf[10] __aligned(32) = { 0 };
752*4882a593Smuzhiyun u64 *zmone = buf;
753*4882a593Smuzhiyun u64 *sc = buf + 5;
754*4882a593Smuzhiyun crecip(zmone, z);
755*4882a593Smuzhiyun fmul(sc, x, zmone);
756*4882a593Smuzhiyun format_fcontract(scalar, sc);
757*4882a593Smuzhiyun }
758*4882a593Smuzhiyun
/*
 * curve25519_generic - X25519 scalar multiplication (RFC 7748).
 * @mypublic:  output, 32-byte affine x-coordinate of secret * basepoint
 * @secret:    32-byte private scalar (clamped internally; input unmodified)
 * @basepoint: 32-byte u-coordinate of the input point
 *
 * Expands the basepoint into a projective (X : Z) pair with Z = 1,
 * clamps a local copy of the scalar, runs the constant-time Montgomery
 * ladder, and contracts the result to wire format.  All intermediate
 * buffers holding secret-dependent data are wiped with
 * memzero_explicit() before returning, in the same order as the
 * original nesting (ladder state, clamped scalar, expanded point).
 */
void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
			const u8 secret[CURVE25519_KEY_SIZE],
			const u8 basepoint[CURVE25519_KEY_SIZE])
{
	u64 init[10] __aligned(32) = { 0 };	/* basepoint as (X : Z) */
	u64 ladder[15] = { 0 };			/* Montgomery ladder state */
	u8 e[32] __aligned(32) = { 0 };		/* clamped scalar copy */

	format_fexpand(init, basepoint);
	init[5] = 1;		/* Z = 1: affine input point */

	memcpy(e, secret, 32);
	curve25519_clamp_secret(e);

	ladder[0] = 1;		/* ladder accumulator starts at X = 1 */
	ladder_cmult(ladder, e, init);
	format_scalar_of_point(mypublic, ladder);

	memzero_explicit(ladder, sizeof(ladder));
	memzero_explicit(e, sizeof(e));
	memzero_explicit(init, sizeof(init));
}
789