blob: c240a54e1c1bcc016ddce2ad9a7a062c11c80a67 [file] [log] [blame]
Adam Langley95c29f32014-06-20 12:00:00 -07001/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
2 * All rights reserved.
3 *
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
7 *
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to. The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
14 *
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
21 *
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
24 * are met:
25 * 1. Redistributions of source code must retain the copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * "This product includes cryptographic software written by
33 * Eric Young (eay@cryptsoft.com)"
34 * The word 'cryptographic' can be left out if the rouines from the library
35 * being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 * the apps directory (application code) you must include an acknowledgement:
38 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
39 *
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 *
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed. i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
56
57#include <openssl/bn.h>
58
59#include <assert.h>
60
61#include "internal.h"
62
63
/* Generic implementations of most operations are needed for:
 * - Configurations without inline assembly.
 * - Architectures other than x86 or x86_64.
 * - Windows x86_64; x86_64-gcc.c does not build on MSVC. */
#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
    (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))

/* bn_mul_mont below allocates its temporary on the stack; MSVC spells the
 * function _alloca, other platforms declare alloca in <alloca.h>. */
#if defined(OPENSSL_WINDOWS)
#define alloca _alloca
#else
#include <alloca.h>
#endif
77
/* Word-by-word multiply primitives. Three implementations are provided:
 * - BN_LLONG: a double-width integer type (BN_ULLONG) exists; products are
 *   formed directly in it and split with Lw()/Hw().
 * - BN_UMULT_LOHI: no double-width type, but a compiler intrinsic yields the
 *   low and high halves of a full product.
 * - otherwise: full products are synthesized from half-word pieces
 *   (mul64/sqr64 below).
 * All three define:
 *   mul_add(r, a, w, c):  {c, r} = r + a * w + c
 *   mul(r, a, w, c):      {c, r} = a * w + c
 *   sqr(r0, r1, a):       {r1, r0} = a^2
 * NOTE: these are multi-evaluation macros — arguments must be side-effect
 * free. */
#ifdef BN_LLONG
#define mul_add(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t); \
    (r1) = Hw(t); \
  }

#elif defined(BN_UMULT_LOHI)
/* Carries are detected with unsigned-compare tricks: after ret += x,
 * (ret < x) is 1 exactly when the addition wrapped. */
#define mul_add(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r); \
    BN_UMULT_LOHI(low, high, w, tmp); \
    ret += (c); \
    (c) = (ret < (c)) ? 1 : 0; \
    (c) += high; \
    ret += low; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta); \
    ret = low + (c); \
    (c) = high; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULONG tmp = (a); \
    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
  }

#else

/*************************************************************
 * No long long type
 */

/* Half-word extraction helpers: LBITS/HBITS take the low/high half of a
 * word, L2HBITS moves a low half into the high position. The LL* variants
 * operate on BN_ULLONG quantities (unused in this branch's macros below
 * but kept for completeness). */
#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)

/* mul64(l, h, bl, bh): {h, l} = {h, l}-as-half-words * {bh, bl}, i.e. a
 * full word-by-word product built from four half-word products
 * (schoolbook), with explicit carry fix-ups. On entry l/h hold the low and
 * high halves of one operand; bl/bh the halves of the other. */
#define mul64(l, h, bl, bh) \
  { \
    BN_ULONG m, m1, lt, ht; \
    \
    lt = l; \
    ht = h; \
    m = (bh) * (lt); \
    lt = (bl) * (lt); \
    m1 = (bl) * (ht); \
    ht = (bh) * (ht); \
    m = (m + m1) & BN_MASK2; \
    if (m < m1) \
      ht += L2HBITS((BN_ULONG)1); \
    ht += HBITS(m); \
    m1 = L2HBITS(m); \
    lt = (lt + m1) & BN_MASK2; \
    if (lt < m1) \
      ht++; \
    (l) = lt; \
    (h) = ht; \
  }

/* sqr64(lo, ho, in): {ho, lo} = in^2, exploiting the symmetry of squaring
 * (the cross product appears twice, hence the shift by BN_BITS4 + 1). */
#define sqr64(lo, ho, in) \
  { \
    BN_ULONG l, h, m; \
    \
    h = (in); \
    l = LBITS(h); \
    h = HBITS(h); \
    m = (l) * (h); \
    l *= l; \
    h *= h; \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
    l = (l + m) & BN_MASK2; \
    if (l < m) \
      h++; \
    (lo) = l; \
    (ho) = h; \
  }

/* In this branch mul_add/mul take the multiplier pre-split into (bl, bh),
 * so the caller splits w once per word array rather than once per word. */
#define mul_add(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
    \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
    \
    /* non-multiply part */ \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = (r); \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l; \
  }

#define mul(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
    \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
    \
    /* non-multiply part */ \
    l += (c); \
    if ((l & BN_MASK2) < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l & BN_MASK2; \
  }
#endif /* !BN_LLONG */
226
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

/* bn_mul_add_words computes rp[i] += ap[i] * w for i in [0, num) and
 * returns the final carry word. The carry is threaded through |c1| by the
 * mul_add macro; the first loop is unrolled four words per iteration. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  /* num & ~3 is non-zero while at least four words remain. */
  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul_add(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }

  return c1;
}
257
/* bn_mul_words computes rp[i] = ap[i] * w + carry for i in [0, num),
 * overwriting rp, and returns the final carry word. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  /* Four words per iteration while at least four remain. */
  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }
  return c1;
}
283
284void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
285 assert(n >= 0);
286 if (n <= 0) {
287 return;
288 }
289
290 while (n & ~3) {
291 sqr(r[0], r[1], a[0]);
292 sqr(r[2], r[3], a[1]);
293 sqr(r[4], r[5], a[2]);
294 sqr(r[6], r[7], a[3]);
295 a += 4;
296 r += 8;
297 n -= 4;
298 }
299 while (n) {
300 sqr(r[0], r[1], a[0]);
301 a++;
302 r += 2;
303 n--;
304 }
305}
306
#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

/* bn_mul_add_words computes rp[i] += ap[i] * w for i in [0, num) and
 * returns the final carry word. Half-word variant: |w| is split once into
 * its low/high halves (bl, bh) so the five-argument mul_add macro need not
 * re-split it for every word. */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  /* Four words per iteration while at least four remain. */
  while (num & ~3) {
    mul_add(rp[0], ap[0], bl, bh, c);
    mul_add(rp[1], ap[1], bl, bh, c);
    mul_add(rp[2], ap[2], bl, bh, c);
    mul_add(rp[3], ap[3], bl, bh, c);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul_add(rp[0], ap[0], bl, bh, c);
    ap++;
    rp++;
    num--;
  }
  return c;
}
339
/* bn_mul_words computes rp[i] = ap[i] * w + carry for i in [0, num),
 * overwriting rp, and returns the final carry word. Half-word variant:
 * |w| is split once into (bl, bh) for the five-argument mul macro. */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  /* Four words per iteration while at least four remain. */
  while (num & ~3) {
    mul(rp[0], ap[0], bl, bh, carry);
    mul(rp[1], ap[1], bl, bh, carry);
    mul(rp[2], ap[2], bl, bh, carry);
    mul(rp[3], ap[3], bl, bh, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  while (num) {
    mul(rp[0], ap[0], bl, bh, carry);
    ap++;
    rp++;
    num--;
  }
  return carry;
}
369
370void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
371 assert(n >= 0);
372 if (n <= 0) {
373 return;
374 }
375
376 while (n & ~3) {
377 sqr64(r[0], r[1], a[0]);
378 sqr64(r[2], r[3], a[1]);
379 sqr64(r[4], r[5], a[2]);
380 sqr64(r[6], r[7], a[3]);
381 a += 4;
382 r += 8;
383 n -= 4;
384 }
385 while (n) {
386 sqr64(r[0], r[1], a[0]);
387 a++;
388 r += 2;
389 n--;
390 }
391}
392
393#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
394
Adam Langley3e652652015-01-09 15:44:37 -0800395#if defined(BN_LLONG)
Adam Langley95c29f32014-06-20 12:00:00 -0700396
397BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
398 return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
399}
400
401#else
402
/* Divide h,l by d and return the result. No double-width type is
 * available, so the quotient is produced as two half-word digits using
 * schoolbook division with trial-quotient correction (Knuth Algorithm D
 * style). Returns BN_MASK2 (all ones) if d == 0. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2; /* two half-word quotient digits */

  if (d == 0) {
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  /* Normalize: shift d (and the dividend) left so d's top bit is set,
   * which keeps the trial quotients close to correct. */
  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }

  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  /* Split the (normalized) divisor into half words. */
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  for (;;) {
    /* Trial quotient digit: h / dh, clamped to a half word. */
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;
    /* Correct q downward while q * d exceeds the current remainder. */
    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }
    /* Subtract q * d from the two-word remainder (h, l). */
    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d; /* add back after the rare over-correction */
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    /* First digit done: save it and shift the remainder up a half word
     * for the second digit. */
    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}
472
Adam Langley3e652652015-01-09 15:44:37 -0800473#endif /* !defined(BN_LLONG) */
Adam Langley95c29f32014-06-20 12:00:00 -0700474
#ifdef BN_LLONG
/* bn_add_words computes r[i] = a[i] + b[i] + carry for i in [0, n) and
 * returns the final carry (0 or 1). Each sum is formed in the double-width
 * |ll|; the shift by BN_BITS2 leaves exactly the carry for the next word. */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULLONG ll = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  /* Four words per iteration while at least four remain. */
  while (n & ~3) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[1] + b[1];
    r[1] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[2] + b[2];
    r[2] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[3] + b[3];
    r[3] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)ll;
}
514
515#else /* !BN_LLONG */
516
/* bn_add_words computes r[i] = a[i] + b[i] + carry for i in [0, n) and
 * returns the final carry (0 or 1). No double-width type: the carry out of
 * each word is detected with two unsigned wrap-around compares — first when
 * the incoming carry is added to a[i], then when b[i] is added. */
BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG c, l, t;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  c = 0;
  /* Four words per iteration while at least four remain. */
  while (n & ~3) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    t = a[1];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[1]) & BN_MASK2;
    c += (l < t);
    r[1] = l;
    t = a[2];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[2]) & BN_MASK2;
    c += (l < t);
    r[2] = l;
    t = a[3];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[3]) & BN_MASK2;
    c += (l < t);
    r[3] = l;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    a++;
    b++;
    r++;
    n--;
  }
  return (BN_ULONG)c;
}
571
572#endif /* !BN_LLONG */
573
/* bn_sub_words computes r[i] = a[i] - b[i] - borrow for i in [0, n) and
 * returns the final borrow (0 or 1). When t1 == t2 the outgoing borrow
 * equals the incoming one, so |c| is deliberately left unchanged in that
 * case; otherwise the borrow is simply (t1 < t2). */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG t1, t2;
  int c = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  /* Four words per iteration while at least four remain. */
  while (n & ~3) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }
  while (n) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2) {
      c = (t1 < t2);
    }
    a++;
    b++;
    r++;
    n--;
  }
  return c;
}
628
/* Accumulator primitives for the comba routines below. The accumulator is a
 * three-word value (c2, c1, c0); all four macros fold a product into it:
 *   mul_add_c(a,b,c0,c1,c2)    -- c += a*b        for three word number c=(c2,c1,c0)
 *   mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b      for three word number c=(c2,c1,c0)
 *   sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2     for three word number c=(c2,c1,c0)
 *   sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j] for three word number c=(c2,c1,c0)
 * mul_add_c2 adds the product twice (two carry propagations) rather than
 * doubling it, which avoids losing a bit at the top. */

#ifdef BN_LLONG

/* Keep in mind that additions to multiplication result can not overflow,
 * because its high half cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG hi; \
    BN_ULLONG t = (BN_ULLONG)(a) * (b); \
    t += c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(t); \
    hi = (BN_ULONG)Hw(t); \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG hi; \
    BN_ULLONG t = (BN_ULLONG)(a) * (b); \
    BN_ULLONG tt = t + c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(tt); \
    hi = (BN_ULONG)Hw(tt); \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
    t += c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(t); \
    hi = (BN_ULONG)Hw(t); \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG hi; \
    BN_ULLONG t = (BN_ULLONG)a[i] * a[i]; \
    t += c0; /* no carry */ \
    c0 = (BN_ULONG)Lw(t); \
    hi = (BN_ULONG)Hw(t); \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_LOHI)

/* Keep in mind that additions to hi can not overflow, because the high word of
 * a multiplication result cannot be all-ones. */
#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG ta = (a), tb = (b); \
    BN_ULONG lo, hi; \
    BN_UMULT_LOHI(lo, hi, ta, tb); \
    c0 += lo; \
    hi += (c0 < lo) ? 1 : 0; \
    c1 += hi; \
    c2 += (c1 < hi) ? 1 : 0; \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG ta = (a), tb = (b); \
    BN_ULONG lo, hi, tt; \
    BN_UMULT_LOHI(lo, hi, ta, tb); \
    c0 += lo; \
    tt = hi + ((c0 < lo) ? 1 : 0); \
    c1 += tt; \
    c2 += (c1 < tt) ? 1 : 0; \
    c0 += lo; \
    hi += (c0 < lo) ? 1 : 0; \
    c1 += hi; \
    c2 += (c1 < hi) ? 1 : 0; \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG ta = (a)[i]; \
    BN_ULONG lo, hi; \
    BN_UMULT_LOHI(lo, hi, ta, ta); \
    c0 += lo; \
    hi += (c0 < lo) ? 1 : 0; \
    c1 += hi; \
    c2 += (c1 < hi) ? 1 : 0; \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#else /* !BN_LLONG */

/* Keep in mind that additions to hi can not overflow, because
 * the high word of a multiplication result cannot be all-ones. */

#define mul_add_c(a, b, c0, c1, c2) \
  do { \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh); \
    c0 = (c0 + lo) & BN_MASK2; \
    if (c0 < lo) \
      hi++; \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2) \
  do { \
    BN_ULONG tt; \
    BN_ULONG lo = LBITS(a), hi = HBITS(a); \
    BN_ULONG bl = LBITS(b), bh = HBITS(b); \
    mul64(lo, hi, bl, bh); \
    tt = hi; \
    c0 = (c0 + lo) & BN_MASK2; \
    if (c0 < lo) \
      tt++; \
    c1 = (c1 + tt) & BN_MASK2; \
    if (c1 < tt) \
      c2++; \
    c0 = (c0 + lo) & BN_MASK2; \
    if (c0 < lo) \
      hi++; \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2) \
  do { \
    BN_ULONG lo, hi; \
    sqr64(lo, hi, (a)[i]); \
    c0 = (c0 + lo) & BN_MASK2; \
    if (c0 < lo) \
      hi++; \
    c1 = (c1 + hi) & BN_MASK2; \
    if (c1 < hi) \
      c2++; \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
#endif /* !BN_LLONG */
778
/* bn_mul_comba8 computes the 16-word product r = a * b for 8-word inputs
 * using a product-scanning (comba) schedule: all partial products for one
 * result column are accumulated into the rotating three-word accumulator
 * (c3, c2, c1) before the column's word is stored. |r| must not alias |a|
 * or |b| — earlier result words are written while later input words are
 * still read. */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* Column k collects all a[i]*b[j] with i + j == k. */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
880
/* bn_mul_comba4 computes the 8-word product r = a * b for 4-word inputs
 * using the same product-scanning (comba) schedule as bn_mul_comba8.
 * |r| must not alias |a| or |b|. */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* Column k collects all a[i]*b[j] with i + j == k. */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
918
/* bn_sqr_comba8 computes the 16-word square r = a^2 for an 8-word input.
 * Symmetric cross products a[i]*a[j] (i != j) appear once via sqr_add_c2
 * (which adds the product twice); diagonal terms a[i]^2 use sqr_add_c.
 * |r| must not alias |a|. */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
992
/* bn_sqr_comba4 computes the 8-word square r = a^2 for a 4-word input,
 * same scheme as bn_sqr_comba8: sqr_add_c2 handles each symmetric cross
 * product (counted twice), sqr_add_c the diagonal squares. |r| must not
 * alias |a|. */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
1024
#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
/* This is essentially reference implementation, which may or may not
 * result in performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as starting point for
 * platform-specific assembler. Mentioned numbers apply to compiler
 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
 * can vary not only from platform to platform, but even for compiler
 * versions. Assembler vs. assembler improvement coefficients can
 * [and are known to] differ and are to be documented elsewhere. */
/* bn_mul_mont interleaves multiplication and Montgomery reduction of
 * |ap| * |bp| mod |np| into |rp| (num words each) and returns 1. |n0p|
 * points to the per-modulus word used to zero the low word in each
 * reduction step — presumably -np^-1 mod 2^BN_BITS2, the standard
 * Montgomery n0; confirm against internal.h. The |goto enter| skips the
 * i == 0 multiply pass (done above with mul instead of mul_add, since tp
 * starts uninitialized) and jumps into the shared reduction code inside
 * the loop. |vp| is a volatile alias of the alloca'd scratch buffer used
 * to zero it before returning, so the compiler cannot elide the wipe. */
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh; /* half-word build: multiplier split into (ml, mh) */
#endif
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
  if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  /* num + 2 scratch words: num products, one carry word, one overflow word. */
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  /* First pass: tp = ap * bp[0] (plain mul — tp has no prior contents). */
  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j) {
    mul(tp[j], ap[j], ml, mh, c0);
  }
#else
  for (j = 0; j < num; ++j) {
    mul(tp[j], ap[j], ml, c0);
  }
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  goto enter;

  for (; i < num; i++) {
    /* tp += ap * bp[i] for iterations after the first. */
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j) {
      mul_add(tp[j], ap[j], ml, mh, c0);
    }
#else
    for (j = 0; j < num; ++j) {
      mul_add(tp[j], ap[j], ml, c0);
    }
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    /* Reduction step: add (tp[0] * n0 mod 2^BN_BITS2) * np, which makes
     * tp[0] zero, then shift tp down one word. */
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  /* Final conditional subtraction: if tp >= np, return tp - np. */
  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++) {
        vp[i] = 0; /* wipe the scratch buffer (volatile) */
      }
      return 1;
    }
  }
  for (i = 0; i < num; i++) {
    rp[i] = tp[i], vp[i] = 0; /* copy out and wipe in one pass */
  }
  vp[num] = 0;
  vp[num + 1] = 0;
  return 1;
}
#endif
1130
1131#endif