Simplify division-with-remainder calculations in crypto/bn/div.c.
Create a |bn_div_rem_words| that is used for double-word/single-word
divisions and division-with-remainder. Remove all implementations of
|bn_div_words| except for the implementation needed for 64-bit MSVC.
This allows more code to be shared across platforms and also removes
an instance of the dangerous pattern wherein the |div_asm| macro
modified a variable that wasn't passed as a parameter.
Also, document the limitations of the compiler-generated code for the
non-asm code paths more fully. Compilers indeed have not improved in
this respect.
Change-Id: I5a36a2edd7465de406d47d72dcd6bf3e63e5c232
Reviewed-on: https://boringssl-review.googlesource.com/7127
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d78a851..214c12a 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -186,14 +186,6 @@
}
}
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
- BN_ULONG ret, waste;
-
- asm("divq %4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
-
- return ret;
-}
-
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
int n) {
BN_ULONG ret;
diff --git a/crypto/bn/div.c b/crypto/bn/div.c
index 5a239bc..69791f7 100644
--- a/crypto/bn/div.c
+++ b/crypto/bn/div.c
@@ -62,49 +62,46 @@
#include "internal.h"
-#define asm __asm__
-
-#if !defined(OPENSSL_NO_ASM)
-# if defined(__GNUC__) && __GNUC__>=2
-# if defined(OPENSSL_X86)
- /*
- * There were two reasons for implementing this template:
- * - GNU C generates a call to a function (__udivdi3 to be exact)
- * in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
- * understand why...);
- * - divl doesn't only calculate quotient, but also leaves
- * remainder in %edx which we can definitely use here:-)
- *
- * <appro@fy.chalmers.se>
- */
-#undef div_asm
-# define div_asm(n0,n1,d0) \
- ({ asm volatile ( \
- "divl %4" \
- : "=a"(q), "=d"(rem) \
- : "a"(n1), "d"(n0), "g"(d0) \
- : "cc"); \
- q; \
- })
-# define REMAINDER_IS_ALREADY_CALCULATED
-# elif defined(OPENSSL_X86_64)
- /*
- * Same story here, but it's 128-bit by 64-bit division. Wow!
- * <appro@fy.chalmers.se>
- */
-# undef div_asm
-# define div_asm(n0,n1,d0) \
- ({ asm volatile ( \
- "divq %4" \
- : "=a"(q), "=d"(rem) \
- : "a"(n1), "d"(n0), "g"(d0) \
- : "cc"); \
- q; \
- })
-# define REMAINDER_IS_ALREADY_CALCULATED
-# endif /* __<cpu> */
-# endif /* __GNUC__ */
-#endif /* OPENSSL_NO_ASM */
+static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
+ BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
+ /* GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when
+ * the |BN_ULLONG|-based C code is used.
+ *
+ * GCC bugs:
+ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721
+ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183
+ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897
+ * * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668
+ *
+ * Clang bugs:
+ * * https://llvm.org/bugs/show_bug.cgi?id=6397
+ * * https://llvm.org/bugs/show_bug.cgi?id=12418
+ *
+ * These issues aren't specific to x86 and x86_64, so it might be worthwhile
+ * to add more assembly language implementations. */
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__GNUC__)
+ __asm__ volatile (
+ "divl %4"
+ : "=a"(*quotient_out), "=d"(*rem_out)
+ : "a"(n1), "d"(n0), "g"(d0)
+ : "cc" );
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
+ __asm__ volatile (
+ "divq %4"
+ : "=a"(*quotient_out), "=d"(*rem_out)
+ : "a"(n1), "d"(n0), "g"(d0)
+ : "cc" );
+#else
+#if defined(BN_ULLONG)
+ BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
+ *quotient_out = (BN_ULONG)(n / d0);
+#else
+ *quotient_out = bn_div_words(n0, n1, d0);
+#endif
+ *rem_out = n1 - (*quotient_out * d0);
+#endif
+}
/* BN_div computes dv := num / divisor, rounding towards
* zero, and sets up rm such that dv*divisor + rm = num holds.
@@ -260,23 +257,10 @@
q = BN_MASK2;
} else {
/* n0 < d0 */
+ bn_div_rem_words(&q, &rem, n0, n1, d0);
+
#ifdef BN_ULLONG
- BN_ULLONG t2;
-
-#if defined(BN_ULLONG) && !defined(div_asm)
- q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
-#else
- q = div_asm(n0, n1, d0);
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
- /* rem doesn't have to be BN_ULLONG. The least we know it's less that d0,
- * isn't it? */
- rem = (n1 - q * d0) & BN_MASK2;
-#endif
-
- t2 = (BN_ULLONG)d1 * q;
-
+ BN_ULLONG t2 = (BN_ULLONG)d1 * q;
for (;;) {
if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) | wnump[-2])) {
break;
@@ -290,13 +274,7 @@
}
#else /* !BN_ULLONG */
BN_ULONG t2l, t2h;
-
- q = bn_div_words(n0, n1, d0);
-
- rem = (n1 - q * d0) & BN_MASK2;
-
BN_UMULT_LOHI(t2l, t2h, d1, q);
-
for (;;) {
if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2]))) {
break;
@@ -556,7 +534,7 @@
return 0;
}
- /* normalize input (so bn_div_words doesn't complain) */
+ /* normalize input for |bn_div_rem_words|. */
j = BN_BITS2 - BN_num_bits_word(w);
w <<= j;
if (!BN_lshift(a, a, j)) {
@@ -564,10 +542,10 @@
}
for (i = a->top - 1; i >= 0; i--) {
- BN_ULONG l, d;
-
- l = a->d[i];
- d = bn_div_words(ret, l, w);
+ BN_ULONG l = a->d[i];
+ BN_ULONG d;
+ BN_ULONG unused_rem;
+ bn_div_rem_words(&d, &unused_rem, ret, l, w);
ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
a->d[i] = d;
}
diff --git a/crypto/bn/generic.c b/crypto/bn/generic.c
index cf2db8b..b7fedca 100644
--- a/crypto/bn/generic.c
+++ b/crypto/bn/generic.c
@@ -202,13 +202,7 @@
}
}
-#if defined(BN_ULLONG)
-
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
- return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
-}
-
-#else
+#if !defined(BN_ULLONG)
/* Divide h,l by d and return the result. */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
diff --git a/crypto/bn/internal.h b/crypto/bn/internal.h
index 5c9baa4..e8a2bdd 100644
--- a/crypto/bn/internal.h
+++ b/crypto/bn/internal.h
@@ -195,7 +195,6 @@
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
@@ -225,6 +224,10 @@
#error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
#endif
+#if !defined(BN_ULLONG)
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+#endif
+
#if defined(__cplusplus)
} /* extern C */