Simplify division-with-remainder calculations in crypto/bn/div.c.

Create a |bn_div_rem_words| that is used for double-word/single-word
divisions and division-with-remainder. Remove all implementations of
|bn_div_words| except for the implementation needed for 64-bit MSVC.
This allows more code to be shared across platforms and also removes
an instance of the dangerous pattern wherein the |div_asm| macro
modified a variable that wasn't passed as a parameter.

Also, document the limitations of the compiler-generated code for the
non-asm code paths more fully. Compilers indeed have not improved in
this respect.

Change-Id: I5a36a2edd7465de406d47d72dcd6bf3e63e5c232
Reviewed-on: https://boringssl-review.googlesource.com/7127
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
index d78a851..214c12a 100644
--- a/crypto/bn/asm/x86_64-gcc.c
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -186,14 +186,6 @@
   }
 }
 
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
-  BN_ULONG ret, waste;
-
-  asm("divq	%4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
-
-  return ret;
-}
-
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                       int n) {
   BN_ULONG ret;
diff --git a/crypto/bn/div.c b/crypto/bn/div.c
index 5a239bc..69791f7 100644
--- a/crypto/bn/div.c
+++ b/crypto/bn/div.c
@@ -62,49 +62,46 @@
 #include "internal.h"
 
 
-#define asm __asm__
-
-#if !defined(OPENSSL_NO_ASM)
-# if defined(__GNUC__) && __GNUC__>=2
-#  if defined(OPENSSL_X86)
-   /*
-    * There were two reasons for implementing this template:
-    * - GNU C generates a call to a function (__udivdi3 to be exact)
-    *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
-    *   understand why...);
-    * - divl doesn't only calculate quotient, but also leaves
-    *   remainder in %edx which we can definitely use here:-)
-    *
-    *					<appro@fy.chalmers.se>
-    */
-#undef div_asm
-#  define div_asm(n0,n1,d0)		\
-	({  asm volatile (			\
-		"divl	%4"			\
-		: "=a"(q), "=d"(rem)		\
-		: "a"(n1), "d"(n0), "g"(d0)	\
-		: "cc");			\
-	    q;					\
-	})
-#  define REMAINDER_IS_ALREADY_CALCULATED
-#  elif defined(OPENSSL_X86_64)
-   /*
-    * Same story here, but it's 128-bit by 64-bit division. Wow!
-    *					<appro@fy.chalmers.se>
-    */
-#  undef div_asm
-#  define div_asm(n0,n1,d0)		\
-	({  asm volatile (			\
-		"divq	%4"			\
-		: "=a"(q), "=d"(rem)		\
-		: "a"(n1), "d"(n0), "g"(d0)	\
-		: "cc");			\
-	    q;					\
-	})
-#  define REMAINDER_IS_ALREADY_CALCULATED
-#  endif /* __<cpu> */
-# endif /* __GNUC__ */
-#endif /* OPENSSL_NO_ASM */
+static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out,
+                                    BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) {
+  /* GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when
+   * the |BN_ULLONG|-based C code is used.
+   *
+   * GCC bugs:
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897
+   *   * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668
+   *
+   * Clang bugs:
+   *   * https://llvm.org/bugs/show_bug.cgi?id=6397
+   *   * https://llvm.org/bugs/show_bug.cgi?id=12418
+   *
+   * These issues aren't specific to x86 and x86_64, so it might be worthwhile
+   * to add more assembly language implementations. */
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__GNUC__)
+  __asm__ volatile (
+    "divl %4"
+    : "=a"(*quotient_out), "=d"(*rem_out)
+    : "a"(n1), "d"(n0), "g"(d0)
+    : "cc" );
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__GNUC__)
+  __asm__ volatile (
+    "divq %4"
+    : "=a"(*quotient_out), "=d"(*rem_out)
+    : "a"(n1), "d"(n0), "g"(d0)
+    : "cc" );
+#else
+#if defined(BN_ULLONG)
+  BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1;
+  *quotient_out = (BN_ULONG)(n / d0);
+#else
+  *quotient_out = bn_div_words(n0, n1, d0);
+#endif
+  *rem_out = n1 - (*quotient_out * d0);
+#endif
+}
 
 /* BN_div computes  dv := num / divisor,  rounding towards
  * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
@@ -260,23 +257,10 @@
       q = BN_MASK2;
     } else {
       /* n0 < d0 */
+      bn_div_rem_words(&q, &rem, n0, n1, d0);
+
 #ifdef BN_ULLONG
-      BN_ULLONG t2;
-
-#if defined(BN_ULLONG) && !defined(div_asm)
-      q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
-#else
-      q = div_asm(n0, n1, d0);
-#endif
-
-#ifndef REMAINDER_IS_ALREADY_CALCULATED
-      /* rem doesn't have to be BN_ULLONG. The least we know it's less that d0,
-       * isn't it? */
-      rem = (n1 - q * d0) & BN_MASK2;
-#endif
-
-      t2 = (BN_ULLONG)d1 * q;
-
+      BN_ULLONG t2 = (BN_ULLONG)d1 * q;
       for (;;) {
         if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) | wnump[-2])) {
           break;
@@ -290,13 +274,7 @@
       }
 #else /* !BN_ULLONG */
       BN_ULONG t2l, t2h;
-
-      q = bn_div_words(n0, n1, d0);
-
-      rem = (n1 - q * d0) & BN_MASK2;
-
       BN_UMULT_LOHI(t2l, t2h, d1, q);
-
       for (;;) {
         if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2]))) {
           break;
@@ -556,7 +534,7 @@
     return 0;
   }
 
-  /* normalize input (so bn_div_words doesn't complain) */
+  /* normalize input for |bn_div_rem_words|. */
   j = BN_BITS2 - BN_num_bits_word(w);
   w <<= j;
   if (!BN_lshift(a, a, j)) {
@@ -564,10 +542,10 @@
   }
 
   for (i = a->top - 1; i >= 0; i--) {
-    BN_ULONG l, d;
-
-    l = a->d[i];
-    d = bn_div_words(ret, l, w);
+    BN_ULONG l = a->d[i];
+    BN_ULONG d;
+    BN_ULONG unused_rem;
+    bn_div_rem_words(&d, &unused_rem, ret, l, w);
     ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
     a->d[i] = d;
   }
diff --git a/crypto/bn/generic.c b/crypto/bn/generic.c
index cf2db8b..b7fedca 100644
--- a/crypto/bn/generic.c
+++ b/crypto/bn/generic.c
@@ -202,13 +202,7 @@
   }
 }
 
-#if defined(BN_ULLONG)
-
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
-  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
-}
-
-#else
+#if !defined(BN_ULLONG)
 
 /* Divide h,l by d and return the result. */
 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
diff --git a/crypto/bn/internal.h b/crypto/bn/internal.h
index 5c9baa4..e8a2bdd 100644
--- a/crypto/bn/internal.h
+++ b/crypto/bn/internal.h
@@ -195,7 +195,6 @@
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
 void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
-BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
 BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
 BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
 
@@ -225,6 +224,10 @@
 #error "Either BN_ULLONG or BN_UMULT_LOHI must be defined on every platform."
 #endif
 
+#if !defined(BN_ULLONG)
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+#endif
+
 
 #if defined(__cplusplus)
 }  /* extern C */