Remove the confusing extra word in BN_div

This extra word was allocated so that the fixup portion of quotient
estimation could read from wnump[-2] without checking if div_n > 1. This
was actually subtle because the value it got back was wrong. It just
didn't matter because the loop was a no-op.

As a result of all this, all the indices into snum were off, and the
remainder needed to be shifted down by one word to compensate.

Really, if div_n > 1, we could just call BN_div_word, but the calling
conventions are different enough that it didn't seem worth the effort.

Bug: 358687140
Change-Id: Id694a33003f51536ee836a5bdb75ff8006b11a51
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/70179
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/div.c b/crypto/fipsmodule/bn/div.c
index 5d059e5..f1588c1 100644
--- a/crypto/fipsmodule/bn/div.c
+++ b/crypto/fipsmodule/bn/div.c
@@ -215,29 +215,22 @@
   // This ensures, in Knuth's terminology, that v1 >= b/2, needed for the
   // quotient estimation step.
   int norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2);
-  if (!BN_lshift(sdiv, divisor, norm_shift)) {
+  if (!BN_lshift(sdiv, divisor, norm_shift) ||
+      !BN_lshift(snum, numerator, norm_shift)) {
     goto err;
   }
   bn_set_minimal_width(sdiv);
-  sdiv->neg = 0;
-
-  // TODO(crbug.com/358687140): We also shift the numerator up by one extra
-  // word. This was done for convenience so that |wnump[-2]| below always
-  // exists, but makes the offsets confusing. Remove it.
-  norm_shift += BN_BITS2;
-  if (!BN_lshift(snum, numerator, norm_shift)) {
-    goto err;
-  }
   bn_set_minimal_width(snum);
+  sdiv->neg = 0;
   snum->neg = 0;
 
   // Extend |snum| with zeros to satisfy the long division invariants:
-  // - |snum|, minus the extra word, must have at least |div_n| + 1 words.
+  // - |snum| must have at least |div_n| + 1 words.
   // - |snum|'s most significant word must be zero to guarantee the first loop
   //   iteration works with a prefix greater than |sdiv|. (This is the extra u0
   //   digit in Knuth step D1.)
   int div_n = sdiv->width;
-  int num_n = snum->width <= div_n + 1 ? div_n + 2 : snum->width + 1;
+  int num_n = snum->width <= div_n ? div_n + 1 : snum->width + 1;
   if (!bn_resize_words(snum, num_n)) {
     goto err;
   }
@@ -267,10 +260,10 @@
   // for later.
   const int numerator_neg = numerator->neg;
   res->neg = (numerator_neg ^ divisor->neg);
-  if (!bn_wexpand(res, loop - 1)) {
+  if (!bn_wexpand(res, loop)) {
     goto err;
   }
-  res->width = loop - 1;
+  res->width = loop;
 
   if (!bn_wexpand(tmp, div_n + 1)) {
     goto err;
@@ -281,10 +274,10 @@
   // our index is reversed. Each loop iteration computes res->d[i] of the
   // quotient and updates snum with the running remainder. Before each loop
   // iteration, wnum <= sdiv.
-  for (int i = loop - 2; i >= 0; i--, wnump--) {
+  for (int i = loop - 1; i >= 0; i--, wnump--) {
     // TODO(crbug.com/358687140): Remove these running pointers.
     wnum.d--;
-    assert(wnum.d == snum->d + i + 1);
+    assert(wnum.d == snum->d + i);
     assert(wnump == wnum.d + div_n);
 
     // Knuth step D3: Compute q', an estimate of q = floor(wnum / sdiv) by
@@ -331,47 +324,43 @@
       // 20, and 21. Although only one iteration is needed to correct q + 2 to
       // q + 1, Knuth uses a loop. A loop will often also correct q + 1 to q,
       // saving the slightly more expensive underflow handling below.
-      //
-      // TODO(crbug.com/358687140): This assumes div_n is at least 2, otherwise
-      // there is no wnump[-2] (u_(j+2) in Knuth) term to read. We avoid going
-      // out of bounds because of the extra word, and t2 will be zero in this
-      // case, so the loop is a no-op. Still, this is confusing.
+      if (div_n > 1) {
 #ifdef BN_ULLONG
-      BN_ULLONG t2 = (BN_ULLONG)d1 * q;
-      for (;;) {
-        if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) {
-          break;
+        BN_ULLONG t2 = (BN_ULLONG)d1 * q;
+        for (;;) {
+          if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) {
+            break;
+          }
+          q--;
+          rm += d0;
+          if (rm < d0) {
+            // If rm overflows, the true value exceeds BN_ULONG and the next
+            // t2 comparison should exit the loop.
+            break;
+          }
+          t2 -= d1;
         }
-        q--;
-        rm += d0;
-        if (rm < d0) {
-          // If rm overflows, the true value exceeds BN_ULONG and the next
-          // t2 comparison should exit the loop.
-          break;
+#else   // !BN_ULLONG
+        BN_ULONG t2l, t2h;
+        BN_UMULT_LOHI(t2l, t2h, d1, q);
+        for (;;) {
+          if (t2h < rm || (t2h == rm && t2l <= wnump[-2])) {
+            break;
+          }
+          q--;
+          rm += d0;
+          if (rm < d0) {
+            // If rm overflows, the true value exceeds BN_ULONG and the next
+            // t2 comparison should exit the loop.
+            break;
+          }
+          if (t2l < d1) {
+            t2h--;
+          }
+          t2l -= d1;
         }
-        t2 -= d1;
-      }
-#else  // !BN_ULLONG
-      BN_ULONG t2l, t2h;
-      BN_UMULT_LOHI(t2l, t2h, d1, q);
-      for (;;) {
-        if (t2h < rm ||
-            (t2h == rm && t2l <= wnump[-2])) {
-          break;
-        }
-        q--;
-        rm += d0;
-        if (rm < d0) {
-          // If rm overflows, the true value exceeds BN_ULONG and the next
-          // t2 comparison should exit the loop.
-          break;
-        }
-        if (t2l < d1) {
-          t2h--;
-        }
-        t2l -= d1;
-      }
 #endif  // !BN_ULLONG
+      }
     }
 
     // Knuth step D4 through D6: Now q' = q or q' = q + 1, and