Remove non-ASM version of |bn_mul_mont| in bn/generic.c.

When building in OPENSSL_NO_ASM mode, MSVC complains about unreachable
code. The redundant initialization of |i| is the main problem. The
skipping of the first test of the condition |i < num| with |goto| was
also confusing.

It turns out that |bn_mul_mont| is only called when assembly language
optimizations are available, but in that case the assembly language
versions will always be used instead. Although this code will be
compiled in |OPENSSL_NO_ASM| builds, it is never called in
|OPENSSL_NO_ASM| builds. Thus, it can just be removed.

Change-Id: Id551899b2602824978edc1a1cb0703b76516808d
Reviewed-on: https://boringssl-review.googlesource.com/5550
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/bn/generic.c b/crypto/bn/generic.c
index c240a54..7fd4819 100644
--- a/crypto/bn/generic.c
+++ b/crypto/bn/generic.c
@@ -1022,110 +1022,4 @@
   r[7] = c2;
 }
 
-#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
-/* This is essentially reference implementation, which may or may not
- * result in performance improvement. E.g. on IA-32 this routine was
- * observed to give 40% faster rsa1024 private key operations and 10%
- * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
- * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
- * reference implementation, one to be used as starting point for
- * platform-specific assembler. Mentioned numbers apply to compiler
- * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
- * can vary not only from platform to platform, but even for compiler
- * versions. Assembler vs. assembler improvement coefficients can
- * [and are known to] differ and are to be documented elsewhere. */
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
-  BN_ULONG c0, c1, ml, *tp, n0;
-#ifdef mul64
-  BN_ULONG mh;
-#endif
-  volatile BN_ULONG *vp;
-  int i = 0, j;
-
-#if 0 /* template for platform-specific implementation */
-	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
-#endif
-  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
-
-  n0 = *n0p;
-
-  c0 = 0;
-  ml = bp[0];
-#ifdef mul64
-  mh = HBITS(ml);
-  ml = LBITS(ml);
-  for (j = 0; j < num; ++j) {
-    mul(tp[j], ap[j], ml, mh, c0);
-  }
-#else
-  for (j = 0; j < num; ++j) {
-    mul(tp[j], ap[j], ml, c0);
-  }
-#endif
-
-  tp[num] = c0;
-  tp[num + 1] = 0;
-  goto enter;
-
-  for (; i < num; i++) {
-    c0 = 0;
-    ml = bp[i];
-#ifdef mul64
-    mh = HBITS(ml);
-    ml = LBITS(ml);
-    for (j = 0; j < num; ++j) {
-      mul_add(tp[j], ap[j], ml, mh, c0);
-    }
-#else
-    for (j = 0; j < num; ++j) {
-      mul_add(tp[j], ap[j], ml, c0);
-    }
-#endif
-    c1 = (tp[num] + c0) & BN_MASK2;
-    tp[num] = c1;
-    tp[num + 1] = (c1 < c0 ? 1 : 0);
-  enter:
-    c1 = tp[0];
-    ml = (c1 * n0) & BN_MASK2;
-    c0 = 0;
-#ifdef mul64
-    mh = HBITS(ml);
-    ml = LBITS(ml);
-    mul_add(c1, np[0], ml, mh, c0);
-#else
-    mul_add(c1, ml, np[0], c0);
-#endif
-    for (j = 1; j < num; j++) {
-      c1 = tp[j];
-#ifdef mul64
-      mul_add(c1, np[j], ml, mh, c0);
-#else
-      mul_add(c1, ml, np[j], c0);
-#endif
-      tp[j - 1] = c1 & BN_MASK2;
-    }
-    c1 = (tp[num] + c0) & BN_MASK2;
-    tp[num - 1] = c1;
-    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
-  }
-
-  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
-    c0 = bn_sub_words(rp, tp, np, num);
-    if (tp[num] != 0 || c0 == 0) {
-      for (i = 0; i < num + 2; i++) {
-        vp[i] = 0;
-      }
-      return 1;
-    }
-  }
-  for (i = 0; i < num; i++) {
-    rp[i] = tp[i], vp[i] = 0;
-  }
-  vp[num] = 0;
-  vp[num + 1] = 0;
-  return 1;
-}
-#endif
-
 #endif