diff --git a/crypto/fipsmodule/bn/asm/x86_64-gcc.c b/crypto/fipsmodule/bn/asm/x86_64-gcc.c
index bfd770f..bcf12eb 100644
--- a/crypto/fipsmodule/bn/asm/x86_64-gcc.c
+++ b/crypto/fipsmodule/bn/asm/x86_64-gcc.c
@@ -280,7 +280,7 @@
 
 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
 
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+void bn_mul_comba8(BN_ULONG r[16], BN_ULONG a[8], BN_ULONG b[8]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -382,7 +382,7 @@
   r[15] = c1;
 }
 
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+void bn_mul_comba4(BN_ULONG r[8], BN_ULONG a[4], BN_ULONG b[4]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -420,7 +420,7 @@
   r[7] = c2;
 }
 
-void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -494,7 +494,7 @@
   r[15] = c1;
 }
 
-void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
diff --git a/crypto/fipsmodule/bn/generic.c b/crypto/fipsmodule/bn/generic.c
index 11c377c..44e0f2c 100644
--- a/crypto/fipsmodule/bn/generic.c
+++ b/crypto/fipsmodule/bn/generic.c
@@ -458,7 +458,7 @@
 
 #endif  // !BN_ULLONG
 
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+void bn_mul_comba8(BN_ULONG r[16], BN_ULONG a[8], BN_ULONG b[8]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -560,7 +560,7 @@
   r[15] = c1;
 }
 
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+void bn_mul_comba4(BN_ULONG r[8], BN_ULONG a[4], BN_ULONG b[4]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -598,7 +598,7 @@
   r[7] = c2;
 }
 
-void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
@@ -672,7 +672,7 @@
   r[15] = c1;
 }
 
-void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
   BN_ULONG c1, c2, c3;
 
   c1 = 0;
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index b1e0b0d..7fef38e 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -142,41 +142,41 @@
 
 #if !defined(_MSC_VER)
 // MSVC doesn't support two-word integers on 64-bit.
-#define BN_ULLONG	uint128_t
+#define BN_ULLONG uint128_t
 #endif
 
-#define BN_BITS2	64
-#define BN_BYTES	8
-#define BN_BITS4	32
-#define BN_MASK2	(0xffffffffffffffffUL)
-#define BN_MASK2l	(0xffffffffUL)
-#define BN_MASK2h	(0xffffffff00000000UL)
-#define BN_MASK2h1	(0xffffffff80000000UL)
+#define BN_BITS2 64
+#define BN_BYTES 8
+#define BN_BITS4 32
+#define BN_MASK2 (0xffffffffffffffffUL)
+#define BN_MASK2l (0xffffffffUL)
+#define BN_MASK2h (0xffffffff00000000UL)
+#define BN_MASK2h1 (0xffffffff80000000UL)
 #define BN_MONT_CTX_N0_LIMBS 1
-#define BN_TBIT		(0x8000000000000000UL)
-#define BN_DEC_CONV	(10000000000000000000UL)
-#define BN_DEC_NUM	19
+#define BN_TBIT (0x8000000000000000UL)
+#define BN_DEC_CONV (10000000000000000000UL)
+#define BN_DEC_NUM 19
 #define TOBN(hi, lo) ((BN_ULONG)(hi) << 32 | (lo))
 
 #elif defined(OPENSSL_32_BIT)
 
-#define BN_ULLONG	uint64_t
-#define BN_BITS2	32
-#define BN_BYTES	4
-#define BN_BITS4	16
-#define BN_MASK2	(0xffffffffUL)
-#define BN_MASK2l	(0xffffUL)
-#define BN_MASK2h1	(0xffff8000UL)
-#define BN_MASK2h	(0xffff0000UL)
+#define BN_ULLONG uint64_t
+#define BN_BITS2 32
+#define BN_BYTES 4
+#define BN_BITS4 16
+#define BN_MASK2 (0xffffffffUL)
+#define BN_MASK2l (0xffffUL)
+#define BN_MASK2h1 (0xffff8000UL)
+#define BN_MASK2h (0xffff0000UL)
 // On some 32-bit platforms, Montgomery multiplication is done using 64-bit
 // arithmetic with SIMD instructions. On such platforms, |BN_MONT_CTX::n0|
 // needs to be two words long. Only certain 32-bit platforms actually make use
 // of n0[1] and shorter R value would suffice for the others. However,
 // currently only the assembly files know which is which.
 #define BN_MONT_CTX_N0_LIMBS 2
-#define BN_TBIT		(0x80000000UL)
-#define BN_DEC_CONV	(1000000000UL)
-#define BN_DEC_NUM	9
+#define BN_TBIT (0x80000000UL)
+#define BN_DEC_CONV (1000000000UL)
+#define BN_DEC_NUM 9
 #define TOBN(hi, lo) (lo), (hi)
 
 #else
@@ -212,16 +212,50 @@
 // least significant word first.
 int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num);
 
-BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
-void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
-BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
-BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+// bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places
+// the result in |rp|. |ap| and |rp| must both be |num| words long. It returns
+// the carry word of the operation. |ap| and |rp| may be equal but otherwise may
+// not alias.
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w);
 
-void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
-void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
-void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
-void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+// bn_mul_words multiples |ap| by |w| and places the result in |rp|. |ap| and
+// |rp| must both be |num| words long. It returns the carry word of the
+// operation. |ap| and |rp| may be equal but otherwise may not alias.
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+
+// bn_sqr_words sets |rp[2*i]| and |rp[2*i+1]| to |ap[i]|'s square, for all |i|
+// up to |num|. |ap| is an array of |num| words and |rp| an array of |2*num|
+// words. |ap| and |rp| may not alias.
+//
+// This gives the contribution of the |ap[i]*ap[i]| terms when squaring |ap|.
+void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
+
+// bn_add_words adds |ap| to |bp| and places the result in |rp|, each of which
+// are |num| words long. It returns the carry bit, which is one if the operation
+// overflowed and zero otherwise. Any pair of |ap|, |bp|, and |rp| may be equal
+// to each other but otherwise may not alias.
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+
+// bn_sub_words subtracts |bp| from |ap| and places the result in |rp|. It
+// returns the borrow bit, which is one if the computation underflowed and zero
+// otherwise. Any pair of |ap|, |bp|, and |rp| may be equal to each other but
+// otherwise may not alias.
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int num);
+
+// bn_mul_comba4 sets |r| to the product of |a| and |b|.
+void bn_mul_comba4(BN_ULONG r[8], BN_ULONG a[4], BN_ULONG b[4]);
+
+// bn_mul_comba8 sets |r| to the product of |a| and |b|.
+void bn_mul_comba8(BN_ULONG r[16], BN_ULONG a[8], BN_ULONG b[8]);
+
+// bn_sqr_comba8 sets |r| to |a|^2.
+void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[4]);
+
+// bn_sqr_comba4 sets |r| to |a|^2.
+void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]);
 
 // bn_cmp_words returns a value less than, equal to or greater than zero if
 // the, length |n|, array |a| is less than, equal to or greater than |b|.
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index b6f3ff1..65f3c2b 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -659,37 +659,37 @@
 }
 
 // tmp must have 2*n words
-static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp) {
-  int i, j, max;
-  const BN_ULONG *ap;
-  BN_ULONG *rp;
-
-  max = n * 2;
-  ap = a;
-  rp = r;
+static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n,
+                          BN_ULONG *tmp) {
+  int max = n * 2;
+  const BN_ULONG *ap = a;
+  BN_ULONG *rp = r;
   rp[0] = rp[max - 1] = 0;
   rp++;
-  j = n;
+  int j = n;
 
+  // Compute the contribution of a[i] * a[j] for all i < j.
   if (--j > 0) {
     ap++;
     rp[j] = bn_mul_words(rp, ap, j, ap[-1]);
     rp += 2;
   }
 
-  for (i = n - 2; i > 0; i--) {
+  for (int i = n - 2; i > 0; i--) {
     j--;
     ap++;
     rp[j] = bn_mul_add_words(rp, ap, j, ap[-1]);
     rp += 2;
   }
 
+  // The final result fits in |max| words, so none of the following operations
+  // will overflow.
+
+  // Double |r|, giving the contribution of a[i] * a[j] for all i != j.
   bn_add_words(r, r, r, max);
 
-  // There will not be a carry
-
+  // Add in the contribution of a[i] * a[i] for all i.
   bn_sqr_words(tmp, a, n);
-
   bn_add_words(r, r, tmp, max);
 }
 
@@ -702,7 +702,8 @@
 // a[0]*b[0]
 // a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
 // a[1]*b[1]
-static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t) {
+static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2,
+                             BN_ULONG *t) {
   int n = n2 / 2;
   int zero, c1;
   BN_ULONG ln, lo, *p;
