Update x86_64-mont5.pl and RSAZ comments a bit.

Back in https://boringssl-review.googlesource.com/c/boringssl/+/33268, I
wrote that I had no idea what the mont5 assembly was doing. In
preparation for fixing up some comments around
BN_mod_exp_mont_consttime, I wanted to understand whether we were still
making assumptions about cache lines.

Happily, for the mont5 code, the answer is no, we are not. We just make
a bunch of masks and apply them in the natural way. But we do require
16-byte alignment on the table, because we use movdqa to read out of it.

I didn't look as closely at RSAZ, but I believe it too is fine. It
fairly quickly tosses $power into an XMM register and builds up masks,
rather than incorporating it into address computations.

(Both scatter5 functions incorporate it into the address, but that's
part of table building, where the index is public. I've updated the
comments to note when the index is secret or public.)

There is one reference to cache lines in the comments of mont5.pl, in
computing $N. However, $N has been unused since
https://boringssl-review.googlesource.com/c/boringssl/+/7244. (There are
references to $N[0] and friends, but those refer to @N, which is a
completely unrelated variable.) Remove it.

Change-Id: I1fac0660dffcd1380572029de2e5baece60cddf6
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/55225
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
index 012e7aa..6c596e3 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -158,8 +158,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	paddd	%xmm0,%xmm1
 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
@@ -228,7 +232,8 @@
 }
 $code.=<<___;
 	por	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
@@ -321,7 +326,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bp),$bp
 
@@ -575,7 +581,6 @@
 ___
 		$bp="%r12";
 		$STRIDE=2**5*8;		# 5 is "window size"
-		$N=$STRIDE/4;		# should match cache line size
 		$tp=$i;
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
@@ -589,8 +594,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	paddd	%xmm0,%xmm1
 	pcmpeqd	%xmm5,%xmm0		# compare to 1,0
@@ -659,7 +668,8 @@
 }
 $code.=<<___;
 	por	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[0]
@@ -836,7 +846,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bp),$bp
 	movq	%xmm0,$m0		# m0=bp[i]
@@ -2227,7 +2238,6 @@
    ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
 my $rptr=$bptr;
 my $STRIDE=2**5*8;		# 5 is "window size"
-my $N=$STRIDE/4;		# should match cache line size
 $code.=<<___;
 	movdqa	0(%rax),%xmm0		# 00000001000000010000000000000000
 	movdqa	16(%rax),%xmm1		# 00000002000000020000000200000002
@@ -2240,8 +2250,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to index and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 $code.=<<___;
 	.byte	0x67
 	paddd	%xmm0,%xmm1
@@ -2310,7 +2324,8 @@
 }
 $code.=<<___;
 	pxor	%xmm1,%xmm0
-	pshufd	\$0x4e,%xmm0,%xmm1
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm0,%xmm1	# Swap upper and lower halves.
 	por	%xmm1,%xmm0
 	lea	$STRIDE($bptr),$bptr
 	movq	%xmm0,%rdx		# bp[0]
@@ -2430,7 +2445,8 @@
 }
 $code.=<<___;
 	por	%xmm5,%xmm4
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm4 as %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	lea	$STRIDE($bptr),$bptr
 	movq	%xmm0,%rdx		# m0=bp[i]
@@ -3434,6 +3450,15 @@
 .cfi_startproc
 	cmp	\$0, $num
 	jz	.Lscatter_epilogue
+
+	# $tbl stores 32 entries, t0 through t31. Each entry has $num words.
+	# They are interleaved in memory as follows:
+	#
+	#  t0[0]      t1[0]      t2[0]      ... t31[0]
+	#  t0[1]      t1[1]      t2[1]      ... t31[1]
+	#  ...
+	#  t0[$num-1] t1[$num-1] t2[$num-1] ... t31[$num-1]
+
 	lea	($tbl,$idx,8),$tbl
 .Lscatter:
 	mov	($inp),%rax
@@ -3471,8 +3496,12 @@
 	movdqa	%xmm1,%xmm2
 ___
 ########################################################################
-# calculate mask by comparing 0..31 to $idx and save result to stack
+# Calculate masks by comparing 0..31 to $idx and save result to stack.
 #
+# We compute sixteen 16-byte masks and store them on the stack. Mask i is stored
+# in `16*i - 128`(%rax) and contains the comparisons for idx == 2*i and
+# idx == 2*i + 1 in its lower and upper halves, respectively. Mask calculations
+# are scheduled in groups of four.
 for($i=0;$i<$STRIDE/16;$i+=4) {
 $code.=<<___;
 	paddd	%xmm0,%xmm1
@@ -3510,6 +3539,8 @@
 	pxor	%xmm5,%xmm5
 ___
 for($i=0;$i<$STRIDE/16;$i+=4) {
+# Combine the masks with the corresponding table entries to select the correct
+# entry.
 $code.=<<___;
 	movdqa	`16*($i+0)-128`(%r11),%xmm0
 	movdqa	`16*($i+1)-128`(%r11),%xmm1
@@ -3528,7 +3559,8 @@
 $code.=<<___;
 	por	%xmm5,%xmm4
 	lea	$STRIDE(%r11),%r11
-	pshufd	\$0x4e,%xmm4,%xmm0
+	# Combine the upper and lower halves of %xmm0.
+	pshufd	\$0x4e,%xmm4,%xmm0	# Swap upper and lower halves.
 	por	%xmm4,%xmm0
 	movq	%xmm0,($out)		# m0=bp[0]
 	lea	8($out),$out
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index ec87dd8..9329ce7 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -374,7 +374,8 @@
 // bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it
 // by |ap| modulo |np|, and stores the result in |rp|. The values are |num|
 // words long and represented in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|.
+// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least
+// 16 bytes. |power| must be less than 32 and is treated as secret.
 //
 // WARNING: This function implements Almost Montgomery Multiplication from
 // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
@@ -384,20 +385,22 @@
                          const BN_ULONG *n0, int num, int power);
 
 // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of
-// |table| are |num| words long. |power| must be less than 32. |table| must be
-// 32*|num| words long.
+// |table| are |num| words long. |power| must be less than 32 and is treated as
+// public. |table| must be 32*|num| words long. |table| must be aligned to at
+// least 16 bytes.
 void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table,
                  size_t power);
 
 // bn_gather5 loads index |power| of |table| and stores it in |out|. |out| and
-// each entry of |table| are |num| words long. |power| must be less than 32.
+// each entry of |table| are |num| words long. |power| must be less than 32 and
+// is treated as secret. |table| must be aligned to at least 16 bytes.
 void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power);
 
 // bn_power5 squares |ap| five times and multiplies it by the value stored at
 // index |power| of |table|, modulo |np|. It stores the result in |rp|. The
 // values are |num| words long and represented in Montgomery form. |n0| is a
 // pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible
-// by 8.
+// by 8. |power| must be less than 32 and is treated as secret.
 //
 // WARNING: This function implements Almost Montgomery Multiplication from
 // https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced.
diff --git a/crypto/fipsmodule/bn/rsaz_exp.h b/crypto/fipsmodule/bn/rsaz_exp.h
index bc7a439..67f1cab 100644
--- a/crypto/fipsmodule/bn/rsaz_exp.h
+++ b/crypto/fipsmodule/bn/rsaz_exp.h
@@ -79,13 +79,15 @@
                         const BN_ULONG n[40], BN_ULONG k, int count);
 
 // rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be
-// positive and at most 31. Note the table only uses 18 |BN_ULONG|s per entry
-// instead of 40. It packs two 29-bit limbs into each |BN_ULONG| and only stores
-// 36 limbs rather than the padded 40.
+// positive and at most 31. It is treated as public. Note the table only uses 18
+// |BN_ULONG|s per entry instead of 40. It packs two 29-bit limbs into each
+// |BN_ULONG| and only stores 36 limbs rather than the padded 40.
 void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40],
                              int i);
 
-// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|.
+// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|. |i|
+// must be positive and at most 31. It is treated as secret. |tbl| must be
+// aligned to 32 bytes.
 void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18],
                             int i);