bn/asm/*-mont.pl: fix memory access pattern in final subtraction.

Montgomery multiplication post-conditions in some code paths were
formally non-constant-time. The cache access pattern was result-neutral,
but slightly asymmetric, which might have produced a signal [if the
processor reordered loads and stores at run-time].

(Imported from upstream's 774ff8fed67e19d4f5f0df2f59050f2737abab2a.)
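
For illustration only (not part of the patch; the function and variable
names below are made up), here is a minimal C sketch of the masked
conditional copy the rewritten .Lcopy loops implement: instead of
selecting a source pointer (tp or rp) based on the borrow and then
copying from it, which makes the load addresses depend on secret data,
every word of both vectors is read unconditionally and the borrow is
widened into an all-ones/all-zeroes mask used to blend them; the
temporary vector is wiped in the same pass.

    #include <stddef.h>
    #include <stdint.h>

    /* 64-bit limbs as in x86_64-mont.pl; the 32-bit paths work the same way. */
    typedef uint64_t limb;

    /*
     * rp[] holds tp[] - np[] from the preceding .Lsub loop; tp[] still holds
     * the unreduced value.  'borrow' must be 0 or all-ones.
     */
    static void copy_conditional(limb *rp, limb *tp, size_t num, limb borrow)
    {
        limb keep_tp = borrow;      /* all-ones if tp - np underflowed      */
        limb keep_rp = ~borrow;     /* all-ones if tp - np is the result    */

        for (size_t i = 0; i < num; i++) {
            limb t = tp[i] & keep_tp;
            limb r = rp[i] & keep_rp;
            tp[i] = 0;              /* zap the temporary vector             */
            rp[i] = t | r;          /* rp[i] = borrow ? tp[i] : rp[i]       */
        }
    }

The armv4 path reaches the same end with the carry flag and a
conditional move (movcc) instead of masks, switching the loop test from
cmp to teq so the borrow survives across iterations.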

Change-Id: I77443fb79242b77e704c34d69f1de9e3162e9538
Reviewed-on: https://boringssl-review.googlesource.com/27987
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl
index 094e3b6..2ee389e 100644
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -266,14 +266,15 @@
 	mov	$tp,sp			@ "rewind" $tp
 	sub	$rp,$rp,$aj		@ "rewind" $rp
 
-	and	$ap,$tp,$nhi
-	bic	$np,$rp,$nhi
-	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
-
-.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+.Lcopy:	ldr	$tj,[$tp]		@ conditional copy
+	ldr	$aj,[$rp]
 	str	sp,[$tp],#4		@ zap tp
-	str	$tj,[$rp],#4
-	cmp	$tp,$num
+#ifdef	__thumb2__
+	it	cc
+#endif
+	movcc	$aj,$tj
+	str	$aj,[$rp],#4
+	teq	$tp,$num		@ preserve carry
 	bne	.Lcopy
 
 	mov	sp,$num
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index d15726b..214f2b0 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -604,16 +604,18 @@
 	&jge	(&label("sub"));
 
 	&sbb	("eax",0);			# handle upmost overflow bit
-	&and	($tp,"eax");
-	&not	("eax");
-	&mov	($np,$rp);
-	&and	($np,"eax");
-	&or	($tp,$np);			# tp=carry?tp:rp
+	&mov	("edx",-1);
+	&xor	("edx","eax");
+	&jmp	(&label("copy"));
 
-&set_label("copy",16);				# copy or in-place refresh
-	&mov	("eax",&DWP(0,$tp,$num,4));
-	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+&set_label("copy",16);				# conditional copy
+	&mov	($tp,&DWP($frame,"esp",$num,4));
+	&mov	($np,&DWP(0,$rp,$num,4));
 	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&and	($tp,"eax");
+	&and	($np,"edx");
+	&or	($np,$tp);
+	&mov	(&DWP(0,$rp,$num,4),$np);
 	&dec	($num);
 	&jge	(&label("copy"));
 
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index faa5cb0..0dd1d8b 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -299,30 +299,30 @@
 
 	xor	$i,$i			# i=0 and clear CF!
 	mov	(%rsp),%rax		# tp[0]
-	lea	(%rsp),$ap		# borrow ap for tp
 	mov	$num,$j			# j=num
-	jmp	.Lsub
+
 .align	16
 .Lsub:	sbb	($np,$i,8),%rax
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
-	mov	8($ap,$i,8),%rax	# tp[i+1]
+	mov	8(%rsp,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
 	dec	$j			# doesn't affect CF!
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	mov	\$-1,%rbx
+	xor	%rax,%rbx		# not %rax
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
-.align	16
-.Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
-	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+
+.Lcopy:					# conditional copy
+	mov	($rp,$i,8),%rcx
+	mov	(%rsp,$i,8),%rdx
+	and	%rbx,%rcx
+	and	%rax,%rdx
+	mov	$num,(%rsp,$i,8)	# zap temporary vector
+	or	%rcx,%rdx
+	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
@@ -712,7 +712,6 @@
 	mov	16(%rsp,$num,8),$rp	# restore $rp
 	lea	-4($num),$j
 	mov	0(%rsp),@ri[0]		# tp[0]
-	pxor	%xmm0,%xmm0
 	mov	8(%rsp),@ri[1]		# tp[1]
 	shr	\$2,$j			# j=num/4-1
 	lea	(%rsp),$ap		# borrow ap for tp
@@ -722,8 +721,7 @@
 	mov	16($ap),@ri[2]		# tp[2]
 	mov	24($ap),@ri[3]		# tp[3]
 	sbb	8($np),@ri[1]
-	jmp	.Lsub4x
-.align	16
+
 .Lsub4x:
 	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
 	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
@@ -750,34 +748,35 @@
 
 	sbb	\$0,@ri[0]		# handle upmost overflow bit
 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
-	xor	$i,$i			# i=0
-	and	@ri[0],$ap
-	not	@ri[0]
-	mov	$rp,$np
-	and	@ri[0],$np
-	lea	-4($num),$j
-	or	$np,$ap			# ap=borrow?tp:rp
-	shr	\$2,$j			# j=num/4-1
+	pxor	%xmm0,%xmm0
+	movq	@ri[0],%xmm4
+	pcmpeqd	%xmm5,%xmm5
+	pshufd	\$0,%xmm4,%xmm4
+	mov	$num,$j
+	pxor	%xmm4,%xmm5
+	shr	\$2,$j			# j=num/4
+	xor	%eax,%eax		# i=0
 
-	movdqu	($ap),%xmm1
-	movdqa	%xmm0,(%rsp)
-	movdqu	%xmm1,($rp)
 	jmp	.Lcopy4x
 .align	16
-.Lcopy4x:					# copy or in-place refresh
-	movdqu	16($ap,$i),%xmm2
-	movdqu	32($ap,$i),%xmm1
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
-	movdqa	%xmm0,32(%rsp,$i)
-	movdqu	%xmm1,32($rp,$i)
-	lea	32($i),$i
+.Lcopy4x:				# conditional copy
+	movdqa	(%rsp,%rax),%xmm1
+	movdqu	($rp,%rax),%xmm2
+	pand	%xmm4,%xmm1
+	pand	%xmm5,%xmm2
+	movdqa	16(%rsp,%rax),%xmm3
+	movdqa	%xmm0,(%rsp,%rax)
+	por	%xmm2,%xmm1
+	movdqu	16($rp,%rax),%xmm2
+	movdqu	%xmm1,($rp,%rax)
+	pand	%xmm4,%xmm3
+	pand	%xmm5,%xmm2
+	movdqa	%xmm0,16(%rsp,%rax)
+	por	%xmm2,%xmm3
+	movdqu	%xmm3,16($rp,%rax)
+	lea	32(%rax),%rax
 	dec	$j
 	jnz	.Lcopy4x
-
-	movdqu	16($ap,$i),%xmm2
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
 ___
 }
 $code.=<<___;
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
index 2119826..700df17 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -411,18 +411,19 @@
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
+	mov	\$-1,%rbx
+	xor	%rax,%rbx
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
-.align	16
-.Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
+
+.Lcopy:					# conditional copy
+	mov	($rp,$i,8),%rcx
+	mov	(%rsp,$i,8),%rdx
+	and	%rbx,%rcx
+	and	%rax,%rdx
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	or	%rcx,%rdx
+	mov	%rdx,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy