OpenSSL: make final reduction in Montgomery multiplication constant-time.

(The issue was reported by Shay Gueron.)

The final reduction in Montgomery multiplication computes: if (X >= m) then X = X - m, else X is left unchanged.

In OpenSSL, this was done by computing T = X - m, doing a constant-time
selection of the *addresses* of X and T, and loading from the resulting
address. Because the address that is subsequently read depends on the secret
borrow, the copy is not cache-neutral.
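
For illustration, a minimal C sketch of the old pattern (the function and
parameter names are hypothetical, not the actual OpenSSL code); the mask picks
one of the two base addresses, and the copy loop then dereferences only the
buffer that was picked:

    #include <stddef.h>
    #include <stdint.h>

    /* Old scheme (sketch): select the source *address* in constant time,
     * then copy from it.  borrow is 1 if X < m (keep X) and 0 if X >= m
     * (take T = X - m).  Every load in the loop hits only the selected
     * buffer, so the set of cache lines touched depends on the secret
     * borrow. */
    static void copy_selected_address(uint32_t *rp, const uint32_t *x,
                                      const uint32_t *t, int borrow,
                                      size_t num) {
      uintptr_t mask = (uintptr_t)0 - (uintptr_t)(borrow != 0);
      const uint32_t *sp =
          (const uint32_t *)(((uintptr_t)x & mask) | ((uintptr_t)t & ~mask));
      for (size_t i = 0; i < num; i++) {
        rp[i] = sp[i];
      }
    }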

This patch changes the behaviour by loading both X and T into registers and
doing a constant-time selection of the *values*, so the memory accesses no
longer depend on secret data.
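
The selection itself is the usual xor/and/xor mask trick, mirroring the
((ap ^ np) & %rax) ^ np sequence in the assembly below; a minimal C sketch
under the same assumptions (hypothetical names, not the perlasm itself):

    #include <stddef.h>
    #include <stdint.h>

    /* New scheme (sketch): load both candidate words and select the *value*.
     * mask is all-ones to keep x[i] (borrow, i.e. X < m) and all-zeros to
     * take t[i], a word of T = X - m.  Both buffers are read on every
     * iteration, so the access pattern is independent of the secret borrow. */
    static void copy_selected_value(uint32_t *rp, const uint32_t *x,
                                    const uint32_t *t, uint32_t mask,
                                    size_t num) {
      for (size_t i = 0; i < num; i++) {
        rp[i] = ((x[i] ^ t[i]) & mask) ^ t[i];  /* mask ? x[i] : t[i] */
      }
    }

The SSE2 path in x86_64-mont.pl does the same select 128 bits at a time with
pxor/pand/pxor after broadcasting the mask with punpcklqdq.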

TODO(fork): only some of the fixes from the original patch still apply to
the 1.0.2 code.
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
index e8f6b05..0626b48 100644
--- a/crypto/bn/asm/x86-mont.pl
+++ b/crypto/bn/asm/x86-mont.pl
@@ -570,16 +570,15 @@
 	&jge	(&label("sub"));
 
 	&sbb	("eax",0);			# handle upmost overflow bit
-	&and	($tp,"eax");
-	&not	("eax");
-	&mov	($np,$rp);
-	&and	($np,"eax");
-	&or	($tp,$np);			# tp=carry?tp:rp
 
 &set_label("copy",16);				# copy or in-place refresh
-	&mov	("eax",&DWP(0,$tp,$num,4));
-	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
-	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&mov	("edx",&DWP(0,$tp,$num,4));
+	&mov	($np,&DWP(0,$rp,$num,4));
+	&xor	("edx",$np);			# conditional select
+	&and	("edx","eax");
+	&xor	("edx",$np);
+	&mov	(&DWP(0,$tp,$num,4),$j);	# zap temporary vector
+	&mov	(&DWP(0,$rp,$num,4),"edx");	# rp[i]=tp[i]
 	&dec	($num);
 	&jge	(&label("copy"));
 
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3803928..29c0f9c 100644
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -273,22 +273,21 @@
 	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
 	mov	8($ap,$i,8),%rax	# tp[i+1]
 	lea	1($i),$i		# i++
-	dec	$j			# doesnn't affect CF!
+	dec	$j			# doesn't affect CF!
 	jnz	.Lsub
 
 	sbb	\$0,%rax		# handle upmost overflow bit
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
+	mov	(%rsp,$i,8),$ap
+	mov	($rp,$i,8),$np
+	xor	$np,$ap			# conditional select:
+	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
+	xor	$np,$ap			# ap = borrow?tp:rp
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy
@@ -643,7 +642,6 @@
 $code.=<<___;
 	mov	16(%rsp,$num,8),$rp	# restore $rp
 	mov	0(%rsp),@ri[0]		# tp[0]
-	pxor	%xmm0,%xmm0
 	mov	8(%rsp),@ri[1]		# tp[1]
 	shr	\$2,$num		# num/=4
 	lea	(%rsp),$ap		# borrow ap for tp
@@ -681,35 +679,36 @@
 	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
 
 	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[0],%xmm0
+	punpcklqdq %xmm0,%xmm0		# extend mask to 128 bits
 	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
 	xor	$i,$i			# i=0
-	and	@ri[0],$ap
-	not	@ri[0]
-	mov	$rp,$np
-	and	@ri[0],$np
-	lea	-1($num),$j
-	or	$np,$ap			# ap=borrow?tp:rp
 
-	movdqu	($ap),%xmm1
-	movdqa	%xmm0,(%rsp)
-	movdqu	%xmm1,($rp)
+	mov	$num,$j
+	pxor	%xmm5,%xmm5
 	jmp	.Lcopy4x
 .align	16
-.Lcopy4x:					# copy or in-place refresh
-	movdqu	16($ap,$i),%xmm2
-	movdqu	32($ap,$i),%xmm1
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
-	movdqa	%xmm0,32(%rsp,$i)
-	movdqu	%xmm1,32($rp,$i)
+.Lcopy4x:				# copy or in-place refresh
+	movdqu	(%rsp,$i),%xmm2
+	movdqu  16(%rsp,$i),%xmm4
+	movdqu	($rp,$i),%xmm1
+	movdqu	16($rp,$i),%xmm3
+	pxor	%xmm1,%xmm2		# conditional select
+	pxor	%xmm3,%xmm4
+	pand	%xmm0,%xmm2
+	pand	%xmm0,%xmm4
+	pxor	%xmm1,%xmm2
+	pxor	%xmm3,%xmm4
+	movdqu	%xmm2,($rp,$i)
+	movdqu  %xmm4,16($rp,$i)
+	movdqa	%xmm5,(%rsp,$i)		# zap temporary vectors
+	movdqa	%xmm5,16(%rsp,$i)
+
 	lea	32($i),$i
 	dec	$j
 	jnz	.Lcopy4x
 
 	shl	\$2,$num
-	movdqu	16($ap,$i),%xmm2
-	movdqa	%xmm0,16(%rsp,$i)
-	movdqu	%xmm2,16($rp,$i)
 ___
 }
 $code.=<<___;
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
index c107df9..85386c1 100644
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -330,17 +330,16 @@
 
 	sbb	\$0,%rax		# handle upmost overflow bit
 	xor	$i,$i
-	and	%rax,$ap
-	not	%rax
-	mov	$rp,$np
-	and	%rax,$np
 	mov	$num,$j			# j=num
-	or	$np,$ap			# ap=borrow?tp:rp
 .align	16
 .Lcopy:					# copy or in-place refresh
-	mov	($ap,$i,8),%rax
+	mov	(%rsp,$i,8),$ap
+	mov	($rp,$i,8),$np
+	xor	$np,$ap			# conditional select:
+	and	%rax,$ap		# ((ap ^ np) & %rax) ^ np
+	xor	$np,$ap			# ap = borrow?tp:rp
 	mov	$i,(%rsp,$i,8)		# zap temporary vector
-	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	mov	$ap,($rp,$i,8)		# rp[i]=tp[i]
 	lea	1($i),$i
 	sub	\$1,$j
 	jnz	.Lcopy