OpenSSL: make final reduction in Montgomery multiplication constant-time.
(The issue was reported by Shay Gueron.)
The final reduction in Montgomery multiplication computes

  if (X >= m) then X = X - m else X = X
In OpenSSL, this was done by computing T = X - m, doing a constant-time
selection of the *addresses* of X and T, and loading from the resulting
address. But this is not cache-neutral: the address of the final load
depends on the secret borrow and can leak through cache timing.
This patch changes the behaviour by loading both X and T into registers, and
doing a constant-time selection of the *values*.
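
As a minimal, illustrative sketch (not part of the patch; the function and
variable names are hypothetical), the scalar value-select idiom used in the
.Lcopy loop corresponds to the following C, where mask is all-ones if the
subtraction borrowed (X < m) and all-zeros otherwise:

  #include <stddef.h>
  #include <stdint.h>

  /* Write (borrow ? tp[i] : rp[i]) into rp[i] without a secret-dependent
   * branch or load address, using ((tp ^ rp) & mask) ^ rp. */
  static void constant_time_select_words(uint64_t *rp, const uint64_t *tp,
                                         size_t num, uint64_t mask) {
      for (size_t i = 0; i < num; i++) {
          rp[i] = ((tp[i] ^ rp[i]) & mask) ^ rp[i];
      }
  }

The SSE2 copy loop in the second hunk applies the same select to 128-bit
lanes with pxor/pand/pxor, after broadcasting the borrow mask to both
64-bit halves of %xmm0 with punpcklqdq.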
TODO(fork): only some of the fixes from the original patch still apply to
the 1.0.2 code.
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
index 3803928..29c0f9c 100644
--- a/crypto/bn/asm/x86_64-mont.pl
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -273,22 +273,21 @@
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
mov 8($ap,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
- dec $j # doesnn't affect CF!
+ dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
.align 16
.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
+ mov (%rsp,$i,8),$ap
+ mov ($rp,$i,8),$np
+ xor $np,$ap # conditional select:
+ and %rax,$ap # ((ap ^ np) & %rax) ^ np
+ xor $np,$ap # ap = borrow?tp:rp
mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ mov $ap,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
@@ -643,7 +642,6 @@
$code.=<<___;
mov 16(%rsp,$num,8),$rp # restore $rp
mov 0(%rsp),@ri[0] # tp[0]
- pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$num # num/=4
lea (%rsp),$ap # borrow ap for tp
@@ -681,35 +679,36 @@
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
sbb \$0,@ri[0] # handle upmost overflow bit
+ mov @ri[0],%xmm0
+ punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
xor $i,$i # i=0
- and @ri[0],$ap
- not @ri[0]
- mov $rp,$np
- and @ri[0],$np
- lea -1($num),$j
- or $np,$ap # ap=borrow?tp:rp
- movdqu ($ap),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,($rp)
+ mov $num,$j
+ pxor %xmm5,%xmm5
jmp .Lcopy4x
.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu 16($ap,$i),%xmm2
- movdqu 32($ap,$i),%xmm1
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
- movdqa %xmm0,32(%rsp,$i)
- movdqu %xmm1,32($rp,$i)
+.Lcopy4x: # copy or in-place refresh
+ movdqu (%rsp,$i),%xmm2
+ movdqu 16(%rsp,$i),%xmm4
+ movdqu ($rp,$i),%xmm1
+ movdqu 16($rp,$i),%xmm3
+ pxor %xmm1,%xmm2 # conditional select
+ pxor %xmm3,%xmm4
+ pand %xmm0,%xmm2
+ pand %xmm0,%xmm4
+ pxor %xmm1,%xmm2
+ pxor %xmm3,%xmm4
+ movdqu %xmm2,($rp,$i)
+ movdqu %xmm4,16($rp,$i)
+ movdqa %xmm5,(%rsp,$i) # zap temporary vectors
+ movdqa %xmm5,16(%rsp,$i)
+
lea 32($i),$i
dec $j
jnz .Lcopy4x
shl \$2,$num
- movdqu 16($ap,$i),%xmm2
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;