bn/asm/*-mont.pl: fix memory access pattern in final subtraction.
Montgomery multiplication post-conditions in some of code paths were
formally non-constant time. Cache access pattern was result-neutral,
but a little bit asymmetric, which might have produced a signal [if
processor reordered load and stores at run-time].
(Imported from upstream's 774ff8fed67e19d4f5f0df2f59050f2737abab2a.)
Change-Id: I77443fb79242b77e704c34d69f1de9e3162e9538
Reviewed-on: https://boringssl-review.googlesource.com/27987
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl
index 094e3b6..2ee389e 100644
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -266,14 +266,15 @@
mov $tp,sp @ "rewind" $tp
sub $rp,$rp,$aj @ "rewind" $rp
- and $ap,$tp,$nhi
- bic $np,$rp,$nhi
- orr $ap,$ap,$np @ ap=borrow?tp:rp
-
-.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh
+.Lcopy: ldr $tj,[$tp] @ conditional copy
+ ldr $aj,[$rp]
str sp,[$tp],#4 @ zap tp
- str $tj,[$rp],#4
- cmp $tp,$num
+#ifdef __thumb2__
+ it cc
+#endif
+ movcc $aj,$tj
+ str $aj,[$rp],#4
+ teq $tp,$num @ preserve carry
bne .Lcopy
mov sp,$num
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index d15726b..214f2b0 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -604,16 +604,18 @@
&jge (&label("sub"));
&sbb ("eax",0); # handle upmost overflow bit
- &and ($tp,"eax");
- ¬ ("eax");
- &mov ($np,$rp);
- &and ($np,"eax");
- &or ($tp,$np); # tp=carry?tp:rp
+ &mov ("edx",-1);
+ &xor ("edx","eax");
+ &jmp (&label("copy"));
-&set_label("copy",16); # copy or in-place refresh
- &mov ("eax",&DWP(0,$tp,$num,4));
- &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+&set_label("copy",16); # conditional copy
+ &mov ($tp,&DWP($frame,"esp",$num,4));
+ &mov ($np,&DWP(0,$rp,$num,4));
&mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+ &and ($tp,"eax");
+ &and ($np,"edx");
+ &or ($np,$tp);
+ &mov (&DWP(0,$rp,$num,4),$np);
&dec ($num);
&jge (&label("copy"));
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index faa5cb0..0dd1d8b 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -299,30 +299,30 @@
xor $i,$i # i=0 and clear CF!
mov (%rsp),%rax # tp[0]
- lea (%rsp),$ap # borrow ap for tp
mov $num,$j # j=num
- jmp .Lsub
+
.align 16
.Lsub: sbb ($np,$i,8),%rax
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
- mov 8($ap,$i,8),%rax # tp[i+1]
+ mov 8(%rsp,$i,8),%rax # tp[i+1]
lea 1($i),$i # i++
dec $j # doesn't affect CF!
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx # not %rax
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
- mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
+ mov $num,(%rsp,$i,8) # zap temporary vector
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy
@@ -712,7 +712,6 @@
mov 16(%rsp,$num,8),$rp # restore $rp
lea -4($num),$j
mov 0(%rsp),@ri[0] # tp[0]
- pxor %xmm0,%xmm0
mov 8(%rsp),@ri[1] # tp[1]
shr \$2,$j # j=num/4-1
lea (%rsp),$ap # borrow ap for tp
@@ -722,8 +721,7 @@
mov 16($ap),@ri[2] # tp[2]
mov 24($ap),@ri[3] # tp[3]
sbb 8($np),@ri[1]
- jmp .Lsub4x
-.align 16
+
.Lsub4x:
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
@@ -750,34 +748,35 @@
sbb \$0,@ri[0] # handle upmost overflow bit
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
- xor $i,$i # i=0
- and @ri[0],$ap
- not @ri[0]
- mov $rp,$np
- and @ri[0],$np
- lea -4($num),$j
- or $np,$ap # ap=borrow?tp:rp
- shr \$2,$j # j=num/4-1
+ pxor %xmm0,%xmm0
+ movq @ri[0],%xmm4
+ pcmpeqd %xmm5,%xmm5
+ pshufd \$0,%xmm4,%xmm4
+ mov $num,$j
+ pxor %xmm4,%xmm5
+ shr \$2,$j # j=num/4
+ xor %eax,%eax # i=0
- movdqu ($ap),%xmm1
- movdqa %xmm0,(%rsp)
- movdqu %xmm1,($rp)
jmp .Lcopy4x
.align 16
-.Lcopy4x: # copy or in-place refresh
- movdqu 16($ap,$i),%xmm2
- movdqu 32($ap,$i),%xmm1
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
- movdqa %xmm0,32(%rsp,$i)
- movdqu %xmm1,32($rp,$i)
- lea 32($i),$i
+.Lcopy4x: # conditional copy
+ movdqa (%rsp,%rax),%xmm1
+ movdqu ($rp,%rax),%xmm2
+ pand %xmm4,%xmm1
+ pand %xmm5,%xmm2
+ movdqa 16(%rsp,%rax),%xmm3
+ movdqa %xmm0,(%rsp,%rax)
+ por %xmm2,%xmm1
+ movdqu 16($rp,%rax),%xmm2
+ movdqu %xmm1,($rp,%rax)
+ pand %xmm4,%xmm3
+ pand %xmm5,%xmm2
+ movdqa %xmm0,16(%rsp,%rax)
+ por %xmm2,%xmm3
+ movdqu %xmm3,16($rp,%rax)
+ lea 32(%rax),%rax
dec $j
jnz .Lcopy4x
-
- movdqu 16($ap,$i),%xmm2
- movdqa %xmm0,16(%rsp,$i)
- movdqu %xmm2,16($rp,$i)
___
}
$code.=<<___;
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
index 2119826..700df17 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont5.pl
@@ -411,18 +411,19 @@
jnz .Lsub
sbb \$0,%rax # handle upmost overflow bit
+ mov \$-1,%rbx
+ xor %rax,%rbx
xor $i,$i
- and %rax,$ap
- not %rax
- mov $rp,$np
- and %rax,$np
mov $num,$j # j=num
- or $np,$ap # ap=borrow?tp:rp
-.align 16
-.Lcopy: # copy or in-place refresh
- mov ($ap,$i,8),%rax
+
+.Lcopy: # conditional copy
+ mov ($rp,$i,8),%rcx
+ mov (%rsp,$i,8),%rdx
+ and %rbx,%rcx
+ and %rax,%rdx
mov $i,(%rsp,$i,8) # zap temporary vector
- mov %rax,($rp,$i,8) # rp[i]=tp[i]
+ or %rcx,%rdx
+ mov %rdx,($rp,$i,8) # rp[i]=tp[i]
lea 1($i),$i
sub \$1,$j
jnz .Lcopy