Remove bn_sub_part_words assembly.
The assembly only existed for 32-bit x86, which is much less relevant
these days. It's also just a pile of ADDs, ADCs, etc., which compilers
should be able to figure out by now.
This frees us up to clean up that function, including the weird cl/dl
calling convention. No noticeable difference in RSA benchmarks:
Before:
Did 224 RSA 2048 signing operations in 1006100us (222.6 ops/sec)
Did 9240 RSA 2048 verify (same key) operations in 1078563us (8567.0 ops/sec)
Did 8541 RSA 2048 verify (fresh key) operations in 1064996us (8019.7 ops/sec)
Did 32 RSA 4096 signing operations in 1052851us (30.4 ops/sec)
Did 2365 RSA 4096 verify (same key) operations in 1093337us (2163.1 ops/sec)
Did 2222 RSA 4096 verify (fresh key) operations in 1090037us (2038.5 ops/sec)
After:
Did 231 RSA 2048 signing operations in 1018908us (226.7 ops/sec)
Did 9394 RSA 2048 verify (same key) operations in 1095548us (8574.7 ops/sec)
Did 8525 RSA 2048 verify (fresh key) operations in 1062449us (8023.9 ops/sec)
Did 32 RSA 4096 signing operations in 1050236us (30.5 ops/sec)
Did 2376 RSA 4096 verify (same key) operations in 1098509us (2162.9 ops/sec)
Did 2233 RSA 4096 verify (fresh key) operations in 1094724us (2039.8 ops/sec)
Bug: 314
Change-Id: I86a27b2550ab8bec2a9930cc509f4c29d6036b35
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40144
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/asm/bn-586.pl b/crypto/fipsmodule/bn/asm/bn-586.pl
index 05ef28c..5c52e05 100644
--- a/crypto/fipsmodule/bn/asm/bn-586.pl
+++ b/crypto/fipsmodule/bn/asm/bn-586.pl
@@ -27,7 +27,6 @@
&bn_div_words("bn_div_words");
&bn_add_words("bn_add_words");
&bn_sub_words("bn_sub_words");
-&bn_sub_part_words("bn_sub_part_words");
&asm_finish();
@@ -575,211 +574,3 @@
&function_end($name);
}
-
-sub bn_sub_part_words
- {
- local($name)=@_;
-
- &function_begin($name,"");
-
- &comment("");
- $a="esi";
- $b="edi";
- $c="eax";
- $r="ebx";
- $tmp1="ecx";
- $tmp2="edx";
- $num="ebp";
-
- &mov($r,&wparam(0)); # get r
- &mov($a,&wparam(1)); # get a
- &mov($b,&wparam(2)); # get b
- &mov($num,&wparam(3)); # get num
- &xor($c,$c); # clear carry
- &and($num,0xfffffff8); # num / 8
-
- &jz(&label("aw_finish"));
-
- &set_label("aw_loop",0);
- for ($i=0; $i<8; $i++)
- {
- &comment("Round $i");
-
- &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
- &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
- &sub($tmp1,$c);
- &mov($c,0);
- &adc($c,$c);
- &sub($tmp1,$tmp2);
- &adc($c,0);
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- }
-
- &comment("");
- &add($a,32);
- &add($b,32);
- &add($r,32);
- &sub($num,8);
- &jnz(&label("aw_loop"));
-
- &set_label("aw_finish",0);
- &mov($num,&wparam(3)); # get num
- &and($num,7);
- &jz(&label("aw_end"));
-
- for ($i=0; $i<7; $i++)
- {
- &comment("Tail Round $i");
- &mov($tmp1,&DWP(0,$a,"",0)); # *a
- &mov($tmp2,&DWP(0,$b,"",0));# *b
- &sub($tmp1,$c);
- &mov($c,0);
- &adc($c,$c);
- &sub($tmp1,$tmp2);
- &adc($c,0);
- &mov(&DWP(0,$r,"",0),$tmp1); # *r
- &add($a, 4);
- &add($b, 4);
- &add($r, 4);
- &dec($num) if ($i != 6);
- &jz(&label("aw_end")) if ($i != 6);
- }
- &set_label("aw_end",0);
-
- &cmp(&wparam(4),0);
- &je(&label("pw_end"));
-
- &mov($num,&wparam(4)); # get dl
- &cmp($num,0);
- &je(&label("pw_end"));
- &jge(&label("pw_pos"));
-
- &comment("pw_neg");
- &mov($tmp2,0);
- &sub($tmp2,$num);
- &mov($num,$tmp2);
- &and($num,0xfffffff8); # num / 8
- &jz(&label("pw_neg_finish"));
-
- &set_label("pw_neg_loop",0);
- for ($i=0; $i<8; $i++)
- {
- &comment("dl<0 Round $i");
-
- &mov($tmp1,0);
- &mov($tmp2,&DWP($i*4,$b,"",0)); # *b
- &sub($tmp1,$c);
- &mov($c,0);
- &adc($c,$c);
- &sub($tmp1,$tmp2);
- &adc($c,0);
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- }
-
- &comment("");
- &add($b,32);
- &add($r,32);
- &sub($num,8);
- &jnz(&label("pw_neg_loop"));
-
- &set_label("pw_neg_finish",0);
- &mov($tmp2,&wparam(4)); # get dl
- &mov($num,0);
- &sub($num,$tmp2);
- &and($num,7);
- &jz(&label("pw_end"));
-
- for ($i=0; $i<7; $i++)
- {
- &comment("dl<0 Tail Round $i");
- &mov($tmp1,0);
- &mov($tmp2,&DWP($i*4,$b,"",0));# *b
- &sub($tmp1,$c);
- &mov($c,0);
- &adc($c,$c);
- &sub($tmp1,$tmp2);
- &adc($c,0);
- &dec($num) if ($i != 6);
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- &jz(&label("pw_end")) if ($i != 6);
- }
-
- &jmp(&label("pw_end"));
-
- &set_label("pw_pos",0);
-
- &and($num,0xfffffff8); # num / 8
- &jz(&label("pw_pos_finish"));
-
- &set_label("pw_pos_loop",0);
-
- for ($i=0; $i<8; $i++)
- {
- &comment("dl>0 Round $i");
-
- &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
- &sub($tmp1,$c);
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- &jnc(&label("pw_nc".$i));
- }
-
- &comment("");
- &add($a,32);
- &add($r,32);
- &sub($num,8);
- &jnz(&label("pw_pos_loop"));
-
- &set_label("pw_pos_finish",0);
- &mov($num,&wparam(4)); # get dl
- &and($num,7);
- &jz(&label("pw_end"));
-
- for ($i=0; $i<7; $i++)
- {
- &comment("dl>0 Tail Round $i");
- &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
- &sub($tmp1,$c);
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- &jnc(&label("pw_tail_nc".$i));
- &dec($num) if ($i != 6);
- &jz(&label("pw_end")) if ($i != 6);
- }
- &mov($c,1);
- &jmp(&label("pw_end"));
-
- &set_label("pw_nc_loop",0);
- for ($i=0; $i<8; $i++)
- {
- &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- &set_label("pw_nc".$i,0);
- }
-
- &comment("");
- &add($a,32);
- &add($r,32);
- &sub($num,8);
- &jnz(&label("pw_nc_loop"));
-
- &mov($num,&wparam(4)); # get dl
- &and($num,7);
- &jz(&label("pw_nc_end"));
-
- for ($i=0; $i<7; $i++)
- {
- &mov($tmp1,&DWP($i*4,$a,"",0)); # *a
- &mov(&DWP($i*4,$r,"",0),$tmp1); # *r
- &set_label("pw_tail_nc".$i,0);
- &dec($num) if ($i != 6);
- &jz(&label("pw_nc_end")) if ($i != 6);
- }
-
- &set_label("pw_nc_end",0);
- &mov($c,0);
-
- &set_label("pw_end",0);
-
-# &mov("eax",$c); # $c is "eax"
-
- &function_end($name);
- }
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index 640d8cd..dd31580 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -119,16 +119,13 @@
}
}
-#if !defined(OPENSSL_X86) || defined(OPENSSL_NO_ASM)
// Here follows specialised variants of bn_add_words() and bn_sub_words(). They
// have the property performing operations on arrays of different sizes. The
// sizes of those arrays is expressed through cl, which is the common length (
// basicall, min(len(a),len(b)) ), and dl, which is the delta between the two
// lengths, calculated as len(a)-len(b). All lengths are the number of
// BN_ULONGs... For the operations that require a result array as parameter,
-// it must have the length cl+abs(dl). These functions should probably end up
-// in bn_asm.c as soon as there are assembler counterparts for the systems that
-// use assembler files.
+// it must have the length cl+abs(dl).
static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, int cl, int dl) {
@@ -282,11 +279,6 @@
return c;
}
-#else
-// On other platforms the function is defined in asm.
-BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
- int cl, int dl);
-#endif
// bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value
// and returning a mask of all ones if the result was negative and all zeros if
@@ -294,8 +286,7 @@
// convention.
//
// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
-// is confusing. The trouble is 32-bit x86 implements |bn_sub_part_words| in
-// assembly, but we can probably just delete it?
+// is confusing.
static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
const BN_ULONG *b, int cl, int dl,
BN_ULONG *tmp) {