Remove bn_sub_part_words assembly. The assembly only existed for 32-bit x86, which is much less relevant these days. It's also just a pile of ADDs, ADCs, etc., which compilers should be able to figure out by now. This frees us up to clean up that function, including the weird cl/dl calling convention. No noticeable difference in RSA benchmarks: Before: Did 224 RSA 2048 signing operations in 1006100us (222.6 ops/sec) Did 9240 RSA 2048 verify (same key) operations in 1078563us (8567.0 ops/sec) Did 8541 RSA 2048 verify (fresh key) operations in 1064996us (8019.7 ops/sec) Did 32 RSA 4096 signing operations in 1052851us (30.4 ops/sec) Did 2365 RSA 4096 verify (same key) operations in 1093337us (2163.1 ops/sec) Did 2222 RSA 4096 verify (fresh key) operations in 1090037us (2038.5 ops/sec) After: Did 231 RSA 2048 signing operations in 1018908us (226.7 ops/sec) Did 9394 RSA 2048 verify (same key) operations in 1095548us (8574.7 ops/sec) Did 8525 RSA 2048 verify (fresh key) operations in 1062449us (8023.9 ops/sec) Did 32 RSA 4096 signing operations in 1050236us (30.5 ops/sec) Did 2376 RSA 4096 verify (same key) operations in 1098509us (2162.9 ops/sec) Did 2233 RSA 4096 verify (fresh key) operations in 1094724us (2039.8 ops/sec) Bug: 314 Change-Id: I86a27b2550ab8bec2a9930cc509f4c29d6036b35 Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40144 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/asm/bn-586.pl b/crypto/fipsmodule/bn/asm/bn-586.pl index 05ef28c..5c52e05 100644 --- a/crypto/fipsmodule/bn/asm/bn-586.pl +++ b/crypto/fipsmodule/bn/asm/bn-586.pl
@@ -27,7 +27,6 @@ &bn_div_words("bn_div_words"); &bn_add_words("bn_add_words"); &bn_sub_words("bn_sub_words"); -&bn_sub_part_words("bn_sub_part_words"); &asm_finish(); @@ -575,211 +574,3 @@ &function_end($name); } - -sub bn_sub_part_words - { - local($name)=@_; - - &function_begin($name,""); - - &comment(""); - $a="esi"; - $b="edi"; - $c="eax"; - $r="ebx"; - $tmp1="ecx"; - $tmp2="edx"; - $num="ebp"; - - &mov($r,&wparam(0)); # get r - &mov($a,&wparam(1)); # get a - &mov($b,&wparam(2)); # get b - &mov($num,&wparam(3)); # get num - &xor($c,$c); # clear carry - &and($num,0xfffffff8); # num / 8 - - &jz(&label("aw_finish")); - - &set_label("aw_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($a,32); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("aw_loop")); - - &set_label("aw_finish",0); - &mov($num,&wparam(3)); # get num - &and($num,7); - &jz(&label("aw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("Tail Round $i"); - &mov($tmp1,&DWP(0,$a,"",0)); # *a - &mov($tmp2,&DWP(0,$b,"",0));# *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP(0,$r,"",0),$tmp1); # *r - &add($a, 4); - &add($b, 4); - &add($r, 4); - &dec($num) if ($i != 6); - &jz(&label("aw_end")) if ($i != 6); - } - &set_label("aw_end",0); - - &cmp(&wparam(4),0); - &je(&label("pw_end")); - - &mov($num,&wparam(4)); # get dl - &cmp($num,0); - &je(&label("pw_end")); - &jge(&label("pw_pos")); - - &comment("pw_neg"); - &mov($tmp2,0); - &sub($tmp2,$num); - &mov($num,$tmp2); - &and($num,0xfffffff8); # num / 8 - &jz(&label("pw_neg_finish")); - - &set_label("pw_neg_loop",0); - for ($i=0; $i<8; $i++) - { - &comment("dl<0 Round $i"); - - &mov($tmp1,0); - &mov($tmp2,&DWP($i*4,$b,"",0)); # *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - } - - &comment(""); - &add($b,32); - &add($r,32); - &sub($num,8); - &jnz(&label("pw_neg_loop")); - - &set_label("pw_neg_finish",0); - &mov($tmp2,&wparam(4)); # get dl - &mov($num,0); - &sub($num,$tmp2); - &and($num,7); - &jz(&label("pw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("dl<0 Tail Round $i"); - &mov($tmp1,0); - &mov($tmp2,&DWP($i*4,$b,"",0));# *b - &sub($tmp1,$c); - &mov($c,0); - &adc($c,$c); - &sub($tmp1,$tmp2); - &adc($c,0); - &dec($num) if ($i != 6); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - &jz(&label("pw_end")) if ($i != 6); - } - - &jmp(&label("pw_end")); - - &set_label("pw_pos",0); - - &and($num,0xfffffff8); # num / 8 - &jz(&label("pw_pos_finish")); - - &set_label("pw_pos_loop",0); - - for ($i=0; $i<8; $i++) - { - &comment("dl>0 Round $i"); - - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &sub($tmp1,$c); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - &jnc(&label("pw_nc".$i)); - } - - &comment(""); - &add($a,32); - &add($r,32); - &sub($num,8); - &jnz(&label("pw_pos_loop")); - - &set_label("pw_pos_finish",0); - &mov($num,&wparam(4)); # get dl - &and($num,7); - &jz(&label("pw_end")); - - for ($i=0; $i<7; $i++) - { - &comment("dl>0 Tail Round $i"); - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &sub($tmp1,$c); - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - &jnc(&label("pw_tail_nc".$i)); - &dec($num) if ($i != 6); - &jz(&label("pw_end")) if ($i != 6); - } - &mov($c,1); - &jmp(&label("pw_end")); - - &set_label("pw_nc_loop",0); - for ($i=0; $i<8; $i++) - { - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - &set_label("pw_nc".$i,0); - } - - &comment(""); - &add($a,32); - &add($r,32); - &sub($num,8); - &jnz(&label("pw_nc_loop")); - - &mov($num,&wparam(4)); # get dl - &and($num,7); - &jz(&label("pw_nc_end")); - - for ($i=0; $i<7; $i++) - { - &mov($tmp1,&DWP($i*4,$a,"",0)); # *a - &mov(&DWP($i*4,$r,"",0),$tmp1); # *r - &set_label("pw_tail_nc".$i,0); - &dec($num) if ($i != 6); - &jz(&label("pw_nc_end")) if ($i != 6); - } - - &set_label("pw_nc_end",0); - &mov($c,0); - - &set_label("pw_end",0); - -# &mov("eax",$c); # $c is "eax" - - &function_end($name); - }
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c index 640d8cd..dd31580 100644 --- a/crypto/fipsmodule/bn/mul.c +++ b/crypto/fipsmodule/bn/mul.c
@@ -119,16 +119,13 @@ } } -#if !defined(OPENSSL_X86) || defined(OPENSSL_NO_ASM) // Here follows specialised variants of bn_add_words() and bn_sub_words(). They // have the property performing operations on arrays of different sizes. The // sizes of those arrays is expressed through cl, which is the common length ( // basicall, min(len(a),len(b)) ), and dl, which is the delta between the two // lengths, calculated as len(a)-len(b). All lengths are the number of // BN_ULONGs... For the operations that require a result array as parameter, -// it must have the length cl+abs(dl). These functions should probably end up -// in bn_asm.c as soon as there are assembler counterparts for the systems that -// use assembler files. +// it must have the length cl+abs(dl). static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl, int dl) { @@ -282,11 +279,6 @@ return c; } -#else -// On other platforms the function is defined in asm. -BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - int cl, int dl); -#endif // bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value // and returning a mask of all ones if the result was negative and all zeros if @@ -294,8 +286,7 @@ // convention. // // TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention -// is confusing. The trouble is 32-bit x86 implements |bn_sub_part_words| in -// assembly, but we can probably just delete it? +// is confusing. static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int cl, int dl, BN_ULONG *tmp) {