Remove bn_sub_part_words assembly.

The assembly only existed for 32-bit x86, which is much less relevant
these days. It's also just a pile of ADDs, ADCs, etc., which compilers
should be able to figure out by now.

This frees us up to clean up that function, including the weird cl/dl
calling convention. No noticeable difference in RSA benchmarks:

Before:
Did 224 RSA 2048 signing operations in 1006100us (222.6 ops/sec)
Did 9240 RSA 2048 verify (same key) operations in 1078563us (8567.0 ops/sec)
Did 8541 RSA 2048 verify (fresh key) operations in 1064996us (8019.7 ops/sec)
Did 32 RSA 4096 signing operations in 1052851us (30.4 ops/sec)
Did 2365 RSA 4096 verify (same key) operations in 1093337us (2163.1 ops/sec)
Did 2222 RSA 4096 verify (fresh key) operations in 1090037us (2038.5 ops/sec)

After:
Did 231 RSA 2048 signing operations in 1018908us (226.7 ops/sec)
Did 9394 RSA 2048 verify (same key) operations in 1095548us (8574.7 ops/sec)
Did 8525 RSA 2048 verify (fresh key) operations in 1062449us (8023.9 ops/sec)
Did 32 RSA 4096 signing operations in 1050236us (30.5 ops/sec)
Did 2376 RSA 4096 verify (same key) operations in 1098509us (2162.9 ops/sec)
Did 2233 RSA 4096 verify (fresh key) operations in 1094724us (2039.8 ops/sec)

Bug: 314
Change-Id: I86a27b2550ab8bec2a9930cc509f4c29d6036b35
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/40144
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/bn/asm/bn-586.pl b/crypto/fipsmodule/bn/asm/bn-586.pl
index 05ef28c..5c52e05 100644
--- a/crypto/fipsmodule/bn/asm/bn-586.pl
+++ b/crypto/fipsmodule/bn/asm/bn-586.pl
@@ -27,7 +27,6 @@
 &bn_div_words("bn_div_words");
 &bn_add_words("bn_add_words");
 &bn_sub_words("bn_sub_words");
-&bn_sub_part_words("bn_sub_part_words");
 
 &asm_finish();
 
@@ -575,211 +574,3 @@
 
 	&function_end($name);
 	}
-
-sub bn_sub_part_words
-	{
-	local($name)=@_;
-
-	&function_begin($name,"");
-
-	&comment("");
-	$a="esi";
-	$b="edi";
-	$c="eax";
-	$r="ebx";
-	$tmp1="ecx";
-	$tmp2="edx";
-	$num="ebp";
-
-	&mov($r,&wparam(0));	# get r
-	 &mov($a,&wparam(1));	# get a
-	&mov($b,&wparam(2));	# get b
-	 &mov($num,&wparam(3));	# get num
-	&xor($c,$c);		# clear carry
-	 &and($num,0xfffffff8);	# num / 8
-
-	&jz(&label("aw_finish"));
-
-	&set_label("aw_loop",0);
-	for ($i=0; $i<8; $i++)
-		{
-		&comment("Round $i");
-
-		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
-		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
-		&sub($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &sub($tmp1,$tmp2);
-		&adc($c,0);
-		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
-		}
-
-	&comment("");
-	&add($a,32);
-	 &add($b,32);
-	&add($r,32);
-	 &sub($num,8);
-	&jnz(&label("aw_loop"));
-
-	&set_label("aw_finish",0);
-	&mov($num,&wparam(3));	# get num
-	&and($num,7);
-	 &jz(&label("aw_end"));
-
-	for ($i=0; $i<7; $i++)
-		{
-		&comment("Tail Round $i");
-		&mov($tmp1,&DWP(0,$a,"",0));	# *a
-		 &mov($tmp2,&DWP(0,$b,"",0));# *b
-		&sub($tmp1,$c);
-		 &mov($c,0);
-		&adc($c,$c);
-		 &sub($tmp1,$tmp2);
-		&adc($c,0);
-		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
-		&add($a, 4);
-		&add($b, 4);
-		&add($r, 4);
-		 &dec($num) if ($i != 6);
-		 &jz(&label("aw_end")) if ($i != 6);
-		}
-	&set_label("aw_end",0);
-
-	&cmp(&wparam(4),0);
-	&je(&label("pw_end"));
-
-	&mov($num,&wparam(4));	# get dl
-	&cmp($num,0);
-	&je(&label("pw_end"));
-	&jge(&label("pw_pos"));
-
-	&comment("pw_neg");
-	&mov($tmp2,0);
-	&sub($tmp2,$num);
-	&mov($num,$tmp2);
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("pw_neg_finish"));
-
-	&set_label("pw_neg_loop",0);
-	for ($i=0; $i<8; $i++)
-	{
-	    &comment("dl<0 Round $i");
-
-	    &mov($tmp1,0);
-	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
-	    &sub($tmp1,$c);
-	    &mov($c,0);
-	    &adc($c,$c);
-	    &sub($tmp1,$tmp2);
-	    &adc($c,0);
-	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
-	}
-
-	&comment("");
-	&add($b,32);
-	&add($r,32);
-	&sub($num,8);
-	&jnz(&label("pw_neg_loop"));
-
-	&set_label("pw_neg_finish",0);
-	&mov($tmp2,&wparam(4));	# get dl
-	&mov($num,0);
-	&sub($num,$tmp2);
-	&and($num,7);
-	&jz(&label("pw_end"));
-
-	for ($i=0; $i<7; $i++)
-	{
-	    &comment("dl<0 Tail Round $i");
-	    &mov($tmp1,0);
-	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
-	    &sub($tmp1,$c);
-	    &mov($c,0);
-	    &adc($c,$c);
-	    &sub($tmp1,$tmp2);
-	    &adc($c,0);
-	    &dec($num) if ($i != 6);
-	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
-	    &jz(&label("pw_end")) if ($i != 6);
-	}
-
-	&jmp(&label("pw_end"));
-
-	&set_label("pw_pos",0);
-
-	&and($num,0xfffffff8);	# num / 8
-	&jz(&label("pw_pos_finish"));
-
-	&set_label("pw_pos_loop",0);
-
-	for ($i=0; $i<8; $i++)
-	{
-	    &comment("dl>0 Round $i");
-
-	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-	    &sub($tmp1,$c);
-	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
-	    &jnc(&label("pw_nc".$i));
-	}
-
-	&comment("");
-	&add($a,32);
-	&add($r,32);
-	&sub($num,8);
-	&jnz(&label("pw_pos_loop"));
-
-	&set_label("pw_pos_finish",0);
-	&mov($num,&wparam(4));	# get dl
-	&and($num,7);
-	&jz(&label("pw_end"));
-
-	for ($i=0; $i<7; $i++)
-	{
-	    &comment("dl>0 Tail Round $i");
-	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-	    &sub($tmp1,$c);
-	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
-	    &jnc(&label("pw_tail_nc".$i));
-	    &dec($num) if ($i != 6);
-	    &jz(&label("pw_end")) if ($i != 6);
-	}
-	&mov($c,1);
-	&jmp(&label("pw_end"));
-
-	&set_label("pw_nc_loop",0);
-	for ($i=0; $i<8; $i++)
-	{
-	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
-	    &set_label("pw_nc".$i,0);
-	}
-
-	&comment("");
-	&add($a,32);
-	&add($r,32);
-	&sub($num,8);
-	&jnz(&label("pw_nc_loop"));
-
-	&mov($num,&wparam(4));	# get dl
-	&and($num,7);
-	&jz(&label("pw_nc_end"));
-
-	for ($i=0; $i<7; $i++)
-	{
-	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
-	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
-	    &set_label("pw_tail_nc".$i,0);
-	    &dec($num) if ($i != 6);
-	    &jz(&label("pw_nc_end")) if ($i != 6);
-	}
-
-	&set_label("pw_nc_end",0);
-	&mov($c,0);
-
-	&set_label("pw_end",0);
-
-#	&mov("eax",$c);		# $c is "eax"
-
-	&function_end($name);
-	}
diff --git a/crypto/fipsmodule/bn/mul.c b/crypto/fipsmodule/bn/mul.c
index 640d8cd..dd31580 100644
--- a/crypto/fipsmodule/bn/mul.c
+++ b/crypto/fipsmodule/bn/mul.c
@@ -119,16 +119,13 @@
   }
 }
 
-#if !defined(OPENSSL_X86) || defined(OPENSSL_NO_ASM)
 // Here follows specialised variants of bn_add_words() and bn_sub_words(). They
 // have the property performing operations on arrays of different sizes. The
 // sizes of those arrays is expressed through cl, which is the common length (
 // basicall, min(len(a),len(b)) ), and dl, which is the delta between the two
 // lengths, calculated as len(a)-len(b). All lengths are the number of
 // BN_ULONGs...  For the operations that require a result array as parameter,
-// it must have the length cl+abs(dl). These functions should probably end up
-// in bn_asm.c as soon as there are assembler counterparts for the systems that
-// use assembler files.
+// it must have the length cl+abs(dl).
 
 static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                   const BN_ULONG *b, int cl, int dl) {
@@ -282,11 +279,6 @@
 
   return c;
 }
-#else
-// On other platforms the function is defined in asm.
-BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
-                           int cl, int dl);
-#endif
 
 // bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value
 // and returning a mask of all ones if the result was negative and all zeros if
@@ -294,8 +286,7 @@
 // convention.
 //
 // TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention
-// is confusing. The trouble is 32-bit x86 implements |bn_sub_part_words| in
-// assembly, but we can probably just delete it?
+// is confusing.
 static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
                                       const BN_ULONG *b, int cl, int dl,
                                       BN_ULONG *tmp) {