bn: Change return type of `bn_mul_mont_*` internals to `void`.

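These functions used to return one on success, and the 32-bit
implementations additionally returned zero when |num| was below the
smallest size they support (two words on ARM, four on x86). The callers
in montgomery.cc.inc already gate the assembly on |num| >= 128 /
|BN_BITS2|, so the failure path was unreachable. Remove the return value
and make the size requirement an explicit precondition instead, which
also lets the C dispatchers drop their assert-and-abort handling of a
case that could never happen.

A minimal sketch of the resulting calling convention, for reference.
|bn_mul_mont|, |BN_MONT_CTX|, |BN_BITS2|, and |BN_MONTGOMERY_MAX_WORDS|
are the names used in this CL; |mont_mul_sketch| is a hypothetical
caller, not code added by this change:

    #include <assert.h>

    // The caller, not the assembly, now guarantees the size bounds,
    // so there is no return value to check.
    static void mont_mul_sketch(BN_ULONG *rp, const BN_ULONG *ap,
                                const BN_ULONG *bp,
                                const BN_MONT_CTX *mont, size_t num) {
      assert(num >= 128 / BN_BITS2);           // minimum input size
      assert(num <= BN_MONTGOMERY_MAX_WORDS);  // stack allocation bound
      bn_mul_mont(rp, ap, bp, mont->N.d, mont->n0, num);  // returns void
    }
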
Change-Id: Id8be6697df6a6f6613105c67c96250cc084595b2
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/75647
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/bn/asm/armv4-mont.pl b/crypto/fipsmodule/bn/asm/armv4-mont.pl
index acae4e5..d3c867d 100644
--- a/crypto/fipsmodule/bn/asm/armv4-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv4-mont.pl
@@ -121,14 +121,9 @@
 bn_mul_mont_nohw:
 	ldr	ip,[sp,#4]		@ load num
 	stmdb	sp!,{r0,r2}		@ sp points at argument block
-	cmp	ip,#2
+	@ No return value. Instead, the caller must ensure num >= 2
 	mov	$num,ip			@ load num
-#ifdef	__thumb2__
-	ittt	lt
-#endif
-	movlt	r0,#0
-	addlt	sp,sp,#2*4
-	blt	.Labrt
+	@ No return value
 
 	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
 
@@ -262,8 +257,7 @@
 	add	sp,sp,#4		@ skip over tp[num+1]
 	ldmia	sp!,{r4-r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
-	mov	r0,#1
-.Labrt:
+	@ No return value
 #if __ARM_ARCH>=5
 	ret				@ bx lr
 #else
@@ -717,6 +711,7 @@
 	mov	sp,ip
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
+	@ No return value
 	ret						@ bx lr
 .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
diff --git a/crypto/fipsmodule/bn/asm/armv8-mont.pl b/crypto/fipsmodule/bn/asm/armv8-mont.pl
index 1ce02ee..fe4d8cf 100644
--- a/crypto/fipsmodule/bn/asm/armv8-mont.pl
+++ b/crypto/fipsmodule/bn/asm/armv8-mont.pl
@@ -60,7 +60,7 @@
  $lo1,$hi1,$nj,$m1,$nlo,$nhi,
  $ovf, $i,$j,$tp,$tj) = map("x$_",6..17,19..24);
 
-# int bn_mul_mont(
+# void bn_mul_mont(
 $rp="x0";	# BN_ULONG *rp,
 $ap="x1";	# const BN_ULONG *ap,
 $bp="x2";	# const BN_ULONG *bp,
@@ -270,7 +270,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldr	x29,[sp],#64
 	AARCH64_VALIDATE_LINK_REGISTER
@@ -1044,7 +1044,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
@@ -1505,7 +1505,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
diff --git a/crypto/fipsmodule/bn/asm/x86-mont.pl b/crypto/fipsmodule/bn/asm/x86-mont.pl
index 3de17a4..c3e30cb 100755
--- a/crypto/fipsmodule/bn/asm/x86-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86-mont.pl
@@ -68,11 +68,9 @@
 $_bpend=&DWP(4*7,"esp");
 $frame=32;				# size of above frame rounded up to 16n
 
-	&xor	("eax","eax");
+	# No return value. Instead, the caller must ensure num >= 4
 	&mov	("edi",&wparam(5));	# int num
-	&cmp	("edi",4);
-	&jl	(&label("just_leave"));
-
+	# No return value.
 	&lea	("esi",&wparam(0));	# put aside pointer to argument block
 	&lea	("edx",&wparam(1));	# load ap
 	&add	("edi",2);		# extra two words on top of tp
@@ -326,8 +324,7 @@
 	&jge	(&label("copy"));
 
 	&mov	("esp",$_sp);		# pull saved stack pointer
-	&mov	("eax",1);
-&set_label("just_leave");
+	# No return value
 &function_end("bn_mul_mont");
 
 &asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
diff --git a/crypto/fipsmodule/bn/asm/x86_64-mont.pl b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
index 537b028..c1d4028 100755
--- a/crypto/fipsmodule/bn/asm/x86_64-mont.pl
+++ b/crypto/fipsmodule/bn/asm/x86_64-mont.pl
@@ -70,7 +70,7 @@
 # output, so this isn't useful anyway.
 $addx = 1;
 
-# int bn_mul_mont_nohw(
+# void bn_mul_mont_nohw(
 $rp="%rdi";	# BN_ULONG *rp,
 $ap="%rsi";	# const BN_ULONG *ap,
 $bp="%rdx";	# const BN_ULONG *bp,
@@ -315,7 +315,7 @@
 
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 .cfi_def_cfa	%rsi,8
-	mov	\$1,%rax
+	# No return value
 	mov	-48(%rsi),%r15
 .cfi_restore	%r15
 	mov	-40(%rsi),%r14
@@ -762,7 +762,7 @@
 $code.=<<___;
 	mov	8(%rsp,$num,8),%rsi	# restore %rsp
 .cfi_def_cfa	%rsi, 8
-	mov	\$1,%rax
+	# No return value
 	mov	-48(%rsi),%r15
 .cfi_restore	%r15
 	mov	-40(%rsi),%r14
@@ -785,7 +785,7 @@
 }}}
 {{{
 ######################################################################
-# int bn_sqr8x_mont(
+# void bn_sqr8x_mont(
 my $rptr="%rdi";	# const BN_ULONG *rptr,
 my $aptr="%rsi";	# const BN_ULONG *aptr,
 my $mulx_adx_capable="%rdx"; # Different than upstream!
@@ -976,7 +976,7 @@
 	add	\$32,$num
 	jnz	.Lsqr8x_cond_copy
 
-	mov	\$1,%rax
+	# No return value
 	mov	-48(%rsi),%r15
 .cfi_restore	%r15
 	mov	-40(%rsi),%r14
@@ -1345,7 +1345,7 @@
 
 	mov	%rdx,($tptr)
 
-	mov	\$1,%rax
+	# No return value
 	mov	-48(%rsi),%r15
 .cfi_restore	%r15
 	mov	-40(%rsi),%r14
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index 0a9997d..6ce2cc7 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -275,14 +275,14 @@
 #define OPENSSL_BN_ASM_MONT
 // bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words
 // long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the
-// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles
-// inputs of this size and zero otherwise.
+// corresponding field in |BN_MONT_CTX|.
 //
 // If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced.
 // If neither is fully-reduced, the output may not be either.
 //
 // This function allocates |num| words on the stack, so |num| should be at most
-// |BN_MONTGOMERY_MAX_WORDS|.
+// |BN_MONTGOMERY_MAX_WORDS|. Additionally, |num| must be at least 128 /
+// |BN_BITS2|.
 //
 // TODO(davidben): The x86_64 implementation expects a 32-bit input and masks
 // off upper bits. The aarch64 implementation expects a 64-bit input and does
@@ -291,39 +291,39 @@
 //
 // See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word
 // inputs.
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 
 #if defined(OPENSSL_X86_64)
 inline int bn_mulx_adx_capable(void) {
   // MULX is in BMI2.
   return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable();
 }
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                     const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_mul4x_mont_capable(size_t num) {
   return num >= 8 && (num & 3) == 0;
 }
-int bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_mulx4x_mont_capable(size_t num) {
   return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable();
 }
-int bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                    const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 inline int bn_sqr8x_mont_capable(size_t num) {
   return num >= 8 && (num & 7) == 0;
 }
-int bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
-                  const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, BN_ULONG mulx_adx_capable,
+                   const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 #elif defined(OPENSSL_ARM)
 inline int bn_mul8x_mont_neon_capable(size_t num) {
   return (num & 7) == 0 && CRYPTO_is_NEON_capable();
 }
-int bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                       const BN_ULONG *np, const BN_ULONG *n0, size_t num);
-int bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                     const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                        const BN_ULONG *np, const BN_ULONG *n0, size_t num);
+void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      const BN_ULONG *np, const BN_ULONG *n0, size_t num);
 #endif
 
 #endif  // OPENSSL_BN_ASM_MONT
diff --git a/crypto/fipsmodule/bn/montgomery.cc.inc b/crypto/fipsmodule/bn/montgomery.cc.inc
index e73f805..c0adcca 100644
--- a/crypto/fipsmodule/bn/montgomery.cc.inc
+++ b/crypto/fipsmodule/bn/montgomery.cc.inc
@@ -324,7 +324,7 @@
   }
 
 #if defined(OPENSSL_BN_ASM_MONT)
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+  // |bn_mul_mont| requires at least 128 bits of limbs.
   int num = mont->N.width;
   if (num >= (128 / BN_BITS2) && a->width == num && b->width == num) {
     if (!bn_wexpand(r, num)) {
@@ -333,12 +333,7 @@
     // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont|
     // allocates |num| words on the stack, so |num| cannot be too large.
     assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS);
-    if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
-      // The check above ensures this won't happen.
-      assert(0);
-      OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR);
-      return 0;
-    }
+    bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num);
     r->neg = 0;
     r->width = num;
     return 1;
@@ -379,11 +374,9 @@
   }
 
 #if defined(OPENSSL_BN_ASM_MONT)
-  // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86.
+  // |bn_mul_mont| requires at least 128 bits of limbs.
   if (num >= (128 / BN_BITS2)) {
-    if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) {
-      abort();  // The check above ensures this won't happen.
-    }
+    bn_mul_mont(r, a, b, mont->N.d, mont->n0, num);
     return;
   }
 #endif
@@ -404,27 +397,27 @@
 }
 
 #if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
   if (ap == bp && bn_sqr8x_mont_capable(num)) {
-    return bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+    bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num);
+  } else if (bn_mulx4x_mont_capable(num)) {
+    bn_mulx4x_mont(rp, ap, bp, np, n0, num);
+  } else if (bn_mul4x_mont_capable(num)) {
+    bn_mul4x_mont(rp, ap, bp, np, n0, num);
+  } else {
+    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
   }
-  if (bn_mulx4x_mont_capable(num)) {
-    return bn_mulx4x_mont(rp, ap, bp, np, n0, num);
-  }
-  if (bn_mul4x_mont_capable(num)) {
-    return bn_mul4x_mont(rp, ap, bp, np, n0, num);
-  }
-  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
 }
 #endif
 
 #if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM)
-int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
-                const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                 const BN_ULONG *np, const BN_ULONG *n0, size_t num) {
   if (bn_mul8x_mont_neon_capable(num)) {
-    return bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+    bn_mul8x_mont_neon(rp, ap, bp, np, n0, num);
+  } else {
+    bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
   }
-  return bn_mul_mont_nohw(rp, ap, bp, np, n0, num);
 }
 #endif
diff --git a/gen/bcm/armv4-mont-linux.S b/gen/bcm/armv4-mont-linux.S
index 704f607..73e42b5 100644
--- a/gen/bcm/armv4-mont-linux.S
+++ b/gen/bcm/armv4-mont-linux.S
@@ -24,14 +24,9 @@
 bn_mul_mont_nohw:
 	ldr	ip,[sp,#4]		@ load num
 	stmdb	sp!,{r0,r2}		@ sp points at argument block
-	cmp	ip,#2
+	@ No return value. Instead, the caller must ensure num >= 2
 	mov	r0,ip			@ load num
-#ifdef	__thumb2__
-	ittt	lt
-#endif
-	movlt	r0,#0
-	addlt	sp,sp,#2*4
-	blt	.Labrt
+	@ No return value
 
 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ save 10 registers
 
@@ -165,8 +160,7 @@
 	add	sp,sp,#4		@ skip over tp[num+1]
 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}		@ restore registers
 	add	sp,sp,#2*4		@ skip over {r0,r2}
-	mov	r0,#1
-.Labrt:
+	@ No return value
 #if __ARM_ARCH>=5
 	bx	lr				@ bx lr
 #else
@@ -929,6 +923,7 @@
 	mov	sp,ip
 	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}
 	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
+	@ No return value
 	bx	lr						@ bx lr
 .size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
diff --git a/gen/bcm/armv8-mont-apple.S b/gen/bcm/armv8-mont-apple.S
index 9e39c1d..6aad968 100644
--- a/gen/bcm/armv8-mont-apple.S
+++ b/gen/bcm/armv8-mont-apple.S
@@ -206,7 +206,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldr	x29,[sp],#64
 	AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-linux.S b/gen/bcm/armv8-mont-linux.S
index 168162f..e49322b 100644
--- a/gen/bcm/armv8-mont-linux.S
+++ b/gen/bcm/armv8-mont-linux.S
@@ -206,7 +206,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldr	x29,[sp],#64
 	AARCH64_VALIDATE_LINK_REGISTER
@@ -965,7 +965,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
@@ -1408,7 +1408,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
diff --git a/gen/bcm/armv8-mont-win.S b/gen/bcm/armv8-mont-win.S
index b521d49..4091a5a 100644
--- a/gen/bcm/armv8-mont-win.S
+++ b/gen/bcm/armv8-mont-win.S
@@ -208,7 +208,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldr	x29,[sp],#64
 	AARCH64_VALIDATE_LINK_REGISTER
@@ -969,7 +969,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
@@ -1414,7 +1414,7 @@
 	ldp	x19,x20,[x29,#16]
 	mov	sp,x29
 	ldp	x21,x22,[x29,#32]
-	mov	x0,#1
+	// No return value
 	ldp	x23,x24,[x29,#48]
 	ldp	x25,x26,[x29,#64]
 	ldp	x27,x28,[x29,#80]
diff --git a/gen/bcm/x86-mont-apple.S b/gen/bcm/x86-mont-apple.S
index a8fd1f9..6e549c7 100644
--- a/gen/bcm/x86-mont-apple.S
+++ b/gen/bcm/x86-mont-apple.S
@@ -14,10 +14,7 @@
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	xorl	%eax,%eax
 	movl	40(%esp),%edi
-	cmpl	$4,%edi
-	jl	L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
 	addl	$2,%edi
@@ -40,15 +37,15 @@
 	leal	(%ebp,%eax,1),%esp
 	movl	(%esp),%eax
 	cmpl	%ebp,%esp
-	ja	L001page_walk
-	jmp	L002page_walk_done
+	ja	L000page_walk
+	jmp	L001page_walk_done
 .align	4,0x90
-L001page_walk:
+L000page_walk:
 	leal	-4096(%esp),%esp
 	movl	(%esp),%eax
 	cmpl	%ebp,%esp
-	ja	L001page_walk
-L002page_walk_done:
+	ja	L000page_walk
+L001page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
@@ -85,7 +82,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	4,0x90
-L0031st:
+L0021st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -100,7 +97,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	L0031st
+	jl	L0021st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -114,7 +111,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-L004outer:
+L003outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -136,7 +133,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-L005inner:
+L004inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -153,7 +150,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	L005inner
+	jnz	L004inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -171,11 +168,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	L004outer
+	jle	L003outer
 	emms
-	jmp	L006common_tail
+	jmp	L005common_tail
 .align	4,0x90
-L006common_tail:
+L005common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -183,19 +180,19 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	4,0x90
-L007sub:
+L006sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	L007sub
+	jge	L006sub
 	sbbl	$0,%eax
 	movl	$-1,%edx
 	xorl	%eax,%edx
-	jmp	L008copy
+	jmp	L007copy
 .align	4,0x90
-L008copy:
+L007copy:
 	movl	32(%esp,%ebx,4),%esi
 	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
@@ -204,10 +201,8 @@
 	orl	%esi,%ebp
 	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	L008copy
+	jge	L007copy
 	movl	24(%esp),%esp
-	movl	$1,%eax
-L000just_leave:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
diff --git a/gen/bcm/x86-mont-linux.S b/gen/bcm/x86-mont-linux.S
index 3d3ddb5..21fbee2 100644
--- a/gen/bcm/x86-mont-linux.S
+++ b/gen/bcm/x86-mont-linux.S
@@ -15,10 +15,7 @@
 	pushl	%ebx
 	pushl	%esi
 	pushl	%edi
-	xorl	%eax,%eax
 	movl	40(%esp),%edi
-	cmpl	$4,%edi
-	jl	.L000just_leave
 	leal	20(%esp),%esi
 	leal	24(%esp),%edx
 	addl	$2,%edi
@@ -41,15 +38,15 @@
 	leal	(%ebp,%eax,1),%esp
 	movl	(%esp),%eax
 	cmpl	%ebp,%esp
-	ja	.L001page_walk
-	jmp	.L002page_walk_done
+	ja	.L000page_walk
+	jmp	.L001page_walk_done
 .align	16
-.L001page_walk:
+.L000page_walk:
 	leal	-4096(%esp),%esp
 	movl	(%esp),%eax
 	cmpl	%ebp,%esp
-	ja	.L001page_walk
-.L002page_walk_done:
+	ja	.L000page_walk
+.L001page_walk_done:
 	movl	(%esi),%eax
 	movl	4(%esi),%ebx
 	movl	8(%esi),%ecx
@@ -86,7 +83,7 @@
 	psrlq	$32,%mm3
 	incl	%ecx
 .align	16
-.L0031st:
+.L0021st:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -101,7 +98,7 @@
 	psrlq	$32,%mm3
 	leal	1(%ecx),%ecx
 	cmpl	%ebx,%ecx
-	jl	.L0031st
+	jl	.L0021st
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -115,7 +112,7 @@
 	paddq	%mm2,%mm3
 	movq	%mm3,32(%esp,%ebx,4)
 	incl	%edx
-.L004outer:
+.L003outer:
 	xorl	%ecx,%ecx
 	movd	(%edi,%edx,4),%mm4
 	movd	(%esi),%mm5
@@ -137,7 +134,7 @@
 	paddq	%mm6,%mm2
 	incl	%ecx
 	decl	%ebx
-.L005inner:
+.L004inner:
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
 	paddq	%mm0,%mm2
@@ -154,7 +151,7 @@
 	paddq	%mm6,%mm2
 	decl	%ebx
 	leal	1(%ecx),%ecx
-	jnz	.L005inner
+	jnz	.L004inner
 	movl	%ecx,%ebx
 	pmuludq	%mm4,%mm0
 	pmuludq	%mm5,%mm1
@@ -172,11 +169,11 @@
 	movq	%mm3,32(%esp,%ebx,4)
 	leal	1(%edx),%edx
 	cmpl	%ebx,%edx
-	jle	.L004outer
+	jle	.L003outer
 	emms
-	jmp	.L006common_tail
+	jmp	.L005common_tail
 .align	16
-.L006common_tail:
+.L005common_tail:
 	movl	16(%esp),%ebp
 	movl	4(%esp),%edi
 	leal	32(%esp),%esi
@@ -184,19 +181,19 @@
 	movl	%ebx,%ecx
 	xorl	%edx,%edx
 .align	16
-.L007sub:
+.L006sub:
 	sbbl	(%ebp,%edx,4),%eax
 	movl	%eax,(%edi,%edx,4)
 	decl	%ecx
 	movl	4(%esi,%edx,4),%eax
 	leal	1(%edx),%edx
-	jge	.L007sub
+	jge	.L006sub
 	sbbl	$0,%eax
 	movl	$-1,%edx
 	xorl	%eax,%edx
-	jmp	.L008copy
+	jmp	.L007copy
 .align	16
-.L008copy:
+.L007copy:
 	movl	32(%esp,%ebx,4),%esi
 	movl	(%edi,%ebx,4),%ebp
 	movl	%ecx,32(%esp,%ebx,4)
@@ -205,10 +202,8 @@
 	orl	%esi,%ebp
 	movl	%ebp,(%edi,%ebx,4)
 	decl	%ebx
-	jge	.L008copy
+	jge	.L007copy
 	movl	24(%esp),%esp
-	movl	$1,%eax
-.L000just_leave:
 	popl	%edi
 	popl	%esi
 	popl	%ebx
diff --git a/gen/bcm/x86-mont-win.asm b/gen/bcm/x86-mont-win.asm
index 931275d..d154078 100644
--- a/gen/bcm/x86-mont-win.asm
+++ b/gen/bcm/x86-mont-win.asm
@@ -21,10 +21,7 @@
 	push	ebx
 	push	esi
 	push	edi
-	xor	eax,eax
 	mov	edi,DWORD [40+esp]
-	cmp	edi,4
-	jl	NEAR L$000just_leave
 	lea	esi,[20+esp]
 	lea	edx,[24+esp]
 	add	edi,2
@@ -47,15 +44,15 @@
 	lea	esp,[eax*1+ebp]
 	mov	eax,DWORD [esp]
 	cmp	esp,ebp
-	ja	NEAR L$001page_walk
-	jmp	NEAR L$002page_walk_done
+	ja	NEAR L$000page_walk
+	jmp	NEAR L$001page_walk_done
 align	16
-L$001page_walk:
+L$000page_walk:
 	lea	esp,[esp-4096]
 	mov	eax,DWORD [esp]
 	cmp	esp,ebp
-	ja	NEAR L$001page_walk
-L$002page_walk_done:
+	ja	NEAR L$000page_walk
+L$001page_walk_done:
 	mov	eax,DWORD [esi]
 	mov	ebx,DWORD [4+esi]
 	mov	ecx,DWORD [8+esi]
@@ -92,7 +89,7 @@
 	psrlq	mm3,32
 	inc	ecx
 align	16
-L$0031st:
+L$0021st:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -107,7 +104,7 @@
 	psrlq	mm3,32
 	lea	ecx,[1+ecx]
 	cmp	ecx,ebx
-	jl	NEAR L$0031st
+	jl	NEAR L$0021st
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -121,7 +118,7 @@
 	paddq	mm3,mm2
 	movq	[32+ebx*4+esp],mm3
 	inc	edx
-L$004outer:
+L$003outer:
 	xor	ecx,ecx
 	movd	mm4,DWORD [edx*4+edi]
 	movd	mm5,DWORD [esi]
@@ -143,7 +140,7 @@
 	paddq	mm2,mm6
 	inc	ecx
 	dec	ebx
-L$005inner:
+L$004inner:
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
 	paddq	mm2,mm0
@@ -160,7 +157,7 @@
 	paddq	mm2,mm6
 	dec	ebx
 	lea	ecx,[1+ecx]
-	jnz	NEAR L$005inner
+	jnz	NEAR L$004inner
 	mov	ebx,ecx
 	pmuludq	mm0,mm4
 	pmuludq	mm1,mm5
@@ -178,11 +175,11 @@
 	movq	[32+ebx*4+esp],mm3
 	lea	edx,[1+edx]
 	cmp	edx,ebx
-	jle	NEAR L$004outer
+	jle	NEAR L$003outer
 	emms
-	jmp	NEAR L$006common_tail
+	jmp	NEAR L$005common_tail
 align	16
-L$006common_tail:
+L$005common_tail:
 	mov	ebp,DWORD [16+esp]
 	mov	edi,DWORD [4+esp]
 	lea	esi,[32+esp]
@@ -190,19 +187,19 @@
 	mov	ecx,ebx
 	xor	edx,edx
 align	16
-L$007sub:
+L$006sub:
 	sbb	eax,DWORD [edx*4+ebp]
 	mov	DWORD [edx*4+edi],eax
 	dec	ecx
 	mov	eax,DWORD [4+edx*4+esi]
 	lea	edx,[1+edx]
-	jge	NEAR L$007sub
+	jge	NEAR L$006sub
 	sbb	eax,0
 	mov	edx,-1
 	xor	edx,eax
-	jmp	NEAR L$008copy
+	jmp	NEAR L$007copy
 align	16
-L$008copy:
+L$007copy:
 	mov	esi,DWORD [32+ebx*4+esp]
 	mov	ebp,DWORD [ebx*4+edi]
 	mov	DWORD [32+ebx*4+esp],ecx
@@ -211,10 +208,8 @@
 	or	ebp,esi
 	mov	DWORD [ebx*4+edi],ebp
 	dec	ebx
-	jge	NEAR L$008copy
+	jge	NEAR L$007copy
 	mov	esp,DWORD [24+esp]
-	mov	eax,1
-L$000just_leave:
 	pop	edi
 	pop	esi
 	pop	ebx
diff --git a/gen/bcm/x86_64-mont-apple.S b/gen/bcm/x86_64-mont-apple.S
index d429f7c..27a168d 100644
--- a/gen/bcm/x86_64-mont-apple.S
+++ b/gen/bcm/x86_64-mont-apple.S
@@ -229,7 +229,7 @@
 
 	movq	8(%rsp,%r9,8),%rsi
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 
 	movq	-40(%rsi),%r14
@@ -662,7 +662,7 @@
 	jnz	L$copy4x
 	movq	8(%rsp,%r9,8),%rsi
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 
 	movq	-40(%rsi),%r14
@@ -853,7 +853,7 @@
 	addq	$32,%r9
 	jnz	L$sqr8x_cond_copy
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 
 	movq	-40(%rsi),%r14
@@ -1211,7 +1211,7 @@
 
 	movq	%rdx,(%rbx)
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 
 	movq	-40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-linux.S b/gen/bcm/x86_64-mont-linux.S
index 630bb72..51c4b6c 100644
--- a/gen/bcm/x86_64-mont-linux.S
+++ b/gen/bcm/x86_64-mont-linux.S
@@ -229,7 +229,7 @@
 
 	movq	8(%rsp,%r9,8),%rsi
 .cfi_def_cfa	%rsi,8
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 .cfi_restore	%r15
 	movq	-40(%rsi),%r14
@@ -662,7 +662,7 @@
 	jnz	.Lcopy4x
 	movq	8(%rsp,%r9,8),%rsi
 .cfi_def_cfa	%rsi, 8
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 .cfi_restore	%r15
 	movq	-40(%rsi),%r14
@@ -855,7 +855,7 @@
 	addq	$32,%r9
 	jnz	.Lsqr8x_cond_copy
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 .cfi_restore	%r15
 	movq	-40(%rsi),%r14
@@ -1213,7 +1213,7 @@
 
 	movq	%rdx,(%rbx)
 
-	movq	$1,%rax
+
 	movq	-48(%rsi),%r15
 .cfi_restore	%r15
 	movq	-40(%rsi),%r14
diff --git a/gen/bcm/x86_64-mont-win.asm b/gen/bcm/x86_64-mont-win.asm
index 7e54c66..c768d16 100644
--- a/gen/bcm/x86_64-mont-win.asm
+++ b/gen/bcm/x86_64-mont-win.asm
@@ -248,7 +248,7 @@
 
 	mov	rsi,QWORD[8+r9*8+rsp]
 
-	mov	rax,1
+
 	mov	r15,QWORD[((-48))+rsi]
 
 	mov	r14,QWORD[((-40))+rsi]
@@ -694,7 +694,7 @@
 	jnz	NEAR $L$copy4x
 	mov	rsi,QWORD[8+r9*8+rsp]
 
-	mov	rax,1
+
 	mov	r15,QWORD[((-48))+rsi]
 
 	mov	r14,QWORD[((-40))+rsi]
@@ -898,7 +898,7 @@
 	add	r9,32
 	jnz	NEAR $L$sqr8x_cond_copy
 
-	mov	rax,1
+
 	mov	r15,QWORD[((-48))+rsi]
 
 	mov	r14,QWORD[((-40))+rsi]
@@ -1269,7 +1269,7 @@
 
 	mov	QWORD[rbx],rdx
 
-	mov	rax,1
+
 	mov	r15,QWORD[((-48))+rsi]
 
 	mov	r14,QWORD[((-40))+rsi]