crypto/fipsmodule/ec/asm/p256_beeu-x86_64-asm.pl - boringssl - Git at Google

 # Copyright (c) 2018, Amazon Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     https://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Written by Nir Drucker, and Shay Gueron
 # AWS Cryptographic Algorithms Group
 # (ndrucker@amazon.com, gueron@amazon.com)
 # based on BN_mod_inverse_odd

 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";

 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
 *STDOUT=*OUT;

 #############################################################################
 # extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
 #                                     BN_ULONG a[P256_LIMBS],
 #                                     BN_ULONG n[P256_LIMBS]);
 #
 # (Binary Extended Euclidean Algorithm.
 #  See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
 #
 # Assumption 1: n is odd for the BEEU
 # Assumption 2: 1 < a < n < 2^256

 $out = "%rdi";
 $a = "%rsi";
 $n = "%rdx";

 # X/Y will hold the inverse parameter
 # Assumption: X,Y<2^(256)
 $x0 = "%r8";
 $x1 = "%r9";
 $x2 = "%r10";
 $x3 = "%r11";
 # borrow from out (out is needed only at the end)
 $x4 = "%rdi";
 $y0 = "%r12";
 $y1 = "%r13";
 $y2 = "%r14";
 $y3 = "%r15";
 $y4 = "%rbp";
 $shift = "%rcx";
 $t0 = "%rax";
 $t1 = "%rbx";
 $t2 = "%rsi";
 # borrow
 $t3 = "%rcx";

 $T0 = "%xmm0";
 $T1 = "%xmm1";

 # Offsets on the stack
 $out_rsp = 0;
 $shift_rsp = $out_rsp+0x8;
 $a_rsp0 = $shift_rsp+0x8;
 $a_rsp1 = $a_rsp0+0x8;
 $a_rsp2 = $a_rsp1+0x8;
 $a_rsp3 = $a_rsp2+0x8;
 $b_rsp0 = $a_rsp3+0x8;
 $b_rsp1 = $b_rsp0+0x8;
 $b_rsp2 = $b_rsp1+0x8;
 $b_rsp3 = $b_rsp2+0x8;

 # Borrow when a_rsp/b_rsp are no longer needed.
 $y_rsp0 = $a_rsp0;
 $y_rsp1 = $y_rsp0+0x8;
 $y_rsp2 = $y_rsp1+0x8;
 $y_rsp3 = $y_rsp2+0x8;
 $y_rsp4 = $y_rsp3+0x8;
 $last_rsp_offset = $b_rsp3+0x8;

 sub TEST_B_ZERO {
   return <<___;
     xorq $t1, $t1
     or $b_rsp0(%rsp), $t1
     or $b_rsp1(%rsp), $t1
     or $b_rsp2(%rsp), $t1
     or $b_rsp3(%rsp), $t1
     jz .Lbeeu_loop_end
 ___
 }

 $g_next_label = 0;

 sub SHIFT1 {
   my ($var0, $var1, $var2, $var3, $var4) = @_;
   my $label = ".Lshift1_${g_next_label}";
   $g_next_label++;

   return <<___;
     # Ensure X is even and divide by two.
     movq \$1, $t1
     andq $var0, $t1
     jz $label
     add 0*8($n), $var0
     adc 1*8($n), $var1
     adc 2*8($n), $var2
     adc 3*8($n), $var3
     adc \$0, $var4

 $label:
     shrdq \$1, $var1, $var0
     shrdq \$1, $var2, $var1
     shrdq \$1, $var3, $var2
     shrdq \$1, $var4, $var3
     shrq  \$1, $var4
 ___
 }

 sub SHIFT256 {
   my ($var) = @_;
   return <<___;
     # Copy shifted values.
     # Remember not to override t3=rcx
     movq 1*8+$var(%rsp), $t0
     movq 2*8+$var(%rsp), $t1
     movq 3*8+$var(%rsp), $t2

     shrdq %cl, $t0, 0*8+$var(%rsp)
     shrdq %cl, $t1, 1*8+$var(%rsp)
     shrdq %cl, $t2, 2*8+$var(%rsp)

     shrq  %cl, $t2
     mov $t2, 3*8+$var(%rsp)
 ___
 }

 $code.=<<___;
 .text

 .type beeu_mod_inverse_vartime,\@function
 .hidden beeu_mod_inverse_vartime
 .globl  beeu_mod_inverse_vartime
 .align 32
 beeu_mod_inverse_vartime:
 .cfi_startproc
     _CET_ENDBR
     push %rbp
 .cfi_push rbp
     push %r12
 .cfi_push r12
     push %r13
 .cfi_push r13
     push %r14
 .cfi_push r14
     push %r15
 .cfi_push r15
     push %rbx
 .cfi_push rbx
     push %rsi
 .cfi_push rsi

     sub \$$last_rsp_offset, %rsp
 .cfi_adjust_cfa_offset  $last_rsp_offset
     movq $out, $out_rsp(%rsp)

     # X=1, Y=0
     movq \$1, $x0
     xorq $x1, $x1
     xorq $x2, $x2
     xorq $x3, $x3
     xorq $x4, $x4

     xorq $y0, $y0
     xorq $y1, $y1
     xorq $y2, $y2
     xorq $y3, $y3
     xorq $y4, $y4

     # Copy a/n into B/A on the stack.
     vmovdqu 0*8($a), $T0
     vmovdqu 2*8($a), $T1
     vmovdqu $T0, $b_rsp0(%rsp)
     vmovdqu $T1, $b_rsp2(%rsp)

     vmovdqu 0*8($n), $T0
     vmovdqu 2*8($n), $T1
     vmovdqu $T0, $a_rsp0(%rsp)
     vmovdqu $T1, $a_rsp2(%rsp)

 .Lbeeu_loop:
     ${\TEST_B_ZERO}

     # 0 < B < |n|,
     # 0 < A <= |n|,
     # (1)      X*a  ==  B   (mod |n|),
     # (2) (-1)*Y*a  ==  A   (mod |n|)

     # Now divide B by the maximum possible power of two in the
     # integers, and divide X by the same value mod |n|. When we're
     # done, (1) still holds.
     movq \$1, $shift

     # Note that B > 0
 .Lbeeu_shift_loop_XB:
     movq $shift, $t1
     andq $b_rsp0(%rsp), $t1
     jnz .Lbeeu_shift_loop_end_XB

     ${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
     shl \$1, $shift

     # Test wraparound of the shift parameter. The probability to have 32 zeroes
     # in a row is small Therefore having the value below equal \$0x8000000 or
     # \$0x8000 does not affect the performance. We choose 0x8000000 because it
     # is the maximal immediate value possible.
     cmp \$0x8000000, $shift
     jne .Lbeeu_shift_loop_XB

 .Lbeeu_shift_loop_end_XB:
     bsf $shift, $shift
     test $shift, $shift
     jz .Lbeeu_no_shift_XB

     ${\SHIFT256($b_rsp0)}

 .Lbeeu_no_shift_XB:
     # Same for A and Y.  Afterwards, (2) still holds.
     movq \$1, $shift

     # Note that A > 0
 .Lbeeu_shift_loop_YA:
     movq $shift, $t1
     andq $a_rsp0(%rsp), $t1
     jnz .Lbeeu_shift_loop_end_YA

     ${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
     shl \$1, $shift

     # Test wraparound of the shift parameter. The probability to have 32 zeroes
     # in a row is small therefore having the value below equal \$0x8000000 or
     # \$0x8000 Does not affect the performance. We choose 0x8000000 because it
     # is the maximal immediate value possible.
     cmp \$0x8000000, $shift
     jne .Lbeeu_shift_loop_YA

 .Lbeeu_shift_loop_end_YA:
     bsf $shift, $shift
     test $shift, $shift
     jz .Lbeeu_no_shift_YA

     ${\SHIFT256($a_rsp0)}

 .Lbeeu_no_shift_YA:
     # T = B-A (A,B < 2^256)
     mov $b_rsp0(%rsp), $t0
     mov $b_rsp1(%rsp), $t1
     mov $b_rsp2(%rsp), $t2
     mov $b_rsp3(%rsp), $t3
     sub $a_rsp0(%rsp), $t0
     sbb $a_rsp1(%rsp), $t1
     sbb $a_rsp2(%rsp), $t2
     sbb $a_rsp3(%rsp), $t3  # borrow from shift
     jnc .Lbeeu_B_bigger_than_A

     # A = A - B
     mov $a_rsp0(%rsp), $t0
     mov $a_rsp1(%rsp), $t1
     mov $a_rsp2(%rsp), $t2
     mov $a_rsp3(%rsp), $t3
     sub $b_rsp0(%rsp), $t0
     sbb $b_rsp1(%rsp), $t1
     sbb $b_rsp2(%rsp), $t2
     sbb $b_rsp3(%rsp), $t3
     mov $t0, $a_rsp0(%rsp)
     mov $t1, $a_rsp1(%rsp)
     mov $t2, $a_rsp2(%rsp)
     mov $t3, $a_rsp3(%rsp)

     # Y = Y + X
     add $x0, $y0
     adc $x1, $y1
     adc $x2, $y2
     adc $x3, $y3
     adc $x4, $y4
     jmp .Lbeeu_loop

 .Lbeeu_B_bigger_than_A:
     # B = T = B - A
     mov $t0, $b_rsp0(%rsp)
     mov $t1, $b_rsp1(%rsp)
     mov $t2, $b_rsp2(%rsp)
     mov $t3, $b_rsp3(%rsp)

     # X = Y + X
     add $y0, $x0
     adc $y1, $x1
     adc $y2, $x2
     adc $y3, $x3
     adc $y4, $x4

     jmp .Lbeeu_loop

 .Lbeeu_loop_end:
     # The Euclid's algorithm loop ends when A == beeu(a,n);
     # Therefore (-1)*Y*a == A (mod |n|), Y>0

     # Verify that A = 1 ==> (-1)*Y*a = A = 1  (mod |n|)
     mov $a_rsp0(%rsp), $t1
     sub \$1, $t1
     or $a_rsp1(%rsp), $t1
     or $a_rsp2(%rsp), $t1
     or $a_rsp3(%rsp), $t1
     # If not, fail.
     jnz .Lbeeu_err

     # From this point on, we no longer need X
     # Therefore we use it as a temporary storage.
     # X = n
     movq 0*8($n), $x0
     movq 1*8($n), $x1
     movq 2*8($n), $x2
     movq 3*8($n), $x3
     xorq $x4, $x4

 .Lbeeu_reduction_loop:
     movq $y0, $y_rsp0(%rsp)
     movq $y1, $y_rsp1(%rsp)
     movq $y2, $y_rsp2(%rsp)
     movq $y3, $y_rsp3(%rsp)
     movq $y4, $y_rsp4(%rsp)

     # If Y>n ==> Y=Y-n
     sub $x0, $y0
     sbb $x1, $y1
     sbb $x2, $y2
     sbb $x3, $y3
     sbb \$0, $y4

     # Choose old Y or new Y
     cmovc $y_rsp0(%rsp), $y0
     cmovc $y_rsp1(%rsp), $y1
     cmovc $y_rsp2(%rsp), $y2
     cmovc $y_rsp3(%rsp), $y3
     jnc .Lbeeu_reduction_loop

     # X = n - Y (n, Y < 2^256), (Cancel the (-1))
     sub $y0, $x0
     sbb $y1, $x1
     sbb $y2, $x2
     sbb $y3, $x3

 .Lbeeu_save:
     # Save the inverse(<2^256) to out.
     mov $out_rsp(%rsp), $out

     movq $x0, 0*8($out)
     movq $x1, 1*8($out)
     movq $x2, 2*8($out)
     movq $x3, 3*8($out)

     # Return 1.
     movq \$1, %rax
     jmp .Lbeeu_finish

 .Lbeeu_err:
     # Return 0.
     xorq %rax, %rax

 .Lbeeu_finish:
     add \$$last_rsp_offset, %rsp
 .cfi_adjust_cfa_offset  -$last_rsp_offset
     pop %rsi
 .cfi_pop rsi
     pop %rbx
 .cfi_pop rbx
     pop %r15
 .cfi_pop r15
     pop %r14
 .cfi_pop r14
     pop %r13
 .cfi_pop r13
     pop %r12
 .cfi_pop r12
     pop %rbp
 .cfi_pop rbp
     ret
 .cfi_endproc

 .size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
 ___

 print $code;
 close STDOUT or die "error closing STDOUT: $!";
	# Copyright (c) 2018, Amazon Inc.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# https://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# Written by Nir Drucker, and Shay Gueron
	# AWS Cryptographic Algorithms Group
	# (ndrucker@amazon.com, gueron@amazon.com)
	# based on BN_mod_inverse_odd

	$flavour = shift;
	$output = shift;
	if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

	$win64=0; $win64=1 if ($flavour =~ /[nm]asm\|mingw64/ \|\| $output =~ /\.asm$/);

	$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
	( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
	( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
	die "can't locate x86_64-xlate.pl";

	open OUT,"\| \"$^X\" \"$xlate\" $flavour \"$output\"";
	STDOUT=OUT;

	#############################################################################
	# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
	# BN_ULONG a[P256_LIMBS],
	# BN_ULONG n[P256_LIMBS]);
	#
	# (Binary Extended Euclidean Algorithm.
	# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
	#
	# Assumption 1: n is odd for the BEEU
	# Assumption 2: 1 < a < n < 2^256

	$out = "%rdi";
	$a = "%rsi";
	$n = "%rdx";

	# X/Y will hold the inverse parameter
	# Assumption: X,Y<2^(256)
	$x0 = "%r8";
	$x1 = "%r9";
	$x2 = "%r10";
	$x3 = "%r11";
	# borrow from out (out is needed only at the end)
	$x4 = "%rdi";
	$y0 = "%r12";
	$y1 = "%r13";
	$y2 = "%r14";
	$y3 = "%r15";
	$y4 = "%rbp";
	$shift = "%rcx";
	$t0 = "%rax";
	$t1 = "%rbx";
	$t2 = "%rsi";
	# borrow
	$t3 = "%rcx";

	$T0 = "%xmm0";
	$T1 = "%xmm1";

	# Offsets on the stack
	$out_rsp = 0;
	$shift_rsp = $out_rsp+0x8;
	$a_rsp0 = $shift_rsp+0x8;
	$a_rsp1 = $a_rsp0+0x8;
	$a_rsp2 = $a_rsp1+0x8;
	$a_rsp3 = $a_rsp2+0x8;
	$b_rsp0 = $a_rsp3+0x8;
	$b_rsp1 = $b_rsp0+0x8;
	$b_rsp2 = $b_rsp1+0x8;
	$b_rsp3 = $b_rsp2+0x8;

	# Borrow when a_rsp/b_rsp are no longer needed.
	$y_rsp0 = $a_rsp0;
	$y_rsp1 = $y_rsp0+0x8;
	$y_rsp2 = $y_rsp1+0x8;
	$y_rsp3 = $y_rsp2+0x8;
	$y_rsp4 = $y_rsp3+0x8;
	$last_rsp_offset = $b_rsp3+0x8;

	sub TEST_B_ZERO {
	return <<___;
	xorq $t1, $t1
	or $b_rsp0(%rsp), $t1
	or $b_rsp1(%rsp), $t1
	or $b_rsp2(%rsp), $t1
	or $b_rsp3(%rsp), $t1
	jz .Lbeeu_loop_end
	___
	}

	$g_next_label = 0;

	sub SHIFT1 {
	my ($var0, $var1, $var2, $var3, $var4) = @_;
	my $label = ".Lshift1_${g_next_label}";
	$g_next_label++;

	return <<___;
	# Ensure X is even and divide by two.
	movq \$1, $t1
	andq $var0, $t1
	jz $label
	add 0*8($n), $var0
	adc 1*8($n), $var1
	adc 2*8($n), $var2
	adc 3*8($n), $var3
	adc \$0, $var4

	$label:
	shrdq \$1, $var1, $var0
	shrdq \$1, $var2, $var1
	shrdq \$1, $var3, $var2
	shrdq \$1, $var4, $var3
	shrq \$1, $var4
	___
	}

	sub SHIFT256 {
	my ($var) = @_;
	return <<___;
	# Copy shifted values.
	# Remember not to override t3=rcx
	movq 1*8+$var(%rsp), $t0
	movq 2*8+$var(%rsp), $t1
	movq 3*8+$var(%rsp), $t2

	shrdq %cl, $t0, 0*8+$var(%rsp)
	shrdq %cl, $t1, 1*8+$var(%rsp)
	shrdq %cl, $t2, 2*8+$var(%rsp)

	shrq %cl, $t2
	mov $t2, 3*8+$var(%rsp)
	___
	}

	$code.=<<___;
	.text

	.type beeu_mod_inverse_vartime,\@function
	.hidden beeu_mod_inverse_vartime
	.globl beeu_mod_inverse_vartime
	.align 32
	beeu_mod_inverse_vartime:
	.cfi_startproc
	_CET_ENDBR
	push %rbp
	.cfi_push rbp
	push %r12
	.cfi_push r12
	push %r13
	.cfi_push r13
	push %r14
	.cfi_push r14
	push %r15
	.cfi_push r15
	push %rbx
	.cfi_push rbx
	push %rsi
	.cfi_push rsi

	sub \$$last_rsp_offset, %rsp
	.cfi_adjust_cfa_offset $last_rsp_offset
	movq $out, $out_rsp(%rsp)

	# X=1, Y=0
	movq \$1, $x0
	xorq $x1, $x1
	xorq $x2, $x2
	xorq $x3, $x3
	xorq $x4, $x4

	xorq $y0, $y0
	xorq $y1, $y1
	xorq $y2, $y2
	xorq $y3, $y3
	xorq $y4, $y4

	# Copy a/n into B/A on the stack.
	vmovdqu 0*8($a), $T0
	vmovdqu 2*8($a), $T1
	vmovdqu $T0, $b_rsp0(%rsp)
	vmovdqu $T1, $b_rsp2(%rsp)

	vmovdqu 0*8($n), $T0
	vmovdqu 2*8($n), $T1
	vmovdqu $T0, $a_rsp0(%rsp)
	vmovdqu $T1, $a_rsp2(%rsp)

	.Lbeeu_loop:
	${\TEST_B_ZERO}

	# 0 < B < \|n\|,
	# 0 < A <= \|n\|,
	# (1) X*a == B (mod \|n\|),
	# (2) (-1)Ya == A (mod \|n\|)

	# Now divide B by the maximum possible power of two in the
	# integers, and divide X by the same value mod \|n\|. When we're
	# done, (1) still holds.
	movq \$1, $shift

	# Note that B > 0
	.Lbeeu_shift_loop_XB:
	movq $shift, $t1
	andq $b_rsp0(%rsp), $t1
	jnz .Lbeeu_shift_loop_end_XB

	${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
	shl \$1, $shift

	# Test wraparound of the shift parameter. The probability to have 32 zeroes
	# in a row is small Therefore having the value below equal \$0x8000000 or
	# \$0x8000 does not affect the performance. We choose 0x8000000 because it
	# is the maximal immediate value possible.
	cmp \$0x8000000, $shift
	jne .Lbeeu_shift_loop_XB

	.Lbeeu_shift_loop_end_XB:
	bsf $shift, $shift
	test $shift, $shift
	jz .Lbeeu_no_shift_XB

	${\SHIFT256($b_rsp0)}

	.Lbeeu_no_shift_XB:
	# Same for A and Y. Afterwards, (2) still holds.
	movq \$1, $shift

	# Note that A > 0
	.Lbeeu_shift_loop_YA:
	movq $shift, $t1
	andq $a_rsp0(%rsp), $t1
	jnz .Lbeeu_shift_loop_end_YA

	${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
	shl \$1, $shift

	# Test wraparound of the shift parameter. The probability to have 32 zeroes
	# in a row is small therefore having the value below equal \$0x8000000 or
	# \$0x8000 Does not affect the performance. We choose 0x8000000 because it
	# is the maximal immediate value possible.
	cmp \$0x8000000, $shift
	jne .Lbeeu_shift_loop_YA

	.Lbeeu_shift_loop_end_YA:
	bsf $shift, $shift
	test $shift, $shift
	jz .Lbeeu_no_shift_YA

	${\SHIFT256($a_rsp0)}

	.Lbeeu_no_shift_YA:
	# T = B-A (A,B < 2^256)
	mov $b_rsp0(%rsp), $t0
	mov $b_rsp1(%rsp), $t1
	mov $b_rsp2(%rsp), $t2
	mov $b_rsp3(%rsp), $t3
	sub $a_rsp0(%rsp), $t0
	sbb $a_rsp1(%rsp), $t1
	sbb $a_rsp2(%rsp), $t2
	sbb $a_rsp3(%rsp), $t3 # borrow from shift
	jnc .Lbeeu_B_bigger_than_A

	# A = A - B
	mov $a_rsp0(%rsp), $t0
	mov $a_rsp1(%rsp), $t1
	mov $a_rsp2(%rsp), $t2
	mov $a_rsp3(%rsp), $t3
	sub $b_rsp0(%rsp), $t0
	sbb $b_rsp1(%rsp), $t1
	sbb $b_rsp2(%rsp), $t2
	sbb $b_rsp3(%rsp), $t3
	mov $t0, $a_rsp0(%rsp)
	mov $t1, $a_rsp1(%rsp)
	mov $t2, $a_rsp2(%rsp)
	mov $t3, $a_rsp3(%rsp)

	# Y = Y + X
	add $x0, $y0
	adc $x1, $y1
	adc $x2, $y2
	adc $x3, $y3
	adc $x4, $y4
	jmp .Lbeeu_loop

	.Lbeeu_B_bigger_than_A:
	# B = T = B - A
	mov $t0, $b_rsp0(%rsp)
	mov $t1, $b_rsp1(%rsp)
	mov $t2, $b_rsp2(%rsp)
	mov $t3, $b_rsp3(%rsp)

	# X = Y + X
	add $y0, $x0
	adc $y1, $x1
	adc $y2, $x2
	adc $y3, $x3
	adc $y4, $x4

	jmp .Lbeeu_loop

	.Lbeeu_loop_end:
	# The Euclid's algorithm loop ends when A == beeu(a,n);
	# Therefore (-1)Ya == A (mod \|n\|), Y>0

	# Verify that A = 1 ==> (-1)Ya = A = 1 (mod \|n\|)
	mov $a_rsp0(%rsp), $t1
	sub \$1, $t1
	or $a_rsp1(%rsp), $t1
	or $a_rsp2(%rsp), $t1
	or $a_rsp3(%rsp), $t1
	# If not, fail.
	jnz .Lbeeu_err

	# From this point on, we no longer need X
	# Therefore we use it as a temporary storage.
	# X = n
	movq 0*8($n), $x0
	movq 1*8($n), $x1
	movq 2*8($n), $x2
	movq 3*8($n), $x3
	xorq $x4, $x4

	.Lbeeu_reduction_loop:
	movq $y0, $y_rsp0(%rsp)
	movq $y1, $y_rsp1(%rsp)
	movq $y2, $y_rsp2(%rsp)
	movq $y3, $y_rsp3(%rsp)
	movq $y4, $y_rsp4(%rsp)

	# If Y>n ==> Y=Y-n
	sub $x0, $y0
	sbb $x1, $y1
	sbb $x2, $y2
	sbb $x3, $y3
	sbb \$0, $y4

	# Choose old Y or new Y
	cmovc $y_rsp0(%rsp), $y0
	cmovc $y_rsp1(%rsp), $y1
	cmovc $y_rsp2(%rsp), $y2
	cmovc $y_rsp3(%rsp), $y3
	jnc .Lbeeu_reduction_loop

	# X = n - Y (n, Y < 2^256), (Cancel the (-1))
	sub $y0, $x0
	sbb $y1, $x1
	sbb $y2, $x2
	sbb $y3, $x3

	.Lbeeu_save:
	# Save the inverse(<2^256) to out.
	mov $out_rsp(%rsp), $out

	movq $x0, 0*8($out)
	movq $x1, 1*8($out)
	movq $x2, 2*8($out)
	movq $x3, 3*8($out)

	# Return 1.
	movq \$1, %rax
	jmp .Lbeeu_finish

	.Lbeeu_err:
	# Return 0.
	xorq %rax, %rax

	.Lbeeu_finish:
	add \$$last_rsp_offset, %rsp
	.cfi_adjust_cfa_offset -$last_rsp_offset
	pop %rsi
	.cfi_pop rsi
	pop %rbx
	.cfi_pop rbx
	pop %r15
	.cfi_pop r15
	pop %r14
	.cfi_pop r14
	pop %r13
	.cfi_pop r13
	pop %r12
	.cfi_pop r12
	pop %rbp
	.cfi_pop rbp
	ret
	.cfi_endproc

	.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
	___

	print $code;
	close STDOUT or die "error closing STDOUT: $!";