blob: 1600d4ca50c9908cdf7a8d16409cbde5c529f3c7 [file] [log] [blame]
# Copyright (c) 2018, Amazon Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
#
# Written by Nir Drucker, and Shay Gueron
# AWS Cryptographic Algorithms Group
# (ndrucker@amazon.com, gueron@amazon.com)
# based on BN_mod_inverse_odd
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
#############################################################################
# extern int beeu_mod_inverse_vartime(BN_ULONG out[P256_LIMBS],
# BN_ULONG a[P256_LIMBS],
# BN_ULONG n[P256_LIMBS]);
#
# (Binary Extended Euclidean Algorithm.
# See https://en.wikipedia.org/wiki/Binary_GCD_algorithm)
#
# Assumption 1: n is odd for the BEEU
# Assumption 2: 1 < a < n < 2^256
$out = "%rdi";
$a = "%rsi";
$n = "%rdx";
# X/Y will hold the inverse parameter
# Assumption: X,Y<2^(256)
$x0 = "%r8";
$x1 = "%r9";
$x2 = "%r10";
$x3 = "%r11";
# borrow from out (out is needed only at the end)
$x4 = "%rdi";
$y0 = "%r12";
$y1 = "%r13";
$y2 = "%r14";
$y3 = "%r15";
$y4 = "%rbp";
$shift = "%rcx";
$t0 = "%rax";
$t1 = "%rbx";
$t2 = "%rsi";
# borrow
$t3 = "%rcx";
$T0 = "%xmm0";
$T1 = "%xmm1";
# Offsets on the stack
$out_rsp = 0;
$shift_rsp = $out_rsp+0x8;
$a_rsp0 = $shift_rsp+0x8;
$a_rsp1 = $a_rsp0+0x8;
$a_rsp2 = $a_rsp1+0x8;
$a_rsp3 = $a_rsp2+0x8;
$b_rsp0 = $a_rsp3+0x8;
$b_rsp1 = $b_rsp0+0x8;
$b_rsp2 = $b_rsp1+0x8;
$b_rsp3 = $b_rsp2+0x8;
# Borrow when a_rsp/b_rsp are no longer needed.
$y_rsp0 = $a_rsp0;
$y_rsp1 = $y_rsp0+0x8;
$y_rsp2 = $y_rsp1+0x8;
$y_rsp3 = $y_rsp2+0x8;
$y_rsp4 = $y_rsp3+0x8;
$last_rsp_offset = $b_rsp3+0x8;
sub TEST_B_ZERO {
return <<___;
xorq $t1, $t1
or $b_rsp0(%rsp), $t1
or $b_rsp1(%rsp), $t1
or $b_rsp2(%rsp), $t1
or $b_rsp3(%rsp), $t1
jz .Lbeeu_loop_end
___
}
$g_next_label = 0;
sub SHIFT1 {
my ($var0, $var1, $var2, $var3, $var4) = @_;
my $label = ".Lshift1_${g_next_label}";
$g_next_label++;
return <<___;
# Ensure X is even and divide by two.
movq \$1, $t1
andq $var0, $t1
jz $label
add 0*8($n), $var0
adc 1*8($n), $var1
adc 2*8($n), $var2
adc 3*8($n), $var3
adc \$0, $var4
$label:
shrdq \$1, $var1, $var0
shrdq \$1, $var2, $var1
shrdq \$1, $var3, $var2
shrdq \$1, $var4, $var3
shrq \$1, $var4
___
}
sub SHIFT256 {
my ($var) = @_;
return <<___;
# Copy shifted values.
# Remember not to override t3=rcx
movq 1*8+$var(%rsp), $t0
movq 2*8+$var(%rsp), $t1
movq 3*8+$var(%rsp), $t2
shrdq %cl, $t0, 0*8+$var(%rsp)
shrdq %cl, $t1, 1*8+$var(%rsp)
shrdq %cl, $t2, 2*8+$var(%rsp)
shrq %cl, $t2
mov $t2, 3*8+$var(%rsp)
___
}
$code.=<<___;
.text
.type beeu_mod_inverse_vartime,\@function
.hidden beeu_mod_inverse_vartime
.globl beeu_mod_inverse_vartime
.align 32
beeu_mod_inverse_vartime:
.cfi_startproc
_CET_ENDBR
push %rbp
.cfi_push rbp
push %r12
.cfi_push r12
push %r13
.cfi_push r13
push %r14
.cfi_push r14
push %r15
.cfi_push r15
push %rbx
.cfi_push rbx
push %rsi
.cfi_push rsi
sub \$$last_rsp_offset, %rsp
.cfi_adjust_cfa_offset $last_rsp_offset
movq $out, $out_rsp(%rsp)
# X=1, Y=0
movq \$1, $x0
xorq $x1, $x1
xorq $x2, $x2
xorq $x3, $x3
xorq $x4, $x4
xorq $y0, $y0
xorq $y1, $y1
xorq $y2, $y2
xorq $y3, $y3
xorq $y4, $y4
# Copy a/n into B/A on the stack.
vmovdqu 0*8($a), $T0
vmovdqu 2*8($a), $T1
vmovdqu $T0, $b_rsp0(%rsp)
vmovdqu $T1, $b_rsp2(%rsp)
vmovdqu 0*8($n), $T0
vmovdqu 2*8($n), $T1
vmovdqu $T0, $a_rsp0(%rsp)
vmovdqu $T1, $a_rsp2(%rsp)
.Lbeeu_loop:
${\TEST_B_ZERO}
# 0 < B < |n|,
# 0 < A <= |n|,
# (1) X*a == B (mod |n|),
# (2) (-1)*Y*a == A (mod |n|)
# Now divide B by the maximum possible power of two in the
# integers, and divide X by the same value mod |n|. When we're
# done, (1) still holds.
movq \$1, $shift
# Note that B > 0
.Lbeeu_shift_loop_XB:
movq $shift, $t1
andq $b_rsp0(%rsp), $t1
jnz .Lbeeu_shift_loop_end_XB
${\SHIFT1($x0, $x1, $x2, $x3, $x4)}
shl \$1, $shift
# Test wraparound of the shift parameter. The probability to have 32 zeroes
# in a row is small Therefore having the value below equal \$0x8000000 or
# \$0x8000 does not affect the performance. We choose 0x8000000 because it
# is the maximal immediate value possible.
cmp \$0x8000000, $shift
jne .Lbeeu_shift_loop_XB
.Lbeeu_shift_loop_end_XB:
bsf $shift, $shift
test $shift, $shift
jz .Lbeeu_no_shift_XB
${\SHIFT256($b_rsp0)}
.Lbeeu_no_shift_XB:
# Same for A and Y. Afterwards, (2) still holds.
movq \$1, $shift
# Note that A > 0
.Lbeeu_shift_loop_YA:
movq $shift, $t1
andq $a_rsp0(%rsp), $t1
jnz .Lbeeu_shift_loop_end_YA
${\SHIFT1($y0, $y1, $y2, $y3, $y4)}
shl \$1, $shift
# Test wraparound of the shift parameter. The probability to have 32 zeroes
# in a row is small therefore having the value below equal \$0x8000000 or
# \$0x8000 Does not affect the performance. We choose 0x8000000 because it
# is the maximal immediate value possible.
cmp \$0x8000000, $shift
jne .Lbeeu_shift_loop_YA
.Lbeeu_shift_loop_end_YA:
bsf $shift, $shift
test $shift, $shift
jz .Lbeeu_no_shift_YA
${\SHIFT256($a_rsp0)}
.Lbeeu_no_shift_YA:
# T = B-A (A,B < 2^256)
mov $b_rsp0(%rsp), $t0
mov $b_rsp1(%rsp), $t1
mov $b_rsp2(%rsp), $t2
mov $b_rsp3(%rsp), $t3
sub $a_rsp0(%rsp), $t0
sbb $a_rsp1(%rsp), $t1
sbb $a_rsp2(%rsp), $t2
sbb $a_rsp3(%rsp), $t3 # borrow from shift
jnc .Lbeeu_B_bigger_than_A
# A = A - B
mov $a_rsp0(%rsp), $t0
mov $a_rsp1(%rsp), $t1
mov $a_rsp2(%rsp), $t2
mov $a_rsp3(%rsp), $t3
sub $b_rsp0(%rsp), $t0
sbb $b_rsp1(%rsp), $t1
sbb $b_rsp2(%rsp), $t2
sbb $b_rsp3(%rsp), $t3
mov $t0, $a_rsp0(%rsp)
mov $t1, $a_rsp1(%rsp)
mov $t2, $a_rsp2(%rsp)
mov $t3, $a_rsp3(%rsp)
# Y = Y + X
add $x0, $y0
adc $x1, $y1
adc $x2, $y2
adc $x3, $y3
adc $x4, $y4
jmp .Lbeeu_loop
.Lbeeu_B_bigger_than_A:
# B = T = B - A
mov $t0, $b_rsp0(%rsp)
mov $t1, $b_rsp1(%rsp)
mov $t2, $b_rsp2(%rsp)
mov $t3, $b_rsp3(%rsp)
# X = Y + X
add $y0, $x0
adc $y1, $x1
adc $y2, $x2
adc $y3, $x3
adc $y4, $x4
jmp .Lbeeu_loop
.Lbeeu_loop_end:
# The Euclid's algorithm loop ends when A == beeu(a,n);
# Therefore (-1)*Y*a == A (mod |n|), Y>0
# Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|)
mov $a_rsp0(%rsp), $t1
sub \$1, $t1
or $a_rsp1(%rsp), $t1
or $a_rsp2(%rsp), $t1
or $a_rsp3(%rsp), $t1
# If not, fail.
jnz .Lbeeu_err
# From this point on, we no longer need X
# Therefore we use it as a temporary storage.
# X = n
movq 0*8($n), $x0
movq 1*8($n), $x1
movq 2*8($n), $x2
movq 3*8($n), $x3
xorq $x4, $x4
.Lbeeu_reduction_loop:
movq $y0, $y_rsp0(%rsp)
movq $y1, $y_rsp1(%rsp)
movq $y2, $y_rsp2(%rsp)
movq $y3, $y_rsp3(%rsp)
movq $y4, $y_rsp4(%rsp)
# If Y>n ==> Y=Y-n
sub $x0, $y0
sbb $x1, $y1
sbb $x2, $y2
sbb $x3, $y3
sbb \$0, $y4
# Choose old Y or new Y
cmovc $y_rsp0(%rsp), $y0
cmovc $y_rsp1(%rsp), $y1
cmovc $y_rsp2(%rsp), $y2
cmovc $y_rsp3(%rsp), $y3
jnc .Lbeeu_reduction_loop
# X = n - Y (n, Y < 2^256), (Cancel the (-1))
sub $y0, $x0
sbb $y1, $x1
sbb $y2, $x2
sbb $y3, $x3
.Lbeeu_save:
# Save the inverse(<2^256) to out.
mov $out_rsp(%rsp), $out
movq $x0, 0*8($out)
movq $x1, 1*8($out)
movq $x2, 2*8($out)
movq $x3, 3*8($out)
# Return 1.
movq \$1, %rax
jmp .Lbeeu_finish
.Lbeeu_err:
# Return 0.
xorq %rax, %rax
.Lbeeu_finish:
add \$$last_rsp_offset, %rsp
.cfi_adjust_cfa_offset -$last_rsp_offset
pop %rsi
.cfi_pop rsi
pop %rbx
.cfi_pop rbx
pop %r15
.cfi_pop r15
pop %r14
.cfi_pop r14
pop %r13
.cfi_pop r13
pop %r12
.cfi_pop r12
pop %rbp
.cfi_pop rbp
ret
.cfi_endproc
.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime
___
print $code;
close STDOUT or die "error closing STDOUT: $!";