Add x86-64 assembly for X25519. This assembly is in gas syntax, so it is not built on Windows; it is also not built when OPENSSL_SMALL is defined. Change-Id: I1050cf1b16350fd4b758e4c463261b30a1b65390 Reviewed-on: https://boringssl-review.googlesource.com/6782 Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/curve25519/CMakeLists.txt b/crypto/curve25519/CMakeLists.txt index ac397d0..1614ba1 100644 --- a/crypto/curve25519/CMakeLists.txt +++ b/crypto/curve25519/CMakeLists.txt
@@ -8,12 +8,21 @@ ) endif() +if (${ARCH} STREQUAL "x86_64") + set( + CURVE25519_ARCH_SOURCES + + asm/x25519-x86_64.S + ) +endif() + add_library( curve25519 OBJECT curve25519.c + x25519-x86_64.c ${CURVE25519_ARCH_SOURCES} )
diff --git a/crypto/curve25519/asm/x25519-x86_64.S b/crypto/curve25519/asm/x25519-x86_64.S new file mode 100644 index 0000000..7e86a23 --- /dev/null +++ b/crypto/curve25519/asm/x25519-x86_64.S
@@ -0,0 +1,1931 @@ +/* Copyright (c) 2015, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +/* This file is adapted from crypto_scalarmult/curve25519/amd64-51/ in + * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public + * domain licensed but the standard ISC license is included above to keep + * licensing simple. */ + +.data +.p2align 4 + +#if defined(__APPLE__) +/* OS X's C ABI prefixes functions with underscore. 
*/ +#define C_ABI(x) _ ## x +#define HIDDEN .private_extern +#else +#define C_ABI(x) x +#define HIDDEN .hidden +#endif + +x25519_x86_64_REDMASK51: .quad 0x0007FFFFFFFFFFFF +x25519_x86_64_121666_213: .quad 996687872 +x25519_x86_64_2P0: .quad 0xFFFFFFFFFFFDA +x25519_x86_64_2P1234: .quad 0xFFFFFFFFFFFFE +x25519_x86_64_4P0: .quad 0x1FFFFFFFFFFFB4 +x25519_x86_64_4P1234: .quad 0x1FFFFFFFFFFFFC +x25519_x86_64_MU0: .quad 0xED9CE5A30A2C131B +x25519_x86_64_MU1: .quad 0x2106215D086329A7 +x25519_x86_64_MU2: .quad 0xFFFFFFFFFFFFFFEB +x25519_x86_64_MU3: .quad 0xFFFFFFFFFFFFFFFF +x25519_x86_64_MU4: .quad 0x000000000000000F +x25519_x86_64_ORDER0: .quad 0x5812631A5CF5D3ED +x25519_x86_64_ORDER1: .quad 0x14DEF9DEA2F79CD6 +x25519_x86_64_ORDER2: .quad 0x0000000000000000 +x25519_x86_64_ORDER3: .quad 0x1000000000000000 +x25519_x86_64_EC2D0: .quad 1859910466990425 +x25519_x86_64_EC2D1: .quad 932731440258426 +x25519_x86_64_EC2D2: .quad 1072319116312658 +x25519_x86_64_EC2D3: .quad 1815898335770999 +x25519_x86_64_EC2D4: .quad 633789495995903 +x25519_x86_64__38: .quad 38 + +.text +.p2align 5 + +.globl C_ABI(x25519_x86_64_freeze) +HIDDEN C_ABI(x25519_x86_64_freeze) +C_ABI(x25519_x86_64_freeze): +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq 0(%rdi),%rsi +movq 8(%rdi),%rdx +movq 16(%rdi),%rcx +movq 24(%rdi),%r8 +movq 32(%rdi),%r9 +movq x25519_x86_64_REDMASK51(%rip),%rax +mov %rax,%r10 +sub $18,%r10 +mov $3,%r11 +._reduceloop: +mov %rsi,%r12 +shr $51,%r12 +and %rax,%rsi +add %r12,%rdx +mov %rdx,%r12 +shr $51,%r12 +and %rax,%rdx +add %r12,%rcx +mov %rcx,%r12 +shr $51,%r12 +and %rax,%rcx +add %r12,%r8 +mov %r8,%r12 +shr $51,%r12 +and %rax,%r8 +add %r12,%r9 +mov %r9,%r12 +shr $51,%r12 +and %rax,%r9 +imulq $19,%r12,%r12 +add %r12,%rsi +sub $1,%r11 +ja ._reduceloop +mov $1,%r12 +cmp %r10,%rsi +cmovl %r11,%r12 +cmp %rax,%rdx +cmovne %r11,%r12 +cmp 
%rax,%rcx +cmovne %r11,%r12 +cmp %rax,%r8 +cmovne %r11,%r12 +cmp %rax,%r9 +cmovne %r11,%r12 +neg %r12 +and %r12,%rax +and %r12,%r10 +sub %r10,%rsi +sub %rax,%rdx +sub %rax,%rcx +sub %rax,%r8 +sub %rax,%r9 +movq %rsi,0(%rdi) +movq %rdx,8(%rdi) +movq %rcx,16(%rdi) +movq %r8,24(%rdi) +movq %r9,32(%rdi) +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp +add %r11,%rsp +mov %rdi,%rax +mov %rsi,%rdx +ret + +.p2align 5 +.globl C_ABI(x25519_x86_64_mul) +HIDDEN C_ABI(x25519_x86_64_mul) +C_ABI(x25519_x86_64_mul): +mov %rsp,%r11 +and $31,%r11 +add $96,%r11 +sub %r11,%rsp +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq %rdi,56(%rsp) +mov %rdx,%rcx +movq 24(%rsi),%rdx +imulq $19,%rdx,%rax +movq %rax,64(%rsp) +mulq 16(%rcx) +mov %rax,%r8 +mov %rdx,%r9 +movq 32(%rsi),%rdx +imulq $19,%rdx,%rax +movq %rax,72(%rsp) +mulq 8(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsi),%rax +mulq 0(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 0(%rsi),%rax +mulq 8(%rcx) +mov %rax,%r10 +mov %rdx,%r11 +movq 0(%rsi),%rax +mulq 16(%rcx) +mov %rax,%r12 +mov %rdx,%r13 +movq 0(%rsi),%rax +mulq 24(%rcx) +mov %rax,%r14 +mov %rdx,%r15 +movq 0(%rsi),%rax +mulq 32(%rcx) +mov %rax,%rbx +mov %rdx,%rbp +movq 8(%rsi),%rax +mulq 0(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 8(%rsi),%rax +mulq 8(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 8(%rsi),%rax +mulq 16(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 8(%rsi),%rax +mulq 24(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 8(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 16(%rsi),%rax +mulq 0(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 16(%rsi),%rax +mulq 8(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 16(%rsi),%rax +mulq 16(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 16(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rcx) +add %rax,%r8 +adc %rdx,%r9 +movq 16(%rsi),%rdx +imulq 
$19,%rdx,%rax +mulq 32(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 24(%rsi),%rax +mulq 0(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq 24(%rsi),%rax +mulq 8(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 64(%rsp),%rax +mulq 24(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 64(%rsp),%rax +mulq 32(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 32(%rsi),%rax +mulq 0(%rcx) +add %rax,%rbx +adc %rdx,%rbp +movq 72(%rsp),%rax +mulq 16(%rcx) +add %rax,%r10 +adc %rdx,%r11 +movq 72(%rsp),%rax +mulq 24(%rcx) +add %rax,%r12 +adc %rdx,%r13 +movq 72(%rsp),%rax +mulq 32(%rcx) +add %rax,%r14 +adc %rdx,%r15 +movq x25519_x86_64_REDMASK51(%rip),%rsi +shld $13,%r8,%r9 +and %rsi,%r8 +shld $13,%r10,%r11 +and %rsi,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rsi,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rsi,%r14 +add %r13,%r14 +shld $13,%rbx,%rbp +and %rsi,%rbx +add %r15,%rbx +imulq $19,%rbp,%rdx +add %rdx,%r8 +mov %r8,%rdx +shr $51,%rdx +add %r10,%rdx +mov %rdx,%rcx +shr $51,%rdx +and %rsi,%r8 +add %r12,%rdx +mov %rdx,%r9 +shr $51,%rdx +and %rsi,%rcx +add %r14,%rdx +mov %rdx,%rax +shr $51,%rdx +and %rsi,%r9 +add %rbx,%rdx +mov %rdx,%r10 +shr $51,%rdx +and %rsi,%rax +imulq $19,%rdx,%rdx +add %rdx,%r8 +and %rsi,%r10 +movq %r8,0(%rdi) +movq %rcx,8(%rdi) +movq %r9,16(%rdi) +movq %rax,24(%rdi) +movq %r10,32(%rdi) +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp +add %r11,%rsp +mov %rdi,%rax +mov %rsi,%rdx +ret + +.p2align 5 +.globl C_ABI(x25519_x86_64_square) +HIDDEN C_ABI(x25519_x86_64_square) +C_ABI(x25519_x86_64_square): +mov %rsp,%r11 +and $31,%r11 +add $64,%r11 +sub %r11,%rsp +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq 0(%rsi),%rax +mulq 0(%rsi) +mov %rax,%rcx +mov %rdx,%r8 +movq 0(%rsi),%rax +shl $1,%rax +mulq 8(%rsi) +mov %rax,%r9 +mov %rdx,%r10 +movq 0(%rsi),%rax +shl $1,%rax +mulq 16(%rsi) +mov %rax,%r11 +mov 
%rdx,%r12 +movq 0(%rsi),%rax +shl $1,%rax +mulq 24(%rsi) +mov %rax,%r13 +mov %rdx,%r14 +movq 0(%rsi),%rax +shl $1,%rax +mulq 32(%rsi) +mov %rax,%r15 +mov %rdx,%rbx +movq 8(%rsi),%rax +mulq 8(%rsi) +add %rax,%r11 +adc %rdx,%r12 +movq 8(%rsi),%rax +shl $1,%rax +mulq 16(%rsi) +add %rax,%r13 +adc %rdx,%r14 +movq 8(%rsi),%rax +shl $1,%rax +mulq 24(%rsi) +add %rax,%r15 +adc %rdx,%rbx +movq 8(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%rcx +adc %rdx,%r8 +movq 16(%rsi),%rax +mulq 16(%rsi) +add %rax,%r15 +adc %rdx,%rbx +movq 16(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 24(%rsi) +add %rax,%rcx +adc %rdx,%r8 +movq 16(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%r9 +adc %rdx,%r10 +movq 24(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rsi) +add %rax,%r9 +adc %rdx,%r10 +movq 24(%rsi),%rdx +imulq $38,%rdx,%rax +mulq 32(%rsi) +add %rax,%r11 +adc %rdx,%r12 +movq 32(%rsi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rsi) +add %rax,%r13 +adc %rdx,%r14 +movq x25519_x86_64_REDMASK51(%rip),%rsi +shld $13,%rcx,%r8 +and %rsi,%rcx +shld $13,%r9,%r10 +and %rsi,%r9 +add %r8,%r9 +shld $13,%r11,%r12 +and %rsi,%r11 +add %r10,%r11 +shld $13,%r13,%r14 +and %rsi,%r13 +add %r12,%r13 +shld $13,%r15,%rbx +and %rsi,%r15 +add %r14,%r15 +imulq $19,%rbx,%rdx +add %rdx,%rcx +mov %rcx,%rdx +shr $51,%rdx +add %r9,%rdx +and %rsi,%rcx +mov %rdx,%r8 +shr $51,%rdx +add %r11,%rdx +and %rsi,%r8 +mov %rdx,%r9 +shr $51,%rdx +add %r13,%rdx +and %rsi,%r9 +mov %rdx,%rax +shr $51,%rdx +add %r15,%rdx +and %rsi,%rax +mov %rdx,%r10 +shr $51,%rdx +imulq $19,%rdx,%rdx +add %rdx,%rcx +and %rsi,%r10 +movq %rcx,0(%rdi) +movq %r8,8(%rdi) +movq %r9,16(%rdi) +movq %rax,24(%rdi) +movq %r10,32(%rdi) +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp +add %r11,%rsp +mov %rdi,%rax +mov %rsi,%rdx +ret + +.p2align 5 +.globl C_ABI(x25519_x86_64_ladderstep) +HIDDEN C_ABI(x25519_x86_64_ladderstep) +C_ABI(x25519_x86_64_ladderstep): +mov 
%rsp,%r11 +and $31,%r11 +add $352,%r11 +sub %r11,%rsp +movq %r11,0(%rsp) +movq %r12,8(%rsp) +movq %r13,16(%rsp) +movq %r14,24(%rsp) +movq %r15,32(%rsp) +movq %rbx,40(%rsp) +movq %rbp,48(%rsp) +movq 40(%rdi),%rsi +movq 48(%rdi),%rdx +movq 56(%rdi),%rcx +movq 64(%rdi),%r8 +movq 72(%rdi),%r9 +mov %rsi,%rax +mov %rdx,%r10 +mov %rcx,%r11 +mov %r8,%r12 +mov %r9,%r13 +add x25519_x86_64_2P0(%rip),%rax +add x25519_x86_64_2P1234(%rip),%r10 +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 80(%rdi),%rsi +addq 88(%rdi),%rdx +addq 96(%rdi),%rcx +addq 104(%rdi),%r8 +addq 112(%rdi),%r9 +subq 80(%rdi),%rax +subq 88(%rdi),%r10 +subq 96(%rdi),%r11 +subq 104(%rdi),%r12 +subq 112(%rdi),%r13 +movq %rsi,56(%rsp) +movq %rdx,64(%rsp) +movq %rcx,72(%rsp) +movq %r8,80(%rsp) +movq %r9,88(%rsp) +movq %rax,96(%rsp) +movq %r10,104(%rsp) +movq %r11,112(%rsp) +movq %r12,120(%rsp) +movq %r13,128(%rsp) +movq 96(%rsp),%rax +mulq 96(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 96(%rsp),%rax +shl $1,%rax +mulq 104(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 96(%rsp),%rax +shl $1,%rax +mulq 112(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 96(%rsp),%rax +shl $1,%rax +mulq 120(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 96(%rsp),%rax +shl $1,%rax +mulq 128(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 104(%rsp),%rax +mulq 104(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 104(%rsp),%rax +shl $1,%rax +mulq 112(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 104(%rsp),%rax +shl $1,%rax +mulq 120(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 104(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 128(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 112(%rsp),%rax +mulq 112(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 112(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 120(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 112(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 128(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 120(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 120(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 120(%rsp),%rdx +imulq 
$38,%rdx,%rax +mulq 128(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 128(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 128(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,136(%rsp) +movq %r8,144(%rsp) +movq %r9,152(%rsp) +movq %rax,160(%rsp) +movq %r10,168(%rsp) +movq 56(%rsp),%rax +mulq 56(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 56(%rsp),%rax +shl $1,%rax +mulq 64(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 56(%rsp),%rax +shl $1,%rax +mulq 72(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 56(%rsp),%rax +shl $1,%rax +mulq 80(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 56(%rsp),%rax +shl $1,%rax +mulq 88(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 64(%rsp),%rax +mulq 64(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 64(%rsp),%rax +shl $1,%rax +mulq 72(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 64(%rsp),%rax +shl $1,%rax +mulq 80(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 64(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 88(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 72(%rsp),%rax +mulq 72(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 72(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 80(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 72(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 88(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 80(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 80(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 80(%rsp),%rdx +imulq $38,%rdx,%rax +mulq 88(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 88(%rsp),%rdx +imulq $19,%rdx,%rax 
+mulq 88(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,176(%rsp) +movq %r8,184(%rsp) +movq %r9,192(%rsp) +movq %rax,200(%rsp) +movq %r10,208(%rsp) +mov %rsi,%rsi +mov %r8,%rdx +mov %r9,%rcx +mov %rax,%r8 +mov %r10,%r9 +add x25519_x86_64_2P0(%rip),%rsi +add x25519_x86_64_2P1234(%rip),%rdx +add x25519_x86_64_2P1234(%rip),%rcx +add x25519_x86_64_2P1234(%rip),%r8 +add x25519_x86_64_2P1234(%rip),%r9 +subq 136(%rsp),%rsi +subq 144(%rsp),%rdx +subq 152(%rsp),%rcx +subq 160(%rsp),%r8 +subq 168(%rsp),%r9 +movq %rsi,216(%rsp) +movq %rdx,224(%rsp) +movq %rcx,232(%rsp) +movq %r8,240(%rsp) +movq %r9,248(%rsp) +movq 120(%rdi),%rsi +movq 128(%rdi),%rdx +movq 136(%rdi),%rcx +movq 144(%rdi),%r8 +movq 152(%rdi),%r9 +mov %rsi,%rax +mov %rdx,%r10 +mov %rcx,%r11 +mov %r8,%r12 +mov %r9,%r13 +add x25519_x86_64_2P0(%rip),%rax +add x25519_x86_64_2P1234(%rip),%r10 +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 160(%rdi),%rsi +addq 168(%rdi),%rdx +addq 176(%rdi),%rcx +addq 184(%rdi),%r8 +addq 192(%rdi),%r9 +subq 160(%rdi),%rax +subq 168(%rdi),%r10 +subq 176(%rdi),%r11 +subq 184(%rdi),%r12 +subq 192(%rdi),%r13 +movq %rsi,256(%rsp) +movq %rdx,264(%rsp) +movq %rcx,272(%rsp) +movq %r8,280(%rsp) +movq %r9,288(%rsp) +movq %rax,296(%rsp) +movq %r10,304(%rsp) +movq %r11,312(%rsp) +movq %r12,320(%rsp) +movq %r13,328(%rsp) 
+movq 280(%rsp),%rsi +imulq $19,%rsi,%rax +movq %rax,336(%rsp) +mulq 112(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 288(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,344(%rsp) +mulq 104(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 256(%rsp),%rax +mulq 96(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 256(%rsp),%rax +mulq 104(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 256(%rsp),%rax +mulq 112(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 256(%rsp),%rax +mulq 120(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 256(%rsp),%rax +mulq 128(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 264(%rsp),%rax +mulq 96(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 264(%rsp),%rax +mulq 104(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 264(%rsp),%rax +mulq 112(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 264(%rsp),%rax +mulq 120(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 264(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 128(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 272(%rsp),%rax +mulq 96(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 272(%rsp),%rax +mulq 104(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 272(%rsp),%rax +mulq 112(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 272(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 120(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 272(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 128(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 280(%rsp),%rax +mulq 96(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 280(%rsp),%rax +mulq 104(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 336(%rsp),%rax +mulq 120(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 336(%rsp),%rax +mulq 128(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 288(%rsp),%rax +mulq 96(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 344(%rsp),%rax +mulq 112(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 344(%rsp),%rax +mulq 120(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 344(%rsp),%rax +mulq 128(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 
+and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,96(%rsp) +movq %r8,104(%rsp) +movq %r9,112(%rsp) +movq %rax,120(%rsp) +movq %r10,128(%rsp) +movq 320(%rsp),%rsi +imulq $19,%rsi,%rax +movq %rax,256(%rsp) +mulq 72(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 328(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,264(%rsp) +mulq 64(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 296(%rsp),%rax +mulq 56(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 296(%rsp),%rax +mulq 64(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 296(%rsp),%rax +mulq 72(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 296(%rsp),%rax +mulq 80(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 296(%rsp),%rax +mulq 88(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 304(%rsp),%rax +mulq 56(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 304(%rsp),%rax +mulq 64(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 304(%rsp),%rax +mulq 72(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 304(%rsp),%rax +mulq 80(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 304(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 88(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 312(%rsp),%rax +mulq 56(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 312(%rsp),%rax +mulq 64(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 312(%rsp),%rax +mulq 72(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 312(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 80(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 312(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 88(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 320(%rsp),%rax +mulq 56(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 320(%rsp),%rax +mulq 64(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 256(%rsp),%rax +mulq 80(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 
256(%rsp),%rax +mulq 88(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 328(%rsp),%rax +mulq 56(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 264(%rsp),%rax +mulq 72(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 264(%rsp),%rax +mulq 80(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 264(%rsp),%rax +mulq 88(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +mov %rsi,%rdx +mov %r8,%rcx +mov %r9,%r11 +mov %rax,%r12 +mov %r10,%r13 +add x25519_x86_64_2P0(%rip),%rdx +add x25519_x86_64_2P1234(%rip),%rcx +add x25519_x86_64_2P1234(%rip),%r11 +add x25519_x86_64_2P1234(%rip),%r12 +add x25519_x86_64_2P1234(%rip),%r13 +addq 96(%rsp),%rsi +addq 104(%rsp),%r8 +addq 112(%rsp),%r9 +addq 120(%rsp),%rax +addq 128(%rsp),%r10 +subq 96(%rsp),%rdx +subq 104(%rsp),%rcx +subq 112(%rsp),%r11 +subq 120(%rsp),%r12 +subq 128(%rsp),%r13 +movq %rsi,120(%rdi) +movq %r8,128(%rdi) +movq %r9,136(%rdi) +movq %rax,144(%rdi) +movq %r10,152(%rdi) +movq %rdx,160(%rdi) +movq %rcx,168(%rdi) +movq %r11,176(%rdi) +movq %r12,184(%rdi) +movq %r13,192(%rdi) +movq 120(%rdi),%rax +mulq 120(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 120(%rdi),%rax +shl $1,%rax +mulq 128(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 120(%rdi),%rax +shl $1,%rax +mulq 136(%rdi) +mov %rax,%r10 +mov %rdx,%r11 +movq 120(%rdi),%rax +shl $1,%rax +mulq 144(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 120(%rdi),%rax +shl $1,%rax +mulq 152(%rdi) +mov %rax,%r14 +mov %rdx,%r15 +movq 
128(%rdi),%rax +mulq 128(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 128(%rdi),%rax +shl $1,%rax +mulq 136(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 128(%rdi),%rax +shl $1,%rax +mulq 144(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 128(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rdi),%rax +mulq 136(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 136(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 144(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 136(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 144(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 144(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 144(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 152(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 152(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 152(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,120(%rdi) +movq %r8,128(%rdi) +movq %r9,136(%rdi) +movq %rax,144(%rdi) +movq %r10,152(%rdi) +movq 160(%rdi),%rax +mulq 160(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 160(%rdi),%rax +shl $1,%rax +mulq 168(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 160(%rdi),%rax +shl $1,%rax +mulq 176(%rdi) +mov %rax,%r10 +mov %rdx,%r11 +movq 160(%rdi),%rax +shl $1,%rax +mulq 184(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 160(%rdi),%rax +shl $1,%rax +mulq 192(%rdi) +mov %rax,%r14 +mov %rdx,%r15 +movq 168(%rdi),%rax +mulq 168(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 
168(%rdi),%rax +shl $1,%rax +mulq 176(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 168(%rdi),%rax +shl $1,%rax +mulq 184(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 168(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rax +mulq 176(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 176(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 184(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 184(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rdx +imulq $38,%rdx,%rax +mulq 192(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 192(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 192(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +and %rdx,%rsi +mov %rcx,%r8 +shr $51,%rcx +add %r10,%rcx +and %rdx,%r8 +mov %rcx,%r9 +shr $51,%rcx +add %r12,%rcx +and %rdx,%r9 +mov %rcx,%rax +shr $51,%rcx +add %r14,%rcx +and %rdx,%rax +mov %rcx,%r10 +shr $51,%rcx +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,160(%rdi) +movq %r8,168(%rdi) +movq %r9,176(%rdi) +movq %rax,184(%rdi) +movq %r10,192(%rdi) +movq 184(%rdi),%rsi +imulq $19,%rsi,%rax +movq %rax,56(%rsp) +mulq 16(%rdi) +mov %rax,%rsi +mov %rdx,%rcx +movq 192(%rdi),%rdx +imulq $19,%rdx,%rax +movq %rax,64(%rsp) +mulq 8(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 160(%rdi),%rax +mulq 0(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 160(%rdi),%rax +mulq 8(%rdi) +mov %rax,%r8 +mov %rdx,%r9 +movq 160(%rdi),%rax +mulq 16(%rdi) +mov %rax,%r10 +mov %rdx,%r11 +movq 160(%rdi),%rax +mulq 24(%rdi) +mov %rax,%r12 +mov %rdx,%r13 +movq 160(%rdi),%rax +mulq 32(%rdi) +mov %rax,%r14 +mov 
%rdx,%r15 +movq 168(%rdi),%rax +mulq 0(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 168(%rdi),%rax +mulq 8(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 168(%rdi),%rax +mulq 16(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 168(%rdi),%rax +mulq 24(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 168(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rax +mulq 0(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 176(%rdi),%rax +mulq 8(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 176(%rdi),%rax +mulq 16(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 176(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 24(%rdi) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 32(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rdi),%rax +mulq 0(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq 184(%rdi),%rax +mulq 8(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 56(%rsp),%rax +mulq 24(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 56(%rsp),%rax +mulq 32(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 192(%rdi),%rax +mulq 0(%rdi) +add %rax,%r14 +adc %rdx,%r15 +movq 64(%rsp),%rax +mulq 16(%rdi) +add %rax,%r8 +adc %rdx,%r9 +movq 64(%rsp),%rax +mulq 24(%rdi) +add %rax,%r10 +adc %rdx,%r11 +movq 64(%rsp),%rax +mulq 32(%rdi) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,160(%rdi) +movq %r8,168(%rdi) +movq %r9,176(%rdi) +movq %rax,184(%rdi) +movq %r10,192(%rdi) +movq 200(%rsp),%rsi +imulq 
$19,%rsi,%rax +movq %rax,56(%rsp) +mulq 152(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 208(%rsp),%rdx +imulq $19,%rdx,%rax +movq %rax,64(%rsp) +mulq 144(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rsp),%rax +mulq 136(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 176(%rsp),%rax +mulq 144(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 176(%rsp),%rax +mulq 152(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 176(%rsp),%rax +mulq 160(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 176(%rsp),%rax +mulq 168(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 184(%rsp),%rax +mulq 136(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 184(%rsp),%rax +mulq 144(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 184(%rsp),%rax +mulq 152(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 184(%rsp),%rax +mulq 160(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 184(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 168(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 192(%rsp),%rax +mulq 136(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 192(%rsp),%rax +mulq 144(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 192(%rsp),%rax +mulq 152(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 192(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 160(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 192(%rsp),%rdx +imulq $19,%rdx,%rax +mulq 168(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 200(%rsp),%rax +mulq 136(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 200(%rsp),%rax +mulq 144(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 56(%rsp),%rax +mulq 160(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 56(%rsp),%rax +mulq 168(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 208(%rsp),%rax +mulq 136(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 64(%rsp),%rax +mulq 152(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 64(%rsp),%rax +mulq 160(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 64(%rsp),%rax +mulq 168(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 
+shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,40(%rdi) +movq %r8,48(%rdi) +movq %r9,56(%rdi) +movq %rax,64(%rdi) +movq %r10,72(%rdi) +movq 216(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +mov %rax,%rsi +mov %rdx,%rcx +movq 224(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%rcx +mov %rdx,%r8 +movq 232(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r8 +mov %rdx,%r9 +movq 240(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r9 +mov %rdx,%r10 +movq 248(%rsp),%rax +mulq x25519_x86_64_121666_213(%rip) +shr $13,%rax +add %rax,%r10 +imulq $19,%rdx,%rdx +add %rdx,%rsi +addq 136(%rsp),%rsi +addq 144(%rsp),%rcx +addq 152(%rsp),%r8 +addq 160(%rsp),%r9 +addq 168(%rsp),%r10 +movq %rsi,80(%rdi) +movq %rcx,88(%rdi) +movq %r8,96(%rdi) +movq %r9,104(%rdi) +movq %r10,112(%rdi) +movq 104(%rdi),%rsi +imulq $19,%rsi,%rax +movq %rax,56(%rsp) +mulq 232(%rsp) +mov %rax,%rsi +mov %rdx,%rcx +movq 112(%rdi),%rdx +imulq $19,%rdx,%rax +movq %rax,64(%rsp) +mulq 224(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 80(%rdi),%rax +mulq 216(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 80(%rdi),%rax +mulq 224(%rsp) +mov %rax,%r8 +mov %rdx,%r9 +movq 80(%rdi),%rax +mulq 232(%rsp) +mov %rax,%r10 +mov %rdx,%r11 +movq 80(%rdi),%rax +mulq 240(%rsp) +mov %rax,%r12 +mov %rdx,%r13 +movq 80(%rdi),%rax +mulq 248(%rsp) +mov %rax,%r14 +mov %rdx,%r15 +movq 88(%rdi),%rax +mulq 216(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 88(%rdi),%rax +mulq 224(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 88(%rdi),%rax +mulq 232(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 88(%rdi),%rax +mulq 
240(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 88(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 248(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 96(%rdi),%rax +mulq 216(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 96(%rdi),%rax +mulq 224(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 96(%rdi),%rax +mulq 232(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 96(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 240(%rsp) +add %rax,%rsi +adc %rdx,%rcx +movq 96(%rdi),%rdx +imulq $19,%rdx,%rax +mulq 248(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 104(%rdi),%rax +mulq 216(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq 104(%rdi),%rax +mulq 224(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 56(%rsp),%rax +mulq 240(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 56(%rsp),%rax +mulq 248(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 112(%rdi),%rax +mulq 216(%rsp) +add %rax,%r14 +adc %rdx,%r15 +movq 64(%rsp),%rax +mulq 232(%rsp) +add %rax,%r8 +adc %rdx,%r9 +movq 64(%rsp),%rax +mulq 240(%rsp) +add %rax,%r10 +adc %rdx,%r11 +movq 64(%rsp),%rax +mulq 248(%rsp) +add %rax,%r12 +adc %rdx,%r13 +movq x25519_x86_64_REDMASK51(%rip),%rdx +shld $13,%rsi,%rcx +and %rdx,%rsi +shld $13,%r8,%r9 +and %rdx,%r8 +add %rcx,%r8 +shld $13,%r10,%r11 +and %rdx,%r10 +add %r9,%r10 +shld $13,%r12,%r13 +and %rdx,%r12 +add %r11,%r12 +shld $13,%r14,%r15 +and %rdx,%r14 +add %r13,%r14 +imulq $19,%r15,%rcx +add %rcx,%rsi +mov %rsi,%rcx +shr $51,%rcx +add %r8,%rcx +mov %rcx,%r8 +shr $51,%rcx +and %rdx,%rsi +add %r10,%rcx +mov %rcx,%r9 +shr $51,%rcx +and %rdx,%r8 +add %r12,%rcx +mov %rcx,%rax +shr $51,%rcx +and %rdx,%r9 +add %r14,%rcx +mov %rcx,%r10 +shr $51,%rcx +and %rdx,%rax +imulq $19,%rcx,%rcx +add %rcx,%rsi +and %rdx,%r10 +movq %rsi,80(%rdi) +movq %r8,88(%rdi) +movq %r9,96(%rdi) +movq %rax,104(%rdi) +movq %r10,112(%rdi) +movq 0(%rsp),%r11 +movq 8(%rsp),%r12 +movq 16(%rsp),%r13 +movq 24(%rsp),%r14 +movq 32(%rsp),%r15 +movq 40(%rsp),%rbx +movq 48(%rsp),%rbp +add %r11,%rsp +mov %rdi,%rax +mov %rsi,%rdx +ret + +.p2align 5 +.globl C_ABI(x25519_x86_64_work_cswap) +HIDDEN 
C_ABI(x25519_x86_64_work_cswap) +C_ABI(x25519_x86_64_work_cswap): /* Conditionally exchanges the ten 64-bit words at byte offsets 0..72 from %rdi with the ten words at offsets 80..152. The exchange happens when %rsi == 1; for any other %rsi every word is stored back unchanged. Selection is done with cmove only, there are no branches. */ +mov %rsp,%r11 +and $31,%r11 +add $0,%r11 +sub %r11,%rsp /* %rsp is rounded down to a multiple of 32; %r11 keeps the adjustment so it can be added back before ret. No stack slot is read or written in between. */ +cmp $1,%rsi /* ZF = (%rsi == 1). The mov/cmove instructions that follow do not modify flags, so this single compare drives every cmove up to the final add. %rsi is free to be reused as a data register from here on. */ +movq 0(%rdi),%rsi /* pass 1: word pairs at offsets 0/80 and 8/88 */ +movq 80(%rdi),%rdx +movq 8(%rdi),%rcx +movq 88(%rdi),%r8 +mov %rsi,%r9 /* %r9 is the temporary for the swap */ +cmove %rdx,%rsi +cmove %r9,%rdx +mov %rcx,%r9 +cmove %r8,%rcx +cmove %r9,%r8 +movq %rsi,0(%rdi) +movq %rdx,80(%rdi) +movq %rcx,8(%rdi) +movq %r8,88(%rdi) +movq 16(%rdi),%rsi /* pass 2: word pairs at offsets 16/96 and 24/104 */ +movq 96(%rdi),%rdx +movq 24(%rdi),%rcx +movq 104(%rdi),%r8 +mov %rsi,%r9 +cmove %rdx,%rsi +cmove %r9,%rdx +mov %rcx,%r9 +cmove %r8,%rcx +cmove %r9,%r8 +movq %rsi,16(%rdi) +movq %rdx,96(%rdi) +movq %rcx,24(%rdi) +movq %r8,104(%rdi) +movq 32(%rdi),%rsi /* pass 3: word pairs at offsets 32/112 and 40/120 */ +movq 112(%rdi),%rdx +movq 40(%rdi),%rcx +movq 120(%rdi),%r8 +mov %rsi,%r9 +cmove %rdx,%rsi +cmove %r9,%rdx +mov %rcx,%r9 +cmove %r8,%rcx +cmove %r9,%r8 +movq %rsi,32(%rdi) +movq %rdx,112(%rdi) +movq %rcx,40(%rdi) +movq %r8,120(%rdi) +movq 48(%rdi),%rsi /* pass 4: word pairs at offsets 48/128 and 56/136 */ +movq 128(%rdi),%rdx +movq 56(%rdi),%rcx +movq 136(%rdi),%r8 +mov %rsi,%r9 +cmove %rdx,%rsi +cmove %r9,%rdx +mov %rcx,%r9 +cmove %r8,%rcx +cmove %r9,%r8 +movq %rsi,48(%rdi) +movq %rdx,128(%rdi) +movq %rcx,56(%rdi) +movq %r8,136(%rdi) +movq 64(%rdi),%rsi /* pass 5: word pairs at offsets 64/144 and 72/152 */ +movq 144(%rdi),%rdx +movq 72(%rdi),%rcx +movq 152(%rdi),%r8 +mov %rsi,%r9 +cmove %rdx,%rsi +cmove %r9,%rdx +mov %rcx,%r9 +cmove %r8,%rcx +cmove %r9,%r8 +movq %rsi,64(%rdi) +movq %rdx,144(%rdi) +movq %rcx,72(%rdi) +movq %r8,152(%rdi) +add %r11,%rsp /* undo the alignment adjustment made on entry */ +mov %rdi,%rax /* %rax = the pointer that was passed in %rdi */ +mov %rsi,%rdx /* %rdx = the value last stored to 64(%rdi). NOTE(review): the other routines in this file end with the same two moves, so this looks like generated epilogue boilerplate; the C prototype is not visible here, confirm no caller reads %rdx. */ +ret /* NOTE(review): the visible part of this file ends here without a .note.GNU-stack section; on ELF targets confirm the linked object is not flagged as needing an executable stack. */
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c index d9c58cb..5d0db2e 100644 --- a/crypto/curve25519/curve25519.c +++ b/crypto/curve25519/curve25519.c
@@ -28,6 +28,8 @@ #include <openssl/rand.h> #include <openssl/sha.h> +#include "internal.h" + /* fe means field element. Here the field is \Z/(2^255-19). An element t, * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77 @@ -225,21 +227,6 @@ h[0] = 1; } -/* Replace (f,g) with (g,f) if b == 1; - * replace (f,g) with (f,g) if b == 0. - * - * Preconditions: b in {0,1}. */ -static void fe_cswap(fe f, fe g, unsigned int b) { - b = 0-b; - unsigned i; - for (i = 0; i < 10; i++) { - int32_t x = f[i] ^ g[i]; - x &= b; - f[i] ^= x; - g[i] ^= x; - } -} - /* h = f + g * Can overlap h with f or g. * @@ -720,70 +707,6 @@ fe_mul(out, t1, t0); } -/* h = f * 121666 - * Can overlap h with f. - * - * Preconditions: - * |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. - * - * Postconditions: - * |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. */ -static void fe_mul121666(fe h, fe f) { - int32_t f0 = f[0]; - int32_t f1 = f[1]; - int32_t f2 = f[2]; - int32_t f3 = f[3]; - int32_t f4 = f[4]; - int32_t f5 = f[5]; - int32_t f6 = f[6]; - int32_t f7 = f[7]; - int32_t f8 = f[8]; - int32_t f9 = f[9]; - int64_t h0 = f0 * (int64_t) 121666; - int64_t h1 = f1 * (int64_t) 121666; - int64_t h2 = f2 * (int64_t) 121666; - int64_t h3 = f3 * (int64_t) 121666; - int64_t h4 = f4 * (int64_t) 121666; - int64_t h5 = f5 * (int64_t) 121666; - int64_t h6 = f6 * (int64_t) 121666; - int64_t h7 = f7 * (int64_t) 121666; - int64_t h8 = f8 * (int64_t) 121666; - int64_t h9 = f9 * (int64_t) 121666; - int64_t carry0; - int64_t carry1; - int64_t carry2; - int64_t carry3; - int64_t carry4; - int64_t carry5; - int64_t carry6; - int64_t carry7; - int64_t carry8; - int64_t carry9; - - carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; - carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; - carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; - carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 
25; - carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; - - carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; - carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26; - carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; - carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; - carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; - - h[0] = h0; - h[1] = h1; - h[2] = h2; - h[3] = h3; - h[4] = h4; - h[5] = h5; - h[6] = h6; - h[7] = h7; - h[8] = h8; - h[9] = h9; -} - /* h = -f * * Preconditions: @@ -4761,6 +4684,95 @@ return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0; } + +#if defined(BORINGSSL_X25519_X86_64) + +static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + x25519_x86_64(out, scalar, point); +} + +#else + +/* Replace (f,g) with (g,f) if b == 1; + * replace (f,g) with (f,g) if b == 0. + * + * Preconditions: b in {0,1}. */ +static void fe_cswap(fe f, fe g, unsigned int b) { + b = 0-b; + unsigned i; + for (i = 0; i < 10; i++) { + int32_t x = f[i] ^ g[i]; + x &= b; + f[i] ^= x; + g[i] ^= x; + } +} + +/* h = f * 121666 + * Can overlap h with f. + * + * Preconditions: + * |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc. + * + * Postconditions: + * |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. 
*/ +static void fe_mul121666(fe h, fe f) { + int32_t f0 = f[0]; + int32_t f1 = f[1]; + int32_t f2 = f[2]; + int32_t f3 = f[3]; + int32_t f4 = f[4]; + int32_t f5 = f[5]; + int32_t f6 = f[6]; + int32_t f7 = f[7]; + int32_t f8 = f[8]; + int32_t f9 = f[9]; + int64_t h0 = f0 * (int64_t) 121666; + int64_t h1 = f1 * (int64_t) 121666; + int64_t h2 = f2 * (int64_t) 121666; + int64_t h3 = f3 * (int64_t) 121666; + int64_t h4 = f4 * (int64_t) 121666; + int64_t h5 = f5 * (int64_t) 121666; + int64_t h6 = f6 * (int64_t) 121666; + int64_t h7 = f7 * (int64_t) 121666; + int64_t h8 = f8 * (int64_t) 121666; + int64_t h9 = f9 * (int64_t) 121666; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + + carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25; + carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25; + carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25; + carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25; + carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25; + + carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26; + carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26; + carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26; + carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26; + carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26; + + h[0] = h0; + h[1] = h1; + h[2] = h2; + h[3] = h3; + h[4] = h4; + h[5] = h5; + h[6] = h6; + h[7] = h7; + h[8] = h8; + h[9] = h9; +} + static void x25519_scalar_mult_generic(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32]) { @@ -4812,12 +4824,6 @@ fe_tobytes(out, x2); } -#if defined(OPENSSL_ARM) -/* x25519_NEON is defined in asm/x25519-arm.S. 
*/ -void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], - const uint8_t point[32]); -#endif - static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], const uint8_t point[32]) { #if defined(OPENSSL_ARM) @@ -4830,6 +4836,9 @@ x25519_scalar_mult_generic(out, scalar, point); } +#endif /* BORINGSSL_X25519_X86_64 */ + + void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) { RAND_bytes(out_private_key, 32); X25519_public_from_private(out_public_value, out_private_key); @@ -4843,6 +4852,20 @@ return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0; } +#if defined(BORINGSSL_X25519_X86_64) + +/* When |BORINGSSL_X25519_X86_64| is set, base point multiplication is done with + * the Montgomery ladder because it's faster. Otherwise it's done using the + * Ed25519 tables. */ + +void X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]) { + static const uint8_t kMongomeryBasePoint[32] = {9}; /* {9} sets byte 0 to 9 and zero-initializes the other 31 bytes. NOTE(review): "Mongomery" looks like a misspelling of "Montgomery"; consider renaming. */ + x25519_scalar_mult(out_public_value, private_key, kMongomeryBasePoint); +} + +#else + void X25519_public_from_private(uint8_t out_public_value[32], const uint8_t private_key[32]) { #if defined(OPENSSL_ARM) @@ -4871,3 +4894,5 @@ fe_mul(zplusy, zplusy, zminusy_inv); fe_tobytes(out_public_value, zplusy); } + +#endif /* BORINGSSL_X25519_X86_64 */
diff --git a/crypto/curve25519/internal.h b/crypto/curve25519/internal.h new file mode 100644 index 0000000..6468f91 --- /dev/null +++ b/crypto/curve25519/internal.h
@@ -0,0 +1,43 @@ +/* Copyright (c) 2015, Google Inc. + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION + * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN + * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ + +#ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H +#define OPENSSL_HEADER_CURVE25519_INTERNAL_H + +#if defined(__cplusplus) +extern "C" { +#endif + + /* BORINGSSL_X25519_X86_64 is defined only for x86-64 builds that are neither OPENSSL_SMALL nor Windows; curve25519.c and x25519-x86_64.c test it to select the assembly-backed code path. NOTE(review): this header uses uint8_t but includes nothing itself, so it relies on the including file having already pulled in the fixed-width integer types. */ +#if defined(OPENSSL_X86_64) && !defined(OPENSSL_SMALL) && \ + !defined(OPENSSL_WINDOWS) +#define BORINGSSL_X25519_X86_64 + /* x25519_x86_64 is implemented in x25519-x86_64.c. It writes 32 bytes to out, computed from the 32-byte scalar and the 32-byte point. */ +void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +#endif + + +#if defined(OPENSSL_ARM) +/* x25519_NEON is defined in asm/x25519-arm.S. */ +void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +#endif + + +#if defined(__cplusplus) +} /* extern C */ +#endif + +#endif /* OPENSSL_HEADER_CURVE25519_INTERNAL_H */
diff --git a/crypto/curve25519/x25519-x86_64.c b/crypto/curve25519/x25519-x86_64.c new file mode 100644 index 0000000..9776c75 --- /dev/null +++ b/crypto/curve25519/x25519-x86_64.c
@@ -0,0 +1,225 @@ +#include <openssl/curve25519.h> + +#include <string.h> + +#include "internal.h" + + +#if defined(BORINGSSL_X25519_X86_64) + /* fe25519 stores a field element as five 64-bit limbs; fe25519_unpack and fe25519_pack below place limb i at bit offset 51*i. */ +typedef struct { uint64_t v[5]; } fe25519; + +/* These functions are defined in asm/x25519-x86_64.S */ +void x25519_x86_64_work_cswap(fe25519 *, uint64_t); +void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b); +void x25519_x86_64_square(fe25519 *out, const fe25519 *a); +void x25519_x86_64_freeze(fe25519 *); +void x25519_x86_64_ladderstep(fe25519 *work); + /* Sets r to the small integer v: limb 0 receives v and limbs 1..4 are cleared. */ +static void fe25519_setint(fe25519 *r, unsigned v) { + r->v[0] = v; + r->v[1] = 0; + r->v[2] = 0; + r->v[3] = 0; + r->v[4] = 0; +} + +/* Writes x to r as 32 little-endian bytes. A copy of x is first passed through x25519_x86_64_freeze, then limb i is emitted starting at bit 51*i, so bytes 6, 12, 19 and 25 each combine bits of two neighbouring limbs. The ^= merges behave as ORs provided every frozen limb is below 2^51 (presumably guaranteed by freeze; confirm against the assembly). Assumes input x being reduced below 2^255 */ +static void fe25519_pack(unsigned char r[32], const fe25519 *x) { + fe25519 t; + t = *x; + x25519_x86_64_freeze(&t); + + r[0] = (uint8_t)(t.v[0] & 0xff); + r[1] = (uint8_t)((t.v[0] >> 8) & 0xff); + r[2] = (uint8_t)((t.v[0] >> 16) & 0xff); + r[3] = (uint8_t)((t.v[0] >> 24) & 0xff); + r[4] = (uint8_t)((t.v[0] >> 32) & 0xff); + r[5] = (uint8_t)((t.v[0] >> 40) & 0xff); + r[6] = (uint8_t)((t.v[0] >> 48)); + + r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8); + r[7] = (uint8_t)((t.v[1] >> 5) & 0xff); + r[8] = (uint8_t)((t.v[1] >> 13) & 0xff); + r[9] = (uint8_t)((t.v[1] >> 21) & 0xff); + r[10] = (uint8_t)((t.v[1] >> 29) & 0xff); + r[11] = (uint8_t)((t.v[1] >> 37) & 0xff); + r[12] = (uint8_t)((t.v[1] >> 45)); + + r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0); + r[13] = (uint8_t)((t.v[2] >> 2) & 0xff); + r[14] = (uint8_t)((t.v[2] >> 10) & 0xff); + r[15] = (uint8_t)((t.v[2] >> 18) & 0xff); + r[16] = (uint8_t)((t.v[2] >> 26) & 0xff); + r[17] = (uint8_t)((t.v[2] >> 34) & 0xff); + r[18] = (uint8_t)((t.v[2] >> 42) & 0xff); + r[19] = (uint8_t)((t.v[2] >> 50)); + + r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe); + r[20] = (uint8_t)((t.v[3] >> 7) & 0xff); + r[21] = (uint8_t)((t.v[3] >> 15) & 0xff); + r[22] = (uint8_t)((t.v[3] >> 23) & 0xff); + r[23] = (uint8_t)((t.v[3] >> 31) & 0xff); + r[24] =
(uint8_t)((t.v[3] >> 39) & 0xff); + r[25] = (uint8_t)((t.v[3] >> 47)); + + r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0); + r[26] = (uint8_t)((t.v[4] >> 4) & 0xff); + r[27] = (uint8_t)((t.v[4] >> 12) & 0xff); + r[28] = (uint8_t)((t.v[4] >> 20) & 0xff); + r[29] = (uint8_t)((t.v[4] >> 28) & 0xff); + r[30] = (uint8_t)((t.v[4] >> 36) & 0xff); + r[31] = (uint8_t)((t.v[4] >> 44)); +} + /* Reads the 32 little-endian bytes of x into five limbs of 51 bits each. Bit 7 of x[31] is discarded by the & 127 mask, so only the low 255 bits of the input are used. */ +static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) { + r->v[0] = x[0]; + r->v[0] += (uint64_t)x[1] << 8; + r->v[0] += (uint64_t)x[2] << 16; + r->v[0] += (uint64_t)x[3] << 24; + r->v[0] += (uint64_t)x[4] << 32; + r->v[0] += (uint64_t)x[5] << 40; + r->v[0] += ((uint64_t)x[6] & 7) << 48; + + r->v[1] = x[6] >> 3; + r->v[1] += (uint64_t)x[7] << 5; + r->v[1] += (uint64_t)x[8] << 13; + r->v[1] += (uint64_t)x[9] << 21; + r->v[1] += (uint64_t)x[10] << 29; + r->v[1] += (uint64_t)x[11] << 37; + r->v[1] += ((uint64_t)x[12] & 63) << 45; + + r->v[2] = x[12] >> 6; + r->v[2] += (uint64_t)x[13] << 2; + r->v[2] += (uint64_t)x[14] << 10; + r->v[2] += (uint64_t)x[15] << 18; + r->v[2] += (uint64_t)x[16] << 26; + r->v[2] += (uint64_t)x[17] << 34; + r->v[2] += (uint64_t)x[18] << 42; + r->v[2] += ((uint64_t)x[19] & 1) << 50; + + r->v[3] = x[19] >> 1; + r->v[3] += (uint64_t)x[20] << 7; + r->v[3] += (uint64_t)x[21] << 15; + r->v[3] += (uint64_t)x[22] << 23; + r->v[3] += (uint64_t)x[23] << 31; + r->v[3] += (uint64_t)x[24] << 39; + r->v[3] += ((uint64_t)x[25] & 15) << 47; + + r->v[4] = x[25] >> 4; + r->v[4] += (uint64_t)x[26] << 4; + r->v[4] += (uint64_t)x[27] << 12; + r->v[4] += (uint64_t)x[28] << 20; + r->v[4] += (uint64_t)x[29] << 28; + r->v[4] += (uint64_t)x[30] << 36; + r->v[4] += ((uint64_t)x[31] & 127) << 44; +} + /* Computes r = x^(2^255 - 21) with a fixed chain of squarings and multiplications. With the field modulus p = 2^255 - 19 this exponent is p - 2, which yields the inverse of a nonzero x. r may alias x: x is last read while computing z9, before r is written. The comment in front of each step gives the exponent of x held by the value that step produces. */ +static void fe25519_invert(fe25519 *r, const fe25519 *x) { + fe25519 z2; + fe25519 z9; + fe25519 z11; + fe25519 z2_5_0; + fe25519 z2_10_0; + fe25519 z2_20_0; + fe25519 z2_50_0; + fe25519 z2_100_0; + fe25519 t; + int i; + + /* 2 */ x25519_x86_64_square(&z2, x); + /* 4 */ x25519_x86_64_square(&t,
&z2); + /* 8 */ x25519_x86_64_square(&t, &t); + /* 9 */ x25519_x86_64_mul(&z9, &t, x); + /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2); + /* 22 */ x25519_x86_64_square(&t, &z11); + /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9); + + /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0); + /* 2^10 - 2^5 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); } + /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0); + + /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0); + /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); } + /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0); + + /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0); + /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); } + /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0); + + /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); } + /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0); + + /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0); + /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); } + /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0); + + /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0); + /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) { + x25519_x86_64_square(&t, &t); + } + /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0); + + /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); } + /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0); + + /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t); + /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t); + /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t); + + /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t); + + /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t); + /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11); +} + /* Ladder driver. work[0] holds the input *xr, work[1..2] start as (1, 0) and work[3..4] as (*xr, 1). The 255 scalar bits are consumed from bit 6 of s[31] down to bit 0 of s[0]; bit 7 of s[31] is never read because j starts at 6. Each iteration calls x25519_x86_64_work_cswap(work + 1, bit ^ prevbit) and then x25519_x86_64_ladderstep(work). On return *xr = work[1] and *zr = work[2]. No swap is issued after the last step, so the accumulated swaps cancel only when the final bit processed is 0; the only caller clears the low three bits of s[0]. */ +static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) { + fe25519
work[5]; + + work[0] = *xr; + fe25519_setint(work + 1, 1); + fe25519_setint(work + 2, 0); + work[3] = *xr; + fe25519_setint(work + 4, 1); + + int i, j; + uint8_t prevbit = 0; + + j = 6; + for (i = 31; i >= 0; i--) { + while (j >= 0) { + const uint8_t bit = 1 & (s[i] >> j); + const uint64_t swap = bit ^ prevbit; + prevbit = bit; + x25519_x86_64_work_cswap(work + 1, swap); + x25519_x86_64_ladderstep(work); + j -= 1; + } + j = 7; + } + + *xr = work[1]; + *zr = work[2]; +} + /* Writes to out the 32-byte result of multiplying point by scalar. The scalar is copied into e and clamped (low three bits of e[0] cleared, bit 7 of e[31] cleared, bit 6 of e[31] set). The point is unpacked into t, mladder produces the pair (t, z), and t * z^-1 is packed into out. NOTE(review): e, a copy of the secret scalar, is left on the stack without being wiped; confirm whether that is acceptable. */ +void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { + uint8_t e[32]; + memcpy(e, scalar, sizeof(e)); + + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fe25519 t; + fe25519 z; + fe25519_unpack(&t, point); + mladder(&t, &z, e); + fe25519_invert(&z, &z); + x25519_x86_64_mul(&t, &t, &z); + fe25519_pack(out, &t); +} + +#endif /* BORINGSSL_X25519_X86_64 */