Add x86-64 assembly for X25519.

This assembly is written in gas syntax, so it is not built on Windows, nor when
OPENSSL_SMALL is defined.

Change-Id: I1050cf1b16350fd4b758e4c463261b30a1b65390
Reviewed-on: https://boringssl-review.googlesource.com/6782
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/curve25519/CMakeLists.txt b/crypto/curve25519/CMakeLists.txt
index ac397d0..1614ba1 100644
--- a/crypto/curve25519/CMakeLists.txt
+++ b/crypto/curve25519/CMakeLists.txt
@@ -8,12 +8,21 @@
   )
 endif()
 
+if (${ARCH} STREQUAL "x86_64")
+  set(
+    CURVE25519_ARCH_SOURCES
+
+    asm/x25519-x86_64.S
+  )
+endif()
+
 add_library(
   curve25519
 
   OBJECT
 
   curve25519.c
+  x25519-x86_64.c
 
   ${CURVE25519_ARCH_SOURCES}
 )
diff --git a/crypto/curve25519/asm/x25519-x86_64.S b/crypto/curve25519/asm/x25519-x86_64.S
new file mode 100644
index 0000000..7e86a23
--- /dev/null
+++ b/crypto/curve25519/asm/x25519-x86_64.S
@@ -0,0 +1,1931 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+/* This file is adapted from crypto_scalarmult/curve25519/amd64-51/ in
+ * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is in the
+ * public domain, but the standard ISC license is included above to keep
+ * licensing simple. */
+
+.data
+.p2align 4
+
+#if defined(__APPLE__)
+/* OS X's C ABI prefixes function names with an underscore. */
+#define C_ABI(x) _ ## x
+#define HIDDEN .private_extern
+#else
+#define C_ABI(x) x
+#define HIDDEN .hidden
+#endif
+
+x25519_x86_64_REDMASK51:   .quad 0x0007FFFFFFFFFFFF
+x25519_x86_64_121666_213:  .quad 996687872
+x25519_x86_64_2P0:         .quad 0xFFFFFFFFFFFDA
+x25519_x86_64_2P1234:      .quad 0xFFFFFFFFFFFFE
+x25519_x86_64_4P0:         .quad 0x1FFFFFFFFFFFB4
+x25519_x86_64_4P1234:      .quad 0x1FFFFFFFFFFFFC
+x25519_x86_64_MU0:         .quad 0xED9CE5A30A2C131B
+x25519_x86_64_MU1:         .quad 0x2106215D086329A7
+x25519_x86_64_MU2:         .quad 0xFFFFFFFFFFFFFFEB
+x25519_x86_64_MU3:         .quad 0xFFFFFFFFFFFFFFFF
+x25519_x86_64_MU4:         .quad 0x000000000000000F
+x25519_x86_64_ORDER0:      .quad 0x5812631A5CF5D3ED
+x25519_x86_64_ORDER1:      .quad 0x14DEF9DEA2F79CD6
+x25519_x86_64_ORDER2:      .quad 0x0000000000000000
+x25519_x86_64_ORDER3:      .quad 0x1000000000000000
+x25519_x86_64_EC2D0:       .quad 1859910466990425
+x25519_x86_64_EC2D1:       .quad 932731440258426
+x25519_x86_64_EC2D2:       .quad 1072319116312658
+x25519_x86_64_EC2D3:       .quad 1815898335770999
+x25519_x86_64_EC2D4:       .quad 633789495995903
+x25519_x86_64__38:         .quad 38
+
+.text
+.p2align 5
+
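+/* x25519_x86_64_freeze(f): reduce the field element at |f|, held as five
+ * radix-2^51 limbs, to its canonical representative mod 2^255-19, in place. */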
+.globl C_ABI(x25519_x86_64_freeze)
+HIDDEN C_ABI(x25519_x86_64_freeze)
+C_ABI(x25519_x86_64_freeze):
+mov %rsp,%r11
+and $31,%r11
+add $64,%r11
+sub %r11,%rsp
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq   0(%rdi),%rsi
+movq   8(%rdi),%rdx
+movq   16(%rdi),%rcx
+movq   24(%rdi),%r8
+movq   32(%rdi),%r9
+movq x25519_x86_64_REDMASK51(%rip),%rax
+mov  %rax,%r10
+sub  $18,%r10
+mov  $3,%r11
+._reduceloop:
+mov  %rsi,%r12
+shr  $51,%r12
+and  %rax,%rsi
+add  %r12,%rdx
+mov  %rdx,%r12
+shr  $51,%r12
+and  %rax,%rdx
+add  %r12,%rcx
+mov  %rcx,%r12
+shr  $51,%r12
+and  %rax,%rcx
+add  %r12,%r8
+mov  %r8,%r12
+shr  $51,%r12
+and  %rax,%r8
+add  %r12,%r9
+mov  %r9,%r12
+shr  $51,%r12
+and  %rax,%r9
+imulq  $19,%r12,%r12
+add  %r12,%rsi
+sub  $1,%r11
+ja ._reduceloop
+mov  $1,%r12
+cmp  %r10,%rsi
+cmovl %r11,%r12
+cmp  %rax,%rdx
+cmovne %r11,%r12
+cmp  %rax,%rcx
+cmovne %r11,%r12
+cmp  %rax,%r8
+cmovne %r11,%r12
+cmp  %rax,%r9
+cmovne %r11,%r12
+neg  %r12
+and  %r12,%rax
+and  %r12,%r10
+sub  %r10,%rsi
+sub  %rax,%rdx
+sub  %rax,%rcx
+sub  %rax,%r8
+sub  %rax,%r9
+movq   %rsi,0(%rdi)
+movq   %rdx,8(%rdi)
+movq   %rcx,16(%rdi)
+movq   %r8,24(%rdi)
+movq   %r9,32(%rdi)
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.p2align 5
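+/* x25519_x86_64_mul(out, a, b): out = a*b mod 2^255-19, where all three
+ * arguments are field elements of five radix-2^51 limbs. */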
+.globl C_ABI(x25519_x86_64_mul)
+HIDDEN C_ABI(x25519_x86_64_mul)
+C_ABI(x25519_x86_64_mul):
+mov %rsp,%r11
+and $31,%r11
+add $96,%r11
+sub %r11,%rsp
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq %rdi,56(%rsp)
+mov  %rdx,%rcx
+movq   24(%rsi),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,64(%rsp)
+mulq  16(%rcx)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq   32(%rsi),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,72(%rsp)
+mulq  8(%rcx)
+add  %rax,%r8
+adc %rdx,%r9
+movq   0(%rsi),%rax
+mulq  0(%rcx)
+add  %rax,%r8
+adc %rdx,%r9
+movq   0(%rsi),%rax
+mulq  8(%rcx)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq   0(%rsi),%rax
+mulq  16(%rcx)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq   0(%rsi),%rax
+mulq  24(%rcx)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq   0(%rsi),%rax
+mulq  32(%rcx)
+mov  %rax,%rbx
+mov  %rdx,%rbp
+movq   8(%rsi),%rax
+mulq  0(%rcx)
+add  %rax,%r10
+adc %rdx,%r11
+movq   8(%rsi),%rax
+mulq  8(%rcx)
+add  %rax,%r12
+adc %rdx,%r13
+movq   8(%rsi),%rax
+mulq  16(%rcx)
+add  %rax,%r14
+adc %rdx,%r15
+movq   8(%rsi),%rax
+mulq  24(%rcx)
+add  %rax,%rbx
+adc %rdx,%rbp
+movq   8(%rsi),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rcx)
+add  %rax,%r8
+adc %rdx,%r9
+movq   16(%rsi),%rax
+mulq  0(%rcx)
+add  %rax,%r12
+adc %rdx,%r13
+movq   16(%rsi),%rax
+mulq  8(%rcx)
+add  %rax,%r14
+adc %rdx,%r15
+movq   16(%rsi),%rax
+mulq  16(%rcx)
+add  %rax,%rbx
+adc %rdx,%rbp
+movq   16(%rsi),%rdx
+imulq  $19,%rdx,%rax
+mulq  24(%rcx)
+add  %rax,%r8
+adc %rdx,%r9
+movq   16(%rsi),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rcx)
+add  %rax,%r10
+adc %rdx,%r11
+movq   24(%rsi),%rax
+mulq  0(%rcx)
+add  %rax,%r14
+adc %rdx,%r15
+movq   24(%rsi),%rax
+mulq  8(%rcx)
+add  %rax,%rbx
+adc %rdx,%rbp
+movq 64(%rsp),%rax
+mulq  24(%rcx)
+add  %rax,%r10
+adc %rdx,%r11
+movq 64(%rsp),%rax
+mulq  32(%rcx)
+add  %rax,%r12
+adc %rdx,%r13
+movq   32(%rsi),%rax
+mulq  0(%rcx)
+add  %rax,%rbx
+adc %rdx,%rbp
+movq 72(%rsp),%rax
+mulq  16(%rcx)
+add  %rax,%r10
+adc %rdx,%r11
+movq 72(%rsp),%rax
+mulq  24(%rcx)
+add  %rax,%r12
+adc %rdx,%r13
+movq 72(%rsp),%rax
+mulq  32(%rcx)
+add  %rax,%r14
+adc %rdx,%r15
+movq x25519_x86_64_REDMASK51(%rip),%rsi
+shld $13,%r8,%r9
+and  %rsi,%r8
+shld $13,%r10,%r11
+and  %rsi,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rsi,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rsi,%r14
+add  %r13,%r14
+shld $13,%rbx,%rbp
+and  %rsi,%rbx
+add  %r15,%rbx
+imulq  $19,%rbp,%rdx
+add  %rdx,%r8
+mov  %r8,%rdx
+shr  $51,%rdx
+add  %r10,%rdx
+mov  %rdx,%rcx
+shr  $51,%rdx
+and  %rsi,%r8
+add  %r12,%rdx
+mov  %rdx,%r9
+shr  $51,%rdx
+and  %rsi,%rcx
+add  %r14,%rdx
+mov  %rdx,%rax
+shr  $51,%rdx
+and  %rsi,%r9
+add  %rbx,%rdx
+mov  %rdx,%r10
+shr  $51,%rdx
+and  %rsi,%rax
+imulq  $19,%rdx,%rdx
+add  %rdx,%r8
+and  %rsi,%r10
+movq   %r8,0(%rdi)
+movq   %rcx,8(%rdi)
+movq   %r9,16(%rdi)
+movq   %rax,24(%rdi)
+movq   %r10,32(%rdi)
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.p2align 5
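+/* x25519_x86_64_square(out, a): out = a^2 mod 2^255-19 on field elements of
+ * five radix-2^51 limbs. */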
+.globl C_ABI(x25519_x86_64_square)
+HIDDEN C_ABI(x25519_x86_64_square)
+C_ABI(x25519_x86_64_square):
+mov %rsp,%r11
+and $31,%r11
+add $64,%r11
+sub %r11,%rsp
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq   0(%rsi),%rax
+mulq  0(%rsi)
+mov  %rax,%rcx
+mov  %rdx,%r8
+movq   0(%rsi),%rax
+shl  $1,%rax
+mulq  8(%rsi)
+mov  %rax,%r9
+mov  %rdx,%r10
+movq   0(%rsi),%rax
+shl  $1,%rax
+mulq  16(%rsi)
+mov  %rax,%r11
+mov  %rdx,%r12
+movq   0(%rsi),%rax
+shl  $1,%rax
+mulq  24(%rsi)
+mov  %rax,%r13
+mov  %rdx,%r14
+movq   0(%rsi),%rax
+shl  $1,%rax
+mulq  32(%rsi)
+mov  %rax,%r15
+mov  %rdx,%rbx
+movq   8(%rsi),%rax
+mulq  8(%rsi)
+add  %rax,%r11
+adc %rdx,%r12
+movq   8(%rsi),%rax
+shl  $1,%rax
+mulq  16(%rsi)
+add  %rax,%r13
+adc %rdx,%r14
+movq   8(%rsi),%rax
+shl  $1,%rax
+mulq  24(%rsi)
+add  %rax,%r15
+adc %rdx,%rbx
+movq   8(%rsi),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsi)
+add  %rax,%rcx
+adc %rdx,%r8
+movq   16(%rsi),%rax
+mulq  16(%rsi)
+add  %rax,%r15
+adc %rdx,%rbx
+movq   16(%rsi),%rdx
+imulq  $38,%rdx,%rax
+mulq  24(%rsi)
+add  %rax,%rcx
+adc %rdx,%r8
+movq   16(%rsi),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsi)
+add  %rax,%r9
+adc %rdx,%r10
+movq   24(%rsi),%rdx
+imulq  $19,%rdx,%rax
+mulq  24(%rsi)
+add  %rax,%r9
+adc %rdx,%r10
+movq   24(%rsi),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsi)
+add  %rax,%r11
+adc %rdx,%r12
+movq   32(%rsi),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rsi)
+add  %rax,%r13
+adc %rdx,%r14
+movq x25519_x86_64_REDMASK51(%rip),%rsi
+shld $13,%rcx,%r8
+and  %rsi,%rcx
+shld $13,%r9,%r10
+and  %rsi,%r9
+add  %r8,%r9
+shld $13,%r11,%r12
+and  %rsi,%r11
+add  %r10,%r11
+shld $13,%r13,%r14
+and  %rsi,%r13
+add  %r12,%r13
+shld $13,%r15,%rbx
+and  %rsi,%r15
+add  %r14,%r15
+imulq  $19,%rbx,%rdx
+add  %rdx,%rcx
+mov  %rcx,%rdx
+shr  $51,%rdx
+add  %r9,%rdx
+and  %rsi,%rcx
+mov  %rdx,%r8
+shr  $51,%rdx
+add  %r11,%rdx
+and  %rsi,%r8
+mov  %rdx,%r9
+shr  $51,%rdx
+add  %r13,%rdx
+and  %rsi,%r9
+mov  %rdx,%rax
+shr  $51,%rdx
+add  %r15,%rdx
+and  %rsi,%rax
+mov  %rdx,%r10
+shr  $51,%rdx
+imulq  $19,%rdx,%rdx
+add  %rdx,%rcx
+and  %rsi,%r10
+movq   %rcx,0(%rdi)
+movq   %r8,8(%rdi)
+movq   %r9,16(%rdi)
+movq   %rax,24(%rdi)
+movq   %r10,32(%rdi)
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.p2align 5
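+/* x25519_x86_64_ladderstep(work): one combined double-and-add step of the
+ * Montgomery ladder. |work| holds five field elements {x1, X2, Z2, X3, Z3};
+ * (X2:Z2) and (X3:Z3) are updated in place. */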
+.globl C_ABI(x25519_x86_64_ladderstep)
+HIDDEN C_ABI(x25519_x86_64_ladderstep)
+C_ABI(x25519_x86_64_ladderstep):
+mov %rsp,%r11
+and $31,%r11
+add $352,%r11
+sub %r11,%rsp
+movq %r11,0(%rsp)
+movq %r12,8(%rsp)
+movq %r13,16(%rsp)
+movq %r14,24(%rsp)
+movq %r15,32(%rsp)
+movq %rbx,40(%rsp)
+movq %rbp,48(%rsp)
+movq   40(%rdi),%rsi
+movq   48(%rdi),%rdx
+movq   56(%rdi),%rcx
+movq   64(%rdi),%r8
+movq   72(%rdi),%r9
+mov  %rsi,%rax
+mov  %rdx,%r10
+mov  %rcx,%r11
+mov  %r8,%r12
+mov  %r9,%r13
+add  x25519_x86_64_2P0(%rip),%rax
+add  x25519_x86_64_2P1234(%rip),%r10
+add  x25519_x86_64_2P1234(%rip),%r11
+add  x25519_x86_64_2P1234(%rip),%r12
+add  x25519_x86_64_2P1234(%rip),%r13
+addq 80(%rdi),%rsi
+addq 88(%rdi),%rdx
+addq 96(%rdi),%rcx
+addq 104(%rdi),%r8
+addq 112(%rdi),%r9
+subq 80(%rdi),%rax
+subq 88(%rdi),%r10
+subq 96(%rdi),%r11
+subq 104(%rdi),%r12
+subq 112(%rdi),%r13
+movq %rsi,56(%rsp)
+movq %rdx,64(%rsp)
+movq %rcx,72(%rsp)
+movq %r8,80(%rsp)
+movq %r9,88(%rsp)
+movq %rax,96(%rsp)
+movq %r10,104(%rsp)
+movq %r11,112(%rsp)
+movq %r12,120(%rsp)
+movq %r13,128(%rsp)
+movq 96(%rsp),%rax
+mulq  96(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 96(%rsp),%rax
+shl  $1,%rax
+mulq  104(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 96(%rsp),%rax
+shl  $1,%rax
+mulq  112(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 96(%rsp),%rax
+shl  $1,%rax
+mulq  120(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 96(%rsp),%rax
+shl  $1,%rax
+mulq  128(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 104(%rsp),%rax
+mulq  104(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 104(%rsp),%rax
+shl  $1,%rax
+mulq  112(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 104(%rsp),%rax
+shl  $1,%rax
+mulq  120(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 104(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 112(%rsp),%rax
+mulq  112(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 112(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  120(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 112(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 120(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  120(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 120(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 128(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+and  %rdx,%rsi
+mov  %rcx,%r8
+shr  $51,%rcx
+add  %r10,%rcx
+and  %rdx,%r8
+mov  %rcx,%r9
+shr  $51,%rcx
+add  %r12,%rcx
+and  %rdx,%r9
+mov  %rcx,%rax
+shr  $51,%rcx
+add  %r14,%rcx
+and  %rdx,%rax
+mov  %rcx,%r10
+shr  $51,%rcx
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq %rsi,136(%rsp)
+movq %r8,144(%rsp)
+movq %r9,152(%rsp)
+movq %rax,160(%rsp)
+movq %r10,168(%rsp)
+movq 56(%rsp),%rax
+mulq  56(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 56(%rsp),%rax
+shl  $1,%rax
+mulq  64(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 56(%rsp),%rax
+shl  $1,%rax
+mulq  72(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 56(%rsp),%rax
+shl  $1,%rax
+mulq  80(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 56(%rsp),%rax
+shl  $1,%rax
+mulq  88(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 64(%rsp),%rax
+mulq  64(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 64(%rsp),%rax
+shl  $1,%rax
+mulq  72(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 64(%rsp),%rax
+shl  $1,%rax
+mulq  80(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 64(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 72(%rsp),%rax
+mulq  72(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 72(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  80(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 72(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 80(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  80(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 80(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 88(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+and  %rdx,%rsi
+mov  %rcx,%r8
+shr  $51,%rcx
+add  %r10,%rcx
+and  %rdx,%r8
+mov  %rcx,%r9
+shr  $51,%rcx
+add  %r12,%rcx
+and  %rdx,%r9
+mov  %rcx,%rax
+shr  $51,%rcx
+add  %r14,%rcx
+and  %rdx,%rax
+mov  %rcx,%r10
+shr  $51,%rcx
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq %rsi,176(%rsp)
+movq %r8,184(%rsp)
+movq %r9,192(%rsp)
+movq %rax,200(%rsp)
+movq %r10,208(%rsp)
+mov  %rsi,%rsi
+mov  %r8,%rdx
+mov  %r9,%rcx
+mov  %rax,%r8
+mov  %r10,%r9
+add  x25519_x86_64_2P0(%rip),%rsi
+add  x25519_x86_64_2P1234(%rip),%rdx
+add  x25519_x86_64_2P1234(%rip),%rcx
+add  x25519_x86_64_2P1234(%rip),%r8
+add  x25519_x86_64_2P1234(%rip),%r9
+subq 136(%rsp),%rsi
+subq 144(%rsp),%rdx
+subq 152(%rsp),%rcx
+subq 160(%rsp),%r8
+subq 168(%rsp),%r9
+movq %rsi,216(%rsp)
+movq %rdx,224(%rsp)
+movq %rcx,232(%rsp)
+movq %r8,240(%rsp)
+movq %r9,248(%rsp)
+movq   120(%rdi),%rsi
+movq   128(%rdi),%rdx
+movq   136(%rdi),%rcx
+movq   144(%rdi),%r8
+movq   152(%rdi),%r9
+mov  %rsi,%rax
+mov  %rdx,%r10
+mov  %rcx,%r11
+mov  %r8,%r12
+mov  %r9,%r13
+add  x25519_x86_64_2P0(%rip),%rax
+add  x25519_x86_64_2P1234(%rip),%r10
+add  x25519_x86_64_2P1234(%rip),%r11
+add  x25519_x86_64_2P1234(%rip),%r12
+add  x25519_x86_64_2P1234(%rip),%r13
+addq 160(%rdi),%rsi
+addq 168(%rdi),%rdx
+addq 176(%rdi),%rcx
+addq 184(%rdi),%r8
+addq 192(%rdi),%r9
+subq 160(%rdi),%rax
+subq 168(%rdi),%r10
+subq 176(%rdi),%r11
+subq 184(%rdi),%r12
+subq 192(%rdi),%r13
+movq %rsi,256(%rsp)
+movq %rdx,264(%rsp)
+movq %rcx,272(%rsp)
+movq %r8,280(%rsp)
+movq %r9,288(%rsp)
+movq %rax,296(%rsp)
+movq %r10,304(%rsp)
+movq %r11,312(%rsp)
+movq %r12,320(%rsp)
+movq %r13,328(%rsp)
+movq 280(%rsp),%rsi
+imulq  $19,%rsi,%rax
+movq %rax,336(%rsp)
+mulq  112(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 288(%rsp),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,344(%rsp)
+mulq  104(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 256(%rsp),%rax
+mulq  96(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 256(%rsp),%rax
+mulq  104(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 256(%rsp),%rax
+mulq  112(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 256(%rsp),%rax
+mulq  120(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 256(%rsp),%rax
+mulq  128(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 264(%rsp),%rax
+mulq  96(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 264(%rsp),%rax
+mulq  104(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 264(%rsp),%rax
+mulq  112(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 264(%rsp),%rax
+mulq  120(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 264(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 272(%rsp),%rax
+mulq  96(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 272(%rsp),%rax
+mulq  104(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 272(%rsp),%rax
+mulq  112(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 272(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  120(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 272(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  128(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 280(%rsp),%rax
+mulq  96(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 280(%rsp),%rax
+mulq  104(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 336(%rsp),%rax
+mulq  120(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 336(%rsp),%rax
+mulq  128(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 288(%rsp),%rax
+mulq  96(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 344(%rsp),%rax
+mulq  112(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 344(%rsp),%rax
+mulq  120(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 344(%rsp),%rax
+mulq  128(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+mov  %rcx,%r8
+shr  $51,%rcx
+and  %rdx,%rsi
+add  %r10,%rcx
+mov  %rcx,%r9
+shr  $51,%rcx
+and  %rdx,%r8
+add  %r12,%rcx
+mov  %rcx,%rax
+shr  $51,%rcx
+and  %rdx,%r9
+add  %r14,%rcx
+mov  %rcx,%r10
+shr  $51,%rcx
+and  %rdx,%rax
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq %rsi,96(%rsp)
+movq %r8,104(%rsp)
+movq %r9,112(%rsp)
+movq %rax,120(%rsp)
+movq %r10,128(%rsp)
+movq 320(%rsp),%rsi
+imulq  $19,%rsi,%rax
+movq %rax,256(%rsp)
+mulq  72(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 328(%rsp),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,264(%rsp)
+mulq  64(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 296(%rsp),%rax
+mulq  56(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 296(%rsp),%rax
+mulq  64(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 296(%rsp),%rax
+mulq  72(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 296(%rsp),%rax
+mulq  80(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 296(%rsp),%rax
+mulq  88(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 304(%rsp),%rax
+mulq  56(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 304(%rsp),%rax
+mulq  64(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 304(%rsp),%rax
+mulq  72(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 304(%rsp),%rax
+mulq  80(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 304(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 312(%rsp),%rax
+mulq  56(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 312(%rsp),%rax
+mulq  64(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 312(%rsp),%rax
+mulq  72(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 312(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  80(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 312(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  88(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 320(%rsp),%rax
+mulq  56(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 320(%rsp),%rax
+mulq  64(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 256(%rsp),%rax
+mulq  80(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 256(%rsp),%rax
+mulq  88(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 328(%rsp),%rax
+mulq  56(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 264(%rsp),%rax
+mulq  72(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 264(%rsp),%rax
+mulq  80(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 264(%rsp),%rax
+mulq  88(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+mov  %rcx,%r8
+shr  $51,%rcx
+and  %rdx,%rsi
+add  %r10,%rcx
+mov  %rcx,%r9
+shr  $51,%rcx
+and  %rdx,%r8
+add  %r12,%rcx
+mov  %rcx,%rax
+shr  $51,%rcx
+and  %rdx,%r9
+add  %r14,%rcx
+mov  %rcx,%r10
+shr  $51,%rcx
+and  %rdx,%rax
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+mov  %rsi,%rdx
+mov  %r8,%rcx
+mov  %r9,%r11
+mov  %rax,%r12
+mov  %r10,%r13
+add  x25519_x86_64_2P0(%rip),%rdx
+add  x25519_x86_64_2P1234(%rip),%rcx
+add  x25519_x86_64_2P1234(%rip),%r11
+add  x25519_x86_64_2P1234(%rip),%r12
+add  x25519_x86_64_2P1234(%rip),%r13
+addq 96(%rsp),%rsi
+addq 104(%rsp),%r8
+addq 112(%rsp),%r9
+addq 120(%rsp),%rax
+addq 128(%rsp),%r10
+subq 96(%rsp),%rdx
+subq 104(%rsp),%rcx
+subq 112(%rsp),%r11
+subq 120(%rsp),%r12
+subq 128(%rsp),%r13
+movq   %rsi,120(%rdi)
+movq   %r8,128(%rdi)
+movq   %r9,136(%rdi)
+movq   %rax,144(%rdi)
+movq   %r10,152(%rdi)
+movq   %rdx,160(%rdi)
+movq   %rcx,168(%rdi)
+movq   %r11,176(%rdi)
+movq   %r12,184(%rdi)
+movq   %r13,192(%rdi)
+movq   120(%rdi),%rax
+mulq  120(%rdi)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq   120(%rdi),%rax
+shl  $1,%rax
+mulq  128(%rdi)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq   120(%rdi),%rax
+shl  $1,%rax
+mulq  136(%rdi)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq   120(%rdi),%rax
+shl  $1,%rax
+mulq  144(%rdi)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq   120(%rdi),%rax
+shl  $1,%rax
+mulq  152(%rdi)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq   128(%rdi),%rax
+mulq  128(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   128(%rdi),%rax
+shl  $1,%rax
+mulq  136(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq   128(%rdi),%rax
+shl  $1,%rax
+mulq  144(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   128(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  152(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   136(%rdi),%rax
+mulq  136(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   136(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  144(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   136(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  152(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   144(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  144(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   144(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  152(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   152(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  152(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+and  %rdx,%rsi
+mov  %rcx,%r8
+shr  $51,%rcx
+add  %r10,%rcx
+and  %rdx,%r8
+mov  %rcx,%r9
+shr  $51,%rcx
+add  %r12,%rcx
+and  %rdx,%r9
+mov  %rcx,%rax
+shr  $51,%rcx
+add  %r14,%rcx
+and  %rdx,%rax
+mov  %rcx,%r10
+shr  $51,%rcx
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq   %rsi,120(%rdi)
+movq   %r8,128(%rdi)
+movq   %r9,136(%rdi)
+movq   %rax,144(%rdi)
+movq   %r10,152(%rdi)
+movq   160(%rdi),%rax
+mulq  160(%rdi)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq   160(%rdi),%rax
+shl  $1,%rax
+mulq  168(%rdi)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq   160(%rdi),%rax
+shl  $1,%rax
+mulq  176(%rdi)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq   160(%rdi),%rax
+shl  $1,%rax
+mulq  184(%rdi)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq   160(%rdi),%rax
+shl  $1,%rax
+mulq  192(%rdi)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq   168(%rdi),%rax
+mulq  168(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   168(%rdi),%rax
+shl  $1,%rax
+mulq  176(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq   168(%rdi),%rax
+shl  $1,%rax
+mulq  184(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   168(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  192(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   176(%rdi),%rax
+mulq  176(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   176(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  184(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   176(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  192(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   184(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  184(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   184(%rdi),%rdx
+imulq  $38,%rdx,%rax
+mulq  192(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   192(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  192(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+and  %rdx,%rsi
+mov  %rcx,%r8
+shr  $51,%rcx
+add  %r10,%rcx
+and  %rdx,%r8
+mov  %rcx,%r9
+shr  $51,%rcx
+add  %r12,%rcx
+and  %rdx,%r9
+mov  %rcx,%rax
+shr  $51,%rcx
+add  %r14,%rcx
+and  %rdx,%rax
+mov  %rcx,%r10
+shr  $51,%rcx
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq   %rsi,160(%rdi)
+movq   %r8,168(%rdi)
+movq   %r9,176(%rdi)
+movq   %rax,184(%rdi)
+movq   %r10,192(%rdi)
+movq   184(%rdi),%rsi
+imulq  $19,%rsi,%rax
+movq %rax,56(%rsp)
+mulq  16(%rdi)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq   192(%rdi),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,64(%rsp)
+mulq  8(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   160(%rdi),%rax
+mulq  0(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   160(%rdi),%rax
+mulq  8(%rdi)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq   160(%rdi),%rax
+mulq  16(%rdi)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq   160(%rdi),%rax
+mulq  24(%rdi)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq   160(%rdi),%rax
+mulq  32(%rdi)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq   168(%rdi),%rax
+mulq  0(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   168(%rdi),%rax
+mulq  8(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   168(%rdi),%rax
+mulq  16(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq   168(%rdi),%rax
+mulq  24(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   168(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   176(%rdi),%rax
+mulq  0(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   176(%rdi),%rax
+mulq  8(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq   176(%rdi),%rax
+mulq  16(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq   176(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  24(%rdi)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   176(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq   184(%rdi),%rax
+mulq  0(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq   184(%rdi),%rax
+mulq  8(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq 56(%rsp),%rax
+mulq  24(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq 56(%rsp),%rax
+mulq  32(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq   192(%rdi),%rax
+mulq  0(%rdi)
+add  %rax,%r14
+adc %rdx,%r15
+movq 64(%rsp),%rax
+mulq  16(%rdi)
+add  %rax,%r8
+adc %rdx,%r9
+movq 64(%rsp),%rax
+mulq  24(%rdi)
+add  %rax,%r10
+adc %rdx,%r11
+movq 64(%rsp),%rax
+mulq  32(%rdi)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+mov  %rcx,%r8
+shr  $51,%rcx
+and  %rdx,%rsi
+add  %r10,%rcx
+mov  %rcx,%r9
+shr  $51,%rcx
+and  %rdx,%r8
+add  %r12,%rcx
+mov  %rcx,%rax
+shr  $51,%rcx
+and  %rdx,%r9
+add  %r14,%rcx
+mov  %rcx,%r10
+shr  $51,%rcx
+and  %rdx,%rax
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq   %rsi,160(%rdi)
+movq   %r8,168(%rdi)
+movq   %r9,176(%rdi)
+movq   %rax,184(%rdi)
+movq   %r10,192(%rdi)
+movq 200(%rsp),%rsi
+imulq  $19,%rsi,%rax
+movq %rax,56(%rsp)
+mulq  152(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 208(%rsp),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,64(%rsp)
+mulq  144(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 176(%rsp),%rax
+mulq  136(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 176(%rsp),%rax
+mulq  144(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 176(%rsp),%rax
+mulq  152(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 176(%rsp),%rax
+mulq  160(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 176(%rsp),%rax
+mulq  168(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 184(%rsp),%rax
+mulq  136(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 184(%rsp),%rax
+mulq  144(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 184(%rsp),%rax
+mulq  152(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 184(%rsp),%rax
+mulq  160(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 184(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  168(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 192(%rsp),%rax
+mulq  136(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 192(%rsp),%rax
+mulq  144(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 192(%rsp),%rax
+mulq  152(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 192(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  160(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 192(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  168(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 200(%rsp),%rax
+mulq  136(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 200(%rsp),%rax
+mulq  144(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 56(%rsp),%rax
+mulq  160(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 56(%rsp),%rax
+mulq  168(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 208(%rsp),%rax
+mulq  136(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 64(%rsp),%rax
+mulq  152(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 64(%rsp),%rax
+mulq  160(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 64(%rsp),%rax
+mulq  168(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+mov  %rcx,%r8
+shr  $51,%rcx
+and  %rdx,%rsi
+add  %r10,%rcx
+mov  %rcx,%r9
+shr  $51,%rcx
+and  %rdx,%r8
+add  %r12,%rcx
+mov  %rcx,%rax
+shr  $51,%rcx
+and  %rdx,%r9
+add  %r14,%rcx
+mov  %rcx,%r10
+shr  $51,%rcx
+and  %rdx,%rax
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq   %rsi,40(%rdi)
+movq   %r8,48(%rdi)
+movq   %r9,56(%rdi)
+movq   %rax,64(%rdi)
+movq   %r10,72(%rdi)
+movq 216(%rsp),%rax
+mulq  x25519_x86_64_121666_213(%rip)
+shr  $13,%rax
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 224(%rsp),%rax
+mulq  x25519_x86_64_121666_213(%rip)
+shr  $13,%rax
+add  %rax,%rcx
+mov  %rdx,%r8
+movq 232(%rsp),%rax
+mulq  x25519_x86_64_121666_213(%rip)
+shr  $13,%rax
+add  %rax,%r8
+mov  %rdx,%r9
+movq 240(%rsp),%rax
+mulq  x25519_x86_64_121666_213(%rip)
+shr  $13,%rax
+add  %rax,%r9
+mov  %rdx,%r10
+movq 248(%rsp),%rax
+mulq  x25519_x86_64_121666_213(%rip)
+shr  $13,%rax
+add  %rax,%r10
+imulq  $19,%rdx,%rdx
+add  %rdx,%rsi
+addq 136(%rsp),%rsi
+addq 144(%rsp),%rcx
+addq 152(%rsp),%r8
+addq 160(%rsp),%r9
+addq 168(%rsp),%r10
+movq   %rsi,80(%rdi)
+movq   %rcx,88(%rdi)
+movq   %r8,96(%rdi)
+movq   %r9,104(%rdi)
+movq   %r10,112(%rdi)
+movq   104(%rdi),%rsi
+imulq  $19,%rsi,%rax
+movq %rax,56(%rsp)
+mulq  232(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq   112(%rdi),%rdx
+imulq  $19,%rdx,%rax
+movq %rax,64(%rsp)
+mulq  224(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   80(%rdi),%rax
+mulq  216(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   80(%rdi),%rax
+mulq  224(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq   80(%rdi),%rax
+mulq  232(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq   80(%rdi),%rax
+mulq  240(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq   80(%rdi),%rax
+mulq  248(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq   88(%rdi),%rax
+mulq  216(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq   88(%rdi),%rax
+mulq  224(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq   88(%rdi),%rax
+mulq  232(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq   88(%rdi),%rax
+mulq  240(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq   88(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  248(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   96(%rdi),%rax
+mulq  216(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq   96(%rdi),%rax
+mulq  224(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq   96(%rdi),%rax
+mulq  232(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq   96(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  240(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq   96(%rdi),%rdx
+imulq  $19,%rdx,%rax
+mulq  248(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq   104(%rdi),%rax
+mulq  216(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq   104(%rdi),%rax
+mulq  224(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 56(%rsp),%rax
+mulq  240(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 56(%rsp),%rax
+mulq  248(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq   112(%rdi),%rax
+mulq  216(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 64(%rsp),%rax
+mulq  232(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 64(%rsp),%rax
+mulq  240(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 64(%rsp),%rax
+mulq  248(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+mov  %rcx,%r8
+shr  $51,%rcx
+and  %rdx,%rsi
+add  %r10,%rcx
+mov  %rcx,%r9
+shr  $51,%rcx
+and  %rdx,%r8
+add  %r12,%rcx
+mov  %rcx,%rax
+shr  $51,%rcx
+and  %rdx,%r9
+add  %r14,%rcx
+mov  %rcx,%r10
+shr  $51,%rcx
+and  %rdx,%rax
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq   %rsi,80(%rdi)
+movq   %r8,88(%rdi)
+movq   %r9,96(%rdi)
+movq   %rax,104(%rdi)
+movq   %r10,112(%rdi)
+movq 0(%rsp),%r11
+movq 8(%rsp),%r12
+movq 16(%rsp),%r13
+movq 24(%rsp),%r14
+movq 32(%rsp),%r15
+movq 40(%rsp),%rbx
+movq 48(%rsp),%rbp
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
+
+.p2align 5
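+/* x25519_x86_64_work_cswap(f, c): if c is one, swap the field elements
+ * f[0],f[1] with f[2],f[3], using conditional moves rather than branches. */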
+.globl C_ABI(x25519_x86_64_work_cswap)
+HIDDEN C_ABI(x25519_x86_64_work_cswap)
+C_ABI(x25519_x86_64_work_cswap):
+mov %rsp,%r11
+and $31,%r11
+add $0,%r11
+sub %r11,%rsp
+cmp  $1,%rsi
+movq   0(%rdi),%rsi
+movq   80(%rdi),%rdx
+movq   8(%rdi),%rcx
+movq   88(%rdi),%r8
+mov  %rsi,%r9
+cmove %rdx,%rsi
+cmove %r9,%rdx
+mov  %rcx,%r9
+cmove %r8,%rcx
+cmove %r9,%r8
+movq   %rsi,0(%rdi)
+movq   %rdx,80(%rdi)
+movq   %rcx,8(%rdi)
+movq   %r8,88(%rdi)
+movq   16(%rdi),%rsi
+movq   96(%rdi),%rdx
+movq   24(%rdi),%rcx
+movq   104(%rdi),%r8
+mov  %rsi,%r9
+cmove %rdx,%rsi
+cmove %r9,%rdx
+mov  %rcx,%r9
+cmove %r8,%rcx
+cmove %r9,%r8
+movq   %rsi,16(%rdi)
+movq   %rdx,96(%rdi)
+movq   %rcx,24(%rdi)
+movq   %r8,104(%rdi)
+movq   32(%rdi),%rsi
+movq   112(%rdi),%rdx
+movq   40(%rdi),%rcx
+movq   120(%rdi),%r8
+mov  %rsi,%r9
+cmove %rdx,%rsi
+cmove %r9,%rdx
+mov  %rcx,%r9
+cmove %r8,%rcx
+cmove %r9,%r8
+movq   %rsi,32(%rdi)
+movq   %rdx,112(%rdi)
+movq   %rcx,40(%rdi)
+movq   %r8,120(%rdi)
+movq   48(%rdi),%rsi
+movq   128(%rdi),%rdx
+movq   56(%rdi),%rcx
+movq   136(%rdi),%r8
+mov  %rsi,%r9
+cmove %rdx,%rsi
+cmove %r9,%rdx
+mov  %rcx,%r9
+cmove %r8,%rcx
+cmove %r9,%r8
+movq   %rsi,48(%rdi)
+movq   %rdx,128(%rdi)
+movq   %rcx,56(%rdi)
+movq   %r8,136(%rdi)
+movq   64(%rdi),%rsi
+movq   144(%rdi),%rdx
+movq   72(%rdi),%rcx
+movq   152(%rdi),%r8
+mov  %rsi,%r9
+cmove %rdx,%rsi
+cmove %r9,%rdx
+mov  %rcx,%r9
+cmove %r8,%rcx
+cmove %r9,%r8
+movq   %rsi,64(%rdi)
+movq   %rdx,144(%rdi)
+movq   %rcx,72(%rdi)
+movq   %r8,152(%rdi)
+add %r11,%rsp
+mov %rdi,%rax
+mov %rsi,%rdx
+ret
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c
index d9c58cb..5d0db2e 100644
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -28,6 +28,8 @@
 #include <openssl/rand.h>
 #include <openssl/sha.h>
 
+#include "internal.h"
+
 
 /* fe means field element. Here the field is \Z/(2^255-19). An element t,
  * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
@@ -225,21 +227,6 @@
   h[0] = 1;
 }
 
-/* Replace (f,g) with (g,f) if b == 1;
- * replace (f,g) with (f,g) if b == 0.
- *
- * Preconditions: b in {0,1}. */
-static void fe_cswap(fe f, fe g, unsigned int b) {
-  b = 0-b;
-  unsigned i;
-  for (i = 0; i < 10; i++) {
-    int32_t x = f[i] ^ g[i];
-    x &= b;
-    f[i] ^= x;
-    g[i] ^= x;
-  }
-}
-
 /* h = f + g
  * Can overlap h with f or g.
  *
@@ -720,70 +707,6 @@
   fe_mul(out, t1, t0);
 }
 
-/* h = f * 121666
- * Can overlap h with f.
- *
- * Preconditions:
- *    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
- *
- * Postconditions:
- *    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. */
-static void fe_mul121666(fe h, fe f) {
-  int32_t f0 = f[0];
-  int32_t f1 = f[1];
-  int32_t f2 = f[2];
-  int32_t f3 = f[3];
-  int32_t f4 = f[4];
-  int32_t f5 = f[5];
-  int32_t f6 = f[6];
-  int32_t f7 = f[7];
-  int32_t f8 = f[8];
-  int32_t f9 = f[9];
-  int64_t h0 = f0 * (int64_t) 121666;
-  int64_t h1 = f1 * (int64_t) 121666;
-  int64_t h2 = f2 * (int64_t) 121666;
-  int64_t h3 = f3 * (int64_t) 121666;
-  int64_t h4 = f4 * (int64_t) 121666;
-  int64_t h5 = f5 * (int64_t) 121666;
-  int64_t h6 = f6 * (int64_t) 121666;
-  int64_t h7 = f7 * (int64_t) 121666;
-  int64_t h8 = f8 * (int64_t) 121666;
-  int64_t h9 = f9 * (int64_t) 121666;
-  int64_t carry0;
-  int64_t carry1;
-  int64_t carry2;
-  int64_t carry3;
-  int64_t carry4;
-  int64_t carry5;
-  int64_t carry6;
-  int64_t carry7;
-  int64_t carry8;
-  int64_t carry9;
-
-  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
-  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
-  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
-  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
-  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
-
-  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
-  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
-  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
-  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
-  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
-
-  h[0] = h0;
-  h[1] = h1;
-  h[2] = h2;
-  h[3] = h3;
-  h[4] = h4;
-  h[5] = h5;
-  h[6] = h6;
-  h[7] = h7;
-  h[8] = h8;
-  h[9] = h9;
-}
-
 /* h = -f
  *
  * Preconditions:
@@ -4761,6 +4684,95 @@
   return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0;
 }
 
+
+#if defined(BORINGSSL_X25519_X86_64)
+
+static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
+                               const uint8_t point[32]) {
+  x25519_x86_64(out, scalar, point);
+}
+
+#else
+
+/* Replace (f,g) with (g,f) if b == 1;
+ * replace (f,g) with (f,g) if b == 0.
+ *
+ * Preconditions: b in {0,1}. */
+static void fe_cswap(fe f, fe g, unsigned int b) {
+  b = 0-b;
+  unsigned i;
+  for (i = 0; i < 10; i++) {
+    int32_t x = f[i] ^ g[i];
+    x &= b;
+    f[i] ^= x;
+    g[i] ^= x;
+  }
+}
+
+/* h = f * 121666
+ * Can overlap h with f.
+ *
+ * Preconditions:
+ *    |f| bounded by 1.1*2^26,1.1*2^25,1.1*2^26,1.1*2^25,etc.
+ *
+ * Postconditions:
+ *    |h| bounded by 1.1*2^25,1.1*2^24,1.1*2^25,1.1*2^24,etc. */
+static void fe_mul121666(fe h, fe f) {
+  int32_t f0 = f[0];
+  int32_t f1 = f[1];
+  int32_t f2 = f[2];
+  int32_t f3 = f[3];
+  int32_t f4 = f[4];
+  int32_t f5 = f[5];
+  int32_t f6 = f[6];
+  int32_t f7 = f[7];
+  int32_t f8 = f[8];
+  int32_t f9 = f[9];
+  int64_t h0 = f0 * (int64_t) 121666;
+  int64_t h1 = f1 * (int64_t) 121666;
+  int64_t h2 = f2 * (int64_t) 121666;
+  int64_t h3 = f3 * (int64_t) 121666;
+  int64_t h4 = f4 * (int64_t) 121666;
+  int64_t h5 = f5 * (int64_t) 121666;
+  int64_t h6 = f6 * (int64_t) 121666;
+  int64_t h7 = f7 * (int64_t) 121666;
+  int64_t h8 = f8 * (int64_t) 121666;
+  int64_t h9 = f9 * (int64_t) 121666;
+  int64_t carry0;
+  int64_t carry1;
+  int64_t carry2;
+  int64_t carry3;
+  int64_t carry4;
+  int64_t carry5;
+  int64_t carry6;
+  int64_t carry7;
+  int64_t carry8;
+  int64_t carry9;
+
+  carry9 = (h9 + (int64_t) (1<<24)) >> 25; h0 += carry9 * 19; h9 -= carry9 << 25;
+  carry1 = (h1 + (int64_t) (1<<24)) >> 25; h2 += carry1; h1 -= carry1 << 25;
+  carry3 = (h3 + (int64_t) (1<<24)) >> 25; h4 += carry3; h3 -= carry3 << 25;
+  carry5 = (h5 + (int64_t) (1<<24)) >> 25; h6 += carry5; h5 -= carry5 << 25;
+  carry7 = (h7 + (int64_t) (1<<24)) >> 25; h8 += carry7; h7 -= carry7 << 25;
+
+  carry0 = (h0 + (int64_t) (1<<25)) >> 26; h1 += carry0; h0 -= carry0 << 26;
+  carry2 = (h2 + (int64_t) (1<<25)) >> 26; h3 += carry2; h2 -= carry2 << 26;
+  carry4 = (h4 + (int64_t) (1<<25)) >> 26; h5 += carry4; h4 -= carry4 << 26;
+  carry6 = (h6 + (int64_t) (1<<25)) >> 26; h7 += carry6; h6 -= carry6 << 26;
+  carry8 = (h8 + (int64_t) (1<<25)) >> 26; h9 += carry8; h8 -= carry8 << 26;
+
+  h[0] = h0;
+  h[1] = h1;
+  h[2] = h2;
+  h[3] = h3;
+  h[4] = h4;
+  h[5] = h5;
+  h[6] = h6;
+  h[7] = h7;
+  h[8] = h8;
+  h[9] = h9;
+}
+
 static void x25519_scalar_mult_generic(uint8_t out[32],
                                        const uint8_t scalar[32],
                                        const uint8_t point[32]) {
@@ -4812,12 +4824,6 @@
   fe_tobytes(out, x2);
 }
 
-#if defined(OPENSSL_ARM)
-/* x25519_NEON is defined in asm/x25519-arm.S. */
-void x25519_NEON(uint8_t out[32], const uint8_t scalar[32],
-                 const uint8_t point[32]);
-#endif
-
 static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32],
                                const uint8_t point[32]) {
 #if defined(OPENSSL_ARM)
@@ -4830,6 +4836,9 @@
   x25519_scalar_mult_generic(out, scalar, point);
 }
 
+#endif  /* BORINGSSL_X25519_X86_64 */
+
+
 void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) {
   RAND_bytes(out_private_key, 32);
   X25519_public_from_private(out_public_value, out_private_key);
@@ -4843,6 +4852,20 @@
   return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0;
 }
 
+#if defined(BORINGSSL_X25519_X86_64)
+
+/* When |BORINGSSL_X25519_X86_64| is set, base point multiplication is done with
+ * the Montgomery ladder because it's faster. Otherwise it's done using the
+ * Ed25519 tables. */
+
+void X25519_public_from_private(uint8_t out_public_value[32],
+                                const uint8_t private_key[32]) {
+  static const uint8_t kMontgomeryBasePoint[32] = {9};
+  x25519_scalar_mult(out_public_value, private_key, kMontgomeryBasePoint);
+}
+
+#else
+
 void X25519_public_from_private(uint8_t out_public_value[32],
                                 const uint8_t private_key[32]) {
 #if defined(OPENSSL_ARM)
@@ -4871,3 +4894,5 @@
   fe_mul(zplusy, zplusy, zminusy_inv);
   fe_tobytes(out_public_value, zplusy);
 }
+
+#endif  /* BORINGSSL_X25519_X86_64 */
diff --git a/crypto/curve25519/internal.h b/crypto/curve25519/internal.h
new file mode 100644
index 0000000..6468f91
--- /dev/null
+++ b/crypto/curve25519/internal.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2015, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H
+#define OPENSSL_HEADER_CURVE25519_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_SMALL) && \
+    !defined(OPENSSL_WINDOWS)
+#define BORINGSSL_X25519_X86_64
+
+void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
+                   const uint8_t point[32]);
+#endif
+
+
+#if defined(OPENSSL_ARM)
+/* x25519_NEON is defined in asm/x25519-arm.S. */
+void x25519_NEON(uint8_t out[32], const uint8_t scalar[32],
+                 const uint8_t point[32]);
+#endif
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#endif  /* OPENSSL_HEADER_CURVE25519_INTERNAL_H */
diff --git a/crypto/curve25519/x25519-x86_64.c b/crypto/curve25519/x25519-x86_64.c
new file mode 100644
index 0000000..9776c75
--- /dev/null
+++ b/crypto/curve25519/x25519-x86_64.c
@@ -0,0 +1,225 @@
+#include <openssl/curve25519.h>
+
+#include <string.h>
+
+#include "internal.h"
+
+
+#if defined(BORINGSSL_X25519_X86_64)
+
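+/* fe25519 is a field element of GF(2^255-19) in radix-2^51 representation,
+ * i.e. v[0] + 2^51*v[1] + 2^102*v[2] + 2^153*v[3] + 2^204*v[4]. */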
+typedef struct { uint64_t v[5]; } fe25519;
+
+/* These functions are defined in asm/x25519-x86_64.S */
+void x25519_x86_64_work_cswap(fe25519 *f, uint64_t swap);
+void x25519_x86_64_mul(fe25519 *out, const fe25519 *a, const fe25519 *b);
+void x25519_x86_64_square(fe25519 *out, const fe25519 *a);
+void x25519_x86_64_freeze(fe25519 *);
+void x25519_x86_64_ladderstep(fe25519 *work);
+
+static void fe25519_setint(fe25519 *r, unsigned v) {
+  r->v[0] = v;
+  r->v[1] = 0;
+  r->v[2] = 0;
+  r->v[3] = 0;
+  r->v[4] = 0;
+}
+
+/* Assumes the input x is reduced below 2^255. */
+static void fe25519_pack(unsigned char r[32], const fe25519 *x) {
+  fe25519 t;
+  t = *x;
+  x25519_x86_64_freeze(&t);
+
+  r[0] = (uint8_t)(t.v[0] & 0xff);
+  r[1] = (uint8_t)((t.v[0] >> 8) & 0xff);
+  r[2] = (uint8_t)((t.v[0] >> 16) & 0xff);
+  r[3] = (uint8_t)((t.v[0] >> 24) & 0xff);
+  r[4] = (uint8_t)((t.v[0] >> 32) & 0xff);
+  r[5] = (uint8_t)((t.v[0] >> 40) & 0xff);
+  r[6] = (uint8_t)((t.v[0] >> 48));
+
+  r[6] ^= (uint8_t)((t.v[1] << 3) & 0xf8);
+  r[7] = (uint8_t)((t.v[1] >> 5) & 0xff);
+  r[8] = (uint8_t)((t.v[1] >> 13) & 0xff);
+  r[9] = (uint8_t)((t.v[1] >> 21) & 0xff);
+  r[10] = (uint8_t)((t.v[1] >> 29) & 0xff);
+  r[11] = (uint8_t)((t.v[1] >> 37) & 0xff);
+  r[12] = (uint8_t)((t.v[1] >> 45));
+
+  r[12] ^= (uint8_t)((t.v[2] << 6) & 0xc0);
+  r[13] = (uint8_t)((t.v[2] >> 2) & 0xff);
+  r[14] = (uint8_t)((t.v[2] >> 10) & 0xff);
+  r[15] = (uint8_t)((t.v[2] >> 18) & 0xff);
+  r[16] = (uint8_t)((t.v[2] >> 26) & 0xff);
+  r[17] = (uint8_t)((t.v[2] >> 34) & 0xff);
+  r[18] = (uint8_t)((t.v[2] >> 42) & 0xff);
+  r[19] = (uint8_t)((t.v[2] >> 50));
+
+  r[19] ^= (uint8_t)((t.v[3] << 1) & 0xfe);
+  r[20] = (uint8_t)((t.v[3] >> 7) & 0xff);
+  r[21] = (uint8_t)((t.v[3] >> 15) & 0xff);
+  r[22] = (uint8_t)((t.v[3] >> 23) & 0xff);
+  r[23] = (uint8_t)((t.v[3] >> 31) & 0xff);
+  r[24] = (uint8_t)((t.v[3] >> 39) & 0xff);
+  r[25] = (uint8_t)((t.v[3] >> 47));
+
+  r[25] ^= (uint8_t)((t.v[4] << 4) & 0xf0);
+  r[26] = (uint8_t)((t.v[4] >> 4) & 0xff);
+  r[27] = (uint8_t)((t.v[4] >> 12) & 0xff);
+  r[28] = (uint8_t)((t.v[4] >> 20) & 0xff);
+  r[29] = (uint8_t)((t.v[4] >> 28) & 0xff);
+  r[30] = (uint8_t)((t.v[4] >> 36) & 0xff);
+  r[31] = (uint8_t)((t.v[4] >> 44));
+}
+
+static void fe25519_unpack(fe25519 *r, const uint8_t x[32]) {
+  r->v[0] = x[0];
+  r->v[0] += (uint64_t)x[1] << 8;
+  r->v[0] += (uint64_t)x[2] << 16;
+  r->v[0] += (uint64_t)x[3] << 24;
+  r->v[0] += (uint64_t)x[4] << 32;
+  r->v[0] += (uint64_t)x[5] << 40;
+  r->v[0] += ((uint64_t)x[6] & 7) << 48;
+
+  r->v[1] = x[6] >> 3;
+  r->v[1] += (uint64_t)x[7] << 5;
+  r->v[1] += (uint64_t)x[8] << 13;
+  r->v[1] += (uint64_t)x[9] << 21;
+  r->v[1] += (uint64_t)x[10] << 29;
+  r->v[1] += (uint64_t)x[11] << 37;
+  r->v[1] += ((uint64_t)x[12] & 63) << 45;
+
+  r->v[2] = x[12] >> 6;
+  r->v[2] += (uint64_t)x[13] << 2;
+  r->v[2] += (uint64_t)x[14] << 10;
+  r->v[2] += (uint64_t)x[15] << 18;
+  r->v[2] += (uint64_t)x[16] << 26;
+  r->v[2] += (uint64_t)x[17] << 34;
+  r->v[2] += (uint64_t)x[18] << 42;
+  r->v[2] += ((uint64_t)x[19] & 1) << 50;
+
+  r->v[3] = x[19] >> 1;
+  r->v[3] += (uint64_t)x[20] << 7;
+  r->v[3] += (uint64_t)x[21] << 15;
+  r->v[3] += (uint64_t)x[22] << 23;
+  r->v[3] += (uint64_t)x[23] << 31;
+  r->v[3] += (uint64_t)x[24] << 39;
+  r->v[3] += ((uint64_t)x[25] & 15) << 47;
+
+  r->v[4] = x[25] >> 4;
+  r->v[4] += (uint64_t)x[26] << 4;
+  r->v[4] += (uint64_t)x[27] << 12;
+  r->v[4] += (uint64_t)x[28] << 20;
+  r->v[4] += (uint64_t)x[29] << 28;
+  r->v[4] += (uint64_t)x[30] << 36;
+  r->v[4] += ((uint64_t)x[31] & 127) << 44;
+}
+
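+/* fe25519_invert computes r = x^(2^255 - 21) = x^(p-2) with an addition
+ * chain; by Fermat's little theorem this is 1/x mod p = 2^255-19. */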
+static void fe25519_invert(fe25519 *r, const fe25519 *x) {
+  fe25519 z2;
+  fe25519 z9;
+  fe25519 z11;
+  fe25519 z2_5_0;
+  fe25519 z2_10_0;
+  fe25519 z2_20_0;
+  fe25519 z2_50_0;
+  fe25519 z2_100_0;
+  fe25519 t;
+  int i;
+
+  /* 2 */ x25519_x86_64_square(&z2, x);
+  /* 4 */ x25519_x86_64_square(&t, &z2);
+  /* 8 */ x25519_x86_64_square(&t, &t);
+  /* 9 */ x25519_x86_64_mul(&z9, &t, x);
+  /* 11 */ x25519_x86_64_mul(&z11, &z9, &z2);
+  /* 22 */ x25519_x86_64_square(&t, &z11);
+  /* 2^5 - 2^0 = 31 */ x25519_x86_64_mul(&z2_5_0, &t, &z9);
+
+  /* 2^6 - 2^1 */ x25519_x86_64_square(&t, &z2_5_0);
+  /* 2^10 - 2^5 */ for (i = 1; i < 5; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^10 - 2^0 */ x25519_x86_64_mul(&z2_10_0, &t, &z2_5_0);
+
+  /* 2^11 - 2^1 */ x25519_x86_64_square(&t, &z2_10_0);
+  /* 2^20 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^20 - 2^0 */ x25519_x86_64_mul(&z2_20_0, &t, &z2_10_0);
+
+  /* 2^21 - 2^1 */ x25519_x86_64_square(&t, &z2_20_0);
+  /* 2^40 - 2^20 */ for (i = 1; i < 20; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^40 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_20_0);
+
+  /* 2^41 - 2^1 */ x25519_x86_64_square(&t, &t);
+  /* 2^50 - 2^10 */ for (i = 1; i < 10; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^50 - 2^0 */ x25519_x86_64_mul(&z2_50_0, &t, &z2_10_0);
+
+  /* 2^51 - 2^1 */ x25519_x86_64_square(&t, &z2_50_0);
+  /* 2^100 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^100 - 2^0 */ x25519_x86_64_mul(&z2_100_0, &t, &z2_50_0);
+
+  /* 2^101 - 2^1 */ x25519_x86_64_square(&t, &z2_100_0);
+  /* 2^200 - 2^100 */ for (i = 1; i < 100; i++) {
+    x25519_x86_64_square(&t, &t);
+  }
+  /* 2^200 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_100_0);
+
+  /* 2^201 - 2^1 */ x25519_x86_64_square(&t, &t);
+  /* 2^250 - 2^50 */ for (i = 1; i < 50; i++) { x25519_x86_64_square(&t, &t); }
+  /* 2^250 - 2^0 */ x25519_x86_64_mul(&t, &t, &z2_50_0);
+
+  /* 2^251 - 2^1 */ x25519_x86_64_square(&t, &t);
+  /* 2^252 - 2^2 */ x25519_x86_64_square(&t, &t);
+  /* 2^253 - 2^3 */ x25519_x86_64_square(&t, &t);
+
+  /* 2^254 - 2^4 */ x25519_x86_64_square(&t, &t);
+
+  /* 2^255 - 2^5 */ x25519_x86_64_square(&t, &t);
+  /* 2^255 - 21 */ x25519_x86_64_mul(r, &t, &z11);
+}
+
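+/* mladder runs the Montgomery ladder over scalar bits 254..0 (bit 255 of a
+ * clamped scalar is zero). work[] holds {x1, X2, Z2, X3, Z3}; conditional
+ * swaps are deferred via |prevbit|, and the result (X2:Z2) ends up in
+ * (*xr, *zr). */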
+static void mladder(fe25519 *xr, fe25519 *zr, const uint8_t s[32]) {
+  fe25519 work[5];
+
+  work[0] = *xr;
+  fe25519_setint(work + 1, 1);
+  fe25519_setint(work + 2, 0);
+  work[3] = *xr;
+  fe25519_setint(work + 4, 1);
+
+  int i, j;
+  uint8_t prevbit = 0;
+
+  j = 6;
+  for (i = 31; i >= 0; i--) {
+    while (j >= 0) {
+      const uint8_t bit = 1 & (s[i] >> j);
+      const uint64_t swap = bit ^ prevbit;
+      prevbit = bit;
+      x25519_x86_64_work_cswap(work + 1, swap);
+      x25519_x86_64_ladderstep(work);
+      j -= 1;
+    }
+    j = 7;
+  }
+
+  *xr = work[1];
+  *zr = work[2];
+}
+
+void x25519_x86_64(uint8_t out[32], const uint8_t scalar[32],
+                  const uint8_t point[32]) {
+  uint8_t e[32];
+  memcpy(e, scalar, sizeof(e));
+
+  e[0] &= 248;
+  e[31] &= 127;
+  e[31] |= 64;
+
+  fe25519 t;
+  fe25519 z;
+  fe25519_unpack(&t, point);
+  mladder(&t, &z, e);
+  fe25519_invert(&z, &z);
+  x25519_x86_64_mul(&t, &t, &z);
+  fe25519_pack(out, &t);
+}
+
+#endif  /* BORINGSSL_X25519_X86_64 */