Reorder the X25519 ladderstep stack frame on x86-64.

The current X25519 assembly has a 352-byte stack frame and saves the
callee-saved registers at the bottom. This means that the CFI
information cannot be represented in the “compact” form that macOS
seems to want to use (see linked bug).

The stack frame looked like:

 360 CFA
 352 return address
 ⋮
 56  (296 bytes of scratch space)
 48  saved RBP
 40  saved RBX
 32  saved R15
 24  saved R14
 16  saved R13
 8   saved R12
 0   (hole left from 3f38d80b dropping the superfluous saving of R11)

Now it looks like:

 352 CFA
 344 return address
 336 saved RBP
 328 saved RBX
 320 saved R15
 312 saved R14
 304 saved R13
 296 saved R12
 ⋮
 0   (296 bytes of scratch space)
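
This layout is what the compact form can express. As a minimal sketch,
assuming the frameless x86-64 compact-unwind encoding (which, as I
understand it, stores just the frame size plus a permutation of up to
six saved registers sitting directly below the return address), the
prologue now has the shape:

  sub $344,%rsp        # frame size: the only stack adjustment recorded
  movq %r12,296(%rsp)  # all six callee-saved registers are contiguous
  movq %r13,304(%rsp)  # at the top of the frame, directly below the
  movq %r14,312(%rsp)  # return address, so the encoding can describe
  movq %r15,320(%rsp)  # them as a permutation instead of needing full
  movq %rbx,328(%rsp)  # DWARF CFI
  movq %rbp,336(%rsp)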

The bulk of the changes involve subtracting 56 from all the offsets to
RSP when working in the scratch space, since the scratch space now
starts at offset zero rather than 56. (The frame itself shrinks by
eight bytes, to 344, because the hole at the bottom is gone.) This was
done in Vim with:
  '<,'>s/\([1-9][0-9]*\)(%rsp)/\=submatch(1)-56."(%rsp)"/
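
For illustration, the two kinds of change look like this (one line of
each, taken from the patch below):

  movq %r12,8(%rsp)    # before: register save at the bottom
  movq %r12,296(%rsp)  # after:  register save at the top

  movq %rsi,56(%rsp)   # before: scratch space starting at offset 56
  movq %rsi,0(%rsp)    # after:  the same slot, 56 bytes lower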

BUG=176

Change-Id: I022830e8f896fe2d877015fa3ecfa1d073207679
Reviewed-on: https://boringssl-review.googlesource.com/13580
Commit-Queue: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/curve25519/asm/x25519-asm-x86_64.S b/crypto/curve25519/asm/x25519-asm-x86_64.S
index dc8df94..18041d0 100644
--- a/crypto/curve25519/asm/x25519-asm-x86_64.S
+++ b/crypto/curve25519/asm/x25519-asm-x86_64.S
@@ -447,20 +447,20 @@
 HIDDEN C_ABI(x25519_x86_64_ladderstep)
 C_ABI(x25519_x86_64_ladderstep):
 .cfi_startproc
-sub $352,%rsp
-.cfi_adjust_cfa_offset 352
-movq %r12,8(%rsp)
-.cfi_rel_offset r12, 8
-movq %r13,16(%rsp)
-.cfi_rel_offset r13, 16
-movq %r14,24(%rsp)
-.cfi_rel_offset r14, 24
-movq %r15,32(%rsp)
-.cfi_rel_offset r15, 32
-movq %rbx,40(%rsp)
-.cfi_rel_offset rbx, 40
-movq %rbp,48(%rsp)
-.cfi_rel_offset rbp, 48
+sub $344,%rsp
+.cfi_adjust_cfa_offset 344
+movq %r12,296(%rsp)
+.cfi_rel_offset r12, 296
+movq %r13,304(%rsp)
+.cfi_rel_offset r13, 304
+movq %r14,312(%rsp)
+.cfi_rel_offset r14, 312
+movq %r15,320(%rsp)
+.cfi_rel_offset r15, 320
+movq %rbx,328(%rsp)
+.cfi_rel_offset rbx, 328
+movq %rbp,336(%rsp)
+.cfi_rel_offset rbp, 336
 movq   40(%rdi),%rsi
 movq   48(%rdi),%rdx
 movq   56(%rdi),%rcx
@@ -486,201 +486,86 @@
 subq 96(%rdi),%r11
 subq 104(%rdi),%r12
 subq 112(%rdi),%r13
-movq %rsi,56(%rsp)
-movq %rdx,64(%rsp)
-movq %rcx,72(%rsp)
-movq %r8,80(%rsp)
-movq %r9,88(%rsp)
-movq %rax,96(%rsp)
-movq %r10,104(%rsp)
-movq %r11,112(%rsp)
-movq %r12,120(%rsp)
-movq %r13,128(%rsp)
-movq 96(%rsp),%rax
-mulq  96(%rsp)
+movq %rsi,0(%rsp)
+movq %rdx,8(%rsp)
+movq %rcx,16(%rsp)
+movq %r8,24(%rsp)
+movq %r9,32(%rsp)
+movq %rax,40(%rsp)
+movq %r10,48(%rsp)
+movq %r11,56(%rsp)
+movq %r12,64(%rsp)
+movq %r13,72(%rsp)
+movq 40(%rsp),%rax
+mulq  40(%rsp)
 mov  %rax,%rsi
 mov  %rdx,%rcx
-movq 96(%rsp),%rax
+movq 40(%rsp),%rax
 shl  $1,%rax
-mulq  104(%rsp)
+mulq  48(%rsp)
 mov  %rax,%r8
 mov  %rdx,%r9
-movq 96(%rsp),%rax
+movq 40(%rsp),%rax
 shl  $1,%rax
-mulq  112(%rsp)
+mulq  56(%rsp)
 mov  %rax,%r10
 mov  %rdx,%r11
-movq 96(%rsp),%rax
+movq 40(%rsp),%rax
 shl  $1,%rax
-mulq  120(%rsp)
+mulq  64(%rsp)
 mov  %rax,%r12
 mov  %rdx,%r13
-movq 96(%rsp),%rax
+movq 40(%rsp),%rax
 shl  $1,%rax
-mulq  128(%rsp)
+mulq  72(%rsp)
 mov  %rax,%r14
 mov  %rdx,%r15
-movq 104(%rsp),%rax
-mulq  104(%rsp)
+movq 48(%rsp),%rax
+mulq  48(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 104(%rsp),%rax
+movq 48(%rsp),%rax
 shl  $1,%rax
-mulq  112(%rsp)
+mulq  56(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 104(%rsp),%rax
+movq 48(%rsp),%rax
 shl  $1,%rax
-mulq  120(%rsp)
+mulq  64(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 104(%rsp),%rdx
+movq 48(%rsp),%rdx
 imulq  $38,%rdx,%rax
-mulq  128(%rsp)
+mulq  72(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 112(%rsp),%rax
-mulq  112(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 112(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  120(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 112(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  128(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 120(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  120(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 120(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  128(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 128(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  128(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq x25519_x86_64_REDMASK51(%rip),%rdx
-shld $13,%rsi,%rcx
-and  %rdx,%rsi
-shld $13,%r8,%r9
-and  %rdx,%r8
-add  %rcx,%r8
-shld $13,%r10,%r11
-and  %rdx,%r10
-add  %r9,%r10
-shld $13,%r12,%r13
-and  %rdx,%r12
-add  %r11,%r12
-shld $13,%r14,%r15
-and  %rdx,%r14
-add  %r13,%r14
-imulq  $19,%r15,%rcx
-add  %rcx,%rsi
-mov  %rsi,%rcx
-shr  $51,%rcx
-add  %r8,%rcx
-and  %rdx,%rsi
-mov  %rcx,%r8
-shr  $51,%rcx
-add  %r10,%rcx
-and  %rdx,%r8
-mov  %rcx,%r9
-shr  $51,%rcx
-add  %r12,%rcx
-and  %rdx,%r9
-mov  %rcx,%rax
-shr  $51,%rcx
-add  %r14,%rcx
-and  %rdx,%rax
-mov  %rcx,%r10
-shr  $51,%rcx
-imulq  $19,%rcx,%rcx
-add  %rcx,%rsi
-and  %rdx,%r10
-movq %rsi,136(%rsp)
-movq %r8,144(%rsp)
-movq %r9,152(%rsp)
-movq %rax,160(%rsp)
-movq %r10,168(%rsp)
 movq 56(%rsp),%rax
 mulq  56(%rsp)
-mov  %rax,%rsi
-mov  %rdx,%rcx
-movq 56(%rsp),%rax
-shl  $1,%rax
-mulq  64(%rsp)
-mov  %rax,%r8
-mov  %rdx,%r9
-movq 56(%rsp),%rax
-shl  $1,%rax
-mulq  72(%rsp)
-mov  %rax,%r10
-mov  %rdx,%r11
-movq 56(%rsp),%rax
-shl  $1,%rax
-mulq  80(%rsp)
-mov  %rax,%r12
-mov  %rdx,%r13
-movq 56(%rsp),%rax
-shl  $1,%rax
-mulq  88(%rsp)
-mov  %rax,%r14
-mov  %rdx,%r15
-movq 64(%rsp),%rax
-mulq  64(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 64(%rsp),%rax
-shl  $1,%rax
-mulq  72(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 64(%rsp),%rax
-shl  $1,%rax
-mulq  80(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
+movq 56(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  64(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 56(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  72(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 64(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  64(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
 movq 64(%rsp),%rdx
 imulq  $38,%rdx,%rax
-mulq  88(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 72(%rsp),%rax
 mulq  72(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 72(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  80(%rsp)
-add  %rax,%rsi
-adc %rdx,%rcx
-movq 72(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  88(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 80(%rsp),%rdx
-imulq  $19,%rdx,%rax
-mulq  80(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 80(%rsp),%rdx
-imulq  $38,%rdx,%rax
-mulq  88(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 88(%rsp),%rdx
+movq 72(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  88(%rsp)
+mulq  72(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq x25519_x86_64_REDMASK51(%rip),%rdx
@@ -721,11 +606,126 @@
 imulq  $19,%rcx,%rcx
 add  %rcx,%rsi
 and  %rdx,%r10
-movq %rsi,176(%rsp)
-movq %r8,184(%rsp)
-movq %r9,192(%rsp)
-movq %rax,200(%rsp)
-movq %r10,208(%rsp)
+movq %rsi,80(%rsp)
+movq %r8,88(%rsp)
+movq %r9,96(%rsp)
+movq %rax,104(%rsp)
+movq %r10,112(%rsp)
+movq 0(%rsp),%rax
+mulq  0(%rsp)
+mov  %rax,%rsi
+mov  %rdx,%rcx
+movq 0(%rsp),%rax
+shl  $1,%rax
+mulq  8(%rsp)
+mov  %rax,%r8
+mov  %rdx,%r9
+movq 0(%rsp),%rax
+shl  $1,%rax
+mulq  16(%rsp)
+mov  %rax,%r10
+mov  %rdx,%r11
+movq 0(%rsp),%rax
+shl  $1,%rax
+mulq  24(%rsp)
+mov  %rax,%r12
+mov  %rdx,%r13
+movq 0(%rsp),%rax
+shl  $1,%rax
+mulq  32(%rsp)
+mov  %rax,%r14
+mov  %rdx,%r15
+movq 8(%rsp),%rax
+mulq  8(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 8(%rsp),%rax
+shl  $1,%rax
+mulq  16(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 8(%rsp),%rax
+shl  $1,%rax
+mulq  24(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 8(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 16(%rsp),%rax
+mulq  16(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 16(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  24(%rsp)
+add  %rax,%rsi
+adc %rdx,%rcx
+movq 16(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 24(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  24(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 24(%rsp),%rdx
+imulq  $38,%rdx,%rax
+mulq  32(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 32(%rsp),%rdx
+imulq  $19,%rdx,%rax
+mulq  32(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq x25519_x86_64_REDMASK51(%rip),%rdx
+shld $13,%rsi,%rcx
+and  %rdx,%rsi
+shld $13,%r8,%r9
+and  %rdx,%r8
+add  %rcx,%r8
+shld $13,%r10,%r11
+and  %rdx,%r10
+add  %r9,%r10
+shld $13,%r12,%r13
+and  %rdx,%r12
+add  %r11,%r12
+shld $13,%r14,%r15
+and  %rdx,%r14
+add  %r13,%r14
+imulq  $19,%r15,%rcx
+add  %rcx,%rsi
+mov  %rsi,%rcx
+shr  $51,%rcx
+add  %r8,%rcx
+and  %rdx,%rsi
+mov  %rcx,%r8
+shr  $51,%rcx
+add  %r10,%rcx
+and  %rdx,%r8
+mov  %rcx,%r9
+shr  $51,%rcx
+add  %r12,%rcx
+and  %rdx,%r9
+mov  %rcx,%rax
+shr  $51,%rcx
+add  %r14,%rcx
+and  %rdx,%rax
+mov  %rcx,%r10
+shr  $51,%rcx
+imulq  $19,%rcx,%rcx
+add  %rcx,%rsi
+and  %rdx,%r10
+movq %rsi,120(%rsp)
+movq %r8,128(%rsp)
+movq %r9,136(%rsp)
+movq %rax,144(%rsp)
+movq %r10,152(%rsp)
 mov  %rsi,%rsi
 mov  %r8,%rdx
 mov  %r9,%rcx
@@ -736,16 +736,16 @@
 add  x25519_x86_64_2P1234(%rip),%rcx
 add  x25519_x86_64_2P1234(%rip),%r8
 add  x25519_x86_64_2P1234(%rip),%r9
-subq 136(%rsp),%rsi
-subq 144(%rsp),%rdx
-subq 152(%rsp),%rcx
-subq 160(%rsp),%r8
-subq 168(%rsp),%r9
-movq %rsi,216(%rsp)
-movq %rdx,224(%rsp)
-movq %rcx,232(%rsp)
-movq %r8,240(%rsp)
-movq %r9,248(%rsp)
+subq 80(%rsp),%rsi
+subq 88(%rsp),%rdx
+subq 96(%rsp),%rcx
+subq 104(%rsp),%r8
+subq 112(%rsp),%r9
+movq %rsi,160(%rsp)
+movq %rdx,168(%rsp)
+movq %rcx,176(%rsp)
+movq %r8,184(%rsp)
+movq %r9,192(%rsp)
 movq   120(%rdi),%rsi
 movq   128(%rdi),%rdx
 movq   136(%rdi),%rcx
@@ -771,121 +771,121 @@
 subq 176(%rdi),%r11
 subq 184(%rdi),%r12
 subq 192(%rdi),%r13
-movq %rsi,256(%rsp)
-movq %rdx,264(%rsp)
-movq %rcx,272(%rsp)
-movq %r8,280(%rsp)
-movq %r9,288(%rsp)
-movq %rax,296(%rsp)
-movq %r10,304(%rsp)
-movq %r11,312(%rsp)
-movq %r12,320(%rsp)
-movq %r13,328(%rsp)
-movq 280(%rsp),%rsi
+movq %rsi,200(%rsp)
+movq %rdx,208(%rsp)
+movq %rcx,216(%rsp)
+movq %r8,224(%rsp)
+movq %r9,232(%rsp)
+movq %rax,240(%rsp)
+movq %r10,248(%rsp)
+movq %r11,256(%rsp)
+movq %r12,264(%rsp)
+movq %r13,272(%rsp)
+movq 224(%rsp),%rsi
 imulq  $19,%rsi,%rax
-movq %rax,336(%rsp)
-mulq  112(%rsp)
+movq %rax,280(%rsp)
+mulq  56(%rsp)
 mov  %rax,%rsi
 mov  %rdx,%rcx
-movq 288(%rsp),%rdx
+movq 232(%rsp),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,344(%rsp)
-mulq  104(%rsp)
+movq %rax,288(%rsp)
+mulq  48(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 256(%rsp),%rax
-mulq  96(%rsp)
+movq 200(%rsp),%rax
+mulq  40(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 256(%rsp),%rax
-mulq  104(%rsp)
+movq 200(%rsp),%rax
+mulq  48(%rsp)
 mov  %rax,%r8
 mov  %rdx,%r9
-movq 256(%rsp),%rax
-mulq  112(%rsp)
+movq 200(%rsp),%rax
+mulq  56(%rsp)
 mov  %rax,%r10
 mov  %rdx,%r11
-movq 256(%rsp),%rax
-mulq  120(%rsp)
+movq 200(%rsp),%rax
+mulq  64(%rsp)
 mov  %rax,%r12
 mov  %rdx,%r13
-movq 256(%rsp),%rax
-mulq  128(%rsp)
+movq 200(%rsp),%rax
+mulq  72(%rsp)
 mov  %rax,%r14
 mov  %rdx,%r15
-movq 264(%rsp),%rax
-mulq  96(%rsp)
+movq 208(%rsp),%rax
+mulq  40(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 264(%rsp),%rax
-mulq  104(%rsp)
+movq 208(%rsp),%rax
+mulq  48(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 264(%rsp),%rax
-mulq  112(%rsp)
+movq 208(%rsp),%rax
+mulq  56(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 264(%rsp),%rax
-mulq  120(%rsp)
+movq 208(%rsp),%rax
+mulq  64(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 264(%rsp),%rdx
+movq 208(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  128(%rsp)
+mulq  72(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 272(%rsp),%rax
-mulq  96(%rsp)
+movq 216(%rsp),%rax
+mulq  40(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 272(%rsp),%rax
-mulq  104(%rsp)
+movq 216(%rsp),%rax
+mulq  48(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 272(%rsp),%rax
-mulq  112(%rsp)
+movq 216(%rsp),%rax
+mulq  56(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 272(%rsp),%rdx
+movq 216(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  120(%rsp)
+mulq  64(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 272(%rsp),%rdx
+movq 216(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  128(%rsp)
+mulq  72(%rsp)
+add  %rax,%r8
+adc %rdx,%r9
+movq 224(%rsp),%rax
+mulq  40(%rsp)
+add  %rax,%r12
+adc %rdx,%r13
+movq 224(%rsp),%rax
+mulq  48(%rsp)
+add  %rax,%r14
+adc %rdx,%r15
+movq 280(%rsp),%rax
+mulq  64(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
 movq 280(%rsp),%rax
-mulq  96(%rsp)
-add  %rax,%r12
-adc %rdx,%r13
-movq 280(%rsp),%rax
-mulq  104(%rsp)
+mulq  72(%rsp)
+add  %rax,%r10
+adc %rdx,%r11
+movq 232(%rsp),%rax
+mulq  40(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 336(%rsp),%rax
-mulq  120(%rsp)
+movq 288(%rsp),%rax
+mulq  56(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 336(%rsp),%rax
-mulq  128(%rsp)
+movq 288(%rsp),%rax
+mulq  64(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
 movq 288(%rsp),%rax
-mulq  96(%rsp)
-add  %rax,%r14
-adc %rdx,%r15
-movq 344(%rsp),%rax
-mulq  112(%rsp)
-add  %rax,%r8
-adc %rdx,%r9
-movq 344(%rsp),%rax
-mulq  120(%rsp)
-add  %rax,%r10
-adc %rdx,%r11
-movq 344(%rsp),%rax
-mulq  128(%rsp)
+mulq  72(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq x25519_x86_64_REDMASK51(%rip),%rdx
@@ -926,116 +926,116 @@
 imulq  $19,%rcx,%rcx
 add  %rcx,%rsi
 and  %rdx,%r10
-movq %rsi,96(%rsp)
-movq %r8,104(%rsp)
-movq %r9,112(%rsp)
-movq %rax,120(%rsp)
-movq %r10,128(%rsp)
-movq 320(%rsp),%rsi
+movq %rsi,40(%rsp)
+movq %r8,48(%rsp)
+movq %r9,56(%rsp)
+movq %rax,64(%rsp)
+movq %r10,72(%rsp)
+movq 264(%rsp),%rsi
 imulq  $19,%rsi,%rax
-movq %rax,256(%rsp)
-mulq  72(%rsp)
+movq %rax,200(%rsp)
+mulq  16(%rsp)
 mov  %rax,%rsi
 mov  %rdx,%rcx
-movq 328(%rsp),%rdx
+movq 272(%rsp),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,264(%rsp)
-mulq  64(%rsp)
+movq %rax,208(%rsp)
+mulq  8(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 296(%rsp),%rax
-mulq  56(%rsp)
+movq 240(%rsp),%rax
+mulq  0(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 296(%rsp),%rax
-mulq  64(%rsp)
+movq 240(%rsp),%rax
+mulq  8(%rsp)
 mov  %rax,%r8
 mov  %rdx,%r9
-movq 296(%rsp),%rax
-mulq  72(%rsp)
+movq 240(%rsp),%rax
+mulq  16(%rsp)
 mov  %rax,%r10
 mov  %rdx,%r11
-movq 296(%rsp),%rax
-mulq  80(%rsp)
+movq 240(%rsp),%rax
+mulq  24(%rsp)
 mov  %rax,%r12
 mov  %rdx,%r13
-movq 296(%rsp),%rax
-mulq  88(%rsp)
+movq 240(%rsp),%rax
+mulq  32(%rsp)
 mov  %rax,%r14
 mov  %rdx,%r15
-movq 304(%rsp),%rax
-mulq  56(%rsp)
+movq 248(%rsp),%rax
+mulq  0(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 304(%rsp),%rax
-mulq  64(%rsp)
+movq 248(%rsp),%rax
+mulq  8(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 304(%rsp),%rax
-mulq  72(%rsp)
+movq 248(%rsp),%rax
+mulq  16(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 304(%rsp),%rax
-mulq  80(%rsp)
+movq 248(%rsp),%rax
+mulq  24(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 304(%rsp),%rdx
+movq 248(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  88(%rsp)
+mulq  32(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 312(%rsp),%rax
-mulq  56(%rsp)
+movq 256(%rsp),%rax
+mulq  0(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 312(%rsp),%rax
-mulq  64(%rsp)
+movq 256(%rsp),%rax
+mulq  8(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 312(%rsp),%rax
-mulq  72(%rsp)
+movq 256(%rsp),%rax
+mulq  16(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 312(%rsp),%rdx
+movq 256(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  80(%rsp)
+mulq  24(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 312(%rsp),%rdx
+movq 256(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  88(%rsp)
+mulq  32(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 320(%rsp),%rax
-mulq  56(%rsp)
+movq 264(%rsp),%rax
+mulq  0(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 320(%rsp),%rax
-mulq  64(%rsp)
+movq 264(%rsp),%rax
+mulq  8(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 256(%rsp),%rax
-mulq  80(%rsp)
+movq 200(%rsp),%rax
+mulq  24(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 256(%rsp),%rax
-mulq  88(%rsp)
+movq 200(%rsp),%rax
+mulq  32(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 328(%rsp),%rax
-mulq  56(%rsp)
+movq 272(%rsp),%rax
+mulq  0(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 264(%rsp),%rax
-mulq  72(%rsp)
+movq 208(%rsp),%rax
+mulq  16(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 264(%rsp),%rax
-mulq  80(%rsp)
+movq 208(%rsp),%rax
+mulq  24(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 264(%rsp),%rax
-mulq  88(%rsp)
+movq 208(%rsp),%rax
+mulq  32(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq x25519_x86_64_REDMASK51(%rip),%rdx
@@ -1086,16 +1086,16 @@
 add  x25519_x86_64_2P1234(%rip),%r11
 add  x25519_x86_64_2P1234(%rip),%r12
 add  x25519_x86_64_2P1234(%rip),%r13
-addq 96(%rsp),%rsi
-addq 104(%rsp),%r8
-addq 112(%rsp),%r9
-addq 120(%rsp),%rax
-addq 128(%rsp),%r10
-subq 96(%rsp),%rdx
-subq 104(%rsp),%rcx
-subq 112(%rsp),%r11
-subq 120(%rsp),%r12
-subq 128(%rsp),%r13
+addq 40(%rsp),%rsi
+addq 48(%rsp),%r8
+addq 56(%rsp),%r9
+addq 64(%rsp),%rax
+addq 72(%rsp),%r10
+subq 40(%rsp),%rdx
+subq 48(%rsp),%rcx
+subq 56(%rsp),%r11
+subq 64(%rsp),%r12
+subq 72(%rsp),%r13
 movq   %rsi,120(%rdi)
 movq   %r8,128(%rdi)
 movq   %r9,136(%rdi)
@@ -1338,13 +1338,13 @@
 movq   %r10,192(%rdi)
 movq   184(%rdi),%rsi
 imulq  $19,%rsi,%rax
-movq %rax,56(%rsp)
+movq %rax,0(%rsp)
 mulq  16(%rdi)
 mov  %rax,%rsi
 mov  %rdx,%rcx
 movq   192(%rdi),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,64(%rsp)
+movq %rax,8(%rsp)
 mulq  8(%rdi)
 add  %rax,%rsi
 adc %rdx,%rcx
@@ -1419,11 +1419,11 @@
 mulq  8(%rdi)
 add  %rax,%r14
 adc %rdx,%r15
-movq 56(%rsp),%rax
+movq 0(%rsp),%rax
 mulq  24(%rdi)
 add  %rax,%r8
 adc %rdx,%r9
-movq 56(%rsp),%rax
+movq 0(%rsp),%rax
 mulq  32(%rdi)
 add  %rax,%r10
 adc %rdx,%r11
@@ -1431,15 +1431,15 @@
 mulq  0(%rdi)
 add  %rax,%r14
 adc %rdx,%r15
-movq 64(%rsp),%rax
+movq 8(%rsp),%rax
 mulq  16(%rdi)
 add  %rax,%r8
 adc %rdx,%r9
-movq 64(%rsp),%rax
+movq 8(%rsp),%rax
 mulq  24(%rdi)
 add  %rax,%r10
 adc %rdx,%r11
-movq 64(%rsp),%rax
+movq 8(%rsp),%rax
 mulq  32(%rdi)
 add  %rax,%r12
 adc %rdx,%r13
@@ -1486,111 +1486,111 @@
 movq   %r9,176(%rdi)
 movq   %rax,184(%rdi)
 movq   %r10,192(%rdi)
-movq 200(%rsp),%rsi
+movq 144(%rsp),%rsi
 imulq  $19,%rsi,%rax
-movq %rax,56(%rsp)
-mulq  152(%rsp)
+movq %rax,0(%rsp)
+mulq  96(%rsp)
 mov  %rax,%rsi
 mov  %rdx,%rcx
-movq 208(%rsp),%rdx
+movq 152(%rsp),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,64(%rsp)
-mulq  144(%rsp)
+movq %rax,8(%rsp)
+mulq  88(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 176(%rsp),%rax
-mulq  136(%rsp)
+movq 120(%rsp),%rax
+mulq  80(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 176(%rsp),%rax
-mulq  144(%rsp)
+movq 120(%rsp),%rax
+mulq  88(%rsp)
 mov  %rax,%r8
 mov  %rdx,%r9
-movq 176(%rsp),%rax
-mulq  152(%rsp)
+movq 120(%rsp),%rax
+mulq  96(%rsp)
 mov  %rax,%r10
 mov  %rdx,%r11
-movq 176(%rsp),%rax
-mulq  160(%rsp)
+movq 120(%rsp),%rax
+mulq  104(%rsp)
 mov  %rax,%r12
 mov  %rdx,%r13
-movq 176(%rsp),%rax
-mulq  168(%rsp)
+movq 120(%rsp),%rax
+mulq  112(%rsp)
 mov  %rax,%r14
 mov  %rdx,%r15
-movq 184(%rsp),%rax
-mulq  136(%rsp)
+movq 128(%rsp),%rax
+mulq  80(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 184(%rsp),%rax
-mulq  144(%rsp)
+movq 128(%rsp),%rax
+mulq  88(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 184(%rsp),%rax
-mulq  152(%rsp)
+movq 128(%rsp),%rax
+mulq  96(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 184(%rsp),%rax
-mulq  160(%rsp)
+movq 128(%rsp),%rax
+mulq  104(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 184(%rsp),%rdx
+movq 128(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  168(%rsp)
+mulq  112(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 192(%rsp),%rax
-mulq  136(%rsp)
+movq 136(%rsp),%rax
+mulq  80(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 192(%rsp),%rax
-mulq  144(%rsp)
+movq 136(%rsp),%rax
+mulq  88(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 192(%rsp),%rax
-mulq  152(%rsp)
+movq 136(%rsp),%rax
+mulq  96(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 192(%rsp),%rdx
+movq 136(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  160(%rsp)
+mulq  104(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
-movq 192(%rsp),%rdx
+movq 136(%rsp),%rdx
 imulq  $19,%rdx,%rax
-mulq  168(%rsp)
+mulq  112(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 200(%rsp),%rax
-mulq  136(%rsp)
+movq 144(%rsp),%rax
+mulq  80(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
-movq 200(%rsp),%rax
-mulq  144(%rsp)
+movq 144(%rsp),%rax
+mulq  88(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 56(%rsp),%rax
-mulq  160(%rsp)
+movq 0(%rsp),%rax
+mulq  104(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 56(%rsp),%rax
-mulq  168(%rsp)
+movq 0(%rsp),%rax
+mulq  112(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 208(%rsp),%rax
-mulq  136(%rsp)
+movq 152(%rsp),%rax
+mulq  80(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 64(%rsp),%rax
-mulq  152(%rsp)
+movq 8(%rsp),%rax
+mulq  96(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 64(%rsp),%rax
-mulq  160(%rsp)
+movq 8(%rsp),%rax
+mulq  104(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 64(%rsp),%rax
-mulq  168(%rsp)
+movq 8(%rsp),%rax
+mulq  112(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq x25519_x86_64_REDMASK51(%rip),%rdx
@@ -1636,37 +1636,37 @@
 movq   %r9,56(%rdi)
 movq   %rax,64(%rdi)
 movq   %r10,72(%rdi)
-movq 216(%rsp),%rax
+movq 160(%rsp),%rax
 mulq  x25519_x86_64_121666_213(%rip)
 shr  $13,%rax
 mov  %rax,%rsi
 mov  %rdx,%rcx
-movq 224(%rsp),%rax
+movq 168(%rsp),%rax
 mulq  x25519_x86_64_121666_213(%rip)
 shr  $13,%rax
 add  %rax,%rcx
 mov  %rdx,%r8
-movq 232(%rsp),%rax
+movq 176(%rsp),%rax
 mulq  x25519_x86_64_121666_213(%rip)
 shr  $13,%rax
 add  %rax,%r8
 mov  %rdx,%r9
-movq 240(%rsp),%rax
+movq 184(%rsp),%rax
 mulq  x25519_x86_64_121666_213(%rip)
 shr  $13,%rax
 add  %rax,%r9
 mov  %rdx,%r10
-movq 248(%rsp),%rax
+movq 192(%rsp),%rax
 mulq  x25519_x86_64_121666_213(%rip)
 shr  $13,%rax
 add  %rax,%r10
 imulq  $19,%rdx,%rdx
 add  %rdx,%rsi
-addq 136(%rsp),%rsi
-addq 144(%rsp),%rcx
-addq 152(%rsp),%r8
-addq 160(%rsp),%r9
-addq 168(%rsp),%r10
+addq 80(%rsp),%rsi
+addq 88(%rsp),%rcx
+addq 96(%rsp),%r8
+addq 104(%rsp),%r9
+addq 112(%rsp),%r10
 movq   %rsi,80(%rdi)
 movq   %rcx,88(%rdi)
 movq   %r8,96(%rdi)
@@ -1674,109 +1674,109 @@
 movq   %r10,112(%rdi)
 movq   104(%rdi),%rsi
 imulq  $19,%rsi,%rax
-movq %rax,56(%rsp)
-mulq  232(%rsp)
+movq %rax,0(%rsp)
+mulq  176(%rsp)
 mov  %rax,%rsi
 mov  %rdx,%rcx
 movq   112(%rdi),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,64(%rsp)
-mulq  224(%rsp)
+movq %rax,8(%rsp)
+mulq  168(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
 movq   80(%rdi),%rax
-mulq  216(%rsp)
+mulq  160(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
 movq   80(%rdi),%rax
-mulq  224(%rsp)
+mulq  168(%rsp)
 mov  %rax,%r8
 mov  %rdx,%r9
 movq   80(%rdi),%rax
-mulq  232(%rsp)
+mulq  176(%rsp)
 mov  %rax,%r10
 mov  %rdx,%r11
 movq   80(%rdi),%rax
-mulq  240(%rsp)
+mulq  184(%rsp)
 mov  %rax,%r12
 mov  %rdx,%r13
 movq   80(%rdi),%rax
-mulq  248(%rsp)
+mulq  192(%rsp)
 mov  %rax,%r14
 mov  %rdx,%r15
 movq   88(%rdi),%rax
-mulq  216(%rsp)
+mulq  160(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
 movq   88(%rdi),%rax
-mulq  224(%rsp)
+mulq  168(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
 movq   88(%rdi),%rax
-mulq  232(%rsp)
+mulq  176(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq   88(%rdi),%rax
-mulq  240(%rsp)
+mulq  184(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
 movq   88(%rdi),%rdx
 imulq  $19,%rdx,%rax
-mulq  248(%rsp)
+mulq  192(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
 movq   96(%rdi),%rax
-mulq  216(%rsp)
+mulq  160(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
 movq   96(%rdi),%rax
-mulq  224(%rsp)
+mulq  168(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq   96(%rdi),%rax
-mulq  232(%rsp)
+mulq  176(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
 movq   96(%rdi),%rdx
 imulq  $19,%rdx,%rax
-mulq  240(%rsp)
+mulq  184(%rsp)
 add  %rax,%rsi
 adc %rdx,%rcx
 movq   96(%rdi),%rdx
 imulq  $19,%rdx,%rax
-mulq  248(%rsp)
+mulq  192(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
 movq   104(%rdi),%rax
-mulq  216(%rsp)
+mulq  160(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq   104(%rdi),%rax
-mulq  224(%rsp)
+mulq  168(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 56(%rsp),%rax
-mulq  240(%rsp)
+movq 0(%rsp),%rax
+mulq  184(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 56(%rsp),%rax
-mulq  248(%rsp)
+movq 0(%rsp),%rax
+mulq  192(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
 movq   112(%rdi),%rax
-mulq  216(%rsp)
+mulq  160(%rsp)
 add  %rax,%r14
 adc %rdx,%r15
-movq 64(%rsp),%rax
-mulq  232(%rsp)
+movq 8(%rsp),%rax
+mulq  176(%rsp)
 add  %rax,%r8
 adc %rdx,%r9
-movq 64(%rsp),%rax
-mulq  240(%rsp)
+movq 8(%rsp),%rax
+mulq  184(%rsp)
 add  %rax,%r10
 adc %rdx,%r11
-movq 64(%rsp),%rax
-mulq  248(%rsp)
+movq 8(%rsp),%rax
+mulq  192(%rsp)
 add  %rax,%r12
 adc %rdx,%r13
 movq x25519_x86_64_REDMASK51(%rip),%rdx
@@ -1822,14 +1822,14 @@
 movq   %r9,96(%rdi)
 movq   %rax,104(%rdi)
 movq   %r10,112(%rdi)
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add $352,%rsp
-.cfi_adjust_cfa_offset -352
+movq 296(%rsp),%r12
+movq 304(%rsp),%r13
+movq 312(%rsp),%r14
+movq 320(%rsp),%r15
+movq 328(%rsp),%rbx
+movq 336(%rsp),%rbp
+add $344,%rsp
+.cfi_adjust_cfa_offset -344
 ret
 .cfi_endproc