Add CFI information to the x86-64 X25519 asm.

This change serves to check that all our consumers can process assembly
with CFI directives in it.

For the first change I picked a file that's not perlasm to keep things
slightly simpler, but that might have been a mistake:

DJB's tooling always aligns the stack to 32 bytes and it's not possible
to express this in DWARF format (without using a register to store the
old stack pointer).

Since none of the functions here appear to care about that alignment, I
removed it from each of them. I also trimmed the set of saved registers
where possible and used the redzone for functions that didn't need much
stack.

Overall, this appears to have slightly improved the performance (by
about 0.7%):

Before:

Did 46000 Curve25519 base-point multiplication operations in 3023288us (15215.2 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3017315us (15245.3 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3015346us (15255.3 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3018609us (15238.8 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3019004us (15236.8 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3013135us (15266.5 ops/sec)

After:

Did 46000 Curve25519 base-point multiplication operations in 3007659us (15294.3 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3054202us (15388.6 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3008714us (15288.9 ops/sec)
Did 46000 Curve25519 arbitrary point multiplication operations in 3004740us (15309.1 ops/sec)
Did 46000 Curve25519 base-point multiplication operations in 3009140us (15286.8 ops/sec)
Did 47000 Curve25519 arbitrary point multiplication operations in 3057518us (15371.9 ops/sec)

Change-Id: I31df11c45b2ea0bf44dde861d52c27f848331691
Reviewed-on: https://boringssl-review.googlesource.com/13200
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/curve25519/asm/x25519-asm-x86_64.S b/crypto/curve25519/asm/x25519-asm-x86_64.S
index 531ac16..72f69db 100644
--- a/crypto/curve25519/asm/x25519-asm-x86_64.S
+++ b/crypto/curve25519/asm/x25519-asm-x86_64.S
@@ -60,17 +60,10 @@
 .globl C_ABI(x25519_x86_64_freeze)
 HIDDEN C_ABI(x25519_x86_64_freeze)
 C_ABI(x25519_x86_64_freeze):
-mov %rsp,%r11
-and $31,%r11
-add $64,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
 movq   0(%rdi),%rsi
 movq   8(%rdi),%rdx
 movq   16(%rdi),%rcx
@@ -128,44 +121,40 @@
 movq   %rcx,16(%rdi)
 movq   %r8,24(%rdi)
 movq   %r9,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_mul)
 HIDDEN C_ABI(x25519_x86_64_mul)
 C_ABI(x25519_x86_64_mul):
-mov %rsp,%r11
-and $31,%r11
-add $96,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
-movq %rdi,56(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
+movq %r13,-16(%rsp)
+.cfi_rel_offset r13, -16
+movq %r14,-24(%rsp)
+.cfi_rel_offset r14, -24
+movq %r15,-32(%rsp)
+.cfi_rel_offset r15, -32
+movq %rbx,-40(%rsp)
+.cfi_rel_offset rbx, -40
+movq %rbp,-48(%rsp)
+.cfi_rel_offset rbp, -48
+movq %rdi,-56(%rsp)
+.cfi_rel_offset rdi, -56
 mov  %rdx,%rcx
 movq   24(%rsi),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,64(%rsp)
+movq %rax,-64(%rsp)
 mulq  16(%rcx)
 mov  %rax,%r8
 mov  %rdx,%r9
 movq   32(%rsi),%rdx
 imulq  $19,%rdx,%rax
-movq %rax,72(%rsp)
+movq %rax,-72(%rsp)
 mulq  8(%rcx)
 add  %rax,%r8
 adc %rdx,%r9
@@ -240,11 +229,11 @@
 mulq  8(%rcx)
 add  %rax,%rbx
 adc %rdx,%rbp
-movq 64(%rsp),%rax
+movq -64(%rsp),%rax
 mulq  24(%rcx)
 add  %rax,%r10
 adc %rdx,%r11
-movq 64(%rsp),%rax
+movq -64(%rsp),%rax
 mulq  32(%rcx)
 add  %rax,%r12
 adc %rdx,%r13
@@ -252,15 +241,15 @@
 mulq  0(%rcx)
 add  %rax,%rbx
 adc %rdx,%rbp
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq  16(%rcx)
 add  %rax,%r10
 adc %rdx,%r11
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq  24(%rcx)
 add  %rax,%r12
 adc %rdx,%r13
-movq 72(%rsp),%rax
+movq -72(%rsp),%rax
 mulq  32(%rcx)
 add  %rax,%r14
 adc %rdx,%r15
@@ -307,33 +296,31 @@
 movq   %r9,16(%rdi)
 movq   %rax,24(%rdi)
 movq   %r10,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
+movq -16(%rsp),%r13
+movq -24(%rsp),%r14
+movq -32(%rsp),%r15
+movq -40(%rsp),%rbx
+movq -48(%rsp),%rbp
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_square)
 HIDDEN C_ABI(x25519_x86_64_square)
 C_ABI(x25519_x86_64_square):
-mov %rsp,%r11
-and $31,%r11
-add $64,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
-movq %r12,8(%rsp)
-movq %r13,16(%rsp)
-movq %r14,24(%rsp)
-movq %r15,32(%rsp)
-movq %rbx,40(%rsp)
-movq %rbp,48(%rsp)
+.cfi_startproc
+/* This is a leaf function and uses the redzone for saving registers. */
+movq %r12,-8(%rsp)
+.cfi_rel_offset r12, -8
+movq %r13,-16(%rsp)
+.cfi_rel_offset r13, -16
+movq %r14,-24(%rsp)
+.cfi_rel_offset r14, -24
+movq %r15,-32(%rsp)
+.cfi_rel_offset r15, -32
+movq %rbx,-40(%rsp)
+.cfi_rel_offset rbx, -40
 movq   0(%rsi),%rax
 mulq  0(%rsi)
 mov  %rax,%rcx
@@ -449,33 +436,33 @@
 movq   %r9,16(%rdi)
 movq   %rax,24(%rdi)
 movq   %r10,32(%rdi)
-movq 0(%rsp),%r11
-movq 8(%rsp),%r12
-movq 16(%rsp),%r13
-movq 24(%rsp),%r14
-movq 32(%rsp),%r15
-movq 40(%rsp),%rbx
-movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+movq -8(%rsp),%r12
+movq -16(%rsp),%r13
+movq -24(%rsp),%r14
+movq -32(%rsp),%r15
+movq -40(%rsp),%rbx
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_ladderstep)
 HIDDEN C_ABI(x25519_x86_64_ladderstep)
 C_ABI(x25519_x86_64_ladderstep):
-mov %rsp,%r11
-and $31,%r11
-add $352,%r11
-sub %r11,%rsp
-movq %r11,0(%rsp)
+.cfi_startproc
+sub $352,%rsp
+.cfi_adjust_cfa_offset 352
 movq %r12,8(%rsp)
+.cfi_rel_offset r12, 8
 movq %r13,16(%rsp)
+.cfi_rel_offset r13, 16
 movq %r14,24(%rsp)
+.cfi_rel_offset r14, 24
 movq %r15,32(%rsp)
+.cfi_rel_offset r15, 32
 movq %rbx,40(%rsp)
+.cfi_rel_offset rbx, 40
 movq %rbp,48(%rsp)
+.cfi_rel_offset rbp, 48
 movq   40(%rdi),%rsi
 movq   48(%rdi),%rdx
 movq   56(%rdi),%rcx
@@ -1837,26 +1824,22 @@
 movq   %r9,96(%rdi)
 movq   %rax,104(%rdi)
 movq   %r10,112(%rdi)
-movq 0(%rsp),%r11
 movq 8(%rsp),%r12
 movq 16(%rsp),%r13
 movq 24(%rsp),%r14
 movq 32(%rsp),%r15
 movq 40(%rsp),%rbx
 movq 48(%rsp),%rbp
-add %r11,%rsp
-mov %rdi,%rax
-mov %rsi,%rdx
+add $352,%rsp
+.cfi_adjust_cfa_offset -352
 ret
+.cfi_endproc
 
 .p2align 5
 .globl C_ABI(x25519_x86_64_work_cswap)
 HIDDEN C_ABI(x25519_x86_64_work_cswap)
 C_ABI(x25519_x86_64_work_cswap):
-mov %rsp,%r11
-and $31,%r11
-add $0,%r11
-sub %r11,%rsp
+.cfi_startproc
 cmp  $1,%rsi
 movq   0(%rdi),%rsi
 movq   80(%rdi),%rdx
@@ -1928,10 +1911,10 @@
 movq   %rdx,144(%rdi)
 movq   %rcx,72(%rdi)
 movq   %r8,152(%rdi)
-add %r11,%rsp
 mov %rdi,%rax
 mov %rsi,%rdx
 ret
+.cfi_endproc
 
 #endif  /* __x86_64__ */
 #endif  /* !OPENSSL_NO_ASM */