Improve Curve25519 cswap x64 assembly

This change replaces the cmovq-based scheme with slightly faster SSE2 code.
The SSE2 code was first introduced in Go's curve25519 implementation.
See: https://go-review.googlesource.com/c/39693/

The implementation is basically copied from the Go assembly.
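
For reference, the masked XOR swap that the SSE2 code performs is
equivalent to the following C sketch (illustrative only; the function
name is hypothetical, and the ten 64-bit limbs correspond to the two
80-byte halves at 0(%rdi) and 80(%rdi)):

    #include <stdint.h>

    /* Constant-time conditional swap: if swap is 1, exchange a and b;
     * if swap is 0, leave both unchanged. The assembly builds the same
     * all-zeros/all-ones mask with "subq $1,%rsi; notq %rsi" and
     * broadcasts it into %xmm15 with pshufd. */
    static void cswap_sketch(uint64_t a[10], uint64_t b[10],
                             uint64_t swap) {
      uint64_t mask = 0 - swap;  /* 0 or 0xffffffffffffffff */
      for (int i = 0; i < 10; i++) {
        uint64_t t = mask & (a[i] ^ b[i]);
        a[i] ^= t;
        b[i] ^= t;
      }
    }

The XOR/AND/XOR form avoids any secret-dependent branch or address,
which is why both the old cmovq code and the new SSE2 code stay
constant time.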

Change-Id: I25931a421ba141ce33809875699f048b0941c061
Reviewed-on: https://boringssl-review.googlesource.com/16564
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/curve25519/asm/x25519-asm-x86_64.S b/crypto/curve25519/asm/x25519-asm-x86_64.S
index 18041d0..6cff53e 100644
--- a/crypto/curve25519/asm/x25519-asm-x86_64.S
+++ b/crypto/curve25519/asm/x25519-asm-x86_64.S
@@ -1838,79 +1838,55 @@
 HIDDEN C_ABI(x25519_x86_64_work_cswap)
 C_ABI(x25519_x86_64_work_cswap):
 .cfi_startproc
-cmp  $1,%rsi
-movq   0(%rdi),%rsi
-movq   80(%rdi),%rdx
-movq   8(%rdi),%rcx
-movq   88(%rdi),%r8
-mov  %rsi,%r9
-cmove %rdx,%rsi
-cmove %r9,%rdx
-mov  %rcx,%r9
-cmove %r8,%rcx
-cmove %r9,%r8
-movq   %rsi,0(%rdi)
-movq   %rdx,80(%rdi)
-movq   %rcx,8(%rdi)
-movq   %r8,88(%rdi)
-movq   16(%rdi),%rsi
-movq   96(%rdi),%rdx
-movq   24(%rdi),%rcx
-movq   104(%rdi),%r8
-mov  %rsi,%r9
-cmove %rdx,%rsi
-cmove %r9,%rdx
-mov  %rcx,%r9
-cmove %r8,%rcx
-cmove %r9,%r8
-movq   %rsi,16(%rdi)
-movq   %rdx,96(%rdi)
-movq   %rcx,24(%rdi)
-movq   %r8,104(%rdi)
-movq   32(%rdi),%rsi
-movq   112(%rdi),%rdx
-movq   40(%rdi),%rcx
-movq   120(%rdi),%r8
-mov  %rsi,%r9
-cmove %rdx,%rsi
-cmove %r9,%rdx
-mov  %rcx,%r9
-cmove %r8,%rcx
-cmove %r9,%r8
-movq   %rsi,32(%rdi)
-movq   %rdx,112(%rdi)
-movq   %rcx,40(%rdi)
-movq   %r8,120(%rdi)
-movq   48(%rdi),%rsi
-movq   128(%rdi),%rdx
-movq   56(%rdi),%rcx
-movq   136(%rdi),%r8
-mov  %rsi,%r9
-cmove %rdx,%rsi
-cmove %r9,%rdx
-mov  %rcx,%r9
-cmove %r8,%rcx
-cmove %r9,%r8
-movq   %rsi,48(%rdi)
-movq   %rdx,128(%rdi)
-movq   %rcx,56(%rdi)
-movq   %r8,136(%rdi)
-movq   64(%rdi),%rsi
-movq   144(%rdi),%rdx
-movq   72(%rdi),%rcx
-movq   152(%rdi),%r8
-mov  %rsi,%r9
-cmove %rdx,%rsi
-cmove %r9,%rdx
-mov  %rcx,%r9
-cmove %r8,%rcx
-cmove %r9,%r8
-movq   %rsi,64(%rdi)
-movq   %rdx,144(%rdi)
-movq   %rcx,72(%rdi)
-movq   %r8,152(%rdi)
-mov %rdi,%rax
-mov %rsi,%rdx
+subq $1,%rsi
+notq %rsi
+movq %rsi,%xmm15
+pshufd $0x44,%xmm15,%xmm15
+movdqu 0(%rdi),%xmm0
+movdqu 16(%rdi),%xmm2
+movdqu 32(%rdi),%xmm4
+movdqu 48(%rdi),%xmm6
+movdqu 64(%rdi),%xmm8
+movdqu 80(%rdi),%xmm1
+movdqu 96(%rdi),%xmm3
+movdqu 112(%rdi),%xmm5
+movdqu 128(%rdi),%xmm7
+movdqu 144(%rdi),%xmm9
+movdqa %xmm1,%xmm10
+movdqa %xmm3,%xmm11
+movdqa %xmm5,%xmm12
+movdqa %xmm7,%xmm13
+movdqa %xmm9,%xmm14
+pxor %xmm0,%xmm10
+pxor %xmm2,%xmm11
+pxor %xmm4,%xmm12
+pxor %xmm6,%xmm13
+pxor %xmm8,%xmm14
+pand %xmm15,%xmm10
+pand %xmm15,%xmm11
+pand %xmm15,%xmm12
+pand %xmm15,%xmm13
+pand %xmm15,%xmm14
+pxor %xmm10,%xmm0
+pxor %xmm10,%xmm1
+pxor %xmm11,%xmm2
+pxor %xmm11,%xmm3
+pxor %xmm12,%xmm4
+pxor %xmm12,%xmm5
+pxor %xmm13,%xmm6
+pxor %xmm13,%xmm7
+pxor %xmm14,%xmm8
+pxor %xmm14,%xmm9
+movdqu %xmm0,0(%rdi)
+movdqu %xmm2,16(%rdi)
+movdqu %xmm4,32(%rdi)
+movdqu %xmm6,48(%rdi)
+movdqu %xmm8,64(%rdi)
+movdqu %xmm1,80(%rdi)
+movdqu %xmm3,96(%rdi)
+movdqu %xmm5,112(%rdi)
+movdqu %xmm7,128(%rdi)
+movdqu %xmm9,144(%rdi)
 ret
 .cfi_endproc