Improve Curve25519 cswap x64 assembly This change replaces the cmovq scheme with slightly faster SSE2 code. The SSE2 code was first introduced in Go's curve25519 implementation. See: https://go-review.googlesource.com/c/39693/ The implementation is basically copied from the Go assembly. Change-Id: I25931a421ba141ce33809875699f048b0941c061 Reviewed-on: https://boringssl-review.googlesource.com/16564 Reviewed-by: Adam Langley <agl@google.com> Commit-Queue: Adam Langley <agl@google.com> CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/curve25519/asm/x25519-asm-x86_64.S b/crypto/curve25519/asm/x25519-asm-x86_64.S index 18041d0..6cff53e 100644 --- a/crypto/curve25519/asm/x25519-asm-x86_64.S +++ b/crypto/curve25519/asm/x25519-asm-x86_64.S
@@ -1838,79 +1838,55 @@ HIDDEN C_ABI(x25519_x86_64_work_cswap) C_ABI(x25519_x86_64_work_cswap): .cfi_startproc -cmp $1,%rsi -movq 0(%rdi),%rsi -movq 80(%rdi),%rdx -movq 8(%rdi),%rcx -movq 88(%rdi),%r8 -mov %rsi,%r9 -cmove %rdx,%rsi -cmove %r9,%rdx -mov %rcx,%r9 -cmove %r8,%rcx -cmove %r9,%r8 -movq %rsi,0(%rdi) -movq %rdx,80(%rdi) -movq %rcx,8(%rdi) -movq %r8,88(%rdi) -movq 16(%rdi),%rsi -movq 96(%rdi),%rdx -movq 24(%rdi),%rcx -movq 104(%rdi),%r8 -mov %rsi,%r9 -cmove %rdx,%rsi -cmove %r9,%rdx -mov %rcx,%r9 -cmove %r8,%rcx -cmove %r9,%r8 -movq %rsi,16(%rdi) -movq %rdx,96(%rdi) -movq %rcx,24(%rdi) -movq %r8,104(%rdi) -movq 32(%rdi),%rsi -movq 112(%rdi),%rdx -movq 40(%rdi),%rcx -movq 120(%rdi),%r8 -mov %rsi,%r9 -cmove %rdx,%rsi -cmove %r9,%rdx -mov %rcx,%r9 -cmove %r8,%rcx -cmove %r9,%r8 -movq %rsi,32(%rdi) -movq %rdx,112(%rdi) -movq %rcx,40(%rdi) -movq %r8,120(%rdi) -movq 48(%rdi),%rsi -movq 128(%rdi),%rdx -movq 56(%rdi),%rcx -movq 136(%rdi),%r8 -mov %rsi,%r9 -cmove %rdx,%rsi -cmove %r9,%rdx -mov %rcx,%r9 -cmove %r8,%rcx -cmove %r9,%r8 -movq %rsi,48(%rdi) -movq %rdx,128(%rdi) -movq %rcx,56(%rdi) -movq %r8,136(%rdi) -movq 64(%rdi),%rsi -movq 144(%rdi),%rdx -movq 72(%rdi),%rcx -movq 152(%rdi),%r8 -mov %rsi,%r9 -cmove %rdx,%rsi -cmove %r9,%rdx -mov %rcx,%r9 -cmove %r8,%rcx -cmove %r9,%r8 -movq %rsi,64(%rdi) -movq %rdx,144(%rdi) -movq %rcx,72(%rdi) -movq %r8,152(%rdi) -mov %rdi,%rax -mov %rsi,%rdx +subq $1,%rsi +notq %rsi +movq %rsi,%xmm15 +pshufd $0x44,%xmm15,%xmm15 +movdqu 0(%rdi),%xmm0 +movdqu 16(%rdi),%xmm2 +movdqu 32(%rdi),%xmm4 +movdqu 48(%rdi),%xmm6 +movdqu 64(%rdi),%xmm8 +movdqu 80(%rdi),%xmm1 +movdqu 96(%rdi),%xmm3 +movdqu 112(%rdi),%xmm5 +movdqu 128(%rdi),%xmm7 +movdqu 144(%rdi),%xmm9 +movdqa %xmm1,%xmm10 +movdqa %xmm3,%xmm11 +movdqa %xmm5,%xmm12 +movdqa %xmm7,%xmm13 +movdqa %xmm9,%xmm14 +pxor %xmm0,%xmm10 +pxor %xmm2,%xmm11 +pxor %xmm4,%xmm12 +pxor %xmm6,%xmm13 +pxor %xmm8,%xmm14 +pand %xmm15,%xmm10 +pand %xmm15,%xmm11 +pand %xmm15,%xmm12 +pand %xmm15,%xmm13 
+pand %xmm15,%xmm14 +pxor %xmm10,%xmm0 +pxor %xmm10,%xmm1 +pxor %xmm11,%xmm2 +pxor %xmm11,%xmm3 +pxor %xmm12,%xmm4 +pxor %xmm12,%xmm5 +pxor %xmm13,%xmm6 +pxor %xmm13,%xmm7 +pxor %xmm14,%xmm8 +pxor %xmm14,%xmm9 +movdqu %xmm0,0(%rdi) +movdqu %xmm2,16(%rdi) +movdqu %xmm4,32(%rdi) +movdqu %xmm6,48(%rdi) +movdqu %xmm8,64(%rdi) +movdqu %xmm1,80(%rdi) +movdqu %xmm3,96(%rdi) +movdqu %xmm5,112(%rdi) +movdqu %xmm7,128(%rdi) +movdqu %xmm9,144(%rdi) ret .cfi_endproc