x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.
The changes to the assembly files are synced from upstream OpenSSL commit
64d92d74985ebb3d0be58a9718f9e080a14a8e7f. cpu-intel.c is translated to C
from that commit and OpenSSL commit d84df594404ebbd71d21fec5526178d935e4d88d.
Change-Id: I02c8f83aa4780df301c21f011ef2d8d8300e2f2a
Reviewed-on: https://boringssl-review.googlesource.com/18411
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
index 5ab6f87..6b2065e 100755
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -24,7 +24,7 @@
#
# Performance in cycles per byte out of large buffer.
#
-# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2
+# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v)
#
# P4 9.48/+99% -/22.7(ii) -
# Core2 7.83/+55% 7.90/8.08 4.35
@@ -32,11 +32,13 @@
# Sandy Bridge 8.31/+42% 5.45/6.76 2.72
# Ivy Bridge 6.71/+46% 5.40/6.49 2.41
# Haswell 5.92/+43% 5.20/6.45 2.42 1.23
-# Skylake 5.87/+39% 4.70/- 2.31 1.19
+# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57]
# Silvermont 12.0/+33% 7.75/7.40 7.03(iii)
+# Knights L 11.7/- - 9.60(iii) 0.80
# Goldmont 10.6/+17% 5.10/- 3.28
# Sledgehammer 7.28/+52% -/14.2(ii) -
# Bulldozer 9.66/+28% 9.85/11.1 3.06(iv)
+# Ryzen 5.96/+50% 5.19/- 2.40 2.09
# VIA Nano 10.5/+46% 6.72/8.60 6.05
#
# (i) compared to older gcc 3.x one can observe >2x improvement on
diff --git a/crypto/cpu-intel.c b/crypto/cpu-intel.c
index 92a8eff..ef327df 100644
--- a/crypto/cpu-intel.c
+++ b/crypto/cpu-intel.c
@@ -207,6 +207,14 @@
/* Reserved bit #30 is repurposed to signal an Intel CPU. */
if (is_intel) {
edx |= (1 << 30);
+
+ /* Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
+ * some Silvermont-specific codepaths which perform better. See OpenSSL
+ * commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */
+ if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
+ (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
+ ecx &= ~(1 << 26);
+ }
} else {
edx &= ~(1 << 30);
}
@@ -243,6 +251,12 @@
extended_features &= ~(1 << 16);
}
+ /* Disable ADX on Knights Landing (XSAVE bit cleared above). See OpenSSL
+ * commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */
+ if ((ecx & (1 << 26)) == 0) {
+ extended_features &= ~(1 << 19);
+ }
+
OPENSSL_ia32cap_P[0] = edx;
OPENSSL_ia32cap_P[1] = ecx;
OPENSSL_ia32cap_P[2] = extended_features;
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 4ad0fb1..a9b3151 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -179,8 +179,10 @@
# Haswell 4.44/0.63 0.63 0.73 0.63 0.70
# Skylake 2.62/0.63 0.63 0.63 0.63
# Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
+# Knights L 2.54/0.77 0.78 0.85 - 1.50
# Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
# Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
+# Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
#
# (*) Atom Silvermont ECB result is suboptimal because of penalties
# incurred by operations on %xmm8-15. As ECB is not considered
diff --git a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
index 57a6a8d..dd6657b 100644
--- a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -28,6 +35,8 @@
# Applications using the EVP interface will observe a few percent
# worse performance.]
#
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 1778ac0..e6dd041 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -67,6 +74,7 @@
# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
# Goldmont 1.08(+24%)
# March 2013
@@ -79,6 +87,8 @@
# it performs in 0.41 cycles per byte on Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
+# Knights Landing achieves 1.09 cpb.
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
$flavour = shift;
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index b269e84..f5bc2e6 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -75,9 +82,11 @@
# Haswell 5.45 4.15/+31% 3.57/+53%
# Skylake 5.18 4.06/+28% 3.54/+46%
# Bulldozer 9.11 5.95/+53%
+# Ryzen 4.75 3.80/+24% 1.93/+150%(**)
# VIA Nano 9.32 7.15/+30%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
+# Knights L 13.2(*) 9.68/+36% 8.30/+59%
# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
#
# (*) obviously suboptimal result, nothing was done about it,
@@ -537,7 +546,7 @@
$code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
-sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
@@ -1458,7 +1467,7 @@
)
}
-sub Xupdate_avx2_16_31() # recall that $Xi starts wtih 4
+sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
{ use integer;
my $body = shift;
my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 7ad7491..e62ad75 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,9 +95,11 @@
# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
+# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
# VIA Nano 23.0 16.5(+39%) - 14.7 -
# Atom 23.0 18.9(+22%) - 14.7 -
# Silvermont 27.4 20.6(+33%) - 17.5 -
+# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
#
# (*) whichever best applicable, including SHAEXT;
@@ -311,7 +320,6 @@
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
-
jmp .Lloop
.align 16