x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.

The changes to the assembly files are synced from upstream's
64d92d74985ebb3d0be58a9718f9e080a14a8e7f. cpu-intel.c is translated to C
from that commit and d84df594404ebbd71d21fec5526178d935e4d88d.

Change-Id: I02c8f83aa4780df301c21f011ef2d8d8300e2f2a
Reviewed-on: https://boringssl-review.googlesource.com/18411
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl
index 5ab6f87..6b2065e 100755
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -24,7 +24,7 @@
 #
 # Performance in cycles per byte out of large buffer.
 #
-#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    8xAVX2
+#		IALU/gcc 4.8(i)	1xSSSE3/SSE2	4xSSSE3	    NxAVX(v)
 #
 # P4		9.48/+99%	-/22.7(ii)	-
 # Core2		7.83/+55%	7.90/8.08	4.35
@@ -32,11 +32,13 @@
 # Sandy Bridge	8.31/+42%	5.45/6.76	2.72
 # Ivy Bridge	6.71/+46%	5.40/6.49	2.41
 # Haswell	5.92/+43%	5.20/6.45	2.42	    1.23
-# Skylake	5.87/+39%	4.70/-		2.31	    1.19
+# Skylake[-X]	5.87/+39%	4.70/-		2.31	    1.19[0.57]
 # Silvermont	12.0/+33%	7.75/7.40	7.03(iii)
+# Knights L	11.7/-		-		9.60(iii)   0.80
 # Goldmont	10.6/+17%	5.10/-		3.28
 # Sledgehammer	7.28/+52%	-/14.2(ii)	-
 # Bulldozer	9.66/+28%	9.85/11.1	3.06(iv)
+# Ryzen		5.96/+50%	5.19/-		2.40        2.09
 # VIA Nano	10.5/+46%	6.72/8.60	6.05
 #
 # (i)	compared to older gcc 3.x one can observe >2x improvement on
diff --git a/crypto/cpu-intel.c b/crypto/cpu-intel.c
index 92a8eff..ef327df 100644
--- a/crypto/cpu-intel.c
+++ b/crypto/cpu-intel.c
@@ -207,6 +207,14 @@
   /* Reserved bit #30 is repurposed to signal an Intel CPU. */
   if (is_intel) {
     edx |= (1 << 30);
+
+    /* Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables
+     * some Silvermont-specific codepaths which perform better. See OpenSSL
+     * commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */
+    if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ ||
+        (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) {
+      ecx &= ~(1 << 26);
+    }
   } else {
     edx &= ~(1 << 30);
   }
@@ -243,6 +251,12 @@
     extended_features &= ~(1 << 16);
   }
 
+  /* Disable ADX instructions on Knights Landing. See OpenSSL commit
+   * 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. */
+  if ((ecx & (1 << 26)) == 0) {
+    extended_features &= ~(1 << 19);
+  }
+
   OPENSSL_ia32cap_P[0] = edx;
   OPENSSL_ia32cap_P[1] = ecx;
   OPENSSL_ia32cap_P[2] = extended_features;
diff --git a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
index 4ad0fb1..a9b3151 100644
--- a/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
+++ b/crypto/fipsmodule/aes/asm/aesni-x86_64.pl
@@ -179,8 +179,10 @@
 # Haswell	4.44/0.63	0.63	0.73	0.63	0.70
 # Skylake	2.62/0.63	0.63	0.63	0.63
 # Silvermont	5.75/3.54	3.56	4.12	3.87(*)	4.11
+# Knights L	2.54/0.77	0.78	0.85	-	1.50
 # Goldmont	3.82/1.26	1.26	1.29	1.29	1.50
 # Bulldozer	5.77/0.70	0.72	0.90	0.70	0.95
+# Ryzen		2.71/0.35	0.35	0.44	0.38	0.49
 #
 # (*)	Atom Silvermont ECB result is suboptimal because of penalties
 #	incurred by operations on %xmm8-15. As ECB is not considered
diff --git a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
index 57a6a8d..dd6657b 100644
--- a/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/aesni-gcm-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -28,6 +35,8 @@
 # Applications using the EVP interface will observe a few percent
 # worse performance.]
 #
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
 
diff --git a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
index 1778ac0..e6dd041 100644
--- a/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -67,6 +74,7 @@
 # Skylake	0.44(+110%)(if system doesn't support AVX)
 # Bulldozer	1.49(+27%)
 # Silvermont	2.88(+13%)
+# Knights L	2.12(-)    (if system doesn't support AVX)
 # Goldmont	1.08(+24%)
 
 # March 2013
@@ -79,6 +87,8 @@
 # it performs in 0.41 cycles per byte on Haswell processor, in
 # 0.29 on Broadwell, and in 0.36 on Skylake.
 #
+# Knights Landing achieves 1.09 cpb.
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
 $flavour = shift;
diff --git a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
index b269e84..f5bc2e6 100755
--- a/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha1-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -75,9 +82,11 @@
 # Haswell	5.45		4.15/+31%	3.57/+53%
 # Skylake	5.18		4.06/+28%	3.54/+46%
 # Bulldozer	9.11		5.95/+53%
+# Ryzen		4.75		3.80/+24%	1.93/+150%(**)
 # VIA Nano	9.32		7.15/+30%
 # Atom		10.3		9.17/+12%
 # Silvermont	13.1(*)		9.37/+40%
+# Knights L	13.2(*)		9.68/+36%	8.30/+59%
 # Goldmont	8.13		6.42/+27%	1.70/+380%(**)
 #
 # (*)	obviously suboptimal result, nothing was done about it,
@@ -537,7 +546,7 @@
     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 }
 
-sub Xupdate_ssse3_16_31()		# recall that $Xi starts wtih 4
+sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
@@ -1458,7 +1467,7 @@
 	)
 }
 
-sub Xupdate_avx2_16_31()		# recall that $Xi starts wtih 4
+sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
 { use integer;
   my $body = shift;
   my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
index 7ad7491..e62ad75 100755
--- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
+++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@@ -88,9 +95,11 @@
 # Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
 # Skylake	11.4	9.03(+26%)  7.70(+48%)      7.25    5.20(+40%)
 # Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
+# Ryzen		11.0	9.02(+22%)  2.05(+440%)     7.05    5.67(+20%)
 # VIA Nano	23.0	16.5(+39%)  -		    14.7    -
 # Atom		23.0	18.9(+22%)  -		    14.7    -
 # Silvermont	27.4	20.6(+33%)  -               17.5    -
+# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
 # Goldmont	18.9	14.3(+32%)  4.16(+350%)     12.0    -
 #
 # (*)	whichever best applicable, including SHAEXT;
@@ -311,7 +320,6 @@
 	mov	$SZ*5($ctx),$F
 	mov	$SZ*6($ctx),$G
 	mov	$SZ*7($ctx),$H
-
 	jmp	.Lloop
 
 .align	16