Add benchmarks for hash functions to bssl speed.

This adds SHA-1, SHA-256 and SHA-512 to the speed tool, measured over 16, 256
and 8192 byte chunks like the existing AEAD benchmarks. The SHA assembly
generators also gain a $shaext switch so the Intel SHA Extensions code paths
can be omitted (e.g. when generating code for 1.0.1-compatible builds).
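
For reference, the sketch below (not part of the patch) shows the same
EVP_DigestInit_ex / EVP_DigestUpdate / EVP_DigestFinal_ex loop that the new
SpeedHashChunk helper measures, but as a standalone program: the tool's
TimeFunction/TimeResults machinery is replaced by a std::chrono timer with an
arbitrary fixed iteration count, and the BenchHashChunk name is made up for
illustration. It assumes BoringSSL's <openssl/digest.h> and linking against
libcrypto.

  // Minimal sketch: times the same Init/Update/Final sequence that
  // SpeedHashChunk runs, but with a fixed iteration count instead of the
  // speed tool's TimeFunction helper.
  #include <chrono>
  #include <stdint.h>
  #include <stdio.h>

  #include <openssl/digest.h>

  static bool BenchHashChunk(const EVP_MD *md, const char *name,
                             size_t chunk_len, unsigned iterations) {
    uint8_t scratch[8192] = {0};
    if (chunk_len > sizeof(scratch)) {
      return false;
    }

    EVP_MD_CTX *ctx = EVP_MD_CTX_create();
    if (ctx == NULL) {
      return false;
    }

    const auto start = std::chrono::steady_clock::now();
    for (unsigned i = 0; i < iterations; i++) {
      uint8_t digest[EVP_MAX_MD_SIZE];
      unsigned md_len;
      if (!EVP_DigestInit_ex(ctx, md, NULL /* ENGINE */) ||
          !EVP_DigestUpdate(ctx, scratch, chunk_len) ||
          !EVP_DigestFinal_ex(ctx, digest, &md_len)) {
        EVP_MD_CTX_destroy(ctx);
        return false;
      }
    }
    const auto end = std::chrono::steady_clock::now();
    EVP_MD_CTX_destroy(ctx);

    const double secs = std::chrono::duration<double>(end - start).count();
    printf("%s (%zu bytes): %.1f ops/sec, %.1f MB/s\n", name, chunk_len,
           iterations / secs, iterations * chunk_len / secs / (1024 * 1024));
    return true;
  }

  int main() {
    // Same chunk sizes that SpeedHash uses in tool/speed.cc.
    static const size_t kChunkLens[] = {16, 256, 8192};
    for (size_t i = 0; i < sizeof(kChunkLens) / sizeof(kChunkLens[0]); i++) {
      if (!BenchHashChunk(EVP_sha256(), "SHA-256", kChunkLens[i], 100000)) {
        return 1;
      }
    }
    return 0;
  }
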
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index acc4f63..81252a6 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -128,6 +128,8 @@
 		`ml 2>&1` =~ /Version ([0-9]+)\./ &&
 		$1>=10);	# first version supporting AVX
 
+$shaext=$xmm;	### set to zero if compiling for 1.0.1
+
 &external_label("OPENSSL_ia32cap_P") if ($xmm);
 
 
@@ -307,7 +309,7 @@
 
 &function_begin("sha1_block_data_order");
 if ($xmm) {
-  &static_label("shaext_shortcut");
+  &static_label("shaext_shortcut")	if ($shaext);
   &static_label("ssse3_shortcut");
   &static_label("avx_shortcut")		if ($ymm);
   &static_label("K_XX_XX");
@@ -325,8 +327,10 @@
 	&mov	($C,&DWP(8,$T));
 	&test	($A,1<<24);		# check FXSR bit
 	&jz	(&label("x86"));
-	&test	($C,1<<29);		# check SHA bit
-	&jnz	(&label("shaext_shortcut"));
+	if ($shaext) {
+		&test	($C,1<<29);		# check SHA bit
+		&jnz	(&label("shaext_shortcut"));
+	}
 	if ($ymm) {
 		&and	($D,1<<28);		# mask AVX bit
 		&and	($A,1<<30);		# mask "Intel CPU" bit
@@ -405,7 +409,7 @@
 &function_end("sha1_block_data_order");
 
 if ($xmm) {
-{
+if ($shaext) {
 ######################################################################
 # Intel SHA Extensions implementation of SHA1 update function.
 #
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index ea288c1..01010cf 100644
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -107,6 +107,9 @@
 	$avx = ($1>=10) + ($1>=11);
 }
 
+$shaext=1;	### set to zero if compiling for 1.0.1
+$avx=1		if (!$shaext && $avx);
+
 open OUT,"| \"$^X\" $xlate $flavour $output";
 *STDOUT=*OUT;
 
@@ -245,7 +248,8 @@
 	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
 	test	\$`1<<9`,%r8d		# check SSSE3 bit
 	jz	.Lialu
-
+___
+$code.=<<___ if ($shaext);
 	test	\$`1<<29`,%r10d		# check SHA bit	
 	jnz	_shaext_shortcut
 ___
@@ -321,7 +325,7 @@
 	ret
 .size	sha1_block_data_order,.-sha1_block_data_order
 ___
-{{{
+if ($shaext) {{{
 ######################################################################
 # Intel SHA Extensions implementation of SHA1 update function.
 #
@@ -1956,9 +1960,13 @@
 	.rva	.LSEH_begin_sha1_block_data_order
 	.rva	.LSEH_end_sha1_block_data_order
 	.rva	.LSEH_info_sha1_block_data_order
+___
+$code.=<<___ if ($shaext);
 	.rva	.LSEH_begin_sha1_block_data_order_shaext
 	.rva	.LSEH_end_sha1_block_data_order_shaext
 	.rva	.LSEH_info_sha1_block_data_order_shaext
+___
+$code.=<<___;
 	.rva	.LSEH_begin_sha1_block_data_order_ssse3
 	.rva	.LSEH_end_sha1_block_data_order_ssse3
 	.rva	.LSEH_info_sha1_block_data_order_ssse3
diff --git a/crypto/sha/asm/sha256-586.pl b/crypto/sha/asm/sha256-586.pl
index 09648a8..ee094a9 100644
--- a/crypto/sha/asm/sha256-586.pl
+++ b/crypto/sha/asm/sha256-586.pl
@@ -82,6 +82,8 @@
 	$avx = ($1>=10) + ($1>=11);
 }
 
+$shaext=$xmm;	### set to zero if compiling for 1.0.1
+
 $unroll_after = 64*4;	# If pre-evicted from L1P cache first spin of
 			# fully unrolled loop was measured to run about
 			# 3-4x slower. If slowdown coefficient is N and
@@ -205,8 +207,8 @@
 	&jz	($unroll_after?&label("no_xmm"):&label("loop"));
 	&and	("ecx",1<<30);		# mask "Intel CPU" bit
 	&and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
-	&test	("edx",1<<29)		if ($xmm);	# check for SHA
-	&jnz	(&label("shaext"))	if ($xmm);
+	&test	("edx",1<<29)		if ($shaext);	# check for SHA
+	&jnz	(&label("shaext"))	if ($shaext);
 	&or	("ecx","ebx");
 	&and	("ecx",1<<28|1<<30);
 	&cmp	("ecx",1<<28|1<<30);
@@ -505,7 +507,7 @@
 &function_end_A();
 }
 						if (!$i386 && $xmm) {{{
-{
+if ($shaext) {
 ######################################################################
 # Intel SHA Extensions implementation of SHA256 update function.
 #
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index e2253f1..bc5620d 100644
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -123,6 +123,9 @@
 	$avx = ($1>=10) + ($1>=11);
 }
 
+$shaext=1;	### set to zero if compiling for 1.0.1
+$avx=1		if (!$shaext && $avx);
+
 open OUT,"| \"$^X\" $xlate $flavour";
 *STDOUT=*OUT;
 
@@ -259,7 +262,7 @@
 	mov	4(%r11),%r10d
 	mov	8(%r11),%r11d
 ___
-$code.=<<___ if ($SZ==4);
+$code.=<<___ if ($SZ==4 && $shaext);
 	test	\$`1<<29`,%r11d		# check for SHA
 	jnz	_shaext_shortcut
 ___
@@ -518,7 +521,7 @@
 ######################################################################
 # SIMD code paths
 #
-if ($SZ==4) {{{
+if ($SZ==4 && $shaext) {{{
 ######################################################################
 # Intel SHA Extensions implementation of SHA256 update function.
 #
@@ -2295,10 +2298,12 @@
 	.rva	.LSEH_end_$func
 	.rva	.LSEH_info_$func
 ___
-$code.=<<___ if ($SZ==4);
+$code.=<<___ if ($SZ==4 && $shaext);
 	.rva	.LSEH_begin_${func}_shaext
 	.rva	.LSEH_end_${func}_shaext
 	.rva	.LSEH_info_${func}_shaext
+___
+$code.=<<___ if ($SZ==4);
 	.rva	.LSEH_begin_${func}_ssse3
 	.rva	.LSEH_end_${func}_ssse3
 	.rva	.LSEH_info_${func}_ssse3
diff --git a/tool/speed.cc b/tool/speed.cc
index 706d4a2..176e2e2 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -22,6 +22,7 @@
 
 #include <openssl/aead.h>
 #include <openssl/bio.h>
+#include <openssl/digest.h>
 #include <openssl/obj.h>
 #include <openssl/rsa.h>
 
@@ -198,6 +199,41 @@
          SpeedAEADChunk(aead, name + " (8192 bytes)", 8192);
 }
 
+static bool SpeedHashChunk(const EVP_MD *md, const std::string &name,
+                           size_t chunk_len) {
+  EVP_MD_CTX *ctx = EVP_MD_CTX_create();
+  uint8_t scratch[8192];
+
+  if (chunk_len > sizeof(scratch)) {
+    return false;
+  }
+
+  TimeResults results;
+  if (!TimeFunction(&results, [ctx, md, chunk_len, &scratch]() -> bool {
+        uint8_t digest[EVP_MAX_MD_SIZE];
+        unsigned int md_len;
+
+        return EVP_DigestInit_ex(ctx, md, NULL /* ENGINE */) &&
+               EVP_DigestUpdate(ctx, scratch, chunk_len) &&
+               EVP_DigestFinal_ex(ctx, digest, &md_len);
+      })) {
+    fprintf(stderr, "EVP_DigestInit_ex failed.\n");
+    BIO_print_errors_fp(stderr);
+    return false;
+  }
+
+  results.PrintWithBytes(name, chunk_len);
+
+  EVP_MD_CTX_destroy(ctx);
+
+  return true;
+}
+static bool SpeedHash(const EVP_MD *md, const std::string &name) {
+  return SpeedHashChunk(md, name + " (16 bytes)", 16) &&
+         SpeedHashChunk(md, name + " (256 bytes)", 256) &&
+         SpeedHashChunk(md, name + " (8192 bytes)", 8192);
+}
+
 bool Speed(const std::vector<std::string> &args) {
   const uint8_t *inp;
 
@@ -231,7 +267,10 @@
 
   if (!SpeedAEAD(EVP_aead_aes_128_gcm(), "AES-128-GCM") ||
       !SpeedAEAD(EVP_aead_aes_256_gcm(), "AES-256-GCM") ||
-      !SpeedAEAD(EVP_aead_chacha20_poly1305(), "ChaCha20-Poly1305")) {
+      !SpeedAEAD(EVP_aead_chacha20_poly1305(), "ChaCha20-Poly1305") ||
+      !SpeedHash(EVP_sha1(), "SHA-1") ||
+      !SpeedHash(EVP_sha256(), "SHA-256") ||
+      !SpeedHash(EVP_sha512(), "SHA-512")) {
     return false;
   }