Improve crypto/digest/md32_common.h mechanism.

The documentation in md32_common.h is now (more) correct with respect
to the most important details of the layout of |HASH_CTX|. The
documentation explaining why sha512.c doesn't use md32_common.h is now
more accurate as well.

Before, the C implementations of HASH_BLOCK_DATA_ORDER took a pointer
to the |HASH_CTX| and the assembly language implementations took a
pointer to the hash state |h| member of |HASH_CTX|. (This worked
because |h| is always the first member of |HASH_CTX|.) Now, the C
implementations take a pointer directly to |h| too.

The definitions of |MD4_CTX|, |MD5_CTX|, and |SHA1_CTX| were changed to
be consistent with |SHA256_CTX| and |SHA512_CTX| in storing the hash
state in an array. This will break source compatibility with any
external code that accesses the hash state directly, but will not
affect binary compatibility.

The second parameter of |HASH_BLOCK_DATA_ORDER| is now of type
|const uint8_t *|; previously it was |void *| and all implementations
had a |uint8_t *data| variable to access it as an array of bytes.

This change paves the way for future refactorings such as automatically
generating the |*_Init| functions and/or sharing one I-U-F
implementation across all digest algorithms.

Change-Id: I6e9dd09ff057c67941021d324a4fa1d39f58b0db
Reviewed-on: https://boringssl-review.googlesource.com/6405
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/cipher/tls_cbc.c b/crypto/cipher/tls_cbc.c
index 8bca2f3..c541db3 100644
--- a/crypto/cipher/tls_cbc.c
+++ b/crypto/cipher/tls_cbc.c
@@ -229,11 +229,11 @@
  * typically does. */
 static void tls1_sha1_final_raw(void *ctx, uint8_t *md_out) {
   SHA_CTX *sha1 = ctx;
-  u32toBE(sha1->h0, md_out);
-  u32toBE(sha1->h1, md_out);
-  u32toBE(sha1->h2, md_out);
-  u32toBE(sha1->h3, md_out);
-  u32toBE(sha1->h4, md_out);
+  u32toBE(sha1->h[0], md_out);
+  u32toBE(sha1->h[1], md_out);
+  u32toBE(sha1->h[2], md_out);
+  u32toBE(sha1->h[3], md_out);
+  u32toBE(sha1->h[4], md_out);
 }
 #define LARGEST_DIGEST_CTX SHA_CTX
 
diff --git a/crypto/digest/md32_common.h b/crypto/digest/md32_common.h
index 14607fb..d213476 100644
--- a/crypto/digest/md32_common.h
+++ b/crypto/digest/md32_common.h
@@ -58,49 +58,54 @@
 
 #define asm __asm__
 
-/* This is a generic 32 bit "collector" for message digest algorithms.
- * Whenever needed it collects input character stream into chunks of
- * 32 bit values and invokes a block function that performs actual hash
- * calculations.
+/* This is a generic 32-bit "collector" for message digest algorithms. It
+ * collects input character stream into chunks of 32-bit values and invokes the
+ * block function that performs the actual hash calculations. To make use of
+ * this mechanism, the following macros must be defined before including
+ * md32_common.h.
  *
- * Porting guide.
+ * One of |DATA_ORDER_IS_BIG_ENDIAN| or |DATA_ORDER_IS_LITTLE_ENDIAN| must be
+ * defined to specify the byte order of the input stream.
  *
- * Obligatory macros:
+ * |HASH_CBLOCK| must be defined as the integer block size, in bytes.
  *
- * DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN
- *	this macro defines byte order of input stream.
- * HASH_CBLOCK
- *	size of a unit chunk HASH_BLOCK operates on.
- * HASH_LONG
- *	has to be at least 32 bit wide.
- * HASH_CTX
- *	context structure that at least contains following
- *	members:
- *		typedef struct {
- *			...
- *			HASH_LONG	Nl,Nh;
- *			either {
- *			HASH_LONG	data[HASH_LBLOCK];
- *			unsigned char	data[HASH_CBLOCK];
- *			};
- *			unsigned int	num;
- *			...
- *			} HASH_CTX;
- *	data[] vector is expected to be zeroed upon first call to
- *	HASH_UPDATE.
- * HASH_UPDATE
- *	name of "Update" function, implemented here.
- * HASH_TRANSFORM
- *	name of "Transform" function, implemented here.
- * HASH_FINAL
- *	name of "Final" function, implemented here.
- * HASH_BLOCK_DATA_ORDER
- *	name of "block" function capable of treating *unaligned* input
- *	message in original (data) byte order, implemented externally.
- * HASH_MAKE_STRING
- *	macro convering context variables to an ASCII hash string.
+ * |HASH_CTX| must be defined as the name of the context structure, which must
+ * have at least the following members:
  *
- *					<appro@fy.chalmers.se>
+ *     typedef struct <name>_state_st {
+ *       uint32_t h[<chaining length> / sizeof(uint32_t)];
+ *       uint32_t Nl,Nh;
+ *       uint32_t data[HASH_CBLOCK / sizeof(uint32_t)];
+ *       unsigned int num;
+ *       ...
+ *     } <NAME>_CTX;
+ *
+ * <chaining length> is the output length of the hash in bytes, before
+ * any truncation (e.g. 32 for SHA-224 and SHA-256, 64 for SHA-384 and SHA-512).
+ *
+ * |HASH_UPDATE| must be defined as the name of the "Update" function to
+ * generate.
+ *
+ * |HASH_TRANSFORM| must be defined as the name of the "Transform"
+ * function to generate.
+ *
+ * |HASH_FINAL| must be defined as the name of the "Final" function to generate.
+ *
+ * |HASH_BLOCK_DATA_ORDER| must be defined as the name of the "Block" function.
+ * That function must be implemented manually. It must be capable of operating
+ * on *unaligned* input data in its original (data) byte order. It must have
+ * this signature:
+ *
+ *     void HASH_BLOCK_DATA_ORDER(uint32_t *state, const uint8_t *data,
+ *                                size_t num);
+ *
+ * It must update the hash state |state| with |num| blocks of data from |data|,
+ * where each block is |HASH_CBLOCK| bytes; i.e. |data| points to an array of
+ * |HASH_CBLOCK * num| bytes. |state| points to the |h| member of a |HASH_CTX|,
+ * and so will have |<chaining length> / sizeof(uint32_t)| elements.
+ *
+ * |HASH_MAKE_STRING(c, s)| must be defined as a block statement that converts
+ * the hash state |c->h| into the output byte order, storing the result in |s|.
  */
 
 #if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN)
@@ -110,9 +115,6 @@
 #ifndef HASH_CBLOCK
 #error "HASH_CBLOCK must be defined!"
 #endif
-#ifndef HASH_LONG
-#error "HASH_LONG must be defined!"
-#endif
 #ifndef HASH_CTX
 #error "HASH_CTX must be defined!"
 #endif
@@ -243,17 +245,17 @@
 	{
 	const uint8_t *data=data_;
 	uint8_t *p;
-	HASH_LONG l;
+	uint32_t l;
 	size_t n;
 
 	if (len==0) return 1;
 
-	l=(c->Nl+(((HASH_LONG)len)<<3))&0xffffffffUL;
+	l=(c->Nl+(((uint32_t)len)<<3))&0xffffffffUL;
 	/* 95-05-24 eay Fixed a bug with the overflow handling, thanks to
 	 * Wei Dai <weidai@eskimo.com> for pointing it out. */
 	if (l < c->Nl) /* overflow */
 		c->Nh++;
-	c->Nh+=(HASH_LONG)(len>>29);	/* might cause compiler warning on 16-bit */
+	c->Nh+=(uint32_t)(len>>29);	/* might cause compiler warning on 16-bit */
 	c->Nl=l;
 
 	n = c->num;
@@ -264,7 +266,7 @@
 		if (len >= HASH_CBLOCK || len+n >= HASH_CBLOCK)
 			{
 			memcpy (p+n,data,HASH_CBLOCK-n);
-			HASH_BLOCK_DATA_ORDER (c,p,1);
+			HASH_BLOCK_DATA_ORDER (c->h,p,1);
 			n      = HASH_CBLOCK-n;
 			data  += n;
 			len   -= n;
@@ -282,7 +284,7 @@
 	n = len/HASH_CBLOCK;
 	if (n > 0)
 		{
-		HASH_BLOCK_DATA_ORDER (c,data,n);
+		HASH_BLOCK_DATA_ORDER (c->h,data,n);
 		n    *= HASH_CBLOCK;
 		data += n;
 		len  -= n;
@@ -300,7 +302,7 @@
 
 void HASH_TRANSFORM (HASH_CTX *c, const uint8_t *data)
 	{
-	HASH_BLOCK_DATA_ORDER (c,data,1);
+	HASH_BLOCK_DATA_ORDER (c->h,data,1);
 	}
 
 
@@ -316,7 +318,7 @@
 		{
 		memset (p+n,0,HASH_CBLOCK-n);
 		n=0;
-		HASH_BLOCK_DATA_ORDER (c,p,1);
+		HASH_BLOCK_DATA_ORDER (c->h,p,1);
 		}
 	memset (p+n,0,HASH_CBLOCK-8-n);
 
@@ -329,7 +331,7 @@
 	(void)HOST_l2c(c->Nh,p);
 #endif
 	p -= HASH_CBLOCK;
-	HASH_BLOCK_DATA_ORDER (c,p,1);
+	HASH_BLOCK_DATA_ORDER (c->h,p,1);
 	c->num=0;
 	memset (p,0,HASH_CBLOCK);
 
diff --git a/crypto/md4/md4.c b/crypto/md4/md4.c
index 5ef9ae5..0a8ea1d 100644
--- a/crypto/md4/md4.c
+++ b/crypto/md4/md4.c
@@ -64,18 +64,17 @@
 
 int MD4_Init(MD4_CTX *md4) {
   memset(md4, 0, sizeof(MD4_CTX));
-  md4->A = 0x67452301UL;
-  md4->B = 0xefcdab89UL;
-  md4->C = 0x98badcfeUL;
-  md4->D = 0x10325476UL;
+  md4->h[0] = 0x67452301UL;
+  md4->h[1] = 0xefcdab89UL;
+  md4->h[2] = 0x98badcfeUL;
+  md4->h[3] = 0x10325476UL;
   return 1;
 }
 
-void md4_block_data_order (MD4_CTX *md4, const void *p, size_t num);
+void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
 
 #define DATA_ORDER_IS_LITTLE_ENDIAN
 
-#define HASH_LONG uint32_t
 #define HASH_CTX MD4_CTX
 #define HASH_CBLOCK 64
 #define HASH_UPDATE MD4_Update
@@ -84,13 +83,13 @@
 #define HASH_MAKE_STRING(c, s) \
   do {                         \
     uint32_t ll;               \
-    ll = (c)->A;               \
+    ll = (c)->h[0];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->B;               \
+    ll = (c)->h[1];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->C;               \
+    ll = (c)->h[2];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->D;               \
+    ll = (c)->h[3];            \
     (void) HOST_l2c(ll, (s));  \
   } while (0)
 #define HASH_BLOCK_DATA_ORDER md4_block_data_order
@@ -122,15 +121,14 @@
     a = ROTATE(a, s);                  \
   };
 
-void md4_block_data_order(MD4_CTX *c, const void *data_, size_t num) {
-  const uint8_t *data = data_;
+void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
   uint32_t A, B, C, D, l;
   uint32_t X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15;
 
-  A = c->A;
-  B = c->B;
-  C = c->C;
-  D = c->D;
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
 
   for (; num--;) {
     HOST_c2l(data, l);
@@ -217,9 +215,9 @@
     R2(C, D, A, B, X7, 11, 0x6ED9EBA1L);
     R2(B, C, D, A, X15, 15, 0x6ED9EBA1L);
 
-    A = c->A += A;
-    B = c->B += B;
-    C = c->C += C;
-    D = c->D += D;
+    A = state[0] += A;
+    B = state[1] += B;
+    C = state[2] += C;
+    D = state[3] += D;
   }
 }
diff --git a/crypto/md5/md5.c b/crypto/md5/md5.c
index 6ad8d12..f27e62d 100644
--- a/crypto/md5/md5.c
+++ b/crypto/md5/md5.c
@@ -79,10 +79,10 @@
 
 int MD5_Init(MD5_CTX *md5) {
   memset(md5, 0, sizeof(MD5_CTX));
-  md5->A = 0x67452301UL;
-  md5->B = 0xefcdab89UL;
-  md5->C = 0x98badcfeUL;
-  md5->D = 0x10325476UL;
+  md5->h[0] = 0x67452301UL;
+  md5->h[1] = 0xefcdab89UL;
+  md5->h[2] = 0x98badcfeUL;
+  md5->h[3] = 0x10325476UL;
   return 1;
 }
 
@@ -93,11 +93,10 @@
 #endif
 
 
-void md5_block_data_order(MD5_CTX *md5, const void *p, size_t num);
+void md5_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
 
 #define DATA_ORDER_IS_LITTLE_ENDIAN
 
-#define HASH_LONG uint32_t
 #define HASH_CTX MD5_CTX
 #define HASH_CBLOCK 64
 #define HASH_UPDATE MD5_Update
@@ -106,13 +105,13 @@
 #define HASH_MAKE_STRING(c, s) \
   do {                         \
     uint32_t ll;               \
-    ll = (c)->A;               \
+    ll = (c)->h[0];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->B;               \
+    ll = (c)->h[1];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->C;               \
+    ll = (c)->h[2];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->D;               \
+    ll = (c)->h[3];            \
     (void) HOST_l2c(ll, (s));  \
   } while (0)
 #define HASH_BLOCK_DATA_ORDER md5_block_data_order
@@ -152,17 +151,16 @@
 #ifdef X
 #undef X
 #endif
-void md5_block_data_order(MD5_CTX *md5, const void *in_data, size_t num) {
-  const uint8_t *data = in_data;
+void md5_block_data_order(uint32_t *state, const uint8_t *data, size_t num) {
   uint32_t A, B, C, D, l;
   uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12,
       XX13, XX14, XX15;
 #define X(i) XX##i
 
-  A = md5->A;
-  B = md5->B;
-  C = md5->C;
-  D = md5->D;
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
 
   for (; num--;) {
     HOST_c2l(data, l);
@@ -266,10 +264,10 @@
     R3(C, D, A, B, X(2), 15, 0x2ad7d2bbL);
     R3(B, C, D, A, X(9), 21, 0xeb86d391L);
 
-    A = md5->A += A;
-    B = md5->B += B;
-    C = md5->C += C;
-    D = md5->D += D;
+    A = state[0] += A;
+    B = state[1] += B;
+    C = state[2] += C;
+    D = state[3] += D;
   }
 }
 #endif
diff --git a/crypto/sha/sha1.c b/crypto/sha/sha1.c
index c03e608..b3318c5 100644
--- a/crypto/sha/sha1.c
+++ b/crypto/sha/sha1.c
@@ -69,11 +69,11 @@
 
 int SHA1_Init(SHA_CTX *sha) {
   memset(sha, 0, sizeof(SHA_CTX));
-  sha->h0 = 0x67452301UL;
-  sha->h1 = 0xefcdab89UL;
-  sha->h2 = 0x98badcfeUL;
-  sha->h3 = 0x10325476UL;
-  sha->h4 = 0xc3d2e1f0UL;
+  sha->h[0] = 0x67452301UL;
+  sha->h[1] = 0xefcdab89UL;
+  sha->h[2] = 0x98badcfeUL;
+  sha->h[3] = 0x10325476UL;
+  sha->h[4] = 0xc3d2e1f0UL;
   return 1;
 }
 
@@ -96,21 +96,20 @@
 
 #define DATA_ORDER_IS_BIG_ENDIAN
 
-#define HASH_LONG               uint32_t
 #define HASH_CTX                SHA_CTX
 #define HASH_CBLOCK             64
 #define HASH_MAKE_STRING(c, s) \
   do {                         \
     uint32_t ll;               \
-    ll = (c)->h0;              \
+    ll = (c)->h[0];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->h1;              \
+    ll = (c)->h[1];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->h2;              \
+    ll = (c)->h[2];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->h3;              \
+    ll = (c)->h[3];            \
     (void) HOST_l2c(ll, (s));  \
-    ll = (c)->h4;              \
+    ll = (c)->h[4];            \
     (void) HOST_l2c(ll, (s));  \
   } while (0)
 
@@ -124,7 +123,7 @@
 #ifndef SHA1_ASM
 static
 #endif
-void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
+void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num);
 
 #include "../digest/md32_common.h"
 
@@ -186,17 +185,17 @@
 #define X(i)	XX##i
 
 #if !defined(SHA1_ASM)
-static void HASH_BLOCK_DATA_ORDER(SHA_CTX *c, const void *p, size_t num) {
-  const uint8_t *data = p;
+static void sha1_block_data_order(uint32_t *state, const uint8_t *data,
+                                  size_t num) {
   register uint32_t A, B, C, D, E, T, l;
   uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10,
       XX11, XX12, XX13, XX14, XX15;
 
-  A = c->h0;
-  B = c->h1;
-  C = c->h2;
-  D = c->h3;
-  E = c->h4;
+  A = state[0];
+  B = state[1];
+  C = state[2];
+  D = state[3];
+  E = state[4];
 
   for (;;) {
     const union {
@@ -204,7 +203,7 @@
       char little;
     } is_endian = {1};
 
-    if (!is_endian.little && ((size_t)p % 4) == 0) {
+    if (!is_endian.little && ((uintptr_t)data % 4) == 0) {
       const uint32_t *W = (const uint32_t *)data;
 
       X(0) = W[0];
@@ -361,21 +360,21 @@
     BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11));
     BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12));
 
-    c->h0 = (c->h0 + E) & 0xffffffffL;
-    c->h1 = (c->h1 + T) & 0xffffffffL;
-    c->h2 = (c->h2 + A) & 0xffffffffL;
-    c->h3 = (c->h3 + B) & 0xffffffffL;
-    c->h4 = (c->h4 + C) & 0xffffffffL;
+    state[0] = (state[0] + E) & 0xffffffffL;
+    state[1] = (state[1] + T) & 0xffffffffL;
+    state[2] = (state[2] + A) & 0xffffffffL;
+    state[3] = (state[3] + B) & 0xffffffffL;
+    state[4] = (state[4] + C) & 0xffffffffL;
 
     if (--num == 0) {
       break;
     }
 
-    A = c->h0;
-    B = c->h1;
-    C = c->h2;
-    D = c->h3;
-    E = c->h4;
+    A = state[0];
+    B = state[1];
+    C = state[2];
+    D = state[3];
+    E = state[4];
   }
 }
 #endif
diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c
index 8276bbb..53480dd 100644
--- a/crypto/sha/sha256.c
+++ b/crypto/sha/sha256.c
@@ -135,7 +135,6 @@
 
 #define DATA_ORDER_IS_BIG_ENDIAN
 
-#define HASH_LONG uint32_t
 #define HASH_CTX SHA256_CTX
 #define HASH_CBLOCK 64
 
@@ -185,12 +184,12 @@
 #ifndef SHA256_ASM
 static
 #endif
-void sha256_block_data_order(SHA256_CTX *ctx, const void *in, size_t num);
+void sha256_block_data_order(uint32_t *state, const uint8_t *in, size_t num);
 
 #include "../digest/md32_common.h"
 
 #ifndef SHA256_ASM
-static const HASH_LONG K256[64] = {
+static const uint32_t K256[64] = {
     0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL,
     0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL,
     0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL,
@@ -234,29 +233,28 @@
     ROUND_00_15(i, a, b, c, d, e, f, g, h);            \
   } while (0)
 
-static void sha256_block_data_order(SHA256_CTX *ctx, const void *in,
+static void sha256_block_data_order(uint32_t *state, const uint8_t *data,
                                     size_t num) {
   uint32_t a, b, c, d, e, f, g, h, s0, s1, T1;
-  HASH_LONG X[16];
+  uint32_t X[16];
   int i;
-  const uint8_t *data = in;
   const union {
     long one;
     char little;
   } is_endian = {1};
 
   while (num--) {
-    a = ctx->h[0];
-    b = ctx->h[1];
-    c = ctx->h[2];
-    d = ctx->h[3];
-    e = ctx->h[4];
-    f = ctx->h[5];
-    g = ctx->h[6];
-    h = ctx->h[7];
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    f = state[5];
+    g = state[6];
+    h = state[7];
 
-    if (!is_endian.little && sizeof(HASH_LONG) == 4 && ((size_t)in % 4) == 0) {
-      const HASH_LONG *W = (const HASH_LONG *)data;
+    if (!is_endian.little && ((uintptr_t)data % 4) == 0) {
+      const uint32_t *W = (const uint32_t *)data;
 
       T1 = X[0] = W[0];
       ROUND_00_15(0, a, b, c, d, e, f, g, h);
@@ -293,7 +291,7 @@
 
       data += HASH_CBLOCK;
     } else {
-      HASH_LONG l;
+      uint32_t l;
 
       HOST_c2l(data, l);
       T1 = X[0] = l;
@@ -356,14 +354,14 @@
       ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X);
     }
 
-    ctx->h[0] += a;
-    ctx->h[1] += b;
-    ctx->h[2] += c;
-    ctx->h[3] += d;
-    ctx->h[4] += e;
-    ctx->h[5] += f;
-    ctx->h[6] += g;
-    ctx->h[7] += h;
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
   }
 }
 
diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c
index 57c96ab..5e77a1e 100644
--- a/crypto/sha/sha512.c
+++ b/crypto/sha/sha512.c
@@ -65,27 +65,15 @@
 
 /* IMPLEMENTATION NOTES.
  *
- * As you might have noticed 32-bit hash algorithms:
- *
- * - permit SHA_LONG to be wider than 32-bit (case on CRAY);
- * - optimized versions implement two transform functions: one operating
- *   on [aligned] data in host byte order and one - on data in input
- *   stream byte order;
- * - share common byte-order neutral collector and padding function
- *   implementations, ../md32_common.h;
- *
- * Neither of the above applies to this SHA-512 implementations. Reasons
+ * The 32-bit hash algorithms share a common byte-order neutral collector and
+ * padding function implementation that operates on unaligned data,
+ * ../digest/md32_common.h. This SHA-512 implementation does not. Reasons
  * [in reverse order] are:
  *
- * - it's the only 64-bit hash algorithm for the moment of this writing,
+ * - It's the only 64-bit hash algorithm at the time of this writing,
  *   there is no need for common collector/padding implementation [yet];
- * - by supporting only one transform function [which operates on
- *   *aligned* data in input stream byte order, big-endian in this case]
- *   we minimize burden of maintenance in two ways: a) collector/padding
- *   function is simpler; b) only one transform function to stare at;
- * - SHA_LONG64 is required to be exactly 64-bit in order to be able to
- *   apply a number of optimizations to mitigate potential performance
- *   penalties caused by previous design decision; */
+ * - By supporting only a transform function that operates on *aligned* data
+ *   the collector/padding function is simpler and easier to optimize. */
 
 #if !defined(OPENSSL_NO_ASM) &&                         \
     (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \
@@ -163,7 +151,7 @@
 #if !defined(SHA512_ASM)
 static
 #endif
-void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num);
+void sha512_block_data_order(uint64_t *state, const uint64_t *W, size_t num);
 
 
 int SHA384_Final(uint8_t *md, SHA512_CTX *sha) {
@@ -181,7 +169,7 @@
     data = c->u.p;
   }
 #endif
-  sha512_block_data_order(c, data, 1);
+  sha512_block_data_order(c->h, (uint64_t *)data, 1);
 }
 
 int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) {
@@ -213,7 +201,7 @@
       memcpy(p + c->num, data, n), c->num = 0;
       len -= n;
       data += n;
-      sha512_block_data_order(c, p, 1);
+      sha512_block_data_order(c->h, (uint64_t *)p, 1);
     }
   }
 
@@ -222,14 +210,14 @@
     if ((size_t)data % sizeof(c->u.d[0]) != 0) {
       while (len >= sizeof(c->u)) {
         memcpy(p, data, sizeof(c->u));
-        sha512_block_data_order(c, p, 1);
+        sha512_block_data_order(c->h, (uint64_t *)p, 1);
         len -= sizeof(c->u);
         data += sizeof(c->u);
       }
     } else
 #endif
     {
-      sha512_block_data_order(c, data, len / sizeof(c->u));
+      sha512_block_data_order(c->h, (uint64_t *)data, len / sizeof(c->u));
       data += len;
       len %= sizeof(c->u);
       data -= len;
@@ -253,7 +241,7 @@
   if (n > (sizeof(sha->u) - 16)) {
     memset(p + n, 0, sizeof(sha->u) - n);
     n = 0;
-    sha512_block_data_order(sha, p, 1);
+    sha512_block_data_order(sha->h, (uint64_t *)p, 1);
   }
 
   memset(p + n, 0, sizeof(sha->u) - 16 - n);
@@ -274,7 +262,7 @@
   p[sizeof(sha->u) - 15] = (uint8_t)(sha->Nh >> 48);
   p[sizeof(sha->u) - 16] = (uint8_t)(sha->Nh >> 56);
 
-  sha512_block_data_order(sha, p, 1);
+  sha512_block_data_order(sha->h, (uint64_t *)p, 1);
 
   if (md == NULL) {
     /* TODO(davidben): This NULL check is absent in other low-level hash 'final'
@@ -443,23 +431,22 @@
  * This code should give better results on 32-bit CPU with less than
  * ~24 registers, both size and performance wise...
  */
-static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
                                     size_t num) {
-  const uint64_t *W = in;
   uint64_t A, E, T;
   uint64_t X[9 + 80], *F;
   int i;
 
   while (num--) {
     F = X + 80;
-    A = ctx->h[0];
-    F[1] = ctx->h[1];
-    F[2] = ctx->h[2];
-    F[3] = ctx->h[3];
-    E = ctx->h[4];
-    F[5] = ctx->h[5];
-    F[6] = ctx->h[6];
-    F[7] = ctx->h[7];
+    A = state[0];
+    F[1] = state[1];
+    F[2] = state[2];
+    F[3] = state[3];
+    E = state[4];
+    F[5] = state[5];
+    F[6] = state[6];
+    F[7] = state[7];
 
     for (i = 0; i < 16; i++, F--) {
       T = PULL64(W[i]);
@@ -484,14 +471,14 @@
       A = T + Sigma0(A) + Maj(A, F[1], F[2]);
     }
 
-    ctx->h[0] += A;
-    ctx->h[1] += F[1];
-    ctx->h[2] += F[2];
-    ctx->h[3] += F[3];
-    ctx->h[4] += E;
-    ctx->h[5] += F[5];
-    ctx->h[6] += F[6];
-    ctx->h[7] += F[7];
+    state[0] += A;
+    state[1] += F[1];
+    state[2] += F[2];
+    state[3] += F[3];
+    state[4] += E;
+    state[5] += F[5];
+    state[6] += F[6];
+    state[7] += F[7];
 
     W += 16;
   }
@@ -517,23 +504,22 @@
     ROUND_00_15(i + j, a, b, c, d, e, f, g, h);        \
   } while (0)
 
-static void sha512_block_data_order(SHA512_CTX *ctx, const void *in,
+static void sha512_block_data_order(uint64_t *state, const uint64_t *W,
                                     size_t num) {
-  const uint64_t *W = in;
   uint64_t a, b, c, d, e, f, g, h, s0, s1, T1;
   uint64_t X[16];
   int i;
 
   while (num--) {
 
-    a = ctx->h[0];
-    b = ctx->h[1];
-    c = ctx->h[2];
-    d = ctx->h[3];
-    e = ctx->h[4];
-    f = ctx->h[5];
-    g = ctx->h[6];
-    h = ctx->h[7];
+    a = state[0];
+    b = state[1];
+    c = state[2];
+    d = state[3];
+    e = state[4];
+    f = state[5];
+    g = state[6];
+    h = state[7];
 
     T1 = X[0] = PULL64(W[0]);
     ROUND_00_15(0, a, b, c, d, e, f, g, h);
@@ -587,14 +573,14 @@
       ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X);
     }
 
-    ctx->h[0] += a;
-    ctx->h[1] += b;
-    ctx->h[2] += c;
-    ctx->h[3] += d;
-    ctx->h[4] += e;
-    ctx->h[5] += f;
-    ctx->h[6] += g;
-    ctx->h[7] += h;
+    state[0] += a;
+    state[1] += b;
+    state[2] += c;
+    state[3] += d;
+    state[4] += e;
+    state[5] += f;
+    state[6] += g;
+    state[7] += h;
 
     W += 16;
   }
diff --git a/include/openssl/md4.h b/include/openssl/md4.h
index 1db7499..e363b73 100644
--- a/include/openssl/md4.h
+++ b/include/openssl/md4.h
@@ -88,7 +88,7 @@
 OPENSSL_EXPORT void MD4_Transform(MD4_CTX *md4, const uint8_t *block);
 
 struct md4_state_st {
-  uint32_t A, B, C, D;
+  uint32_t h[4];
   uint32_t Nl, Nh;
   uint32_t data[16];
   unsigned int num;
diff --git a/include/openssl/md5.h b/include/openssl/md5.h
index 9b13922..87c3ba4 100644
--- a/include/openssl/md5.h
+++ b/include/openssl/md5.h
@@ -93,7 +93,7 @@
 OPENSSL_EXPORT void MD5_Transform(MD5_CTX *md5, const uint8_t *block);
 
 struct md5_state_st {
-  uint32_t A, B, C, D;
+  uint32_t h[4];
   uint32_t Nl, Nh;
   uint32_t data[16];
   unsigned int num;
diff --git a/include/openssl/sha.h b/include/openssl/sha.h
index ac2ab75..f4253ec 100644
--- a/include/openssl/sha.h
+++ b/include/openssl/sha.h
@@ -98,7 +98,22 @@
 OPENSSL_EXPORT void SHA1_Transform(SHA_CTX *sha, const uint8_t *block);
 
 struct sha_state_st {
-  uint32_t h0, h1, h2, h3, h4;
+#if !defined(ANDROID)
+  uint32_t h[5];
+#else
+  /* wpa_supplicant accesses |h0|..|h4| so we must support those names
+   * for compatibility with it until it can be updated. */
+  union {
+    uint32_t h[5];
+    struct {
+      uint32_t h0;
+      uint32_t h1;
+      uint32_t h2;
+      uint32_t h3;
+      uint32_t h4;
+    };
+  };
+#endif
   uint32_t Nl, Nh;
   uint32_t data[16];
   unsigned int num;