Remove the arch-specific HOST_c2l/HOST_l2c implementations.

These do not appear to have much discernable effect on performance. Three
comparison runs:

Before:
Did 5414000 SHA-1 (16 bytes) operations in 1000009us (5413951.3 ops/sec): 86.6 MB/s
Did 1607000 SHA-1 (256 bytes) operations in 1000403us (1606352.6 ops/sec): 411.2 MB/s
Did 70000 SHA-1 (8192 bytes) operations in 1014426us (69004.5 ops/sec): 565.3 MB/s
Did 2991000 SHA-256 (16 bytes) operations in 1000204us (2990390.0 ops/sec): 47.8 MB/s
Did 741000 SHA-256 (256 bytes) operations in 1000371us (740725.2 ops/sec): 189.6 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019327us (30412.2 ops/sec): 249.1 MB/s
Did 2340000 SHA-512 (16 bytes) operations in 1000312us (2339270.1 ops/sec): 37.4 MB/s
Did 880000 SHA-512 (256 bytes) operations in 1000879us (879227.2 ops/sec): 225.1 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1013355us (43420.1 ops/sec): 355.7 MB/s
After:
Did 5259000 SHA-1 (16 bytes) operations in 1000013us (5258931.6 ops/sec): 84.1 MB/s
Did 1547000 SHA-1 (256 bytes) operations in 1000011us (1546983.0 ops/sec): 396.0 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1001089us (68924.9 ops/sec): 564.6 MB/s
Did 2984000 SHA-256 (16 bytes) operations in 1000207us (2983382.4 ops/sec): 47.7 MB/s
Did 734000 SHA-256 (256 bytes) operations in 1000317us (733767.4 ops/sec): 187.8 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1021065us (30360.5 ops/sec): 248.7 MB/s
Did 2324000 SHA-512 (16 bytes) operations in 1000116us (2323730.4 ops/sec): 37.2 MB/s
Did 828000 SHA-512 (256 bytes) operations in 1001046us (827134.8 ops/sec): 211.7 MB/s
Did 43000 SHA-512 (8192 bytes) operations in 1003381us (42855.1 ops/sec): 351.1 MB/s

---

Before:
Did 5415000 SHA-1 (16 bytes) operations in 1000055us (5414702.2 ops/sec): 86.6 MB/s
Did 1604000 SHA-1 (256 bytes) operations in 1000524us (1603159.9 ops/sec): 410.4 MB/s
Did 71000 SHA-1 (8192 bytes) operations in 1007686us (70458.5 ops/sec): 577.2 MB/s
Did 2984000 SHA-256 (16 bytes) operations in 1000472us (2982592.2 ops/sec): 47.7 MB/s
Did 738000 SHA-256 (256 bytes) operations in 1000885us (737347.4 ops/sec): 188.8 MB/s
Did 30000 SHA-256 (8192 bytes) operations in 1020475us (29398.1 ops/sec): 240.8 MB/s
Did 2297000 SHA-512 (16 bytes) operations in 1000391us (2296102.2 ops/sec): 36.7 MB/s
Did 882000 SHA-512 (256 bytes) operations in 1000389us (881657.0 ops/sec): 225.7 MB/s
Did 43000 SHA-512 (8192 bytes) operations in 1001313us (42943.6 ops/sec): 351.8 MB/s
After:
Did 5228000 SHA-1 (16 bytes) operations in 1000035us (5227817.0 ops/sec): 83.6 MB/s
Did 1575000 SHA-1 (256 bytes) operations in 1000410us (1574354.5 ops/sec): 403.0 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1004180us (68712.8 ops/sec): 562.9 MB/s
Did 2884000 SHA-256 (16 bytes) operations in 1000093us (2883731.8 ops/sec): 46.1 MB/s
Did 718000 SHA-256 (256 bytes) operations in 1000413us (717703.6 ops/sec): 183.7 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1030257us (30089.6 ops/sec): 246.5 MB/s
Did 2286000 SHA-512 (16 bytes) operations in 1000172us (2285606.9 ops/sec): 36.6 MB/s
Did 979000 SHA-512 (256 bytes) operations in 1000384us (978624.2 ops/sec): 250.5 MB/s
Did 47000 SHA-512 (8192 bytes) operations in 1017846us (46175.9 ops/sec): 378.3 MB/s

---

Before:
Did 5429000 SHA-1 (16 bytes) operations in 1000104us (5428435.4 ops/sec): 86.9 MB/s
Did 1604000 SHA-1 (256 bytes) operations in 1000473us (1603241.7 ops/sec): 410.4 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1002621us (68819.6 ops/sec): 563.8 MB/s
Did 3021000 SHA-256 (16 bytes) operations in 1000152us (3020540.9 ops/sec): 48.3 MB/s
Did 735000 SHA-256 (256 bytes) operations in 1000048us (734964.7 ops/sec): 188.2 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019902us (30395.1 ops/sec): 249.0 MB/s
Did 2301000 SHA-512 (16 bytes) operations in 1000207us (2300523.8 ops/sec): 36.8 MB/s
Did 881000 SHA-512 (256 bytes) operations in 1001122us (880012.6 ops/sec): 225.3 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1015313us (43336.4 ops/sec): 355.0 MB/s
After:
Did 5264000 SHA-1 (16 bytes) operations in 1000061us (5263678.9 ops/sec): 84.2 MB/s
Did 1587000 SHA-1 (256 bytes) operations in 1000293us (1586535.1 ops/sec): 406.2 MB/s
Did 71000 SHA-1 (8192 bytes) operations in 1007587us (70465.4 ops/sec): 577.3 MB/s
Did 2967000 SHA-256 (16 bytes) operations in 1000240us (2966288.1 ops/sec): 47.5 MB/s
Did 737000 SHA-256 (256 bytes) operations in 1000874us (736356.4 ops/sec): 188.5 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019630us (30403.2 ops/sec): 249.1 MB/s
Did 2326000 SHA-512 (16 bytes) operations in 1000413us (2325039.8 ops/sec): 37.2 MB/s
Did 885000 SHA-512 (256 bytes) operations in 1000253us (884776.2 ops/sec): 226.5 MB/s
Did 44000 SHA-512 (8192 bytes) operations in 1013216us (43426.1 ops/sec): 355.7 MB/s

Change-Id: Ifd4500f4e9f41ffc0f73542141e8888b4d7f1e0b
Reviewed-on: https://boringssl-review.googlesource.com/6652
Reviewed-by: Adam Langley <alangley@gmail.com>
diff --git a/crypto/digest/md32_common.h b/crypto/digest/md32_common.h
index 9db8c54..4cf050c 100644
--- a/crypto/digest/md32_common.h
+++ b/crypto/digest/md32_common.h
@@ -140,89 +140,29 @@
 
 #if defined(DATA_ORDER_IS_BIG_ENDIAN)
 
-#if !defined(PEDANTIC) && defined(__GNUC__) && __GNUC__ >= 2 && \
-    !defined(OPENSSL_NO_ASM)
-#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-/* The first macro gives a ~30-40% performance improvement in SHA-256 compiled
- * with gcc on P4. This can only be done on x86, where unaligned data fetches
- * are possible. */
-#define HOST_c2l(c, l)                       \
-  (void)({                                   \
-    uint32_t r = *((const uint32_t *)(c));   \
-    __asm__("bswapl %0" : "=r"(r) : "0"(r)); \
-    (c) += 4;                                \
-    (l) = r;                                 \
-  })
-#define HOST_l2c(l, c)                       \
-  (void)({                                   \
-    uint32_t r = (l);                        \
-    __asm__("bswapl %0" : "=r"(r) : "0"(r)); \
-    *((uint32_t *)(c)) = r;                  \
-    (c) += 4;                                \
-    r;                                       \
-  })
-#elif defined(__aarch64__) && defined(__BYTE_ORDER__)
-#if defined(__ORDER_LITTLE_ENDIAN__) && \
-    __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-#define HOST_c2l(c, l)                                                 \
-  (void)({                                                             \
-    uint32_t r;                                                        \
-    __asm__("rev %w0, %w1" : "=r"(r) : "r"(*((const uint32_t *)(c)))); \
-    (c) += 4;                                                          \
-    (l) = r;                                                           \
-  })
-#define HOST_l2c(l, c)                                      \
-  (void)({                                                  \
-    uint32_t r;                                             \
-    __asm__("rev %w0, %w1" : "=r"(r) : "r"((uint32_t)(l))); \
-    *((uint32_t *)(c)) = r;                                 \
-    (c) += 4;                                               \
-    r;                                                      \
-  })
-#elif defined(__ORDER_BIG_ENDIAN__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
-#define HOST_c2l(c, l) (void)((l) = *((const uint32_t *)(c)), (c) += 4)
-#define HOST_l2c(l, c) (*((uint32_t *)(c)) = (l), (c) += 4, (l))
-#endif /* __aarch64__ && __BYTE_ORDER__ */
-#endif /* ARCH */
-#endif /* !PEDANTIC && GNUC && !NO_ASM */
-
-#ifndef HOST_c2l
 #define HOST_c2l(c, l)                        \
   (void)(l = (((uint32_t)(*((c)++))) << 24),  \
          l |= (((uint32_t)(*((c)++))) << 16), \
          l |= (((uint32_t)(*((c)++))) << 8), l |= (((uint32_t)(*((c)++)))))
-#endif
 
-#ifndef HOST_l2c
 #define HOST_l2c(l, c)                             \
   (void)(*((c)++) = (uint8_t)(((l) >> 24) & 0xff), \
          *((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
          *((c)++) = (uint8_t)(((l) >> 8) & 0xff),  \
          *((c)++) = (uint8_t)(((l)) & 0xff))
-#endif
 
 #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN)
 
-#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
-/* See comment in DATA_ORDER_IS_BIG_ENDIAN section. */
-#define HOST_c2l(c, l) (void)((l) = *((const uint32_t *)(c)), (c) += 4)
-#define HOST_l2c(l, c) (void)(*((uint32_t *)(c)) = (l), (c) += 4, l)
-#endif /* OPENSSL_X86 || OPENSSL_X86_64 */
-
-#ifndef HOST_c2l
 #define HOST_c2l(c, l)                                                     \
   (void)(l = (((uint32_t)(*((c)++)))), l |= (((uint32_t)(*((c)++))) << 8), \
          l |= (((uint32_t)(*((c)++))) << 16),                              \
          l |= (((uint32_t)(*((c)++))) << 24))
-#endif
 
-#ifndef HOST_l2c
 #define HOST_l2c(l, c)                             \
   (void)(*((c)++) = (uint8_t)(((l)) & 0xff),       \
          *((c)++) = (uint8_t)(((l) >> 8) & 0xff),  \
          *((c)++) = (uint8_t)(((l) >> 16) & 0xff), \
          *((c)++) = (uint8_t)(((l) >> 24) & 0xff))
-#endif
 
 #endif /* DATA_ORDER */