Speed up constant-time base64 decoding.

I was inspired to look at this again recently and noticed we could do a
bit better. Instead of a tower of selects, rely on all the cases being
mutually exclusive and use the ret |= mask & value formulation without
loss in clarity. We do need to fix up the invalid case slightly, but
since that computation is mostly independent, I'm guessing the CPU and
compiler are able to schedule it effectively.

Before:
Did 251000 base64 decode operations in 2002569us (159.4 MB/sec)
After:
Did 346000 base64 decode operations in 2005426us (219.5 MB/sec) [+37.7%]

Change-Id: I542167202fd4e94c93dd5a2519a97bc388072c89
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/49525
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/base64/base64.c b/crypto/base64/base64.c
index 349452d..3d92059 100644
--- a/crypto/base64/base64.c
+++ b/crypto/base64/base64.c
@@ -265,14 +265,17 @@
   const uint8_t is_slash = constant_time_eq_8(a, '/');
   const uint8_t is_equals = constant_time_eq_8(a, '=');
 
-  uint8_t ret = 0xff;  // 0xff signals invalid.
-  ret = constant_time_select_8(is_upper, a - 'A', ret);       // [0,26)
-  ret = constant_time_select_8(is_lower, a - 'a' + 26, ret);  // [26,52)
-  ret = constant_time_select_8(is_digit, a - '0' + 52, ret);  // [52,62)
-  ret = constant_time_select_8(is_plus, 62, ret);
-  ret = constant_time_select_8(is_slash, 63, ret);
-  // Padding maps to zero, to be further handled by the caller.
-  ret = constant_time_select_8(is_equals, 0, ret);
+  uint8_t ret = 0;
+  ret |= is_upper & (a - 'A');       // [0,26)
+  ret |= is_lower & (a - 'a' + 26);  // [26,52)
+  ret |= is_digit & (a - '0' + 52);  // [52,62)
+  ret |= is_plus & 62;
+  ret |= is_slash & 63;
+  // Invalid inputs, 'A', and '=' have all been mapped to zero. Map invalid
+  // inputs to 0xff. Note '=' is padding and handled separately by the caller.
+  const uint8_t is_valid =
+      is_upper | is_lower | is_digit | is_plus | is_slash | is_equals;
+  ret |= ~is_valid;
   return ret;
 }
 
diff --git a/tool/speed.cc b/tool/speed.cc
index 613e630..b91a4ce 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -26,6 +26,7 @@
 
 #include <openssl/aead.h>
 #include <openssl/aes.h>
+#include <openssl/base64.h>
 #include <openssl/bn.h>
 #include <openssl/curve25519.h>
 #include <openssl/crypto.h>
@@ -992,6 +993,48 @@
   return true;
 }
 
+static bool SpeedBase64(const std::string &selected) {
+  if (!selected.empty() && selected.find("base64") == std::string::npos) {
+    return true;  // Non-empty filter lacking "base64": skip as success.
+  }
+  // Base64 text that the timed lambda below decodes on every invocation.
+  static const char kInput[] =
+    "MIIDtTCCAp2gAwIBAgIJALW2IrlaBKUhMA0GCSqGSIb3DQEBCwUAMEUxCzAJBgNV"
+    "BAYTAkFVMRMwEQYDVQQIEwpTb21lLVN0YXRlMSEwHwYDVQQKExhJbnRlcm5ldCBX"
+    "aWRnaXRzIFB0eSBMdGQwHhcNMTYwNzA5MDQzODA5WhcNMTYwODA4MDQzODA5WjBF"
+    "MQswCQYDVQQGEwJBVTETMBEGA1UECBMKU29tZS1TdGF0ZTEhMB8GA1UEChMYSW50"
+    "ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB"
+    "CgKCAQEAugvahBkSAUF1fC49vb1bvlPrcl80kop1iLpiuYoz4Qptwy57+EWssZBc"
+    "HprZ5BkWf6PeGZ7F5AX1PyJbGHZLqvMCvViP6pd4MFox/igESISEHEixoiXCzepB"
+    "rhtp5UQSjHD4D4hKtgdMgVxX+LRtwgW3mnu/vBu7rzpr/DS8io99p3lqZ1Aky+aN"
+    "lcMj6MYy8U+YFEevb/V0lRY9oqwmW7BHnXikm/vi6sjIS350U8zb/mRzYeIs2R65"
+    "LUduTL50+UMgat9ocewI2dv8aO9Dph+8NdGtg8LFYyTTHcUxJoMr1PTOgnmET19W"
+    "JH4PrFwk7ZE1QJQQ1L4iKmPeQistuQIDAQABo4GnMIGkMB0GA1UdDgQWBBT5m6Vv"
+    "zYjVYHG30iBE+j2XDhUE8jB1BgNVHSMEbjBsgBT5m6VvzYjVYHG30iBE+j2XDhUE"
+    "8qFJpEcwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgTClNvbWUtU3RhdGUxITAfBgNV"
+    "BAoTGEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZIIJALW2IrlaBKUhMAwGA1UdEwQF"
+    "MAMBAf8wDQYJKoZIhvcNAQELBQADggEBAD7Jg68SArYWlcoHfZAB90Pmyrt5H6D8"
+    "LRi+W2Ri1fBNxREELnezWJ2scjl4UMcsKYp4Pi950gVN+62IgrImcCNvtb5I1Cfy"
+    "/MNNur9ffas6X334D0hYVIQTePyFk3umI+2mJQrtZZyMPIKSY/sYGQHhGGX6wGK+"
+    "GO/og0PQk/Vu6D+GU2XRnDV0YZg1lsAsHd21XryK6fDmNkEMwbIWrts4xc7scRrG"
+    "HWy+iMf6/7p/Ak/SIicM4XSwmlQ8pPxAZPr+E2LoVd9pMpWUwpW2UbtO5wsGTrY5"
+    "sO45tFNN/y+jtUheB1C2ijObG/tXELaiyCdM+S/waeuv0MXtI4xnn1A=";
+  // |out| is sized to the input length and shared by every timed decode call.
+  std::vector<uint8_t> out(strlen(kInput));
+  size_t len;
+  TimeResults results;
+  if (!TimeFunction(&results, [&]() -> bool {
+        return EVP_DecodeBase64(out.data(), &len, out.size(),
+                                reinterpret_cast<const uint8_t *>(kInput),
+                                strlen(kInput));
+      })) {
+    fprintf(stderr, "base64 decode failed.\n");
+    return false;
+  }
+  results.PrintWithBytes("base64 decode", strlen(kInput));
+  return true;
+}
+
 static TRUST_TOKEN_PRETOKEN *trust_token_pretoken_dup(
     TRUST_TOKEN_PRETOKEN *in) {
   TRUST_TOKEN_PRETOKEN *out =
@@ -1390,7 +1433,8 @@
       !SpeedTrustToken("TrustToken-Exp2PMB-Batch1",
                        TRUST_TOKEN_experiment_v2_pmb(), 1, selected) ||
       !SpeedTrustToken("TrustToken-Exp2PMB-Batch10",
-                       TRUST_TOKEN_experiment_v2_pmb(), 10, selected)) {
+                       TRUST_TOKEN_experiment_v2_pmb(), 10, selected) ||
+      !SpeedBase64(selected)) {
     return false;
   }
 #if defined(BORINGSSL_FIPS)