Check Unicode string encodings in crypto/asn1.

This checks validity for UTF8String, BMPString, and UniversalString.

When we detach the core types from ASN1_ITEM, this will likely also be
reshuffled around, probably into type-specific functions. But for now
just get the checks in place.

Update-Note: Invalid strings in X.509 certificates and other ASN.1
structures will now be rejected. This change is less risky than it seems
because most strings in X.509 are in X509_NAME, which already rejected
invalid instances of these string types (but not other string types)
during canonicalization. See https://crbug.com/boringssl/412 for a
discussion of that mess.

Bug: 427
Change-Id: I0d7e24dfd841703d2a8581ec4e78ed5bc3862d75
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/53225
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/asn1/asn1_test.cc b/crypto/asn1/asn1_test.cc
index 442c1ea..195d5a8 100644
--- a/crypto/asn1/asn1_test.cc
+++ b/crypto/asn1/asn1_test.cc
@@ -2175,6 +2175,75 @@
                  0x04, 0x01, 0x84, 0xb7, 0x09, 0x01, 0x00, 0x01, 0x61});
 }
 
+TEST(ASN1Test, StringEncoding) {
+  const struct {
+    ASN1_STRING *(*d2i)(ASN1_STRING **out, const uint8_t **inp, long len);
+    std::vector<uint8_t> in;
+    bool valid;
+  } kTests[] = {
+      // All OCTET STRINGs are valid.
+      {d2i_ASN1_OCTET_STRING, {0x04, 0x00}, true},
+      {d2i_ASN1_OCTET_STRING, {0x04, 0x01, 0x00}, true},
+
+      // UTF8String must be valid UTF-8.
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x00}, true},
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x01, 'a'}, true},
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x03, 0xe2, 0x98, 0x83}, true},
+      // Non-minimal, two-byte UTF-8.
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x02, 0xc0, 0x81}, false},
+      // Truncated, four-byte UTF-8.
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x03, 0xf0, 0x80, 0x80}, false},
+      // Low-surrogate value.
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x03, 0xed, 0xa0, 0x80}, false},
+      // High-surrogate value.
+      {d2i_ASN1_UTF8STRING, {0x0c, 0x03, 0xed, 0xb0, 0x81}, false},
+
+      // BMPString must be valid UCS-2.
+      {d2i_ASN1_BMPSTRING, {0x1e, 0x00}, true},
+      {d2i_ASN1_BMPSTRING, {0x1e, 0x02, 0x00, 'a'}, true},
+      // Truncated code unit.
+      {d2i_ASN1_BMPSTRING, {0x1e, 0x01, 'a'}, false},
+      // Lone surrogate.
+      {d2i_ASN1_BMPSTRING, {0x1e, 0x02, 0xd8, 0}, false},
+      // BMPString is UCS-2, not UTF-16, so surrogate pairs are also invalid.
+      {d2i_ASN1_BMPSTRING, {0x1e, 0x04, 0xd8, 0, 0xdc, 1}, false},
+
+      // UniversalString must be valid UTF-32.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x00}, true},
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x00, 0x00, 'a'}, true},
+      // Maximum code point.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x10, 0xff, 0xfd}, true},
+      // Reserved.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x10, 0xff, 0xfe}, false},
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x10, 0xff, 0xff}, false},
+      // Too high.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x11, 0x00, 0x00}, false},
+      // Surrogates are not characters.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x04, 0x00, 0x00, 0xd8, 0}, false},
+      // Truncated codepoint.
+      {d2i_ASN1_UNIVERSALSTRING, {0x1c, 0x03, 0x00, 0x00, 0x00}, false},
+
+      // We interpret T61String as Latin-1, so all inputs are valid.
+      {d2i_ASN1_T61STRING, {0x14, 0x00}, true},
+      {d2i_ASN1_T61STRING, {0x14, 0x01, 0x00}, true},
+  };
+  for (const auto& t : kTests) {
+    SCOPED_TRACE(Bytes(t.in));
+    const uint8_t *inp;
+
+    if (t.d2i != nullptr) {
+      inp = t.in.data();
+      bssl::UniquePtr<ASN1_STRING> str(t.d2i(nullptr, &inp, t.in.size()));
+      EXPECT_EQ(t.valid, str != nullptr);
+    }
+
+    // Also test with the ANY parser.
+    inp = t.in.data();
+    bssl::UniquePtr<ASN1_TYPE> any(d2i_ASN1_TYPE(nullptr, &inp, t.in.size()));
+    EXPECT_EQ(t.valid, any != nullptr);
+  }
+}
+
 // The ASN.1 macros do not work on Windows shared library builds, where usage of
 // |OPENSSL_EXPORT| is a bit stricter.
 #if !defined(OPENSSL_WINDOWS) || !defined(BORINGSSL_SHARED_LIBRARY)
diff --git a/crypto/asn1/tasn_dec.c b/crypto/asn1/tasn_dec.c
index c855fe5..de87791 100644
--- a/crypto/asn1/tasn_dec.c
+++ b/crypto/asn1/tasn_dec.c
@@ -63,6 +63,7 @@
 #include <limits.h>
 #include <string.h>
 
+#include "../bytestring/internal.h"
 #include "../internal.h"
 #include "internal.h"
 
@@ -797,32 +798,51 @@
     case V_ASN1_OTHER:
     case V_ASN1_SET:
     case V_ASN1_SEQUENCE:
-    default:
-      if (utype == V_ASN1_BMPSTRING && (len & 1)) {
-        OPENSSL_PUT_ERROR(ASN1, ASN1_R_BMPSTRING_IS_WRONG_LENGTH);
-        goto err;
+    default: {
+      CBS cbs;
+      CBS_init(&cbs, cont, (size_t)len);
+      if (utype == V_ASN1_BMPSTRING) {
+        while (CBS_len(&cbs) != 0) {
+          uint32_t c;
+          if (!cbs_get_ucs2_be(&cbs, &c)) {
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING);
+            goto err;
+          }
+        }
       }
-      if (utype == V_ASN1_UNIVERSALSTRING && (len & 3)) {
-        OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNIVERSALSTRING_IS_WRONG_LENGTH);
-        goto err;
+      if (utype == V_ASN1_UNIVERSALSTRING) {
+        while (CBS_len(&cbs) != 0) {
+          uint32_t c;
+          if (!cbs_get_utf32_be(&cbs, &c)) {
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING);
+            goto err;
+          }
+        }
+      }
+      if (utype == V_ASN1_UTF8STRING) {
+        while (CBS_len(&cbs) != 0) {
+          uint32_t c;
+          if (!cbs_get_utf8(&cbs, &c)) {
+            OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UTF8STRING);
+            goto err;
+          }
+        }
       }
       if (utype == V_ASN1_UTCTIME) {
-        CBS cbs;
-        CBS_init(&cbs, cont, (size_t)len);
         if (!CBS_parse_utc_time(&cbs, NULL, /*allow_timezone_offset=*/1)) {
           OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_TIME_FORMAT);
           goto err;
         }
       }
       if (utype == V_ASN1_GENERALIZEDTIME) {
-        CBS cbs;
-        CBS_init(&cbs, cont, (size_t)len);
         if (!CBS_parse_generalized_time(&cbs, NULL,
                                         /*allow_timezone_offset=*/0)) {
           OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_TIME_FORMAT);
           goto err;
         }
       }
+      // TODO(https://crbug.com/boringssl/427): Check other string types.
+
       // All based on ASN1_STRING and handled the same
       if (!*pval) {
         stmp = ASN1_STRING_type_new(utype);
@@ -842,6 +862,7 @@
         goto err;
       }
       break;
+    }
   }
   // If ASN1_ANY and NULL type fix up value
   if (typ && (utype == V_ASN1_NULL)) {