Check for invalid UCS-2 and UTF-32 in ASN1_STRING_print_ex.

Switch to the CBS functions, which do all the checks together. This
resolves a TODO that ASN1_STRING_print_ex was inconsistently checking
for invalid codepoints. It also removes an optimization when
round-tripping UTF-8. This optimization was incorrect if the input was
invalid.

Finally, this removes UTF8_getc, which no longer has any callers.
(I've left UTF8_putc for now because CBB would force a malloc on every
character, even with CBB_init_fixed. We should either decide we don't
care, or make it possible to stack-allocate the cbb_buffer_st.)

Update-Note: This will make ASN1_STRING_print_ex newly fail, but such
inputs should be unreachable from the parser as of an earlier change.

Change-Id: I52d747c500c6f5f9ef659cdee3ef5d241f38ed21
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/53226
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/asn1/a_strex.c b/crypto/asn1/a_strex.c
index e6aacf0..fd08b02 100644
--- a/crypto/asn1/a_strex.c
+++ b/crypto/asn1/a_strex.c
@@ -66,6 +66,7 @@
 #include <openssl/bytestring.h>
 #include <openssl/mem.h>
 
+#include "../bytestring/internal.h"
 #include "internal.h"
 
 
@@ -129,70 +130,46 @@
 // appropriate.
 
 static int do_buf(const unsigned char *buf, int buflen, int encoding,
-                  int utf8_convert, unsigned long flags, char *quotes,
-                  BIO *out) {
-  // Reject invalid UCS-4 and UCS-2 lengths without parsing.
+                  unsigned long flags, char *quotes, BIO *out) {
+  int (*get_char)(CBS *cbs, uint32_t *out);
+  int get_char_error;
   switch (encoding) {
     case MBSTRING_UNIV:
-      if (buflen & 3) {
-        OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING);
-        return -1;
-      }
+      get_char = cbs_get_utf32_be;
+      get_char_error = ASN1_R_INVALID_UNIVERSALSTRING;
       break;
     case MBSTRING_BMP:
-      if (buflen & 1) {
-        OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING);
-        return -1;
-      }
+      get_char = cbs_get_ucs2_be;
+      get_char_error = ASN1_R_INVALID_BMPSTRING;
       break;
+    case MBSTRING_ASC:
+      get_char = cbs_get_latin1;
+      get_char_error = ERR_R_INTERNAL_ERROR;  // Should not be possible.
+      break;
+    case MBSTRING_UTF8:
+      get_char = cbs_get_utf8;
+      get_char_error = ASN1_R_INVALID_UTF8STRING;
+      break;
+    default:
+      assert(0);
+      return -1;
   }
 
-  const unsigned char *p = buf;
-  const unsigned char *q = buf + buflen;
+  CBS cbs;
+  CBS_init(&cbs, buf, buflen);
   int outlen = 0;
-  while (p != q) {
-    const int is_first = p == buf;
-    // TODO(davidben): Replace this with |cbs_get_ucs2_be|, etc., to check
-    // for invalid codepoints. Before doing that, enforce it in the parser,
-    // https://crbug.com/boringssl/427, so these error cases are not
-    // reachable from parsed objects.
+  while (CBS_len(&cbs) != 0) {
+    const int is_first = CBS_data(&cbs) == buf;
     uint32_t c;
-    switch (encoding) {
-      case MBSTRING_UNIV:
-        c = ((uint32_t)*p++) << 24;
-        c |= ((uint32_t)*p++) << 16;
-        c |= ((uint32_t)*p++) << 8;
-        c |= *p++;
-        break;
-
-      case MBSTRING_BMP:
-        c = ((uint32_t)*p++) << 8;
-        c |= *p++;
-        break;
-
-      case MBSTRING_ASC:
-        c = *p++;
-        break;
-
-      case MBSTRING_UTF8: {
-        int consumed = UTF8_getc(p, buflen, &c);
-        if (consumed < 0) {
-          return -1;  // Invalid UTF8String
-        }
-        buflen -= consumed;
-        p += consumed;
-        break;
-      }
-
-      default:
-        assert(0);
-        return -1;
+    if (!get_char(&cbs, &c)) {
+      OPENSSL_PUT_ERROR(ASN1, get_char_error);
+      return -1;
     }
-    const int is_last = p == q;
-    if (utf8_convert) {
+    const int is_last = CBS_len(&cbs) == 0;
+    if (flags & ASN1_STRFLGS_UTF8_CONVERT) {
       unsigned char utfbuf[6];
       int utflen;
-      utflen = UTF8_putc(utfbuf, sizeof utfbuf, c);
+      utflen = UTF8_putc(utfbuf, sizeof(utfbuf), c);
       for (int i = 0; i < utflen; i++) {
         int len = do_esc_char(utfbuf[i], flags, quotes, out, is_first && i == 0,
                               is_last && i == utflen - 1);
@@ -350,24 +327,9 @@
     return outlen;
   }
 
-  int utf8_convert = 0;
-  if (flags & ASN1_STRFLGS_UTF8_CONVERT) {
-    // If the string is UTF-8, skip decoding and just interpret it as 1 byte
-    // per character to avoid converting twice.
-    //
-    // TODO(davidben): This is not quite a valid optimization if the input
-    // was invalid UTF-8.
-    if (encoding == MBSTRING_UTF8) {
-      encoding = MBSTRING_ASC;
-    } else {
-      utf8_convert = 1;
-    }
-  }
-
   // Measure the length.
   char quotes = 0;
-  int len = do_buf(str->data, str->length, encoding, utf8_convert, flags,
-                   &quotes, NULL);
+  int len = do_buf(str->data, str->length, encoding, flags, &quotes, NULL);
   if (len < 0) {
     return -1;
   }
@@ -381,8 +343,7 @@
 
   // Encode the value.
   if ((quotes && !maybe_write(out, "\"", 1)) ||
-      do_buf(str->data, str->length, encoding, utf8_convert, flags, NULL, out) <
-          0 ||
+      do_buf(str->data, str->length, encoding, flags, NULL, out) < 0 ||
       (quotes && !maybe_write(out, "\"", 1))) {
     return -1;
   }
diff --git a/crypto/asn1/a_utf8.c b/crypto/asn1/a_utf8.c
index 58496b2..02c1d56 100644
--- a/crypto/asn1/a_utf8.c
+++ b/crypto/asn1/a_utf8.c
@@ -63,112 +63,6 @@
 
 // UTF8 utilities
 
-// This parses a UTF8 string one character at a time. It is passed a pointer
-// to the string and the length of the string. It sets 'value' to the value
-// of the current character. It returns the number of characters read or a
-// negative error code: -1 = string too short -2 = illegal character -3 =
-// subsequent characters not of the form 10xxxxxx -4 = character encoded
-// incorrectly (not minimal length).
-
-int UTF8_getc(const unsigned char *str, int len, uint32_t *val) {
-  const unsigned char *p;
-  uint32_t value;
-  int ret;
-  if (len <= 0) {
-    return 0;
-  }
-  p = str;
-
-  // Check syntax and work out the encoded value (if correct)
-  if ((*p & 0x80) == 0) {
-    value = *p++ & 0x7f;
-    ret = 1;
-  } else if ((*p & 0xe0) == 0xc0) {
-    if (len < 2) {
-      return -1;
-    }
-    if ((p[1] & 0xc0) != 0x80) {
-      return -3;
-    }
-    value = (*p++ & 0x1f) << 6;
-    value |= *p++ & 0x3f;
-    if (value < 0x80) {
-      return -4;
-    }
-    ret = 2;
-  } else if ((*p & 0xf0) == 0xe0) {
-    if (len < 3) {
-      return -1;
-    }
-    if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80)) {
-      return -3;
-    }
-    value = (*p++ & 0xf) << 12;
-    value |= (*p++ & 0x3f) << 6;
-    value |= *p++ & 0x3f;
-    if (value < 0x800) {
-      return -4;
-    }
-    ret = 3;
-  } else if ((*p & 0xf8) == 0xf0) {
-    if (len < 4) {
-      return -1;
-    }
-    if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
-        ((p[3] & 0xc0) != 0x80)) {
-      return -3;
-    }
-    value = ((uint32_t)(*p++ & 0x7)) << 18;
-    value |= (*p++ & 0x3f) << 12;
-    value |= (*p++ & 0x3f) << 6;
-    value |= *p++ & 0x3f;
-    if (value < 0x10000) {
-      return -4;
-    }
-    ret = 4;
-  } else if ((*p & 0xfc) == 0xf8) {
-    if (len < 5) {
-      return -1;
-    }
-    if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
-        ((p[3] & 0xc0) != 0x80) || ((p[4] & 0xc0) != 0x80)) {
-      return -3;
-    }
-    value = ((uint32_t)(*p++ & 0x3)) << 24;
-    value |= ((uint32_t)(*p++ & 0x3f)) << 18;
-    value |= ((uint32_t)(*p++ & 0x3f)) << 12;
-    value |= (*p++ & 0x3f) << 6;
-    value |= *p++ & 0x3f;
-    if (value < 0x200000) {
-      return -4;
-    }
-    ret = 5;
-  } else if ((*p & 0xfe) == 0xfc) {
-    if (len < 6) {
-      return -1;
-    }
-    if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
-        ((p[3] & 0xc0) != 0x80) || ((p[4] & 0xc0) != 0x80) ||
-        ((p[5] & 0xc0) != 0x80)) {
-      return -3;
-    }
-    value = ((uint32_t)(*p++ & 0x1)) << 30;
-    value |= ((uint32_t)(*p++ & 0x3f)) << 24;
-    value |= ((uint32_t)(*p++ & 0x3f)) << 18;
-    value |= ((uint32_t)(*p++ & 0x3f)) << 12;
-    value |= (*p++ & 0x3f) << 6;
-    value |= *p++ & 0x3f;
-    if (value < 0x4000000) {
-      return -4;
-    }
-    ret = 6;
-  } else {
-    return -2;
-  }
-  *val = value;
-  return ret;
-}
-
 // This takes a character 'value' and writes the UTF8 encoded value in 'str'
 // where 'str' is a buffer containing 'len' characters. Returns the number of
 // characters written or -1 if 'len' is too small. 'str' can be set to NULL
diff --git a/crypto/asn1/asn1_test.cc b/crypto/asn1/asn1_test.cc
index 195d5a8..9df309a 100644
--- a/crypto/asn1/asn1_test.cc
+++ b/crypto/asn1/asn1_test.cc
@@ -1278,11 +1278,23 @@
     int str_flags;
     unsigned long flags;
   } kUnprintableTests[] = {
-      // When decoding strings, invalid codepoints are errors.
+      // It is an error if the string cannot be decoded.
       {V_ASN1_UTF8STRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
       {V_ASN1_BMPSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
       {V_ASN1_BMPSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
       {V_ASN1_UNIVERSALSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
+      // Invalid codepoints are errors.
+      {V_ASN1_UTF8STRING, {0xed, 0xa0, 0x80}, 0, ASN1_STRFLGS_ESC_MSB},
+      {V_ASN1_BMPSTRING, {0xd8, 0x00}, 0, ASN1_STRFLGS_ESC_MSB},
+      {V_ASN1_UNIVERSALSTRING,
+       {0x00, 0x00, 0xd8, 0x00},
+       0,
+       ASN1_STRFLGS_ESC_MSB},
+      // Even when re-encoding UTF-8 back into UTF-8, we should check validity.
+      {V_ASN1_UTF8STRING,
+       {0xff},
+       0,
+       ASN1_STRFLGS_ESC_MSB | ASN1_STRFLGS_UTF8_CONVERT},
   };
   for (const auto &t : kUnprintableTests) {
     SCOPED_TRACE(t.type);
diff --git a/crypto/asn1/internal.h b/crypto/asn1/internal.h
index 9c0d73e..f8ca8fe 100644
--- a/crypto/asn1/internal.h
+++ b/crypto/asn1/internal.h
@@ -133,7 +133,6 @@
 void asn1_item_combine_free(ASN1_VALUE **pval, const ASN1_ITEM *it,
                             int combine);
 
-int UTF8_getc(const unsigned char *str, int len, uint32_t *val);
 int UTF8_putc(unsigned char *str, int len, uint32_t value);
 
 int ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it);