Check for invalid UCS-2 and UTF-32 in ASN1_STRING_print_ex.
Switch to the CBS functions, which do all the checks together. This
resolves a TODO that ASN1_STRING_print_ex was inconsistently checking
for invalid codepoints. It also removes an optimization when
round-tripping UTF-8. This optimization was incorrect if the input was
invalid.
Finally, this removes UTF8_getc, which no longer has any callers.
(I've left UTF8_putc for now because CBB would force a malloc on every
character, even with CBB_init_fixed. We should either decide we don't
care, or make it possible to stack-allocate the cbb_buffer_st.)
Update-Note: This will make ASN1_STRING_print_ex newly fail, but such
inputs should be unreachable from the parser as of an earlier change.
Change-Id: I52d747c500c6f5f9ef659cdee3ef5d241f38ed21
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/53226
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/asn1/a_strex.c b/crypto/asn1/a_strex.c
index e6aacf0..fd08b02 100644
--- a/crypto/asn1/a_strex.c
+++ b/crypto/asn1/a_strex.c
@@ -66,6 +66,7 @@
#include <openssl/bytestring.h>
#include <openssl/mem.h>
+#include "../bytestring/internal.h"
#include "internal.h"
@@ -129,70 +130,46 @@
// appropriate.
static int do_buf(const unsigned char *buf, int buflen, int encoding,
- int utf8_convert, unsigned long flags, char *quotes,
- BIO *out) {
- // Reject invalid UCS-4 and UCS-2 lengths without parsing.
+ unsigned long flags, char *quotes, BIO *out) {
+ int (*get_char)(CBS *cbs, uint32_t *out);
+ int get_char_error;
switch (encoding) {
case MBSTRING_UNIV:
- if (buflen & 3) {
- OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING);
- return -1;
- }
+ get_char = cbs_get_utf32_be;
+ get_char_error = ASN1_R_INVALID_UNIVERSALSTRING;
break;
case MBSTRING_BMP:
- if (buflen & 1) {
- OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING);
- return -1;
- }
+ get_char = cbs_get_ucs2_be;
+ get_char_error = ASN1_R_INVALID_BMPSTRING;
break;
+ case MBSTRING_ASC:
+ get_char = cbs_get_latin1;
+ get_char_error = ERR_R_INTERNAL_ERROR; // Should not be possible.
+ break;
+ case MBSTRING_UTF8:
+ get_char = cbs_get_utf8;
+ get_char_error = ASN1_R_INVALID_UTF8STRING;
+ break;
+ default:
+ assert(0);
+ return -1;
}
- const unsigned char *p = buf;
- const unsigned char *q = buf + buflen;
+ CBS cbs;
+ CBS_init(&cbs, buf, buflen);
int outlen = 0;
- while (p != q) {
- const int is_first = p == buf;
- // TODO(davidben): Replace this with |cbs_get_ucs2_be|, etc., to check
- // for invalid codepoints. Before doing that, enforce it in the parser,
- // https://crbug.com/boringssl/427, so these error cases are not
- // reachable from parsed objects.
+ while (CBS_len(&cbs) != 0) {
+ const int is_first = CBS_data(&cbs) == buf;
uint32_t c;
- switch (encoding) {
- case MBSTRING_UNIV:
- c = ((uint32_t)*p++) << 24;
- c |= ((uint32_t)*p++) << 16;
- c |= ((uint32_t)*p++) << 8;
- c |= *p++;
- break;
-
- case MBSTRING_BMP:
- c = ((uint32_t)*p++) << 8;
- c |= *p++;
- break;
-
- case MBSTRING_ASC:
- c = *p++;
- break;
-
- case MBSTRING_UTF8: {
- int consumed = UTF8_getc(p, buflen, &c);
- if (consumed < 0) {
- return -1; // Invalid UTF8String
- }
- buflen -= consumed;
- p += consumed;
- break;
- }
-
- default:
- assert(0);
- return -1;
+ if (!get_char(&cbs, &c)) {
+ OPENSSL_PUT_ERROR(ASN1, get_char_error);
+ return -1;
}
- const int is_last = p == q;
- if (utf8_convert) {
+ const int is_last = CBS_len(&cbs) == 0;
+ if (flags & ASN1_STRFLGS_UTF8_CONVERT) {
unsigned char utfbuf[6];
int utflen;
- utflen = UTF8_putc(utfbuf, sizeof utfbuf, c);
+ utflen = UTF8_putc(utfbuf, sizeof(utfbuf), c);
for (int i = 0; i < utflen; i++) {
int len = do_esc_char(utfbuf[i], flags, quotes, out, is_first && i == 0,
is_last && i == utflen - 1);
@@ -350,24 +327,9 @@
return outlen;
}
- int utf8_convert = 0;
- if (flags & ASN1_STRFLGS_UTF8_CONVERT) {
- // If the string is UTF-8, skip decoding and just interpret it as 1 byte
- // per character to avoid converting twice.
- //
- // TODO(davidben): This is not quite a valid optimization if the input
- // was invalid UTF-8.
- if (encoding == MBSTRING_UTF8) {
- encoding = MBSTRING_ASC;
- } else {
- utf8_convert = 1;
- }
- }
-
// Measure the length.
char quotes = 0;
- int len = do_buf(str->data, str->length, encoding, utf8_convert, flags,
- "es, NULL);
+ int len = do_buf(str->data, str->length, encoding, flags, "es, NULL);
if (len < 0) {
return -1;
}
@@ -381,8 +343,7 @@
// Encode the value.
if ((quotes && !maybe_write(out, "\"", 1)) ||
- do_buf(str->data, str->length, encoding, utf8_convert, flags, NULL, out) <
- 0 ||
+ do_buf(str->data, str->length, encoding, flags, NULL, out) < 0 ||
(quotes && !maybe_write(out, "\"", 1))) {
return -1;
}
diff --git a/crypto/asn1/a_utf8.c b/crypto/asn1/a_utf8.c
index 58496b2..02c1d56 100644
--- a/crypto/asn1/a_utf8.c
+++ b/crypto/asn1/a_utf8.c
@@ -63,112 +63,6 @@
// UTF8 utilities
-// This parses a UTF8 string one character at a time. It is passed a pointer
-// to the string and the length of the string. It sets 'value' to the value
-// of the current character. It returns the number of characters read or a
-// negative error code: -1 = string too short -2 = illegal character -3 =
-// subsequent characters not of the form 10xxxxxx -4 = character encoded
-// incorrectly (not minimal length).
-
-int UTF8_getc(const unsigned char *str, int len, uint32_t *val) {
- const unsigned char *p;
- uint32_t value;
- int ret;
- if (len <= 0) {
- return 0;
- }
- p = str;
-
- // Check syntax and work out the encoded value (if correct)
- if ((*p & 0x80) == 0) {
- value = *p++ & 0x7f;
- ret = 1;
- } else if ((*p & 0xe0) == 0xc0) {
- if (len < 2) {
- return -1;
- }
- if ((p[1] & 0xc0) != 0x80) {
- return -3;
- }
- value = (*p++ & 0x1f) << 6;
- value |= *p++ & 0x3f;
- if (value < 0x80) {
- return -4;
- }
- ret = 2;
- } else if ((*p & 0xf0) == 0xe0) {
- if (len < 3) {
- return -1;
- }
- if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80)) {
- return -3;
- }
- value = (*p++ & 0xf) << 12;
- value |= (*p++ & 0x3f) << 6;
- value |= *p++ & 0x3f;
- if (value < 0x800) {
- return -4;
- }
- ret = 3;
- } else if ((*p & 0xf8) == 0xf0) {
- if (len < 4) {
- return -1;
- }
- if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
- ((p[3] & 0xc0) != 0x80)) {
- return -3;
- }
- value = ((uint32_t)(*p++ & 0x7)) << 18;
- value |= (*p++ & 0x3f) << 12;
- value |= (*p++ & 0x3f) << 6;
- value |= *p++ & 0x3f;
- if (value < 0x10000) {
- return -4;
- }
- ret = 4;
- } else if ((*p & 0xfc) == 0xf8) {
- if (len < 5) {
- return -1;
- }
- if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
- ((p[3] & 0xc0) != 0x80) || ((p[4] & 0xc0) != 0x80)) {
- return -3;
- }
- value = ((uint32_t)(*p++ & 0x3)) << 24;
- value |= ((uint32_t)(*p++ & 0x3f)) << 18;
- value |= ((uint32_t)(*p++ & 0x3f)) << 12;
- value |= (*p++ & 0x3f) << 6;
- value |= *p++ & 0x3f;
- if (value < 0x200000) {
- return -4;
- }
- ret = 5;
- } else if ((*p & 0xfe) == 0xfc) {
- if (len < 6) {
- return -1;
- }
- if (((p[1] & 0xc0) != 0x80) || ((p[2] & 0xc0) != 0x80) ||
- ((p[3] & 0xc0) != 0x80) || ((p[4] & 0xc0) != 0x80) ||
- ((p[5] & 0xc0) != 0x80)) {
- return -3;
- }
- value = ((uint32_t)(*p++ & 0x1)) << 30;
- value |= ((uint32_t)(*p++ & 0x3f)) << 24;
- value |= ((uint32_t)(*p++ & 0x3f)) << 18;
- value |= ((uint32_t)(*p++ & 0x3f)) << 12;
- value |= (*p++ & 0x3f) << 6;
- value |= *p++ & 0x3f;
- if (value < 0x4000000) {
- return -4;
- }
- ret = 6;
- } else {
- return -2;
- }
- *val = value;
- return ret;
-}
-
// This takes a character 'value' and writes the UTF8 encoded value in 'str'
// where 'str' is a buffer containing 'len' characters. Returns the number of
// characters written or -1 if 'len' is too small. 'str' can be set to NULL
diff --git a/crypto/asn1/asn1_test.cc b/crypto/asn1/asn1_test.cc
index 195d5a8..9df309a 100644
--- a/crypto/asn1/asn1_test.cc
+++ b/crypto/asn1/asn1_test.cc
@@ -1278,11 +1278,23 @@
int str_flags;
unsigned long flags;
} kUnprintableTests[] = {
- // When decoding strings, invalid codepoints are errors.
+ // It is an error if the string cannot be decoded.
{V_ASN1_UTF8STRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
{V_ASN1_BMPSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
{V_ASN1_BMPSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
{V_ASN1_UNIVERSALSTRING, {0xff}, 0, ASN1_STRFLGS_ESC_MSB},
+ // Invalid codepoints are errors.
+ {V_ASN1_UTF8STRING, {0xed, 0xa0, 0x80}, 0, ASN1_STRFLGS_ESC_MSB},
+ {V_ASN1_BMPSTRING, {0xd8, 0x00}, 0, ASN1_STRFLGS_ESC_MSB},
+ {V_ASN1_UNIVERSALSTRING,
+ {0x00, 0x00, 0xd8, 0x00},
+ 0,
+ ASN1_STRFLGS_ESC_MSB},
+ // Even when re-encoding UTF-8 back into UTF-8, we should check validity.
+ {V_ASN1_UTF8STRING,
+ {0xff},
+ 0,
+ ASN1_STRFLGS_ESC_MSB | ASN1_STRFLGS_UTF8_CONVERT},
};
for (const auto &t : kUnprintableTests) {
SCOPED_TRACE(t.type);
diff --git a/crypto/asn1/internal.h b/crypto/asn1/internal.h
index 9c0d73e..f8ca8fe 100644
--- a/crypto/asn1/internal.h
+++ b/crypto/asn1/internal.h
@@ -133,7 +133,6 @@
void asn1_item_combine_free(ASN1_VALUE **pval, const ASN1_ITEM *it,
int combine);
-int UTF8_getc(const unsigned char *str, int len, uint32_t *val);
int UTF8_putc(unsigned char *str, int len, uint32_t value);
int ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it);