Add new character encoding functions.

These will be used for the PKCS#12 code and to replace some of the
crypto/asn1 logic. So far they support the ones implemented by
crypto/asn1, which are Latin-1, UCS-2 (ASN.1 BMPStrings can't go beyond
the BMP), UTF-32 (ASN.1 UniversalString) and UTF-8.

Change-Id: I3d5c0d964cc6f97c3a0a1e352c9dd7d8cc0d87f2
Reviewed-on: https://boringssl-review.googlesource.com/28324
Commit-Queue: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/bytestring/CMakeLists.txt b/crypto/bytestring/CMakeLists.txt
index 37ff51c..2bb4c15 100644
--- a/crypto/bytestring/CMakeLists.txt
+++ b/crypto/bytestring/CMakeLists.txt
@@ -9,4 +9,5 @@
   ber.c
   cbs.c
   cbb.c
+  unicode.c
 )
diff --git a/crypto/bytestring/bytestring_test.cc b/crypto/bytestring/bytestring_test.cc
index 10eac69..5d1b1da 100644
--- a/crypto/bytestring/bytestring_test.cc
+++ b/crypto/bytestring/bytestring_test.cc
@@ -1051,3 +1051,221 @@
     EXPECT_FALSE(CBB_flush_asn1_set_of(&child));
   }
 }
+
+template <class T>
+static std::vector<uint8_t> LiteralToBytes(const T *str) {
+  std::vector<uint8_t> bytes;  // Each unit of |str| is appended big-endian.
+  for (const T *unit = str; *unit != 0; unit++) {
+    for (size_t shift = 8 * sizeof(T); shift != 0; shift -= 8) {
+      bytes.push_back(static_cast<uint8_t>(*unit >> (shift - 8)));
+    }
+  }
+  return bytes;
+}
+
+static std::vector<uint32_t> LiteralToCodePoints(const char32_t *str) {
+  std::vector<uint32_t> code_points;  // Excludes the terminating NUL.
+  while (*str != 0) {
+    code_points.push_back(static_cast<uint32_t>(*str++));
+  }
+  return code_points;
+}
+
+TEST(CBBTest, Unicode) {
+  struct {
+    int (*decode)(CBS *, uint32_t *);
+    int (*encode)(CBB *, uint32_t);
+    std::vector<uint8_t> in;
+    std::vector<uint32_t> out;
+    bool ok;
+  } kTests[] = {
+      {cbs_get_utf8, cbb_add_utf8,
+       // This test string covers all four UTF-8 sequence lengths (1-4 bytes).
+       LiteralToBytes(u8"Hello, 世界! ¡Hola, 🌎!"),
+       LiteralToCodePoints(U"Hello, 世界! ¡Hola, 🌎!"), true},
+
+      // Some invalid inputs adapted from
+      // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
+      // 2.1  First possible sequence of a certain length. (5- and 6-byte
+      // sequences no longer exist.)
+      {cbs_get_utf8, cbb_add_utf8, {0xf8, 0x88, 0x80, 0x80, 0x80}, {}, false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xfc, 0x84, 0x80, 0x80, 0x80, 0x80},
+       {},
+       false},
+      // 3.1  Unexpected continuation bytes.
+      {cbs_get_utf8, cbb_add_utf8, {0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xbf}, {}, false},
+      // 3.2  Lonely start characters.
+      {cbs_get_utf8, cbb_add_utf8, {0xc0, ' '}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, ' '}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, ' '}, {}, false},
+      // 3.3  Sequences with last continuation byte missing
+      {cbs_get_utf8, cbb_add_utf8, {0xc0}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x80, 0x80}, {}, false},
+      // Variation of the above with unexpected spaces.
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, 0x80, ' '}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x80, 0x80, ' '}, {}, false},
+      // 4.1  Examples of an overlong ASCII character
+      {cbs_get_utf8, cbb_add_utf8, {0xc0, 0xaf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, 0x80, 0xaf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x80, 0x80, 0xaf}, {}, false},
+      // 4.2  Maximum overlong sequences
+      {cbs_get_utf8, cbb_add_utf8, {0xc1, 0xbf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, 0x9f, 0xbf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x8f, 0xbf, 0xbf}, {}, false},
+      // 4.3  Overlong representation of the NUL character
+      {cbs_get_utf8, cbb_add_utf8, {0xc0, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xe0, 0x80, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x80, 0x80, 0x80}, {}, false},
+      // 5.1  Single UTF-16 surrogates
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xa0, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xad, 0xbf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xae, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xb0, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xbe, 0x80}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xed, 0xbf, 0xbf}, {}, false},
+      // 5.2  Paired UTF-16 surrogates
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xa0, 0x80, 0xed, 0xb0, 0x80},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xa0, 0x80, 0xed, 0xbf, 0xbf},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xad, 0xbf, 0xed, 0xb0, 0x80},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xad, 0xbf, 0xed, 0xbf, 0xbf},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xae, 0x80, 0xed, 0xb0, 0x80},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xae, 0x80, 0xed, 0xbf, 0xbf},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xaf, 0xbf, 0xed, 0xb0, 0x80},
+       {},
+       false},
+      {cbs_get_utf8,
+       cbb_add_utf8,
+       {0xed, 0xaf, 0xbf, 0xed, 0xbf, 0xbf},
+       {},
+       false},
+      // 5.3  Noncharacter code positions
+      {cbs_get_utf8, cbb_add_utf8, {0xef, 0xbf, 0xbe}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xef, 0xbf, 0xbf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xef, 0xb7, 0x90}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xef, 0xb7, 0xaf}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x9f, 0xbf, 0xbe}, {}, false},
+      {cbs_get_utf8, cbb_add_utf8, {0xf0, 0x9f, 0xbf, 0xbf}, {}, false},
+
+      {cbs_get_latin1, cbb_add_latin1, LiteralToBytes("\xa1Hola!"),
+       LiteralToCodePoints(U"¡Hola!"), true},
+
+      // UCS-2 matches UTF-16 on the BMP.
+      {cbs_get_ucs2_be, cbb_add_ucs2_be, LiteralToBytes(u"Hello, 世界!"),
+       LiteralToCodePoints(U"Hello, 世界!"), true},
+      // It does not support characters beyond the BMP.
+      {cbs_get_ucs2_be, cbb_add_ucs2_be,
+       LiteralToBytes(u"Hello, 世界! ¡Hola, 🌎!"),
+       LiteralToCodePoints(U"Hello, 世界! ¡Hola, "), false},
+      // Unpaired surrogates and non-characters are also rejected.
+      {cbs_get_ucs2_be, cbb_add_ucs2_be, {0xd8, 0x00}, {}, false},
+      {cbs_get_ucs2_be, cbb_add_ucs2_be, {0xff, 0xfe}, {}, false},
+
+      {cbs_get_utf32_be, cbb_add_utf32_be,
+       LiteralToBytes(U"Hello, 世界! ¡Hola, 🌎!"),
+       LiteralToCodePoints(U"Hello, 世界! ¡Hola, 🌎!"), true},
+      // Unpaired surrogates and non-characters are rejected.
+      {cbs_get_utf32_be, cbb_add_utf32_be, {0x00, 0x00, 0xd8, 0x00}, {}, false},
+      {cbs_get_utf32_be, cbb_add_utf32_be, {0x00, 0x00, 0xff, 0xfe}, {}, false},
+
+      // Test that the NUL character can be encoded.
+      {cbs_get_latin1, cbb_add_latin1, {0}, {0}, true},
+      {cbs_get_utf8, cbb_add_utf8, {0}, {0}, true},
+      {cbs_get_ucs2_be, cbb_add_ucs2_be, {0, 0}, {0}, true},
+      {cbs_get_utf32_be, cbb_add_utf32_be, {0, 0, 0, 0}, {0}, true},
+  };
+  for (const auto &t : kTests) {
+    SCOPED_TRACE(Bytes(t.in));
+
+    // Test decoding.
+    CBS cbs;
+    CBS_init(&cbs, t.in.data(), t.in.size());
+    std::vector<uint32_t> out;
+    bool ok = true;
+    while (CBS_len(&cbs) != 0) {
+      uint32_t u;
+      if (!t.decode(&cbs, &u)) {
+        ok = false;
+        break;
+      }
+      out.push_back(u);
+    }
+    EXPECT_EQ(t.ok, ok);
+    EXPECT_EQ(t.out, out);
+
+    // Test encoding.
+    if (t.ok) {
+      bssl::ScopedCBB cbb;
+      ASSERT_TRUE(CBB_init(cbb.get(), 0));
+      for (uint32_t u : t.out) {
+        ASSERT_TRUE(t.encode(cbb.get(), u));
+      }
+      EXPECT_EQ(Bytes(t.in), Bytes(CBB_data(cbb.get()), CBB_len(cbb.get())));
+    }
+  }
+
+  static const uint32_t kBadCodePoints[] = {
+    // Surrogate code points.
+    0xd800,
+    0xdfff,
+    // Non-characters.
+    0xfffe,
+    0xffff,
+    0xfdd0,
+    0x1fffe,
+    0x1ffff,
+    // Too big.
+    0x110000,
+  };
+  bssl::ScopedCBB cbb;
+  ASSERT_TRUE(CBB_init(cbb.get(), 0));
+  for (uint32_t v : kBadCodePoints) {
+    SCOPED_TRACE(v);
+    EXPECT_FALSE(cbb_add_utf8(cbb.get(), v));
+    EXPECT_FALSE(cbb_add_latin1(cbb.get(), v));
+    EXPECT_FALSE(cbb_add_ucs2_be(cbb.get(), v));
+    EXPECT_FALSE(cbb_add_utf32_be(cbb.get(), v));
+  }
+
+  // Additional values that are out of range.
+  EXPECT_FALSE(cbb_add_latin1(cbb.get(), 0x100));
+  EXPECT_FALSE(cbb_add_ucs2_be(cbb.get(), 0x10000));
+
+  EXPECT_EQ(1u, cbb_get_utf8_len(0));
+  EXPECT_EQ(1u, cbb_get_utf8_len(0x7f));
+  EXPECT_EQ(2u, cbb_get_utf8_len(0x80));
+  EXPECT_EQ(2u, cbb_get_utf8_len(0x7ff));
+  EXPECT_EQ(3u, cbb_get_utf8_len(0x800));
+  EXPECT_EQ(3u, cbb_get_utf8_len(0xffff));
+  EXPECT_EQ(4u, cbb_get_utf8_len(0x10000));
+  EXPECT_EQ(4u, cbb_get_utf8_len(0x10ffff));
+}
diff --git a/crypto/bytestring/internal.h b/crypto/bytestring/internal.h
index f6ac32c..b731aad 100644
--- a/crypto/bytestring/internal.h
+++ b/crypto/bytestring/internal.h
@@ -68,6 +68,28 @@
 int CBB_finish_i2d(CBB *cbb, uint8_t **outp);
 
 
+// Unicode utilities.
+
+// The following functions read one Unicode code point from |cbs| with the
+// corresponding encoding and store it in |*out|. They return one on success and
+// zero on error, e.g. truncated input or an invalid code point or encoding.
+OPENSSL_EXPORT int cbs_get_utf8(CBS *cbs, uint32_t *out);
+OPENSSL_EXPORT int cbs_get_latin1(CBS *cbs, uint32_t *out);
+OPENSSL_EXPORT int cbs_get_ucs2_be(CBS *cbs, uint32_t *out);
+OPENSSL_EXPORT int cbs_get_utf32_be(CBS *cbs, uint32_t *out);
+
+// cbb_get_utf8_len returns the number of bytes needed to represent |u| in
+// UTF-8. It does not validate |u|; any value above 0xffff yields four.
+OPENSSL_EXPORT size_t cbb_get_utf8_len(uint32_t u);
+
+// The following functions encode |u| to |cbb| with the corresponding encoding.
+// They return one on success and zero on error, e.g. an unencodable |u|.
+OPENSSL_EXPORT int cbb_add_utf8(CBB *cbb, uint32_t u);
+OPENSSL_EXPORT int cbb_add_latin1(CBB *cbb, uint32_t u);
+OPENSSL_EXPORT int cbb_add_ucs2_be(CBB *cbb, uint32_t u);
+OPENSSL_EXPORT int cbb_add_utf32_be(CBB *cbb, uint32_t u);
+
+
 #if defined(__cplusplus)
 }  // extern C
 #endif
diff --git a/crypto/bytestring/unicode.c b/crypto/bytestring/unicode.c
new file mode 100644
index 0000000..6f9467f
--- /dev/null
+++ b/crypto/bytestring/unicode.c
@@ -0,0 +1,155 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/bytestring.h>
+
+#include "internal.h"
+
+
+static int is_valid_code_point(uint32_t v) {
+  // References in the following are to Unicode 9.0.0.
+  // The Unicode space runs from zero to 0x10ffff (3.4 D9).
+  if (v > 0x10ffff) {
+    return 0;
+  }
+  // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved
+  // noncharacters (3.4 D14).
+  if ((v & 0xfffe) == 0xfffe || (v >= 0xfdd0 && v <= 0xfdef)) {
+    return 0;
+  }
+  // Surrogate code points are invalid (3.2 C1); everything else is accepted.
+  return v < 0xd800 || v > 0xdfff;
+}
+
+// BOTTOM_BITS returns a byte with the bottom |n| bits set, for |n| in [0, 8].
+#define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))
+
+// TOP_BITS returns a byte with the top |n| bits set, for |n| in [0, 8].
+#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))
+
+int cbs_get_utf8(CBS *cbs, uint32_t *out) {
+  uint8_t c;
+  if (!CBS_get_u8(cbs, &c)) {
+    return 0;
+  }
+  if (c <= 0x7f) {  // 0xxxxxxx: single-byte form.
+    *out = c;
+    return 1;
+  }
+  uint32_t v, lower_bound;  // |lower_bound|: smallest non-overlong value.
+  size_t len;  // Number of continuation bytes that must follow.
+  if ((c & TOP_BITS(3)) == TOP_BITS(2)) {  // 110xxxxx: two-byte form.
+    v = c & BOTTOM_BITS(5);
+    len = 1;
+    lower_bound = 0x80;
+  } else if ((c & TOP_BITS(4)) == TOP_BITS(3)) {  // 1110xxxx: three-byte form.
+    v = c & BOTTOM_BITS(4);
+    len = 2;
+    lower_bound = 0x800;
+  } else if ((c & TOP_BITS(5)) == TOP_BITS(4)) {  // 11110xxx: four-byte form.
+    v = c & BOTTOM_BITS(3);
+    len = 3;
+    lower_bound = 0x10000;
+  } else {  // Stray continuation byte, or a lead byte of 0xf8 and above.
+    return 0;
+  }
+  for (size_t i = 0; i < len; i++) {
+    if (!CBS_get_u8(cbs, &c) ||
+        (c & TOP_BITS(2)) != TOP_BITS(1)) {  // Must be 10xxxxxx.
+      return 0;
+    }
+    v <<= 6;
+    v |= c & BOTTOM_BITS(6);
+  }
+  if (!is_valid_code_point(v) ||
+      v < lower_bound) {  // Below |lower_bound| means an overlong encoding.
+    return 0;
+  }
+  *out = v;
+  return 1;
+}
+
+int cbs_get_latin1(CBS *cbs, uint32_t *out) {
+  uint8_t byte;
+  if (CBS_get_u8(cbs, &byte)) {
+    *out = byte;  // A Latin-1 byte is numerically its own code point.
+    return 1;
+  }
+  return 0;
+}
+
+int cbs_get_ucs2_be(CBS *cbs, uint32_t *out) {
+  // UCS-2 (used by BMPString) has no surrogate pairs, so each 16-bit unit must
+  // be a valid code point by itself.
+  uint16_t unit;
+  if (CBS_get_u16(cbs, &unit) && is_valid_code_point(unit)) {
+    *out = unit;
+    return 1;
+  }
+  return 0;
+}
+
+int cbs_get_utf32_be(CBS *cbs, uint32_t *out) {  // Read, then validate.
+  return CBS_get_u32(cbs, out) ? is_valid_code_point(*out) : 0;
+}
+
+size_t cbb_get_utf8_len(uint32_t u) {
+  // Check against the largest code point each length can hold, longest first.
+  // |u| is not validated here, so every value above 0xffff, valid or not,
+  // reports four bytes.
+  if (u > 0xffff) {
+    return 4;
+  }
+  if (u > 0x7ff) {
+    return 3;
+  }
+  return u > 0x7f ? 2 : 1;
+}
+
+int cbb_add_utf8(CBB *cbb, uint32_t u) {
+  if (!is_valid_code_point(u)) {
+    return 0;
+  }
+  if (u <= 0x7f) {  // One byte: 0xxxxxxx.
+    return CBB_add_u8(cbb, (uint8_t)u);
+  }
+  if (u <= 0x7ff) {  // Two bytes: 110xxxxx 10xxxxxx.
+    return CBB_add_u8(cbb, TOP_BITS(2) | (u >> 6)) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
+  }
+  if (u <= 0xffff) {  // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx.
+    return CBB_add_u8(cbb, TOP_BITS(3) | (u >> 12)) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
+  }
+  if (u <= 0x10ffff) {  // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
+    return CBB_add_u8(cbb, TOP_BITS(4) | (u >> 18)) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6))) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) &&
+           CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6)));
+  }
+  return 0;  // Not reached: is_valid_code_point already rejects larger values.
+}
+
+int cbb_add_latin1(CBB *cbb, uint32_t u) {  // Latin-1 covers 0x00-0xff only.
+  return (u > 0xff) ? 0 : (CBB_add_u8(cbb, (uint8_t)u) != 0);
+}
+
+int cbb_add_ucs2_be(CBB *cbb, uint32_t u) {  // UCS-2 is limited to the BMP.
+  return is_valid_code_point(u) && u <= 0xffff && CBB_add_u16(cbb, (uint16_t)u);
+}
+
+int cbb_add_utf32_be(CBB *cbb, uint32_t u) {  // Validates |u| first.
+  return is_valid_code_point(u) ? (CBB_add_u32(cbb, u) != 0) : 0;
+}