Add a test for ASN1_mbstring_copy and clean up.

In writing the tests, I noticed that the documentation was wrong. First,
the maximum lengths are measured in codepoints, not bytes.

Second, the TODO was wrong. We actually do handle this correctly,
*almost*. Rather, the bug is that the function assumes |mask| contains
no extraneous bits. If it does, all extraneous bits are interpreted as
B_ASN1_UTF8STRING. This seems like a bug, so I've gone ahead and fixed
that, with a test.

Change-Id: I7ba8fa700a8e21e6d25cb7ce879dace685eecf7e
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/48825
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/asn1/a_mbstr.c b/crypto/asn1/a_mbstr.c
index 953095c..35c130c 100644
--- a/crypto/asn1/a_mbstr.c
+++ b/crypto/asn1/a_mbstr.c
@@ -208,11 +208,14 @@
         encode_func = cbb_add_utf32_be;
         size_estimate = 4 * nchar;
         outform = MBSTRING_UNIV;
-    } else {
+    } else if (mask & B_ASN1_UTF8STRING) {
         str_type = V_ASN1_UTF8STRING;
         outform = MBSTRING_UTF8;
         encode_func = cbb_add_utf8;
         size_estimate = utf8_len;
+    } else {
+        OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS);
+        return -1;
     }
 
     if (!out)
diff --git a/crypto/asn1/asn1_test.cc b/crypto/asn1/asn1_test.cc
index 76f87bf..6fa3f95 100644
--- a/crypto/asn1/asn1_test.cc
+++ b/crypto/asn1/asn1_test.cc
@@ -817,6 +817,212 @@
   }
 }
 
+TEST(ASN1Test, MBString) {
+  const unsigned long kAll = B_ASN1_PRINTABLESTRING | B_ASN1_IA5STRING |
+                             B_ASN1_T61STRING | B_ASN1_BMPSTRING |
+                             B_ASN1_UNIVERSALSTRING | B_ASN1_UTF8STRING;
+
+  const struct {
+    int format;
+    std::vector<uint8_t> in;
+    unsigned long mask;
+    int expected_type;
+    std::vector<uint8_t> expected_data;
+    int num_codepoints;
+  } kTests[] = {
+      // Given a choice of formats, we pick the smallest that fits.
+      {MBSTRING_UTF8, {}, kAll, V_ASN1_PRINTABLESTRING, {}, 0},
+      {MBSTRING_UTF8, {'a'}, kAll, V_ASN1_PRINTABLESTRING, {'a'}, 1},
+      {MBSTRING_UTF8, {'\n'}, kAll, V_ASN1_IA5STRING, {'\n'}, 1},
+      {MBSTRING_UTF8,
+       {0xc2, 0x80 /* U+0080 */},
+       kAll,
+       V_ASN1_T61STRING,
+       {0x80},
+       1},
+      {MBSTRING_UTF8,
+       {0xc4, 0x80 /* U+0100 */},
+       kAll,
+       V_ASN1_BMPSTRING,
+       {0x01, 0x00},
+       1},
+      {MBSTRING_UTF8,
+       {0xf0, 0x90, 0x80, 0x80 /* U+10000 */},
+       kAll,
+       V_ASN1_UNIVERSALSTRING,
+       {0x00, 0x01, 0x00, 0x00},
+       1},
+      {MBSTRING_UTF8,
+       {0xf0, 0x90, 0x80, 0x80 /* U+10000 */},
+       kAll & ~B_ASN1_UNIVERSALSTRING,
+       V_ASN1_UTF8STRING,
+       {0xf0, 0x90, 0x80, 0x80},
+       1},
+
+      // When a particular format is specified, we use it.
+      {MBSTRING_UTF8,
+       {'a'},
+       B_ASN1_PRINTABLESTRING,
+       V_ASN1_PRINTABLESTRING,
+       {'a'},
+       1},
+      {MBSTRING_UTF8, {'a'}, B_ASN1_IA5STRING, V_ASN1_IA5STRING, {'a'}, 1},
+      {MBSTRING_UTF8, {'a'}, B_ASN1_T61STRING, V_ASN1_T61STRING, {'a'}, 1},
+      {MBSTRING_UTF8, {'a'}, B_ASN1_UTF8STRING, V_ASN1_UTF8STRING, {'a'}, 1},
+      {MBSTRING_UTF8,
+       {'a'},
+       B_ASN1_BMPSTRING,
+       V_ASN1_BMPSTRING,
+       {0x00, 'a'},
+       1},
+      {MBSTRING_UTF8,
+       {'a'},
+       B_ASN1_UNIVERSALSTRING,
+       V_ASN1_UNIVERSALSTRING,
+       {0x00, 0x00, 0x00, 'a'},
+       1},
+
+      // A long string with characters of many widths, to test sizes are
+      // measured in code points.
+      {MBSTRING_UTF8,
+       {
+           'a',                     //
+           0xc2, 0x80,              // U+0080
+           0xc4, 0x80,              // U+0100
+           0xf0, 0x90, 0x80, 0x80,  // U+10000
+       },
+       B_ASN1_UNIVERSALSTRING,
+       V_ASN1_UNIVERSALSTRING,
+       {
+           0x00, 0x00, 0x00, 'a',   //
+           0x00, 0x00, 0x00, 0x80,  //
+           0x00, 0x00, 0x01, 0x00,  //
+           0x00, 0x01, 0x00, 0x00,  //
+       },
+       4},
+  };
+  for (const auto &t : kTests) {
+    SCOPED_TRACE(t.format);
+    SCOPED_TRACE(Bytes(t.in));
+    SCOPED_TRACE(t.mask);
+
+    // Passing in nullptr should do a dry run.
+    EXPECT_EQ(t.expected_type,
+              ASN1_mbstring_copy(nullptr, t.in.data(), t.in.size(), t.format,
+                                 t.mask));
+
+    // Test allocating a new object.
+    ASN1_STRING *str = nullptr;
+    EXPECT_EQ(
+        t.expected_type,
+        ASN1_mbstring_copy(&str, t.in.data(), t.in.size(), t.format, t.mask));
+    ASSERT_TRUE(str);
+    EXPECT_EQ(t.expected_type, ASN1_STRING_type(str));
+    EXPECT_EQ(Bytes(t.expected_data),
+              Bytes(ASN1_STRING_get0_data(str), ASN1_STRING_length(str)));
+
+    // Test writing into an existing object.
+    ASN1_STRING_free(str);
+    str = ASN1_STRING_new();
+    ASSERT_TRUE(str);
+    ASN1_STRING *old_str = str;
+    EXPECT_EQ(
+        t.expected_type,
+        ASN1_mbstring_copy(&str, t.in.data(), t.in.size(), t.format, t.mask));
+    ASSERT_EQ(old_str, str);
+    EXPECT_EQ(t.expected_type, ASN1_STRING_type(str));
+    EXPECT_EQ(Bytes(t.expected_data),
+              Bytes(ASN1_STRING_get0_data(str), ASN1_STRING_length(str)));
+    ASN1_STRING_free(str);
+    str = nullptr;
+
+    // minsize and maxsize should be enforced, even in a dry run.
+    EXPECT_EQ(t.expected_type,
+              ASN1_mbstring_ncopy(nullptr, t.in.data(), t.in.size(), t.format,
+                                  t.mask, /*minsize=*/t.num_codepoints,
+                                  /*maxsize=*/t.num_codepoints));
+
+    EXPECT_EQ(t.expected_type,
+              ASN1_mbstring_ncopy(&str, t.in.data(), t.in.size(), t.format,
+                                  t.mask, /*minsize=*/t.num_codepoints,
+                                  /*maxsize=*/t.num_codepoints));
+    ASSERT_TRUE(str);
+    EXPECT_EQ(t.expected_type, ASN1_STRING_type(str));
+    EXPECT_EQ(Bytes(t.expected_data),
+              Bytes(ASN1_STRING_get0_data(str), ASN1_STRING_length(str)));
+    ASN1_STRING_free(str);
+    str = nullptr;
+
+    EXPECT_EQ(-1, ASN1_mbstring_ncopy(
+                      nullptr, t.in.data(), t.in.size(), t.format, t.mask,
+                      /*minsize=*/t.num_codepoints + 1, /*maxsize=*/0));
+    ERR_clear_error();
+    EXPECT_EQ(-1, ASN1_mbstring_ncopy(
+                      &str, t.in.data(), t.in.size(), t.format, t.mask,
+                      /*minsize=*/t.num_codepoints + 1, /*maxsize=*/0));
+    EXPECT_FALSE(str);
+    ERR_clear_error();
+    if (t.num_codepoints > 1) {
+      EXPECT_EQ(-1, ASN1_mbstring_ncopy(
+                        nullptr, t.in.data(), t.in.size(), t.format, t.mask,
+                        /*minsize=*/0, /*maxsize=*/t.num_codepoints - 1));
+      ERR_clear_error();
+      EXPECT_EQ(-1, ASN1_mbstring_ncopy(
+                        &str, t.in.data(), t.in.size(), t.format, t.mask,
+                        /*minsize=*/0, /*maxsize=*/t.num_codepoints - 1));
+      EXPECT_FALSE(str);
+      ERR_clear_error();
+    }
+  }
+
+  const struct {
+    int format;
+    std::vector<uint8_t> in;
+    unsigned long mask;
+  } kInvalidTests[] = {
+      // Invalid encodings are rejected.
+      {MBSTRING_UTF8, {0xff}, B_ASN1_UTF8STRING},
+      {MBSTRING_BMP, {0xff}, B_ASN1_UTF8STRING},
+      {MBSTRING_UNIV, {0xff}, B_ASN1_UTF8STRING},
+
+      // Lone surrogates are not code points.
+      {MBSTRING_UTF8, {0xed, 0xa0, 0x80}, B_ASN1_UTF8STRING},
+      {MBSTRING_BMP, {0xd8, 0x00}, B_ASN1_UTF8STRING},
+      {MBSTRING_UNIV, {0x00, 0x00, 0xd8, 0x00}, B_ASN1_UTF8STRING},
+
+      // The input does not fit in the allowed output types.
+      {MBSTRING_UTF8, {'\n'}, B_ASN1_PRINTABLESTRING},
+      {MBSTRING_UTF8,
+       {0xc2, 0x80 /* U+0080 */},
+       B_ASN1_PRINTABLESTRING | B_ASN1_IA5STRING},
+      {MBSTRING_UTF8,
+       {0xc4, 0x80 /* U+0100 */},
+       B_ASN1_PRINTABLESTRING | B_ASN1_IA5STRING | B_ASN1_T61STRING},
+      {MBSTRING_UTF8,
+       {0xf0, 0x90, 0x80, 0x80 /* U+10000 */},
+       B_ASN1_PRINTABLESTRING | B_ASN1_IA5STRING | B_ASN1_T61STRING |
+           B_ASN1_BMPSTRING},
+
+      // Unrecognized bits are ignored.
+      {MBSTRING_UTF8, {'\n'}, B_ASN1_PRINTABLESTRING | B_ASN1_SEQUENCE},
+  };
+  for (const auto &t : kInvalidTests) {
+    SCOPED_TRACE(t.format);
+    SCOPED_TRACE(Bytes(t.in));
+    SCOPED_TRACE(t.mask);
+
+    EXPECT_EQ(-1, ASN1_mbstring_copy(nullptr, t.in.data(), t.in.size(),
+                                     t.format, t.mask));
+    ERR_clear_error();
+
+    ASN1_STRING *str = nullptr;
+    EXPECT_EQ(-1, ASN1_mbstring_copy(&str, t.in.data(), t.in.size(),
+                                     t.format, t.mask));
+    ERR_clear_error();
+    EXPECT_EQ(nullptr, str);
+  }
+}
+
 // The ASN.1 macros do not work on Windows shared library builds, where usage of
 // |OPENSSL_EXPORT| is a bit stricter.
 #if !defined(OPENSSL_WINDOWS) || !defined(BORINGSSL_SHARED_LIBRARY)
diff --git a/include/openssl/asn1.h b/include/openssl/asn1.h
index 4cf71ad..1d30300 100644
--- a/include/openssl/asn1.h
+++ b/include/openssl/asn1.h
@@ -337,6 +337,14 @@
 #define MBSTRING_BMP (MBSTRING_FLAG | 2)
 #define MBSTRING_UNIV (MBSTRING_FLAG | 4)
 
+// DIRSTRING_TYPE contains the valid string types in an X.509 DirectoryString.
+#define DIRSTRING_TYPE                                            \
+  (B_ASN1_PRINTABLESTRING | B_ASN1_T61STRING | B_ASN1_BMPSTRING | \
+   B_ASN1_UTF8STRING)
+
+// PKCS9STRING_TYPE contains the valid string types in a PKCS9String.
+#define PKCS9STRING_TYPE (DIRSTRING_TYPE | B_ASN1_IA5STRING)
+
 // ASN1_mbstring_copy converts |len| bytes from |in| to an ASN.1 string. If
 // |len| is -1, |in| must be NUL-terminated and the length is determined by
 // |strlen|. |in| is decoded according to |inform|, which must be one of
@@ -347,23 +355,21 @@
 // first output type in |mask| which can represent |in|. It interprets T61String
 // as Latin-1, rather than T.61.
 //
+// If |mask| is zero, |DIRSTRING_TYPE| is used by default.
+//
 // On success, this function returns the |V_ASN1_*| constant corresponding to
 // the selected output type and, if |out| and |*out| are both non-NULL, updates
 // the object at |*out| with the result. If |out| is non-NULL and |*out| is
 // NULL, it instead sets |*out| to a newly-allocated |ASN1_STRING| containing
 // the result. If |out| is NULL, it returns the selected output type without
 // constructing an |ASN1_STRING|. On error, this function returns -1.
-//
-// TODO(davidben): If no format in |mask| can represent |in|, the function
-// currently falls back to |B_ASN1_UTF8STRING|. This is probably a bug and will
-// cause functions like |ASN1_STRING_set_by_NID| to output the wrong type.
-// Return an error instead.
 OPENSSL_EXPORT int ASN1_mbstring_copy(ASN1_STRING **out, const uint8_t *in,
                                       int len, int inform, unsigned long mask);
 
 // ASN1_mbstring_ncopy behaves like |ASN1_mbstring_copy| but returns an error if
-// the output would be less than |minsize| or greater than |maxsize|. A
-// |maxsize| value of zero is ignored.
+// the input is less than |minsize| or greater than |maxsize| codepoints long. A
+// |maxsize| value of zero is ignored. Note the sizes are measured in
+// codepoints, not output bytes.
 OPENSSL_EXPORT int ASN1_mbstring_ncopy(ASN1_STRING **out, const uint8_t *in,
                                        int len, int inform, unsigned long mask,
                                        long minsize, long maxsize);
@@ -863,10 +869,6 @@
 
 #define STABLE_FLAGS_MALLOC 0x01
 #define STABLE_NO_MASK 0x02
-#define DIRSTRING_TYPE                                            \
-  (B_ASN1_PRINTABLESTRING | B_ASN1_T61STRING | B_ASN1_BMPSTRING | \
-   B_ASN1_UTF8STRING)
-#define PKCS9STRING_TYPE (DIRSTRING_TYPE | B_ASN1_IA5STRING)
 
 typedef struct asn1_string_table_st {
   int nid;