Handle ChaCha20 counter overflow consistently

The assembly functions from OpenSSL vary in how the counter overflow
works. The aarch64 implementation uses a mix of 32-bit and 64-bit
counters. This is because, when packing a block into 64-bit
general-purpose registers, it is easier to implement a 64-bit counter
than a 32-bit one. Whereas, on 32-bit general-purpose registers, or when
using vector registers with 32-bit lanes, it is easier to implement a
32-bit counter.

Counters will never overflow with the AEAD, which sets the length limit
so it never happens. (Failing to do so will reuse a key/nonce/counter
triple.) RFC 8439 is silent on what happens on overflow, so at best one
can say it is implicitly undefined behavior.

This came about because pyca/cryptography reportedly exposed a ChaCha20
API which encouraged callers to randomize the starting counter. Wrapping
with a randomized starting counter isn't inherently wrong, though it is
pointless and goes against how the spec recommends using the initial
counter value.

Nonetheless, we would prefer our functions behave consistently across
platforms, rather than silently give ill-defined output given some
inputs. So, normalize the behavior to the wrapping version in
CRYPTO_chacha_20 by dividing up into multiple ChaCha20_ctr32 calls as
needed.

Fixed: 614
Change-Id: I191461f25753b9f6b59064c6c08cd4299085e172
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/60387
Commit-Queue: Adam Langley <agl@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/chacha/chacha.c b/crypto/chacha/chacha.c
index 1092b7a..a4d88c0 100644
--- a/crypto/chacha/chacha.c
+++ b/crypto/chacha/chacha.c
@@ -91,7 +91,25 @@
   }
 #endif
 
-  ChaCha20_ctr32(out, in, in_len, key_ptr, counter_nonce);
+  while (in_len > 0) {
+    // The assembly functions do not have defined overflow behavior. While
+    // overflow is almost always a bug in the caller, we prefer our functions to
+    // behave the same across platforms, so divide into multiple calls to avoid
+    // this case.
+    uint64_t todo = 64 * ((UINT64_C(1) << 32) - counter_nonce[0]);
+    if (todo > in_len) {
+      todo = in_len;
+    }
+
+    ChaCha20_ctr32(out, in, (size_t)todo, key_ptr, counter_nonce);
+    in += todo;
+    out += todo;
+    in_len -= todo;
+
+    // We're either done and will next break out of the loop, or we stopped at
+    // the wraparound point and the counter should continue at zero.
+    counter_nonce[0] = 0;
+  }
 }
 
 #else
diff --git a/crypto/chacha/chacha_test.cc b/crypto/chacha/chacha_test.cc
index 25313ca..00683ce 100644
--- a/crypto/chacha/chacha_test.cc
+++ b/crypto/chacha/chacha_test.cc
@@ -218,8 +218,102 @@
     0x8f, 0x40, 0xcf, 0x4a,
 };
 
+static uint32_t kOverflowCounter = 0xffffffff;
+
+static const uint8_t kOverflowOutput[] = {
+    0x37, 0x64, 0x38, 0xcb, 0x25, 0x69, 0x2c, 0xf5, 0x88, 0x8a, 0xfe, 0x6d,
+    0x3b, 0x10, 0x07, 0x3c, 0x77, 0xac, 0xcd, 0x1c, 0x0c, 0xa7, 0x17, 0x31,
+    0x1d, 0xc3, 0x81, 0xd1, 0xa5, 0x20, 0x55, 0xea, 0xd3, 0x00, 0xc9, 0x84,
+    0xde, 0xe2, 0xe5, 0x5e, 0x7b, 0x28, 0x28, 0x59, 0x73, 0x3a, 0x8e, 0x57,
+    0x62, 0x18, 0x50, 0x55, 0x97, 0xca, 0x50, 0x3e, 0x8a, 0x84, 0x61, 0x28,
+    0x4c, 0x22, 0x93, 0x50, 0x48, 0x7e, 0x65, 0x78, 0x06, 0x5a, 0xcd, 0x2b,
+    0x11, 0xf7, 0x10, 0xfd, 0x6f, 0x41, 0x92, 0x82, 0x7c, 0x3a, 0x71, 0x07,
+    0x67, 0xd0, 0x7e, 0xb7, 0xdf, 0xdc, 0xfc, 0xee, 0xe6, 0x55, 0xdd, 0x6f,
+    0x79, 0x23, 0xf3, 0xae, 0xb1, 0x21, 0x96, 0xbe, 0xea, 0x0e, 0x1b, 0x58,
+    0x0b, 0x3f, 0x63, 0x51, 0xd4, 0xce, 0x98, 0xfe, 0x1a, 0xc7, 0xa7, 0x43,
+    0x7f, 0x0c, 0xe8, 0x62, 0xcf, 0x78, 0x3f, 0x4e, 0x31, 0xbf, 0x2b, 0x76,
+    0x91, 0xcd, 0x19, 0x80, 0x0d, 0x7f, 0x11, 0x8b, 0x76, 0xef, 0x43, 0x3c,
+    0x4f, 0x61, 0x86, 0xc5, 0x64, 0xa8, 0xc2, 0x73, 0xc2, 0x64, 0x39, 0xa0,
+    0x8b, 0xe6, 0x7f, 0xf6, 0x26, 0xd4, 0x47, 0x4f, 0xe4, 0x46, 0xe2, 0xf5,
+    0x9e, 0xe6, 0xc7, 0x76, 0x6c, 0xa9, 0x0f, 0x1d, 0x1b, 0x22, 0xa5, 0x62,
+    0x0a, 0x88, 0x3e, 0x8c, 0xf0, 0xbc, 0x4c, 0x11, 0x3f, 0x0d, 0xf7, 0x85,
+    0x67, 0x0b, 0x4c, 0xa3, 0x3f, 0xa8, 0xf1, 0x2a, 0x65, 0x2e, 0x00, 0x03,
+    0xc9, 0x49, 0x91, 0x48, 0xb7, 0xc8, 0x29, 0x28, 0x2f, 0x46, 0x8e, 0x8b,
+    0xd6, 0x73, 0x19, 0x06, 0x3e, 0x6f, 0x92, 0xc8, 0x3d, 0x3f, 0x4d, 0x68,
+    0xbc, 0x02, 0xc0, 0x8f, 0x71, 0x46, 0x0d, 0x28, 0x63, 0xfe, 0xad, 0x14,
+    0x81, 0x04, 0xb7, 0x23, 0xfd, 0x21, 0x0a, 0xf0, 0x6f, 0xcd, 0x47, 0x0b,
+    0x0e, 0x93, 0xa3, 0xa8, 0x44, 0x15, 0xd6, 0xae, 0x06, 0x44, 0x6b, 0xbc,
+    0xff, 0x8a, 0x56, 0x60, 0x3c, 0x38, 0xd6, 0xed, 0x03, 0x2d, 0x79, 0x2a,
+    0xe9, 0x15, 0xef, 0xfc, 0x92, 0x1f, 0x83, 0xa4, 0x60, 0x8f, 0xc9, 0x29,
+    0xb2, 0xb4, 0x9e, 0x3f, 0xa9, 0xe8, 0xfb, 0xa2, 0x62, 0x20, 0x2e, 0xc9,
+    0x43, 0xb2, 0xd1, 0x36, 0x85, 0x1e, 0xa4, 0xb3, 0x4f, 0x8c, 0x9e, 0x81,
+    0x75, 0x68, 0xbc, 0xf1, 0x52, 0xd5, 0x03, 0x22, 0xcf, 0xdf, 0x64, 0xb0,
+    0x28, 0xd2, 0x45, 0x18, 0x38, 0x8c, 0xd0, 0xf6, 0x30, 0x3c, 0x04, 0xd9,
+    0x8d, 0xb6, 0xb2, 0x57, 0x2a, 0xee, 0x28, 0xeb, 0x5f, 0x1a, 0x10, 0x6e,
+    0x88, 0x79, 0x08, 0x23, 0x19, 0x84, 0xf8, 0x80, 0x1a, 0x7d, 0x6f, 0x8b,
+    0xc1, 0x8e, 0x5f, 0x5f, 0x54, 0x14, 0x2a, 0xdc, 0x41, 0x5d, 0xeb, 0x00,
+    0xf2, 0x50, 0xae, 0xd3, 0x55, 0x32, 0xf6, 0xd9, 0x34, 0xf4, 0xb2, 0xf2,
+    0xf5, 0x90, 0x05, 0x8a, 0x9c, 0xc7, 0x94, 0x5d, 0x2d, 0x5a, 0x0f, 0xdd,
+    0x03, 0xde, 0xbe, 0x18, 0xb3, 0xe3, 0x07, 0x6b, 0x57, 0xfa, 0x1b, 0x7b,
+    0x75, 0xcb, 0xc2, 0x4d, 0xf7, 0x88, 0xfe, 0xf9, 0xc0, 0x6c, 0xdb, 0x5f,
+    0xf6, 0x48, 0x00, 0x4a, 0x5d, 0x75, 0xfa, 0x6b, 0x45, 0x43, 0xc4, 0x7f,
+    0x97, 0x31, 0x22, 0xb4, 0x9c, 0xa3, 0xee, 0x2f, 0x27, 0xa9, 0x9f, 0x0e,
+    0xdc, 0x40, 0x67, 0x17, 0x2e, 0xcb, 0xfd, 0x9e, 0xe7, 0xb2, 0x85, 0xcd,
+    0x49, 0x24, 0xc8, 0x8a, 0x59, 0x6b, 0x1f, 0xec, 0x72, 0x89, 0xf8, 0x30,
+    0xdf, 0x82, 0x61, 0x3b, 0x8b, 0xc9, 0x80, 0xe4, 0x27, 0x0d, 0xfe, 0x42,
+    0x27, 0x6c, 0xaf, 0x62, 0x3e, 0x2f, 0x1d, 0x38, 0xb6, 0x88, 0x8f, 0x71,
+    0x5a, 0x54, 0x6c, 0x68, 0x57, 0x40, 0x49, 0x7a, 0xb2, 0xe8, 0xb6, 0x97,
+    0xab, 0xd6, 0x3c, 0x35, 0xf3, 0x95, 0x12, 0xde, 0xa2, 0x39, 0x54, 0x52,
+    0x8c, 0x38, 0x2a, 0x2b, 0xe7, 0x21, 0x38, 0x63, 0xb0, 0xd6, 0xad, 0x94,
+    0x44, 0xaf, 0x49, 0x5d, 0xfc, 0x49, 0x6b, 0x30, 0xdf, 0xe9, 0x19, 0x1e,
+    0xed, 0x98, 0x0d, 0x4a, 0x3d, 0x56, 0x5e, 0x74, 0xad, 0x13, 0x8b, 0x68,
+    0x45, 0x08, 0xbe, 0x0e, 0x6c, 0xb4, 0x62, 0x93, 0x27, 0x8b, 0x4f, 0xab,
+    0x3e, 0xba, 0xe1, 0xe5, 0xff, 0xa8, 0x5d, 0x33, 0x32, 0xff, 0x34, 0xf9,
+    0x8d, 0x67, 0x24, 0x4a, 0xbb, 0x2c, 0x60, 0xb5, 0x88, 0x96, 0x1b, 0xcc,
+    0x53, 0xfb, 0x2e, 0x05, 0x1d, 0x8b, 0xc2, 0xa0, 0xde, 0x21, 0x41, 0x5e,
+    0x11, 0x1b, 0x96, 0xd9, 0xa6, 0xae, 0xbd, 0xf0, 0x91, 0xad, 0x69, 0x2b,
+    0xd2, 0x3f, 0xe4, 0x3d, 0x16, 0x69, 0xa6, 0xb2, 0x9c, 0xbe, 0x59, 0x7b,
+    0x87, 0x79, 0xf5, 0xc2, 0x5a, 0xcc, 0xdf, 0xfe, 0x7f, 0xf9, 0xa6, 0x52,
+    0xde, 0x5f, 0x46, 0x91, 0x21, 0x2c, 0x2c, 0x49, 0x25, 0x00, 0xd5, 0xe4,
+    0x81, 0x6b, 0x85, 0xad, 0x98, 0xaf, 0x06, 0x4a, 0x83, 0xb2, 0xe3, 0x42,
+    0x39, 0x31, 0x50, 0xe1, 0x2d, 0x22, 0xe6, 0x07, 0x24, 0x65, 0x29, 0x3f,
+    0x4c, 0xbd, 0x14, 0x8d, 0xfa, 0x31, 0xfa, 0xa4, 0xb5, 0x99, 0x04, 0xa2,
+    0xa5, 0xcc, 0x3b, 0x12, 0xb1, 0xaa, 0x6a, 0x17, 0x78, 0x8b, 0xb3, 0xe4,
+    0x3c, 0x4c, 0xc5, 0xaa, 0x79, 0x12, 0x17, 0xe0, 0x22, 0x4d, 0xf4, 0xa9,
+    0xd5, 0xd0, 0xed, 0xf8, 0xfe, 0x0a, 0x45, 0x80, 0x9f, 0x3b, 0x74, 0xa0,
+    0xb1, 0xda, 0x18, 0xfa, 0xc2, 0x7d, 0xf6, 0x18, 0x2e, 0xa9, 0x2b, 0x7e,
+    0x69, 0x06, 0x43, 0x2d, 0x62, 0x09, 0x42, 0x10, 0x9f, 0x83, 0xad, 0xd9,
+    0xdd, 0xcd, 0xcb, 0x1b, 0x33, 0x32, 0x3e, 0x1f, 0xf6, 0xac, 0x3b, 0xa3,
+    0x29, 0xd7, 0xc0, 0x88, 0xf9, 0xb7, 0x4c, 0xcd, 0x0a, 0x1f, 0xb8, 0x0f,
+    0xe6, 0xf7, 0xd7, 0x4d, 0x5f, 0x06, 0x12, 0x8a, 0x12, 0xa6, 0x2d, 0xbe,
+    0x5c, 0x57, 0xf8, 0x7f, 0x54, 0x3f, 0x90, 0x83, 0x2c, 0x0a, 0xc5, 0x3d,
+    0x03, 0x78, 0x8a, 0x68, 0xf0, 0xbd, 0xa5, 0x3e, 0xe7, 0x07, 0xab, 0xc8,
+    0x58, 0x2f, 0x5c, 0xfd, 0xb5, 0x39, 0xe3, 0xc6, 0x1c, 0x27, 0xf9, 0x0b,
+    0xc7, 0x4c, 0xcc, 0x67, 0x62, 0xe6, 0x79, 0xe8, 0xc1, 0x0a, 0x86, 0x8a,
+    0xb2, 0x32, 0x7b, 0x90, 0x36, 0x50, 0x92, 0x1f, 0x3e, 0x68, 0x39, 0x1c,
+    0x4d, 0x5d, 0xf8, 0x2b, 0xe8, 0x7d, 0xe2, 0x34, 0x61, 0x9e, 0xc3, 0x77,
+    0xb9, 0x4c, 0x34, 0x08, 0xda, 0x31, 0xc9, 0x1d, 0xbd, 0x3b, 0x7b, 0xf1,
+    0x14, 0xba, 0x3a, 0x34, 0x13, 0xaa, 0x5e, 0xa8, 0x36, 0xf6, 0xfe, 0xed,
+    0x5b, 0xef, 0xaf, 0x24, 0x42, 0xba, 0xfc, 0xc9, 0x30, 0x84, 0xec, 0x49,
+    0x14, 0xab, 0x58, 0x71, 0xfe, 0x4b, 0x6d, 0x7b, 0x9f, 0xbb, 0x3c, 0x83,
+    0xdf, 0x3a, 0xfb, 0x54, 0xff, 0x36, 0xaa, 0x6c, 0x47, 0x94, 0xc0, 0xde,
+    0x89, 0x2e, 0xac, 0x68, 0xee, 0xe8, 0xf4, 0xae, 0xa3, 0xe0, 0x91, 0x55,
+    0x0b, 0x0c, 0xd7, 0xf4, 0x33, 0xb5, 0xf9, 0xf2, 0x9e, 0xda, 0x78, 0xe5,
+    0x75, 0xec, 0xdb, 0xf6, 0xed, 0x27, 0x9f, 0x44, 0x19, 0x9f, 0xb7, 0xf0,
+    0xac, 0x1b, 0x3a, 0xf5, 0x77, 0xc7, 0x76, 0x1e, 0x3f, 0x78, 0x12, 0x48,
+    0x1d, 0xb8, 0xe0, 0x30, 0x29, 0x9a, 0x8c, 0x8f, 0x21, 0x44, 0x9c, 0x89,
+    0xec, 0x8e, 0xd0, 0x81, 0xf5, 0x6a, 0xd0, 0xac, 0x5e, 0xf0, 0x0f, 0x88,
+    0x86, 0x31, 0x2e, 0x15, 0x1e, 0x0d, 0x2d, 0xeb, 0x56, 0x30, 0x27, 0x02,
+    0x93, 0xf4, 0x07, 0x07, 0xba, 0xf7, 0xbd, 0xe8, 0x27, 0x4f, 0xc6, 0xd9,
+    0x57, 0x10, 0x3b, 0xf0, 0xff, 0x2f, 0x2d, 0x6b, 0xd0, 0x17, 0xb3, 0x49,
+    0xeb, 0xc2, 0x49, 0xdb,
+};
+
+
 static_assert(sizeof(kInput) == sizeof(kOutput),
               "Input and output lengths don't match.");
+static_assert(sizeof(kInput) == sizeof(kOverflowOutput),
+              "Input and output lengths don't match.");
 
 TEST(ChaChaTest, TestVector) {
   // Run the test with the test vector at all lengths.
@@ -237,6 +331,22 @@
   }
 }
 
+TEST(ChaChaTest, CounterOverflow) {
+  // Run the test with the test vector at all lengths.
+  for (size_t len = 0; len <= sizeof(kInput); len++) {
+    SCOPED_TRACE(len);
+
+    std::unique_ptr<uint8_t[]> buf(new uint8_t[len]);
+    CRYPTO_chacha_20(buf.get(), kInput, len, kKey, kNonce, kOverflowCounter);
+    EXPECT_EQ(Bytes(kOverflowOutput, len), Bytes(buf.get(), len));
+
+    // Test the in-place version.
+    OPENSSL_memcpy(buf.get(), kInput, len);
+    CRYPTO_chacha_20(buf.get(), buf.get(), len, kKey, kNonce, kOverflowCounter);
+    EXPECT_EQ(Bytes(kOverflowOutput, len), Bytes(buf.get(), len));
+  }
+}
+
 #if defined(CHACHA20_ASM) && defined(SUPPORTS_ABI_TEST)
 TEST(ChaChaTest, ABI) {
   uint32_t key[8];
diff --git a/crypto/chacha/internal.h b/crypto/chacha/internal.h
index 1435e3b..5f442ec 100644
--- a/crypto/chacha/internal.h
+++ b/crypto/chacha/internal.h
@@ -32,7 +32,14 @@
      defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))
 #define CHACHA20_ASM
 
-// ChaCha20_ctr32 is defined in asm/chacha-*.pl.
+// ChaCha20_ctr32 encrypts |in_len| bytes from |in| and writes the result to
+// |out|. If |in| and |out| alias, they must be equal.
+//
+// |counter[0]| is the initial 32-bit block counter, and the remainder is the
+// 96-bit nonce. If the counter overflows, the output is undefined. The function
+// will produce output, but the output may vary by machine and may not be
+// self-consistent. (On some architectures, the assembly implements a mix of
+// 64-bit and 32-bit counters.)
 void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
                     const uint32_t key[8], const uint32_t counter[4]);
 #endif
diff --git a/include/openssl/chacha.h b/include/openssl/chacha.h
index cfbaa75..2868c29 100644
--- a/include/openssl/chacha.h
+++ b/include/openssl/chacha.h
@@ -29,6 +29,12 @@
 // CRYPTO_chacha_20 encrypts |in_len| bytes from |in| with the given key and
 // nonce and writes the result to |out|. If |in| and |out| alias, they must be
 // equal. The initial block counter is specified by |counter|.
+//
+// This function implements a 32-bit block counter as in RFC 8439. On overflow,
+// the counter wraps. Reusing a key, nonce, and block counter combination is not
+// secure, so wrapping is usually a bug in the caller. While it is possible to
+// wrap without reuse with a large initial block counter, this is not
+// recommended and may not be portable to other ChaCha20 implementations.
 OPENSSL_EXPORT void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in,
                                      size_t in_len, const uint8_t key[32],
                                      const uint8_t nonce[12], uint32_t counter);