Fix the endianness of DTLS 1.3 ChaCha20 record number encryption

The spec is a little bit unclear, but it passes the counter portion as
bytes:

>  Mask = ChaCha20(sn_key, Ciphertext[0..3], Ciphertext[4..15])

And then RFC 8439 says:

>  A 32-bit block count parameter, treated as a 32-bit little-endian
>  integer.

So I believe this means that, formally, the block count parameter is a
[4]uint8, not a uint32, and then ChaCha20 internally reads it as
little-endian. Our API takes a uint32, so it is the caller's
responsibility to pick little-endian.

This also matches the QUIC construction.

While I'm here, avoid an unnecessary two-byte allocation on every DTLS
1.3 record decryption. Functions like these generally can generally work
in-place.

Bug: 42290594
Change-Id: I879944ca533d37a1599d2170a00193caecd01f42
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/71547
Reviewed-by: Bob Beck <bbe@google.com>
Commit-Queue: Bob Beck <bbe@google.com>
Auto-Submit: David Benjamin <davidben@google.com>
diff --git a/ssl/dtls_record.cc b/ssl/dtls_record.cc
index 8ca7b63..ba06acf 100644
--- a/ssl/dtls_record.cc
+++ b/ssl/dtls_record.cc
@@ -528,8 +528,13 @@
     // |0|0|1|0|1|1|E E|
     // +-+-+-+-+-+-+-+-+
     out[0] = 0x2c | (epoch & 0x3);
+    // We always use a two-byte sequence number. A one-byte sequence number
+    // would require coordinating with the application on ACK feedback to know
+    // that the peer is not too far behind.
     out[1] = *seq >> 8;
     out[2] = *seq & 0xff;
+    // TODO(crbug.com/42290594): When we know the record is last in the packet,
+    // omit the length.
     out[3] = ciphertext_len >> 8;
     out[4] = ciphertext_len & 0xff;
     // DTLS 1.3 uses the sequence number without the epoch for the AEAD.
diff --git a/ssl/ssl_aead_ctx.cc b/ssl/ssl_aead_ctx.cc
index 4f532e9..5079ca2 100644
--- a/ssl/ssl_aead_ctx.cc
+++ b/ssl/ssl_aead_ctx.cc
@@ -483,20 +483,17 @@
 
 bool ChaChaRecordNumberEncrypter::GenerateMask(Span<uint8_t> out,
                                                Span<const uint8_t> sample) {
-  Array<uint8_t> zeroes;
-  if (!zeroes.Init(out.size())) {
-    return false;
-  }
-  OPENSSL_memset(zeroes.data(), 0, zeroes.size());
   // RFC 9147 section 4.2.3 uses the first 4 bytes of the sample as the counter
   // and the next 12 bytes as the nonce. If we have less than 4+12=16 bytes in
-  // the sample, then we'll read past the end of the |sample| buffer.
+  // the sample, then we'll read past the end of the |sample| buffer. The
+  // counter is interpreted as little-endian per RFC 8439.
   if (sample.size() < 16) {
     return false;
   }
-  uint32_t counter = CRYPTO_load_u32_be(sample.data());
+  uint32_t counter = CRYPTO_load_u32_le(sample.data());
   Span<const uint8_t> nonce = sample.subspan(4);
-  CRYPTO_chacha_20(out.data(), zeroes.data(), zeroes.size(), key_, nonce.data(),
+  OPENSSL_memset(out.data(), 0, out.size());
+  CRYPTO_chacha_20(out.data(), out.data(), out.size(), key_, nonce.data(),
                    counter);
   return true;
 }
diff --git a/ssl/test/runner/conn.go b/ssl/test/runner/conn.go
index 38b1354..9229302 100644
--- a/ssl/test/runner/conn.go
+++ b/ssl/test/runner/conn.go
@@ -776,20 +776,18 @@
 }
 
 func (c *chachaRecordNumberEncrypter) generateMask(sample []byte) []byte {
-	var counter uint32
-	nonce := make([]byte, 12)
+	var counter, nonce []byte
 	sampleReader := cryptobyte.String(sample)
-	if !sampleReader.ReadUint32(&counter) || !sampleReader.CopyBytes(nonce) {
+	if !sampleReader.ReadBytes(&counter, 4) || !sampleReader.ReadBytes(&nonce, 12) {
 		panic("chachaRecordNumberEncrypter.GenerateMask called with wrong size sample")
 	}
 	cipher, err := chacha20.NewUnauthenticatedCipher(c.key, nonce)
 	if err != nil {
 		panic("Failed to create chacha20 cipher for record number encryption")
 	}
-	cipher.SetCounter(counter)
-	zeroes := make([]byte, 2)
+	cipher.SetCounter(binary.LittleEndian.Uint32(counter))
 	out := make([]byte, 2)
-	cipher.XORKeyStream(out, zeroes)
+	cipher.XORKeyStream(out, out)
 	return out
 }