Change |CRYPTO_chacha_20| to use 96-bit nonces, 32-bit counters.

The new function |CRYPTO_chacha_96_bit_nonce_from_64_bit_nonce| can be
used to adapt code that uses 64-bit nonces in a way that is compatible
with the old semantics.
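
For reference, the adaptation amounts to zero-prefixing the old 64-bit
nonce, matching the pattern used in e_chacha20poly1305.c and rand.c
below. The helper here is an illustrative sketch under that assumption,
not the library's definition (which is not shown in this diff):

    #include <stdint.h>
    #include <string.h>

    /* Sketch: the old layout put a 64-bit counter in words 12-13 and the
     * 64-bit nonce in words 14-15. With a 32-bit counter, word 13 becomes
     * the first nonce word; it was zero whenever the old counter fit in
     * 32 bits, so prefixing four zero bytes preserves the old keystream. */
    static void chacha_96_bit_nonce_from_64_bit_nonce(uint8_t out[12],
                                                      const uint8_t nonce[8]) {
      memset(out, 0, 4);         /* old high counter word, assumed zero */
      memcpy(out + 4, nonce, 8); /* old 64-bit nonce fills nonce words 1-2 */
    }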

Change-Id: I83d5b2d482e006e82982f58c9f981e8078c3e1b0
Reviewed-on: https://boringssl-review.googlesource.com/6100
Reviewed-by: Adam Langley <alangley@gmail.com>
diff --git a/crypto/chacha/chacha_generic.c b/crypto/chacha/chacha_generic.c
index 31cf4f0..f262033 100644
--- a/crypto/chacha/chacha_generic.c
+++ b/crypto/chacha/chacha_generic.c
@@ -54,8 +54,8 @@
 #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM)
 /* Defined in chacha_vec.c */
 void CRYPTO_chacha_20_neon(uint8_t *out, const uint8_t *in, size_t in_len,
-                           const uint8_t key[32], const uint8_t nonce[8],
-                           size_t counter);
+                           const uint8_t key[32], const uint8_t nonce[12],
+                           uint32_t counter);
 #endif
 
 /* chacha_core performs 20 rounds of ChaCha on the input words in
@@ -85,8 +85,8 @@
 }
 
 void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len,
-                      const uint8_t key[32], const uint8_t nonce[8],
-                      size_t counter) {
+                      const uint8_t key[32], const uint8_t nonce[12],
+                      uint32_t counter) {
   uint32_t input[16];
   uint8_t buf[64];
   size_t todo, i;
@@ -114,9 +114,9 @@
   input[11] = U8TO32_LITTLE(key + 28);
 
   input[12] = counter;
-  input[13] = ((uint64_t)counter) >> 32;
-  input[14] = U8TO32_LITTLE(nonce + 0);
-  input[15] = U8TO32_LITTLE(nonce + 4);
+  input[13] = U8TO32_LITTLE(nonce + 0);
+  input[14] = U8TO32_LITTLE(nonce + 4);
+  input[15] = U8TO32_LITTLE(nonce + 8);
 
   while (in_len > 0) {
     todo = sizeof(buf);
@@ -134,9 +134,6 @@
     in_len -= todo;
 
     input[12]++;
-    if (input[12] == 0) {
-      input[13]++;
-    }
   }
 }
 
diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c
index 14b54a7..addbaa3 100644
--- a/crypto/chacha/chacha_vec.c
+++ b/crypto/chacha/chacha_vec.c
@@ -154,12 +154,12 @@
 	const uint8_t *in,
 	size_t inlen,
 	const uint8_t key[32],
-	const uint8_t nonce[8],
-	size_t counter)
+	const uint8_t nonce[12],
+	uint32_t counter)
 	{
 	unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp;
 #if defined(__ARM_NEON__)
-	uint32_t np[2];
+	uint32_t np[3];
 	uint8_t alignment_buffer[16] __attribute__((aligned(16)));
 #endif
 	vec s0, s1, s2, s3;
@@ -167,20 +167,16 @@
 		{0x61707865,0x3320646E,0x79622D32,0x6B206574};
 	kp = (unsigned *)key;
 #if defined(__ARM_NEON__)
-	memcpy(np, nonce, 8);
+	memcpy(np, nonce, 12);
 #endif
 	s0 = LOAD_ALIGNED(chacha_const);
 	s1 = LOAD(&((vec*)kp)[0]);
 	s2 = LOAD(&((vec*)kp)[1]);
 	s3 = (vec){
-		counter & 0xffffffff,
-#if __ARM_NEON__ || defined(OPENSSL_X86)
-		0,  /* can't right-shift 32 bits on a 32-bit system. */
-#else
-		counter >> 32,
-#endif
+		counter,
 		((uint32_t*)nonce)[0],
-		((uint32_t*)nonce)[1]
+		((uint32_t*)nonce)[1],
+		((uint32_t*)nonce)[2]
 	};
 
 	for (iters = 0; iters < inlen/(BPI*64); iters++)
@@ -212,8 +208,8 @@
 		x2 = chacha_const[2]; x3 = chacha_const[3];
 		x4 = kp[0]; x5 = kp[1]; x6  = kp[2]; x7  = kp[3];
 		x8 = kp[4]; x9 = kp[5]; x10 = kp[6]; x11 = kp[7];
-		x12 = counter+BPI*iters+(BPI-1); x13 = 0;
-		x14 = np[0]; x15 = np[1];
+		x12 = counter+BPI*iters+(BPI-1); x13 = np[0];
+		x14 = np[1]; x15 = np[2];
 #endif
 		for (i = CHACHA_RNDS/2; i; i--)
 			{
@@ -265,9 +261,9 @@
 		op[10] = REVW_BE(REVW_BE(ip[10]) ^ (x10 + kp[6]));
 		op[11] = REVW_BE(REVW_BE(ip[11]) ^ (x11 + kp[7]));
 		op[12] = REVW_BE(REVW_BE(ip[12]) ^ (x12 + counter+BPI*iters+(BPI-1)));
-		op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13));
-		op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[0]));
-		op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[1]));
+		op[13] = REVW_BE(REVW_BE(ip[13]) ^ (x13 + np[0]));
+		op[14] = REVW_BE(REVW_BE(ip[14]) ^ (x14 + np[1]));
+		op[15] = REVW_BE(REVW_BE(ip[15]) ^ (x15 + np[2]));
 		s3 += ONE;
 		ip += 16;
 		op += 16;
diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S
index dfd596c..f18c867 100644
--- a/crypto/chacha/chacha_vec_arm.S
+++ b/crypto/chacha/chacha_vec_arm.S
@@ -60,137 +60,138 @@
 	.thumb_func
 	.type	CRYPTO_chacha_20_neon, %function
 CRYPTO_chacha_20_neon:
-	@ args = 8, pretend = 0, frame = 152
+	@ args = 8, pretend = 0, frame = 160
 	@ frame_needed = 1, uses_anonymous_args = 0
 	push	{r4, r5, r6, r7, r8, r9, r10, fp, lr}
-	mov	r8, r3
+	mov	r9, r3
 	vpush.64	{d8, d9, d10, d11, d12, d13, d14, d15}
-	mov	r9, r2
+	mov	r10, r2
 	ldr	r4, .L91+16
-	mov	fp, r0
-	mov	r10, r1
-	mov	lr, r8
+	mov	fp, r1
+	mov	r8, r9
 .LPIC16:
 	add	r4, pc
-	sub	sp, sp, #156
+	sub	sp, sp, #164
 	add	r7, sp, #0
 	sub	sp, sp, #112
-	add	r6, r7, #144
-	str	r0, [r7, #88]
+	add	lr, r7, #148
+	str	r0, [r7, #80]
 	str	r1, [r7, #12]
 	str	r2, [r7, #8]
 	ldmia	r4, {r0, r1, r2, r3}
 	add	r4, sp, #15
 	bic	r4, r4, #15
-	ldr	ip, [r7, #256]
-	str	r4, [r7, #84]
+	ldr	r6, [r7, #264]
+	str	r4, [r7, #88]
 	mov	r5, r4
 	adds	r4, r4, #64
-	adds	r5, r5, #80
-	str	r8, [r7, #68]
+	add	ip, r5, #80
+	str	r9, [r7, #56]
 	stmia	r4, {r0, r1, r2, r3}
 	movw	r4, #43691
-	ldr	r0, [ip]	@ unaligned
+	ldr	r0, [r6]	@ unaligned
 	movt	r4, 43690
-	ldr	r1, [ip, #4]	@ unaligned
-	ldr	r3, [r7, #84]
-	ldr	r2, [r8, #8]	@ unaligned
-	mov	r8, #0
-	stmia	r6!, {r0, r1}
-	mov	r6, r5
-	ldr	r1, [lr, #4]	@ unaligned
-	ldr	r0, [lr]	@ unaligned
-	vldr	d24, [r3, #64]
-	vldr	d25, [r3, #72]
-	ldr	r3, [lr, #12]	@ unaligned
-	str	r5, [r7, #80]
-	stmia	r5!, {r0, r1, r2, r3}
-	ldr	r0, [lr, #16]!	@ unaligned
-	ldr	r2, [r7, #84]
-	umull	r4, r5, r9, r4
+	ldr	r1, [r6, #4]	@ unaligned
+	ldr	r2, [r6, #8]	@ unaligned
+	ldr	r3, [r9, #12]	@ unaligned
+	str	ip, [r7, #84]
+	stmia	lr!, {r0, r1, r2}
+	mov	lr, ip
+	ldr	r1, [r9, #4]	@ unaligned
+	ldr	r2, [r9, #8]	@ unaligned
+	ldr	r0, [r9]	@ unaligned
+	vldr	d24, [r5, #64]
+	vldr	d25, [r5, #72]
+	umull	r4, r5, r10, r4
+	stmia	ip!, {r0, r1, r2, r3}
+	ldr	r0, [r8, #16]!	@ unaligned
+	ldr	r2, [r7, #88]
+	ldr	r4, [r7, #268]
+	ldr	r1, [r8, #4]	@ unaligned
 	vldr	d26, [r2, #80]
 	vldr	d27, [r2, #88]
-	ldr	r1, [lr, #4]	@ unaligned
-	ldr	r2, [lr, #8]	@ unaligned
-	ldr	r3, [lr, #12]	@ unaligned
-	ldr	r4, [r7, #260]
-	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r3, [ip]
-	ldr	r1, [r7, #84]
-	ldr	r2, [ip, #4]
-	str	r3, [r7, #64]
-	vldr	d28, [r1, #80]
-	vldr	d29, [r1, #88]
-	str	r3, [r7, #136]
+	ldr	r3, [r8, #12]	@ unaligned
+	ldr	r2, [r8, #8]	@ unaligned
+	stmia	lr!, {r0, r1, r2, r3}
+	ldr	r3, [r6]
+	ldr	r1, [r6, #4]
+	ldr	r6, [r6, #8]
+	str	r3, [r7, #68]
+	str	r3, [r7, #132]
 	lsrs	r3, r5, #7
+	str	r6, [r7, #140]
+	str	r6, [r7, #60]
+	ldr	r6, [r7, #88]
 	str	r4, [r7, #128]
-	str	r2, [r7, #140]
-	str	r8, [r7, #132]
-	str	r2, [r7, #60]
+	str	r1, [r7, #136]
+	str	r1, [r7, #64]
+	vldr	d28, [r6, #80]
+	vldr	d29, [r6, #88]
 	vldr	d22, [r7, #128]
 	vldr	d23, [r7, #136]
 	beq	.L26
+	mov	r5, r6
 	lsls	r2, r3, #8
-	ldr	r5, [r1, #64]
 	sub	r3, r2, r3, lsl #6
-	ldr	r2, [r1, #68]
+	ldr	r2, [r5, #68]
+	ldr	r6, [r6, #64]
 	vldr	d0, .L91
 	vldr	d1, .L91+8
-	adds	r4, r4, #2
-	str	r5, [r7, #56]
-	str	r2, [r7, #52]
-	ldr	r5, [r1, #72]
-	ldr	r2, [r1, #76]
+	str	r2, [r7, #48]
+	ldr	r2, [r5, #72]
 	str	r3, [r7, #4]
-	str	r5, [r7, #48]
+	str	r6, [r7, #52]
 	str	r2, [r7, #44]
-	mov	r2, fp
-	str	r4, [r7, #72]
+	adds	r2, r4, #2
+	str	r2, [r7, #72]
+	ldr	r2, [r5, #76]
+	str	fp, [r7, #76]
+	str	r2, [r7, #40]
+	ldr	r2, [r7, #80]
 	adds	r3, r2, r3
-	str	r10, [r7, #76]
 	str	r3, [r7, #16]
 .L4:
-	ldr	r5, [r7, #68]
-	add	r8, r7, #44
-	ldr	r4, [r7, #72]
+	ldr	r5, [r7, #56]
+	add	r8, r7, #40
+	ldr	r4, [r7, #68]
 	vadd.i32	q3, q11, q0
 	ldmia	r8, {r8, r9, r10, fp}
-	vmov	q8, q14  @ v4si
+	mov	r1, r5
 	ldr	r2, [r5, #4]
-	vmov	q1, q13  @ v4si
+	vmov	q8, q14  @ v4si
 	ldr	r3, [r5]
+	vmov	q1, q13  @ v4si
+	ldr	r6, [r1, #28]
 	vmov	q9, q12  @ v4si
-	ldr	lr, [r5, #20]
-	vmov	q2, q11  @ v4si
 	mov	r0, r2
 	ldr	r2, [r5, #8]
-	str	r3, [r7, #108]
-	mov	r3, r5
-	ldr	ip, [r5, #16]
-	vmov	q15, q14  @ v4si
-	mov	r1, r2
-	ldr	r2, [r5, #12]
-	ldr	r5, [r5, #24]
-	vmov	q5, q13  @ v4si
-	ldr	r6, [r3, #28]
-	vmov	q10, q12  @ v4si
-	ldr	r3, [r7, #64]
-	str	r5, [r7, #116]
-	movs	r5, #10
-	str	r6, [r7, #120]
 	str	r4, [r7, #112]
+	movs	r1, #10
+	ldr	r4, [r7, #72]
+	vmov	q2, q11  @ v4si
+	ldr	lr, [r5, #20]
+	vmov	q15, q14  @ v4si
+	str	r3, [r7, #108]
+	vmov	q5, q13  @ v4si
+	str	r2, [r7, #116]
+	vmov	q10, q12  @ v4si
+	ldr	r2, [r5, #12]
+	ldr	ip, [r5, #16]
+	ldr	r3, [r7, #64]
+	ldr	r5, [r5, #24]
+	str	r6, [r7, #120]
+	str	r1, [r7, #92]
 	ldr	r6, [r7, #60]
+	str	r4, [r7, #100]
+	ldr	r1, [r7, #116]
+	ldr	r4, [r7, #108]
 	str	r8, [r7, #96]
 	mov	r8, r10
-	ldr	r4, [r7, #108]
-	mov	r10, r9
-	ldr	r9, [r7, #116]
 	str	lr, [r7, #104]
+	mov	r10, r9
 	mov	lr, r3
-	str	r5, [r7, #92]
-	movs	r5, #0
+	mov	r9, r5
 	str	r6, [r7, #124]
-	str	r5, [r7, #100]
 	b	.L92
 .L93:
 	.align	3
@@ -213,25 +214,24 @@
 	str	r5, [r7, #116]
 	add	r10, r10, r1
 	vrev32.16	q3, q3
-	eor	lr, lr, r10
+	str	r6, [r7, #108]
 	vadd.i32	q8, q8, q3
 	vrev32.16	q2, q2
 	vadd.i32	q15, q15, q2
 	mov	fp, r3
-	ldr	r3, [r7, #112]
+	ldr	r3, [r7, #100]
 	veor	q4, q8, q1
-	str	r6, [r7, #112]
 	veor	q6, q15, q5
+	add	fp, fp, r2
 	eors	r3, r3, r5
 	mov	r5, r6
-	ldr	r6, [r7, #100]
+	ldr	r6, [r7, #112]
 	vshl.i32	q1, q4, #12
 	vshl.i32	q5, q6, #12
-	add	fp, fp, r2
-	eors	r6, r6, r5
 	ror	r3, r3, #16
+	eors	r6, r6, r5
+	eor	lr, lr, r10
 	vsri.32	q1, q4, #20
-	ror	lr, lr, #16
 	mov	r5, r6
 	ldr	r6, [r7, #124]
 	vsri.32	q5, q6, #20
@@ -239,25 +239,26 @@
 	eor	r6, r6, fp
 	ror	r5, r5, #16
 	vadd.i32	q9, q9, q1
-	add	r9, r9, lr
+	ror	lr, lr, #16
 	ror	r3, r6, #16
 	ldr	r6, [r7, #124]
 	vadd.i32	q10, q10, q5
-	str	r3, [r7, #108]
+	add	r9, r9, lr
 	veor	q4, q9, q3
 	add	ip, ip, r6
 	ldr	r6, [r7, #104]
 	veor	q6, q10, q2
 	eor	r4, ip, r4
-	eor	r1, r9, r1
+	str	r3, [r7, #104]
 	vshl.i32	q3, q4, #8
+	eor	r1, r9, r1
 	mov	r8, r6
 	ldr	r6, [r7, #120]
 	vshl.i32	q2, q6, #8
 	ror	r4, r4, #20
 	add	r6, r6, r3
 	vsri.32	q3, q4, #24
-	str	r6, [r7, #104]
+	str	r6, [r7, #100]
 	eors	r2, r2, r6
 	ldr	r6, [r7, #116]
 	vsri.32	q2, q6, #24
@@ -268,7 +269,7 @@
 	eor	r0, r8, r0
 	vadd.i32	q15, q15, q2
 	mov	r3, r6
-	ldr	r6, [r7, #112]
+	ldr	r6, [r7, #108]
 	veor	q6, q4, q1
 	ror	r0, r0, #20
 	str	r3, [r7, #112]
@@ -285,7 +286,7 @@
 	ror	r1, r1, #20
 	eors	r5, r5, r6
 	vsri.32	q8, q6, #25
-	ldr	r6, [r7, #108]
+	ldr	r6, [r7, #104]
 	ror	r3, r3, #24
 	ror	r5, r5, #24
 	vsri.32	q1, q5, #25
@@ -297,7 +298,7 @@
 	vext.32	q8, q8, q8, #1
 	str	ip, [r7, #124]
 	add	ip, r5, r8
-	ldr	r5, [r7, #104]
+	ldr	r5, [r7, #100]
 	eor	lr, r10, lr
 	ror	r6, r6, #24
 	vext.32	q1, q1, q1, #1
@@ -410,7 +411,7 @@
 	veor	q6, q15, q1
 	ldr	r3, [r7, #116]
 	vshl.i32	q1, q4, #7
-	str	r2, [r7, #112]
+	str	r2, [r7, #100]
 	add	r3, r3, r2
 	str	r3, [r7, #120]
 	vshl.i32	q5, q6, #7
@@ -423,7 +424,7 @@
 	vsri.32	q5, q6, #25
 	ldr	r3, [r7, #92]
 	ror	r4, r4, #25
-	str	r6, [r7, #100]
+	str	r6, [r7, #112]
 	ror	r0, r0, #25
 	subs	r3, r3, #1
 	str	r5, [r7, #104]
@@ -437,308 +438,325 @@
 	vext.32	q5, q5, q5, #3
 	vext.32	q1, q1, q1, #3
 	bne	.L3
-	ldr	r3, [r7, #80]
+	ldr	r3, [r7, #84]
 	vadd.i32	q4, q12, q10
-	str	r9, [r7, #116]
+	str	r9, [r7, #92]
 	mov	r9, r10
 	mov	r10, r8
 	ldr	r8, [r7, #96]
 	str	lr, [r7, #96]
 	mov	lr, r5
-	ldr	r5, [r7, #56]
+	ldr	r5, [r7, #52]
 	vadd.i32	q5, q13, q5
 	ldr	r6, [r7, #76]
 	vadd.i32	q15, q14, q15
 	add	fp, fp, r5
-	ldr	r5, [r7, #52]
-	str	r4, [r7, #108]
+	ldr	r5, [r7, #48]
+	str	r3, [r7, #104]
 	vadd.i32	q7, q14, q8
-	ldr	r4, [r7, #112]
-	add	r5, r10, r5
-	str	r3, [r7, #112]
-	vadd.i32	q2, q11, q2
 	ldr	r3, [r6, #12]	@ unaligned
-	vadd.i32	q6, q12, q9
-	str	r0, [r7, #92]
-	vadd.i32	q1, q13, q1
+	add	r10, r10, r5
+	str	r0, [r7, #36]
+	vadd.i32	q2, q11, q2
 	ldr	r0, [r6]	@ unaligned
+	vadd.i32	q6, q12, q9
+	ldr	r5, [r7, #104]
+	vadd.i32	q1, q13, q1
+	str	r1, [r7, #116]
 	vadd.i32	q11, q11, q0
-	str	r1, [r7, #40]
-	str	r2, [r7, #36]
-	vadd.i32	q3, q11, q3
 	ldr	r1, [r6, #4]	@ unaligned
-	vadd.i32	q11, q11, q0
+	str	r2, [r7, #32]
+	vadd.i32	q3, q11, q3
 	ldr	r2, [r6, #8]	@ unaligned
-	str	r5, [r7, #104]
 	vadd.i32	q11, q11, q0
-	ldr	r5, [r7, #112]
-	ldr	r10, [r7, #80]
+	str	r4, [r7, #108]
+	ldr	r4, [r7, #100]
+	vadd.i32	q11, q11, q0
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r0, [r7, #84]
-	ldr	r2, [r7, #48]
-	ldr	r3, [r7, #72]
-	vldr	d20, [r0, #80]
-	vldr	d21, [r0, #88]
-	add	r9, r9, r2
+	ldr	r2, [r7, #88]
+	ldr	r3, [r7, #44]
+	ldr	r5, [r7, #84]
+	vldr	d20, [r2, #80]
+	vldr	d21, [r2, #88]
+	add	r3, r9, r3
+	str	r3, [r7, #104]
 	veor	q10, q10, q4
-	ldr	r2, [r7, #44]
+	ldr	r3, [r7, #40]
+	add	r3, r8, r3
+	str	r3, [r7, #100]
+	ldr	r3, [r7, #72]
+	vstr	d20, [r2, #80]
+	vstr	d21, [r2, #88]
 	adds	r1, r4, r3
 	str	r1, [r7, #28]
-	add	r2, r8, r2
-	str	r2, [r7, #32]
-	vstr	d20, [r0, #80]
-	vstr	d21, [r0, #88]
 	ldmia	r5!, {r0, r1, r2, r3}
+	ldr	r4, [r7, #68]
+	ldr	r5, [r7, #112]
+	ldr	r8, [r7, #84]
+	add	r5, r5, r4
 	ldr	r4, [r7, #96]
+	str	r5, [r7, #24]
 	ldr	r5, [r7, #64]
 	add	r4, r4, r5
-	ldr	r5, [r7, #124]
+	ldr	r5, [r7, #60]
 	str	r4, [r7, #96]
-	ldr	r4, [r7, #60]
-	add	r5, r5, r4
-	ldr	r4, [r7, #88]
-	str	r5, [r7, #24]
-	mov	r5, r10
+	ldr	r4, [r7, #124]
+	add	r4, r4, r5
+	str	r4, [r7, #20]
+	ldr	r4, [r7, #80]
+	mov	r5, r8
 	str	r0, [r4]	@ unaligned
 	mov	r0, r4
 	str	r1, [r4, #4]	@ unaligned
-	mov	r8, r0
+	mov	r4, r8
 	str	r2, [r0, #8]	@ unaligned
-	mov	r4, r10
+	mov	r8, r0
 	str	r3, [r0, #12]	@ unaligned
+	mov	r9, r4
 	ldr	r0, [r6, #16]!	@ unaligned
+	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
-	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r3, [r7, #84]
+	mov	r5, r8
+	ldr	r3, [r7, #88]
 	vldr	d20, [r3, #80]
 	vldr	d21, [r3, #88]
 	veor	q10, q10, q5
 	vstr	d20, [r3, #80]
 	vstr	d21, [r3, #88]
 	ldmia	r4!, {r0, r1, r2, r3}
-	mov	r4, r8
+	mov	r4, r9
 	str	r0, [r8, #16]	@ unaligned
 	str	r1, [r8, #20]	@ unaligned
 	str	r2, [r8, #24]	@ unaligned
 	str	r3, [r8, #28]	@ unaligned
-	mov	r8, r4
+	mov	r8, r5
 	ldr	r0, [r6, #32]!	@ unaligned
-	str	r10, [r7, #124]
+	mov	r5, r9
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r2, [r7, #84]
-	vldr	d16, [r2, #80]
-	vldr	d17, [r2, #88]
+	mov	r5, r8
+	ldr	r1, [r7, #88]
+	vldr	d16, [r1, #80]
+	vldr	d17, [r1, #88]
 	veor	q15, q8, q15
-	vstr	d30, [r2, #80]
-	vstr	d31, [r2, #88]
-	ldmia	r10!, {r0, r1, r2, r3}
-	str	r0, [r4, #32]	@ unaligned
-	str	r1, [r4, #36]	@ unaligned
-	str	r2, [r4, #40]	@ unaligned
-	str	r3, [r4, #44]	@ unaligned
+	vstr	d30, [r1, #80]
+	vstr	d31, [r1, #88]
+	ldmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r9
+	str	r0, [r8, #32]	@ unaligned
+	str	r1, [r8, #36]	@ unaligned
+	str	r2, [r8, #40]	@ unaligned
+	str	r3, [r8, #44]	@ unaligned
+	mov	r8, r5
 	ldr	r0, [r6, #48]!	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r6, [r7, #76]
-	stmia	r5!, {r0, r1, r2, r3}
-	ldr	r1, [r7, #84]
+	stmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r9
+	ldr	r1, [r7, #88]
+	str	r9, [r7, #112]
 	vldr	d18, [r1, #80]
 	vldr	d19, [r1, #88]
 	veor	q9, q9, q2
 	vstr	d18, [r1, #80]
 	vstr	d19, [r1, #88]
-	ldr	r3, [r7, #112]
-	ldr	r5, [r7, #80]
-	mov	r10, r3
-	ldmia	r10!, {r0, r1, r2, r3}
-	str	r0, [r4, #48]	@ unaligned
-	str	r1, [r4, #52]	@ unaligned
-	str	r2, [r4, #56]	@ unaligned
-	str	r3, [r4, #60]	@ unaligned
+	ldmia	r9!, {r0, r1, r2, r3}
+	str	r0, [r5, #48]	@ unaligned
+	str	r1, [r5, #52]	@ unaligned
+	str	r2, [r5, #56]	@ unaligned
+	str	r3, [r5, #60]	@ unaligned
 	ldr	r0, [r6, #64]!	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r6, [r7, #76]
-	stmia	r5!, {r0, r1, r2, r3}
-	ldr	r1, [r7, #84]
-	ldr	r3, [r7, #112]
-	ldr	r5, [r7, #80]
+	mov	r9, r6
+	mov	r6, r4
+	stmia	r6!, {r0, r1, r2, r3}
+	mov	r6, r4
+	ldr	r1, [r7, #88]
 	vldr	d18, [r1, #80]
 	vldr	d19, [r1, #88]
 	veor	q9, q9, q6
-	mov	r10, r3
-	str	r5, [r7, #20]
 	vstr	d18, [r1, #80]
 	vstr	d19, [r1, #88]
-	ldmia	r10!, {r0, r1, r2, r3}
-	str	r1, [r4, #68]	@ unaligned
-	str	r2, [r4, #72]	@ unaligned
-	str	r3, [r4, #76]	@ unaligned
-	str	r0, [r4, #64]	@ unaligned
-	ldr	r0, [r6, #80]!	@ unaligned
-	ldr	r1, [r6, #4]	@ unaligned
-	ldr	r2, [r6, #8]	@ unaligned
-	ldr	r3, [r6, #12]	@ unaligned
+	ldmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r6
+	str	r3, [r5, #76]	@ unaligned
+	mov	r3, r9
+	str	r2, [r5, #72]	@ unaligned
+	str	r0, [r5, #64]	@ unaligned
+	str	r1, [r5, #68]	@ unaligned
+	mov	r5, r4
+	ldr	r0, [r3, #80]!	@ unaligned
+	mov	r9, r3
+	ldr	r1, [r9, #4]	@ unaligned
+	ldr	r2, [r9, #8]	@ unaligned
+	ldr	r3, [r9, #12]	@ unaligned
+	mov	r9, r4
 	ldr	r6, [r7, #76]
+	str	r9, [r7, #124]
 	stmia	r5!, {r0, r1, r2, r3}
-	ldr	r1, [r7, #84]
-	ldr	r3, [r7, #20]
-	ldr	r5, [r7, #80]
+	mov	r5, r8
+	ldr	r1, [r7, #88]
 	vldr	d18, [r1, #80]
 	vldr	d19, [r1, #88]
 	veor	q1, q9, q1
-	mov	r10, r3
 	vstr	d2, [r1, #80]
 	vstr	d3, [r1, #88]
-	ldmia	r10!, {r0, r1, r2, r3}
-	mov	r10, r5
-	str	r0, [r4, #80]	@ unaligned
-	str	r1, [r4, #84]	@ unaligned
-	str	r2, [r4, #88]	@ unaligned
-	str	r3, [r4, #92]	@ unaligned
+	ldmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r9
+	str	r0, [r8, #80]	@ unaligned
+	str	r1, [r8, #84]	@ unaligned
+	str	r2, [r8, #88]	@ unaligned
+	str	r3, [r8, #92]	@ unaligned
 	ldr	r0, [r6, #96]!	@ unaligned
+	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
-	ldr	r3, [r6, #12]	@ unaligned
 	ldr	r6, [r7, #76]
-	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r3, [r7, #84]
+	stmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r9
+	ldr	r3, [r7, #88]
 	vldr	d16, [r3, #80]
 	vldr	d17, [r3, #88]
 	veor	q8, q8, q7
 	vstr	d16, [r3, #80]
 	vstr	d17, [r3, #88]
-	ldmia	r10!, {r0, r1, r2, r3}
-	str	r0, [r4, #96]	@ unaligned
-	str	r1, [r4, #100]	@ unaligned
-	str	r2, [r4, #104]	@ unaligned
-	str	r3, [r4, #108]	@ unaligned
+	ldmia	r9!, {r0, r1, r2, r3}
+	str	r0, [r5, #96]	@ unaligned
+	str	r1, [r5, #100]	@ unaligned
+	str	r2, [r5, #104]	@ unaligned
+	str	r3, [r5, #108]	@ unaligned
 	ldr	r0, [r6, #112]!	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	mov	r6, r5
+	mov	r6, r4
 	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r3, [r7, #84]
+	mov	r6, r5
+	ldr	r3, [r7, #88]
 	vldr	d16, [r3, #80]
 	vldr	d17, [r3, #88]
 	veor	q8, q8, q3
 	vstr	d16, [r3, #80]
 	vstr	d17, [r3, #88]
-	ldmia	r5!, {r0, r1, r2, r3}
-	str	r1, [r4, #116]	@ unaligned
-	ldr	r1, [r7, #76]
-	str	r0, [r4, #112]	@ unaligned
-	str	r2, [r4, #120]	@ unaligned
-	str	r3, [r4, #124]	@ unaligned
-	ldr	r3, [r1, #128]
-	ldr	r2, [r7, #104]
+	ldmia	r4!, {r0, r1, r2, r3}
+	mov	r4, r5
+	mov	r8, r4
+	str	r2, [r5, #120]	@ unaligned
+	ldr	r2, [r7, #76]
+	str	r0, [r5, #112]	@ unaligned
+	str	r1, [r5, #116]	@ unaligned
+	str	r3, [r5, #124]	@ unaligned
+	ldr	r3, [r2, #128]
+	ldr	r1, [r7, #104]
 	eor	r3, fp, r3
-	str	r3, [r4, #128]
-	ldr	r3, [r1, #132]
-	eors	r2, r2, r3
-	str	r2, [r8, #132]
-	ldr	r3, [r1, #136]
-	ldr	r5, [r7, #68]
-	ldr	r6, [r7, #32]
-	eor	r3, r9, r3
-	str	r3, [r4, #136]
-	ldr	r3, [r1, #140]
-	ldr	r0, [r7, #92]
-	eors	r3, r3, r6
-	ldr	r6, [r7, #108]
+	str	r3, [r5, #128]
+	ldr	r3, [r2, #132]
+	mov	r5, r2
+	eor	r3, r10, r3
+	str	r3, [r6, #132]
+	ldr	r3, [r2, #136]
+	mov	r6, r5
+	eors	r1, r1, r3
+	str	r1, [r8, #136]
+	ldr	r1, [r7, #56]
+	ldr	r3, [r2, #140]
+	ldr	r2, [r7, #100]
+	ldr	r0, [r7, #108]
+	eors	r3, r3, r2
 	str	r3, [r4, #140]
-	ldr	r3, [r5]
-	ldr	r2, [r1, #144]
-	add	r6, r6, r3
-	eors	r2, r2, r6
+	ldr	r3, [r1]
+	ldr	r2, [r5, #144]
+	mov	r8, r0
+	add	r8, r8, r3
+	mov	r5, r6
+	mov	r3, r8
+	eors	r2, r2, r3
 	str	r2, [r4, #144]
-	ldr	r2, [r5, #4]
-	ldr	r3, [r1, #148]
-	add	r0, r0, r2
+	ldr	r3, [r6, #148]
+	ldr	r2, [r1, #4]
 	ldr	r6, [r7, #36]
-	eors	r3, r3, r0
-	ldr	r0, [r7, #40]
-	str	r3, [r4, #148]
-	ldr	r2, [r5, #8]
-	ldr	r3, [r1, #152]
-	add	r0, r0, r2
-	eors	r3, r3, r0
-	str	r3, [r4, #152]
-	ldr	r2, [r5, #12]
-	mov	r0, r4
-	ldr	r3, [r1, #156]
-	mov	r4, r1
 	add	r6, r6, r2
-	mov	r1, r0
 	eors	r3, r3, r6
-	str	r3, [r0, #156]
-	ldr	r2, [r5, #16]
-	ldr	r3, [r4, #160]
+	mov	r6, r1
+	str	r3, [r4, #148]
+	ldr	r2, [r1, #8]
+	ldr	r1, [r7, #116]
+	ldr	r3, [r5, #152]
+	mov	r8, r1
+	add	r8, r8, r2
+	ldr	r1, [r7, #32]
+	mov	r2, r8
+	eors	r3, r3, r2
+	str	r3, [r4, #152]
+	mov	r8, r4
+	ldr	r2, [r6, #12]
+	ldr	r3, [r5, #156]
+	add	r1, r1, r2
+	eors	r3, r3, r1
+	str	r3, [r4, #156]
+	ldr	r2, [r6, #16]
+	mov	r1, r4
+	ldr	r3, [r5, #160]
+	mov	r4, r5
 	add	ip, ip, r2
+	mov	r5, r6
 	eor	r3, ip, r3
 	str	r3, [r1, #160]
-	ldr	r2, [r5, #20]
+	ldr	r2, [r6, #20]
 	ldr	r3, [r4, #164]
 	add	lr, lr, r2
-	ldr	r2, [r7, #116]
+	ldr	r2, [r7, #92]
 	eor	r3, lr, r3
 	str	r3, [r1, #164]
 	ldr	r6, [r5, #24]
 	mov	lr, r4
 	ldr	r3, [r4, #168]
 	add	r2, r2, r6
-	mov	r6, r4
+	ldr	r6, [r7, #120]
 	eors	r3, r3, r2
 	str	r3, [r1, #168]
 	ldr	r5, [r5, #28]
-	mov	r2, r1
 	ldr	r3, [r4, #172]
-	ldr	r0, [r7, #120]
-	add	r0, r0, r5
-	ldr	r5, [r7, #24]
-	eors	r3, r3, r0
+	add	r6, r6, r5
+	eors	r3, r3, r6
 	str	r3, [r1, #172]
-	ldr	r3, [r7, #72]
 	ldr	r4, [r4, #176]
-	ldr	r1, [r7, #28]
-	eors	r4, r4, r1
-	adds	r1, r3, #3
-	str	r4, [r2, #176]
-	ldr	r3, [r7, #100]
+	ldr	r0, [r7, #28]
+	ldr	r5, [r7, #24]
+	eors	r4, r4, r0
+	str	r4, [r8, #176]
 	ldr	r0, [lr, #180]
-	str	r1, [r7, #72]
-	eors	r3, r3, r0
-	mov	r0, r3
-	mov	r3, r2
-	str	r0, [r2, #180]
-	adds	r3, r3, #192
-	ldr	r1, [lr, #184]
 	ldr	r2, [r7, #96]
+	eors	r0, r0, r5
+	str	r0, [r8, #180]
+	ldr	r1, [lr, #184]
+	ldr	r4, [r7, #20]
 	eors	r1, r1, r2
-	str	r1, [r3, #-8]
+	str	r1, [r8, #184]
 	ldr	r2, [lr, #188]
-	mov	r1, r6
-	adds	r1, r1, #192
-	str	r1, [r7, #76]
-	eors	r2, r2, r5
-	str	r2, [r3, #-4]
+	add	r1, lr, #192
+	ldr	r3, [r7, #72]
+	eors	r2, r2, r4
+	str	r2, [r8, #188]
 	ldr	r2, [r7, #16]
-	str	r3, [r7, #88]
+	adds	r3, r3, #3
+	str	r3, [r7, #72]
+	mov	r3, r8
+	adds	r3, r3, #192
+	str	r1, [r7, #76]
 	cmp	r2, r3
+	str	r3, [r7, #80]
 	bne	.L4
 	ldr	r3, [r7, #12]
 	ldr	r2, [r7, #4]
@@ -757,8 +775,8 @@
 	beq	.L6
 	ldr	r5, [r7, #12]
 	ldr	r4, [r7, #16]
-	ldr	r6, [r7, #84]
-	ldr	lr, [r7, #80]
+	ldr	r6, [r7, #88]
+	ldr	lr, [r7, #84]
 	vldr	d30, .L94
 	vldr	d31, .L94+8
 	str	fp, [r7, #120]
@@ -964,7 +982,7 @@
 	mov	r9, r5
 	bhi	.L88
 	vadd.i32	q12, q12, q10
-	ldr	r3, [r7, #84]
+	ldr	r3, [r7, #88]
 	vst1.64	{d24-d25}, [r3:128]
 .L14:
 	ldr	r3, [r7, #8]
@@ -1001,7 +1019,7 @@
 	movcs	r1, ip
 	cmp	r1, #0
 	beq	.L17
-	ldr	r5, [r7, #84]
+	ldr	r5, [r7, #88]
 	cmp	r1, #1
 	ldrb	r0, [r0]	@ zero_extendqisi2
 	add	r3, r2, #1
@@ -1136,7 +1154,7 @@
 	ldr	r5, [r7, #16]
 	cmp	r6, #1
 	add	r0, r1, r2
-	ldr	r1, [r7, #84]
+	ldr	r1, [r7, #88]
 	add	r1, r1, r2
 	vld1.64	{d18-d19}, [r0:64]
 	add	r2, r2, r5
@@ -1174,7 +1192,7 @@
 	add	r3, r3, lr
 	beq	.L1
 .L19:
-	ldr	r4, [r7, #84]
+	ldr	r4, [r7, #88]
 	adds	r2, r3, #1
 	ldr	r1, [r7, #12]
 	cmp	r2, r9
@@ -1289,7 +1307,7 @@
 	eor	r1, r1, r0
 	strb	r1, [r5, r2]
 	bls	.L1
-	ldr	r2, [r7, #84]
+	ldr	r2, [r7, #88]
 	ldrb	r1, [r2, r3]	@ zero_extendqisi2
 	ldr	r2, [r7, #12]
 	ldrb	r2, [r2, r3]	@ zero_extendqisi2
@@ -1297,7 +1315,7 @@
 	ldr	r1, [r7, #16]
 	strb	r2, [r1, r3]
 .L1:
-	adds	r7, r7, #156
+	adds	r7, r7, #164
 	mov	sp, r7
 	@ sp needed
 	vldm	sp!, {d8-d15}
@@ -1305,7 +1323,7 @@
 .L88:
 	ldr	r5, [r7, #12]
 	vadd.i32	q12, q12, q10
-	ldr	r4, [r7, #80]
+	ldr	r4, [r7, #84]
 	cmp	r9, #31
 	ldr	r0, [r5]	@ unaligned
 	ldr	r1, [r5, #4]	@ unaligned
@@ -1313,7 +1331,7 @@
 	ldr	r2, [r5, #8]	@ unaligned
 	ldr	r3, [r5, #12]	@ unaligned
 	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r2, [r7, #84]
+	ldr	r2, [r7, #88]
 	ldr	r6, [r7, #16]
 	vldr	d18, [r2, #80]
 	vldr	d19, [r2, #88]
@@ -1328,7 +1346,7 @@
 	str	r3, [r6, #12]	@ unaligned
 	bhi	.L89
 	vadd.i32	q13, q13, q15
-	ldr	r3, [r7, #84]
+	ldr	r3, [r7, #88]
 	vstr	d26, [r3, #16]
 	vstr	d27, [r3, #24]
 	b	.L14
@@ -1337,7 +1355,7 @@
 	ldr	r2, [r7, #12]
 	add	r2, r2, r9
 	mov	r5, r2
-	ldr	r2, [r7, #84]
+	ldr	r2, [r7, #88]
 	add	r2, r2, r3
 	mov	r3, r2
 .L24:
@@ -1347,17 +1365,18 @@
 	eor	r2, r2, r1
 	strb	r2, [r4], #1
 	bne	.L24
-	adds	r7, r7, #156
+	adds	r7, r7, #164
 	mov	sp, r7
 	@ sp needed
 	vldm	sp!, {d8-d15}
 	pop	{r4, r5, r6, r7, r8, r9, r10, fp, pc}
 .L26:
-	str	fp, [r7, #16]
+	ldr	r3, [r7, #80]
+	str	r3, [r7, #16]
 	b	.L2
 .L89:
 	mov	r3, r5
-	ldr	r4, [r7, #80]
+	ldr	r4, [r7, #84]
 	ldr	r0, [r3, #16]!	@ unaligned
 	add	lr, r1, #16
 	mov	r5, r1
@@ -1368,7 +1387,7 @@
 	ldr	r2, [r3, #8]	@ unaligned
 	ldr	r3, [r3, #12]	@ unaligned
 	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r2, [r7, #84]
+	ldr	r2, [r7, #88]
 	vldr	d18, [r2, #80]
 	vldr	d19, [r2, #88]
 	veor	q13, q9, q13
@@ -1381,16 +1400,16 @@
 	str	r3, [lr, #12]	@ unaligned
 	bhi	.L90
 	vadd.i32	q8, q14, q8
-	ldr	r3, [r7, #84]
+	ldr	r3, [r7, #88]
 	vstr	d16, [r3, #32]
 	vstr	d17, [r3, #40]
 	b	.L14
 .L90:
 	ldr	r3, [r7, #12]
 	add	lr, r5, #32
-	ldr	r4, [r7, #80]
+	ldr	r4, [r7, #84]
 	vadd.i32	q8, q14, q8
-	ldr	r5, [r7, #84]
+	ldr	r5, [r7, #88]
 	vadd.i32	q11, q11, q3
 	ldr	r0, [r3, #32]!	@ unaligned
 	mov	r6, r4
diff --git a/crypto/cipher/e_chacha20poly1305.c b/crypto/cipher/e_chacha20poly1305.c
index 9dda1b0..34446b4 100644
--- a/crypto/cipher/e_chacha20poly1305.c
+++ b/crypto/cipher/e_chacha20poly1305.c
@@ -26,7 +26,6 @@
 
 
 #define POLY1305_TAG_LEN 16
-#define CHACHA20_NONCE_LEN 8
 
 struct aead_chacha20_poly1305_ctx {
   unsigned char key[32];
@@ -99,8 +98,7 @@
   poly1305_state poly1305;
   const uint64_t in_len_64 = in_len;
 
-  /* The underlying ChaCha implementation may not overflow the block
-   * counter into the second counter word. Therefore we disallow
+  /* |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow
    * individual operations that work on more than 256GB at a time.
    * |in_len_64| is needed because, on 32-bit platforms, size_t is only
    * 32-bits and this produces a warning because it's always false.
@@ -121,18 +119,21 @@
     return 0;
   }
 
-  if (nonce_len != CHACHA20_NONCE_LEN) {
+  if (nonce_len != 8) {
     OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_IV_TOO_LARGE);
     return 0;
   }
+  uint8_t nonce_96[12];
+  memset(nonce_96, 0, 4);
+  memcpy(nonce_96 + 4, nonce, 8);
 
   memset(poly1305_key, 0, sizeof(poly1305_key));
   CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key),
-                   c20_ctx->key, nonce, 0);
+                   c20_ctx->key, nonce_96, 0);
 
   CRYPTO_poly1305_init(&poly1305, poly1305_key);
   poly1305_update_with_length(&poly1305, ad, ad_len);
-  CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce, 1);
+  CRYPTO_chacha_20(out, in, in_len, c20_ctx->key, nonce_96, 1);
   poly1305_update_with_length(&poly1305, out, in_len);
 
   uint8_t tag[POLY1305_TAG_LEN] ALIGNED;
@@ -159,8 +160,7 @@
     return 0;
   }
 
-  /* The underlying ChaCha implementation may not overflow the block
-   * counter into the second counter word. Therefore we disallow
+  /* |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow
    * individual operations that work on more than 256GB at a time.
    * |in_len_64| is needed because, on 32-bit platforms, size_t is only
    * 32-bits and this produces a warning because it's always false.
@@ -171,10 +171,13 @@
     return 0;
   }
 
-  if (nonce_len != CHACHA20_NONCE_LEN) {
+  if (nonce_len != 8) {
     OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_IV_TOO_LARGE);
     return 0;
   }
+  uint8_t nonce_96[12];
+  memset(nonce_96, 0, 4);
+  memcpy(nonce_96 + 4, nonce, 8);
 
   plaintext_len = in_len - c20_ctx->tag_len;
 
@@ -185,7 +188,7 @@
 
   memset(poly1305_key, 0, sizeof(poly1305_key));
   CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key),
-                   c20_ctx->key, nonce, 0);
+                   c20_ctx->key, nonce_96, 0);
 
   CRYPTO_poly1305_init(&poly1305, poly1305_key);
   poly1305_update_with_length(&poly1305, ad, ad_len);
@@ -197,14 +200,14 @@
     return 0;
   }
 
-  CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce, 1);
+  CRYPTO_chacha_20(out, in, plaintext_len, c20_ctx->key, nonce_96, 1);
   *out_len = plaintext_len;
   return 1;
 }
 
 static const EVP_AEAD aead_chacha20_poly1305 = {
     32,                 /* key len */
-    CHACHA20_NONCE_LEN, /* nonce len */
+    8,                  /* nonce len */
     POLY1305_TAG_LEN,   /* overhead */
     POLY1305_TAG_LEN,   /* max tag length */
     aead_chacha20_poly1305_init,
diff --git a/crypto/rand/rand.c b/crypto/rand/rand.c
index 8076b78..8b11728 100644
--- a/crypto/rand/rand.c
+++ b/crypto/rand/rand.c
@@ -159,17 +159,21 @@
       if (todo > kMaxBytesPerCall) {
         todo = kMaxBytesPerCall;
       }
-      CRYPTO_chacha_20(buf, buf, todo, state->key,
-                       (uint8_t *)&state->calls_used, 0);
+      uint8_t nonce[12];
+      memset(nonce, 0, 4);
+      memcpy(nonce + 4, &state->calls_used, sizeof(state->calls_used));
+      CRYPTO_chacha_20(buf, buf, todo, state->key, nonce, 0);
       buf += todo;
       remaining -= todo;
       state->calls_used++;
     }
   } else {
     if (sizeof(state->partial_block) - state->partial_block_used < len) {
+      uint8_t nonce[12];
+      memset(nonce, 0, 4);
+      memcpy(nonce + 4, &state->calls_used, sizeof(state->calls_used));
       CRYPTO_chacha_20(state->partial_block, state->partial_block,
-                       sizeof(state->partial_block), state->key,
-                       (uint8_t *)&state->calls_used, 0);
+                       sizeof(state->partial_block), state->key, nonce, 0);
       state->partial_block_used = 0;
     }
 
diff --git a/include/openssl/chacha.h b/include/openssl/chacha.h
index b7f5882..64713c2 100644
--- a/include/openssl/chacha.h
+++ b/include/openssl/chacha.h
@@ -27,7 +27,7 @@
  * initial block counter is specified by |counter|. */
 OPENSSL_EXPORT void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in,
                                      size_t in_len, const uint8_t key[32],
-                                     const uint8_t nonce[8], size_t counter);
+                                     const uint8_t nonce[12], uint32_t counter);
 
 
 #if defined(__cplusplus)