Don't require the ChaCha nonce to be aligned on ARM.

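Previously, chacha_vec.c cast the nonce pointer to unsigned * and read
32-bit words through it, so callers had to pass a 4-byte-aligned nonce.
On ARM the compiler may then emit word or multi-word loads with
alignment requirements that trap on unaligned addresses. Copying the
eight nonce bytes into a local array with memcpy instead is defined for
any source alignment, and the compiler lowers it to whatever loads are
safe. A minimal sketch of the pattern (the helper name is illustrative,
not code from this change):

    #include <stdint.h>
    #include <string.h>

    /* Read the 8-byte ChaCha nonce as two native words without
     * assuming anything about the alignment of |nonce|. */
    static void read_nonce_words(uint32_t np[2], const uint8_t *nonce) {
      memcpy(np, nonce, 8); /* well-defined for any alignment */
    }

The chacha_vec_arm.S diff below is the regenerated compiler output for
this chacha_vec.c change; the 16-byte-aligned alignment_buffer used for
partial blocks is unchanged.
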
Change-Id: I34ee66fcc53d3371591beee3373c46598c31b5c5
Reviewed-on: https://boringssl-review.googlesource.com/3460
Reviewed-by: David Benjamin <davidben@chromium.org>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/chacha/chacha_vec.c b/crypto/chacha/chacha_vec.c
index 2b0fd9c..b3f425f 100644
--- a/crypto/chacha/chacha_vec.c
+++ b/crypto/chacha/chacha_vec.c
@@ -159,7 +159,7 @@
 	{
 	unsigned iters, i, *op=(unsigned *)out, *ip=(unsigned *)in, *kp;
 #if defined(__ARM_NEON__)
-	unsigned *np;
+	unsigned np[2];
 	uint8_t alignment_buffer[16] __attribute__((aligned(16)));
 #endif
 	vec s0, s1, s2, s3;
@@ -167,7 +167,7 @@
 		{0x61707865,0x3320646E,0x79622D32,0x6B206574};
 	kp = (unsigned *)key;
 #if defined(__ARM_NEON__)
-	np = (unsigned*) nonce;
+	memcpy(np, nonce, 8);
 #endif
 	s0 = LOAD_ALIGNED(chacha_const);
 	s1 = LOAD(&((vec*)kp)[0]);
diff --git a/crypto/chacha/chacha_vec_arm.S b/crypto/chacha/chacha_vec_arm.S
index a1fb541..ddc374e 100644
--- a/crypto/chacha/chacha_vec_arm.S
+++ b/crypto/chacha/chacha_vec_arm.S
@@ -59,145 +59,147 @@
 	.thumb_func
 	.type	CRYPTO_chacha_20_neon, %function
 CRYPTO_chacha_20_neon:
-	@ args = 8, pretend = 0, frame = 128
+	@ args = 8, pretend = 0, frame = 152
 	@ frame_needed = 1, uses_anonymous_args = 0
 	push	{r4, r5, r6, r7, r8, r9, r10, fp, lr}
-	mov	ip, r3
+	mov	r8, r3
 	vpush.64	{d8, d9, d10, d11, d12, d13, d14, d15}
 	mov	r9, r2
-	ldr	r4, .L92+16
+	ldr	r4, .L91+16
 	mov	fp, r0
 	mov	r10, r1
-	mov	lr, ip
+	mov	lr, r8
 .LPIC16:
 	add	r4, pc
-	movw	r8, #43691
-	sub	sp, sp, #132
+	sub	sp, sp, #156
 	add	r7, sp, #0
 	sub	sp, sp, #112
-	movt	r8, 43690
-	str	r0, [r7, #60]
+	add	r6, r7, #144
+	str	r0, [r7, #88]
 	str	r1, [r7, #12]
 	str	r2, [r7, #8]
 	ldmia	r4, {r0, r1, r2, r3}
 	add	r4, sp, #15
 	bic	r4, r4, #15
-	str	ip, [r7, #44]
-	umull	r8, r9, r9, r8
-	mov	r6, r4
+	ldr	ip, [r7, #256]
+	str	r4, [r7, #84]
+	mov	r5, r4
 	adds	r4, r4, #64
-	add	r5, r6, #80
-	str	r5, [r7, #68]
+	adds	r5, r5, #80
+	str	r8, [r7, #68]
 	stmia	r4, {r0, r1, r2, r3}
-	movs	r4, #0
+	movw	r4, #43691
 	ldr	r0, [ip]	@ unaligned
+	movt	r4, 43690
 	ldr	r1, [ip, #4]	@ unaligned
-	ldr	r2, [ip, #8]	@ unaligned
-	ldr	r3, [ip, #12]	@ unaligned
-	vldr	d24, [r6, #64]
-	vldr	d25, [r6, #72]
-	str	r4, [r7, #116]
-	mov	r4, r5
+	ldr	r3, [r7, #84]
+	ldr	r2, [r8, #8]	@ unaligned
+	mov	r8, #0
+	stmia	r6!, {r0, r1}
+	mov	r6, r5
+	ldr	r1, [lr, #4]	@ unaligned
+	ldr	r0, [lr]	@ unaligned
+	vldr	d24, [r3, #64]
+	vldr	d25, [r3, #72]
+	ldr	r3, [lr, #12]	@ unaligned
+	str	r5, [r7, #80]
 	stmia	r5!, {r0, r1, r2, r3}
 	ldr	r0, [lr, #16]!	@ unaligned
-	ldr	r3, [r7, #232]
-	str	r6, [r7, #72]
-	ldr	r2, [lr, #8]	@ unaligned
-	ldr	ip, [r3]
-	ldr	r6, [r7, #236]
+	ldr	r2, [r7, #84]
+	umull	r4, r5, r9, r4
+	vldr	d26, [r2, #80]
+	vldr	d27, [r2, #88]
 	ldr	r1, [lr, #4]	@ unaligned
+	ldr	r2, [lr, #8]	@ unaligned
 	ldr	r3, [lr, #12]	@ unaligned
-	ldr	r5, [r7, #72]
-	vldr	d26, [r5, #80]
-	vldr	d27, [r5, #88]
-	str	ip, [r7, #120]
-	stmia	r4!, {r0, r1, r2, r3}
-	lsrs	r2, r9, #7
-	ldr	r3, [r7, #232]
-	str	r6, [r7, #112]
-	vldr	d28, [r5, #80]
-	vldr	d29, [r5, #88]
-	ldr	r3, [r3, #4]
-	str	r3, [r7, #124]
-	vldr	d22, [r7, #112]
-	vldr	d23, [r7, #120]
+	ldr	r4, [r7, #260]
+	stmia	r6!, {r0, r1, r2, r3}
+	ldr	r3, [ip]
+	ldr	r1, [r7, #84]
+	ldr	r2, [ip, #4]
+	str	r3, [r7, #64]
+	vldr	d28, [r1, #80]
+	vldr	d29, [r1, #88]
+	str	r3, [r7, #136]
+	lsrs	r3, r5, #7
+	str	r4, [r7, #128]
+	str	r2, [r7, #140]
+	str	r8, [r7, #132]
+	str	r2, [r7, #60]
+	vldr	d22, [r7, #128]
+	vldr	d23, [r7, #136]
 	beq	.L26
-	lsls	r1, r2, #8
-	ldr	r4, [r5, #64]
-	sub	r2, r1, r2, lsl #6
-	str	r2, [r7, #4]
-	vldr	d0, .L92
-	vldr	d1, .L92+8
-	mov	r0, fp
-	mov	r1, r2
-	ldr	r2, [r5, #68]
-	str	r4, [r7, #40]
-	ldr	r4, [r5, #72]
-	str	r2, [r7, #36]
-	ldr	r2, [r5, #76]
-	str	r4, [r7, #32]
-	adds	r4, r6, #2
-	str	r10, [r7, #64]
-	str	r2, [r7, #28]
-	adds	r2, r0, r1
-	mov	r1, ip
-	str	r2, [r7, #16]
-	mov	r2, r3
-	str	r4, [r7, #48]
+	lsls	r2, r3, #8
+	ldr	r5, [r1, #64]
+	sub	r3, r2, r3, lsl #6
+	ldr	r2, [r1, #68]
+	vldr	d0, .L91
+	vldr	d1, .L91+8
+	adds	r4, r4, #2
+	str	r5, [r7, #56]
+	str	r2, [r7, #52]
+	ldr	r5, [r1, #72]
+	ldr	r2, [r1, #76]
+	str	r3, [r7, #4]
+	str	r5, [r7, #48]
+	str	r2, [r7, #44]
+	mov	r2, fp
+	str	r4, [r7, #72]
+	adds	r3, r2, r3
+	str	r10, [r7, #76]
+	str	r3, [r7, #16]
 .L4:
-	ldr	r0, [r7, #44]
-	add	r8, r7, #28
-	str	r2, [r7, #108]
+	ldr	r5, [r7, #68]
+	add	r8, r7, #44
+	ldr	r4, [r7, #72]
 	vadd.i32	q3, q11, q0
 	ldmia	r8, {r8, r9, r10, fp}
 	vmov	q8, q14  @ v4si
-	ldr	r3, [r0]
+	ldr	r2, [r5, #4]
 	vmov	q1, q13  @ v4si
+	ldr	r3, [r5]
 	vmov	q9, q12  @ v4si
+	ldr	lr, [r5, #20]
 	vmov	q2, q11  @ v4si
-	str	r3, [r7, #52]
-	mov	r3, r0
-	ldr	r5, [r3, #8]
+	mov	r0, r2
+	ldr	r2, [r5, #8]
+	str	r3, [r7, #108]
+	mov	r3, r5
+	ldr	ip, [r5, #16]
 	vmov	q15, q14  @ v4si
-	ldr	lr, [r3, #20]
+	mov	r1, r2
+	ldr	r2, [r5, #12]
+	ldr	r5, [r5, #24]
 	vmov	q5, q13  @ v4si
-	ldr	r6, [r3, #12]
+	ldr	r6, [r3, #28]
 	vmov	q10, q12  @ v4si
-	str	r5, [r7, #92]
-	mov	r5, r3
-	ldr	r4, [r5, #28]
+	ldr	r3, [r7, #64]
+	str	r5, [r7, #116]
 	movs	r5, #10
-	ldr	ip, [r3, #16]
-	ldr	r3, [r3, #24]
-	str	r4, [r7, #104]
-	ldr	r4, [r7, #48]
-	str	r3, [r7, #100]
-	mov	r3, r1
-	str	r6, [r7, #56]
-	str	r4, [r7, #96]
-	str	r8, [r7, #80]
+	str	r6, [r7, #120]
+	str	r4, [r7, #112]
+	ldr	r6, [r7, #60]
+	str	r8, [r7, #96]
 	mov	r8, r10
-	ldr	r0, [r0, #4]
+	ldr	r4, [r7, #108]
 	mov	r10, r9
-	ldr	r1, [r7, #92]
-	ldr	r2, [r7, #56]
-	ldr	r9, [r7, #100]
-	ldr	r4, [r7, #52]
-	str	lr, [r7, #88]
+	ldr	r9, [r7, #116]
+	str	lr, [r7, #104]
 	mov	lr, r3
-	str	r5, [r7, #76]
+	str	r5, [r7, #92]
 	movs	r5, #0
-	str	r5, [r7, #84]
-	b	.L93
-.L94:
+	str	r6, [r7, #124]
+	str	r5, [r7, #100]
+	b	.L92
+.L93:
 	.align	3
-.L92:
+.L91:
 	.word	1
 	.word	0
 	.word	0
 	.word	0
 	.word	.LANCHOR0-(.LPIC16+4)
-.L93:
+.L92:
 .L3:
 	vadd.i32	q9, q9, q1
 	add	r3, r8, r0
@@ -206,8 +208,8 @@
 	veor	q3, q3, q9
 	mov	r6, r3
 	veor	q2, q2, q10
-	ldr	r3, [r7, #80]
-	str	r5, [r7, #100]
+	ldr	r3, [r7, #96]
+	str	r5, [r7, #116]
 	add	r10, r10, r1
 	vrev32.16	q3, q3
 	eor	lr, lr, r10
@@ -215,13 +217,13 @@
 	vrev32.16	q2, q2
 	vadd.i32	q15, q15, q2
 	mov	fp, r3
-	ldr	r3, [r7, #96]
+	ldr	r3, [r7, #112]
 	veor	q4, q8, q1
-	str	r6, [r7, #96]
+	str	r6, [r7, #112]
 	veor	q6, q15, q5
 	eors	r3, r3, r5
 	mov	r5, r6
-	ldr	r6, [r7, #84]
+	ldr	r6, [r7, #100]
 	vshl.i32	q1, q4, #12
 	vshl.i32	q5, q6, #12
 	add	fp, fp, r2
@@ -230,33 +232,33 @@
 	vsri.32	q1, q4, #20
 	ror	lr, lr, #16
 	mov	r5, r6
-	ldr	r6, [r7, #108]
+	ldr	r6, [r7, #124]
 	vsri.32	q5, q6, #20
-	str	r3, [r7, #108]
+	str	r3, [r7, #124]
 	eor	r6, r6, fp
 	ror	r5, r5, #16
 	vadd.i32	q9, q9, q1
 	add	r9, r9, lr
 	ror	r3, r6, #16
-	ldr	r6, [r7, #108]
+	ldr	r6, [r7, #124]
 	vadd.i32	q10, q10, q5
-	str	r3, [r7, #92]
+	str	r3, [r7, #108]
 	veor	q4, q9, q3
 	add	ip, ip, r6
-	ldr	r6, [r7, #88]
+	ldr	r6, [r7, #104]
 	veor	q6, q10, q2
 	eor	r4, ip, r4
 	eor	r1, r9, r1
 	vshl.i32	q3, q4, #8
 	mov	r8, r6
-	ldr	r6, [r7, #104]
+	ldr	r6, [r7, #120]
 	vshl.i32	q2, q6, #8
 	ror	r4, r4, #20
 	add	r6, r6, r3
 	vsri.32	q3, q4, #24
-	str	r6, [r7, #88]
+	str	r6, [r7, #104]
 	eors	r2, r2, r6
-	ldr	r6, [r7, #100]
+	ldr	r6, [r7, #116]
 	vsri.32	q2, q6, #24
 	add	r8, r8, r5
 	ror	r2, r2, #20
@@ -265,42 +267,42 @@
 	eor	r0, r8, r0
 	vadd.i32	q15, q15, q2
 	mov	r3, r6
-	ldr	r6, [r7, #96]
+	ldr	r6, [r7, #112]
 	veor	q6, q4, q1
 	ror	r0, r0, #20
-	str	r3, [r7, #96]
+	str	r3, [r7, #112]
 	veor	q5, q15, q5
 	adds	r6, r0, r6
-	str	r6, [r7, #104]
+	str	r6, [r7, #120]
 	mov	r6, r3
-	ldr	r3, [r7, #108]
+	ldr	r3, [r7, #124]
 	vshl.i32	q8, q6, #7
 	add	fp, fp, r2
 	eors	r3, r3, r6
-	ldr	r6, [r7, #104]
+	ldr	r6, [r7, #120]
 	vshl.i32	q1, q5, #7
 	ror	r1, r1, #20
 	eors	r5, r5, r6
 	vsri.32	q8, q6, #25
-	ldr	r6, [r7, #92]
+	ldr	r6, [r7, #108]
 	ror	r3, r3, #24
 	ror	r5, r5, #24
 	vsri.32	q1, q5, #25
-	str	r5, [r7, #100]
+	str	r5, [r7, #116]
 	eor	r6, fp, r6
-	ldr	r5, [r7, #100]
+	ldr	r5, [r7, #116]
 	add	r10, r10, r1
 	add	ip, r3, ip
 	vext.32	q8, q8, q8, #1
-	str	ip, [r7, #108]
+	str	ip, [r7, #124]
 	add	ip, r5, r8
-	ldr	r5, [r7, #88]
+	ldr	r5, [r7, #104]
 	eor	lr, r10, lr
 	ror	r6, r6, #24
 	vext.32	q1, q1, q1, #1
 	add	r8, r6, r5
 	vadd.i32	q9, q9, q8
-	ldr	r5, [r7, #108]
+	ldr	r5, [r7, #124]
 	vext.32	q3, q3, q3, #3
 	vadd.i32	q10, q10, q1
 	ror	lr, lr, #24
@@ -309,14 +311,14 @@
 	add	r9, r9, lr
 	eors	r4, r4, r5
 	veor	q3, q9, q3
-	ldr	r5, [r7, #96]
+	ldr	r5, [r7, #112]
 	eor	r1, r9, r1
 	ror	r0, r0, #25
 	veor	q2, q10, q2
 	adds	r5, r0, r5
 	vext.32	q4, q4, q4, #2
-	str	r5, [r7, #96]
-	ldr	r5, [r7, #104]
+	str	r5, [r7, #112]
+	ldr	r5, [r7, #120]
 	ror	r1, r1, #25
 	vrev32.16	q3, q3
 	eor	r2, r8, r2
@@ -325,10 +327,10 @@
 	vadd.i32	q4, q4, q3
 	ror	r4, r4, #25
 	vrev32.16	q2, q2
-	str	r5, [r7, #84]
+	str	r5, [r7, #100]
 	vadd.i32	q15, q15, q2
 	eors	r3, r3, r5
-	ldr	r5, [r7, #96]
+	ldr	r5, [r7, #112]
 	add	fp, fp, r4
 	veor	q8, q4, q8
 	ror	r2, r2, #25
@@ -336,174 +338,182 @@
 	eor	lr, fp, lr
 	eors	r6, r6, r5
 	ror	r3, r3, #16
-	ldr	r5, [r7, #100]
+	ldr	r5, [r7, #116]
 	add	r10, r10, r2
-	str	r3, [r7, #104]
+	str	r3, [r7, #120]
 	ror	lr, lr, #16
-	ldr	r3, [r7, #104]
+	ldr	r3, [r7, #120]
 	eor	r5, r10, r5
 	vshl.i32	q5, q8, #12
 	add	ip, lr, ip
 	vshl.i32	q6, q1, #12
-	str	ip, [r7, #88]
+	str	ip, [r7, #104]
 	add	ip, r3, r8
-	str	ip, [r7, #100]
-	ldr	r3, [r7, #108]
+	str	ip, [r7, #116]
+	ldr	r3, [r7, #124]
 	ror	r5, r5, #16
 	vsri.32	q5, q8, #20
 	ror	r6, r6, #16
 	add	ip, r5, r3
-	ldr	r3, [r7, #88]
+	ldr	r3, [r7, #104]
 	vsri.32	q6, q1, #20
 	add	r9, r9, r6
 	eor	r2, ip, r2
 	eors	r4, r4, r3
-	ldr	r3, [r7, #100]
+	ldr	r3, [r7, #116]
 	eor	r0, r9, r0
 	vadd.i32	q9, q9, q5
 	ror	r4, r4, #20
 	eors	r1, r1, r3
 	vadd.i32	q10, q10, q6
 	ror	r3, r2, #20
-	str	r3, [r7, #92]
-	ldr	r3, [r7, #96]
+	str	r3, [r7, #108]
+	ldr	r3, [r7, #112]
 	veor	q3, q9, q3
 	ror	r0, r0, #20
 	add	r8, r4, fp
 	veor	q2, q10, q2
 	add	fp, r0, r3
-	ldr	r3, [r7, #84]
+	ldr	r3, [r7, #100]
 	ror	r1, r1, #20
 	mov	r2, r8
 	vshl.i32	q8, q3, #8
-	str	r8, [r7, #80]
+	str	r8, [r7, #96]
 	add	r8, r1, r3
-	ldr	r3, [r7, #92]
+	ldr	r3, [r7, #108]
 	vmov	q1, q6  @ v4si
 	vshl.i32	q6, q2, #8
 	eor	r6, fp, r6
 	add	r10, r10, r3
-	ldr	r3, [r7, #104]
+	ldr	r3, [r7, #120]
 	vsri.32	q8, q3, #24
 	eor	lr, r2, lr
 	eor	r3, r8, r3
 	ror	r2, r6, #24
 	vsri.32	q6, q2, #24
 	eor	r5, r10, r5
-	str	r2, [r7, #108]
+	str	r2, [r7, #124]
 	ror	r2, r3, #24
-	ldr	r3, [r7, #88]
+	ldr	r3, [r7, #104]
 	vmov	q3, q8  @ v4si
 	vadd.i32	q15, q15, q6
 	ror	lr, lr, #24
 	vadd.i32	q8, q4, q8
 	ror	r6, r5, #24
 	add	r5, lr, r3
-	ldr	r3, [r7, #108]
+	ldr	r3, [r7, #124]
 	veor	q4, q8, q5
 	add	ip, ip, r6
 	vmov	q2, q6  @ v4si
 	add	r9, r9, r3
 	veor	q6, q15, q1
-	ldr	r3, [r7, #100]
+	ldr	r3, [r7, #116]
 	vshl.i32	q1, q4, #7
-	str	r2, [r7, #96]
+	str	r2, [r7, #112]
 	add	r3, r3, r2
-	str	r3, [r7, #104]
+	str	r3, [r7, #120]
 	vshl.i32	q5, q6, #7
 	eors	r1, r1, r3
-	ldr	r3, [r7, #92]
+	ldr	r3, [r7, #108]
 	vsri.32	q1, q4, #25
 	eors	r4, r4, r5
 	eor	r0, r9, r0
 	eor	r2, ip, r3
 	vsri.32	q5, q6, #25
-	ldr	r3, [r7, #76]
+	ldr	r3, [r7, #92]
 	ror	r4, r4, #25
-	str	r6, [r7, #84]
+	str	r6, [r7, #100]
 	ror	r0, r0, #25
 	subs	r3, r3, #1
-	str	r5, [r7, #88]
+	str	r5, [r7, #104]
 	ror	r1, r1, #25
 	ror	r2, r2, #25
 	vext.32	q15, q15, q15, #2
-	str	r3, [r7, #76]
+	str	r3, [r7, #92]
 	vext.32	q2, q2, q2, #1
 	vext.32	q8, q8, q8, #2
 	vext.32	q3, q3, q3, #1
 	vext.32	q5, q5, q5, #3
 	vext.32	q1, q1, q1, #3
 	bne	.L3
-	ldr	r3, [r7, #68]
+	ldr	r3, [r7, #80]
 	vadd.i32	q4, q12, q10
-	str	r9, [r7, #100]
+	str	r9, [r7, #116]
 	mov	r9, r10
 	mov	r10, r8
-	ldr	r8, [r7, #80]
-	str	lr, [r7, #80]
+	ldr	r8, [r7, #96]
+	str	lr, [r7, #96]
 	mov	lr, r5
-	ldr	r5, [r7, #40]
+	ldr	r5, [r7, #56]
 	vadd.i32	q5, q13, q5
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	vadd.i32	q15, q14, q15
 	add	fp, fp, r5
-	ldr	r5, [r7, #36]
-	str	r4, [r7, #52]
+	ldr	r5, [r7, #52]
+	str	r4, [r7, #108]
 	vadd.i32	q7, q14, q8
-	ldr	r4, [r7, #96]
+	ldr	r4, [r7, #112]
 	add	r5, r10, r5
-	str	r3, [r7, #96]
+	str	r3, [r7, #112]
 	vadd.i32	q2, q11, q2
 	ldr	r3, [r6, #12]	@ unaligned
 	vadd.i32	q6, q12, q9
-	str	r0, [r7, #76]
+	str	r0, [r7, #92]
 	vadd.i32	q1, q13, q1
 	ldr	r0, [r6]	@ unaligned
 	vadd.i32	q11, q11, q0
-	str	r1, [r7, #92]
-	str	r2, [r7, #56]
+	str	r1, [r7, #40]
+	str	r2, [r7, #36]
 	vadd.i32	q3, q11, q3
 	ldr	r1, [r6, #4]	@ unaligned
 	vadd.i32	q11, q11, q0
 	ldr	r2, [r6, #8]	@ unaligned
-	str	r5, [r7, #88]
+	str	r5, [r7, #104]
 	vadd.i32	q11, q11, q0
-	ldr	r5, [r7, #96]
-	ldr	r10, [r7, #68]
+	ldr	r5, [r7, #112]
+	ldr	r10, [r7, #80]
 	stmia	r5!, {r0, r1, r2, r3}
 	mov	r5, r10
-	ldr	r2, [r7, #72]
-	ldr	r1, [r7, #32]
-	ldr	r3, [r7, #48]
-	vldr	d20, [r2, #80]
-	vldr	d21, [r2, #88]
-	add	r9, r9, r1
+	ldr	r0, [r7, #84]
+	ldr	r2, [r7, #48]
+	ldr	r3, [r7, #72]
+	vldr	d20, [r0, #80]
+	vldr	d21, [r0, #88]
+	add	r9, r9, r2
 	veor	q10, q10, q4
-	ldr	r1, [r7, #28]
-	add	r0, r8, r1
-	str	r0, [r7, #24]
-	vstr	d20, [r2, #80]
-	vstr	d21, [r2, #88]
-	adds	r0, r4, r3
-	str	r0, [r7, #20]
+	ldr	r2, [r7, #44]
+	adds	r1, r4, r3
+	str	r1, [r7, #28]
+	add	r2, r8, r2
+	str	r2, [r7, #32]
+	vstr	d20, [r0, #80]
+	vstr	d21, [r0, #88]
 	ldmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
+	ldr	r4, [r7, #96]
+	ldr	r5, [r7, #64]
+	add	r4, r4, r5
+	ldr	r5, [r7, #124]
+	str	r4, [r7, #96]
 	ldr	r4, [r7, #60]
+	add	r5, r5, r4
+	ldr	r4, [r7, #88]
+	str	r5, [r7, #24]
+	mov	r5, r10
 	str	r0, [r4]	@ unaligned
-	mov	r4, r10
-	ldr	r0, [r7, #60]
-	str	r1, [r0, #4]	@ unaligned
+	mov	r0, r4
+	str	r1, [r4, #4]	@ unaligned
 	mov	r8, r0
 	str	r2, [r0, #8]	@ unaligned
+	mov	r4, r10
 	str	r3, [r0, #12]	@ unaligned
 	ldr	r0, [r6, #16]!	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
 	mov	r5, r10
-	ldr	r3, [r7, #72]
+	ldr	r3, [r7, #84]
 	vldr	d20, [r3, #80]
 	vldr	d21, [r3, #88]
 	veor	q10, q10, q5
@@ -515,21 +525,22 @@
 	str	r1, [r8, #20]	@ unaligned
 	str	r2, [r8, #24]	@ unaligned
 	str	r3, [r8, #28]	@ unaligned
+	mov	r8, r4
 	ldr	r0, [r6, #32]!	@ unaligned
+	str	r10, [r7, #124]
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
 	mov	r5, r10
-	ldr	r0, [r7, #72]
-	vldr	d16, [r0, #80]
-	vldr	d17, [r0, #88]
+	ldr	r2, [r7, #84]
+	vldr	d16, [r2, #80]
+	vldr	d17, [r2, #88]
 	veor	q15, q8, q15
-	vstr	d30, [r0, #80]
-	vstr	d31, [r0, #88]
+	vstr	d30, [r2, #80]
+	vstr	d31, [r2, #88]
 	ldmia	r10!, {r0, r1, r2, r3}
-	mov	r10, r5
 	str	r0, [r4, #32]	@ unaligned
 	str	r1, [r4, #36]	@ unaligned
 	str	r2, [r4, #40]	@ unaligned
@@ -538,17 +549,18 @@
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r2, [r7, #72]
-	vldr	d18, [r2, #80]
-	vldr	d19, [r2, #88]
+	ldr	r1, [r7, #84]
+	vldr	d18, [r1, #80]
+	vldr	d19, [r1, #88]
 	veor	q9, q9, q2
-	vstr	d18, [r2, #80]
-	vstr	d19, [r2, #88]
+	vstr	d18, [r1, #80]
+	vstr	d19, [r1, #88]
+	ldr	r3, [r7, #112]
+	ldr	r5, [r7, #80]
+	mov	r10, r3
 	ldmia	r10!, {r0, r1, r2, r3}
-	mov	r10, r5
 	str	r0, [r4, #48]	@ unaligned
 	str	r1, [r4, #52]	@ unaligned
 	str	r2, [r4, #56]	@ unaligned
@@ -557,34 +569,38 @@
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r2, [r7, #72]
-	vldr	d18, [r2, #80]
-	vldr	d19, [r2, #88]
+	ldr	r1, [r7, #84]
+	ldr	r3, [r7, #112]
+	ldr	r5, [r7, #80]
+	vldr	d18, [r1, #80]
+	vldr	d19, [r1, #88]
 	veor	q9, q9, q6
-	vstr	d18, [r2, #80]
-	vstr	d19, [r2, #88]
+	mov	r10, r3
+	str	r5, [r7, #20]
+	vstr	d18, [r1, #80]
+	vstr	d19, [r1, #88]
 	ldmia	r10!, {r0, r1, r2, r3}
-	mov	r10, r5
-	str	r0, [r4, #64]	@ unaligned
 	str	r1, [r4, #68]	@ unaligned
 	str	r2, [r4, #72]	@ unaligned
 	str	r3, [r4, #76]	@ unaligned
+	str	r0, [r4, #64]	@ unaligned
 	ldr	r0, [r6, #80]!	@ unaligned
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r2, [r7, #72]
-	vldr	d18, [r2, #80]
-	vldr	d19, [r2, #88]
+	ldr	r1, [r7, #84]
+	ldr	r3, [r7, #20]
+	ldr	r5, [r7, #80]
+	vldr	d18, [r1, #80]
+	vldr	d19, [r1, #88]
 	veor	q1, q9, q1
-	vstr	d2, [r2, #80]
-	vstr	d3, [r2, #88]
+	mov	r10, r3
+	vstr	d2, [r1, #80]
+	vstr	d3, [r1, #88]
 	ldmia	r10!, {r0, r1, r2, r3}
 	mov	r10, r5
 	str	r0, [r4, #80]	@ unaligned
@@ -595,17 +611,16 @@
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	ldr	r6, [r7, #64]
+	ldr	r6, [r7, #76]
 	stmia	r5!, {r0, r1, r2, r3}
 	mov	r5, r10
-	ldr	r3, [r7, #72]
+	ldr	r3, [r7, #84]
 	vldr	d16, [r3, #80]
 	vldr	d17, [r3, #88]
 	veor	q8, q8, q7
 	vstr	d16, [r3, #80]
 	vstr	d17, [r3, #88]
 	ldmia	r10!, {r0, r1, r2, r3}
-	mov	r10, r5
 	str	r0, [r4, #96]	@ unaligned
 	str	r1, [r4, #100]	@ unaligned
 	str	r2, [r4, #104]	@ unaligned
@@ -614,140 +629,116 @@
 	ldr	r1, [r6, #4]	@ unaligned
 	ldr	r2, [r6, #8]	@ unaligned
 	ldr	r3, [r6, #12]	@ unaligned
-	stmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r10
-	ldr	r0, [r7, #72]
-	ldr	r6, [r7, #44]
-	vldr	d16, [r0, #80]
-	vldr	d17, [r0, #88]
+	mov	r6, r5
+	stmia	r6!, {r0, r1, r2, r3}
+	ldr	r3, [r7, #84]
+	vldr	d16, [r3, #80]
+	vldr	d17, [r3, #88]
 	veor	q8, q8, q3
-	vstr	d16, [r0, #80]
-	vstr	d17, [r0, #88]
+	vstr	d16, [r3, #80]
+	vstr	d17, [r3, #88]
 	ldmia	r5!, {r0, r1, r2, r3}
-	mov	r5, r4
-	mov	r8, r5
 	str	r1, [r4, #116]	@ unaligned
-	ldr	r1, [r7, #64]
+	ldr	r1, [r7, #76]
 	str	r0, [r4, #112]	@ unaligned
-	mov	r0, r5
 	str	r2, [r4, #120]	@ unaligned
 	str	r3, [r4, #124]	@ unaligned
 	ldr	r3, [r1, #128]
-	ldr	r2, [r7, #88]
+	ldr	r2, [r7, #104]
 	eor	r3, fp, r3
 	str	r3, [r4, #128]
 	ldr	r3, [r1, #132]
-	mov	r4, r1
-	mov	r1, r5
 	eors	r2, r2, r3
 	str	r2, [r8, #132]
-	ldr	r3, [r4, #136]
-	ldr	r2, [r7, #24]
+	ldr	r3, [r1, #136]
+	ldr	r5, [r7, #68]
+	ldr	r6, [r7, #32]
 	eor	r3, r9, r3
-	str	r3, [r5, #136]
-	ldr	r3, [r4, #140]
-	eors	r3, r3, r2
-	str	r3, [r5, #140]
-	mov	r5, r4
-	ldr	r3, [r6]
-	ldr	r2, [r4, #144]
-	ldr	r4, [r7, #52]
-	add	r4, r4, r3
-	eors	r2, r2, r4
-	mov	r4, r1
-	str	r2, [r1, #144]
-	ldr	r1, [r7, #76]
-	ldr	r2, [r6, #4]
-	ldr	r3, [r5, #148]
-	mov	r8, r1
-	add	r8, r8, r2
-	mov	r2, r8
-	eors	r3, r3, r2
-	str	r3, [r0, #148]
-	mov	r0, r4
-	ldr	r2, [r6, #8]
-	ldr	r1, [r7, #92]
-	ldr	r3, [r5, #152]
-	mov	r8, r1
-	add	r8, r8, r2
-	ldr	r1, [r7, #56]
-	mov	r2, r8
-	eors	r3, r3, r2
+	str	r3, [r4, #136]
+	ldr	r3, [r1, #140]
+	ldr	r0, [r7, #92]
+	eors	r3, r3, r6
+	ldr	r6, [r7, #108]
+	str	r3, [r4, #140]
+	ldr	r3, [r5]
+	ldr	r2, [r1, #144]
+	add	r6, r6, r3
+	eors	r2, r2, r6
+	str	r2, [r4, #144]
+	ldr	r2, [r5, #4]
+	ldr	r3, [r1, #148]
+	add	r0, r0, r2
+	ldr	r6, [r7, #36]
+	eors	r3, r3, r0
+	ldr	r0, [r7, #40]
+	str	r3, [r4, #148]
+	ldr	r2, [r5, #8]
+	ldr	r3, [r1, #152]
+	add	r0, r0, r2
+	eors	r3, r3, r0
 	str	r3, [r4, #152]
-	mov	r8, r6
-	ldr	r2, [r6, #12]
-	mov	r4, r5
-	ldr	r3, [r5, #156]
-	add	r1, r1, r2
-	eors	r3, r3, r1
-	str	r3, [r0, #156]
-	ldr	r2, [r6, #16]
+	ldr	r2, [r5, #12]
+	mov	r0, r4
+	ldr	r3, [r1, #156]
+	mov	r4, r1
+	add	r6, r6, r2
 	mov	r1, r0
-	ldr	r3, [r5, #160]
+	eors	r3, r3, r6
+	str	r3, [r0, #156]
+	ldr	r2, [r5, #16]
+	ldr	r3, [r4, #160]
 	add	ip, ip, r2
 	eor	r3, ip, r3
-	str	r3, [r0, #160]
-	ldr	r2, [r6, #20]
-	mov	ip, r0
-	ldr	r3, [r5, #164]
+	str	r3, [r1, #160]
+	ldr	r2, [r5, #20]
+	ldr	r3, [r4, #164]
 	add	lr, lr, r2
-	ldr	r2, [r7, #100]
+	ldr	r2, [r7, #116]
 	eor	r3, lr, r3
 	str	r3, [r1, #164]
-	ldr	r6, [r6, #24]
+	ldr	r6, [r5, #24]
+	mov	lr, r4
 	ldr	r3, [r4, #168]
 	add	r2, r2, r6
+	mov	r6, r4
 	eors	r3, r3, r2
-	ldr	r2, [r7, #104]
-	str	r3, [r0, #168]
-	ldr	r5, [r8, #28]
+	str	r3, [r1, #168]
+	ldr	r5, [r5, #28]
+	mov	r2, r1
 	ldr	r3, [r4, #172]
-	add	r2, r2, r5
-	mov	r5, r4
-	eors	r3, r3, r2
-	mov	r2, r0
-	str	r3, [r0, #172]
-	ldr	r3, [r7, #48]
+	ldr	r0, [r7, #120]
+	add	r0, r0, r5
+	ldr	r5, [r7, #24]
+	eors	r3, r3, r0
+	str	r3, [r1, #172]
+	ldr	r3, [r7, #72]
 	ldr	r4, [r4, #176]
-	ldr	r0, [r7, #20]
+	ldr	r1, [r7, #28]
+	eors	r4, r4, r1
 	adds	r1, r3, #3
-	ldr	r3, [r7, #84]
-	eors	r4, r4, r0
 	str	r4, [r2, #176]
-	ldr	r0, [r5, #180]
-	mov	r4, r2
-	str	r1, [r7, #48]
+	ldr	r3, [r7, #100]
+	ldr	r0, [lr, #180]
+	str	r1, [r7, #72]
 	eors	r3, r3, r0
 	mov	r0, r3
-	ldr	r3, [r7, #232]
+	mov	r3, r2
 	str	r0, [r2, #180]
-	ldr	r1, [r3]
-	ldr	r3, [r5, #184]
-	ldr	r2, [r7, #80]
-	add	r2, r2, r1
-	mov	r1, r5
-	eors	r3, r3, r2
-	str	r3, [ip, #184]
-	ldr	r3, [r7, #232]
-	adds	r1, r1, #192
-	str	r1, [r7, #64]
-	ldr	r1, [r7, #108]
-	ldr	r2, [r3, #4]
-	ldr	r3, [r5, #188]
-	add	r1, r1, r2
-	mov	r2, r1
-	eors	r2, r2, r3
-	str	r2, [ip, #188]
-	mov	r3, r4
-	ldr	r2, [r7, #16]
 	adds	r3, r3, #192
-	str	r3, [r7, #60]
+	ldr	r1, [lr, #184]
+	ldr	r2, [r7, #96]
+	eors	r1, r1, r2
+	str	r1, [r3, #-8]
+	ldr	r2, [lr, #188]
+	mov	r1, r6
+	adds	r1, r1, #192
+	str	r1, [r7, #76]
+	eors	r2, r2, r5
+	str	r2, [r3, #-4]
+	ldr	r2, [r7, #16]
+	str	r3, [r7, #88]
 	cmp	r2, r3
-	beq	.L85
-	ldr	r3, [r7, #232]
-	ldmia	r3, {r1, r2}
-	b	.L4
-.L85:
+	bne	.L4
 	ldr	r3, [r7, #12]
 	ldr	r2, [r7, #4]
 	add	r3, r3, r2
@@ -765,12 +756,12 @@
 	beq	.L6
 	ldr	r5, [r7, #12]
 	ldr	r4, [r7, #16]
-	ldr	r6, [r7, #72]
-	ldr	lr, [r7, #68]
-	vldr	d30, .L95
-	vldr	d31, .L95+8
-	str	fp, [r7, #104]
-	str	fp, [r7, #108]
+	ldr	r6, [r7, #84]
+	ldr	lr, [r7, #80]
+	vldr	d30, .L94
+	vldr	d31, .L94+8
+	str	fp, [r7, #120]
+	str	fp, [r7, #124]
 .L8:
 	vmov	q2, q11  @ v4si
 	movs	r3, #10
@@ -895,22 +886,22 @@
 	str	r0, [r4, #-16]	@ unaligned
 	str	r1, [r4, #-12]	@ unaligned
 	str	r3, [r10, #12]	@ unaligned
-	ldr	r3, [r7, #108]
+	ldr	r3, [r7, #124]
 	str	r2, [r10, #8]	@ unaligned
 	cmp	r3, #1
-	beq	.L88
+	beq	.L87
 	movs	r3, #1
-	str	r3, [r7, #108]
+	str	r3, [r7, #124]
 	b	.L8
-.L96:
-	.align	3
 .L95:
+	.align	3
+.L94:
 	.word	1
 	.word	0
 	.word	0
 	.word	0
-.L88:
-	ldr	fp, [r7, #104]
+.L87:
+	ldr	fp, [r7, #120]
 	ldr	r3, [r7, #12]
 	lsl	fp, fp, #6
 	add	r3, r3, fp
@@ -970,9 +961,9 @@
 	bne	.L10
 	cmp	r5, #15
 	mov	r9, r5
-	bhi	.L89
+	bhi	.L88
 	vadd.i32	q12, q12, q10
-	ldr	r3, [r7, #72]
+	ldr	r3, [r7, #84]
 	vst1.64	{d24-d25}, [r3:128]
 .L14:
 	ldr	r3, [r7, #8]
@@ -1009,7 +1000,7 @@
 	movcs	r1, ip
 	cmp	r1, #0
 	beq	.L17
-	ldr	r5, [r7, #72]
+	ldr	r5, [r7, #84]
 	cmp	r1, #1
 	ldrb	r0, [r0]	@ zero_extendqisi2
 	add	r3, r2, #1
@@ -1144,7 +1135,7 @@
 	ldr	r5, [r7, #16]
 	cmp	r6, #1
 	add	r0, r1, r2
-	ldr	r1, [r7, #72]
+	ldr	r1, [r7, #84]
 	add	r1, r1, r2
 	vld1.64	{d18-d19}, [r0:64]
 	add	r2, r2, r5
@@ -1182,7 +1173,7 @@
 	add	r3, r3, lr
 	beq	.L1
 .L19:
-	ldr	r4, [r7, #72]
+	ldr	r4, [r7, #84]
 	adds	r2, r3, #1
 	ldr	r1, [r7, #12]
 	cmp	r2, r9
@@ -1297,7 +1288,7 @@
 	eor	r1, r1, r0
 	strb	r1, [r5, r2]
 	bls	.L1
-	ldr	r2, [r7, #72]
+	ldr	r2, [r7, #84]
 	ldrb	r1, [r2, r3]	@ zero_extendqisi2
 	ldr	r2, [r7, #12]
 	ldrb	r2, [r2, r3]	@ zero_extendqisi2
@@ -1305,15 +1296,15 @@
 	ldr	r1, [r7, #16]
 	strb	r2, [r1, r3]
 .L1:
-	adds	r7, r7, #132
+	adds	r7, r7, #156
 	mov	sp, r7
 	@ sp needed
 	vldm	sp!, {d8-d15}
 	pop	{r4, r5, r6, r7, r8, r9, r10, fp, pc}
-.L89:
+.L88:
 	ldr	r5, [r7, #12]
 	vadd.i32	q12, q12, q10
-	ldr	r4, [r7, #68]
+	ldr	r4, [r7, #80]
 	cmp	r9, #31
 	ldr	r0, [r5]	@ unaligned
 	ldr	r1, [r5, #4]	@ unaligned
@@ -1321,7 +1312,7 @@
 	ldr	r2, [r5, #8]	@ unaligned
 	ldr	r3, [r5, #12]	@ unaligned
 	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r2, [r7, #72]
+	ldr	r2, [r7, #84]
 	ldr	r6, [r7, #16]
 	vldr	d18, [r2, #80]
 	vldr	d19, [r2, #88]
@@ -1334,9 +1325,9 @@
 	str	r0, [r6]	@ unaligned
 	str	r2, [r6, #8]	@ unaligned
 	str	r3, [r6, #12]	@ unaligned
-	bhi	.L90
+	bhi	.L89
 	vadd.i32	q13, q13, q15
-	ldr	r3, [r7, #72]
+	ldr	r3, [r7, #84]
 	vstr	d26, [r3, #16]
 	vstr	d27, [r3, #24]
 	b	.L14
@@ -1345,7 +1336,7 @@
 	ldr	r2, [r7, #12]
 	add	r2, r2, r9
 	mov	r5, r2
-	ldr	r2, [r7, #72]
+	ldr	r2, [r7, #84]
 	add	r2, r2, r3
 	mov	r3, r2
 .L24:
@@ -1355,7 +1346,7 @@
 	eor	r2, r2, r1
 	strb	r2, [r4], #1
 	bne	.L24
-	adds	r7, r7, #132
+	adds	r7, r7, #156
 	mov	sp, r7
 	@ sp needed
 	vldm	sp!, {d8-d15}
@@ -1363,9 +1354,9 @@
 .L26:
 	str	fp, [r7, #16]
 	b	.L2
-.L90:
+.L89:
 	mov	r3, r5
-	ldr	r4, [r7, #68]
+	ldr	r4, [r7, #80]
 	ldr	r0, [r3, #16]!	@ unaligned
 	add	lr, r1, #16
 	mov	r5, r1
@@ -1376,7 +1367,7 @@
 	ldr	r2, [r3, #8]	@ unaligned
 	ldr	r3, [r3, #12]	@ unaligned
 	stmia	r6!, {r0, r1, r2, r3}
-	ldr	r2, [r7, #72]
+	ldr	r2, [r7, #84]
 	vldr	d18, [r2, #80]
 	vldr	d19, [r2, #88]
 	veor	q13, q9, q13
@@ -1387,18 +1378,18 @@
 	str	r1, [lr, #4]	@ unaligned
 	str	r2, [lr, #8]	@ unaligned
 	str	r3, [lr, #12]	@ unaligned
-	bhi	.L91
+	bhi	.L90
 	vadd.i32	q8, q14, q8
-	ldr	r3, [r7, #72]
+	ldr	r3, [r7, #84]
 	vstr	d16, [r3, #32]
 	vstr	d17, [r3, #40]
 	b	.L14
-.L91:
+.L90:
 	ldr	r3, [r7, #12]
 	add	lr, r5, #32
-	ldr	r4, [r7, #68]
+	ldr	r4, [r7, #80]
 	vadd.i32	q8, q14, q8
-	ldr	r5, [r7, #72]
+	ldr	r5, [r7, #84]
 	vadd.i32	q11, q11, q3
 	ldr	r0, [r3, #32]!	@ unaligned
 	mov	r6, r4