Add initial HRSS support.

This change adds support for a variant of [HRSS], a post-quantum KEM
based on NTRU, and incorporates changes suggested in [SXY]. It is not
yet ready for any deployment: some breaking changes, such as removing
the confirmation hash, are still planned.

(CLA for HRSS's assembly code noted in b/119426559.)

[HRSS] https://eprint.iacr.org/2017/667.pdf
[SXY] https://eprint.iacr.org/2017/1005.pdf
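As a KEM, this exposes the usual keygen/encapsulate/decapsulate flow. A
rough sketch of how a caller would use an interface of this shape (the
function names, struct names, and size constants below are placeholders,
not the actual hrss.h declarations added by this change):

    /* Placeholder names; see hrss.h in this change for the real interface. */
    struct HRSS_public_key pub;
    struct HRSS_private_key priv;
    uint8_t ct[HRSS_CIPHERTEXT_BYTES];
    uint8_t key_tx[HRSS_KEY_BYTES], key_rx[HRSS_KEY_BYTES];
    HRSS_generate_key(&pub, &priv, keygen_entropy);  /* key generation */
    HRSS_encap(ct, key_tx, &pub, encap_entropy);     /* sender derives a shared key */
    HRSS_decap(key_rx, &pub, &priv, ct, sizeof(ct)); /* receiver recovers the same key */
    /* key_tx and key_rx now hold the same shared secret. */
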

Change-Id: I85d813733b066d5c578484bdd248de3f764194db
Reviewed-on: https://boringssl-review.googlesource.com/c/33105
Reviewed-by: David Benjamin <davidben@google.com>
diff --git a/crypto/CMakeLists.txt b/crypto/CMakeLists.txt
index b1ca70e..e53885e 100644
--- a/crypto/CMakeLists.txt
+++ b/crypto/CMakeLists.txt
@@ -105,6 +105,7 @@
     chacha/chacha-armv4.${ASM_EXT}
     curve25519/asm/x25519-asm-arm.S
     poly1305/poly1305_arm_asm.S
+    hrss/asm/poly_mul_vec_armv7_neon.S
   )
 endif()
 
@@ -131,6 +132,7 @@
     chacha/chacha-x86_64.${ASM_EXT}
     cipher_extra/aes128gcmsiv-x86_64.${ASM_EXT}
     cipher_extra/chacha20_poly1305_x86_64.${ASM_EXT}
+    hrss/asm/poly_rq_mul.S
   )
 endif()
 
@@ -275,6 +277,7 @@
   evp/sign.c
   ex_data.c
   hkdf/hkdf.c
+  hrss/hrss.c
   lhash/lhash.c
   mem.c
   obj/obj.c
@@ -455,6 +458,7 @@
   fipsmodule/rand/ctrdrbg_test.cc
   hkdf/hkdf_test.cc
   hmac_extra/hmac_test.cc
+  hrss/hrss_test.cc
   lhash/lhash_test.cc
   obj/obj_test.cc
   pem/pem_test.cc
diff --git a/crypto/hrss/asm/poly_mul_vec_armv7_neon.S b/crypto/hrss/asm/poly_mul_vec_armv7_neon.S
new file mode 100644
index 0000000..93d491c
--- /dev/null
+++ b/crypto/hrss/asm/poly_mul_vec_armv7_neon.S
@@ -0,0 +1,4260 @@
+// Copyright (c) 2018, Google Inc.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+// This file is produced by compiling hrss.c with Clang and -mfpu=neon, and
+// then trimming the output to just include the vectorised functions.
+
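+// An invocation along the following lines reproduces output of this shape
+// (the target triple and optimisation/debug flags are illustrative, not the
+// exact command used):
+//
+//   clang --target=armv7a-linux-gnueabihf -mthumb -mfpu=neon -O2 -g -S hrss.c
+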
+#if !defined(OPENSSL_NO_ASM) && !defined(__ARM_NEON__)
+
+	.text
+	.syntax unified
+	.eabi_attribute	67, "2.09"	@ Tag_conformance
+	.eabi_attribute	6, 10	@ Tag_CPU_arch
+	.eabi_attribute	7, 65	@ Tag_CPU_arch_profile
+	.eabi_attribute	8, 1	@ Tag_ARM_ISA_use
+	.eabi_attribute	9, 2	@ Tag_THUMB_ISA_use
+	.fpu	neon
+	.eabi_attribute	34, 1	@ Tag_CPU_unaligned_access
+	.eabi_attribute	15, 1	@ Tag_ABI_PCS_RW_data
+	.eabi_attribute	16, 1	@ Tag_ABI_PCS_RO_data
+	.eabi_attribute	17, 2	@ Tag_ABI_PCS_GOT_use
+	.eabi_attribute	20, 1	@ Tag_ABI_FP_denormal
+	.eabi_attribute	21, 1	@ Tag_ABI_FP_exceptions
+	.eabi_attribute	23, 3	@ Tag_ABI_FP_number_model
+	.eabi_attribute	24, 1	@ Tag_ABI_align_needed
+	.eabi_attribute	25, 1	@ Tag_ABI_align_preserved
+	.eabi_attribute	38, 1	@ Tag_ABI_FP_16bit_format
+	.eabi_attribute	18, 4	@ Tag_ABI_PCS_wchar_t
+	.eabi_attribute	26, 2	@ Tag_ABI_enum_size
+	.eabi_attribute	14, 0	@ Tag_ABI_PCS_R9_use
+	.file	"hrss.c"
+
+	.section	.text.poly3_invert_vec,"ax",%progbits
+	.hidden	poly3_invert_vec        @ -- Begin function poly3_invert_vec
+	.globl	poly3_invert_vec
+	.p2align	4
+	.type	poly3_invert_vec,%function
+	.code	16                      @ @poly3_invert_vec
+	.thumb_func
+poly3_invert_vec:
+.Lfunc_begin0:
+	.file	1 "../crypto/hrss/hrss.c"
+	.loc	1 718 0                 @ ../crypto/hrss/hrss.c:718:0
+	.fnstart
+	.cfi_sections .debug_frame
+	.cfi_startproc
+@ %bb.0:
+	.save	{r4, r5, r6, r7, lr}
+	push	{r4, r5, r6, r7, lr}
+	.cfi_def_cfa_offset 20
+	.cfi_offset lr, -4
+	.cfi_offset r7, -8
+	.cfi_offset r6, -12
+	.cfi_offset r5, -16
+	.cfi_offset r4, -20
+	.setfp	r7, sp, #12
+	add	r7, sp, #12
+	.cfi_def_cfa r7, 8
+	.save	{r8, r9, r10}
+	push.w	{r8, r9, r10}
+	.cfi_offset r10, -24
+	.cfi_offset r9, -28
+	.cfi_offset r8, -32
+	.vsave	{d8, d9, d10, d11, d12, d13, d14, d15}
+	vpush	{d8, d9, d10, d11, d12, d13, d14, d15}
+	.cfi_offset d15, -40
+	.cfi_offset d14, -48
+	.cfi_offset d13, -56
+	.cfi_offset d12, -64
+	.cfi_offset d11, -72
+	.cfi_offset d10, -80
+	.cfi_offset d9, -88
+	.cfi_offset d8, -96
+	.pad	#944
+	sub.w	sp, sp, #944
+	mov	r4, sp
+	bfc	r4, #0, #4
+	mov	sp, r4
+	mov	r10, r0
+.Ltmp0:
+	.loc	1 735 3 prologue_end    @ ../crypto/hrss/hrss.c:735:3
+	movs	r0, #104
+	.loc	1 733 3                 @ ../crypto/hrss/hrss.c:733:3
+	mov	r2, r1
+	add.w	lr, sp, #704
+	vld1.16	{d4, d5}, [r2], r0
+	adr	r0, .LCPI0_2
+	vmov.i8	q14, #0xff
+	mov.w	r5, #700
+	vld1.64	{d16, d17}, [r0:128]
+	adr	r0, .LCPI0_3
+	vmov.i32	q1, #0x0
+	mvn	r12, #-2147483648
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	.loc	1 735 3                 @ ../crypto/hrss/hrss.c:735:3
+	add.w	lr, sp, #672
+	vmov.i32	q11, #0x0
+	mov.w	r6, #700
+	vld1.64	{d16, d17}, [r0:128]
+	add.w	r0, r1, #152
+	vmov.i32	q12, #0x0
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #384
+	vld1.32	{d16, d17}, [r2]
+	.loc	1 733 3                 @ ../crypto/hrss/hrss.c:733:3
+	add.w	r2, r1, #64
+	.loc	1 735 3                 @ ../crypto/hrss/hrss.c:735:3
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #640
+	vld1.32	{d16, d17}, [r0]
+	add.w	r0, r1, #136
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #496
+	vld1.32	{d16, d17}, [r0]
+	add.w	r0, r1, #120
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #432
+	vld1.32	{d16, d17}, [r0]
+	add.w	r0, r1, #88
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	vmov.i32	d17, #0x0
+	.loc	1 733 3                 @ ../crypto/hrss/hrss.c:733:3
+	add.w	lr, sp, #544
+	vld1.32	{d20, d21}, [r2]
+	add.w	r2, r1, #32
+	.loc	1 735 3                 @ ../crypto/hrss/hrss.c:735:3
+	vld1.32	{d30, d31}, [r0]
+	.loc	1 733 3                 @ ../crypto/hrss/hrss.c:733:3
+	add.w	r0, r1, #16
+	vldr	d18, [r1, #80]
+	.loc	1 735 3                 @ ../crypto/hrss/hrss.c:735:3
+	vldr	d16, [r1, #168]
+	.loc	1 733 3                 @ ../crypto/hrss/hrss.c:733:3
+	adds	r1, #48
+	vst1.64	{d20, d21}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #416
+	vorr	d19, d17, d17
+	vld1.32	{d20, d21}, [r1]
+	movs	r1, #0
+	vst1.64	{d20, d21}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #400
+	vld1.32	{d20, d21}, [r2]
+	movw	r2, #1399
+	vst1.64	{d20, d21}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #352
+	vld1.32	{d20, d21}, [r0]
+	add	r0, sp, #880
+	vst1.64	{d20, d21}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #656
+	vmov.i8	q10, #0xff
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	vmov.i16	q8, #0xf
+	add.w	lr, sp, #624
+	vneg.s16	q8, q8
+	vst1.64	{d18, d19}, [lr:128]    @ 16-byte Spill
+	add.w	lr, sp, #608
+	vmov.i8	q9, #0xff
+	vst1.64	{d16, d17}, [lr:128]    @ 16-byte Spill
+	vmov.i32	q8, #0x0
+	mov.w	lr, #0
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q8, #0x0
+	add	r0, sp, #896
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #592
+	vmov.i8	q8, #0xff
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i8	q9, #0xff
+	add	r0, sp, #576
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i8	q9, #0xff
+	add	r0, sp, #560
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #528
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #512
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #480
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #464
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #448
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #208
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #224
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #320
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #288
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #256
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #368
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #336
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #304
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #272
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #240
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #800
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #816
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #832
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #848
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #864
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #688
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #720
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #736
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #752
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	add	r0, sp, #784
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmov.i32	q9, #0x0
+	.loc	1 747 3                 @ ../crypto/hrss/hrss.c:747:3
+	add	r0, sp, #768
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	b	.LBB0_3
+	.p2align	4
+@ %bb.1:
+	.loc	1 0 3 is_stmt 0         @ ../crypto/hrss/hrss.c:0:3
+.LCPI0_2:
+	.short	1                       @ 0x1
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.p2align	4
+@ %bb.2:
+.LCPI0_3:
+	.short	65535                   @ 0xffff
+	.short	65535                   @ 0xffff
+	.short	65535                   @ 0xffff
+	.short	8191                    @ 0x1fff
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.short	0                       @ 0x0
+	.p2align	1
+.LBB0_3:                                @ =>This Inner Loop Header: Depth=1
+	.loc	1 749 32 is_stmt 1      @ ../crypto/hrss/hrss.c:749:32
+	add	r0, sp, #96
+	vand	q9, q1, q15
+	.loc	1 751 32                @ ../crypto/hrss/hrss.c:751:32
+	vand	q13, q8, q15
+.Ltmp1:
+	.file	2 "../crypto/hrss/../internal.h"
+	.loc	2 270 42                @ ../crypto/hrss/../internal.h:270:42
+	subs	r4, r5, r6
+.Ltmp2:
+	.loc	1 749 32                @ ../crypto/hrss/hrss.c:749:32
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #128
+	.loc	1 749 52 is_stmt 0      @ ../crypto/hrss/hrss.c:749:52
+	vand	q11, q8, q2
+.Ltmp3:
+	.loc	2 270 35 is_stmt 1      @ ../crypto/hrss/../internal.h:270:35
+	eor.w	r3, r5, r6
+.Ltmp4:
+	.loc	1 749 32                @ ../crypto/hrss/hrss.c:749:32
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	.loc	1 751 52                @ ../crypto/hrss/hrss.c:751:52
+	vand	q12, q1, q2
+	.loc	1 749 42                @ ../crypto/hrss/hrss.c:749:42
+	veor	q9, q11, q9
+.Ltmp5:
+	.loc	2 270 45                @ ../crypto/hrss/../internal.h:270:45
+	eors	r4, r5
+.Ltmp6:
+	.loc	1 751 42                @ ../crypto/hrss/hrss.c:751:42
+	veor	q11, q13, q12
+.Ltmp7:
+	.loc	2 270 38                @ ../crypto/hrss/../internal.h:270:38
+	orrs	r4, r3
+.Ltmp8:
+	.loc	1 749 21                @ ../crypto/hrss/hrss.c:749:21
+	vand	q12, q14, q9
+.Ltmp9:
+	.loc	2 270 31                @ ../crypto/hrss/../internal.h:270:31
+	eors	r4, r5
+.Ltmp10:
+	.loc	1 751 21                @ ../crypto/hrss/hrss.c:751:21
+	vand	q9, q14, q11
+	.loc	1 749 32                @ ../crypto/hrss/hrss.c:749:32
+	add	r0, sp, #912
+.Ltmp11:
+	.loc	2 234 13                @ ../crypto/hrss/../internal.h:234:13
+	asrs	r4, r4, #31
+.Ltmp12:
+	.loc	1 747 26                @ ../crypto/hrss/hrss.c:747:26
+	subs	r2, #1
+.Ltmp13:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	vorr	q11, q9, q12
+.Ltmp14:
+	.loc	1 153 50                @ ../crypto/hrss/hrss.c:153:50
+	vmov.16	d26[0], r4
+.Ltmp15:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	vshl.i16	q9, q9, #15
+.Ltmp16:
+	.loc	1 749 32                @ ../crypto/hrss/hrss.c:749:32
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+.Ltmp17:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	add	r0, sp, #192
+.Ltmp18:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	vshl.i16	q11, q11, #15
+.Ltmp19:
+	.loc	1 753 14                @ ../crypto/hrss/hrss.c:753:14
+	vshr.s16	q11, q11, #15
+	.loc	1 753 21 is_stmt 0      @ ../crypto/hrss/hrss.c:753:21
+	vand	q11, q13, q11
+.Ltmp20:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	veor	q13, q8, q15
+.Ltmp21:
+	.loc	1 753 21                @ ../crypto/hrss/hrss.c:753:21
+	vdup.16	q0, d22[0]
+.Ltmp22:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q11, q1, q2
+	.loc	1 689 30                @ ../crypto/hrss/hrss.c:689:30
+	vand	q13, q0, q13
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q14, q0, q11
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q8, q13, q8
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q1, q14, q1
+.Ltmp23:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	vshl.i16	q11, q12, #15
+.Ltmp24:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp25:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	add	r0, sp, #160
+	vst1.64	{d2, d3}, [r0:128]      @ 16-byte Spill
+	add	r0, sp, #608
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+.Ltmp26:
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	add	r0, sp, #144
+.Ltmp27:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	vshl.s16	q9, q9, q3
+.Ltmp28:
+	.loc	1 185 7 is_stmt 0       @ ../crypto/hrss/hrss.c:185:7
+	vshl.s16	q11, q11, q3
+.Ltmp29:
+	.loc	1 186 10 is_stmt 1      @ ../crypto/hrss/hrss.c:186:10
+	vdup.16	q9, d18[0]
+.Ltmp30:
+	.loc	1 186 10 is_stmt 0      @ ../crypto/hrss/hrss.c:186:10
+	vdup.16	q3, d22[0]
+.Ltmp31:
+	.loc	1 701 44 is_stmt 1      @ ../crypto/hrss/hrss.c:701:44
+	vand	q12, q8, q9
+	.loc	1 701 32 is_stmt 0      @ ../crypto/hrss/hrss.c:701:32
+	vand	q11, q1, q3
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q5, q12, q11
+	.loc	1 702 33 is_stmt 1      @ ../crypto/hrss/hrss.c:702:33
+	vand	q12, q1, q9
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q1, q8, q3
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q4, q1, q12
+.Ltmp32:
+	.loc	1 686 12 is_stmt 1      @ ../crypto/hrss/hrss.c:686:12
+	veor	q1, q14, q2
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q14, q13, q15
+.Ltmp33:
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q2, q5, q1
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q8, q14, q1
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q13, q4, q8
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	.loc	1 706 34                @ ../crypto/hrss/hrss.c:706:34
+	vorr	q8, q5, q4
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	add	r0, sp, #80
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q13, q13, q2
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q2, q14, q8
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #384
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q6, q13, q2
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+.Ltmp34:
+	.loc	1 689 30 is_stmt 1      @ ../crypto/hrss/hrss.c:689:30
+	add	r0, sp, #928
+	.loc	1 689 40 is_stmt 0      @ ../crypto/hrss/hrss.c:689:40
+	veor	q13, q10, q12
+	.loc	1 689 30                @ ../crypto/hrss/hrss.c:689:30
+	vst1.64	{d0, d1}, [r0:128]      @ 16-byte Spill
+	.loc	1 685 40 is_stmt 1      @ ../crypto/hrss/hrss.c:685:40
+	add	r0, sp, #176
+	.loc	1 689 30                @ ../crypto/hrss/hrss.c:689:30
+	vand	q13, q0, q13
+.Ltmp35:
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q14, q4, q14
+.Ltmp36:
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q10, q13, q10
+.Ltmp37:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q15, q10, q9
+.Ltmp38:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #528
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #352
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp39:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	add	r0, sp, #16
+.Ltmp40:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q2, q8, q11
+.Ltmp41:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #528
+.Ltmp42:
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q2, q0, q2
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q8, q2, q8
+.Ltmp43:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q7, q8, q3
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp44:
+	.loc	1 223 33                @ ../crypto/hrss/hrss.c:223:33
+	add	r0, sp, #32
+.Ltmp45:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q15, q15, q7
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q7, q8, q9
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q8, q10, q3
+.Ltmp46:
+	.loc	1 690 12 is_stmt 1      @ ../crypto/hrss/hrss.c:690:12
+	veor	q10, q13, q12
+.Ltmp47:
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q9, q8, q7
+.Ltmp48:
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q7, q2, q11
+.Ltmp49:
+	.loc	1 706 34                @ ../crypto/hrss/hrss.c:706:34
+	vorr	q12, q15, q9
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q0, q10, q7
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q2, q15, q7
+	.loc	1 708 41 is_stmt 0      @ ../crypto/hrss/hrss.c:708:41
+	vbic	q13, q9, q0
+.Ltmp50:
+	.loc	1 224 12 is_stmt 1      @ ../crypto/hrss/hrss.c:224:12
+	vshr.u16	q11, q6, #1
+.Ltmp51:
+	.loc	1 708 35                @ ../crypto/hrss/hrss.c:708:35
+	veor	q13, q13, q2
+	.loc	1 708 60 is_stmt 0      @ ../crypto/hrss/hrss.c:708:60
+	vbic	q2, q10, q12
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q8, q13, q2
+	vmov.i32	q2, #0x0
+.Ltmp52:
+	.loc	1 223 33 is_stmt 1      @ ../crypto/hrss/hrss.c:223:33
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q8, #15
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	add	r0, sp, #384
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vext.16	q13, q2, q8, #1
+	.loc	1 225 12                @ ../crypto/hrss/hrss.c:225:12
+	vorr	q11, q13, q11
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	.loc	1 223 33                @ ../crypto/hrss/hrss.c:223:33
+	vshl.i16	q13, q6, #15
+.Ltmp53:
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #80
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	.loc	1 707 41 is_stmt 0      @ ../crypto/hrss/hrss.c:707:41
+	add	r0, sp, #144
+.Ltmp54:
+	.loc	1 225 15 is_stmt 1      @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q13, q13, q2, #1
+	.loc	1 226 12                @ ../crypto/hrss/hrss.c:226:12
+	vorr	q6, q11, q13
+.Ltmp55:
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q11, q1, q8
+	.loc	1 707 41 is_stmt 0      @ ../crypto/hrss/hrss.c:707:41
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	vbic	q13, q5, q8
+.Ltmp56:
+	.loc	1 218 12 is_stmt 1      @ ../crypto/hrss/hrss.c:218:12
+	add	r0, sp, #352
+.Ltmp57:
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q8, q9, q10
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q13, q13, q14
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q11, q13, q11
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q13, q7, q12
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q12, q15, q0
+	.loc	1 707 35                @ ../crypto/hrss/hrss.c:707:35
+	veor	q8, q12, q8
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q1, q8, q13
+.Ltmp58:
+	.loc	1 218 12 is_stmt 1      @ ../crypto/hrss/hrss.c:218:12
+	vshr.u16	q8, q11, #1
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q9, q1, #15
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #80
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vext.16	q9, q2, q9, #1
+	.loc	1 219 12                @ ../crypto/hrss/hrss.c:219:12
+	vorr	q8, q9, q8
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q9, q11, #15
+	.loc	1 219 15                @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q9, q9, q2, #1
+	.loc	1 220 12                @ ../crypto/hrss/hrss.c:220:12
+	vorr	q12, q8, q9
+.Ltmp59:
+	.loc	1 772 56                @ ../crypto/hrss/hrss.c:772:56
+	vorr	q8, q12, q6
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #112
+	vst1.64	{d12, d13}, [r0:128]    @ 16-byte Spill
+.Ltmp60:
+	.loc	1 185 7                 @ ../crypto/hrss/hrss.c:185:7
+	add	r0, sp, #608
+	vshl.i16	q8, q8, #15
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #144
+	vshl.s16	q8, q8, q9
+	.loc	1 186 10                @ ../crypto/hrss/hrss.c:186:10
+	vdup.16	q11, d16[0]
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #896
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp61:
+	.loc	1 777 40                @ ../crypto/hrss/hrss.c:777:40
+	add	r0, sp, #912
+	.loc	1 779 65                @ ../crypto/hrss/hrss.c:779:65
+	veor	q8, q6, q10
+	.loc	1 777 40                @ ../crypto/hrss/hrss.c:777:40
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #896
+	vand	q9, q11, q9
+	.loc	1 779 55                @ ../crypto/hrss/hrss.c:779:55
+	vand	q8, q9, q8
+	.loc	1 780 9                 @ ../crypto/hrss/hrss.c:780:9
+	veor	q10, q8, q10
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #880
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #880
+	.loc	1 777 65                @ ../crypto/hrss/hrss.c:777:65
+	veor	q8, q12, q10
+	.loc	1 777 55 is_stmt 0      @ ../crypto/hrss/hrss.c:777:55
+	vand	q8, q9, q8
+	.loc	1 778 9 is_stmt 1       @ ../crypto/hrss/hrss.c:778:9
+	veor	q10, q8, q10
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #672
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #656
+	vld1.64	{d26, d27}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #928
+.Ltmp62:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	veor	q8, q12, q13
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #624
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q8, q14, q8
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #448
+	.loc	1 691 12 is_stmt 1      @ ../crypto/hrss/hrss.c:691:12
+	veor	q12, q8, q12
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #16
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q9, q4, q15
+	vld1.64	{d0, d1}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #448
+	.loc	1 685 30 is_stmt 0      @ ../crypto/hrss/hrss.c:685:30
+	vand	q9, q14, q9
+.Ltmp63:
+	.loc	1 701 44 is_stmt 1      @ ../crypto/hrss/hrss.c:701:44
+	vand	q11, q12, q0
+.Ltmp64:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q4, q9, q4
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q9, q9, q15
+.Ltmp65:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q10, q4, q3
+	vst1.64	{d8, d9}, [r0:128]      @ 16-byte Spill
+	add	r0, sp, #672
+	.loc	1 701 38 is_stmt 0      @ ../crypto/hrss/hrss.c:701:38
+	veor	q10, q11, q10
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+.Ltmp66:
+	.loc	1 690 12 is_stmt 1      @ ../crypto/hrss/hrss.c:690:12
+	veor	q8, q8, q13
+.Ltmp67:
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q11, q4, q0
+.Ltmp68:
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	add	r0, sp, #64
+.Ltmp69:
+	.loc	1 702 44                @ ../crypto/hrss/hrss.c:702:44
+	vand	q12, q12, q3
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q13, q8, q9
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q11, q12, q11
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q15, q10, q9
+	.loc	1 708 41 is_stmt 0      @ ../crypto/hrss/hrss.c:708:41
+	vbic	q12, q11, q13
+	.loc	1 708 35                @ ../crypto/hrss/hrss.c:708:35
+	veor	q12, q12, q15
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q15, q10, q11
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q10, q10, q13
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q4, q8, q15
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q8, q11, q8
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q12, q12, q4
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q9, q9, q15
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q8, q10, q8
+.Ltmp70:
+	.loc	1 224 12 is_stmt 1      @ ../crypto/hrss/hrss.c:224:12
+	vshr.u16	q4, q12, #1
+	.loc	1 223 33                @ ../crypto/hrss/hrss.c:223:33
+	vshl.i16	q12, q12, #15
+.Ltmp71:
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q8, q9
+	vmov.i32	q11, #0x0
+.Ltmp72:
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q5, q12, q2, #1
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	vshr.u16	q9, q8, #1
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	.loc	1 226 12                @ ../crypto/hrss/hrss.c:226:12
+	vorr	q12, q5, q4
+.Ltmp73:
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #656
+.Ltmp74:
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q8, q8, #15
+.Ltmp75:
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+.Ltmp76:
+	.loc	1 219 15                @ ../crypto/hrss/hrss.c:219:15
+	add	r0, sp, #48
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	add	r0, sp, #624
+	.loc	1 219 15                @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q8, q8, q2, #1
+	.loc	1 220 12                @ ../crypto/hrss/hrss.c:220:12
+	vorr	q8, q8, q9
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #32
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #592
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #432
+	vshr.u16	q8, q8, #1
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #512
+.Ltmp77:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q12, q4
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #400
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q14, q9
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #512
+	.loc	1 685 40 is_stmt 1      @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q2, q5
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q12, q9, q12
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q14, q10
+.Ltmp78:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q15, q12, q0
+.Ltmp79:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q2, q10, q2
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q5
+.Ltmp80:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q13, q2, q3
+.Ltmp81:
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q4
+	vst1.64	{d4, d5}, [r0:128]      @ 16-byte Spill
+.Ltmp82:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q13, q15, q13
+	add	r0, sp, #592
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q15, q2, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q2, q12, q3
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	.loc	1 705 31 is_stmt 1      @ ../crypto/hrss/hrss.c:705:31
+	vorr	q4, q9, q10
+.Ltmp83:
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	add	r0, sp, #384
+.Ltmp84:
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q15, q2, q15
+.Ltmp85:
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+.Ltmp86:
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q5, q13, q10
+.Ltmp87:
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	add	r0, sp, #384
+.Ltmp88:
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q2, q15, q4
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q2, q2, q5
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q5, q13, q15
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q13, q13, q4
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q6, q9, q5
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q15, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q6, q2, q6
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q5
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q13, q9
+.Ltmp89:
+	.loc	1 223 33 is_stmt 1      @ ../crypto/hrss/hrss.c:223:33
+	vshl.i16	q2, q6, #15
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vext.16	q7, q11, q2, #1
+	.loc	1 225 12                @ ../crypto/hrss/hrss.c:225:12
+	vorr	q8, q7, q8
+	.loc	1 225 15 is_stmt 0      @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q7, q12, q11, #1
+	.loc	1 226 12 is_stmt 1      @ ../crypto/hrss/hrss.c:226:12
+	vorr	q8, q8, q7
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshr.u16	q8, q1, #1
+.Ltmp90:
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q1, q9, q10
+.Ltmp91:
+	.loc	1 219 15                @ ../crypto/hrss/hrss.c:219:15
+	add	r0, sp, #352
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q13, q1, #15
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vext.16	q9, q11, q13, #1
+	.loc	1 219 12                @ ../crypto/hrss/hrss.c:219:12
+	vorr	q8, q9, q8
+	.loc	1 219 15 is_stmt 0      @ ../crypto/hrss/hrss.c:219:15
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	.loc	1 224 12 is_stmt 1      @ ../crypto/hrss/hrss.c:224:12
+	add	r0, sp, #352
+	.loc	1 219 15                @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q9, q9, q11, #1
+	vmov.i32	q11, #0x0
+	.loc	1 220 12                @ ../crypto/hrss/hrss.c:220:12
+	vorr	q8, q8, q9
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #576
+	vshr.u16	q8, q6, #1
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #496
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #480
+.Ltmp92:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q12, q5
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #416
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q14, q9
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #480
+	.loc	1 685 40 is_stmt 1      @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q4, q6
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q12, q9, q12
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q14, q10
+.Ltmp93:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q15, q12, q0
+.Ltmp94:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q4, q10, q4
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q6
+.Ltmp95:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q14, q4, q3
+.Ltmp96:
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q5
+	vst1.64	{d8, d9}, [r0:128]      @ 16-byte Spill
+.Ltmp97:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q15, q15, q14
+	add	r0, sp, #576
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q14, q4, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q4, q12, q3
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	.loc	1 705 31 is_stmt 1      @ ../crypto/hrss/hrss.c:705:31
+	vorr	q5, q9, q10
+.Ltmp98:
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	add	r0, sp, #432
+.Ltmp99:
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q4, q4, q14
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q6, q15, q10
+	.loc	1 708 41 is_stmt 0      @ ../crypto/hrss/hrss.c:708:41
+	vbic	q14, q4, q5
+	.loc	1 708 35                @ ../crypto/hrss/hrss.c:708:35
+	veor	q14, q14, q6
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q6, q15, q4
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q7, q9, q6
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q4, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q7, q14, q7
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q6
+.Ltmp100:
+	.loc	1 223 33                @ ../crypto/hrss/hrss.c:223:33
+	vshl.i16	q14, q7, #15
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vext.16	q12, q11, q14, #1
+	.loc	1 225 12                @ ../crypto/hrss/hrss.c:225:12
+	vorr	q8, q12, q8
+	.loc	1 225 15 is_stmt 0      @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q12, q2, q11, #1
+	.loc	1 226 12 is_stmt 1      @ ../crypto/hrss/hrss.c:226:12
+	vorr	q8, q8, q12
+	.loc	1 225 15                @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q14, q14, q11, #1
+.Ltmp101:
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q12, q15, q5
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q12, q9
+.Ltmp102:
+	.loc	1 218 12 is_stmt 1      @ ../crypto/hrss/hrss.c:218:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshr.u16	q8, q1, #1
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	add	r0, sp, #400
+.Ltmp103:
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q2, q9, q10
+.Ltmp104:
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q1, q2, #15
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vext.16	q9, q11, q1, #1
+	.loc	1 219 12                @ ../crypto/hrss/hrss.c:219:12
+	vorr	q8, q9, q8
+	.loc	1 219 15 is_stmt 0      @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q9, q13, q11, #1
+	.loc	1 220 12 is_stmt 1      @ ../crypto/hrss/hrss.c:220:12
+	vorr	q8, q8, q9
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #640
+	vshr.u16	q8, q7, #1
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #560
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #928
+.Ltmp105:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q15, q4
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #544
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q12, q9
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #464
+	.loc	1 691 12 is_stmt 1      @ ../crypto/hrss/hrss.c:691:12
+	veor	q15, q9, q15
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #464
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q6, q5
+.Ltmp106:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q13, q15, q0
+.Ltmp107:
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q12, q10
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q4
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q6, q10, q6
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q5
+.Ltmp108:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q12, q6, q3
+	vst1.64	{d12, d13}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #560
+	.loc	1 701 38 is_stmt 0      @ ../crypto/hrss/hrss.c:701:38
+	veor	q12, q13, q12
+	vst1.64	{d30, d31}, [r0:128]    @ 16-byte Spill
+	.loc	1 702 33 is_stmt 1      @ ../crypto/hrss/hrss.c:702:33
+	vand	q13, q6, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q15, q15, q3
+.Ltmp109:
+	.loc	1 218 12 is_stmt 1      @ ../crypto/hrss/hrss.c:218:12
+	add	r0, sp, #496
+.Ltmp110:
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q5, q12, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q13, q15, q13
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q15, q9, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q4, q13, q15
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q4, q4, q5
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q5, q12, q13
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q12, q12, q15
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q6, q9, q5
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q13, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q4, q4, q6
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q5
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q12, q9
+.Ltmp111:
+	.loc	1 223 33 is_stmt 1      @ ../crypto/hrss/hrss.c:223:33
+	vshl.i16	q6, q4, #15
+.Ltmp112:
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q9, q9, q10
+	vmov.i32	q13, #0x0
+.Ltmp113:
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vext.16	q7, q11, q6, #1
+	vmov.i32	q11, #0x0
+	.loc	1 217 33                @ ../crypto/hrss/hrss.c:217:33
+	vshl.i16	q10, q9, #15
+	.loc	1 225 12                @ ../crypto/hrss/hrss.c:225:12
+	vorr	q8, q7, q8
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	vshr.u16	q9, q9, #1
+	.loc	1 226 12                @ ../crypto/hrss/hrss.c:226:12
+	vorr	q8, q8, q14
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vext.16	q12, q13, q10, #1
+	.loc	1 218 12                @ ../crypto/hrss/hrss.c:218:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshr.u16	q8, q2, #1
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	add	r0, sp, #416
+	.loc	1 219 12                @ ../crypto/hrss/hrss.c:219:12
+	vorr	q8, q12, q8
+	.loc	1 219 15 is_stmt 0      @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q12, q1, q13, #1
+	.loc	1 220 12 is_stmt 1      @ ../crypto/hrss/hrss.c:220:12
+	vorr	q8, q8, q12
+	.loc	1 224 12                @ ../crypto/hrss/hrss.c:224:12
+	vshr.u16	q12, q4, #1
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #64
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	add	r0, sp, #640
+	.loc	1 227 15                @ ../crypto/hrss/hrss.c:227:15
+	vext.16	q8, q13, q8, #1
+	.loc	1 225 12                @ ../crypto/hrss/hrss.c:225:12
+	vorr	q8, q8, q12
+	.loc	1 225 15 is_stmt 0      @ ../crypto/hrss/hrss.c:225:15
+	vext.16	q12, q6, q13, #1
+	.loc	1 226 12 is_stmt 1      @ ../crypto/hrss/hrss.c:226:12
+	vorr	q8, q8, q12
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #48
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+.Ltmp114:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	add	r0, sp, #544
+.Ltmp115:
+	.loc	1 221 15                @ ../crypto/hrss/hrss.c:221:15
+	vext.16	q8, q13, q8, #1
+	.loc	1 219 12                @ ../crypto/hrss/hrss.c:219:12
+	vorr	q8, q8, q9
+	.loc	1 219 15 is_stmt 0      @ ../crypto/hrss/hrss.c:219:15
+	vext.16	q9, q10, q11, #1
+	.loc	1 220 12 is_stmt 1      @ ../crypto/hrss/hrss.c:220:12
+	vorr	q8, q8, q9
+.Ltmp116:
+	.loc	1 689 40                @ ../crypto/hrss/hrss.c:689:40
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #864
+	vld1.64	{d26, d27}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #256
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #928
+	veor	q8, q9, q13
+	vld1.64	{d14, d15}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #256
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q8, q7, q8
+.Ltmp117:
+	.loc	1 714 3 is_stmt 1       @ ../crypto/hrss/hrss.c:714:3
+	vmov.32	r4, d14[0]
+.Ltmp118:
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q15, q8, q9
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q8, q8, q13
+.Ltmp119:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q11, q15, q0
+	vst1.64	{d30, d31}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #768
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #240
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	.loc	1 701 32 is_stmt 0      @ ../crypto/hrss/hrss.c:701:32
+	add	r0, sp, #240
+.Ltmp120:
+	.loc	1 685 40 is_stmt 1      @ ../crypto/hrss/hrss.c:685:40
+	veor	q9, q10, q14
+	.loc	1 685 30 is_stmt 0      @ ../crypto/hrss/hrss.c:685:30
+	vand	q9, q7, q9
+	.loc	1 687 12 is_stmt 1      @ ../crypto/hrss/hrss.c:687:12
+	veor	q12, q9, q10
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q9, q9, q14
+.Ltmp121:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q10, q12, q3
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #864
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q10, q11, q10
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q11, q12, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q12, q15, q3
+.Ltmp122:
+	.loc	1 766 52 is_stmt 1      @ ../crypto/hrss/hrss.c:766:52
+	and.w	r3, r3, r4
+.Ltmp123:
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q14, q10, q9
+.Ltmp124:
+	.loc	1 767 11                @ ../crypto/hrss/hrss.c:767:11
+	eor.w	r5, r5, r3
+.Ltmp125:
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q11, q12, q11
+.Ltmp126:
+	.loc	2 304 30                @ ../crypto/hrss/../internal.h:304:30
+	add.w	r4, r5, r12
+.Ltmp127:
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q12, q8, q9
+.Ltmp128:
+	.loc	1 768 11                @ ../crypto/hrss/hrss.c:768:11
+	eor.w	r6, r6, r3
+.Ltmp129:
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q13, q11, q12
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q13, q13, q14
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q14, q10, q11
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q10, q10, q12
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q15, q8, q14
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q8, q11, q8
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q13, q13, q15
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q9, q9, q14
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q8, q10, q8
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d26, d27}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #768
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q8, q9
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #848
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #288
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #288
+.Ltmp130:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	veor	q8, q10, q14
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q7, q8
+	.loc	1 691 12 is_stmt 1      @ ../crypto/hrss/hrss.c:691:12
+	veor	q13, q9, q10
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q14
+.Ltmp131:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q12, q13, q0
+	vst1.64	{d26, d27}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #784
+	.loc	1 702 44                @ ../crypto/hrss/hrss.c:702:44
+	vand	q13, q13, q3
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #272
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	add	r0, sp, #272
+.Ltmp132:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q8, q11, q15
+	.loc	1 685 30 is_stmt 0      @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q7, q8
+	.loc	1 687 12 is_stmt 1      @ ../crypto/hrss/hrss.c:687:12
+	veor	q8, q10, q11
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q15
+.Ltmp133:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q11, q8, q3
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #848
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q11, q12, q11
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q12, q8, q0
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q15, q11, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q12, q13, q12
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q13, q9, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q14, q12, q13
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q14, q14, q15
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q15, q11, q12
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q11, q11, q13
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q4, q9, q15
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q12, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q8, q14, q4
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q15
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q11, q9
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #784
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q9, q10
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #832
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #320
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #752
+.Ltmp134:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q10, q8
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #304
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q7, q9
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp135:
+	.loc	1 701 32 is_stmt 1      @ ../crypto/hrss/hrss.c:701:32
+	add	r0, sp, #304
+.Ltmp136:
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q12, q9, q10
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q11, q15
+.Ltmp137:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q13, q12, q0
+.Ltmp138:
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q7, q10
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q8
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q14, q10, q11
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q15
+.Ltmp139:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q11, q14, q3
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #832
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q11, q13, q11
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q13, q14, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q14, q12, q3
+	.loc	1 708 22 is_stmt 1      @ ../crypto/hrss/hrss.c:708:22
+	vand	q4, q11, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q13, q14, q13
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q14, q9, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q15, q13, q14
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q15, q15, q4
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q4, q11, q13
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q11, q11, q14
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q5, q9, q4
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q13, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q8, q15, q5
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q4
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q11, q9
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #752
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q9, q10
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #816
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #224
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #736
+.Ltmp140:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q10, q8
+	vld1.64	{d2, d3}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #336
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q7, q9
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp141:
+	.loc	1 707 60 is_stmt 1      @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #816
+.Ltmp142:
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q14, q9, q10
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q11, q1
+.Ltmp143:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q15, q14, q0
+.Ltmp144:
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q7, q10
+.Ltmp145:
+	.loc	1 702 44                @ ../crypto/hrss/hrss.c:702:44
+	vand	q4, q14, q3
+.Ltmp146:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q11, q10, q11
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q1
+.Ltmp147:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q13, q11, q3
+.Ltmp148:
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q8
+.Ltmp149:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q13, q15, q13
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q15, q11, q0
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q1, q13, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q15, q4, q15
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q4, q9, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q5, q15, q4
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q1, q5, q1
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q5, q13, q15
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q13, q13, q4
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q2, q9, q5
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q15, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q8, q1, q2
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q5
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q13, q9
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #736
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q9, q10
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #800
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #96
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #720
+.Ltmp150:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	veor	q9, q10, q8
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #128
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q9, q7, q9
+	vld1.64	{d26, d27}, [r0:128]    @ 16-byte Reload
+.Ltmp151:
+	.loc	1 707 60 is_stmt 1      @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #800
+.Ltmp152:
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q4, q9, q10
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q13, q5
+.Ltmp153:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q1, q4, q0
+.Ltmp154:
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q7, q10
+.Ltmp155:
+	.loc	1 702 44                @ ../crypto/hrss/hrss.c:702:44
+	vand	q2, q4, q3
+.Ltmp156:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q13, q10, q13
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q5
+.Ltmp157:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q15, q13, q3
+.Ltmp158:
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q9, q9, q8
+.Ltmp159:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q15, q1, q15
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q1, q13, q0
+	.loc	1 708 22                @ ../crypto/hrss/hrss.c:708:22
+	vand	q8, q15, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q1, q2, q1
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q2, q9, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q5, q1, q2
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q8, q5, q8
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q5, q15, q1
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q6, q9, q5
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q9, q1, q9
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q8, q8, q6
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vbic	q8, q10, q5
+	.loc	1 707 41 is_stmt 0      @ ../crypto/hrss/hrss.c:707:41
+	vbic	q10, q15, q2
+.Ltmp160:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	add	r0, sp, #720
+.Ltmp161:
+	.loc	1 707 35                @ ../crypto/hrss/hrss.c:707:35
+	veor	q9, q10, q9
+	.loc	1 707 54 is_stmt 0      @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q9, q8
+.Ltmp162:
+	.loc	1 689 40 is_stmt 1      @ ../crypto/hrss/hrss.c:689:40
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #704
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #208
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #368
+	veor	q8, q9, q5
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #688
+	.loc	1 689 30 is_stmt 0      @ ../crypto/hrss/hrss.c:689:30
+	vand	q8, q7, q8
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+.Ltmp163:
+	.loc	1 707 60 is_stmt 1      @ ../crypto/hrss/hrss.c:707:60
+	add	r0, sp, #704
+.Ltmp164:
+	.loc	1 685 40                @ ../crypto/hrss/hrss.c:685:40
+	veor	q10, q15, q6
+	.loc	1 691 12                @ ../crypto/hrss/hrss.c:691:12
+	veor	q9, q8, q9
+	.loc	1 685 30                @ ../crypto/hrss/hrss.c:685:30
+	vand	q10, q7, q10
+.Ltmp165:
+	.loc	1 701 44                @ ../crypto/hrss/hrss.c:701:44
+	vand	q2, q9, q0
+.Ltmp166:
+	.loc	1 687 12                @ ../crypto/hrss/hrss.c:687:12
+	veor	q15, q10, q15
+	.loc	1 686 12                @ ../crypto/hrss/hrss.c:686:12
+	veor	q10, q10, q6
+.Ltmp167:
+	.loc	1 701 32                @ ../crypto/hrss/hrss.c:701:32
+	vand	q1, q15, q3
+.Ltmp168:
+	.loc	1 690 12                @ ../crypto/hrss/hrss.c:690:12
+	veor	q8, q8, q5
+.Ltmp169:
+	.loc	1 701 38                @ ../crypto/hrss/hrss.c:701:38
+	veor	q1, q2, q1
+	.loc	1 702 33                @ ../crypto/hrss/hrss.c:702:33
+	vand	q2, q15, q0
+	.loc	1 702 44 is_stmt 0      @ ../crypto/hrss/hrss.c:702:44
+	vand	q0, q9, q3
+	.loc	1 708 22 is_stmt 1      @ ../crypto/hrss/hrss.c:708:22
+	vand	q5, q1, q10
+	.loc	1 702 38                @ ../crypto/hrss/hrss.c:702:38
+	veor	q0, q0, q2
+	.loc	1 705 31                @ ../crypto/hrss/hrss.c:705:31
+	vorr	q2, q8, q10
+	.loc	1 708 41                @ ../crypto/hrss/hrss.c:708:41
+	vbic	q3, q0, q2
+	.loc	1 708 35 is_stmt 0      @ ../crypto/hrss/hrss.c:708:35
+	veor	q3, q3, q5
+	.loc	1 706 34 is_stmt 1      @ ../crypto/hrss/hrss.c:706:34
+	vorr	q5, q1, q0
+	.loc	1 707 41                @ ../crypto/hrss/hrss.c:707:41
+	vbic	q1, q1, q2
+	.loc	1 708 60                @ ../crypto/hrss/hrss.c:708:60
+	vbic	q6, q8, q5
+	.loc	1 707 22                @ ../crypto/hrss/hrss.c:707:22
+	vand	q8, q0, q8
+	.loc	1 708 54                @ ../crypto/hrss/hrss.c:708:54
+	veor	q3, q3, q6
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vbic	q10, q10, q5
+	.loc	1 707 35 is_stmt 0      @ ../crypto/hrss/hrss.c:707:35
+	veor	q8, q1, q8
+	.loc	1 707 60                @ ../crypto/hrss/hrss.c:707:60
+	vst1.64	{d6, d7}, [r0:128]      @ 16-byte Spill
+.Ltmp170:
+	.loc	1 203 33 is_stmt 1      @ ../crypto/hrss/hrss.c:203:33
+	vshr.u16	q0, q9, #15
+.Ltmp171:
+	.loc	1 707 54                @ ../crypto/hrss/hrss.c:707:54
+	veor	q8, q8, q10
+.Ltmp172:
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	add	r0, sp, #688
+	vmov.i32	q3, #0x0
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q9, #1
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	add	r0, sp, #208
+	.loc	1 205 15                @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q9, q3, q0, #7
+	.loc	1 206 12                @ ../crypto/hrss/hrss.c:206:12
+	vorr	q8, q9, q8
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	vshr.u16	q9, q15, #15
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q15, #1
+	.loc	1 203 33                @ ../crypto/hrss/hrss.c:203:33
+	add	r0, sp, #368
+	.loc	1 198 15                @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q10, q3, q9, #7
+	.loc	1 199 12                @ ../crypto/hrss/hrss.c:199:12
+	vorr	q8, q10, q8
+	.loc	1 203 33                @ ../crypto/hrss/hrss.c:203:33
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #288
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #256
+	vshr.u16	q8, q5, #15
+	vld1.64	{d2, d3}, [r0:128]      @ 16-byte Reload
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	add	r0, sp, #256
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	vshl.i16	q10, q1, #1
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	vext.16	q15, q8, q3, #7
+	.loc	1 205 12                @ ../crypto/hrss/hrss.c:205:12
+	vorr	q10, q15, q10
+	.loc	1 205 15 is_stmt 0      @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q8, q3, q8, #7
+	.loc	1 203 33 is_stmt 1      @ ../crypto/hrss/hrss.c:203:33
+	vshr.u16	q15, q1, #15
+	.loc	1 205 15                @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q15, q3, q15, #7
+	.loc	1 206 12                @ ../crypto/hrss/hrss.c:206:12
+	vorr	q10, q10, q15
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #272
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #240
+	vshr.u16	q10, q6, #15
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	add	r0, sp, #240
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vshl.i16	q15, q2, #1
+	.loc	1 201 15                @ ../crypto/hrss/hrss.c:201:15
+	vext.16	q1, q10, q3, #7
+	.loc	1 198 12                @ ../crypto/hrss/hrss.c:198:12
+	vorr	q15, q1, q15
+	.loc	1 198 15 is_stmt 0      @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q10, q3, q10, #7
+	.loc	1 195 33 is_stmt 1      @ ../crypto/hrss/hrss.c:195:33
+	vshr.u16	q1, q2, #15
+	.loc	1 198 15                @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q1, q3, q1, #7
+	.loc	1 199 12                @ ../crypto/hrss/hrss.c:199:12
+	vorr	q15, q15, q1
+	.loc	1 203 33                @ ../crypto/hrss/hrss.c:203:33
+	vshr.u16	q1, q12, #15
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	vst1.64	{d30, d31}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q15, q5, #1
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	add	r0, sp, #288
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	vext.16	q2, q1, q3, #7
+	.loc	1 205 12                @ ../crypto/hrss/hrss.c:205:12
+	vorr	q15, q2, q15
+	.loc	1 206 12                @ ../crypto/hrss/hrss.c:206:12
+	vorr	q8, q15, q8
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #304
+	vshl.i16	q8, q6, #1
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #80
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	vshr.u16	q15, q5, #15
+	.loc	1 201 15                @ ../crypto/hrss/hrss.c:201:15
+	vext.16	q2, q15, q3, #7
+	.loc	1 198 12                @ ../crypto/hrss/hrss.c:198:12
+	vorr	q8, q2, q8
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	add	r0, sp, #272
+	.loc	1 199 12                @ ../crypto/hrss/hrss.c:199:12
+	vorr	q8, q8, q10
+	.loc	1 198 15                @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q15, q3, q15, #7
+	.loc	1 203 33                @ ../crypto/hrss/hrss.c:203:33
+	vshr.u16	q10, q14, #15
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q12, #1
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	add	r0, sp, #320
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	vext.16	q12, q10, q3, #7
+	.loc	1 205 12                @ ../crypto/hrss/hrss.c:205:12
+	vorr	q8, q12, q8
+	.loc	1 205 15 is_stmt 0      @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q12, q3, q1, #7
+	.loc	1 206 12 is_stmt 1      @ ../crypto/hrss/hrss.c:206:12
+	vorr	q8, q8, q12
+	.loc	1 205 15                @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q10, q3, q10, #7
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	vshr.u16	q12, q11, #15
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q5, #1
+	add	r0, sp, #160
+	.loc	1 201 15                @ ../crypto/hrss/hrss.c:201:15
+	vext.16	q1, q12, q3, #7
+	.loc	1 198 12                @ ../crypto/hrss/hrss.c:198:12
+	vorr	q8, q1, q8
+	vld1.64	{d2, d3}, [r0:128]      @ 16-byte Reload
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	add	r0, sp, #304
+	.loc	1 199 12                @ ../crypto/hrss/hrss.c:199:12
+	vorr	q8, q8, q15
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q14, #1
+	.loc	1 203 33                @ ../crypto/hrss/hrss.c:203:33
+	vshr.u16	q14, q4, #15
+	add	r0, sp, #112
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	vext.16	q15, q14, q3, #7
+	.loc	1 205 12                @ ../crypto/hrss/hrss.c:205:12
+	vorr	q8, q15, q8
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	add	r0, sp, #224
+	.loc	1 206 12                @ ../crypto/hrss/hrss.c:206:12
+	vorr	q8, q8, q10
+	.loc	1 195 33                @ ../crypto/hrss/hrss.c:195:33
+	vshr.u16	q10, q13, #15
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vshl.i16	q8, q11, #1
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	add	r0, sp, #336
+	.loc	1 201 15                @ ../crypto/hrss/hrss.c:201:15
+	vext.16	q11, q10, q3, #7
+	.loc	1 198 12                @ ../crypto/hrss/hrss.c:198:12
+	vorr	q8, q11, q8
+	.loc	1 198 15 is_stmt 0      @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q11, q3, q12, #7
+	vmov.i32	q12, #0x0
+	.loc	1 199 12 is_stmt 1      @ ../crypto/hrss/hrss.c:199:12
+	vorr	q8, q8, q11
+	.loc	1 204 12                @ ../crypto/hrss/hrss.c:204:12
+	vshl.i16	q11, q4, #1
+	.loc	1 207 15                @ ../crypto/hrss/hrss.c:207:15
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #176
+	vext.16	q8, q0, q3, #7
+	.loc	1 205 12                @ ../crypto/hrss/hrss.c:205:12
+	vorr	q8, q8, q11
+	.loc	1 205 15 is_stmt 0      @ ../crypto/hrss/hrss.c:205:15
+	vext.16	q11, q3, q14, #7
+	.loc	1 206 12 is_stmt 1      @ ../crypto/hrss/hrss.c:206:12
+	vorr	q11, q8, q11
+	.loc	1 201 15                @ ../crypto/hrss/hrss.c:201:15
+	vext.16	q8, q9, q3, #7
+	.loc	1 196 12                @ ../crypto/hrss/hrss.c:196:12
+	vshl.i16	q9, q13, #1
+	.loc	1 198 12                @ ../crypto/hrss/hrss.c:198:12
+	vorr	q8, q8, q9
+	.loc	1 198 15 is_stmt 0      @ ../crypto/hrss/hrss.c:198:15
+	vext.16	q9, q12, q10, #7
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp173:
+	.loc	2 304 38 is_stmt 1      @ ../crypto/hrss/../internal.h:304:38
+	sub.w	r0, r5, #2
+	.loc	2 304 33 is_stmt 0      @ ../crypto/hrss/../internal.h:304:33
+	and.w	r0, r0, r4
+.Ltmp174:
+	.loc	1 199 12 is_stmt 1      @ ../crypto/hrss/hrss.c:199:12
+	vorr	q12, q8, q9
+.Ltmp175:
+	.loc	1 770 10                @ ../crypto/hrss/hrss.c:770:10
+	sub.w	r5, r5, #1
+.Ltmp176:
+	.loc	2 234 13                @ ../crypto/hrss/../internal.h:234:13
+	asr.w	r0, r0, #31
+.Ltmp177:
+	.loc	1 153 50                @ ../crypto/hrss/hrss.c:153:50
+	vdup.16	q8, r0
+	add	r0, sp, #192
+.Ltmp178:
+	.loc	1 782 19                @ ../crypto/hrss/hrss.c:782:19
+	vmvn	q9, q8
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+.Ltmp179:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	add	r0, sp, #912
+	vld1.64	{d26, d27}, [r0:128]    @ 16-byte Reload
+.Ltmp180:
+	.loc	1 714 3 is_stmt 0       @ ../crypto/hrss/hrss.c:714:3
+	add	r0, sp, #144
+	vorr	q14, q9, q9
+.Ltmp181:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	vmov.32	r4, d26[0]
+.Ltmp182:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	vld1.64	{d26, d27}, [r0:128]    @ 16-byte Reload
+	vmov.32	r0, d26[0]
+.Ltmp183:
+	.loc	1 775 56 is_stmt 1      @ ../crypto/hrss/hrss.c:775:56
+	and.w	r0, r0, r4
+	.loc	1 771 12                @ ../crypto/hrss/hrss.c:771:12
+	and	r4, r4, #1
+	.loc	1 771 7 is_stmt 0       @ ../crypto/hrss/hrss.c:771:7
+	add	r1, r4
+.Ltmp184:
+	.loc	2 343 16 is_stmt 1      @ ../crypto/hrss/../internal.h:343:16
+	and.w	r4, r0, r1
+	.loc	2 343 30 is_stmt 0      @ ../crypto/hrss/../internal.h:343:30
+	bic.w	r0, lr, r0
+	.loc	2 343 21                @ ../crypto/hrss/../internal.h:343:21
+	orr.w	lr, r0, r4
+.Ltmp185:
+	.loc	1 747 3 is_stmt 1       @ ../crypto/hrss/hrss.c:747:3
+	bne.w	.LBB0_3
+@ %bb.4:
+	.loc	1 786 3                 @ ../crypto/hrss/hrss.c:786:3
+	add	r0, sp, #720
+	add.w	r1, r10, #16
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #736
+	mov	r5, r10
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	add.w	r9, r10, #88
+	.loc	1 786 3                 @ ../crypto/hrss/hrss.c:786:3
+	vst1.32	{d16, d17}, [r1]
+	add.w	r1, r10, #32
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	mov	r8, r9
+	.loc	1 786 3                 @ ../crypto/hrss/hrss.c:786:3
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #752
+	vst1.32	{d16, d17}, [r1]
+	add.w	r1, r10, #48
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #784
+	vst1.32	{d16, d17}, [r1]
+	add.w	r1, r10, #64
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #768
+	vst1.32	{d16, d17}, [r1]
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	movs	r1, #104
+	.loc	1 786 3                 @ ../crypto/hrss/hrss.c:786:3
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #688
+	vstr	d16, [r10, #80]
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	add	r0, sp, #800
+	.loc	1 786 3                 @ ../crypto/hrss/hrss.c:786:3
+	vst1.16	{d16, d17}, [r5], r1
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	add.w	r1, r10, #120
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #816
+	vst1.32	{d16, d17}, [r5]
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #832
+	vst1.32	{d16, d17}, [r1]
+	add.w	r1, r10, #136
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #848
+	vst1.32	{d16, d17}, [r1]
+	add.w	r1, r10, #152
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #704
+	vst1.32	{d16, d17}, [r1]
+	movs	r1, #80
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #864
+	vst1.16	{d16, d17}, [r8], r1
+.Ltmp186:
+	.loc	2 270 42                @ ../crypto/hrss/../internal.h:270:42
+	movw	r1, #701
+	sub.w	r2, r1, lr
+	.loc	2 270 38 is_stmt 0      @ ../crypto/hrss/../internal.h:270:38
+	orr.w	r2, r2, lr
+.Ltmp187:
+	.loc	1 787 3 is_stmt 1       @ ../crypto/hrss/hrss.c:787:3
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+.Ltmp188:
+	.loc	1 461 3                 @ ../crypto/hrss/hrss.c:461:3
+	mov	r0, r10
+.Ltmp189:
+	.loc	1 785 17                @ ../crypto/hrss/hrss.c:785:17
+	and.w	r1, r1, r2, asr #31
+	.loc	1 787 3                 @ ../crypto/hrss/hrss.c:787:3
+	vstr	d16, [r8]
+	.loc	1 785 12                @ ../crypto/hrss/hrss.c:785:12
+	sub.w	r6, lr, r1
+.Ltmp190:
+	.loc	1 461 3                 @ ../crypto/hrss/hrss.c:461:3
+	mov	r1, r6
+	bl	poly2_rotr_consttime
+	.loc	1 462 3                 @ ../crypto/hrss/hrss.c:462:3
+	mov	r0, r9
+	mov	r1, r6
+	bl	poly2_rotr_consttime
+.Ltmp191:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	add.w	lr, sp, #880
+.Ltmp192:
+	.loc	1 789 3                 @ ../crypto/hrss/hrss.c:789:3
+	mov	r0, r10
+.Ltmp193:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	vld1.64	{d16, d17}, [lr:128]    @ 16-byte Reload
+.Ltmp194:
+	.loc	1 714 3 is_stmt 0       @ ../crypto/hrss/hrss.c:714:3
+	add.w	lr, sp, #896
+.Ltmp195:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	vmov.32	r1, d16[0]
+.Ltmp196:
+	.loc	1 714 3                 @ ../crypto/hrss/hrss.c:714:3
+	vld1.64	{d16, d17}, [lr:128]    @ 16-byte Reload
+	vmov.32	r2, d16[0]
+.Ltmp197:
+	.loc	1 789 3 is_stmt 1       @ ../crypto/hrss/hrss.c:789:3
+	bl	poly3_mul_const
+	movs	r0, #84
+.Ltmp198:
+	.loc	1 500 44                @ ../crypto/hrss/hrss.c:500:44
+	ldr.w	r1, [r10, #84]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vld1.32	{d19}, [r9], r0
+	movs	r6, #112
+.Ltmp199:
+	.loc	1 791 1                 @ ../crypto/hrss/hrss.c:791:1
+	sub.w	r4, r7, #88
+.Ltmp200:
+	.loc	1 499 44                @ ../crypto/hrss/hrss.c:499:44
+	ldr.w	r0, [r9]
+.Ltmp201:
+	.loc	1 489 35                @ ../crypto/hrss/hrss.c:489:35
+	orr.w	r3, r1, r0
+.Ltmp202:
+	.loc	1 489 75 is_stmt 0      @ ../crypto/hrss/hrss.c:489:75
+	sbfx	r2, r0, #28, #1
+.Ltmp203:
+	.loc	1 504 22 is_stmt 1      @ ../crypto/hrss/hrss.c:504:22
+	mov	r0, r10
+	vld1.32	{d20}, [r0], r6
+.Ltmp204:
+	.loc	1 489 75                @ ../crypto/hrss/hrss.c:489:75
+	sbfx	r1, r1, #28, #1
+.Ltmp205:
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d21, d19, d20
+	.loc	1 503 3                 @ ../crypto/hrss/hrss.c:503:3
+	vdup.32	d16, r1
+.Ltmp206:
+	.loc	1 489 75                @ ../crypto/hrss/hrss.c:489:75
+	lsls	r1, r3, #3
+.Ltmp207:
+	.loc	1 501 21                @ ../crypto/hrss/hrss.c:501:21
+	mvn.w	r1, r1, asr #31
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d19, d16
+	.loc	1 503 3                 @ ../crypto/hrss/hrss.c:503:3
+	vdup.32	d18, r1
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	mov	r1, r10
+	.loc	1 507 56 is_stmt 0      @ ../crypto/hrss/hrss.c:507:56
+	vand	d23, d20, d18
+	.loc	1 503 3 is_stmt 1       @ ../crypto/hrss/hrss.c:503:3
+	vdup.32	d17, r2
+	.loc	1 507 38                @ ../crypto/hrss/hrss.c:507:38
+	vbic	d24, d17, d21
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	movs	r2, #96
+	.loc	1 507 32                @ ../crypto/hrss/hrss.c:507:32
+	veor	d22, d22, d23
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d25, [r10, #16]
+	vldr	d26, [r10, #24]
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d30, d25, d17
+	.loc	1 507 50                @ ../crypto/hrss/hrss.c:507:50
+	veor	d22, d22, d24
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d23, [r10, #8]
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vst1.32	{d22}, [r1], r2
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d24, d23, d17
+	vand	d3, d26, d17
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d22, [r1]
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d20, d20, d17
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d27, [r5]
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d31, d22, d23
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d28, d22, d18
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d0, [r0]
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d29, d27, d25
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d1, d27, d18
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d2, d0, d26
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d4, d0, d18
+	.loc	1 508 32 is_stmt 0      @ ../crypto/hrss/hrss.c:508:32
+	veor	d24, d28, d24
+	.loc	1 508 38                @ ../crypto/hrss/hrss.c:508:38
+	vbic	d5, d16, d31
+	.loc	1 508 32                @ ../crypto/hrss/hrss.c:508:32
+	veor	d30, d1, d30
+	.loc	1 508 38                @ ../crypto/hrss/hrss.c:508:38
+	vbic	d28, d16, d29
+	vbic	d1, d16, d2
+	.loc	1 508 32                @ ../crypto/hrss/hrss.c:508:32
+	veor	d3, d4, d3
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d24, d24, d5
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d19, d19, d18
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d28, d30, d28
+	veor	d30, d3, d1
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d24, [r1]
+	.loc	1 508 32                @ ../crypto/hrss/hrss.c:508:32
+	veor	d19, d19, d20
+	.loc	1 504 22 is_stmt 1      @ ../crypto/hrss/hrss.c:504:22
+	vldr	d24, [r10, #32]
+	.loc	1 508 38                @ ../crypto/hrss/hrss.c:508:38
+	vbic	d21, d16, d21
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d27, d27, d16
+	.loc	1 507 56 is_stmt 0      @ ../crypto/hrss/hrss.c:507:56
+	vand	d25, d25, d18
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d0, d0, d16
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d26, d26, d18
+	vand	d23, d23, d18
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d22, d16
+	.loc	1 507 38                @ ../crypto/hrss/hrss.c:507:38
+	vbic	d20, d17, d29
+	vbic	d29, d17, d31
+	.loc	1 505 22 is_stmt 1      @ ../crypto/hrss/hrss.c:505:22
+	vldr	d31, [r10, #120]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d28, [r5]
+	.loc	1 507 32                @ ../crypto/hrss/hrss.c:507:32
+	veor	d22, d22, d23
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d30, [r0]
+	.loc	1 507 32                @ ../crypto/hrss/hrss.c:507:32
+	veor	d25, d27, d25
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d1, [r10, #128]
+	.loc	1 507 32                @ ../crypto/hrss/hrss.c:507:32
+	veor	d26, d0, d26
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d30, [r10, #40]
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d19, d19, d21
+	.loc	1 507 38                @ ../crypto/hrss/hrss.c:507:38
+	vbic	d27, d17, d2
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d23, d24, d17
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d28, d24, d18
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d19, [r10, #88]
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d21, d1, d30
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d0, d31, d16
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d24, d31, d24
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d31, d31, d18
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d2, d30, d18
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d4, d30, d17
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d3, d1, d16
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d5, d1, d18
+	.loc	1 507 50                @ ../crypto/hrss/hrss.c:507:50
+	veor	d20, d25, d20
+	veor	d25, d26, d27
+	veor	d22, d22, d29
+	.loc	1 507 32 is_stmt 0      @ ../crypto/hrss/hrss.c:507:32
+	veor	d26, d0, d28
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d20, [r10, #16]
+	.loc	1 507 38                @ ../crypto/hrss/hrss.c:507:38
+	vbic	d27, d17, d24
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d25, [r10, #24]
+	.loc	1 508 32 is_stmt 1      @ ../crypto/hrss/hrss.c:508:32
+	veor	d23, d31, d23
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d22, [r10, #8]
+	.loc	1 508 38                @ ../crypto/hrss/hrss.c:508:38
+	vbic	d24, d16, d24
+	.loc	1 507 32                @ ../crypto/hrss/hrss.c:507:32
+	veor	d28, d3, d2
+	.loc	1 507 38 is_stmt 0      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d19, d17, d21
+	.loc	1 508 38 is_stmt 1      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d30, d16, d21
+	.loc	1 508 32 is_stmt 0      @ ../crypto/hrss/hrss.c:508:32
+	veor	d29, d5, d4
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d20, d26, d27
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d21, d23, d24
+	.loc	1 507 50                @ ../crypto/hrss/hrss.c:507:50
+	veor	d19, d28, d19
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d22, d29, d30
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d20, [r10, #32]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d21, [r10, #120]
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d19, [r10, #40]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d22, [r10, #128]
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d19, [r10, #48]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d20, [r10, #136]
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d21, d19, d18
+	.loc	1 507 20 is_stmt 0      @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d20, d16
+	.loc	1 506 27 is_stmt 1      @ ../crypto/hrss/hrss.c:506:27
+	vorr	d23, d20, d19
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d19, d19, d17
+	.loc	1 508 56 is_stmt 0      @ ../crypto/hrss/hrss.c:508:56
+	vand	d20, d20, d18
+	.loc	1 507 32 is_stmt 1      @ ../crypto/hrss/hrss.c:507:32
+	veor	d21, d22, d21
+	.loc	1 507 38 is_stmt 0      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d22, d17, d23
+	.loc	1 508 32 is_stmt 1      @ ../crypto/hrss/hrss.c:508:32
+	veor	d19, d20, d19
+	.loc	1 508 38 is_stmt 0      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d20, d16, d23
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d21, d21, d22
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d19, d19, d20
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d21, [r10, #48]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d19, [r10, #136]
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d19, [r10, #56]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d20, [r10, #144]
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d21, d19, d18
+	.loc	1 507 20 is_stmt 0      @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d20, d16
+	.loc	1 506 27 is_stmt 1      @ ../crypto/hrss/hrss.c:506:27
+	vorr	d23, d20, d19
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d19, d19, d17
+	.loc	1 508 56 is_stmt 0      @ ../crypto/hrss/hrss.c:508:56
+	vand	d20, d20, d18
+	.loc	1 507 32 is_stmt 1      @ ../crypto/hrss/hrss.c:507:32
+	veor	d21, d22, d21
+	.loc	1 507 38 is_stmt 0      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d22, d17, d23
+	.loc	1 508 32 is_stmt 1      @ ../crypto/hrss/hrss.c:508:32
+	veor	d19, d20, d19
+	.loc	1 508 38 is_stmt 0      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d20, d16, d23
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d21, d21, d22
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d19, d19, d20
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d21, [r10, #56]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d19, [r10, #144]
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d19, [r10, #64]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d20, [r10, #152]
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d21, d19, d18
+	.loc	1 507 20 is_stmt 0      @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d20, d16
+	.loc	1 506 27 is_stmt 1      @ ../crypto/hrss/hrss.c:506:27
+	vorr	d23, d20, d19
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d19, d19, d17
+	.loc	1 508 56 is_stmt 0      @ ../crypto/hrss/hrss.c:508:56
+	vand	d20, d20, d18
+	.loc	1 507 32 is_stmt 1      @ ../crypto/hrss/hrss.c:507:32
+	veor	d21, d22, d21
+	.loc	1 507 38 is_stmt 0      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d22, d17, d23
+	.loc	1 508 32 is_stmt 1      @ ../crypto/hrss/hrss.c:508:32
+	veor	d19, d20, d19
+	.loc	1 508 38 is_stmt 0      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d20, d16, d23
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d21, d21, d22
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d19, d19, d20
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d21, [r10, #64]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d19, [r10, #152]
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d19, [r10, #72]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d20, [r10, #160]
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d21, d19, d18
+	.loc	1 507 20 is_stmt 0      @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d20, d16
+	.loc	1 506 27 is_stmt 1      @ ../crypto/hrss/hrss.c:506:27
+	vorr	d23, d20, d19
+	.loc	1 508 20                @ ../crypto/hrss/hrss.c:508:20
+	vand	d19, d19, d17
+	.loc	1 508 56 is_stmt 0      @ ../crypto/hrss/hrss.c:508:56
+	vand	d20, d20, d18
+	.loc	1 507 32 is_stmt 1      @ ../crypto/hrss/hrss.c:507:32
+	veor	d21, d22, d21
+	.loc	1 507 38 is_stmt 0      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d22, d17, d23
+	.loc	1 508 32 is_stmt 1      @ ../crypto/hrss/hrss.c:508:32
+	veor	d19, d20, d19
+	.loc	1 508 38 is_stmt 0      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d20, d16, d23
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d21, d21, d22
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d19, d19, d20
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d21, [r10, #72]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d19, [r10, #160]
+	.loc	1 505 22                @ ../crypto/hrss/hrss.c:505:22
+	vldr	d19, [r8]
+	.loc	1 504 22                @ ../crypto/hrss/hrss.c:504:22
+	vldr	d20, [r10, #80]
+	.loc	1 507 20                @ ../crypto/hrss/hrss.c:507:20
+	vand	d22, d19, d16
+	.loc	1 506 27                @ ../crypto/hrss/hrss.c:506:27
+	vorr	d21, d19, d20
+	.loc	1 507 56                @ ../crypto/hrss/hrss.c:507:56
+	vand	d23, d20, d18
+	.loc	1 508 56                @ ../crypto/hrss/hrss.c:508:56
+	vand	d18, d19, d18
+	.loc	1 508 20 is_stmt 0      @ ../crypto/hrss/hrss.c:508:20
+	vand	d19, d20, d17
+	.loc	1 507 38 is_stmt 1      @ ../crypto/hrss/hrss.c:507:38
+	vbic	d17, d17, d21
+	.loc	1 507 32 is_stmt 0      @ ../crypto/hrss/hrss.c:507:32
+	veor	d20, d22, d23
+	.loc	1 508 38 is_stmt 1      @ ../crypto/hrss/hrss.c:508:38
+	vbic	d16, d16, d21
+	.loc	1 508 32 is_stmt 0      @ ../crypto/hrss/hrss.c:508:32
+	veor	d18, d18, d19
+	.loc	1 507 50 is_stmt 1      @ ../crypto/hrss/hrss.c:507:50
+	veor	d17, d20, d17
+	.loc	1 508 50                @ ../crypto/hrss/hrss.c:508:50
+	veor	d16, d18, d16
+	.loc	1 507 15                @ ../crypto/hrss/hrss.c:507:15
+	vstr	d17, [r10, #80]
+	.loc	1 511 30                @ ../crypto/hrss/hrss.c:511:30
+	ldr.w	r0, [r10, #84]
+	.loc	1 508 15                @ ../crypto/hrss/hrss.c:508:15
+	vstr	d16, [r8]
+	.loc	1 512 30                @ ../crypto/hrss/hrss.c:512:30
+	ldr.w	r1, [r9]
+	.loc	1 511 30                @ ../crypto/hrss/hrss.c:511:30
+	bic	r0, r0, #-536870912
+	str.w	r0, [r10, #84]
+	.loc	1 512 30                @ ../crypto/hrss/hrss.c:512:30
+	bic	r0, r1, #-536870912
+	str.w	r0, [r9]
+.Ltmp208:
+	.loc	1 791 1                 @ ../crypto/hrss/hrss.c:791:1
+	mov	sp, r4
+	vpop	{d8, d9, d10, d11, d12, d13, d14, d15}
+	pop.w	{r8, r9, r10}
+	pop	{r4, r5, r6, r7, pc}
+.Ltmp209:
+@ %bb.5:
+.Lfunc_end0:
+	.size	poly3_invert_vec, .Lfunc_end0-poly3_invert_vec
+	.cfi_endproc
+	.fnend
+
+	.section	.text.poly_mul_vec,"ax",%progbits
+	.hidden	poly_mul_vec            @ -- Begin function poly_mul_vec
+	.globl	poly_mul_vec
+	.p2align	2
+	.type	poly_mul_vec,%function
+	.code	16                      @ @poly_mul_vec
+	.thumb_func
+poly_mul_vec:
+.Lfunc_begin2:
+	.loc	1 1087 0                @ ../crypto/hrss/hrss.c:1087:0
+	.fnstart
+	.cfi_startproc
+@ %bb.0:
+	.save	{r4, r5, r6, r7, lr}
+	push	{r4, r5, r6, r7, lr}
+	.cfi_def_cfa_offset 20
+	.cfi_offset lr, -4
+	.cfi_offset r7, -8
+	.cfi_offset r6, -12
+	.cfi_offset r5, -16
+	.cfi_offset r4, -20
+	.setfp	r7, sp, #12
+	add	r7, sp, #12
+	.cfi_def_cfa r7, 8
+	.save	{r8, r9, r11}
+	push.w	{r8, r9, r11}
+	.cfi_offset r11, -24
+	.cfi_offset r9, -28
+	.cfi_offset r8, -32
+	.pad	#5600
+	sub.w	sp, sp, #5600
+	mov	r4, sp
+	bfc	r4, #0, #4
+	mov	sp, r4
+	mov	r4, r0
+	ldr	r0, .LCPI2_0
+	add.w	r8, sp, #12
+	movs	r6, #0
+.LPC2_0:
+	add	r0, pc
+	add.w	r5, sp, #2768
+	mov	r3, r2
+	mov	r2, r1
+	ldr.w	r9, [r0]
+	ldr.w	r0, [r9]
+	str.w	r0, [r8]
+.Ltmp218:
+	.loc	1 1098 3 prologue_end   @ ../crypto/hrss/hrss.c:1098:3
+	movs	r0, #88
+.Ltmp219:
+	.loc	2 713 10                @ ../crypto/hrss/../internal.h:713:10
+	strh.w	r6, [r1, #1406]
+	str.w	r6, [r1, #1402]
+	add	r1, sp, #16
+.Ltmp220:
+	.loc	2 713 10 is_stmt 0      @ ../crypto/hrss/../internal.h:713:10
+	strh.w	r6, [r3, #1406]
+	str.w	r6, [r3, #1402]
+.Ltmp221:
+	.loc	1 1098 3 is_stmt 1      @ ../crypto/hrss/hrss.c:1098:3
+	str	r0, [sp]
+	mov	r0, r5
+	bl	poly_mul_vec_aux
+	add.w	r0, r5, #1392
+	.loc	1 1108 24               @ ../crypto/hrss/hrss.c:1108:24
+	vld1.64	{d16, d17}, [r0:128]
+	mov.w	r0, #1408
+.LBB2_1:                                @ =>This Inner Loop Header: Depth=1
+	.loc	1 1109 24               @ ../crypto/hrss/hrss.c:1109:24
+	adds	r1, r5, r6
+	.loc	1 1110 27               @ ../crypto/hrss/hrss.c:1110:27
+	vld1.16	{d18, d19}, [r1:128], r0
+	.loc	1 1109 24               @ ../crypto/hrss/hrss.c:1109:24
+	vld1.64	{d20, d21}, [r1:128]
+	.loc	1 1110 17               @ ../crypto/hrss/hrss.c:1110:17
+	adds	r1, r4, r6
+	.loc	1 1107 24               @ ../crypto/hrss/hrss.c:1107:24
+	adds	r6, #16
+.Ltmp222:
+	.loc	1 181 10                @ ../crypto/hrss/hrss.c:181:10
+	vext.16	q8, q8, q10, #5
+.Ltmp223:
+	.loc	1 1107 3                @ ../crypto/hrss/hrss.c:1107:3
+	cmp.w	r6, #1408
+.Ltmp224:
+	.loc	1 155 58                @ ../crypto/hrss/hrss.c:155:58
+	vadd.i16	q8, q8, q9
+.Ltmp225:
+	.loc	1 1110 17               @ ../crypto/hrss/hrss.c:1110:17
+	vst1.64	{d16, d17}, [r1:128]
+	.loc	1 1108 24               @ ../crypto/hrss/hrss.c:1108:24
+	vorr	q8, q10, q10
+	.loc	1 1107 3                @ ../crypto/hrss/hrss.c:1107:3
+	bne	.LBB2_1
+@ %bb.2:
+.Ltmp226:
+	.loc	2 713 10                @ ../crypto/hrss/../internal.h:713:10
+	movs	r0, #0
+	strh.w	r0, [r4, #1406]
+	str.w	r0, [r4, #1402]
+	ldr.w	r0, [r8]
+	ldr.w	r1, [r9]
+	subs	r0, r1, r0
+.Ltmp227:
+	.loc	1 1114 1                @ ../crypto/hrss/hrss.c:1114:1
+	itttt	eq
+	subeq.w	r4, r7, #24
+	moveq	sp, r4
+	popeq.w	{r8, r9, r11}
+	popeq	{r4, r5, r6, r7, pc}
+	bl	__stack_chk_fail
+.Ltmp228:
+	.p2align	2
+@ %bb.3:
+	.loc	1 0 1 is_stmt 0         @ ../crypto/hrss/hrss.c:0:1
+.LCPI2_0:
+.Ltmp229:
+	.long	__stack_chk_guard(GOT_PREL)-((.LPC2_0+4)-.Ltmp229)
+.Lfunc_end2:
+	.size	poly_mul_vec, .Lfunc_end2-poly_mul_vec
+	.cfi_endproc
+	.fnend
+                                        @ -- End function
+	.section	.text.poly_mul_vec_aux,"ax",%progbits
+	.p2align	1               @ -- Begin function poly_mul_vec_aux
+	.type	poly_mul_vec_aux,%function
+	.code	16                      @ @poly_mul_vec_aux
+	.thumb_func
+poly_mul_vec_aux:
+.Lfunc_begin3:
+	.loc	1 897 0 is_stmt 1       @ ../crypto/hrss/hrss.c:897:0
+	.fnstart
+	.cfi_startproc
+@ %bb.0:
+	.save	{r4, r5, r6, r7, lr}
+	push	{r4, r5, r6, r7, lr}
+	.cfi_def_cfa_offset 20
+	.cfi_offset lr, -4
+	.cfi_offset r7, -8
+	.cfi_offset r6, -12
+	.cfi_offset r5, -16
+	.cfi_offset r4, -20
+	.setfp	r7, sp, #12
+	add	r7, sp, #12
+	.cfi_def_cfa r7, 8
+	.save	{r8, r9, r10, r11}
+	push.w	{r8, r9, r10, r11}
+	.cfi_offset r11, -24
+	.cfi_offset r10, -28
+	.cfi_offset r9, -32
+	.cfi_offset r8, -36
+	.pad	#4
+	sub	sp, #4
+	.vsave	{d8, d9, d10, d11, d12, d13, d14, d15}
+	vpush	{d8, d9, d10, d11, d12, d13, d14, d15}
+	.cfi_offset d15, -48
+	.cfi_offset d14, -56
+	.cfi_offset d13, -64
+	.cfi_offset d12, -72
+	.cfi_offset d11, -80
+	.cfi_offset d10, -88
+	.cfi_offset d9, -96
+	.cfi_offset d8, -104
+	.pad	#856
+	sub.w	sp, sp, #856
+	mov	r4, sp
+	bfc	r4, #0, #4
+	mov	sp, r4
+	mov	r9, r1
+	ldr	r1, [r7, #8]
+	mov	r8, r3
+	mov	r10, r2
+	mov	lr, r0
+.Ltmp230:
+	.loc	1 898 7 prologue_end    @ ../crypto/hrss/hrss.c:898:7
+	cmp	r1, #3
+	beq.w	.LBB3_3
+@ %bb.1:
+	cmp	r1, #2
+	bne.w	.LBB3_4
+@ %bb.2:
+	.loc	1 902 16                @ ../crypto/hrss/hrss.c:902:16
+	vld1.16	{d20, d21}, [r10:128]!
+	.loc	1 903 16                @ ../crypto/hrss/hrss.c:903:16
+	add	r0, sp, #816
+.Ltmp231:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	vmov.i32	q8, #0x0
+.Ltmp232:
+	.loc	1 952 5                 @ ../crypto/hrss/hrss.c:952:5
+	movs	r1, #30
+	.loc	1 903 16                @ ../crypto/hrss/hrss.c:903:16
+	vld1.64	{d22, d23}, [r10:128]
+.Ltmp233:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	vmov.i32	q9, #0x0
+.Ltmp234:
+	.loc	1 903 16                @ ../crypto/hrss/hrss.c:903:16
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp235:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	add	r0, sp, #768
+	vext.16	q13, q11, q8, #7
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #640
+	vst1.64	{d26, d27}, [r0:128]    @ 16-byte Spill
+.Ltmp236:
+	.loc	1 921 5                 @ ../crypto/hrss/hrss.c:921:5
+	add.w	r0, r8, #2
+.Ltmp237:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+.Ltmp238:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #784
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	vmul.i16	q8, q8, q13
+.Ltmp239:
+	.loc	1 910 46 is_stmt 1      @ ../crypto/hrss/hrss.c:910:46
+	add.w	r0, r8, #16
+.Ltmp240:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q4, q10, q11, #7
+.Ltmp241:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp242:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	add	r0, sp, #736
+.Ltmp243:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q12, q11
+.Ltmp244:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+.Ltmp245:
+	.loc	1 922 5                 @ ../crypto/hrss/hrss.c:922:5
+	add.w	r0, r8, #18
+.Ltmp246:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d22[], d23[]}, [r0:16]
+.Ltmp247:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	add	r0, sp, #832
+.Ltmp248:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q11, q4
+.Ltmp249:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+.Ltmp250:
+	.loc	1 926 5                 @ ../crypto/hrss/hrss.c:926:5
+	add.w	r0, r8, #4
+.Ltmp251:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d22[], d23[]}, [r0:16]
+.Ltmp252:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #752
+.Ltmp253:
+	.loc	1 167 10 is_stmt 1      @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q13, q4, q13, #7
+.Ltmp254:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	vmla.i16	q8, q11, q13
+.Ltmp255:
+	.loc	1 927 5                 @ ../crypto/hrss/hrss.c:927:5
+	add	r0, sp, #672
+.Ltmp256:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q11, q9, q10, #7
+.Ltmp257:
+	.loc	1 927 5                 @ ../crypto/hrss/hrss.c:927:5
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #20
+.Ltmp258:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q1, q11, q4, #7
+.Ltmp259:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d6[], d7[]}, [r0:16]
+.Ltmp260:
+	.loc	1 931 5                 @ ../crypto/hrss/hrss.c:931:5
+	add.w	r0, r8, #6
+.Ltmp261:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q3, q1
+.Ltmp262:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp263:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #704
+.Ltmp264:
+	.loc	1 167 10 is_stmt 1      @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q0, q1, q13, #7
+.Ltmp265:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmul.i16	q13, q3, q13
+.Ltmp266:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	vmla.i16	q8, q10, q0
+.Ltmp267:
+	.loc	1 932 5 is_stmt 1       @ ../crypto/hrss/hrss.c:932:5
+	add	r0, sp, #624
+.Ltmp268:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q10, q9, q11, #7
+	vorr	q12, q10, q10
+.Ltmp269:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q15, q10, q1, #7
+.Ltmp270:
+	.loc	1 932 5                 @ ../crypto/hrss/hrss.c:932:5
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #22
+.Ltmp271:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d14[], d15[]}, [r0:16]
+.Ltmp272:
+	.loc	1 936 5                 @ ../crypto/hrss/hrss.c:936:5
+	add.w	r0, r8, #8
+.Ltmp273:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q7, q15
+.Ltmp274:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp275:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #656
+.Ltmp276:
+	.loc	1 167 10 is_stmt 1      @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q11, q15, q0, #7
+.Ltmp277:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q2, q9, q12, #7
+.Ltmp278:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q10, q11
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp279:
+	.loc	1 937 5                 @ ../crypto/hrss/hrss.c:937:5
+	add	r0, sp, #576
+.Ltmp280:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q10, q2, q15, #7
+.Ltmp281:
+	.loc	1 937 5                 @ ../crypto/hrss/hrss.c:937:5
+	vst1.64	{d4, d5}, [r0:128]      @ 16-byte Spill
+	add.w	r0, r8, #24
+.Ltmp282:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp283:
+	.loc	1 941 5                 @ ../crypto/hrss/hrss.c:941:5
+	add	r0, sp, #688
+.Ltmp284:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q12, q10
+.Ltmp285:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q5, q10, q11, #7
+.Ltmp286:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q6, q9, q2, #7
+.Ltmp287:
+	.loc	1 941 5                 @ ../crypto/hrss/hrss.c:941:5
+	vst1.64	{d10, d11}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #10
+.Ltmp288:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d28[], d29[]}, [r0:16]
+.Ltmp289:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #608
+	vmla.i16	q8, q14, q5
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+.Ltmp290:
+	.loc	1 942 5 is_stmt 1       @ ../crypto/hrss/hrss.c:942:5
+	add.w	r0, r8, #26
+.Ltmp291:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q9, q6, q10, #7
+.Ltmp292:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d28[], d29[]}, [r0:16]
+.Ltmp293:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #592
+	vmla.i16	q8, q14, q9
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #720
+.Ltmp294:
+	.loc	1 167 10 is_stmt 1      @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q14, q9, q5, #7
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+.Ltmp295:
+	.loc	1 946 5                 @ ../crypto/hrss/hrss.c:946:5
+	add.w	r0, r8, #12
+.Ltmp296:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d4[], d5[]}, [r0:16]
+.Ltmp297:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #800
+	vmla.i16	q8, q2, q14
+	vst1.64	{d4, d5}, [r0:128]      @ 16-byte Spill
+.Ltmp298:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #832
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #640
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+.Ltmp299:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #816
+.Ltmp300:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q14, q2
+.Ltmp301:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q7, q0
+.Ltmp302:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q12, q11
+.Ltmp303:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #736
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp304:
+	.loc	1 159 59 is_stmt 1      @ ../crypto/hrss/hrss.c:159:59
+	mov	r0, r8
+.Ltmp305:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmul.i16	q0, q14, q11
+.Ltmp306:
+	.loc	1 159 59                @ ../crypto/hrss/hrss.c:159:59
+	vld1.16	{d28[], d29[]}, [r0:16], r1
+.Ltmp307:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #736
+	vst1.64	{d28, d29}, [r1:128]    @ 16-byte Spill
+	add	r1, sp, #768
+	vld1.64	{d22, d23}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #784
+	vmla.i16	q0, q14, q11
+	vld1.64	{d10, d11}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #672
+	vld1.64	{d22, d23}, [r1:128]    @ 16-byte Reload
+.Ltmp308:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #832
+.Ltmp309:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q5, q4
+.Ltmp310:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d28, d29}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #752
+	vld1.64	{d8, d9}, [r1:128]      @ 16-byte Reload
+	add	r1, sp, #624
+	vld1.64	{d4, d5}, [r1:128]      @ 16-byte Reload
+	add	r1, sp, #704
+	vmla.i16	q0, q14, q11
+.Ltmp311:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q4, q1
+	vld1.64	{d2, d3}, [r1:128]      @ 16-byte Reload
+	add	r1, sp, #576
+.Ltmp312:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q3, q2
+.Ltmp313:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q1, q15
+	vld1.64	{d30, d31}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #656
+.Ltmp314:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q7, q15
+	vld1.64	{d14, d15}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #608
+	vld1.64	{d6, d7}, [r1:128]      @ 16-byte Reload
+	add	r1, sp, #592
+	vld1.64	{d28, d29}, [r1:128]    @ 16-byte Reload
+.Ltmp315:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #800
+.Ltmp316:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q7, q10
+.Ltmp317:
+	.loc	1 169 10 is_stmt 1      @ ../crypto/hrss/hrss.c:169:10
+	vmov.i32	q10, #0x0
+.Ltmp318:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q12, q6
+.Ltmp319:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q10, q10, q6, #7
+.Ltmp320:
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q12, q10, q9, #7
+.Ltmp321:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q3, q9
+.Ltmp322:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r1:128]    @ 16-byte Reload
+.Ltmp323:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #816
+.Ltmp324:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q14, q10
+.Ltmp325:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q9, q12
+.Ltmp326:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmul.i16	q9, q5, q11
+	vld1.64	{d22, d23}, [r1:128]    @ 16-byte Reload
+	add	r1, sp, #736
+	vld1.64	{d10, d11}, [r1:128]    @ 16-byte Reload
+.Ltmp327:
+	.loc	1 947 5 is_stmt 1       @ ../crypto/hrss/hrss.c:947:5
+	add.w	r1, r8, #28
+.Ltmp328:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q5, q11
+.Ltmp329:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d22[], d23[]}, [r1:16]
+.Ltmp330:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #688
+.Ltmp331:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q11, q12
+.Ltmp332:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q4, q2
+.Ltmp333:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q1, q15
+.Ltmp334:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d30, d31}, [r1:128]    @ 16-byte Reload
+.Ltmp335:
+	.loc	1 951 5 is_stmt 1       @ ../crypto/hrss/hrss.c:951:5
+	add.w	r1, r8, #14
+.Ltmp336:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q14, q15
+.Ltmp337:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vmov.i32	q14, #0x0
+.Ltmp338:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q7, q6
+.Ltmp339:
+	.loc	1 167 10                @ ../crypto/hrss/hrss.c:167:10
+	vmov.i32	q1, #0x0
+.Ltmp340:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q3, q10
+.Ltmp341:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q10, q14, q10, #7
+.Ltmp342:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d28[], d29[]}, [r1:16]
+	add	r1, sp, #720
+.Ltmp343:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q11, q10
+	vld1.64	{d6, d7}, [r1:128]      @ 16-byte Reload
+.Ltmp344:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r1, sp, #800
+.Ltmp345:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q11, q3
+.Ltmp346:
+	.loc	1 167 10 is_stmt 1      @ ../crypto/hrss/hrss.c:167:10
+	vext.16	q15, q12, q3, #7
+	.loc	1 168 10                @ ../crypto/hrss/hrss.c:168:10
+	vext.16	q11, q10, q12, #7
+.Ltmp347:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q14, q15
+.Ltmp348:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d24, d25}, [r1:128]    @ 16-byte Reload
+.Ltmp349:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q14, q11
+.Ltmp350:
+	.loc	1 956 5 is_stmt 1       @ ../crypto/hrss/hrss.c:956:5
+	mov	r1, lr
+.Ltmp351:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q12, q10
+.Ltmp352:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp353:
+	.loc	1 956 5 is_stmt 1       @ ../crypto/hrss/hrss.c:956:5
+	movs	r0, #48
+.Ltmp354:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q12, q15
+.Ltmp355:
+	.loc	1 169 10                @ ../crypto/hrss/hrss.c:169:10
+	vext.16	q10, q1, q10, #7
+.Ltmp356:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q12, q11
+.Ltmp357:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q12, q10
+.Ltmp358:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q9, q14, q10
+.Ltmp359:
+	.loc	1 956 5 is_stmt 1       @ ../crypto/hrss/hrss.c:956:5
+	vst1.16	{d18, d19}, [r1:128], r0
+	add.w	r0, lr, #32
+	vst1.64	{d26, d27}, [r1:128]
+	vst1.64	{d16, d17}, [r0:128]
+	add.w	r0, lr, #16
+	vst1.64	{d0, d1}, [r0:128]
+	b.w	.LBB3_17
+.LBB3_3:
+	.loc	1 0 5 is_stmt 0         @ ../crypto/hrss/hrss.c:0:5
+	movs	r0, #32
+	.loc	1 965 16 is_stmt 1      @ ../crypto/hrss/hrss.c:965:16
+	add.w	r1, r10, #16
+	.loc	1 964 16                @ ../crypto/hrss/hrss.c:964:16
+	vld1.16	{d22, d23}, [r10:128], r0
+.Ltmp360:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vmov.i32	q8, #0x0
+.Ltmp361:
+	.loc	1 966 16                @ ../crypto/hrss/hrss.c:966:16
+	add	r0, sp, #752
+.Ltmp362:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vmov.i32	q10, #0x0
+.Ltmp363:
+	.loc	1 965 16                @ ../crypto/hrss/hrss.c:965:16
+	vld1.64	{d18, d19}, [r1:128]
+.Ltmp364:
+	.loc	1 159 59                @ ../crypto/hrss/hrss.c:159:59
+	mov	r1, r8
+	vorr	q14, q9, q9
+.Ltmp365:
+	.loc	1 966 16                @ ../crypto/hrss/hrss.c:966:16
+	vld1.64	{d6, d7}, [r10:128]
+.Ltmp366:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q1, q3, q8, #7
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	vext.16	q15, q9, q3, #7
+.Ltmp367:
+	.loc	1 966 16                @ ../crypto/hrss/hrss.c:966:16
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #768
+.Ltmp368:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q8, q15, q1, #7
+	vst1.64	{d30, d31}, [r0:128]    @ 16-byte Spill
+.Ltmp369:
+	.loc	1 1000 5                @ ../crypto/hrss/hrss.c:1000:5
+	add	r0, sp, #624
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #36
+.Ltmp370:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp371:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #832
+	vmul.i16	q2, q12, q8
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+.Ltmp372:
+	.loc	1 994 5 is_stmt 1       @ ../crypto/hrss/hrss.c:994:5
+	add.w	r0, r8, #34
+.Ltmp373:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp374:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #256
+	vmla.i16	q2, q12, q1
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #64
+.Ltmp375:
+	.loc	1 176 10 is_stmt 1      @ ../crypto/hrss/hrss.c:176:10
+	vext.16	q13, q11, q9, #7
+	vst1.64	{d2, d3}, [r0:128]      @ 16-byte Spill
+.Ltmp376:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	add	r0, sp, #704
+	vext.16	q6, q13, q15, #7
+	vorr	q15, q13, q13
+	vst1.64	{d28, d29}, [r0:128]    @ 16-byte Spill
+.Ltmp377:
+	.loc	1 1006 5                @ ../crypto/hrss/hrss.c:1006:5
+	add.w	r0, r8, #38
+.Ltmp378:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q12, q10, q11, #7
+.Ltmp379:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d18[], d19[]}, [r0:16]
+.Ltmp380:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #320
+.Ltmp381:
+	.loc	1 174 10 is_stmt 1      @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q8, q6, q8, #7
+.Ltmp382:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	vmla.i16	q2, q9, q8
+.Ltmp383:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	add	r0, sp, #160
+	vext.16	q9, q12, q13, #7
+	vorr	q7, q8, q8
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+.Ltmp384:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	add	r0, sp, #672
+	vorr	q4, q9, q9
+	vext.16	q13, q9, q6, #7
+	vst1.64	{d30, d31}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #816
+	vst1.64	{d26, d27}, [r0:128]    @ 16-byte Spill
+.Ltmp385:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	add	r0, sp, #416
+	vst1.64	{d12, d13}, [r0:128]    @ 16-byte Spill
+.Ltmp386:
+	.loc	1 1012 5                @ ../crypto/hrss/hrss.c:1012:5
+	add	r0, sp, #48
+	vst1.64	{d14, d15}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #40
+.Ltmp387:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q0, q13, q8, #7
+.Ltmp388:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d22[], d23[]}, [r0:16]
+	add	r0, sp, #352
+	vorr	q8, q0, q0
+.Ltmp389:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q2, q11, q0
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #544
+.Ltmp390:
+	.loc	1 177 10 is_stmt 1      @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q11, q10, q12, #7
+.Ltmp391:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp392:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	add	r0, sp, #736
+	vext.16	q12, q11, q9, #7
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+.Ltmp393:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	add	r0, sp, #640
+	vorr	q5, q12, q12
+	vext.16	q0, q12, q13, #7
+	vst1.64	{d8, d9}, [r0:128]      @ 16-byte Spill
+.Ltmp394:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	add	r0, sp, #272
+.Ltmp395:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q9, q10, q11, #7
+.Ltmp396:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vst1.64	{d0, d1}, [r0:128]      @ 16-byte Spill
+.Ltmp397:
+	.loc	1 1018 5                @ ../crypto/hrss/hrss.c:1018:5
+	add.w	r0, r8, #42
+.Ltmp398:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q13, q0, q8, #7
+.Ltmp399:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+	add	r0, sp, #384
+.Ltmp400:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q2, q8, q13
+.Ltmp401:
+	.loc	1 176 10 is_stmt 1      @ ../crypto/hrss/hrss.c:176:10
+	vext.16	q11, q9, q12, #7
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp402:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #496
+.Ltmp403:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	vext.16	q12, q11, q0, #7
+.Ltmp404:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d26, d27}, [r0:128]    @ 16-byte Spill
+.Ltmp405:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	add	r0, sp, #720
+	vorr	q0, q12, q12
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+.Ltmp406:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	add	r0, sp, #656
+.Ltmp407:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	vext.16	q12, q12, q13, #7
+.Ltmp408:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	vst1.64	{d10, d11}, [r0:128]    @ 16-byte Spill
+.Ltmp409:
+	.loc	1 1024 5                @ ../crypto/hrss/hrss.c:1024:5
+	add	r0, sp, #464
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add.w	r0, r8, #44
+.Ltmp410:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+	add	r0, sp, #400
+.Ltmp411:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q2, q8, q12
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp412:
+	.loc	1 176 10 is_stmt 1      @ ../crypto/hrss/hrss.c:176:10
+	add	r0, sp, #784
+.Ltmp413:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q8, q10, q9, #7
+.Ltmp414:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	vext.16	q9, q8, q11, #7
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #800
+	vorr	q8, q11, q11
+	vst1.64	{d18, d19}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #592
+.Ltmp415:
+	.loc	1 175 10                @ ../crypto/hrss/hrss.c:175:10
+	vext.16	q10, q9, q0, #7
+	vorr	q9, q0, q0
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #304
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp416:
+	.loc	1 1030 5                @ ../crypto/hrss/hrss.c:1030:5
+	movs	r0, #46
+.Ltmp417:
+	.loc	1 159 59                @ ../crypto/hrss/hrss.c:159:59
+	vld1.16	{d22[], d23[]}, [r1:16], r0
+.Ltmp418:
+	.loc	1 174 10                @ ../crypto/hrss/hrss.c:174:10
+	add	r0, sp, #512
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #368
+	vext.16	q11, q10, q12, #7
+.Ltmp419:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r1:16]
+.Ltmp420:
+	.loc	1 1035 5                @ ../crypto/hrss/hrss.c:1035:5
+	mov	r1, lr
+.Ltmp421:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q2, q10, q11
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #176
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #144
+	vst1.64	{d4, d5}, [r0:128]      @ 16-byte Spill
+.Ltmp422:
+	.loc	1 980 5                 @ ../crypto/hrss/hrss.c:980:5
+	add.w	r0, r8, #32
+.Ltmp423:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp424:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #480
+	vmul.i16	q13, q10, q14
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp425:
+	.loc	1 979 5 is_stmt 1       @ ../crypto/hrss/hrss.c:979:5
+	add.w	r0, r8, #16
+	vorr	q14, q3, q3
+.Ltmp426:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp427:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #688
+	vmla.i16	q13, q10, q3
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp428:
+	.loc	1 992 5 is_stmt 1       @ ../crypto/hrss/hrss.c:992:5
+	add.w	r0, r8, #2
+.Ltmp429:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp430:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #128
+	vmla.i16	q13, q10, q1
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp431:
+	.loc	1 993 5 is_stmt 1       @ ../crypto/hrss/hrss.c:993:5
+	add.w	r0, r8, #18
+.Ltmp432:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+	add	r0, sp, #288
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp433:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #768
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #256
+	vmla.i16	q13, q10, q11
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp434:
+	.loc	1 998 5 is_stmt 1       @ ../crypto/hrss/hrss.c:998:5
+	add.w	r0, r8, #4
+.Ltmp435:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp436:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #432
+.Ltmp437:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q11, q15
+.Ltmp438:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #624
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp439:
+	.loc	1 999 5 is_stmt 1       @ ../crypto/hrss/hrss.c:999:5
+	add.w	r0, r8, #20
+.Ltmp440:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q12, q10
+.Ltmp441:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp442:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #832
+	vorr	q15, q10, q10
+.Ltmp443:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q10, q6
+.Ltmp444:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp445:
+	.loc	1 1004 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1004:5
+	add.w	r0, r8, #6
+	vorr	q6, q14, q14
+.Ltmp446:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q10, q4
+.Ltmp447:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp448:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #608
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp449:
+	.loc	1 1005 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1005:5
+	add.w	r0, r8, #22
+.Ltmp450:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q10, q7
+.Ltmp451:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+.Ltmp452:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #240
+	vorr	q7, q9, q9
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #816
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #320
+	vmla.i16	q13, q10, q12
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+.Ltmp453:
+	.loc	1 1010 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1010:5
+	add.w	r0, r8, #8
+.Ltmp454:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+.Ltmp455:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #96
+.Ltmp456:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q2, q5
+.Ltmp457:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #544
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp458:
+	.loc	1 1011 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1011:5
+	add.w	r0, r8, #24
+.Ltmp459:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q12, q10
+.Ltmp460:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d20[], d21[]}, [r0:16]
+	add	r0, sp, #224
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #272
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #352
+.Ltmp461:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q10, q5
+	vld1.64	{d0, d1}, [r0:128]      @ 16-byte Reload
+.Ltmp462:
+	.loc	1 1016 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1016:5
+	add.w	r0, r8, #10
+.Ltmp463:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q0, q8
+.Ltmp464:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+.Ltmp465:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #576
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #496
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp466:
+	.loc	1 1017 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1017:5
+	add.w	r0, r8, #26
+.Ltmp467:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q8, q10
+.Ltmp468:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+	add	r0, sp, #208
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp469:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #16
+	vmla.i16	q13, q8, q9
+	vst1.64	{d14, d15}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #800
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #384
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+.Ltmp470:
+	.loc	1 1022 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1022:5
+	add.w	r0, r8, #12
+.Ltmp471:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q4, q9
+.Ltmp472:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+.Ltmp473:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #528
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #464
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp474:
+	.loc	1 1023 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1023:5
+	add.w	r0, r8, #28
+.Ltmp475:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q8, q10
+.Ltmp476:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+	add	r0, sp, #192
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #304
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+.Ltmp477:
+	.loc	1 177 10 is_stmt 1      @ ../crypto/hrss/hrss.c:177:10
+	add	r0, sp, #784
+.Ltmp478:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q8, q3
+.Ltmp479:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp480:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	add	r0, sp, #336
+.Ltmp481:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vmov.i32	q8, #0x0
+	vext.16	q8, q8, q10, #7
+.Ltmp482:
+	.loc	1 176 10                @ ../crypto/hrss/hrss.c:176:10
+	vext.16	q10, q8, q9, #7
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #448
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #400
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp483:
+	.loc	1 1028 5                @ ../crypto/hrss/hrss.c:1028:5
+	add.w	r0, r8, #14
+.Ltmp484:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q9, q10
+.Ltmp485:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d16[], d17[]}, [r0:16]
+	add	r0, sp, #80
+	vst1.64	{d16, d17}, [r0:128]    @ 16-byte Spill
+.Ltmp486:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #368
+.Ltmp487:
+	.loc	1 175 10 is_stmt 1      @ ../crypto/hrss/hrss.c:175:10
+	vext.16	q10, q10, q3, #7
+.Ltmp488:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #560
+	vmla.i16	q13, q8, q12
+	vst1.64	{d20, d21}, [r0:128]    @ 16-byte Spill
+.Ltmp489:
+	.loc	1 1029 5                @ ../crypto/hrss/hrss.c:1029:5
+	add.w	r0, r8, #30
+.Ltmp490:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.16	{d24[], d25[]}, [r0:16]
+	add	r0, sp, #112
+.Ltmp491:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q12, q10
+	vst1.64	{d24, d25}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #288
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp492:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #64
+	vld1.64	{d16, d17}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #480
+	vmul.i16	q8, q10, q8
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp493:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #768
+.Ltmp494:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q14, q6
+.Ltmp495:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp496:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #32
+.Ltmp497:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q11, q14
+	vorr	q11, q15, q15
+.Ltmp498:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vst1.64	{d22, d23}, [r0:128]    @ 16-byte Spill
+	add	r0, sp, #624
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp499:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #416
+.Ltmp500:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q15, q14
+.Ltmp501:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #832
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #240
+	vmla.i16	q8, q15, q14
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp502:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #48
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+.Ltmp503:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #816
+.Ltmp504:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q14, q15
+.Ltmp505:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #224
+	vmla.i16	q8, q2, q15
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+.Ltmp506:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #544
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #208
+	vmla.i16	q8, q15, q2
+.Ltmp507:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q0, q5
+	vld1.64	{d0, d1}, [r0:128]      @ 16-byte Reload
+.Ltmp508:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #496
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #192
+	vld1.64	{d2, d3}, [r0:128]      @ 16-byte Reload
+.Ltmp509:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #464
+.Ltmp510:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q0, q2
+.Ltmp511:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+.Ltmp512:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #368
+.Ltmp513:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q4, q7
+.Ltmp514:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q1, q2
+.Ltmp515:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q9, q3
+.Ltmp516:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp517:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #704
+.Ltmp518:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q8, q12, q9
+.Ltmp519:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #688
+	vld1.64	{d24, d25}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #512
+	vmul.i16	q12, q12, q9
+	vld1.64	{d10, d11}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #752
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+.Ltmp520:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #480
+.Ltmp521:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q5, q6
+.Ltmp522:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #128
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+.Ltmp523:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #768
+.Ltmp524:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q3
+.Ltmp525:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp526:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #672
+.Ltmp527:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q4, q9
+.Ltmp528:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #160
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+.Ltmp529:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #256
+.Ltmp530:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q10, q9
+.Ltmp531:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #432
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+.Ltmp532:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #416
+.Ltmp533:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q6
+.Ltmp534:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp535:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #640
+.Ltmp536:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q2, q9
+.Ltmp537:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #736
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp538:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #832
+.Ltmp539:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q11, q9
+.Ltmp540:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp541:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #608
+.Ltmp542:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q10
+.Ltmp543:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #816
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp544:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #656
+.Ltmp545:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q11
+.Ltmp546:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #720
+	vmla.i16	q12, q14, q9
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp547:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #320
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #96
+	vld1.64	{d14, d15}, [r0:128]    @ 16-byte Reload
+.Ltmp548:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #272
+.Ltmp549:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q14
+.Ltmp550:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp551:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #592
+.Ltmp552:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q7, q9
+.Ltmp553:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp554:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #784
+.Ltmp555:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q15, q9
+.Ltmp556:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #352
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp557:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #576
+.Ltmp558:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q11, q9
+.Ltmp559:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #16
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp560:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #800
+.Ltmp561:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q11
+.Ltmp562:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #336
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp563:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #384
+.Ltmp564:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q0, q9
+.Ltmp565:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp566:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #528
+.Ltmp567:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q11
+.Ltmp568:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #304
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+.Ltmp569:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #448
+.Ltmp570:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q15
+.Ltmp571:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp572:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #400
+.Ltmp573:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q1, q9
+.Ltmp574:
+	.loc	1 174 10 is_stmt 1      @ ../crypto/hrss/hrss.c:174:10
+	vmov.i32	q9, #0x0
+.Ltmp575:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q1, q9, q11, #7
+.Ltmp576:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #80
+	vmla.i16	q12, q9, q1
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp577:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #560
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp578:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #688
+.Ltmp579:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q11
+.Ltmp580:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #704
+	vmul.i16	q0, q11, q3
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp581:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #672
+	vorr	q3, q4, q4
+.Ltmp582:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q5, q11
+.Ltmp583:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp584:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #288
+	vorr	q5, q7, q7
+.Ltmp585:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q4, q11
+.Ltmp586:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp587:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #640
+.Ltmp588:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q11, q6
+.Ltmp589:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp590:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #32
+.Ltmp591:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q2, q11
+.Ltmp592:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #608
+	vld1.64	{d4, d5}, [r0:128]      @ 16-byte Reload
+.Ltmp593:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #656
+.Ltmp594:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q11, q10
+.Ltmp595:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp596:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #240
+.Ltmp597:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q2, q10
+.Ltmp598:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp599:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #592
+.Ltmp600:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q10, q14
+.Ltmp601:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #784
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+.Ltmp602:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #224
+.Ltmp603:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q7, q10
+.Ltmp604:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #576
+	vld1.64	{d14, d15}, [r0:128]    @ 16-byte Reload
+.Ltmp605:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #800
+.Ltmp606:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q10, q15
+.Ltmp607:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #336
+	vld1.64	{d22, d23}, [r0:128]    @ 16-byte Reload
+.Ltmp608:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #208
+.Ltmp609:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q7, q10
+.Ltmp610:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #528
+	vld1.64	{d8, d9}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #448
+	vmla.i16	q0, q10, q11
+	vld1.64	{d20, d21}, [r0:128]    @ 16-byte Reload
+.Ltmp611:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #192
+	vld1.64	{d28, d29}, [r0:128]    @ 16-byte Reload
+.Ltmp612:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #752
+.Ltmp613:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q4, q10
+.Ltmp614:
+	.loc	1 176 10 is_stmt 1      @ ../crypto/hrss/hrss.c:176:10
+	vext.16	q10, q1, q10, #7
+.Ltmp615:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q14, q1
+.Ltmp616:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmul.i16	q14, q3, q6
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+	add	r0, sp, #512
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+.Ltmp617:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #432
+.Ltmp618:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q9, q10
+.Ltmp619:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q3, q6
+.Ltmp620:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #736
+	vld1.64	{d12, d13}, [r0:128]    @ 16-byte Reload
+.Ltmp621:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #720
+.Ltmp622:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q3, q6
+.Ltmp623:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d6, d7}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #176
+	vmla.i16	q14, q2, q3
+.Ltmp624:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q5, q15
+	vld1.64	{d30, d31}, [r0:128]    @ 16-byte Reload
+.Ltmp625:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	add	r0, sp, #560
+.Ltmp626:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q13, q15, q10
+.Ltmp627:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q7, q11
+.Ltmp628:
+	.loc	1 177 10 is_stmt 1      @ ../crypto/hrss/hrss.c:177:10
+	vmov.i32	q11, #0x0
+.Ltmp629:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q4, q1
+.Ltmp630:
+	.loc	1 177 10                @ ../crypto/hrss/hrss.c:177:10
+	vext.16	q11, q11, q1, #7
+.Ltmp631:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vld1.64	{d2, d3}, [r0:128]      @ 16-byte Reload
+	add	r0, sp, #112
+	vmla.i16	q8, q15, q1
+.Ltmp632:
+	.loc	1 162 10 is_stmt 0      @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q14, q9, q11
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+.Ltmp633:
+	.loc	1 1035 5 is_stmt 1      @ ../crypto/hrss/hrss.c:1035:5
+	movs	r0, #80
+.Ltmp634:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q9, q10
+.Ltmp635:
+	.loc	1 1035 5                @ ../crypto/hrss/hrss.c:1035:5
+	vst1.16	{d28, d29}, [r1:128], r0
+	add	r0, sp, #144
+.Ltmp636:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q0, q9, q11
+.Ltmp637:
+	.loc	1 1035 5                @ ../crypto/hrss/hrss.c:1035:5
+	vld1.64	{d18, d19}, [r0:128]    @ 16-byte Reload
+	add.w	r0, lr, #64
+	vst1.64	{d18, d19}, [r1:128]
+.Ltmp638:
+	.loc	1 162 10                @ ../crypto/hrss/hrss.c:162:10
+	vmla.i16	q12, q15, q11
+.Ltmp639:
+	.loc	1 1035 5                @ ../crypto/hrss/hrss.c:1035:5
+	vst1.64	{d16, d17}, [r0:128]
+	add.w	r0, lr, #16
+	vst1.64	{d0, d1}, [r0:128]
+	add.w	r0, lr, #48
+	vst1.64	{d26, d27}, [r0:128]
+	add.w	r0, lr, #32
+	vst1.64	{d24, d25}, [r0:128]
+	b	.LBB3_17
+.LBB3_4:
+	.loc	1 1045 28               @ ../crypto/hrss/hrss.c:1045:28
+	lsr.w	r11, r1, #1
+	.loc	1 1048 26               @ ../crypto/hrss/hrss.c:1048:26
+	add.w	r0, r8, r11, lsl #4
+	.loc	1 1047 26               @ ../crypto/hrss/hrss.c:1047:26
+	str	r0, [sp, #816]          @ 4-byte Spill
+	add.w	r0, r10, r11, lsl #4
+	.loc	1 1046 29               @ ../crypto/hrss/hrss.c:1046:29
+	str	r0, [sp, #800]          @ 4-byte Spill
+	sub.w	r0, r1, r1, lsr #1
+	str	r0, [sp, #832]          @ 4-byte Spill
+	movs	r0, #0
+	.loc	1 1052 3                @ ../crypto/hrss/hrss.c:1052:3
+	cmp.w	r0, r1, lsr #1
+	beq	.LBB3_7
+@ %bb.5:
+	.loc	1 1053 22               @ ../crypto/hrss/hrss.c:1053:22
+	ldr	r0, [r7, #8]
+	lsl.w	r1, r11, #4
+	mov	r2, r11
+	mov	r3, r8
+	mov	r4, lr
+	mov	r5, r10
+	lsls	r0, r0, #4
+	sub.w	r12, r0, r11, lsl #4
+.LBB3_6:                                @ =>This Inner Loop Header: Depth=1
+	adds	r0, r5, r1
+	.loc	1 1053 33 is_stmt 0     @ ../crypto/hrss/hrss.c:1053:33
+	vld1.16	{d16, d17}, [r5:128]!
+	.loc	1 1054 33 is_stmt 1     @ ../crypto/hrss/hrss.c:1054:33
+	adds	r6, r3, r1
+	.loc	1 1052 24               @ ../crypto/hrss/hrss.c:1052:24
+	subs	r2, #1
+	.loc	1 1053 22               @ ../crypto/hrss/hrss.c:1053:22
+	vld1.64	{d18, d19}, [r0:128]
+	.loc	1 1054 23               @ ../crypto/hrss/hrss.c:1054:23
+	add.w	r0, r4, r12
+.Ltmp640:
+	.loc	1 155 58                @ ../crypto/hrss/hrss.c:155:58
+	vadd.i16	q8, q8, q9
+.Ltmp641:
+	.loc	1 1053 12               @ ../crypto/hrss/hrss.c:1053:12
+	vst1.16	{d16, d17}, [r4:128]!
+	.loc	1 1054 44               @ ../crypto/hrss/hrss.c:1054:44
+	vld1.16	{d18, d19}, [r3:128]!
+	.loc	1 1054 33 is_stmt 0     @ ../crypto/hrss/hrss.c:1054:33
+	vld1.64	{d16, d17}, [r6:128]
+.Ltmp642:
+	.loc	1 155 58 is_stmt 1      @ ../crypto/hrss/hrss.c:155:58
+	vadd.i16	q8, q9, q8
+.Ltmp643:
+	.loc	1 1054 23               @ ../crypto/hrss/hrss.c:1054:23
+	vst1.64	{d16, d17}, [r0:128]
+	.loc	1 1052 3                @ ../crypto/hrss/hrss.c:1052:3
+	bne	.LBB3_6
+.LBB3_7:
+	.loc	1 1056 7                @ ../crypto/hrss/hrss.c:1056:7
+	ldr	r0, [sp, #832]          @ 4-byte Reload
+	ldr	r4, [r7, #8]
+	cmp	r0, r11
+	beq	.LBB3_9
+@ %bb.8:
+	.loc	1 1057 20               @ ../crypto/hrss/hrss.c:1057:20
+	ldr	r0, [sp, #800]          @ 4-byte Reload
+	add.w	r0, r0, r11, lsl #4
+	vld1.64	{d16, d17}, [r0:128]
+	.loc	1 1057 5 is_stmt 0      @ ../crypto/hrss/hrss.c:1057:5
+	add.w	r0, lr, r11, lsl #4
+	.loc	1 1057 18               @ ../crypto/hrss/hrss.c:1057:18
+	vst1.64	{d16, d17}, [r0:128]
+	.loc	1 1058 31 is_stmt 1     @ ../crypto/hrss/hrss.c:1058:31
+	ldr	r0, [sp, #816]          @ 4-byte Reload
+	add.w	r0, r0, r11, lsl #4
+	vld1.64	{d16, d17}, [r0:128]
+	.loc	1 1058 5 is_stmt 0      @ ../crypto/hrss/hrss.c:1058:5
+	add.w	r0, lr, r4, lsl #4
+	.loc	1 1058 29               @ ../crypto/hrss/hrss.c:1058:29
+	vst1.64	{d16, d17}, [r0:128]
+.LBB3_9:
+	.loc	1 0 29                  @ ../crypto/hrss/hrss.c:0:29
+	ldr	r4, [sp, #832]          @ 4-byte Reload
+	mov	r6, r11
+	.loc	1 1063 3 is_stmt 1      @ ../crypto/hrss/hrss.c:1063:3
+	mov	r0, r9
+	mov	r2, lr
+	str	r4, [sp]
+	mov	r5, lr
+	.loc	1 1061 33               @ ../crypto/hrss/hrss.c:1061:33
+	add.w	r11, r9, r4, lsl #5
+	.loc	1 1063 50               @ ../crypto/hrss/hrss.c:1063:50
+	add.w	r3, lr, r4, lsl #4
+	.loc	1 1063 3 is_stmt 0      @ ../crypto/hrss/hrss.c:1063:3
+	mov	r1, r11
+	bl	poly_mul_vec_aux
+	.loc	1 1065 33 is_stmt 1     @ ../crypto/hrss/hrss.c:1065:33
+	ldr	r0, [r7, #8]
+	.loc	1 1065 3 is_stmt 0      @ ../crypto/hrss/hrss.c:1065:3
+	mov	r1, r11
+	str	r4, [sp]
+	.loc	1 1065 33               @ ../crypto/hrss/hrss.c:1065:33
+	bic	r4, r0, #1
+	.loc	1 1065 3                @ ../crypto/hrss/hrss.c:1065:3
+	ldr	r2, [sp, #800]          @ 4-byte Reload
+	ldr	r3, [sp, #816]          @ 4-byte Reload
+	.loc	1 1065 21               @ ../crypto/hrss/hrss.c:1065:21
+	add.w	r0, r5, r4, lsl #4
+	.loc	1 1065 3                @ ../crypto/hrss/hrss.c:1065:3
+	bl	poly_mul_vec_aux
+	.loc	1 1067 3 is_stmt 1      @ ../crypto/hrss/hrss.c:1067:3
+	mov	r1, r11
+	mov	r0, r5
+	mov	r2, r10
+	mov	r3, r8
+	str	r6, [sp]
+	mov	r11, r6
+	bl	poly_mul_vec_aux
+	.loc	1 1070 3                @ ../crypto/hrss/hrss.c:1070:3
+	cbz	r4, .LBB3_12
+@ %bb.10:
+	.loc	1 1071 26               @ ../crypto/hrss/hrss.c:1071:26
+	lsl.w	r0, r11, #5
+	mov	r1, r4
+	mov	r2, r5
+	mov	r3, r9
+.LBB3_11:                               @ =>This Inner Loop Header: Depth=1
+	.loc	1 1071 54 is_stmt 0     @ ../crypto/hrss/hrss.c:1071:54
+	adds	r6, r2, r0
+	.loc	1 1071 46               @ ../crypto/hrss/hrss.c:1071:46
+	vld1.16	{d16, d17}, [r2:128]!
+	.loc	1 1070 24 is_stmt 1     @ ../crypto/hrss/hrss.c:1070:24
+	subs	r1, #1
+	.loc	1 1071 26               @ ../crypto/hrss/hrss.c:1071:26
+	vld1.64	{d18, d19}, [r3:128]
+.Ltmp644:
+	.loc	1 155 58                @ ../crypto/hrss/hrss.c:155:58
+	vsub.i16	q8, q9, q8
+.Ltmp645:
+	.loc	1 1071 54               @ ../crypto/hrss/hrss.c:1071:54
+	vld1.64	{d20, d21}, [r6:128]
+.Ltmp646:
+	.loc	1 157 58                @ ../crypto/hrss/hrss.c:157:58
+	vsub.i16	q8, q8, q10
+.Ltmp647:
+	.loc	1 1071 16               @ ../crypto/hrss/hrss.c:1071:16
+	vst1.16	{d16, d17}, [r3:128]!
+	.loc	1 1070 3                @ ../crypto/hrss/hrss.c:1070:3
+	bne	.LBB3_11
+.LBB3_12:
+	.loc	1 0 3 is_stmt 0         @ ../crypto/hrss/hrss.c:0:3
+	ldr	r1, [sp, #832]          @ 4-byte Reload
+	.loc	1 1073 7 is_stmt 1      @ ../crypto/hrss/hrss.c:1073:7
+	cmp	r1, r11
+	lsl.w	r0, r1, #1
+	beq	.LBB3_14
+@ %bb.13:
+	.loc	1 1074 58               @ ../crypto/hrss/hrss.c:1074:58
+	add.w	r1, r5, r11, lsl #6
+	vld1.64	{d16, d17}, [r1:128]
+	.loc	1 1074 36 is_stmt 0     @ ../crypto/hrss/hrss.c:1074:36
+	add.w	r1, r9, r4, lsl #4
+	vld1.64	{d18, d19}, [r1:128]
+.Ltmp648:
+	.loc	1 157 58 is_stmt 1      @ ../crypto/hrss/hrss.c:157:58
+	vsub.i16	q8, q9, q8
+.Ltmp649:
+	.loc	1 1074 26               @ ../crypto/hrss/hrss.c:1074:26
+	vst1.64	{d16, d17}, [r1:128]
+	movs	r1, #16
+	.loc	1 1076 43               @ ../crypto/hrss/hrss.c:1076:43
+	orr.w	r1, r1, r11, lsl #6
+	add	r1, r5
+	vld1.64	{d16, d17}, [r1:128]
+	.loc	1 1076 17 is_stmt 0     @ ../crypto/hrss/hrss.c:1076:17
+	ldr	r1, [r7, #8]
+	orr	r1, r1, #1
+	add.w	r1, r9, r1, lsl #4
+	vld1.64	{d18, d19}, [r1:128]
+.Ltmp650:
+	.loc	1 157 58 is_stmt 1      @ ../crypto/hrss/hrss.c:157:58
+	vsub.i16	q8, q9, q8
+.Ltmp651:
+	.loc	1 1075 30               @ ../crypto/hrss/hrss.c:1075:30
+	vst1.64	{d16, d17}, [r1:128]
+.LBB3_14:
+	.loc	1 1080 3                @ ../crypto/hrss/hrss.c:1080:3
+	cbz	r0, .LBB3_17
+@ %bb.15:
+	.loc	1 1081 44               @ ../crypto/hrss/hrss.c:1081:44
+	add.w	r1, r5, r11, lsl #4
+.LBB3_16:                               @ =>This Inner Loop Header: Depth=1
+	.loc	1 1081 32 is_stmt 0     @ ../crypto/hrss/hrss.c:1081:32
+	vld1.64	{d16, d17}, [r1:128]
+	.loc	1 1080 24 is_stmt 1     @ ../crypto/hrss/hrss.c:1080:24
+	subs	r0, #1
+	.loc	1 1081 50               @ ../crypto/hrss/hrss.c:1081:50
+	vld1.16	{d18, d19}, [r9:128]!
+.Ltmp652:
+	.loc	1 155 58                @ ../crypto/hrss/hrss.c:155:58
+	vadd.i16	q8, q9, q8
+.Ltmp653:
+	.loc	1 1081 22               @ ../crypto/hrss/hrss.c:1081:22
+	vst1.16	{d16, d17}, [r1:128]!
+	.loc	1 1080 3                @ ../crypto/hrss/hrss.c:1080:3
+	bne	.LBB3_16
+.LBB3_17:
+	.loc	1 1083 1                @ ../crypto/hrss/hrss.c:1083:1
+	sub.w	r4, r7, #96
+	mov	sp, r4
+	vpop	{d8, d9, d10, d11, d12, d13, d14, d15}
+	add	sp, #4
+	pop.w	{r8, r9, r10, r11}
+	pop	{r4, r5, r6, r7, pc}
+.Ltmp654:
+.Lfunc_end3:
+	.size	poly_mul_vec_aux, .Lfunc_end3-poly_mul_vec_aux
+	.cfi_endproc
+	.fnend
+
+	.section	".note.GNU-stack","",%progbits
+	.section	.debug_line,"",%progbits
+
+#endif
diff --git a/crypto/hrss/asm/poly_rq_mul.S b/crypto/hrss/asm/poly_rq_mul.S
new file mode 100644
index 0000000..0ad0fb5
--- /dev/null
+++ b/crypto/hrss/asm/poly_rq_mul.S
@@ -0,0 +1,8457 @@
+// Copyright (c) 2017, the HRSS authors.
+//
+// Permission to use, copy, modify, and/or distribute this software for any
+// purpose with or without fee is hereby granted, provided that the above
+// copyright notice and this permission notice appear in all copies.
+//
+// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(__linux__)
+
+// This is the polynomial multiplication function from [HRSS], provided by kind
+// permission of the authors.
+//
+// HRSS: https://eprint.iacr.org/2017/667.pdf
+
+# This file was generated by poly_rq_mul.py
+.text
+.align 32
+mask_low9words:
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0xffff
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+.word 0x0
+const3:
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+.word 3
+const9:
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+.word 9
+const0:
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+const729:
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+.word 729
+const3_inv:
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+.word 43691
+const5_inv:
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+.word 52429
+shuf48_16:
+.byte 10
+.byte 11
+.byte 12
+.byte 13
+.byte 14
+.byte 15
+.byte 0
+.byte 1
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 8
+.byte 9
+.byte 10
+.byte 11
+.byte 12
+.byte 13
+.byte 14
+.byte 15
+.byte 0
+.byte 1
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 8
+.byte 9
+shufmin1_mask3:
+.byte 2
+.byte 3
+.byte 4
+.byte 5
+.byte 6
+.byte 7
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+.byte 255
+mask32_to_16:
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+.word 0xffff
+.word 0x0
+mask5_3_5_3:
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+mask3_5_3_5:
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+mask3_5_4_3_1:
+.word 65535
+.word 65535
+.word 65535
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 0
+mask_keephigh:
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 0
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+.word 65535
+mask_mod8192:
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.word 8191
+.text
+.global poly_Rq_mul
+.hidden poly_Rq_mul
+.att_syntax prefix
+poly_Rq_mul:
+.cfi_startproc
+push %rbp
+.cfi_adjust_cfa_offset 8
+.cfi_offset rbp, -16
+movq %rsp, %rbp
+.cfi_def_cfa_register rbp
+push %r12
+.cfi_offset r12, -24
+mov %rsp, %r8
+andq $-32, %rsp
+subq $6144, %rsp
+mov %rsp, %rax
+subq $6144, %rsp
+mov %rsp, %r11
+subq $12288, %rsp
+mov %rsp, %r12
+subq $512, %rsp
+vmovdqa const3(%rip), %ymm3
+vmovdqu 0(%rsi), %ymm0
+vmovdqu 88(%rsi), %ymm1
+vmovdqu 176(%rsi), %ymm2
+vmovdqu 264(%rsi), %ymm12
+vmovdqu 1056(%rsi), %ymm4
+vmovdqu 1144(%rsi), %ymm5
+vmovdqu 1232(%rsi), %ymm6
+vmovdqu 1320(%rsi), %ymm7
+vmovdqu 352(%rsi), %ymm8
+vmovdqu 440(%rsi), %ymm9
+vmovdqu 528(%rsi), %ymm10
+vmovdqu 616(%rsi), %ymm11
+vmovdqa %ymm0, 0(%rax)
+vmovdqa %ymm1, 96(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 192(%rax)
+vmovdqa %ymm2, 288(%rax)
+vmovdqa %ymm12, 384(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 480(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 576(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 672(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 768(%rax)
+vmovdqa %ymm4, 5184(%rax)
+vmovdqa %ymm5, 5280(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5376(%rax)
+vmovdqa %ymm6, 5472(%rax)
+vmovdqa %ymm7, 5568(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5664(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5760(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5856(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5952(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 704(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 792(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 880(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 968(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 864(%rax)
+vmovdqa %ymm9, 960(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1056(%rax)
+vmovdqa %ymm10, 1152(%rax)
+vmovdqa %ymm11, 1248(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1344(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1440(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1536(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1632(%rax)
+vmovdqa %ymm12, 1728(%rax)
+vmovdqa %ymm13, 1824(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1920(%rax)
+vmovdqa %ymm14, 2016(%rax)
+vmovdqa %ymm15, 2112(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2208(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2304(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2400(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2496(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2592(%rax)
+vmovdqa %ymm9, 2688(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2784(%rax)
+vmovdqa %ymm10, 2880(%rax)
+vmovdqa %ymm11, 2976(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3072(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3168(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3264(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3360(%rax)
+vmovdqa %ymm12, 3456(%rax)
+vmovdqa %ymm13, 3552(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3648(%rax)
+vmovdqa %ymm14, 3744(%rax)
+vmovdqa %ymm15, 3840(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3936(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4032(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4128(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4224(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4320(%rax)
+vmovdqa %ymm13, 4416(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4512(%rax)
+vmovdqa %ymm14, 4608(%rax)
+vmovdqa %ymm15, 4704(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4800(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4896(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4992(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5088(%rax)
+vmovdqu 32(%rsi), %ymm0
+vmovdqu 120(%rsi), %ymm1
+vmovdqu 208(%rsi), %ymm2
+vmovdqu 296(%rsi), %ymm12
+vmovdqu 1088(%rsi), %ymm4
+vmovdqu 1176(%rsi), %ymm5
+vmovdqu 1264(%rsi), %ymm6
+vmovdqu 1352(%rsi), %ymm7
+vmovdqu 384(%rsi), %ymm8
+vmovdqu 472(%rsi), %ymm9
+vmovdqu 560(%rsi), %ymm10
+vmovdqu 648(%rsi), %ymm11
+vmovdqa %ymm0, 32(%rax)
+vmovdqa %ymm1, 128(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 224(%rax)
+vmovdqa %ymm2, 320(%rax)
+vmovdqa %ymm12, 416(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 512(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 608(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 704(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 800(%rax)
+vmovdqa %ymm4, 5216(%rax)
+vmovdqa %ymm5, 5312(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5408(%rax)
+vmovdqa %ymm6, 5504(%rax)
+vmovdqa %ymm7, 5600(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5696(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5792(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5888(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5984(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 736(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 824(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 912(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1000(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 896(%rax)
+vmovdqa %ymm9, 992(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1088(%rax)
+vmovdqa %ymm10, 1184(%rax)
+vmovdqa %ymm11, 1280(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1376(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1472(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1568(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1664(%rax)
+vmovdqa %ymm12, 1760(%rax)
+vmovdqa %ymm13, 1856(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1952(%rax)
+vmovdqa %ymm14, 2048(%rax)
+vmovdqa %ymm15, 2144(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2240(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2336(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2432(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2528(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2624(%rax)
+vmovdqa %ymm9, 2720(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2816(%rax)
+vmovdqa %ymm10, 2912(%rax)
+vmovdqa %ymm11, 3008(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3104(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3200(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3296(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3392(%rax)
+vmovdqa %ymm12, 3488(%rax)
+vmovdqa %ymm13, 3584(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3680(%rax)
+vmovdqa %ymm14, 3776(%rax)
+vmovdqa %ymm15, 3872(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3968(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4064(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4160(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4256(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4352(%rax)
+vmovdqa %ymm13, 4448(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4544(%rax)
+vmovdqa %ymm14, 4640(%rax)
+vmovdqa %ymm15, 4736(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4832(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4928(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5024(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5120(%rax)
+vmovdqu 64(%rsi), %ymm0
+vmovdqu 152(%rsi), %ymm1
+vmovdqu 240(%rsi), %ymm2
+vmovdqu 328(%rsi), %ymm12
+vmovdqu 1120(%rsi), %ymm4
+vmovdqu 1208(%rsi), %ymm5
+vmovdqu 1296(%rsi), %ymm6
+vmovdqu 1384(%rsi), %ymm7
+vpand mask_low9words(%rip), %ymm7, %ymm7
+vmovdqu 416(%rsi), %ymm8
+vmovdqu 504(%rsi), %ymm9
+vmovdqu 592(%rsi), %ymm10
+vmovdqu 680(%rsi), %ymm11
+vmovdqa %ymm0, 64(%rax)
+vmovdqa %ymm1, 160(%rax)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 256(%rax)
+vmovdqa %ymm2, 352(%rax)
+vmovdqa %ymm12, 448(%rax)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 544(%rax)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 640(%rax)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 736(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 832(%rax)
+vmovdqa %ymm4, 5248(%rax)
+vmovdqa %ymm5, 5344(%rax)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5440(%rax)
+vmovdqa %ymm6, 5536(%rax)
+vmovdqa %ymm7, 5632(%rax)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5728(%rax)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5824(%rax)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5920(%rax)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 6016(%rax)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 768(%rsi), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 856(%rsi), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 944(%rsi), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1032(%rsi), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 928(%rax)
+vmovdqa %ymm9, 1024(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1120(%rax)
+vmovdqa %ymm10, 1216(%rax)
+vmovdqa %ymm11, 1312(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1408(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1504(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1600(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1696(%rax)
+vmovdqa %ymm12, 1792(%rax)
+vmovdqa %ymm13, 1888(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1984(%rax)
+vmovdqa %ymm14, 2080(%rax)
+vmovdqa %ymm15, 2176(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2272(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2368(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2464(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2560(%rax)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2656(%rax)
+vmovdqa %ymm9, 2752(%rax)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2848(%rax)
+vmovdqa %ymm10, 2944(%rax)
+vmovdqa %ymm11, 3040(%rax)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3136(%rax)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3232(%rax)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3328(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3424(%rax)
+vmovdqa %ymm12, 3520(%rax)
+vmovdqa %ymm13, 3616(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3712(%rax)
+vmovdqa %ymm14, 3808(%rax)
+vmovdqa %ymm15, 3904(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4000(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4096(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4192(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4288(%rax)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4384(%rax)
+vmovdqa %ymm13, 4480(%rax)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4576(%rax)
+vmovdqa %ymm14, 4672(%rax)
+vmovdqa %ymm15, 4768(%rax)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4864(%rax)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4960(%rax)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5056(%rax)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5152(%rax)
+vmovdqu 0(%rdx), %ymm0
+vmovdqu 88(%rdx), %ymm1
+vmovdqu 176(%rdx), %ymm2
+vmovdqu 264(%rdx), %ymm12
+vmovdqu 1056(%rdx), %ymm4
+vmovdqu 1144(%rdx), %ymm5
+vmovdqu 1232(%rdx), %ymm6
+vmovdqu 1320(%rdx), %ymm7
+vmovdqu 352(%rdx), %ymm8
+vmovdqu 440(%rdx), %ymm9
+vmovdqu 528(%rdx), %ymm10
+vmovdqu 616(%rdx), %ymm11
+vmovdqa %ymm0, 0(%r11)
+vmovdqa %ymm1, 96(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 192(%r11)
+vmovdqa %ymm2, 288(%r11)
+vmovdqa %ymm12, 384(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 480(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 576(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 672(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 768(%r11)
+vmovdqa %ymm4, 5184(%r11)
+vmovdqa %ymm5, 5280(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5376(%r11)
+vmovdqa %ymm6, 5472(%r11)
+vmovdqa %ymm7, 5568(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5664(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5760(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5856(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5952(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 704(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 792(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 880(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 968(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 864(%r11)
+vmovdqa %ymm9, 960(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1056(%r11)
+vmovdqa %ymm10, 1152(%r11)
+vmovdqa %ymm11, 1248(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1344(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1440(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1536(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1632(%r11)
+vmovdqa %ymm12, 1728(%r11)
+vmovdqa %ymm13, 1824(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1920(%r11)
+vmovdqa %ymm14, 2016(%r11)
+vmovdqa %ymm15, 2112(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2208(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2304(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2400(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2496(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2592(%r11)
+vmovdqa %ymm9, 2688(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2784(%r11)
+vmovdqa %ymm10, 2880(%r11)
+vmovdqa %ymm11, 2976(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3072(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3168(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3264(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3360(%r11)
+vmovdqa %ymm12, 3456(%r11)
+vmovdqa %ymm13, 3552(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3648(%r11)
+vmovdqa %ymm14, 3744(%r11)
+vmovdqa %ymm15, 3840(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3936(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4032(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4128(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4224(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4320(%r11)
+vmovdqa %ymm13, 4416(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4512(%r11)
+vmovdqa %ymm14, 4608(%r11)
+vmovdqa %ymm15, 4704(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4800(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4896(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4992(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5088(%r11)
+vmovdqu 32(%rdx), %ymm0
+vmovdqu 120(%rdx), %ymm1
+vmovdqu 208(%rdx), %ymm2
+vmovdqu 296(%rdx), %ymm12
+vmovdqu 1088(%rdx), %ymm4
+vmovdqu 1176(%rdx), %ymm5
+vmovdqu 1264(%rdx), %ymm6
+vmovdqu 1352(%rdx), %ymm7
+vmovdqu 384(%rdx), %ymm8
+vmovdqu 472(%rdx), %ymm9
+vmovdqu 560(%rdx), %ymm10
+vmovdqu 648(%rdx), %ymm11
+vmovdqa %ymm0, 32(%r11)
+vmovdqa %ymm1, 128(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 224(%r11)
+vmovdqa %ymm2, 320(%r11)
+vmovdqa %ymm12, 416(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 512(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 608(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 704(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 800(%r11)
+vmovdqa %ymm4, 5216(%r11)
+vmovdqa %ymm5, 5312(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5408(%r11)
+vmovdqa %ymm6, 5504(%r11)
+vmovdqa %ymm7, 5600(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5696(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5792(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5888(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 5984(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 736(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 824(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 912(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1000(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 896(%r11)
+vmovdqa %ymm9, 992(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1088(%r11)
+vmovdqa %ymm10, 1184(%r11)
+vmovdqa %ymm11, 1280(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1376(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1472(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1568(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1664(%r11)
+vmovdqa %ymm12, 1760(%r11)
+vmovdqa %ymm13, 1856(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1952(%r11)
+vmovdqa %ymm14, 2048(%r11)
+vmovdqa %ymm15, 2144(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2240(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2336(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2432(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2528(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2624(%r11)
+vmovdqa %ymm9, 2720(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2816(%r11)
+vmovdqa %ymm10, 2912(%r11)
+vmovdqa %ymm11, 3008(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3104(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3200(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3296(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3392(%r11)
+vmovdqa %ymm12, 3488(%r11)
+vmovdqa %ymm13, 3584(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3680(%r11)
+vmovdqa %ymm14, 3776(%r11)
+vmovdqa %ymm15, 3872(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 3968(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4064(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4160(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4256(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4352(%r11)
+vmovdqa %ymm13, 4448(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4544(%r11)
+vmovdqa %ymm14, 4640(%r11)
+vmovdqa %ymm15, 4736(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4832(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4928(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5024(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5120(%r11)
+vmovdqu 64(%rdx), %ymm0
+vmovdqu 152(%rdx), %ymm1
+vmovdqu 240(%rdx), %ymm2
+vmovdqu 328(%rdx), %ymm12
+vmovdqu 1120(%rdx), %ymm4
+vmovdqu 1208(%rdx), %ymm5
+vmovdqu 1296(%rdx), %ymm6
+vmovdqu 1384(%rdx), %ymm7
+vpand mask_low9words(%rip), %ymm7, %ymm7
+vmovdqu 416(%rdx), %ymm8
+vmovdqu 504(%rdx), %ymm9
+vmovdqu 592(%rdx), %ymm10
+vmovdqu 680(%rdx), %ymm11
+vmovdqa %ymm0, 64(%r11)
+vmovdqa %ymm1, 160(%r11)
+vpaddw %ymm0, %ymm1, %ymm14
+vmovdqa %ymm14, 256(%r11)
+vmovdqa %ymm2, 352(%r11)
+vmovdqa %ymm12, 448(%r11)
+vpaddw %ymm2, %ymm12, %ymm14
+vmovdqa %ymm14, 544(%r11)
+vpaddw %ymm0, %ymm2, %ymm14
+vmovdqa %ymm14, 640(%r11)
+vpaddw %ymm1, %ymm12, %ymm15
+vmovdqa %ymm15, 736(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 832(%r11)
+vmovdqa %ymm4, 5248(%r11)
+vmovdqa %ymm5, 5344(%r11)
+vpaddw %ymm4, %ymm5, %ymm14
+vmovdqa %ymm14, 5440(%r11)
+vmovdqa %ymm6, 5536(%r11)
+vmovdqa %ymm7, 5632(%r11)
+vpaddw %ymm6, %ymm7, %ymm14
+vmovdqa %ymm14, 5728(%r11)
+vpaddw %ymm4, %ymm6, %ymm14
+vmovdqa %ymm14, 5824(%r11)
+vpaddw %ymm5, %ymm7, %ymm15
+vmovdqa %ymm15, 5920(%r11)
+vpaddw %ymm14, %ymm15, %ymm14
+vmovdqa %ymm14, 6016(%r11)
+vmovdqa %ymm0, 0(%rsp)
+vmovdqa %ymm1, 32(%rsp)
+vmovdqa %ymm2, 64(%rsp)
+vmovdqa %ymm12, 96(%rsp)
+vmovdqa %ymm8, 128(%rsp)
+vmovdqa %ymm9, 160(%rsp)
+vmovdqa %ymm10, 192(%rsp)
+vmovdqa %ymm11, 224(%rsp)
+vmovdqu 768(%rdx), %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm1
+vpaddw 128(%rsp), %ymm4, %ymm2
+vpaddw %ymm2, %ymm1, %ymm8
+vpsubw %ymm2, %ymm1, %ymm12
+vmovdqa %ymm0, 256(%rsp)
+vmovdqu 856(%rdx), %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm1
+vpaddw 160(%rsp), %ymm5, %ymm2
+vpaddw %ymm2, %ymm1, %ymm9
+vpsubw %ymm2, %ymm1, %ymm13
+vmovdqa %ymm0, 288(%rsp)
+vmovdqu 944(%rdx), %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm1
+vpaddw 192(%rsp), %ymm6, %ymm2
+vpaddw %ymm2, %ymm1, %ymm10
+vpsubw %ymm2, %ymm1, %ymm14
+vmovdqa %ymm0, 320(%rsp)
+vmovdqu 1032(%rdx), %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm1
+vpaddw 224(%rsp), %ymm7, %ymm2
+vpaddw %ymm2, %ymm1, %ymm11
+vpsubw %ymm2, %ymm1, %ymm15
+vmovdqa %ymm0, 352(%rsp)
+vmovdqa %ymm8, 928(%r11)
+vmovdqa %ymm9, 1024(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 1120(%r11)
+vmovdqa %ymm10, 1216(%r11)
+vmovdqa %ymm11, 1312(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 1408(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 1504(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 1600(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 1696(%r11)
+vmovdqa %ymm12, 1792(%r11)
+vmovdqa %ymm13, 1888(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 1984(%r11)
+vmovdqa %ymm14, 2080(%r11)
+vmovdqa %ymm15, 2176(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 2272(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 2368(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 2464(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 2560(%r11)
+vmovdqa 256(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm4, %ymm1
+vpaddw 128(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm8
+vpsubw %ymm1, %ymm0, %ymm12
+vmovdqa 288(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm5, %ymm1
+vpaddw 160(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm9
+vpsubw %ymm1, %ymm0, %ymm13
+vmovdqa 320(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm6, %ymm1
+vpaddw 192(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm10
+vpsubw %ymm1, %ymm0, %ymm14
+vmovdqa 352(%rsp), %ymm0
+vpsllw $2, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm0
+vpsllw $2, %ymm7, %ymm1
+vpaddw 224(%rsp), %ymm1, %ymm1
+vpsllw $1, %ymm1, %ymm1
+vpaddw %ymm1, %ymm0, %ymm11
+vpsubw %ymm1, %ymm0, %ymm15
+vmovdqa %ymm8, 2656(%r11)
+vmovdqa %ymm9, 2752(%r11)
+vpaddw %ymm8, %ymm9, %ymm0
+vmovdqa %ymm0, 2848(%r11)
+vmovdqa %ymm10, 2944(%r11)
+vmovdqa %ymm11, 3040(%r11)
+vpaddw %ymm10, %ymm11, %ymm0
+vmovdqa %ymm0, 3136(%r11)
+vpaddw %ymm8, %ymm10, %ymm0
+vmovdqa %ymm0, 3232(%r11)
+vpaddw %ymm9, %ymm11, %ymm1
+vmovdqa %ymm1, 3328(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 3424(%r11)
+vmovdqa %ymm12, 3520(%r11)
+vmovdqa %ymm13, 3616(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 3712(%r11)
+vmovdqa %ymm14, 3808(%r11)
+vmovdqa %ymm15, 3904(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4000(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4096(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 4192(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 4288(%r11)
+vpmullw %ymm3, %ymm4, %ymm0
+vpaddw 256(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 128(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 0(%rsp), %ymm0, %ymm12
+vpmullw %ymm3, %ymm5, %ymm0
+vpaddw 288(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 160(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 32(%rsp), %ymm0, %ymm13
+vpmullw %ymm3, %ymm6, %ymm0
+vpaddw 320(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 192(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 64(%rsp), %ymm0, %ymm14
+vpmullw %ymm3, %ymm7, %ymm0
+vpaddw 352(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 224(%rsp), %ymm0, %ymm0
+vpmullw %ymm3, %ymm0, %ymm0
+vpaddw 96(%rsp), %ymm0, %ymm15
+vmovdqa %ymm12, 4384(%r11)
+vmovdqa %ymm13, 4480(%r11)
+vpaddw %ymm12, %ymm13, %ymm0
+vmovdqa %ymm0, 4576(%r11)
+vmovdqa %ymm14, 4672(%r11)
+vmovdqa %ymm15, 4768(%r11)
+vpaddw %ymm14, %ymm15, %ymm0
+vmovdqa %ymm0, 4864(%r11)
+vpaddw %ymm12, %ymm14, %ymm0
+vmovdqa %ymm0, 4960(%r11)
+vpaddw %ymm13, %ymm15, %ymm1
+vmovdqa %ymm1, 5056(%r11)
+vpaddw %ymm0, %ymm1, %ymm0
+vmovdqa %ymm0, 5152(%r11)
+subq $9408, %rsp
+mov $4, %ecx
+karatsuba_loop_4eced63f144beffcb0247f9c6f67d165:
+mov %rsp, %r9
+mov %rsp, %r10
+subq $32, %rsp
+vmovdqa 0(%rax), %ymm0
+vmovdqa 192(%rax), %ymm1
+vmovdqa 384(%rax), %ymm2
+vmovdqa 576(%rax), %ymm3
+vpunpcklwd 96(%rax), %ymm0, %ymm4
+vpunpckhwd 96(%rax), %ymm0, %ymm5
+vpunpcklwd 288(%rax), %ymm1, %ymm6
+vpunpckhwd 288(%rax), %ymm1, %ymm7
+vpunpcklwd 480(%rax), %ymm2, %ymm8
+vpunpckhwd 480(%rax), %ymm2, %ymm9
+vpunpcklwd 672(%rax), %ymm3, %ymm10
+vpunpckhwd 672(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 768(%rax), %ymm0
+vmovdqa 960(%rax), %ymm1
+vmovdqa 1152(%rax), %ymm2
+vmovdqa 1344(%rax), %ymm3
+vpunpcklwd 864(%rax), %ymm0, %ymm12
+vpunpckhwd 864(%rax), %ymm0, %ymm13
+vpunpcklwd 1056(%rax), %ymm1, %ymm14
+vpunpckhwd 1056(%rax), %ymm1, %ymm15
+vpunpcklwd 1248(%rax), %ymm2, %ymm0
+vpunpckhwd 1248(%rax), %ymm2, %ymm1
+vpunpcklwd 1440(%rax), %ymm3, %ymm2
+vpunpckhwd 1440(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 0(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 32(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 64(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 96(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 128(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 160(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 192(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 256(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 288(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 320(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 352(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 384(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 416(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 448(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 224(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 480(%r9)
+vmovdqa 32(%rax), %ymm0
+vmovdqa 224(%rax), %ymm1
+vmovdqa 416(%rax), %ymm2
+vmovdqa 608(%rax), %ymm3
+vpunpcklwd 128(%rax), %ymm0, %ymm4
+vpunpckhwd 128(%rax), %ymm0, %ymm5
+vpunpcklwd 320(%rax), %ymm1, %ymm6
+vpunpckhwd 320(%rax), %ymm1, %ymm7
+vpunpcklwd 512(%rax), %ymm2, %ymm8
+vpunpckhwd 512(%rax), %ymm2, %ymm9
+vpunpcklwd 704(%rax), %ymm3, %ymm10
+vpunpckhwd 704(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 800(%rax), %ymm0
+vmovdqa 992(%rax), %ymm1
+vmovdqa 1184(%rax), %ymm2
+vmovdqa 1376(%rax), %ymm3
+vpunpcklwd 896(%rax), %ymm0, %ymm12
+vpunpckhwd 896(%rax), %ymm0, %ymm13
+vpunpcklwd 1088(%rax), %ymm1, %ymm14
+vpunpckhwd 1088(%rax), %ymm1, %ymm15
+vpunpcklwd 1280(%rax), %ymm2, %ymm0
+vpunpckhwd 1280(%rax), %ymm2, %ymm1
+vpunpcklwd 1472(%rax), %ymm3, %ymm2
+vpunpckhwd 1472(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 512(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 544(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 576(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 608(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 640(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 672(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 704(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 768(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 800(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 832(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 864(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 896(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 928(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 960(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 736(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 992(%r9)
+vmovdqa 64(%rax), %ymm0
+vmovdqa 256(%rax), %ymm1
+vmovdqa 448(%rax), %ymm2
+vmovdqa 640(%rax), %ymm3
+vpunpcklwd 160(%rax), %ymm0, %ymm4
+vpunpckhwd 160(%rax), %ymm0, %ymm5
+vpunpcklwd 352(%rax), %ymm1, %ymm6
+vpunpckhwd 352(%rax), %ymm1, %ymm7
+vpunpcklwd 544(%rax), %ymm2, %ymm8
+vpunpckhwd 544(%rax), %ymm2, %ymm9
+vpunpcklwd 736(%rax), %ymm3, %ymm10
+vpunpckhwd 736(%rax), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 832(%rax), %ymm0
+vmovdqa 1024(%rax), %ymm1
+vmovdqa 1216(%rax), %ymm2
+vmovdqa 1408(%rax), %ymm3
+vpunpcklwd 928(%rax), %ymm0, %ymm12
+vpunpckhwd 928(%rax), %ymm0, %ymm13
+vpunpcklwd 1120(%rax), %ymm1, %ymm14
+vpunpckhwd 1120(%rax), %ymm1, %ymm15
+vpunpcklwd 1312(%rax), %ymm2, %ymm0
+vpunpckhwd 1312(%rax), %ymm2, %ymm1
+vpunpcklwd 1504(%rax), %ymm3, %ymm2
+vpunpckhwd 1504(%rax), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1024(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1056(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1088(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 1120(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 1152(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1184(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1216(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1280(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1312(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1344(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 1376(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1248(%r9)
+addq $32, %rsp
+subq $32, %rsp
+vmovdqa 0(%r11), %ymm0
+vmovdqa 192(%r11), %ymm1
+vmovdqa 384(%r11), %ymm2
+vmovdqa 576(%r11), %ymm3
+vpunpcklwd 96(%r11), %ymm0, %ymm4
+vpunpckhwd 96(%r11), %ymm0, %ymm5
+vpunpcklwd 288(%r11), %ymm1, %ymm6
+vpunpckhwd 288(%r11), %ymm1, %ymm7
+vpunpcklwd 480(%r11), %ymm2, %ymm8
+vpunpckhwd 480(%r11), %ymm2, %ymm9
+vpunpcklwd 672(%r11), %ymm3, %ymm10
+vpunpckhwd 672(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 768(%r11), %ymm0
+vmovdqa 960(%r11), %ymm1
+vmovdqa 1152(%r11), %ymm2
+vmovdqa 1344(%r11), %ymm3
+vpunpcklwd 864(%r11), %ymm0, %ymm12
+vpunpckhwd 864(%r11), %ymm0, %ymm13
+vpunpcklwd 1056(%r11), %ymm1, %ymm14
+vpunpckhwd 1056(%r11), %ymm1, %ymm15
+vpunpcklwd 1248(%r11), %ymm2, %ymm0
+vpunpckhwd 1248(%r11), %ymm2, %ymm1
+vpunpcklwd 1440(%r11), %ymm3, %ymm2
+vpunpckhwd 1440(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1408(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1440(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1472(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 1504(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 1536(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1568(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1600(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1664(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1696(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1728(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 1760(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 1792(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 1824(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 1856(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1632(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 1888(%r9)
+vmovdqa 32(%r11), %ymm0
+vmovdqa 224(%r11), %ymm1
+vmovdqa 416(%r11), %ymm2
+vmovdqa 608(%r11), %ymm3
+vpunpcklwd 128(%r11), %ymm0, %ymm4
+vpunpckhwd 128(%r11), %ymm0, %ymm5
+vpunpcklwd 320(%r11), %ymm1, %ymm6
+vpunpckhwd 320(%r11), %ymm1, %ymm7
+vpunpcklwd 512(%r11), %ymm2, %ymm8
+vpunpckhwd 512(%r11), %ymm2, %ymm9
+vpunpcklwd 704(%r11), %ymm3, %ymm10
+vpunpckhwd 704(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 800(%r11), %ymm0
+vmovdqa 992(%r11), %ymm1
+vmovdqa 1184(%r11), %ymm2
+vmovdqa 1376(%r11), %ymm3
+vpunpcklwd 896(%r11), %ymm0, %ymm12
+vpunpckhwd 896(%r11), %ymm0, %ymm13
+vpunpcklwd 1088(%r11), %ymm1, %ymm14
+vpunpckhwd 1088(%r11), %ymm1, %ymm15
+vpunpcklwd 1280(%r11), %ymm2, %ymm0
+vpunpckhwd 1280(%r11), %ymm2, %ymm1
+vpunpcklwd 1472(%r11), %ymm3, %ymm2
+vpunpckhwd 1472(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 1920(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 1952(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 1984(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 2016(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 2048(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 2080(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 2112(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 2176(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 2208(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2240(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2272(%r9)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2304(%r9)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2336(%r9)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2368(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 2144(%r9)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2400(%r9)
+vmovdqa 64(%r11), %ymm0
+vmovdqa 256(%r11), %ymm1
+vmovdqa 448(%r11), %ymm2
+vmovdqa 640(%r11), %ymm3
+vpunpcklwd 160(%r11), %ymm0, %ymm4
+vpunpckhwd 160(%r11), %ymm0, %ymm5
+vpunpcklwd 352(%r11), %ymm1, %ymm6
+vpunpckhwd 352(%r11), %ymm1, %ymm7
+vpunpcklwd 544(%r11), %ymm2, %ymm8
+vpunpckhwd 544(%r11), %ymm2, %ymm9
+vpunpcklwd 736(%r11), %ymm3, %ymm10
+vpunpckhwd 736(%r11), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 832(%r11), %ymm0
+vmovdqa 1024(%r11), %ymm1
+vmovdqa 1216(%r11), %ymm2
+vmovdqa 1408(%r11), %ymm3
+vpunpcklwd 928(%r11), %ymm0, %ymm12
+vpunpckhwd 928(%r11), %ymm0, %ymm13
+vpunpcklwd 1120(%r11), %ymm1, %ymm14
+vpunpckhwd 1120(%r11), %ymm1, %ymm15
+vpunpcklwd 1312(%r11), %ymm2, %ymm0
+vpunpckhwd 1312(%r11), %ymm2, %ymm1
+vpunpcklwd 1504(%r11), %ymm3, %ymm2
+vpunpckhwd 1504(%r11), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 2432(%r9)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 2464(%r9)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 2496(%r9)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 2528(%r9)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 2560(%r9)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 2592(%r9)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 2624(%r9)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 2688(%r9)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 2720(%r9)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2752(%r9)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2784(%r9)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 2656(%r9)
+addq $32, %rsp
+innerloop_4eced63f144beffcb0247f9c6f67d165:
+vmovdqa 0(%r9), %ymm0
+vmovdqa 1408(%r9), %ymm6
+vmovdqa 32(%r9), %ymm1
+vmovdqa 1440(%r9), %ymm7
+vmovdqa 64(%r9), %ymm2
+vmovdqa 1472(%r9), %ymm8
+vmovdqa 96(%r9), %ymm3
+vmovdqa 1504(%r9), %ymm9
+vmovdqa 128(%r9), %ymm4
+vmovdqa 1536(%r9), %ymm10
+vmovdqa 160(%r9), %ymm5
+vmovdqa 1568(%r9), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 2816(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2848(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 2880(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2912(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 2944(%r10)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 2976(%r10)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3008(%r10)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3040(%r10)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3072(%r10)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3104(%r10)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 3136(%r10)
+vmovdqa 192(%r9), %ymm0
+vmovdqa 1600(%r9), %ymm6
+vmovdqa 224(%r9), %ymm1
+vmovdqa 1632(%r9), %ymm7
+vmovdqa 256(%r9), %ymm2
+vmovdqa 1664(%r9), %ymm8
+vmovdqa 288(%r9), %ymm3
+vmovdqa 1696(%r9), %ymm9
+vmovdqa 320(%r9), %ymm4
+vmovdqa 1728(%r9), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3200(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3232(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3264(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3296(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3328(%r10)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3360(%r10)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3392(%r10)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3424(%r10)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 3456(%r10)
+vpaddw 0(%r9), %ymm0, %ymm0
+vpaddw 1408(%r9), %ymm6, %ymm6
+vpaddw 32(%r9), %ymm1, %ymm1
+vpaddw 1440(%r9), %ymm7, %ymm7
+vpaddw 64(%r9), %ymm2, %ymm2
+vpaddw 1472(%r9), %ymm8, %ymm8
+vpaddw 96(%r9), %ymm3, %ymm3
+vpaddw 1504(%r9), %ymm9, %ymm9
+vpaddw 128(%r9), %ymm4, %ymm4
+vpaddw 1536(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 2976(%r10), %ymm12, %ymm12
+vpsubw 3360(%r10), %ymm12, %ymm12
+vmovdqa %ymm12, 3168(%r10)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 3008(%r10), %ymm0
+vpsubw 3200(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 3392(%r10), %ymm6, %ymm6
+vmovdqa %ymm6, 3200(%r10)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 2816(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3008(%r10)
+vmovdqa 3040(%r10), %ymm1
+vpsubw 3232(%r10), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 3424(%r10), %ymm7, %ymm7
+vmovdqa %ymm7, 3232(%r10)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 2848(%r10), %ymm1, %ymm1
+vmovdqa %ymm1, 3040(%r10)
+vmovdqa 3072(%r10), %ymm2
+vpsubw 3264(%r10), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 3456(%r10), %ymm8, %ymm8
+vmovdqa %ymm8, 3264(%r10)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 2880(%r10), %ymm2, %ymm2
+vmovdqa %ymm2, 3072(%r10)
+vmovdqa 3104(%r10), %ymm3
+vpsubw 3296(%r10), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 3296(%r10)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 2912(%r10), %ymm3, %ymm3
+vmovdqa %ymm3, 3104(%r10)
+vmovdqa 3136(%r10), %ymm4
+vpsubw 3328(%r10), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 2944(%r10), %ymm4, %ymm4
+vmovdqa %ymm4, 3136(%r10)
+vmovdqa 352(%r9), %ymm0
+vmovdqa 1760(%r9), %ymm6
+vmovdqa 384(%r9), %ymm1
+vmovdqa 1792(%r9), %ymm7
+vmovdqa 416(%r9), %ymm2
+vmovdqa 1824(%r9), %ymm8
+vmovdqa 448(%r9), %ymm3
+vmovdqa 1856(%r9), %ymm9
+vmovdqa 480(%r9), %ymm4
+vmovdqa 1888(%r9), %ymm10
+vmovdqa 512(%r9), %ymm5
+vmovdqa 1920(%r9), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3520(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3552(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3584(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3616(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3648(%r10)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3680(%r10)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3712(%r10)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3744(%r10)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3776(%r10)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3808(%r10)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 3840(%r10)
+vmovdqa 544(%r9), %ymm0
+vmovdqa 1952(%r9), %ymm6
+vmovdqa 576(%r9), %ymm1
+vmovdqa 1984(%r9), %ymm7
+vmovdqa 608(%r9), %ymm2
+vmovdqa 2016(%r9), %ymm8
+vmovdqa 640(%r9), %ymm3
+vmovdqa 2048(%r9), %ymm9
+vmovdqa 672(%r9), %ymm4
+vmovdqa 2080(%r9), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 3904(%r10)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 3936(%r10)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 3968(%r10)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4000(%r10)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 4032(%r10)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4064(%r10)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 4096(%r10)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 4128(%r10)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 4160(%r10)
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 3680(%r10), %ymm12, %ymm12
+vpsubw 4064(%r10), %ymm12, %ymm12
+vmovdqa %ymm12, 3872(%r10)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 3712(%r10), %ymm0
+vpsubw 3904(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 4096(%r10), %ymm6, %ymm6
+vmovdqa %ymm6, 3904(%r10)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 3520(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3712(%r10)
+vmovdqa 3744(%r10), %ymm1
+vpsubw 3936(%r10), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 4128(%r10), %ymm7, %ymm7
+vmovdqa %ymm7, 3936(%r10)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 3552(%r10), %ymm1, %ymm1
+vmovdqa %ymm1, 3744(%r10)
+vmovdqa 3776(%r10), %ymm2
+vpsubw 3968(%r10), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 4160(%r10), %ymm8, %ymm8
+vmovdqa %ymm8, 3968(%r10)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 3584(%r10), %ymm2, %ymm2
+vmovdqa %ymm2, 3776(%r10)
+vmovdqa 3808(%r10), %ymm3
+vpsubw 4000(%r10), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 4000(%r10)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 3616(%r10), %ymm3, %ymm3
+vmovdqa %ymm3, 3808(%r10)
+vmovdqa 3840(%r10), %ymm4
+vpsubw 4032(%r10), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 3648(%r10), %ymm4, %ymm4
+vmovdqa %ymm4, 3840(%r10)
+vmovdqa 0(%r9), %ymm0
+vmovdqa 1408(%r9), %ymm6
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vmovdqa 32(%r9), %ymm1
+vmovdqa 1440(%r9), %ymm7
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vmovdqa 64(%r9), %ymm2
+vmovdqa 1472(%r9), %ymm8
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vmovdqa 96(%r9), %ymm3
+vmovdqa 1504(%r9), %ymm9
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vmovdqa 128(%r9), %ymm4
+vmovdqa 1536(%r9), %ymm10
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vmovdqa 160(%r9), %ymm5
+vmovdqa 1568(%r9), %ymm11
+vpaddw 512(%r9), %ymm5, %ymm5
+vpaddw 1920(%r9), %ymm11, %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 5888(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5920(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 5952(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5984(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6016(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6048(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6080(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6112(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6144(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6176(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 6208(%rsp)
+vmovdqa 192(%r9), %ymm0
+vmovdqa 1600(%r9), %ymm6
+vpaddw 544(%r9), %ymm0, %ymm0
+vpaddw 1952(%r9), %ymm6, %ymm6
+vmovdqa 224(%r9), %ymm1
+vmovdqa 1632(%r9), %ymm7
+vpaddw 576(%r9), %ymm1, %ymm1
+vpaddw 1984(%r9), %ymm7, %ymm7
+vmovdqa 256(%r9), %ymm2
+vmovdqa 1664(%r9), %ymm8
+vpaddw 608(%r9), %ymm2, %ymm2
+vpaddw 2016(%r9), %ymm8, %ymm8
+vmovdqa 288(%r9), %ymm3
+vmovdqa 1696(%r9), %ymm9
+vpaddw 640(%r9), %ymm3, %ymm3
+vpaddw 2048(%r9), %ymm9, %ymm9
+vmovdqa 320(%r9), %ymm4
+vmovdqa 1728(%r9), %ymm10
+vpaddw 672(%r9), %ymm4, %ymm4
+vpaddw 2080(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 6272(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6304(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6336(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6368(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6400(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6432(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6464(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6496(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 6528(%rsp)
+vpaddw 0(%r9), %ymm0, %ymm0
+vpaddw 1408(%r9), %ymm6, %ymm6
+vpaddw 352(%r9), %ymm0, %ymm0
+vpaddw 1760(%r9), %ymm6, %ymm6
+vpaddw 32(%r9), %ymm1, %ymm1
+vpaddw 1440(%r9), %ymm7, %ymm7
+vpaddw 384(%r9), %ymm1, %ymm1
+vpaddw 1792(%r9), %ymm7, %ymm7
+vpaddw 64(%r9), %ymm2, %ymm2
+vpaddw 1472(%r9), %ymm8, %ymm8
+vpaddw 416(%r9), %ymm2, %ymm2
+vpaddw 1824(%r9), %ymm8, %ymm8
+vpaddw 96(%r9), %ymm3, %ymm3
+vpaddw 1504(%r9), %ymm9, %ymm9
+vpaddw 448(%r9), %ymm3, %ymm3
+vpaddw 1856(%r9), %ymm9, %ymm9
+vpaddw 128(%r9), %ymm4, %ymm4
+vpaddw 1536(%r9), %ymm10, %ymm10
+vpaddw 480(%r9), %ymm4, %ymm4
+vpaddw 1888(%r9), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 6048(%rsp), %ymm12, %ymm12
+vpsubw 6432(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 6240(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 6080(%rsp), %ymm0
+vpsubw 6272(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 6464(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 6272(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 6080(%rsp)
+vmovdqa 6112(%rsp), %ymm1
+vpsubw 6304(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 6496(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 6304(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 5920(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 6112(%rsp)
+vmovdqa 6144(%rsp), %ymm2
+vpsubw 6336(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 6528(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 6336(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 5952(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 6144(%rsp)
+vmovdqa 6176(%rsp), %ymm3
+vpsubw 6368(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 6368(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 5984(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 6176(%rsp)
+vmovdqa 6208(%rsp), %ymm4
+vpsubw 6400(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 6016(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 6208(%rsp)
+vmovdqa 6208(%rsp), %ymm0
+vpsubw 3136(%r10), %ymm0, %ymm0
+vpsubw 3840(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 3488(%r10)
+vmovdqa 3168(%r10), %ymm0
+vpsubw 3520(%r10), %ymm0, %ymm0
+vmovdqa 6240(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3872(%r10), %ymm1, %ymm1
+vpsubw 2816(%r10), %ymm0, %ymm0
+vpaddw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3168(%r10)
+vmovdqa %ymm1, 3520(%r10)
+vmovdqa 3200(%r10), %ymm0
+vpsubw 3552(%r10), %ymm0, %ymm0
+vmovdqa 6272(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3904(%r10), %ymm1, %ymm1
+vpsubw 2848(%r10), %ymm0, %ymm0
+vpaddw 5920(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3200(%r10)
+vmovdqa %ymm1, 3552(%r10)
+vmovdqa 3232(%r10), %ymm0
+vpsubw 3584(%r10), %ymm0, %ymm0
+vmovdqa 6304(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3936(%r10), %ymm1, %ymm1
+vpsubw 2880(%r10), %ymm0, %ymm0
+vpaddw 5952(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3232(%r10)
+vmovdqa %ymm1, 3584(%r10)
+vmovdqa 3264(%r10), %ymm0
+vpsubw 3616(%r10), %ymm0, %ymm0
+vmovdqa 6336(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3968(%r10), %ymm1, %ymm1
+vpsubw 2912(%r10), %ymm0, %ymm0
+vpaddw 5984(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3264(%r10)
+vmovdqa %ymm1, 3616(%r10)
+vmovdqa 3296(%r10), %ymm0
+vpsubw 3648(%r10), %ymm0, %ymm0
+vmovdqa 6368(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4000(%r10), %ymm1, %ymm1
+vpsubw 2944(%r10), %ymm0, %ymm0
+vpaddw 6016(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3296(%r10)
+vmovdqa %ymm1, 3648(%r10)
+vmovdqa 3328(%r10), %ymm0
+vpsubw 3680(%r10), %ymm0, %ymm0
+vmovdqa 6400(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4032(%r10), %ymm1, %ymm1
+vpsubw 2976(%r10), %ymm0, %ymm0
+vpaddw 6048(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3328(%r10)
+vmovdqa %ymm1, 3680(%r10)
+vmovdqa 3360(%r10), %ymm0
+vpsubw 3712(%r10), %ymm0, %ymm0
+vmovdqa 6432(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4064(%r10), %ymm1, %ymm1
+vpsubw 3008(%r10), %ymm0, %ymm0
+vpaddw 6080(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3360(%r10)
+vmovdqa %ymm1, 3712(%r10)
+vmovdqa 3392(%r10), %ymm0
+vpsubw 3744(%r10), %ymm0, %ymm0
+vmovdqa 6464(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4096(%r10), %ymm1, %ymm1
+vpsubw 3040(%r10), %ymm0, %ymm0
+vpaddw 6112(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3392(%r10)
+vmovdqa %ymm1, 3744(%r10)
+vmovdqa 3424(%r10), %ymm0
+vpsubw 3776(%r10), %ymm0, %ymm0
+vmovdqa 6496(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4128(%r10), %ymm1, %ymm1
+vpsubw 3072(%r10), %ymm0, %ymm0
+vpaddw 6144(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3424(%r10)
+vmovdqa %ymm1, 3776(%r10)
+vmovdqa 3456(%r10), %ymm0
+vpsubw 3808(%r10), %ymm0, %ymm0
+vmovdqa 6528(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 4160(%r10), %ymm1, %ymm1
+vpsubw 3104(%r10), %ymm0, %ymm0
+vpaddw 6176(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3456(%r10)
+vmovdqa %ymm1, 3808(%r10)
+neg %ecx
+jns done_4eced63f144beffcb0247f9c6f67d165
+add $704, %r9
+add $1408, %r10
+jmp innerloop_4eced63f144beffcb0247f9c6f67d165
+done_4eced63f144beffcb0247f9c6f67d165:
+sub $704, %r9
+sub $1408, %r10
+vmovdqa 0(%r9), %ymm0
+vpaddw 704(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6592(%rsp)
+vmovdqa 1408(%r9), %ymm0
+vpaddw 2112(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7296(%rsp)
+vmovdqa 32(%r9), %ymm0
+vpaddw 736(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6624(%rsp)
+vmovdqa 1440(%r9), %ymm0
+vpaddw 2144(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7328(%rsp)
+vmovdqa 64(%r9), %ymm0
+vpaddw 768(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6656(%rsp)
+vmovdqa 1472(%r9), %ymm0
+vpaddw 2176(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7360(%rsp)
+vmovdqa 96(%r9), %ymm0
+vpaddw 800(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6688(%rsp)
+vmovdqa 1504(%r9), %ymm0
+vpaddw 2208(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7392(%rsp)
+vmovdqa 128(%r9), %ymm0
+vpaddw 832(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6720(%rsp)
+vmovdqa 1536(%r9), %ymm0
+vpaddw 2240(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7424(%rsp)
+vmovdqa 160(%r9), %ymm0
+vpaddw 864(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6752(%rsp)
+vmovdqa 1568(%r9), %ymm0
+vpaddw 2272(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7456(%rsp)
+vmovdqa 192(%r9), %ymm0
+vpaddw 896(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6784(%rsp)
+vmovdqa 1600(%r9), %ymm0
+vpaddw 2304(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7488(%rsp)
+vmovdqa 224(%r9), %ymm0
+vpaddw 928(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6816(%rsp)
+vmovdqa 1632(%r9), %ymm0
+vpaddw 2336(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7520(%rsp)
+vmovdqa 256(%r9), %ymm0
+vpaddw 960(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6848(%rsp)
+vmovdqa 1664(%r9), %ymm0
+vpaddw 2368(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7552(%rsp)
+vmovdqa 288(%r9), %ymm0
+vpaddw 992(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6880(%rsp)
+vmovdqa 1696(%r9), %ymm0
+vpaddw 2400(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7584(%rsp)
+vmovdqa 320(%r9), %ymm0
+vpaddw 1024(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6912(%rsp)
+vmovdqa 1728(%r9), %ymm0
+vpaddw 2432(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7616(%rsp)
+vmovdqa 352(%r9), %ymm0
+vpaddw 1056(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6944(%rsp)
+vmovdqa 1760(%r9), %ymm0
+vpaddw 2464(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7648(%rsp)
+vmovdqa 384(%r9), %ymm0
+vpaddw 1088(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 6976(%rsp)
+vmovdqa 1792(%r9), %ymm0
+vpaddw 2496(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7680(%rsp)
+vmovdqa 416(%r9), %ymm0
+vpaddw 1120(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7008(%rsp)
+vmovdqa 1824(%r9), %ymm0
+vpaddw 2528(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7712(%rsp)
+vmovdqa 448(%r9), %ymm0
+vpaddw 1152(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7040(%rsp)
+vmovdqa 1856(%r9), %ymm0
+vpaddw 2560(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7744(%rsp)
+vmovdqa 480(%r9), %ymm0
+vpaddw 1184(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7072(%rsp)
+vmovdqa 1888(%r9), %ymm0
+vpaddw 2592(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7776(%rsp)
+vmovdqa 512(%r9), %ymm0
+vpaddw 1216(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7104(%rsp)
+vmovdqa 1920(%r9), %ymm0
+vpaddw 2624(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7808(%rsp)
+vmovdqa 544(%r9), %ymm0
+vpaddw 1248(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7136(%rsp)
+vmovdqa 1952(%r9), %ymm0
+vpaddw 2656(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7840(%rsp)
+vmovdqa 576(%r9), %ymm0
+vpaddw 1280(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7168(%rsp)
+vmovdqa 1984(%r9), %ymm0
+vpaddw 2688(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7872(%rsp)
+vmovdqa 608(%r9), %ymm0
+vpaddw 1312(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7200(%rsp)
+vmovdqa 2016(%r9), %ymm0
+vpaddw 2720(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7904(%rsp)
+vmovdqa 640(%r9), %ymm0
+vpaddw 1344(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7232(%rsp)
+vmovdqa 2048(%r9), %ymm0
+vpaddw 2752(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7936(%rsp)
+vmovdqa 672(%r9), %ymm0
+vpaddw 1376(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7264(%rsp)
+vmovdqa 2080(%r9), %ymm0
+vpaddw 2784(%r9), %ymm0, %ymm0
+vmovdqa %ymm0, 7968(%rsp)
+vmovdqa 6592(%rsp), %ymm0
+vmovdqa 7296(%rsp), %ymm6
+vmovdqa 6624(%rsp), %ymm1
+vmovdqa 7328(%rsp), %ymm7
+vmovdqa 6656(%rsp), %ymm2
+vmovdqa 7360(%rsp), %ymm8
+vmovdqa 6688(%rsp), %ymm3
+vmovdqa 7392(%rsp), %ymm9
+vmovdqa 6720(%rsp), %ymm4
+vmovdqa 7424(%rsp), %ymm10
+vmovdqa 6752(%rsp), %ymm5
+vmovdqa 7456(%rsp), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8000(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8032(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8064(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8096(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8128(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8160(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8192(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8224(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8256(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8288(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 8320(%rsp)
+vmovdqa 6784(%rsp), %ymm0
+vmovdqa 7488(%rsp), %ymm6
+vmovdqa 6816(%rsp), %ymm1
+vmovdqa 7520(%rsp), %ymm7
+vmovdqa 6848(%rsp), %ymm2
+vmovdqa 7552(%rsp), %ymm8
+vmovdqa 6880(%rsp), %ymm3
+vmovdqa 7584(%rsp), %ymm9
+vmovdqa 6912(%rsp), %ymm4
+vmovdqa 7616(%rsp), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8384(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8416(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8448(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8480(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8512(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8544(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8576(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8608(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 8640(%rsp)
+vpaddw 6592(%rsp), %ymm0, %ymm0
+vpaddw 7296(%rsp), %ymm6, %ymm6
+vpaddw 6624(%rsp), %ymm1, %ymm1
+vpaddw 7328(%rsp), %ymm7, %ymm7
+vpaddw 6656(%rsp), %ymm2, %ymm2
+vpaddw 7360(%rsp), %ymm8, %ymm8
+vpaddw 6688(%rsp), %ymm3, %ymm3
+vpaddw 7392(%rsp), %ymm9, %ymm9
+vpaddw 6720(%rsp), %ymm4, %ymm4
+vpaddw 7424(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 8160(%rsp), %ymm12, %ymm12
+vpsubw 8544(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 8352(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 8192(%rsp), %ymm0
+vpsubw 8384(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 8576(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 8384(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 8000(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8192(%rsp)
+vmovdqa 8224(%rsp), %ymm1
+vpsubw 8416(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 8608(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 8416(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 8032(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 8224(%rsp)
+vmovdqa 8256(%rsp), %ymm2
+vpsubw 8448(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 8640(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 8448(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 8064(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 8256(%rsp)
+vmovdqa 8288(%rsp), %ymm3
+vpsubw 8480(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 8480(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 8096(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 8288(%rsp)
+vmovdqa 8320(%rsp), %ymm4
+vpsubw 8512(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 8128(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 8320(%rsp)
+vmovdqa 6944(%rsp), %ymm0
+vmovdqa 7648(%rsp), %ymm6
+vmovdqa 6976(%rsp), %ymm1
+vmovdqa 7680(%rsp), %ymm7
+vmovdqa 7008(%rsp), %ymm2
+vmovdqa 7712(%rsp), %ymm8
+vmovdqa 7040(%rsp), %ymm3
+vmovdqa 7744(%rsp), %ymm9
+vmovdqa 7072(%rsp), %ymm4
+vmovdqa 7776(%rsp), %ymm10
+vmovdqa 7104(%rsp), %ymm5
+vmovdqa 7808(%rsp), %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 8704(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8736(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8768(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8800(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8832(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8864(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8896(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8928(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 8960(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 8992(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 9024(%rsp)
+vmovdqa 7136(%rsp), %ymm0
+vmovdqa 7840(%rsp), %ymm6
+vmovdqa 7168(%rsp), %ymm1
+vmovdqa 7872(%rsp), %ymm7
+vmovdqa 7200(%rsp), %ymm2
+vmovdqa 7904(%rsp), %ymm8
+vmovdqa 7232(%rsp), %ymm3
+vmovdqa 7936(%rsp), %ymm9
+vmovdqa 7264(%rsp), %ymm4
+vmovdqa 7968(%rsp), %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 9088(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9120(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9152(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9184(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9216(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9248(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 9280(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 9312(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 9344(%rsp)
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 8864(%rsp), %ymm12, %ymm12
+vpsubw 9248(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 9056(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 8896(%rsp), %ymm0
+vpsubw 9088(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 9280(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 9088(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 8704(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8896(%rsp)
+vmovdqa 8928(%rsp), %ymm1
+vpsubw 9120(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 9312(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 9120(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 8736(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 8928(%rsp)
+vmovdqa 8960(%rsp), %ymm2
+vpsubw 9152(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 9344(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 9152(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 8768(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 8960(%rsp)
+vmovdqa 8992(%rsp), %ymm3
+vpsubw 9184(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 9184(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 8800(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 8992(%rsp)
+vmovdqa 9024(%rsp), %ymm4
+vpsubw 9216(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 8832(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 9024(%rsp)
+vmovdqa 6592(%rsp), %ymm0
+vmovdqa 7296(%rsp), %ymm6
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vmovdqa 6624(%rsp), %ymm1
+vmovdqa 7328(%rsp), %ymm7
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vmovdqa 6656(%rsp), %ymm2
+vmovdqa 7360(%rsp), %ymm8
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vmovdqa 6688(%rsp), %ymm3
+vmovdqa 7392(%rsp), %ymm9
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vmovdqa 6720(%rsp), %ymm4
+vmovdqa 7424(%rsp), %ymm10
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vmovdqa 6752(%rsp), %ymm5
+vmovdqa 7456(%rsp), %ymm11
+vpaddw 7104(%rsp), %ymm5, %ymm5
+vpaddw 7808(%rsp), %ymm11, %ymm11
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 5888(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5920(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 5952(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 5984(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6016(%rsp)
+vpmullw %ymm0, %ymm11, %ymm13
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6048(%rsp)
+vpmullw %ymm1, %ymm11, %ymm12
+vpmullw %ymm2, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6080(%rsp)
+vpmullw %ymm2, %ymm11, %ymm13
+vpmullw %ymm3, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm5, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6112(%rsp)
+vpmullw %ymm3, %ymm11, %ymm12
+vpmullw %ymm4, %ymm10, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm5, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6144(%rsp)
+vpmullw %ymm4, %ymm11, %ymm13
+vpmullw %ymm5, %ymm10, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6176(%rsp)
+vpmullw %ymm5, %ymm11, %ymm12
+vmovdqa %ymm12, 6208(%rsp)
+vmovdqa 6784(%rsp), %ymm0
+vmovdqa 7488(%rsp), %ymm6
+vpaddw 7136(%rsp), %ymm0, %ymm0
+vpaddw 7840(%rsp), %ymm6, %ymm6
+vmovdqa 6816(%rsp), %ymm1
+vmovdqa 7520(%rsp), %ymm7
+vpaddw 7168(%rsp), %ymm1, %ymm1
+vpaddw 7872(%rsp), %ymm7, %ymm7
+vmovdqa 6848(%rsp), %ymm2
+vmovdqa 7552(%rsp), %ymm8
+vpaddw 7200(%rsp), %ymm2, %ymm2
+vpaddw 7904(%rsp), %ymm8, %ymm8
+vmovdqa 6880(%rsp), %ymm3
+vmovdqa 7584(%rsp), %ymm9
+vpaddw 7232(%rsp), %ymm3, %ymm3
+vpaddw 7936(%rsp), %ymm9, %ymm9
+vmovdqa 6912(%rsp), %ymm4
+vmovdqa 7616(%rsp), %ymm10
+vpaddw 7264(%rsp), %ymm4, %ymm4
+vpaddw 7968(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm6, %ymm12
+vmovdqa %ymm12, 6272(%rsp)
+vpmullw %ymm0, %ymm7, %ymm13
+vpmullw %ymm1, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6304(%rsp)
+vpmullw %ymm0, %ymm8, %ymm12
+vpmullw %ymm1, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6336(%rsp)
+vpmullw %ymm0, %ymm9, %ymm13
+vpmullw %ymm1, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm2, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm6, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6368(%rsp)
+vpmullw %ymm0, %ymm10, %ymm12
+vpmullw %ymm1, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm2, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm3, %ymm7, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm6, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6400(%rsp)
+vpmullw %ymm1, %ymm10, %ymm13
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6432(%rsp)
+vpmullw %ymm2, %ymm10, %ymm12
+vpmullw %ymm3, %ymm9, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vpmullw %ymm4, %ymm8, %ymm15
+vpaddw %ymm12, %ymm15, %ymm12
+vmovdqa %ymm12, 6464(%rsp)
+vpmullw %ymm3, %ymm10, %ymm13
+vpmullw %ymm4, %ymm9, %ymm15
+vpaddw %ymm13, %ymm15, %ymm13
+vmovdqa %ymm13, 6496(%rsp)
+vpmullw %ymm4, %ymm10, %ymm12
+vmovdqa %ymm12, 6528(%rsp)
+vpaddw 6592(%rsp), %ymm0, %ymm0
+vpaddw 7296(%rsp), %ymm6, %ymm6
+vpaddw 6944(%rsp), %ymm0, %ymm0
+vpaddw 7648(%rsp), %ymm6, %ymm6
+vpaddw 6624(%rsp), %ymm1, %ymm1
+vpaddw 7328(%rsp), %ymm7, %ymm7
+vpaddw 6976(%rsp), %ymm1, %ymm1
+vpaddw 7680(%rsp), %ymm7, %ymm7
+vpaddw 6656(%rsp), %ymm2, %ymm2
+vpaddw 7360(%rsp), %ymm8, %ymm8
+vpaddw 7008(%rsp), %ymm2, %ymm2
+vpaddw 7712(%rsp), %ymm8, %ymm8
+vpaddw 6688(%rsp), %ymm3, %ymm3
+vpaddw 7392(%rsp), %ymm9, %ymm9
+vpaddw 7040(%rsp), %ymm3, %ymm3
+vpaddw 7744(%rsp), %ymm9, %ymm9
+vpaddw 6720(%rsp), %ymm4, %ymm4
+vpaddw 7424(%rsp), %ymm10, %ymm10
+vpaddw 7072(%rsp), %ymm4, %ymm4
+vpaddw 7776(%rsp), %ymm10, %ymm10
+vpmullw %ymm0, %ymm11, %ymm12
+vpmullw %ymm1, %ymm10, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm2, %ymm9, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm3, %ymm8, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm4, %ymm7, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpmullw %ymm5, %ymm6, %ymm15
+vpaddw %ymm15, %ymm12, %ymm12
+vpsubw 6048(%rsp), %ymm12, %ymm12
+vpsubw 6432(%rsp), %ymm12, %ymm12
+vmovdqa %ymm12, 6240(%rsp)
+vpmullw %ymm5, %ymm7, %ymm12
+vpmullw %ymm5, %ymm8, %ymm13
+vpmullw %ymm5, %ymm9, %ymm14
+vpmullw %ymm5, %ymm10, %ymm15
+vpmullw %ymm1, %ymm11, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm10, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm3, %ymm9, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm4, %ymm8, %ymm5
+vpaddw %ymm5, %ymm12, %ymm12
+vpmullw %ymm2, %ymm11, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm10, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm4, %ymm9, %ymm5
+vpaddw %ymm5, %ymm13, %ymm13
+vpmullw %ymm3, %ymm11, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm10, %ymm5
+vpaddw %ymm5, %ymm14, %ymm14
+vpmullw %ymm4, %ymm11, %ymm5
+vpaddw %ymm5, %ymm15, %ymm15
+vpmullw %ymm0, %ymm10, %ymm11
+vpmullw %ymm1, %ymm9, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm2, %ymm8, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm3, %ymm7, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm4, %ymm6, %ymm5
+vpaddw %ymm5, %ymm11, %ymm11
+vpmullw %ymm0, %ymm9, %ymm10
+vpmullw %ymm1, %ymm8, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm2, %ymm7, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm3, %ymm6, %ymm5
+vpaddw %ymm5, %ymm10, %ymm10
+vpmullw %ymm0, %ymm8, %ymm9
+vpmullw %ymm1, %ymm7, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm2, %ymm6, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vpmullw %ymm0, %ymm7, %ymm8
+vpmullw %ymm1, %ymm6, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vpmullw %ymm0, %ymm6, %ymm7
+vmovdqa 6080(%rsp), %ymm0
+vpsubw 6272(%rsp), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm6
+vpsubw 6464(%rsp), %ymm6, %ymm6
+vmovdqa %ymm6, 6272(%rsp)
+vpaddw %ymm7, %ymm0, %ymm0
+vpsubw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 6080(%rsp)
+vmovdqa 6112(%rsp), %ymm1
+vpsubw 6304(%rsp), %ymm1, %ymm1
+vpsubw %ymm1, %ymm13, %ymm7
+vpsubw 6496(%rsp), %ymm7, %ymm7
+vmovdqa %ymm7, 6304(%rsp)
+vpaddw %ymm8, %ymm1, %ymm1
+vpsubw 5920(%rsp), %ymm1, %ymm1
+vmovdqa %ymm1, 6112(%rsp)
+vmovdqa 6144(%rsp), %ymm2
+vpsubw 6336(%rsp), %ymm2, %ymm2
+vpsubw %ymm2, %ymm14, %ymm8
+vpsubw 6528(%rsp), %ymm8, %ymm8
+vmovdqa %ymm8, 6336(%rsp)
+vpaddw %ymm9, %ymm2, %ymm2
+vpsubw 5952(%rsp), %ymm2, %ymm2
+vmovdqa %ymm2, 6144(%rsp)
+vmovdqa 6176(%rsp), %ymm3
+vpsubw 6368(%rsp), %ymm3, %ymm3
+vpsubw %ymm3, %ymm15, %ymm9
+vmovdqa %ymm9, 6368(%rsp)
+vpaddw %ymm10, %ymm3, %ymm3
+vpsubw 5984(%rsp), %ymm3, %ymm3
+vmovdqa %ymm3, 6176(%rsp)
+vmovdqa 6208(%rsp), %ymm4
+vpsubw 6400(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vpsubw 6016(%rsp), %ymm4, %ymm4
+vmovdqa %ymm4, 6208(%rsp)
+vmovdqa 8352(%rsp), %ymm0
+vpsubw 8704(%rsp), %ymm0, %ymm0
+vmovdqa 6240(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9056(%rsp), %ymm1, %ymm6
+vpsubw 8000(%rsp), %ymm0, %ymm0
+vpaddw 5888(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8352(%rsp)
+vmovdqa 8384(%rsp), %ymm0
+vpsubw 8736(%rsp), %ymm0, %ymm0
+vmovdqa 6272(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9088(%rsp), %ymm1, %ymm7
+vpsubw 8032(%rsp), %ymm0, %ymm0
+vpaddw 5920(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8384(%rsp)
+vmovdqa 8416(%rsp), %ymm0
+vpsubw 8768(%rsp), %ymm0, %ymm0
+vmovdqa 6304(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9120(%rsp), %ymm1, %ymm8
+vpsubw 8064(%rsp), %ymm0, %ymm0
+vpaddw 5952(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8416(%rsp)
+vmovdqa 8448(%rsp), %ymm0
+vpsubw 8800(%rsp), %ymm0, %ymm0
+vmovdqa 6336(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9152(%rsp), %ymm1, %ymm9
+vpsubw 8096(%rsp), %ymm0, %ymm0
+vpaddw 5984(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8448(%rsp)
+vmovdqa 8480(%rsp), %ymm0
+vpsubw 8832(%rsp), %ymm0, %ymm0
+vmovdqa 6368(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9184(%rsp), %ymm1, %ymm10
+vpsubw 8128(%rsp), %ymm0, %ymm0
+vpaddw 6016(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8480(%rsp)
+vmovdqa 8512(%rsp), %ymm0
+vpsubw 8864(%rsp), %ymm0, %ymm0
+vmovdqa 6400(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9216(%rsp), %ymm1, %ymm11
+vpsubw 8160(%rsp), %ymm0, %ymm0
+vpaddw 6048(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8512(%rsp)
+vmovdqa 8544(%rsp), %ymm0
+vpsubw 8896(%rsp), %ymm0, %ymm0
+vmovdqa 6432(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9248(%rsp), %ymm1, %ymm12
+vpsubw 8192(%rsp), %ymm0, %ymm0
+vpaddw 6080(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8544(%rsp)
+vmovdqa 8576(%rsp), %ymm0
+vpsubw 8928(%rsp), %ymm0, %ymm0
+vmovdqa 6464(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9280(%rsp), %ymm1, %ymm13
+vpsubw 8224(%rsp), %ymm0, %ymm0
+vpaddw 6112(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8576(%rsp)
+vmovdqa 8608(%rsp), %ymm0
+vpsubw 8960(%rsp), %ymm0, %ymm0
+vmovdqa 6496(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9312(%rsp), %ymm1, %ymm14
+vpsubw 8256(%rsp), %ymm0, %ymm0
+vpaddw 6144(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8608(%rsp)
+vmovdqa 8640(%rsp), %ymm0
+vpsubw 8992(%rsp), %ymm0, %ymm0
+vmovdqa 6528(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 9344(%rsp), %ymm1, %ymm15
+vpsubw 8288(%rsp), %ymm0, %ymm0
+vpaddw 6176(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 8640(%rsp)
+vmovdqa 6208(%rsp), %ymm0
+vpsubw 8320(%rsp), %ymm0, %ymm0
+vpsubw 9024(%rsp), %ymm0, %ymm0
+vpsubw 3488(%r10), %ymm0, %ymm0
+vpsubw 4896(%r10), %ymm0, %ymm0
+vmovdqa %ymm0, 4192(%r10)
+vmovdqa 3520(%r10), %ymm0
+vpsubw 4224(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm6, %ymm6
+vpsubw 4928(%r10), %ymm6, %ymm6
+vpsubw 2816(%r10), %ymm0, %ymm0
+vpaddw 8000(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3520(%r10)
+vmovdqa %ymm6, 4224(%r10)
+vmovdqa 3552(%r10), %ymm0
+vpsubw 4256(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm7, %ymm7
+vpsubw 4960(%r10), %ymm7, %ymm7
+vpsubw 2848(%r10), %ymm0, %ymm0
+vpaddw 8032(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3552(%r10)
+vmovdqa %ymm7, 4256(%r10)
+vmovdqa 3584(%r10), %ymm0
+vpsubw 4288(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm8, %ymm8
+vpsubw 4992(%r10), %ymm8, %ymm8
+vpsubw 2880(%r10), %ymm0, %ymm0
+vpaddw 8064(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3584(%r10)
+vmovdqa %ymm8, 4288(%r10)
+vmovdqa 3616(%r10), %ymm0
+vpsubw 4320(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm9, %ymm9
+vpsubw 5024(%r10), %ymm9, %ymm9
+vpsubw 2912(%r10), %ymm0, %ymm0
+vpaddw 8096(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3616(%r10)
+vmovdqa %ymm9, 4320(%r10)
+vmovdqa 3648(%r10), %ymm0
+vpsubw 4352(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm10, %ymm10
+vpsubw 5056(%r10), %ymm10, %ymm10
+vpsubw 2944(%r10), %ymm0, %ymm0
+vpaddw 8128(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3648(%r10)
+vmovdqa %ymm10, 4352(%r10)
+vmovdqa 3680(%r10), %ymm0
+vpsubw 4384(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm11, %ymm11
+vpsubw 5088(%r10), %ymm11, %ymm11
+vpsubw 2976(%r10), %ymm0, %ymm0
+vpaddw 8160(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3680(%r10)
+vmovdqa %ymm11, 4384(%r10)
+vmovdqa 3712(%r10), %ymm0
+vpsubw 4416(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm12, %ymm12
+vpsubw 5120(%r10), %ymm12, %ymm12
+vpsubw 3008(%r10), %ymm0, %ymm0
+vpaddw 8192(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3712(%r10)
+vmovdqa %ymm12, 4416(%r10)
+vmovdqa 3744(%r10), %ymm0
+vpsubw 4448(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm13, %ymm13
+vpsubw 5152(%r10), %ymm13, %ymm13
+vpsubw 3040(%r10), %ymm0, %ymm0
+vpaddw 8224(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3744(%r10)
+vmovdqa %ymm13, 4448(%r10)
+vmovdqa 3776(%r10), %ymm0
+vpsubw 4480(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm14, %ymm14
+vpsubw 5184(%r10), %ymm14, %ymm14
+vpsubw 3072(%r10), %ymm0, %ymm0
+vpaddw 8256(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3776(%r10)
+vmovdqa %ymm14, 4480(%r10)
+vmovdqa 3808(%r10), %ymm0
+vpsubw 4512(%r10), %ymm0, %ymm0
+vpsubw %ymm0, %ymm15, %ymm15
+vpsubw 5216(%r10), %ymm15, %ymm15
+vpsubw 3104(%r10), %ymm0, %ymm0
+vpaddw 8288(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3808(%r10)
+vmovdqa %ymm15, 4512(%r10)
+vmovdqa 3840(%r10), %ymm0
+vpsubw 4544(%r10), %ymm0, %ymm0
+vmovdqa 9024(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5248(%r10), %ymm1, %ymm1
+vpsubw 3136(%r10), %ymm0, %ymm0
+vpaddw 8320(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3840(%r10)
+vmovdqa %ymm1, 4544(%r10)
+vmovdqa 3872(%r10), %ymm0
+vpsubw 4576(%r10), %ymm0, %ymm0
+vmovdqa 9056(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5280(%r10), %ymm1, %ymm1
+vpsubw 3168(%r10), %ymm0, %ymm0
+vpaddw 8352(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3872(%r10)
+vmovdqa %ymm1, 4576(%r10)
+vmovdqa 3904(%r10), %ymm0
+vpsubw 4608(%r10), %ymm0, %ymm0
+vmovdqa 9088(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5312(%r10), %ymm1, %ymm1
+vpsubw 3200(%r10), %ymm0, %ymm0
+vpaddw 8384(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3904(%r10)
+vmovdqa %ymm1, 4608(%r10)
+vmovdqa 3936(%r10), %ymm0
+vpsubw 4640(%r10), %ymm0, %ymm0
+vmovdqa 9120(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5344(%r10), %ymm1, %ymm1
+vpsubw 3232(%r10), %ymm0, %ymm0
+vpaddw 8416(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3936(%r10)
+vmovdqa %ymm1, 4640(%r10)
+vmovdqa 3968(%r10), %ymm0
+vpsubw 4672(%r10), %ymm0, %ymm0
+vmovdqa 9152(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5376(%r10), %ymm1, %ymm1
+vpsubw 3264(%r10), %ymm0, %ymm0
+vpaddw 8448(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 3968(%r10)
+vmovdqa %ymm1, 4672(%r10)
+vmovdqa 4000(%r10), %ymm0
+vpsubw 4704(%r10), %ymm0, %ymm0
+vmovdqa 9184(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5408(%r10), %ymm1, %ymm1
+vpsubw 3296(%r10), %ymm0, %ymm0
+vpaddw 8480(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4000(%r10)
+vmovdqa %ymm1, 4704(%r10)
+vmovdqa 4032(%r10), %ymm0
+vpsubw 4736(%r10), %ymm0, %ymm0
+vmovdqa 9216(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5440(%r10), %ymm1, %ymm1
+vpsubw 3328(%r10), %ymm0, %ymm0
+vpaddw 8512(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4032(%r10)
+vmovdqa %ymm1, 4736(%r10)
+vmovdqa 4064(%r10), %ymm0
+vpsubw 4768(%r10), %ymm0, %ymm0
+vmovdqa 9248(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5472(%r10), %ymm1, %ymm1
+vpsubw 3360(%r10), %ymm0, %ymm0
+vpaddw 8544(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4064(%r10)
+vmovdqa %ymm1, 4768(%r10)
+vmovdqa 4096(%r10), %ymm0
+vpsubw 4800(%r10), %ymm0, %ymm0
+vmovdqa 9280(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5504(%r10), %ymm1, %ymm1
+vpsubw 3392(%r10), %ymm0, %ymm0
+vpaddw 8576(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4096(%r10)
+vmovdqa %ymm1, 4800(%r10)
+vmovdqa 4128(%r10), %ymm0
+vpsubw 4832(%r10), %ymm0, %ymm0
+vmovdqa 9312(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5536(%r10), %ymm1, %ymm1
+vpsubw 3424(%r10), %ymm0, %ymm0
+vpaddw 8608(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4128(%r10)
+vmovdqa %ymm1, 4832(%r10)
+vmovdqa 4160(%r10), %ymm0
+vpsubw 4864(%r10), %ymm0, %ymm0
+vmovdqa 9344(%rsp), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5568(%r10), %ymm1, %ymm1
+vpsubw 3456(%r10), %ymm0, %ymm0
+vpaddw 8640(%rsp), %ymm0, %ymm0
+vmovdqa %ymm0, 4160(%r10)
+vmovdqa %ymm1, 4864(%r10)
+vpxor %ymm1, %ymm1, %ymm1
+vmovdqa %ymm1, 5600(%r10)
+subq $32, %rsp
+vmovdqa 2816(%r10), %ymm0
+vmovdqa 2880(%r10), %ymm1
+vmovdqa 2944(%r10), %ymm2
+vmovdqa 3008(%r10), %ymm3
+vpunpcklwd 2848(%r10), %ymm0, %ymm4
+vpunpckhwd 2848(%r10), %ymm0, %ymm5
+vpunpcklwd 2912(%r10), %ymm1, %ymm6
+vpunpckhwd 2912(%r10), %ymm1, %ymm7
+vpunpcklwd 2976(%r10), %ymm2, %ymm8
+vpunpckhwd 2976(%r10), %ymm2, %ymm9
+vpunpcklwd 3040(%r10), %ymm3, %ymm10
+vpunpckhwd 3040(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 3072(%r10), %ymm0
+vmovdqa 3136(%r10), %ymm1
+vmovdqa 3200(%r10), %ymm2
+vmovdqa 3264(%r10), %ymm3
+vpunpcklwd 3104(%r10), %ymm0, %ymm12
+vpunpckhwd 3104(%r10), %ymm0, %ymm13
+vpunpcklwd 3168(%r10), %ymm1, %ymm14
+vpunpckhwd 3168(%r10), %ymm1, %ymm15
+vpunpcklwd 3232(%r10), %ymm2, %ymm0
+vpunpckhwd 3232(%r10), %ymm2, %ymm1
+vpunpcklwd 3296(%r10), %ymm3, %ymm2
+vpunpckhwd 3296(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 0(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 192(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 384(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 576(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 768(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 960(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1152(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1536(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1728(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1920(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2112(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2304(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2496(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2688(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1344(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2880(%r12)
+vmovdqa 3328(%r10), %ymm0
+vmovdqa 3392(%r10), %ymm1
+vmovdqa 3456(%r10), %ymm2
+vmovdqa 3520(%r10), %ymm3
+vpunpcklwd 3360(%r10), %ymm0, %ymm4
+vpunpckhwd 3360(%r10), %ymm0, %ymm5
+vpunpcklwd 3424(%r10), %ymm1, %ymm6
+vpunpckhwd 3424(%r10), %ymm1, %ymm7
+vpunpcklwd 3488(%r10), %ymm2, %ymm8
+vpunpckhwd 3488(%r10), %ymm2, %ymm9
+vpunpcklwd 3552(%r10), %ymm3, %ymm10
+vpunpckhwd 3552(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 3584(%r10), %ymm0
+vmovdqa 3648(%r10), %ymm1
+vmovdqa 3712(%r10), %ymm2
+vmovdqa 3776(%r10), %ymm3
+vpunpcklwd 3616(%r10), %ymm0, %ymm12
+vpunpckhwd 3616(%r10), %ymm0, %ymm13
+vpunpcklwd 3680(%r10), %ymm1, %ymm14
+vpunpckhwd 3680(%r10), %ymm1, %ymm15
+vpunpcklwd 3744(%r10), %ymm2, %ymm0
+vpunpckhwd 3744(%r10), %ymm2, %ymm1
+vpunpcklwd 3808(%r10), %ymm3, %ymm2
+vpunpckhwd 3808(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 32(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 224(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 416(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 608(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 800(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 992(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1184(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1568(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1760(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1952(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2144(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2336(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2528(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2720(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1376(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2912(%r12)
+vmovdqa 3840(%r10), %ymm0
+vmovdqa 3904(%r10), %ymm1
+vmovdqa 3968(%r10), %ymm2
+vmovdqa 4032(%r10), %ymm3
+vpunpcklwd 3872(%r10), %ymm0, %ymm4
+vpunpckhwd 3872(%r10), %ymm0, %ymm5
+vpunpcklwd 3936(%r10), %ymm1, %ymm6
+vpunpckhwd 3936(%r10), %ymm1, %ymm7
+vpunpcklwd 4000(%r10), %ymm2, %ymm8
+vpunpckhwd 4000(%r10), %ymm2, %ymm9
+vpunpcklwd 4064(%r10), %ymm3, %ymm10
+vpunpckhwd 4064(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4096(%r10), %ymm0
+vmovdqa 4160(%r10), %ymm1
+vmovdqa 4224(%r10), %ymm2
+vmovdqa 4288(%r10), %ymm3
+vpunpcklwd 4128(%r10), %ymm0, %ymm12
+vpunpckhwd 4128(%r10), %ymm0, %ymm13
+vpunpcklwd 4192(%r10), %ymm1, %ymm14
+vpunpckhwd 4192(%r10), %ymm1, %ymm15
+vpunpcklwd 4256(%r10), %ymm2, %ymm0
+vpunpckhwd 4256(%r10), %ymm2, %ymm1
+vpunpcklwd 4320(%r10), %ymm3, %ymm2
+vpunpckhwd 4320(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 64(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 256(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 448(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 640(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 832(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1024(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1216(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1600(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1792(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 1984(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2176(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2368(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2560(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2752(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1408(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2944(%r12)
+vmovdqa 4224(%r10), %ymm0
+vmovdqa 4288(%r10), %ymm1
+vmovdqa 4352(%r10), %ymm2
+vmovdqa 4416(%r10), %ymm3
+vpunpcklwd 4256(%r10), %ymm0, %ymm4
+vpunpckhwd 4256(%r10), %ymm0, %ymm5
+vpunpcklwd 4320(%r10), %ymm1, %ymm6
+vpunpckhwd 4320(%r10), %ymm1, %ymm7
+vpunpcklwd 4384(%r10), %ymm2, %ymm8
+vpunpckhwd 4384(%r10), %ymm2, %ymm9
+vpunpcklwd 4448(%r10), %ymm3, %ymm10
+vpunpckhwd 4448(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4480(%r10), %ymm0
+vmovdqa 4544(%r10), %ymm1
+vmovdqa 4608(%r10), %ymm2
+vmovdqa 4672(%r10), %ymm3
+vpunpcklwd 4512(%r10), %ymm0, %ymm12
+vpunpckhwd 4512(%r10), %ymm0, %ymm13
+vpunpcklwd 4576(%r10), %ymm1, %ymm14
+vpunpckhwd 4576(%r10), %ymm1, %ymm15
+vpunpcklwd 4640(%r10), %ymm2, %ymm0
+vpunpckhwd 4640(%r10), %ymm2, %ymm1
+vpunpcklwd 4704(%r10), %ymm3, %ymm2
+vpunpckhwd 4704(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 96(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 288(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 480(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 672(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 864(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1056(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1248(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1632(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1824(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2016(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2208(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2400(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2592(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2784(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1440(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 2976(%r12)
+vmovdqa 4736(%r10), %ymm0
+vmovdqa 4800(%r10), %ymm1
+vmovdqa 4864(%r10), %ymm2
+vmovdqa 4928(%r10), %ymm3
+vpunpcklwd 4768(%r10), %ymm0, %ymm4
+vpunpckhwd 4768(%r10), %ymm0, %ymm5
+vpunpcklwd 4832(%r10), %ymm1, %ymm6
+vpunpckhwd 4832(%r10), %ymm1, %ymm7
+vpunpcklwd 4896(%r10), %ymm2, %ymm8
+vpunpckhwd 4896(%r10), %ymm2, %ymm9
+vpunpcklwd 4960(%r10), %ymm3, %ymm10
+vpunpckhwd 4960(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 4992(%r10), %ymm0
+vmovdqa 5056(%r10), %ymm1
+vmovdqa 5120(%r10), %ymm2
+vmovdqa 5184(%r10), %ymm3
+vpunpcklwd 5024(%r10), %ymm0, %ymm12
+vpunpckhwd 5024(%r10), %ymm0, %ymm13
+vpunpcklwd 5088(%r10), %ymm1, %ymm14
+vpunpckhwd 5088(%r10), %ymm1, %ymm15
+vpunpcklwd 5152(%r10), %ymm2, %ymm0
+vpunpckhwd 5152(%r10), %ymm2, %ymm1
+vpunpcklwd 5216(%r10), %ymm3, %ymm2
+vpunpckhwd 5216(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 128(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 320(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 512(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 704(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 896(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1088(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1280(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1664(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1856(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2048(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2240(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2432(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2624(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2816(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1472(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 3008(%r12)
+vmovdqa 5248(%r10), %ymm0
+vmovdqa 5312(%r10), %ymm1
+vmovdqa 5376(%r10), %ymm2
+vmovdqa 5440(%r10), %ymm3
+vpunpcklwd 5280(%r10), %ymm0, %ymm4
+vpunpckhwd 5280(%r10), %ymm0, %ymm5
+vpunpcklwd 5344(%r10), %ymm1, %ymm6
+vpunpckhwd 5344(%r10), %ymm1, %ymm7
+vpunpcklwd 5408(%r10), %ymm2, %ymm8
+vpunpckhwd 5408(%r10), %ymm2, %ymm9
+vpunpcklwd 5472(%r10), %ymm3, %ymm10
+vpunpckhwd 5472(%r10), %ymm3, %ymm11
+vpunpckldq %ymm6, %ymm4, %ymm0
+vpunpckhdq %ymm6, %ymm4, %ymm1
+vpunpckldq %ymm7, %ymm5, %ymm2
+vpunpckhdq %ymm7, %ymm5, %ymm3
+vpunpckldq %ymm10, %ymm8, %ymm12
+vpunpckhdq %ymm10, %ymm8, %ymm13
+vpunpckldq %ymm11, %ymm9, %ymm14
+vpunpckhdq %ymm11, %ymm9, %ymm15
+vpunpcklqdq %ymm12, %ymm0, %ymm4
+vpunpckhqdq %ymm12, %ymm0, %ymm5
+vpunpcklqdq %ymm13, %ymm1, %ymm6
+vpunpckhqdq %ymm13, %ymm1, %ymm7
+vpunpcklqdq %ymm14, %ymm2, %ymm8
+vpunpckhqdq %ymm14, %ymm2, %ymm9
+vpunpcklqdq %ymm15, %ymm3, %ymm10
+vpunpckhqdq %ymm15, %ymm3, %ymm11
+vmovdqa 5504(%r10), %ymm0
+vmovdqa 5568(%r10), %ymm1
+vmovdqa 5632(%r10), %ymm2
+vmovdqa 5696(%r10), %ymm3
+vpunpcklwd 5536(%r10), %ymm0, %ymm12
+vpunpckhwd 5536(%r10), %ymm0, %ymm13
+vpunpcklwd 5600(%r10), %ymm1, %ymm14
+vpunpckhwd 5600(%r10), %ymm1, %ymm15
+vpunpcklwd 5664(%r10), %ymm2, %ymm0
+vpunpckhwd 5664(%r10), %ymm2, %ymm1
+vpunpcklwd 5728(%r10), %ymm3, %ymm2
+vpunpckhwd 5728(%r10), %ymm3, %ymm3
+vmovdqa %ymm11, 0(%rsp)
+vpunpckldq %ymm14, %ymm12, %ymm11
+vpunpckhdq %ymm14, %ymm12, %ymm12
+vpunpckldq %ymm15, %ymm13, %ymm14
+vpunpckhdq %ymm15, %ymm13, %ymm15
+vpunpckldq %ymm2, %ymm0, %ymm13
+vpunpckhdq %ymm2, %ymm0, %ymm0
+vpunpckldq %ymm3, %ymm1, %ymm2
+vpunpckhdq %ymm3, %ymm1, %ymm1
+vpunpcklqdq %ymm13, %ymm11, %ymm3
+vpunpckhqdq %ymm13, %ymm11, %ymm13
+vpunpcklqdq %ymm0, %ymm12, %ymm11
+vpunpckhqdq %ymm0, %ymm12, %ymm0
+vpunpcklqdq %ymm2, %ymm14, %ymm12
+vpunpckhqdq %ymm2, %ymm14, %ymm2
+vpunpcklqdq %ymm1, %ymm15, %ymm14
+vpunpckhqdq %ymm1, %ymm15, %ymm1
+vinserti128 $1, %xmm3, %ymm4, %ymm15
+vmovdqa %ymm15, 160(%r12)
+vinserti128 $1, %xmm13, %ymm5, %ymm15
+vmovdqa %ymm15, 352(%r12)
+vinserti128 $1, %xmm11, %ymm6, %ymm15
+vmovdqa %ymm15, 544(%r12)
+vinserti128 $1, %xmm0, %ymm7, %ymm15
+vmovdqa %ymm15, 736(%r12)
+vinserti128 $1, %xmm12, %ymm8, %ymm15
+vmovdqa %ymm15, 928(%r12)
+vinserti128 $1, %xmm2, %ymm9, %ymm15
+vmovdqa %ymm15, 1120(%r12)
+vinserti128 $1, %xmm14, %ymm10, %ymm15
+vmovdqa %ymm15, 1312(%r12)
+vpermq $78, %ymm4, %ymm4
+vpermq $78, %ymm5, %ymm5
+vpermq $78, %ymm6, %ymm6
+vpermq $78, %ymm7, %ymm7
+vpermq $78, %ymm8, %ymm8
+vpermq $78, %ymm9, %ymm9
+vpermq $78, %ymm10, %ymm10
+vinserti128 $0, %xmm4, %ymm3, %ymm15
+vmovdqa %ymm15, 1696(%r12)
+vinserti128 $0, %xmm5, %ymm13, %ymm15
+vmovdqa %ymm15, 1888(%r12)
+vinserti128 $0, %xmm6, %ymm11, %ymm15
+vmovdqa %ymm15, 2080(%r12)
+vinserti128 $0, %xmm7, %ymm0, %ymm15
+vmovdqa %ymm15, 2272(%r12)
+vinserti128 $0, %xmm8, %ymm12, %ymm15
+vmovdqa %ymm15, 2464(%r12)
+vinserti128 $0, %xmm9, %ymm2, %ymm15
+vmovdqa %ymm15, 2656(%r12)
+vinserti128 $0, %xmm10, %ymm14, %ymm15
+vmovdqa %ymm15, 2848(%r12)
+vmovdqa 0(%rsp), %ymm11
+vinserti128 $1, %xmm1, %ymm11, %ymm14
+vmovdqa %ymm14, 1504(%r12)
+vpermq $78, %ymm11, %ymm11
+vinserti128 $0, %xmm11, %ymm1, %ymm1
+vmovdqa %ymm1, 3040(%r12)
+addq $32, %rsp
+add $1536, %rax
+add $1536, %r11
+add $3072, %r12
+dec %ecx
+jnz karatsuba_loop_4eced63f144beffcb0247f9c6f67d165
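+// End of the Karatsuba block loop. The instructions below rewind %r12 over
+// the blocks just written, re-adjust the stack frame, and zero the scratch
+// area that the recombination step below uses for carried words.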
+sub $12288, %r12
+add $9408, %rsp
+subq $2400, %rsp
+vpxor %ymm0, %ymm0, %ymm0
+vmovdqa %ymm0, 1792(%rsp)
+vmovdqa %ymm0, 1824(%rsp)
+vmovdqa %ymm0, 1856(%rsp)
+vmovdqa %ymm0, 1888(%rsp)
+vmovdqa %ymm0, 1920(%rsp)
+vmovdqa %ymm0, 1952(%rsp)
+vmovdqa %ymm0, 1984(%rsp)
+vmovdqa %ymm0, 2016(%rsp)
+vmovdqa %ymm0, 2048(%rsp)
+vmovdqa %ymm0, 2080(%rsp)
+vmovdqa %ymm0, 2112(%rsp)
+vmovdqa %ymm0, 2144(%rsp)
+vmovdqa %ymm0, 2176(%rsp)
+vmovdqa %ymm0, 2208(%rsp)
+vmovdqa %ymm0, 2240(%rsp)
+vmovdqa %ymm0, 2272(%rsp)
+vmovdqa %ymm0, 2304(%rsp)
+vmovdqa %ymm0, 2336(%rsp)
+vmovdqa %ymm0, 2368(%rsp)
+vmovdqa %ymm0, 2400(%rsp)
+vmovdqa %ymm0, 2432(%rsp)
+vmovdqa %ymm0, 2464(%rsp)
+vmovdqa %ymm0, 2496(%rsp)
+vmovdqa %ymm0, 2528(%rsp)
+vmovdqa %ymm0, 2560(%rsp)
+vmovdqa %ymm0, 2592(%rsp)
+vmovdqa %ymm0, 2624(%rsp)
+vmovdqa %ymm0, 2656(%rsp)
+vmovdqa %ymm0, 2688(%rsp)
+vmovdqa %ymm0, 2720(%rsp)
+vmovdqa %ymm0, 2752(%rsp)
+vmovdqa %ymm0, 2784(%rsp)
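+// Load the recombination constants (729, the inverse of 3, the inverse of 5,
+// and 9, as 16-bit lanes); they stay in %ymm12-%ymm15 and are used as
+// multipliers throughout the interpolation below.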
+vmovdqa const729(%rip), %ymm15
+vmovdqa const3_inv(%rip), %ymm14
+vmovdqa const5_inv(%rip), %ymm13
+vmovdqa const9(%rip), %ymm12
+vmovdqa 96(%r12), %ymm0
+vpsubw 192(%r12), %ymm0, %ymm0
+vmovdqa 480(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 288(%r12), %ymm1, %ymm1
+vpsubw 0(%r12), %ymm0, %ymm0
+vpaddw 384(%r12), %ymm0, %ymm0
+vmovdqa 672(%r12), %ymm2
+vpsubw 768(%r12), %ymm2, %ymm2
+vmovdqa 1056(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 864(%r12), %ymm3, %ymm3
+vpsubw 576(%r12), %ymm2, %ymm2
+vpaddw 960(%r12), %ymm2, %ymm2
+vmovdqa 1248(%r12), %ymm4
+vpsubw 1344(%r12), %ymm4, %ymm4
+vmovdqa 1632(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1440(%r12), %ymm5, %ymm5
+vpsubw 1152(%r12), %ymm4, %ymm4
+vpaddw 1536(%r12), %ymm4, %ymm4
+vpsubw 576(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 0(%r12), %ymm1, %ymm1
+vpaddw 1152(%r12), %ymm1, %ymm1
+vmovdqa 288(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1440(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 864(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 0(%r12), %ymm8
+vmovdqa 864(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1824(%r12), %ymm0
+vpsubw 1920(%r12), %ymm0, %ymm0
+vmovdqa 2208(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2016(%r12), %ymm1, %ymm1
+vpsubw 1728(%r12), %ymm0, %ymm0
+vpaddw 2112(%r12), %ymm0, %ymm0
+vmovdqa 2400(%r12), %ymm2
+vpsubw 2496(%r12), %ymm2, %ymm2
+vmovdqa 2784(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2592(%r12), %ymm3, %ymm3
+vpsubw 2304(%r12), %ymm2, %ymm2
+vpaddw 2688(%r12), %ymm2, %ymm2
+vmovdqa 2976(%r12), %ymm4
+vpsubw 3072(%r12), %ymm4, %ymm4
+vmovdqa 3360(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3168(%r12), %ymm5, %ymm5
+vpsubw 2880(%r12), %ymm4, %ymm4
+vpaddw 3264(%r12), %ymm4, %ymm4
+vpsubw 2304(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1728(%r12), %ymm1, %ymm1
+vpaddw 2880(%r12), %ymm1, %ymm1
+vmovdqa 2016(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3168(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2592(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1728(%r12), %ymm8
+vmovdqa 2592(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3552(%r12), %ymm0
+vpsubw 3648(%r12), %ymm0, %ymm0
+vmovdqa 3936(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3744(%r12), %ymm1, %ymm1
+vpsubw 3456(%r12), %ymm0, %ymm0
+vpaddw 3840(%r12), %ymm0, %ymm0
+vmovdqa 4128(%r12), %ymm2
+vpsubw 4224(%r12), %ymm2, %ymm2
+vmovdqa 4512(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4320(%r12), %ymm3, %ymm3
+vpsubw 4032(%r12), %ymm2, %ymm2
+vpaddw 4416(%r12), %ymm2, %ymm2
+vmovdqa 4704(%r12), %ymm4
+vpsubw 4800(%r12), %ymm4, %ymm4
+vmovdqa 5088(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4896(%r12), %ymm5, %ymm5
+vpsubw 4608(%r12), %ymm4, %ymm4
+vpaddw 4992(%r12), %ymm4, %ymm4
+vpsubw 4032(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3456(%r12), %ymm1, %ymm1
+vpaddw 4608(%r12), %ymm1, %ymm1
+vmovdqa 3744(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4896(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4320(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3456(%r12), %ymm8
+vmovdqa 4320(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5280(%r12), %ymm0
+vpsubw 5376(%r12), %ymm0, %ymm0
+vmovdqa 5664(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5472(%r12), %ymm1, %ymm1
+vpsubw 5184(%r12), %ymm0, %ymm0
+vpaddw 5568(%r12), %ymm0, %ymm0
+vmovdqa 5856(%r12), %ymm2
+vpsubw 5952(%r12), %ymm2, %ymm2
+vmovdqa 6240(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6048(%r12), %ymm3, %ymm3
+vpsubw 5760(%r12), %ymm2, %ymm2
+vpaddw 6144(%r12), %ymm2, %ymm2
+vmovdqa 6432(%r12), %ymm4
+vpsubw 6528(%r12), %ymm4, %ymm4
+vmovdqa 6816(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6624(%r12), %ymm5, %ymm5
+vpsubw 6336(%r12), %ymm4, %ymm4
+vpaddw 6720(%r12), %ymm4, %ymm4
+vpsubw 5760(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5184(%r12), %ymm1, %ymm1
+vpaddw 6336(%r12), %ymm1, %ymm1
+vmovdqa 5472(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6624(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6048(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5184(%r12), %ymm8
+vmovdqa 6048(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7008(%r12), %ymm0
+vpsubw 7104(%r12), %ymm0, %ymm0
+vmovdqa 7392(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7200(%r12), %ymm1, %ymm1
+vpsubw 6912(%r12), %ymm0, %ymm0
+vpaddw 7296(%r12), %ymm0, %ymm0
+vmovdqa 7584(%r12), %ymm2
+vpsubw 7680(%r12), %ymm2, %ymm2
+vmovdqa 7968(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7776(%r12), %ymm3, %ymm3
+vpsubw 7488(%r12), %ymm2, %ymm2
+vpaddw 7872(%r12), %ymm2, %ymm2
+vmovdqa 8160(%r12), %ymm4
+vpsubw 8256(%r12), %ymm4, %ymm4
+vmovdqa 8544(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8352(%r12), %ymm5, %ymm5
+vpsubw 8064(%r12), %ymm4, %ymm4
+vpaddw 8448(%r12), %ymm4, %ymm4
+vpsubw 7488(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6912(%r12), %ymm1, %ymm1
+vpaddw 8064(%r12), %ymm1, %ymm1
+vmovdqa 7200(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8352(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7776(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6912(%r12), %ymm8
+vmovdqa 7776(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8736(%r12), %ymm0
+vpsubw 8832(%r12), %ymm0, %ymm0
+vmovdqa 9120(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8928(%r12), %ymm1, %ymm1
+vpsubw 8640(%r12), %ymm0, %ymm0
+vpaddw 9024(%r12), %ymm0, %ymm0
+vmovdqa 9312(%r12), %ymm2
+vpsubw 9408(%r12), %ymm2, %ymm2
+vmovdqa 9696(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9504(%r12), %ymm3, %ymm3
+vpsubw 9216(%r12), %ymm2, %ymm2
+vpaddw 9600(%r12), %ymm2, %ymm2
+vmovdqa 9888(%r12), %ymm4
+vpsubw 9984(%r12), %ymm4, %ymm4
+vmovdqa 10272(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10080(%r12), %ymm5, %ymm5
+vpsubw 9792(%r12), %ymm4, %ymm4
+vpaddw 10176(%r12), %ymm4, %ymm4
+vpsubw 9216(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8640(%r12), %ymm1, %ymm1
+vpaddw 9792(%r12), %ymm1, %ymm1
+vmovdqa 8928(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10080(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9504(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8640(%r12), %ymm8
+vmovdqa 9504(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10464(%r12), %ymm0
+vpsubw 10560(%r12), %ymm0, %ymm0
+vmovdqa 10848(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10656(%r12), %ymm1, %ymm1
+vpsubw 10368(%r12), %ymm0, %ymm0
+vpaddw 10752(%r12), %ymm0, %ymm0
+vmovdqa 11040(%r12), %ymm2
+vpsubw 11136(%r12), %ymm2, %ymm2
+vmovdqa 11424(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11232(%r12), %ymm3, %ymm3
+vpsubw 10944(%r12), %ymm2, %ymm2
+vpaddw 11328(%r12), %ymm2, %ymm2
+vmovdqa 11616(%r12), %ymm4
+vpsubw 11712(%r12), %ymm4, %ymm4
+vmovdqa 12000(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11808(%r12), %ymm5, %ymm5
+vpsubw 11520(%r12), %ymm4, %ymm4
+vpaddw 11904(%r12), %ymm4, %ymm4
+vpsubw 10944(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10368(%r12), %ymm1, %ymm1
+vpaddw 11520(%r12), %ymm1, %ymm1
+vmovdqa 10656(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11808(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11232(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10368(%r12), %ymm8
+vmovdqa 11232(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vmovdqa 256(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm7
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm7, %ymm4
+vpaddd %ymm6, %ymm8, %ymm3
+vpsubd %ymm10, %ymm4, %ymm4
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm5, %ymm7, %ymm5
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm8
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm7, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm4, %ymm3
+vmovdqa 768(%rsp), %ymm4
+vpaddw 1024(%rsp), %ymm4, %ymm7
+vpsubw 1024(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpsllw $7, %ymm5, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vmovdqa 1280(%rsp), %ymm8
+vpsubw %ymm11, %ymm8, %ymm8
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpmullw %ymm12, %ymm7, %ymm8
+vpaddw %ymm8, %ymm3, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
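+// The shuf48_16/mask3_5_3_5 sequences split each register: the words destined
+// for the current output row are added in immediately, while the left-over
+// words (the low half of %ymm9) are parked on the stack and folded into the
+// matching row during the second pass further down.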
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm9
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw %ymm7, %ymm11, %ymm11
+vmovdqa %xmm9, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm9
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm9, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm9
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm9, 2560(%rsp)
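+// Mask the recombined words down to 13 bits (mod 8192, the coefficient
+// modulus) and store the first set of output rows at byte offsets 0, 352,
+// 704 and 1056 of the output.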
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 0(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 352(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 704(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 1056(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm8
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm7, %ymm7
+vmovdqa 288(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm3
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm9
+vpaddd %ymm6, %ymm4, %ymm10
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm9, %ymm10
+vmovdqa 800(%rsp), %ymm9
+vpaddw 1056(%rsp), %ymm9, %ymm3
+vpsubw 1056(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1312(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm7
+vpsubw %ymm7, %ymm4, %ymm7
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm4
+vpaddw %ymm4, %ymm10, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm7, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm7
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm3, %ymm3
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm7, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm7
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm7, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm7
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm8
+vpor %ymm8, %ymm11, %ymm11
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm7, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 88(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 440(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 792(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1144(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm3, %ymm3
+vmovdqa 320(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm10
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm7
+vpaddd %ymm6, %ymm9, %ymm8
+vpsubd %ymm4, %ymm7, %ymm7
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm9, %ymm7, %ymm7
+vpsubd %ymm10, %ymm8, %ymm8
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm7, %ymm8
+vmovdqa 832(%rsp), %ymm7
+vpaddw 1088(%rsp), %ymm7, %ymm10
+vpsubw 1088(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm11, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm8, %ymm10, %ymm10
+vmovdqa 1344(%rsp), %ymm9
+vpsubw %ymm11, %ymm9, %ymm9
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm9, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm8, %ymm8
+vpmullw %ymm12, %ymm10, %ymm9
+vpaddw %ymm9, %ymm8, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm7, %ymm9, %ymm9
+vpsubw %ymm9, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm10, %ymm10
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm3, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm3, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw %ymm5, %ymm8, %ymm8
+vmovdqa %xmm3, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 176(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 528(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 880(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 1232(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm10, %ymm10
+vmovdqa 352(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm8
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm8, %ymm3
+vpaddd %ymm6, %ymm7, %ymm4
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm10, %ymm4, %ymm4
+vpsubd %ymm11, %ymm8, %ymm11
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm7
+vpunpckhwd const0(%rip), %ymm11, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm7, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm3, %ymm4
+vmovdqa 864(%rsp), %ymm3
+vpaddw 1120(%rsp), %ymm3, %ymm8
+vpsubw 1120(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpsllw $7, %ymm11, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vmovdqa 1376(%rsp), %ymm7
+vpsubw %ymm5, %ymm7, %ymm7
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm7, %ymm10
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpmullw %ymm12, %ymm8, %ymm7
+vpaddw %ymm7, %ymm4, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm10, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm10
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm8, %ymm8
+vpaddw %ymm8, %ymm5, %ymm5
+vmovdqa %xmm10, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm10
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm7, %ymm7
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm10, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm10
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm9
+vpor %ymm9, %ymm11, %ymm11
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm10, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 264(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 616(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 968(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1320(%rdi)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm7
+vpunpckhwd const0(%rip), %ymm11, %ymm8
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm8, %ymm8
+vmovdqa 384(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm4
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm4, %ymm10
+vpaddd %ymm6, %ymm3, %ymm9
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm5, %ymm4, %ymm5
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm3, %ymm10, %ymm10
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm10, %ymm9
+vmovdqa 896(%rsp), %ymm10
+vpaddw 1152(%rsp), %ymm10, %ymm4
+vpsubw 1152(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpsllw $7, %ymm5, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1408(%rsp), %ymm3
+vpsubw %ymm11, %ymm3, %ymm3
+vpmullw %ymm15, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm3
+vpaddw %ymm3, %ymm9, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
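+// These rows overlap output that was already written, so the stored values at
+// 352, 704 and 1056 bytes are loaded back and the new contributions are
+// accumulated before reducing and storing again.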
+vmovdqu 352(%rdi), %ymm8
+vmovdqu 704(%rdi), %ymm7
+vmovdqu 1056(%rdi), %ymm2
+vpaddw %ymm11, %ymm8, %ymm11
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm9, %ymm2, %ymm9
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm2
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm10, %ymm10
+vmovdqu 0(%rdi), %ymm7
+vpaddw %ymm10, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 0(%rdi)
+vmovdqa %xmm2, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm2
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw %ymm4, %ymm11, %ymm11
+vmovdqa %xmm2, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm3, %ymm3
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm2, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm7
+vpor %ymm7, %ymm5, %ymm5
+vpaddw %ymm5, %ymm9, %ymm9
+vmovdqa %xmm2, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 704(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1056(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm4, %ymm4
+vmovdqa 416(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm9
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm9, %ymm2
+vpaddd %ymm6, %ymm10, %ymm7
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm4, %ymm7, %ymm7
+vpsubd %ymm11, %ymm9, %ymm11
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm10, %ymm2, %ymm2
+vpsubd %ymm9, %ymm7, %ymm7
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm2, %ymm7
+vmovdqa 928(%rsp), %ymm2
+vpaddw 1184(%rsp), %ymm2, %ymm9
+vpsubw 1184(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpsllw $7, %ymm11, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm7, %ymm9, %ymm9
+vmovdqa 1440(%rsp), %ymm10
+vpsubw %ymm5, %ymm10, %ymm10
+vpmullw %ymm15, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm7, %ymm7
+vpmullw %ymm12, %ymm9, %ymm10
+vpaddw %ymm10, %ymm7, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vmovdqu 440(%rdi), %ymm4
+vmovdqu 792(%rdi), %ymm3
+vmovdqu 1144(%rdi), %ymm8
+vpaddw %ymm5, %ymm4, %ymm5
+vpaddw %ymm6, %ymm3, %ymm6
+vpaddw %ymm7, %ymm8, %ymm7
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm8
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm2, %ymm2
+vmovdqu 88(%rdi), %ymm3
+vpaddw %ymm2, %ymm3, %ymm3
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 88(%rdi)
+vmovdqa %xmm8, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm8
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm9, %ymm9
+vpaddw %ymm9, %ymm5, %ymm5
+vmovdqa %xmm8, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm8
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm10, %ymm10
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm8, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm11, %ymm11
+vpaddw %ymm11, %ymm7, %ymm7
+vmovdqa %xmm8, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 440(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 792(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 1144(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm9, %ymm9
+vmovdqa 448(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm7
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm7, %ymm8
+vpaddd %ymm6, %ymm2, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm5, %ymm7, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm7, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm8, %ymm3
+vmovdqa 960(%rsp), %ymm8
+vpaddw 1216(%rsp), %ymm8, %ymm7
+vpsubw 1216(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm7, %ymm2
+vpsllw $7, %ymm5, %ymm7
+vpsubw %ymm7, %ymm2, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm3, %ymm7, %ymm7
+vmovdqa 1472(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm3
+vpmullw %ymm12, %ymm7, %ymm2
+vpaddw %ymm2, %ymm3, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vmovdqu 528(%rdi), %ymm9
+vmovdqu 880(%rdi), %ymm10
+vmovdqu 1232(%rdi), %ymm4
+vpaddw %ymm11, %ymm9, %ymm11
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm3, %ymm4, %ymm3
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vmovdqu 176(%rdi), %ymm10
+vpaddw %ymm8, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 176(%rdi)
+vmovdqa %xmm4, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm4
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw %ymm7, %ymm11, %ymm11
+vmovdqa %xmm4, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm4
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm2, %ymm2
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm4, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm4
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm4, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 880(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1232(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm7
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm7, %ymm7
+vmovdqa 480(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm3
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm4
+vpaddd %ymm6, %ymm8, %ymm10
+vpsubd %ymm2, %ymm4, %ymm4
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm4, %ymm10
+vmovdqa 992(%rsp), %ymm4
+vpaddw 1248(%rsp), %ymm4, %ymm3
+vpsubw 1248(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1504(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm7
+vpsubw %ymm7, %ymm8, %ymm7
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm8
+vpaddw %ymm8, %ymm10, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vmovdqu 616(%rdi), %ymm7
+vmovdqu 968(%rdi), %ymm2
+vmovdqu 1320(%rdi), %ymm9
+vpaddw %ymm5, %ymm7, %ymm5
+vpaddw %ymm6, %ymm2, %ymm6
+vpaddw %ymm10, %ymm9, %ymm10
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm4, %ymm4
+vmovdqu 264(%rdi), %ymm2
+vpaddw %ymm4, %ymm2, %ymm2
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 264(%rdi)
+vmovdqa %xmm9, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm9
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm3, %ymm3
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm9, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm9
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm9, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm9
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm9, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 616(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 968(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1320(%rdi)
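+// Second pass: the same interpolation, applied to the next 32-byte column of
+// each block (inputs at 128(%r12), 1856(%r12), ...). This pass also adds in
+// the carry words that the first pass parked on the stack (2048(%rsp) etc.).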
+vmovdqa 128(%r12), %ymm0
+vpsubw 224(%r12), %ymm0, %ymm0
+vmovdqa 512(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 320(%r12), %ymm1, %ymm1
+vpsubw 32(%r12), %ymm0, %ymm0
+vpaddw 416(%r12), %ymm0, %ymm0
+vmovdqa 704(%r12), %ymm2
+vpsubw 800(%r12), %ymm2, %ymm2
+vmovdqa 1088(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 896(%r12), %ymm3, %ymm3
+vpsubw 608(%r12), %ymm2, %ymm2
+vpaddw 992(%r12), %ymm2, %ymm2
+vmovdqa 1280(%r12), %ymm4
+vpsubw 1376(%r12), %ymm4, %ymm4
+vmovdqa 1664(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1472(%r12), %ymm5, %ymm5
+vpsubw 1184(%r12), %ymm4, %ymm4
+vpaddw 1568(%r12), %ymm4, %ymm4
+vpsubw 608(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 32(%r12), %ymm1, %ymm1
+vpaddw 1184(%r12), %ymm1, %ymm1
+vmovdqa 320(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1472(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 896(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 32(%r12), %ymm8
+vmovdqa 896(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1856(%r12), %ymm0
+vpsubw 1952(%r12), %ymm0, %ymm0
+vmovdqa 2240(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2048(%r12), %ymm1, %ymm1
+vpsubw 1760(%r12), %ymm0, %ymm0
+vpaddw 2144(%r12), %ymm0, %ymm0
+vmovdqa 2432(%r12), %ymm2
+vpsubw 2528(%r12), %ymm2, %ymm2
+vmovdqa 2816(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2624(%r12), %ymm3, %ymm3
+vpsubw 2336(%r12), %ymm2, %ymm2
+vpaddw 2720(%r12), %ymm2, %ymm2
+vmovdqa 3008(%r12), %ymm4
+vpsubw 3104(%r12), %ymm4, %ymm4
+vmovdqa 3392(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3200(%r12), %ymm5, %ymm5
+vpsubw 2912(%r12), %ymm4, %ymm4
+vpaddw 3296(%r12), %ymm4, %ymm4
+vpsubw 2336(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1760(%r12), %ymm1, %ymm1
+vpaddw 2912(%r12), %ymm1, %ymm1
+vmovdqa 2048(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3200(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2624(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1760(%r12), %ymm8
+vmovdqa 2624(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3584(%r12), %ymm0
+vpsubw 3680(%r12), %ymm0, %ymm0
+vmovdqa 3968(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3776(%r12), %ymm1, %ymm1
+vpsubw 3488(%r12), %ymm0, %ymm0
+vpaddw 3872(%r12), %ymm0, %ymm0
+vmovdqa 4160(%r12), %ymm2
+vpsubw 4256(%r12), %ymm2, %ymm2
+vmovdqa 4544(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4352(%r12), %ymm3, %ymm3
+vpsubw 4064(%r12), %ymm2, %ymm2
+vpaddw 4448(%r12), %ymm2, %ymm2
+vmovdqa 4736(%r12), %ymm4
+vpsubw 4832(%r12), %ymm4, %ymm4
+vmovdqa 5120(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4928(%r12), %ymm5, %ymm5
+vpsubw 4640(%r12), %ymm4, %ymm4
+vpaddw 5024(%r12), %ymm4, %ymm4
+vpsubw 4064(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3488(%r12), %ymm1, %ymm1
+vpaddw 4640(%r12), %ymm1, %ymm1
+vmovdqa 3776(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4928(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4352(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3488(%r12), %ymm8
+vmovdqa 4352(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5312(%r12), %ymm0
+vpsubw 5408(%r12), %ymm0, %ymm0
+vmovdqa 5696(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5504(%r12), %ymm1, %ymm1
+vpsubw 5216(%r12), %ymm0, %ymm0
+vpaddw 5600(%r12), %ymm0, %ymm0
+vmovdqa 5888(%r12), %ymm2
+vpsubw 5984(%r12), %ymm2, %ymm2
+vmovdqa 6272(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6080(%r12), %ymm3, %ymm3
+vpsubw 5792(%r12), %ymm2, %ymm2
+vpaddw 6176(%r12), %ymm2, %ymm2
+vmovdqa 6464(%r12), %ymm4
+vpsubw 6560(%r12), %ymm4, %ymm4
+vmovdqa 6848(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6656(%r12), %ymm5, %ymm5
+vpsubw 6368(%r12), %ymm4, %ymm4
+vpaddw 6752(%r12), %ymm4, %ymm4
+vpsubw 5792(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5216(%r12), %ymm1, %ymm1
+vpaddw 6368(%r12), %ymm1, %ymm1
+vmovdqa 5504(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6656(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6080(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5216(%r12), %ymm8
+vmovdqa 6080(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7040(%r12), %ymm0
+vpsubw 7136(%r12), %ymm0, %ymm0
+vmovdqa 7424(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7232(%r12), %ymm1, %ymm1
+vpsubw 6944(%r12), %ymm0, %ymm0
+vpaddw 7328(%r12), %ymm0, %ymm0
+vmovdqa 7616(%r12), %ymm2
+vpsubw 7712(%r12), %ymm2, %ymm2
+vmovdqa 8000(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7808(%r12), %ymm3, %ymm3
+vpsubw 7520(%r12), %ymm2, %ymm2
+vpaddw 7904(%r12), %ymm2, %ymm2
+vmovdqa 8192(%r12), %ymm4
+vpsubw 8288(%r12), %ymm4, %ymm4
+vmovdqa 8576(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8384(%r12), %ymm5, %ymm5
+vpsubw 8096(%r12), %ymm4, %ymm4
+vpaddw 8480(%r12), %ymm4, %ymm4
+vpsubw 7520(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6944(%r12), %ymm1, %ymm1
+vpaddw 8096(%r12), %ymm1, %ymm1
+vmovdqa 7232(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8384(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7808(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6944(%r12), %ymm8
+vmovdqa 7808(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8768(%r12), %ymm0
+vpsubw 8864(%r12), %ymm0, %ymm0
+vmovdqa 9152(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8960(%r12), %ymm1, %ymm1
+vpsubw 8672(%r12), %ymm0, %ymm0
+vpaddw 9056(%r12), %ymm0, %ymm0
+vmovdqa 9344(%r12), %ymm2
+vpsubw 9440(%r12), %ymm2, %ymm2
+vmovdqa 9728(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9536(%r12), %ymm3, %ymm3
+vpsubw 9248(%r12), %ymm2, %ymm2
+vpaddw 9632(%r12), %ymm2, %ymm2
+vmovdqa 9920(%r12), %ymm4
+vpsubw 10016(%r12), %ymm4, %ymm4
+vmovdqa 10304(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10112(%r12), %ymm5, %ymm5
+vpsubw 9824(%r12), %ymm4, %ymm4
+vpaddw 10208(%r12), %ymm4, %ymm4
+vpsubw 9248(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8672(%r12), %ymm1, %ymm1
+vpaddw 9824(%r12), %ymm1, %ymm1
+vmovdqa 8960(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10112(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9536(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8672(%r12), %ymm8
+vmovdqa 9536(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10496(%r12), %ymm0
+vpsubw 10592(%r12), %ymm0, %ymm0
+vmovdqa 10880(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10688(%r12), %ymm1, %ymm1
+vpsubw 10400(%r12), %ymm0, %ymm0
+vpaddw 10784(%r12), %ymm0, %ymm0
+vmovdqa 11072(%r12), %ymm2
+vpsubw 11168(%r12), %ymm2, %ymm2
+vmovdqa 11456(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11264(%r12), %ymm3, %ymm3
+vpsubw 10976(%r12), %ymm2, %ymm2
+vpaddw 11360(%r12), %ymm2, %ymm2
+vmovdqa 11648(%r12), %ymm4
+vpsubw 11744(%r12), %ymm4, %ymm4
+vmovdqa 12032(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11840(%r12), %ymm5, %ymm5
+vpsubw 11552(%r12), %ymm4, %ymm4
+vpaddw 11936(%r12), %ymm4, %ymm4
+vpsubw 10976(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10400(%r12), %ymm1, %ymm1
+vpaddw 11552(%r12), %ymm1, %ymm1
+vmovdqa 10688(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11840(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11264(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10400(%r12), %ymm8
+vmovdqa 11264(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vmovdqa 256(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm10
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm9
+vpaddd %ymm6, %ymm4, %ymm2
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm4
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm10, %ymm2, %ymm2
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm9, %ymm2
+vmovdqa 768(%rsp), %ymm9
+vpaddw 1024(%rsp), %ymm9, %ymm10
+vpsubw 1024(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vmovdqa 1280(%rsp), %ymm4
+vpsubw %ymm11, %ymm4, %ymm4
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpmullw %ymm12, %ymm10, %ymm4
+vpaddw %ymm4, %ymm2, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm10, %ymm10
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm3, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm3
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vpaddw 2304(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm3, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2560(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm3, 2560(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 32(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 384(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 736(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1088(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm4
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm10, %ymm10
+vmovdqa 288(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm2
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm3
+vpaddd %ymm6, %ymm9, %ymm8
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm3, %ymm8
+vmovdqa 800(%rsp), %ymm3
+vpaddw 1056(%rsp), %ymm3, %ymm2
+vpsubw 1056(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1312(%rsp), %ymm9
+vpsubw %ymm5, %ymm9, %ymm9
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm9, %ymm10
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm9
+vpaddw %ymm9, %ymm8, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm10, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm10
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm2, %ymm2
+vpaddw 2080(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm10, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm10
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw 2336(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm10, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm10
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm4
+vpor %ymm4, %ymm11, %ymm11
+vpaddw 2592(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm10, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 120(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 472(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 824(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1176(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm2, %ymm2
+vmovdqa 320(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm8
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm10
+vpaddd %ymm6, %ymm3, %ymm4
+vpsubd %ymm9, %ymm10, %ymm10
+vpsubd %ymm2, %ymm4, %ymm4
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm3, %ymm10, %ymm10
+vpsubd %ymm8, %ymm4, %ymm4
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm10, %ymm4
+vmovdqa 832(%rsp), %ymm10
+vpaddw 1088(%rsp), %ymm10, %ymm8
+vpsubw 1088(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm4, %ymm8, %ymm8
+vmovdqa 1344(%rsp), %ymm3
+vpsubw %ymm11, %ymm3, %ymm3
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm3, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm4
+vpmullw %ymm12, %ymm8, %ymm3
+vpaddw %ymm3, %ymm4, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm2
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm8, %ymm8
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm2, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm3, %ymm3
+vpaddw 2368(%rsp), %ymm6, %ymm6
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm2, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm9
+vpor %ymm9, %ymm5, %ymm5
+vpaddw 2624(%rsp), %ymm4, %ymm4
+vpaddw %ymm5, %ymm4, %ymm4
+vmovdqa %xmm2, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 208(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 560(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %ymm4, 912(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1264(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm3
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm8, %ymm8
+vmovdqa 352(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm4
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm4, %ymm2
+vpaddd %ymm6, %ymm10, %ymm9
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm8, %ymm9, %ymm9
+vpsubd %ymm11, %ymm4, %ymm11
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm10, %ymm2, %ymm2
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm2, %ymm9
+vmovdqa 864(%rsp), %ymm2
+vpaddw 1120(%rsp), %ymm2, %ymm4
+vpsubw 1120(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpsllw $7, %ymm11, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1376(%rsp), %ymm10
+vpsubw %ymm5, %ymm10, %ymm10
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm10, %ymm8
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm10
+vpaddw %ymm10, %ymm9, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm8, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm8
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm4, %ymm4
+vpaddw 2144(%rsp), %ymm5, %ymm5
+vpaddw %ymm4, %ymm5, %ymm5
+vmovdqa %xmm8, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm8
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm10, %ymm10
+vpaddw 2400(%rsp), %ymm6, %ymm6
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm8, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm3
+vpor %ymm3, %ymm11, %ymm11
+vpaddw 2656(%rsp), %ymm9, %ymm9
+vpaddw %ymm11, %ymm9, %ymm9
+vmovdqa %xmm8, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 296(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 648(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %ymm9, 1000(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 1352(%rdi)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm10
+vpunpckhwd const0(%rip), %ymm11, %ymm4
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm4, %ymm4
+vmovdqa 384(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm9
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm9, %ymm8
+vpaddd %ymm6, %ymm2, %ymm3
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm5, %ymm9, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm8, %ymm3
+vmovdqa 896(%rsp), %ymm8
+vpaddw 1152(%rsp), %ymm8, %ymm9
+vpsubw 1152(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpsllw $7, %ymm5, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vmovdqa 1408(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpmullw %ymm12, %ymm9, %ymm2
+vpaddw %ymm2, %ymm3, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vmovdqu 384(%rdi), %ymm4
+vmovdqu 736(%rdi), %ymm10
+vmovdqu 1088(%rdi), %ymm7
+vpaddw %ymm11, %ymm4, %ymm11
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm3, %ymm7, %ymm3
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm7
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm8, %ymm8
+vmovdqu 32(%rdi), %ymm10
+vpaddw 1920(%rsp), %ymm10, %ymm10
+vpaddw %ymm8, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 32(%rdi)
+vmovdqa %xmm7, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm7
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm9, %ymm9
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpaddw %ymm9, %ymm11, %ymm11
+vmovdqa %xmm7, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm7
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm2, %ymm2
+vpaddw 2432(%rsp), %ymm6, %ymm6
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm7, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm7
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm10
+vpor %ymm10, %ymm5, %ymm5
+vpaddw 2688(%rsp), %ymm3, %ymm3
+vpaddw %ymm5, %ymm3, %ymm3
+vmovdqa %xmm7, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 384(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 736(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %ymm3, 1088(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm9
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm9, %ymm9
+vmovdqa 416(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm3
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm3, %ymm7
+vpaddd %ymm6, %ymm8, %ymm10
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm9, %ymm10, %ymm10
+vpsubd %ymm11, %ymm3, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm8, %ymm7, %ymm7
+vpsubd %ymm3, %ymm10, %ymm10
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpackusdw %ymm10, %ymm7, %ymm10
+vmovdqa 928(%rsp), %ymm7
+vpaddw 1184(%rsp), %ymm7, %ymm3
+vpsubw 1184(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpsllw $7, %ymm11, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm10, %ymm3, %ymm3
+vmovdqa 1440(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm10, %ymm10
+vpmullw %ymm12, %ymm3, %ymm8
+vpaddw %ymm8, %ymm10, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vmovdqu 472(%rdi), %ymm9
+vmovdqu 824(%rdi), %ymm2
+vmovdqu 1176(%rdi), %ymm4
+vpaddw %ymm5, %ymm9, %ymm5
+vpaddw %ymm6, %ymm2, %ymm6
+vpaddw %ymm10, %ymm4, %ymm10
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm4
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm7, %ymm7
+vmovdqu 120(%rdi), %ymm2
+vpaddw 1952(%rsp), %ymm2, %ymm2
+vpaddw %ymm7, %ymm2, %ymm2
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 120(%rdi)
+vmovdqa %xmm4, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_3_5(%rip), %ymm3, %ymm4
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm3, %ymm3
+vpaddw 2208(%rsp), %ymm5, %ymm5
+vpaddw %ymm3, %ymm5, %ymm5
+vmovdqa %xmm4, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_3_5(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw 2464(%rsp), %ymm6, %ymm6
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm4, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm4
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw 2720(%rsp), %ymm10, %ymm10
+vpaddw %ymm11, %ymm10, %ymm10
+vmovdqa %xmm4, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 472(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 824(%rdi)
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %ymm10, 1176(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm3
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm3, %ymm3
+vmovdqa 448(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm10
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm10, %ymm4
+vpaddd %ymm6, %ymm7, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm5, %ymm10, %ymm5
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm10, %ymm10
+vpsubd %ymm7, %ymm4, %ymm4
+vpsubd %ymm10, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm4, %ymm2
+vmovdqa 960(%rsp), %ymm4
+vpaddw 1216(%rsp), %ymm4, %ymm10
+vpsubw 1216(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm7
+vpsubw %ymm7, %ymm10, %ymm7
+vpsllw $7, %ymm5, %ymm10
+vpsubw %ymm10, %ymm7, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm2, %ymm10, %ymm10
+vmovdqa 1472(%rsp), %ymm7
+vpsubw %ymm11, %ymm7, %ymm7
+vpmullw %ymm15, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm2
+vpmullw %ymm12, %ymm10, %ymm7
+vpaddw %ymm7, %ymm2, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vmovdqu 560(%rdi), %ymm3
+vmovdqu 912(%rdi), %ymm8
+vmovdqu 1264(%rdi), %ymm9
+vpaddw %ymm11, %ymm3, %ymm11
+vpaddw %ymm6, %ymm8, %ymm6
+vpaddw %ymm2, %ymm9, %ymm2
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vmovdqu 208(%rdi), %ymm8
+vpaddw 1984(%rsp), %ymm8, %ymm8
+vpaddw %ymm4, %ymm8, %ymm8
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 208(%rdi)
+vmovdqa %xmm9, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_3_5(%rip), %ymm10, %ymm9
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm10, %ymm10
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpaddw %ymm10, %ymm11, %ymm11
+vmovdqa %xmm9, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_3_5(%rip), %ymm7, %ymm9
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm7, %ymm7
+vpaddw 2496(%rsp), %ymm6, %ymm6
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm9, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_3_5(%rip), %ymm5, %ymm9
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $206, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2752(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm9, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 560(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 912(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %ymm2, 1264(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm10
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm10, %ymm10
+vmovdqa 480(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm2
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm9
+vpaddd %ymm6, %ymm4, %ymm8
+vpsubd %ymm7, %ymm9, %ymm9
+vpsubd %ymm10, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm9, %ymm8
+vmovdqa 992(%rsp), %ymm9
+vpaddw 1248(%rsp), %ymm9, %ymm2
+vpsubw 1248(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1504(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm10
+vpsubw %ymm10, %ymm4, %ymm10
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm4
+vpaddw %ymm4, %ymm8, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm10, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vmovdqu 648(%rdi), %ymm10
+vmovdqu 1000(%rdi), %ymm7
+vmovdqu 1352(%rdi), %ymm3
+vpaddw %ymm5, %ymm10, %ymm5
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm8, %ymm3, %ymm8
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_3_5(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm9, %ymm9
+vmovdqu 296(%rdi), %ymm7
+vpaddw 2016(%rsp), %ymm7, %ymm7
+vpaddw %ymm9, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %ymm7, 296(%rdi)
+vmovdqa %xmm3, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_3_5(%rip), %ymm2, %ymm3
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm2, %ymm2
+vpaddw 2272(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm3, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_3_5(%rip), %ymm4, %ymm3
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw 2528(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm3, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_3_5(%rip), %ymm11, %ymm3
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $206, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm7
+vpor %ymm7, %ymm11, %ymm11
+vpaddw 2784(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm3, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %ymm5, 648(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %ymm6, 1000(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %ymm8, 1352(%rdi)
+vmovdqa 160(%r12), %ymm0
+vpsubw 256(%r12), %ymm0, %ymm0
+vmovdqa 544(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 352(%r12), %ymm1, %ymm1
+vpsubw 64(%r12), %ymm0, %ymm0
+vpaddw 448(%r12), %ymm0, %ymm0
+vmovdqa 736(%r12), %ymm2
+vpsubw 832(%r12), %ymm2, %ymm2
+vmovdqa 1120(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 928(%r12), %ymm3, %ymm3
+vpsubw 640(%r12), %ymm2, %ymm2
+vpaddw 1024(%r12), %ymm2, %ymm2
+vmovdqa 1312(%r12), %ymm4
+vpsubw 1408(%r12), %ymm4, %ymm4
+vmovdqa 1696(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 1504(%r12), %ymm5, %ymm5
+vpsubw 1216(%r12), %ymm4, %ymm4
+vpaddw 1600(%r12), %ymm4, %ymm4
+vpsubw 640(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 64(%r12), %ymm1, %ymm1
+vpaddw 1216(%r12), %ymm1, %ymm1
+vmovdqa 352(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 1504(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 928(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 64(%r12), %ymm8
+vmovdqa 928(%r12), %ymm9
+vmovdqa %ymm8, 0(%rsp)
+vmovdqa %ymm0, 32(%rsp)
+vmovdqa %ymm1, 64(%rsp)
+vmovdqa %ymm7, 96(%rsp)
+vmovdqa %ymm5, 128(%rsp)
+vmovdqa %ymm2, 160(%rsp)
+vmovdqa %ymm3, 192(%rsp)
+vmovdqa %ymm9, 224(%rsp)
+vmovdqa 1888(%r12), %ymm0
+vpsubw 1984(%r12), %ymm0, %ymm0
+vmovdqa 2272(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 2080(%r12), %ymm1, %ymm1
+vpsubw 1792(%r12), %ymm0, %ymm0
+vpaddw 2176(%r12), %ymm0, %ymm0
+vmovdqa 2464(%r12), %ymm2
+vpsubw 2560(%r12), %ymm2, %ymm2
+vmovdqa 2848(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 2656(%r12), %ymm3, %ymm3
+vpsubw 2368(%r12), %ymm2, %ymm2
+vpaddw 2752(%r12), %ymm2, %ymm2
+vmovdqa 3040(%r12), %ymm4
+vpsubw 3136(%r12), %ymm4, %ymm4
+vmovdqa 3424(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 3232(%r12), %ymm5, %ymm5
+vpsubw 2944(%r12), %ymm4, %ymm4
+vpaddw 3328(%r12), %ymm4, %ymm4
+vpsubw 2368(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 1792(%r12), %ymm1, %ymm1
+vpaddw 2944(%r12), %ymm1, %ymm1
+vmovdqa 2080(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 3232(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 2656(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 1792(%r12), %ymm8
+vmovdqa 2656(%r12), %ymm9
+vmovdqa %ymm8, 256(%rsp)
+vmovdqa %ymm0, 288(%rsp)
+vmovdqa %ymm1, 320(%rsp)
+vmovdqa %ymm7, 352(%rsp)
+vmovdqa %ymm5, 384(%rsp)
+vmovdqa %ymm2, 416(%rsp)
+vmovdqa %ymm3, 448(%rsp)
+vmovdqa %ymm9, 480(%rsp)
+vmovdqa 3616(%r12), %ymm0
+vpsubw 3712(%r12), %ymm0, %ymm0
+vmovdqa 4000(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 3808(%r12), %ymm1, %ymm1
+vpsubw 3520(%r12), %ymm0, %ymm0
+vpaddw 3904(%r12), %ymm0, %ymm0
+vmovdqa 4192(%r12), %ymm2
+vpsubw 4288(%r12), %ymm2, %ymm2
+vmovdqa 4576(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 4384(%r12), %ymm3, %ymm3
+vpsubw 4096(%r12), %ymm2, %ymm2
+vpaddw 4480(%r12), %ymm2, %ymm2
+vmovdqa 4768(%r12), %ymm4
+vpsubw 4864(%r12), %ymm4, %ymm4
+vmovdqa 5152(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 4960(%r12), %ymm5, %ymm5
+vpsubw 4672(%r12), %ymm4, %ymm4
+vpaddw 5056(%r12), %ymm4, %ymm4
+vpsubw 4096(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 3520(%r12), %ymm1, %ymm1
+vpaddw 4672(%r12), %ymm1, %ymm1
+vmovdqa 3808(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 4960(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 4384(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 3520(%r12), %ymm8
+vmovdqa 4384(%r12), %ymm9
+vmovdqa %ymm8, 512(%rsp)
+vmovdqa %ymm0, 544(%rsp)
+vmovdqa %ymm1, 576(%rsp)
+vmovdqa %ymm7, 608(%rsp)
+vmovdqa %ymm5, 640(%rsp)
+vmovdqa %ymm2, 672(%rsp)
+vmovdqa %ymm3, 704(%rsp)
+vmovdqa %ymm9, 736(%rsp)
+vmovdqa 5344(%r12), %ymm0
+vpsubw 5440(%r12), %ymm0, %ymm0
+vmovdqa 5728(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 5536(%r12), %ymm1, %ymm1
+vpsubw 5248(%r12), %ymm0, %ymm0
+vpaddw 5632(%r12), %ymm0, %ymm0
+vmovdqa 5920(%r12), %ymm2
+vpsubw 6016(%r12), %ymm2, %ymm2
+vmovdqa 6304(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 6112(%r12), %ymm3, %ymm3
+vpsubw 5824(%r12), %ymm2, %ymm2
+vpaddw 6208(%r12), %ymm2, %ymm2
+vmovdqa 6496(%r12), %ymm4
+vpsubw 6592(%r12), %ymm4, %ymm4
+vmovdqa 6880(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 6688(%r12), %ymm5, %ymm5
+vpsubw 6400(%r12), %ymm4, %ymm4
+vpaddw 6784(%r12), %ymm4, %ymm4
+vpsubw 5824(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 5248(%r12), %ymm1, %ymm1
+vpaddw 6400(%r12), %ymm1, %ymm1
+vmovdqa 5536(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 6688(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 6112(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 5248(%r12), %ymm8
+vmovdqa 6112(%r12), %ymm9
+vmovdqa %ymm8, 768(%rsp)
+vmovdqa %ymm0, 800(%rsp)
+vmovdqa %ymm1, 832(%rsp)
+vmovdqa %ymm7, 864(%rsp)
+vmovdqa %ymm5, 896(%rsp)
+vmovdqa %ymm2, 928(%rsp)
+vmovdqa %ymm3, 960(%rsp)
+vmovdqa %ymm9, 992(%rsp)
+vmovdqa 7072(%r12), %ymm0
+vpsubw 7168(%r12), %ymm0, %ymm0
+vmovdqa 7456(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 7264(%r12), %ymm1, %ymm1
+vpsubw 6976(%r12), %ymm0, %ymm0
+vpaddw 7360(%r12), %ymm0, %ymm0
+vmovdqa 7648(%r12), %ymm2
+vpsubw 7744(%r12), %ymm2, %ymm2
+vmovdqa 8032(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 7840(%r12), %ymm3, %ymm3
+vpsubw 7552(%r12), %ymm2, %ymm2
+vpaddw 7936(%r12), %ymm2, %ymm2
+vmovdqa 8224(%r12), %ymm4
+vpsubw 8320(%r12), %ymm4, %ymm4
+vmovdqa 8608(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 8416(%r12), %ymm5, %ymm5
+vpsubw 8128(%r12), %ymm4, %ymm4
+vpaddw 8512(%r12), %ymm4, %ymm4
+vpsubw 7552(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 6976(%r12), %ymm1, %ymm1
+vpaddw 8128(%r12), %ymm1, %ymm1
+vmovdqa 7264(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 8416(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 7840(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 6976(%r12), %ymm8
+vmovdqa 7840(%r12), %ymm9
+vmovdqa %ymm8, 1024(%rsp)
+vmovdqa %ymm0, 1056(%rsp)
+vmovdqa %ymm1, 1088(%rsp)
+vmovdqa %ymm7, 1120(%rsp)
+vmovdqa %ymm5, 1152(%rsp)
+vmovdqa %ymm2, 1184(%rsp)
+vmovdqa %ymm3, 1216(%rsp)
+vmovdqa %ymm9, 1248(%rsp)
+vmovdqa 8800(%r12), %ymm0
+vpsubw 8896(%r12), %ymm0, %ymm0
+vmovdqa 9184(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 8992(%r12), %ymm1, %ymm1
+vpsubw 8704(%r12), %ymm0, %ymm0
+vpaddw 9088(%r12), %ymm0, %ymm0
+vmovdqa 9376(%r12), %ymm2
+vpsubw 9472(%r12), %ymm2, %ymm2
+vmovdqa 9760(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 9568(%r12), %ymm3, %ymm3
+vpsubw 9280(%r12), %ymm2, %ymm2
+vpaddw 9664(%r12), %ymm2, %ymm2
+vmovdqa 9952(%r12), %ymm4
+vpsubw 10048(%r12), %ymm4, %ymm4
+vmovdqa 10336(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 10144(%r12), %ymm5, %ymm5
+vpsubw 9856(%r12), %ymm4, %ymm4
+vpaddw 10240(%r12), %ymm4, %ymm4
+vpsubw 9280(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 8704(%r12), %ymm1, %ymm1
+vpaddw 9856(%r12), %ymm1, %ymm1
+vmovdqa 8992(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 10144(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 9568(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 8704(%r12), %ymm8
+vmovdqa 9568(%r12), %ymm9
+vmovdqa %ymm8, 1280(%rsp)
+vmovdqa %ymm0, 1312(%rsp)
+vmovdqa %ymm1, 1344(%rsp)
+vmovdqa %ymm7, 1376(%rsp)
+vmovdqa %ymm5, 1408(%rsp)
+vmovdqa %ymm2, 1440(%rsp)
+vmovdqa %ymm3, 1472(%rsp)
+vmovdqa %ymm9, 1504(%rsp)
+vmovdqa 10528(%r12), %ymm0
+vpsubw 10624(%r12), %ymm0, %ymm0
+vmovdqa 10912(%r12), %ymm1
+vpsubw %ymm0, %ymm1, %ymm1
+vpsubw 10720(%r12), %ymm1, %ymm1
+vpsubw 10432(%r12), %ymm0, %ymm0
+vpaddw 10816(%r12), %ymm0, %ymm0
+vmovdqa 11104(%r12), %ymm2
+vpsubw 11200(%r12), %ymm2, %ymm2
+vmovdqa 11488(%r12), %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw 11296(%r12), %ymm3, %ymm3
+vpsubw 11008(%r12), %ymm2, %ymm2
+vpaddw 11392(%r12), %ymm2, %ymm2
+vmovdqa 11680(%r12), %ymm4
+vpsubw 11776(%r12), %ymm4, %ymm4
+vmovdqa 12064(%r12), %ymm5
+vpsubw %ymm4, %ymm5, %ymm5
+vpsubw 11872(%r12), %ymm5, %ymm5
+vpsubw 11584(%r12), %ymm4, %ymm4
+vpaddw 11968(%r12), %ymm4, %ymm4
+vpsubw 11008(%r12), %ymm1, %ymm1
+vpsubw %ymm1, %ymm5, %ymm5
+vpsubw %ymm3, %ymm5, %ymm5
+vpsubw 10432(%r12), %ymm1, %ymm1
+vpaddw 11584(%r12), %ymm1, %ymm1
+vmovdqa 10720(%r12), %ymm6
+vpsubw %ymm2, %ymm6, %ymm7
+vmovdqa 11872(%r12), %ymm2
+vpsubw %ymm7, %ymm2, %ymm2
+vpsubw 11296(%r12), %ymm2, %ymm2
+vpsubw %ymm0, %ymm7, %ymm7
+vpaddw %ymm4, %ymm7, %ymm7
+vmovdqa 10432(%r12), %ymm8
+vmovdqa 11296(%r12), %ymm9
+vmovdqa %ymm8, 1536(%rsp)
+vmovdqa %ymm0, 1568(%rsp)
+vmovdqa %ymm1, 1600(%rsp)
+vmovdqa %ymm7, 1632(%rsp)
+vmovdqa %ymm5, 1664(%rsp)
+vmovdqa %ymm2, 1696(%rsp)
+vmovdqa %ymm3, 1728(%rsp)
+vmovdqa %ymm9, 1760(%rsp)
+vmovdqa 0(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vmovdqa 256(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm8
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 512(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm3
+vpaddd %ymm6, %ymm9, %ymm7
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1536(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm8, %ymm7, %ymm7
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm3, %ymm7
+vmovdqa 768(%rsp), %ymm3
+vpaddw 1024(%rsp), %ymm3, %ymm8
+vpsubw 1024(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vmovdqa 1280(%rsp), %ymm9
+vpsubw %ymm11, %ymm9, %ymm9
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm9, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpmullw %ymm12, %ymm8, %ymm9
+vpaddw %ymm9, %ymm7, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm2, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm2
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm8, %ymm8
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm2, 2048(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vpaddw 2304(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm2, 2304(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm2
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw 2560(%rsp), %ymm7, %ymm7
+vpaddw %ymm5, %ymm7, %ymm7
+vmovdqa %xmm2, 2560(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 64(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 80(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 416(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 432(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 768(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 784(%rdi)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %xmm3, 1120(%rdi)
+vextracti128 $1, %ymm3, %xmm3
+vmovq %xmm3, 1136(%rdi)
+vmovdqa 32(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm9
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm8, %ymm8
+vmovdqa 288(%rsp), %ymm3
+vpunpcklwd const0(%rip), %ymm3, %ymm7
+vpunpckhwd const0(%rip), %ymm3, %ymm3
+vmovdqa 544(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm7, %ymm2
+vpaddd %ymm6, %ymm3, %ymm4
+vpsubd %ymm9, %ymm2, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm11, %ymm7, %ymm11
+vpsubd %ymm6, %ymm3, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1568(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm3
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm3, %ymm2, %ymm2
+vpsubd %ymm7, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm2, %ymm4
+vmovdqa 800(%rsp), %ymm2
+vpaddw 1056(%rsp), %ymm2, %ymm7
+vpsubw 1056(%rsp), %ymm2, %ymm2
+vpsrlw $2, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsllw $1, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpsllw $7, %ymm11, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vmovdqa 1312(%rsp), %ymm3
+vpsubw %ymm5, %ymm3, %ymm3
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm3, %ymm8
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpmullw %ymm12, %ymm7, %ymm3
+vpaddw %ymm3, %ymm4, %ymm3
+vpmullw %ymm12, %ymm3, %ymm3
+vpsubw %ymm3, %ymm8, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpmullw %ymm13, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm8
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm7, %ymm7
+vpaddw 2080(%rsp), %ymm5, %ymm5
+vpaddw %ymm7, %ymm5, %ymm5
+vmovdqa %xmm8, 2080(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm8
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm3, %ymm3
+vpaddw 2336(%rsp), %ymm6, %ymm6
+vpaddw %ymm3, %ymm6, %ymm6
+vmovdqa %xmm8, 2336(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm8
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm8, %ymm8
+vpand mask_keephigh(%rip), %ymm8, %ymm9
+vpor %ymm9, %ymm11, %ymm11
+vpaddw 2592(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm8, 2592(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 152(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 168(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 504(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 520(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 856(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vmovq %xmm4, 872(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %xmm2, 1208(%rdi)
+vextracti128 $1, %ymm2, %xmm2
+vmovq %xmm2, 1224(%rdi)
+vmovdqa 64(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm3
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpslld $1, %ymm7, %ymm7
+vmovdqa 320(%rsp), %ymm2
+vpunpcklwd const0(%rip), %ymm2, %ymm4
+vpunpckhwd const0(%rip), %ymm2, %ymm2
+vmovdqa 576(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm4, %ymm8
+vpaddd %ymm6, %ymm2, %ymm9
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm7, %ymm9, %ymm9
+vpsubd %ymm5, %ymm4, %ymm5
+vpsubd %ymm6, %ymm2, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1600(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpsubd %ymm2, %ymm8, %ymm8
+vpsubd %ymm4, %ymm9, %ymm9
+vpsrld $1, %ymm8, %ymm8
+vpsrld $1, %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpackusdw %ymm9, %ymm8, %ymm9
+vmovdqa 832(%rsp), %ymm8
+vpaddw 1088(%rsp), %ymm8, %ymm4
+vpsubw 1088(%rsp), %ymm8, %ymm8
+vpsrlw $2, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsllw $1, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsllw $7, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm9, %ymm4, %ymm4
+vmovdqa 1344(%rsp), %ymm2
+vpsubw %ymm11, %ymm2, %ymm2
+vpmullw %ymm15, %ymm5, %ymm7
+vpsubw %ymm7, %ymm2, %ymm7
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm4, %ymm9, %ymm9
+vpmullw %ymm12, %ymm4, %ymm2
+vpaddw %ymm2, %ymm9, %ymm2
+vpmullw %ymm12, %ymm2, %ymm2
+vpsubw %ymm2, %ymm7, %ymm2
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm6, %ymm2, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpmullw %ymm13, %ymm2, %ymm2
+vpsubw %ymm2, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm7
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm4, %ymm4
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw %ymm4, %ymm11, %ymm11
+vmovdqa %xmm7, 2112(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_4_3_1(%rip), %ymm2, %ymm7
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm2, %ymm2
+vpaddw 2368(%rsp), %ymm6, %ymm6
+vpaddw %ymm2, %ymm6, %ymm6
+vmovdqa %xmm7, 2368(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm7
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm7, %ymm7
+vpand mask_keephigh(%rip), %ymm7, %ymm3
+vpor %ymm3, %ymm5, %ymm5
+vpaddw 2624(%rsp), %ymm9, %ymm9
+vpaddw %ymm5, %ymm9, %ymm9
+vmovdqa %xmm7, 2624(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 240(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 256(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 592(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 608(%rdi)
+vpand mask_mod8192(%rip), %ymm9, %ymm9
+vmovdqu %xmm9, 944(%rdi)
+vextracti128 $1, %ymm9, %xmm9
+vmovq %xmm9, 960(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 1296(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 1312(%rdi)
+vmovdqa 96(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm2
+vpunpckhwd const0(%rip), %ymm5, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpslld $1, %ymm4, %ymm4
+vmovdqa 352(%rsp), %ymm8
+vpunpcklwd const0(%rip), %ymm8, %ymm9
+vpunpckhwd const0(%rip), %ymm8, %ymm8
+vmovdqa 608(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm9, %ymm7
+vpaddd %ymm6, %ymm8, %ymm3
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm4, %ymm3, %ymm3
+vpsubd %ymm11, %ymm9, %ymm11
+vpsubd %ymm6, %ymm8, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1632(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm9, %ymm9
+vpsubd %ymm8, %ymm7, %ymm7
+vpsubd %ymm9, %ymm3, %ymm3
+vpsrld $1, %ymm7, %ymm7
+vpsrld $1, %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpackusdw %ymm3, %ymm7, %ymm3
+vmovdqa 864(%rsp), %ymm7
+vpaddw 1120(%rsp), %ymm7, %ymm9
+vpsubw 1120(%rsp), %ymm7, %ymm7
+vpsrlw $2, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsllw $1, %ymm5, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpsllw $7, %ymm11, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vmovdqa 1376(%rsp), %ymm8
+vpsubw %ymm5, %ymm8, %ymm8
+vpmullw %ymm15, %ymm11, %ymm4
+vpsubw %ymm4, %ymm8, %ymm4
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpmullw %ymm12, %ymm9, %ymm8
+vpaddw %ymm8, %ymm3, %ymm8
+vpmullw %ymm12, %ymm8, %ymm8
+vpsubw %ymm8, %ymm4, %ymm8
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm6, %ymm8, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vpmullw %ymm13, %ymm8, %ymm8
+vpsubw %ymm8, %ymm6, %ymm6
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm4
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm9, %ymm9
+vpaddw 2144(%rsp), %ymm5, %ymm5
+vpaddw %ymm9, %ymm5, %ymm5
+vmovdqa %xmm4, 2144(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm4
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm8, %ymm8
+vpaddw 2400(%rsp), %ymm6, %ymm6
+vpaddw %ymm8, %ymm6, %ymm6
+vmovdqa %xmm4, 2400(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm4
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm4, %ymm4
+vpand mask_keephigh(%rip), %ymm4, %ymm2
+vpor %ymm2, %ymm11, %ymm11
+vpaddw 2656(%rsp), %ymm3, %ymm3
+vpaddw %ymm11, %ymm3, %ymm3
+vmovdqa %xmm4, 2656(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 328(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 344(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm5, %ymm5
+vmovdqa %xmm5, 1792(%rsp)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 680(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 696(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm6, %ymm6
+vmovdqa %xmm6, 1824(%rsp)
+vpand mask_mod8192(%rip), %ymm3, %ymm3
+vmovdqu %xmm3, 1032(%rdi)
+vextracti128 $1, %ymm3, %xmm3
+vmovq %xmm3, 1048(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm3, %ymm3
+vmovdqa %xmm3, 1856(%rsp)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 1384(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vpextrw $0, %xmm7, 1400(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm7, %ymm7
+vmovdqa %xmm7, 1888(%rsp)
+vmovdqa 128(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm8
+vpunpckhwd const0(%rip), %ymm11, %ymm9
+vpslld $1, %ymm8, %ymm8
+vpslld $1, %ymm9, %ymm9
+vmovdqa 384(%rsp), %ymm7
+vpunpcklwd const0(%rip), %ymm7, %ymm3
+vpunpckhwd const0(%rip), %ymm7, %ymm7
+vmovdqa 640(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm3, %ymm4
+vpaddd %ymm6, %ymm7, %ymm2
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm9, %ymm2, %ymm2
+vpsubd %ymm5, %ymm3, %ymm5
+vpsubd %ymm6, %ymm7, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1664(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm3, %ymm3
+vpsubd %ymm7, %ymm4, %ymm4
+vpsubd %ymm3, %ymm2, %ymm2
+vpsrld $1, %ymm4, %ymm4
+vpsrld $1, %ymm2, %ymm2
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm2, %ymm2
+vpackusdw %ymm2, %ymm4, %ymm2
+vmovdqa 896(%rsp), %ymm4
+vpaddw 1152(%rsp), %ymm4, %ymm3
+vpsubw 1152(%rsp), %ymm4, %ymm4
+vpsrlw $2, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsllw $1, %ymm11, %ymm7
+vpsubw %ymm7, %ymm3, %ymm7
+vpsllw $7, %ymm5, %ymm3
+vpsubw %ymm3, %ymm7, %ymm3
+vpsrlw $3, %ymm3, %ymm3
+vpsubw %ymm2, %ymm3, %ymm3
+vmovdqa 1408(%rsp), %ymm7
+vpsubw %ymm11, %ymm7, %ymm7
+vpmullw %ymm15, %ymm5, %ymm9
+vpsubw %ymm9, %ymm7, %ymm9
+vpmullw %ymm14, %ymm3, %ymm3
+vpsubw %ymm3, %ymm2, %ymm2
+vpmullw %ymm12, %ymm3, %ymm7
+vpaddw %ymm7, %ymm2, %ymm7
+vpmullw %ymm12, %ymm7, %ymm7
+vpsubw %ymm7, %ymm9, %ymm7
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm6, %ymm7, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vpmullw %ymm13, %ymm7, %ymm7
+vpsubw %ymm7, %ymm6, %ymm6
+vmovdqu 416(%rdi), %ymm9
+vmovdqu 768(%rdi), %ymm8
+vmovdqu 1120(%rdi), %ymm10
+vpaddw %ymm11, %ymm9, %ymm11
+vpaddw %ymm6, %ymm8, %ymm6
+vpaddw %ymm2, %ymm10, %ymm2
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm10
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm4, %ymm4
+vmovdqu 64(%rdi), %ymm8
+vpaddw 1920(%rsp), %ymm8, %ymm8
+vpaddw %ymm4, %ymm8, %ymm8
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 64(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 80(%rdi)
+vmovdqa %xmm10, 1920(%rsp)
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm10
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm3, %ymm3
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpaddw %ymm3, %ymm11, %ymm11
+vmovdqa %xmm10, 2176(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm10
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm7, %ymm7
+vpaddw 2432(%rsp), %ymm6, %ymm6
+vpaddw %ymm7, %ymm6, %ymm6
+vmovdqa %xmm10, 2432(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm10
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm10, %ymm10
+vpand mask_keephigh(%rip), %ymm10, %ymm8
+vpor %ymm8, %ymm5, %ymm5
+vpaddw 2688(%rsp), %ymm2, %ymm2
+vpaddw %ymm5, %ymm2, %ymm2
+vmovdqa %xmm10, 2688(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 416(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 432(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 768(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 784(%rdi)
+vpand mask_mod8192(%rip), %ymm2, %ymm2
+vmovdqu %xmm2, 1120(%rdi)
+vextracti128 $1, %ymm2, %xmm2
+vmovq %xmm2, 1136(%rdi)
+vmovdqa 160(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm7
+vpunpckhwd const0(%rip), %ymm5, %ymm3
+vpslld $1, %ymm7, %ymm7
+vpslld $1, %ymm3, %ymm3
+vmovdqa 416(%rsp), %ymm4
+vpunpcklwd const0(%rip), %ymm4, %ymm2
+vpunpckhwd const0(%rip), %ymm4, %ymm4
+vmovdqa 672(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm2, %ymm10
+vpaddd %ymm6, %ymm4, %ymm8
+vpsubd %ymm7, %ymm10, %ymm10
+vpsubd %ymm3, %ymm8, %ymm8
+vpsubd %ymm11, %ymm2, %ymm11
+vpsubd %ymm6, %ymm4, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1696(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vpsubd %ymm4, %ymm10, %ymm10
+vpsubd %ymm2, %ymm8, %ymm8
+vpsrld $1, %ymm10, %ymm10
+vpsrld $1, %ymm8, %ymm8
+vpand mask32_to_16(%rip), %ymm10, %ymm10
+vpand mask32_to_16(%rip), %ymm8, %ymm8
+vpackusdw %ymm8, %ymm10, %ymm8
+vmovdqa 928(%rsp), %ymm10
+vpaddw 1184(%rsp), %ymm10, %ymm2
+vpsubw 1184(%rsp), %ymm10, %ymm10
+vpsrlw $2, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsllw $1, %ymm5, %ymm4
+vpsubw %ymm4, %ymm2, %ymm4
+vpsllw $7, %ymm11, %ymm2
+vpsubw %ymm2, %ymm4, %ymm2
+vpsrlw $3, %ymm2, %ymm2
+vpsubw %ymm8, %ymm2, %ymm2
+vmovdqa 1440(%rsp), %ymm4
+vpsubw %ymm5, %ymm4, %ymm4
+vpmullw %ymm15, %ymm11, %ymm3
+vpsubw %ymm3, %ymm4, %ymm3
+vpmullw %ymm14, %ymm2, %ymm2
+vpsubw %ymm2, %ymm8, %ymm8
+vpmullw %ymm12, %ymm2, %ymm4
+vpaddw %ymm4, %ymm8, %ymm4
+vpmullw %ymm12, %ymm4, %ymm4
+vpsubw %ymm4, %ymm3, %ymm4
+vpmullw %ymm14, %ymm4, %ymm4
+vpsubw %ymm6, %ymm4, %ymm4
+vpsrlw $3, %ymm4, %ymm4
+vpsubw %ymm10, %ymm4, %ymm4
+vpsubw %ymm4, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vpmullw %ymm13, %ymm4, %ymm4
+vpsubw %ymm4, %ymm6, %ymm6
+vmovdqu 504(%rdi), %ymm3
+vmovdqu 856(%rdi), %ymm7
+vmovdqu 1208(%rdi), %ymm9
+vpaddw %ymm5, %ymm3, %ymm5
+vpaddw %ymm6, %ymm7, %ymm6
+vpaddw %ymm8, %ymm9, %ymm8
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_4_3_1(%rip), %ymm10, %ymm9
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm10, %ymm10
+vmovdqu 152(%rdi), %ymm7
+vpaddw 1952(%rsp), %ymm7, %ymm7
+vpaddw %ymm10, %ymm7, %ymm7
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 152(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 168(%rdi)
+vmovdqa %xmm9, 1952(%rsp)
+vpshufb shuf48_16(%rip), %ymm2, %ymm2
+vpand mask3_5_4_3_1(%rip), %ymm2, %ymm9
+vpand mask5_3_5_3(%rip), %ymm2, %ymm2
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm2, %ymm2
+vpaddw 2208(%rsp), %ymm5, %ymm5
+vpaddw %ymm2, %ymm5, %ymm5
+vmovdqa %xmm9, 2208(%rsp)
+vpshufb shuf48_16(%rip), %ymm4, %ymm4
+vpand mask3_5_4_3_1(%rip), %ymm4, %ymm9
+vpand mask5_3_5_3(%rip), %ymm4, %ymm4
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm4, %ymm4
+vpaddw 2464(%rsp), %ymm6, %ymm6
+vpaddw %ymm4, %ymm6, %ymm6
+vmovdqa %xmm9, 2464(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm9
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm9, %ymm9
+vpand mask_keephigh(%rip), %ymm9, %ymm7
+vpor %ymm7, %ymm11, %ymm11
+vpaddw 2720(%rsp), %ymm8, %ymm8
+vpaddw %ymm11, %ymm8, %ymm8
+vmovdqa %xmm9, 2720(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 504(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 520(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 856(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 872(%rdi)
+vpand mask_mod8192(%rip), %ymm8, %ymm8
+vmovdqu %xmm8, 1208(%rdi)
+vextracti128 $1, %ymm8, %xmm8
+vmovq %xmm8, 1224(%rdi)
+vmovdqa 192(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm4
+vpunpckhwd const0(%rip), %ymm11, %ymm2
+vpslld $1, %ymm4, %ymm4
+vpslld $1, %ymm2, %ymm2
+vmovdqa 448(%rsp), %ymm10
+vpunpcklwd const0(%rip), %ymm10, %ymm8
+vpunpckhwd const0(%rip), %ymm10, %ymm10
+vmovdqa 704(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm5
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm5, %ymm8, %ymm9
+vpaddd %ymm6, %ymm10, %ymm7
+vpsubd %ymm4, %ymm9, %ymm9
+vpsubd %ymm2, %ymm7, %ymm7
+vpsubd %ymm5, %ymm8, %ymm5
+vpsubd %ymm6, %ymm10, %ymm6
+vpsrld $1, %ymm5, %ymm5
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm5, %ymm5
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm5, %ymm6
+vmovdqa 1728(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm10
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm8, %ymm8
+vpsubd %ymm10, %ymm9, %ymm9
+vpsubd %ymm8, %ymm7, %ymm7
+vpsrld $1, %ymm9, %ymm9
+vpsrld $1, %ymm7, %ymm7
+vpand mask32_to_16(%rip), %ymm9, %ymm9
+vpand mask32_to_16(%rip), %ymm7, %ymm7
+vpackusdw %ymm7, %ymm9, %ymm7
+vmovdqa 960(%rsp), %ymm9
+vpaddw 1216(%rsp), %ymm9, %ymm8
+vpsubw 1216(%rsp), %ymm9, %ymm9
+vpsrlw $2, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsllw $1, %ymm11, %ymm10
+vpsubw %ymm10, %ymm8, %ymm10
+vpsllw $7, %ymm5, %ymm8
+vpsubw %ymm8, %ymm10, %ymm8
+vpsrlw $3, %ymm8, %ymm8
+vpsubw %ymm7, %ymm8, %ymm8
+vmovdqa 1472(%rsp), %ymm10
+vpsubw %ymm11, %ymm10, %ymm10
+vpmullw %ymm15, %ymm5, %ymm2
+vpsubw %ymm2, %ymm10, %ymm2
+vpmullw %ymm14, %ymm8, %ymm8
+vpsubw %ymm8, %ymm7, %ymm7
+vpmullw %ymm12, %ymm8, %ymm10
+vpaddw %ymm10, %ymm7, %ymm10
+vpmullw %ymm12, %ymm10, %ymm10
+vpsubw %ymm10, %ymm2, %ymm10
+vpmullw %ymm14, %ymm10, %ymm10
+vpsubw %ymm6, %ymm10, %ymm10
+vpsrlw $3, %ymm10, %ymm10
+vpsubw %ymm9, %ymm10, %ymm10
+vpsubw %ymm10, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vpmullw %ymm13, %ymm10, %ymm10
+vpsubw %ymm10, %ymm6, %ymm6
+vmovdqu 592(%rdi), %ymm2
+vmovdqu 944(%rdi), %ymm4
+vmovdqu 1296(%rdi), %ymm3
+vpaddw %ymm11, %ymm2, %ymm11
+vpaddw %ymm6, %ymm4, %ymm6
+vpaddw %ymm7, %ymm3, %ymm7
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm3
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm9, %ymm9
+vmovdqu 240(%rdi), %ymm4
+vpaddw 1984(%rsp), %ymm4, %ymm4
+vpaddw %ymm9, %ymm4, %ymm4
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 240(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vmovq %xmm4, 256(%rdi)
+vmovdqa %xmm3, 1984(%rsp)
+vpshufb shuf48_16(%rip), %ymm8, %ymm8
+vpand mask3_5_4_3_1(%rip), %ymm8, %ymm3
+vpand mask5_3_5_3(%rip), %ymm8, %ymm8
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm8, %ymm8
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpaddw %ymm8, %ymm11, %ymm11
+vmovdqa %xmm3, 2240(%rsp)
+vpshufb shuf48_16(%rip), %ymm10, %ymm10
+vpand mask3_5_4_3_1(%rip), %ymm10, %ymm3
+vpand mask5_3_5_3(%rip), %ymm10, %ymm10
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm10, %ymm10
+vpaddw 2496(%rsp), %ymm6, %ymm6
+vpaddw %ymm10, %ymm6, %ymm6
+vmovdqa %xmm3, 2496(%rsp)
+vpshufb shuf48_16(%rip), %ymm5, %ymm5
+vpand mask3_5_4_3_1(%rip), %ymm5, %ymm3
+vpand mask5_3_5_3(%rip), %ymm5, %ymm5
+vpermq $139, %ymm3, %ymm3
+vpand mask_keephigh(%rip), %ymm3, %ymm4
+vpor %ymm4, %ymm5, %ymm5
+vpaddw 2752(%rsp), %ymm7, %ymm7
+vpaddw %ymm5, %ymm7, %ymm7
+vmovdqa %xmm3, 2752(%rsp)
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %xmm11, 592(%rdi)
+vextracti128 $1, %ymm11, %xmm11
+vmovq %xmm11, 608(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 944(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 960(%rdi)
+vpand mask_mod8192(%rip), %ymm7, %ymm7
+vmovdqu %xmm7, 1296(%rdi)
+vextracti128 $1, %ymm7, %xmm7
+vmovq %xmm7, 1312(%rdi)
+vmovdqa 224(%rsp), %ymm5
+vpunpcklwd const0(%rip), %ymm5, %ymm10
+vpunpckhwd const0(%rip), %ymm5, %ymm8
+vpslld $1, %ymm10, %ymm10
+vpslld $1, %ymm8, %ymm8
+vmovdqa 480(%rsp), %ymm9
+vpunpcklwd const0(%rip), %ymm9, %ymm7
+vpunpckhwd const0(%rip), %ymm9, %ymm9
+vmovdqa 736(%rsp), %ymm6
+vpunpcklwd const0(%rip), %ymm6, %ymm11
+vpunpckhwd const0(%rip), %ymm6, %ymm6
+vpaddd %ymm11, %ymm7, %ymm3
+vpaddd %ymm6, %ymm9, %ymm4
+vpsubd %ymm10, %ymm3, %ymm3
+vpsubd %ymm8, %ymm4, %ymm4
+vpsubd %ymm11, %ymm7, %ymm11
+vpsubd %ymm6, %ymm9, %ymm6
+vpsrld $1, %ymm11, %ymm11
+vpsrld $1, %ymm6, %ymm6
+vpand mask32_to_16(%rip), %ymm11, %ymm11
+vpand mask32_to_16(%rip), %ymm6, %ymm6
+vpackusdw %ymm6, %ymm11, %ymm6
+vmovdqa 1760(%rsp), %ymm11
+vpunpcklwd const0(%rip), %ymm11, %ymm9
+vpunpckhwd const0(%rip), %ymm11, %ymm7
+vpslld $1, %ymm9, %ymm9
+vpslld $1, %ymm7, %ymm7
+vpsubd %ymm9, %ymm3, %ymm3
+vpsubd %ymm7, %ymm4, %ymm4
+vpsrld $1, %ymm3, %ymm3
+vpsrld $1, %ymm4, %ymm4
+vpand mask32_to_16(%rip), %ymm3, %ymm3
+vpand mask32_to_16(%rip), %ymm4, %ymm4
+vpackusdw %ymm4, %ymm3, %ymm4
+vmovdqa 992(%rsp), %ymm3
+vpaddw 1248(%rsp), %ymm3, %ymm7
+vpsubw 1248(%rsp), %ymm3, %ymm3
+vpsrlw $2, %ymm3, %ymm3
+vpsubw %ymm6, %ymm3, %ymm3
+vpmullw %ymm14, %ymm3, %ymm3
+vpsllw $1, %ymm5, %ymm9
+vpsubw %ymm9, %ymm7, %ymm9
+vpsllw $7, %ymm11, %ymm7
+vpsubw %ymm7, %ymm9, %ymm7
+vpsrlw $3, %ymm7, %ymm7
+vpsubw %ymm4, %ymm7, %ymm7
+vmovdqa 1504(%rsp), %ymm9
+vpsubw %ymm5, %ymm9, %ymm9
+vpmullw %ymm15, %ymm11, %ymm8
+vpsubw %ymm8, %ymm9, %ymm8
+vpmullw %ymm14, %ymm7, %ymm7
+vpsubw %ymm7, %ymm4, %ymm4
+vpmullw %ymm12, %ymm7, %ymm9
+vpaddw %ymm9, %ymm4, %ymm9
+vpmullw %ymm12, %ymm9, %ymm9
+vpsubw %ymm9, %ymm8, %ymm9
+vpmullw %ymm14, %ymm9, %ymm9
+vpsubw %ymm6, %ymm9, %ymm9
+vpsrlw $3, %ymm9, %ymm9
+vpsubw %ymm3, %ymm9, %ymm9
+vpsubw %ymm9, %ymm3, %ymm3
+vpsubw %ymm3, %ymm6, %ymm6
+vpmullw %ymm13, %ymm9, %ymm9
+vpsubw %ymm9, %ymm6, %ymm6
+vextracti128 $1, %ymm4, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2816(%rsp)
+vextracti128 $1, %ymm3, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2848(%rsp)
+vextracti128 $1, %ymm7, %xmm8
+vpshufb shufmin1_mask3(%rip), %ymm8, %ymm8
+vmovdqa %ymm8, 2880(%rsp)
+vmovdqu 680(%rdi), %ymm8
+vmovdqu 1032(%rdi), %ymm10
+vmovdqu 1384(%rdi), %ymm2
+vpaddw %ymm5, %ymm8, %ymm5
+vpaddw %ymm6, %ymm10, %ymm6
+vpaddw %ymm4, %ymm2, %ymm4
+vpshufb shuf48_16(%rip), %ymm3, %ymm3
+vpand mask3_5_4_3_1(%rip), %ymm3, %ymm2
+vpand mask5_3_5_3(%rip), %ymm3, %ymm3
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm3, %ymm3
+vmovdqu 328(%rdi), %ymm10
+vpaddw 2016(%rsp), %ymm10, %ymm10
+vpaddw %ymm3, %ymm10, %ymm10
+vpand mask_mod8192(%rip), %ymm10, %ymm10
+vmovdqu %xmm10, 328(%rdi)
+vextracti128 $1, %ymm10, %xmm10
+vmovq %xmm10, 344(%rdi)
+vpshufb shufmin1_mask3(%rip), %ymm10, %ymm10
+vmovdqa %xmm10, 1792(%rsp)
+vmovdqa %xmm2, 2016(%rsp)
+vpshufb shuf48_16(%rip), %ymm7, %ymm7
+vpand mask3_5_4_3_1(%rip), %ymm7, %ymm2
+vpand mask5_3_5_3(%rip), %ymm7, %ymm7
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm7, %ymm7
+vpaddw 2272(%rsp), %ymm5, %ymm5
+vpaddw %ymm7, %ymm5, %ymm5
+vmovdqa %xmm2, 2272(%rsp)
+vpshufb shuf48_16(%rip), %ymm9, %ymm9
+vpand mask3_5_4_3_1(%rip), %ymm9, %ymm2
+vpand mask5_3_5_3(%rip), %ymm9, %ymm9
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm9, %ymm9
+vpaddw 2528(%rsp), %ymm6, %ymm6
+vpaddw %ymm9, %ymm6, %ymm6
+vmovdqa %xmm2, 2528(%rsp)
+vpshufb shuf48_16(%rip), %ymm11, %ymm11
+vpand mask3_5_4_3_1(%rip), %ymm11, %ymm2
+vpand mask5_3_5_3(%rip), %ymm11, %ymm11
+vpermq $139, %ymm2, %ymm2
+vpand mask_keephigh(%rip), %ymm2, %ymm10
+vpor %ymm10, %ymm11, %ymm11
+vpaddw 2784(%rsp), %ymm4, %ymm4
+vpaddw %ymm11, %ymm4, %ymm4
+vmovdqa %xmm2, 2784(%rsp)
+vpand mask_mod8192(%rip), %ymm5, %ymm5
+vmovdqu %xmm5, 680(%rdi)
+vextracti128 $1, %ymm5, %xmm5
+vmovq %xmm5, 696(%rdi)
+vpand mask_mod8192(%rip), %ymm6, %ymm6
+vmovdqu %xmm6, 1032(%rdi)
+vextracti128 $1, %ymm6, %xmm6
+vmovq %xmm6, 1048(%rdi)
+vpand mask_mod8192(%rip), %ymm4, %ymm4
+vmovdqu %xmm4, 1384(%rdi)
+vextracti128 $1, %ymm4, %xmm4
+vpextrw $0, %xmm4, 1400(%rdi)
+vmovdqu 0(%rdi), %ymm11
+vpaddw 1888(%rsp), %ymm11, %ymm11
+vpaddw 2816(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 0(%rdi)
+vmovdqu 352(%rdi), %ymm11
+vpaddw 2528(%rsp), %ymm11, %ymm11
+vpaddw 2848(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vmovdqu 704(%rdi), %ymm11
+vpaddw 2784(%rsp), %ymm11, %ymm11
+vpaddw 2880(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 704(%rdi)
+vmovdqu 88(%rdi), %ymm11
+vpaddw 2048(%rsp), %ymm11, %ymm11
+vpaddw 1920(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 88(%rdi)
+vmovdqu 440(%rdi), %ymm11
+vpaddw 2304(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 440(%rdi)
+vmovdqu 792(%rdi), %ymm11
+vpaddw 2560(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 792(%rdi)
+vmovdqu 176(%rdi), %ymm11
+vpaddw 2080(%rsp), %ymm11, %ymm11
+vpaddw 1952(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 176(%rdi)
+vmovdqu 528(%rdi), %ymm11
+vpaddw 2336(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vmovdqu 880(%rdi), %ymm11
+vpaddw 2592(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 880(%rdi)
+vmovdqu 264(%rdi), %ymm11
+vpaddw 2112(%rsp), %ymm11, %ymm11
+vpaddw 1984(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 264(%rdi)
+vmovdqu 616(%rdi), %ymm11
+vpaddw 2368(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 616(%rdi)
+vmovdqu 968(%rdi), %ymm11
+vpaddw 2624(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 968(%rdi)
+vmovdqu 352(%rdi), %ymm11
+vpaddw 2144(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 352(%rdi)
+vmovdqu 704(%rdi), %ymm11
+vpaddw 2400(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 704(%rdi)
+vmovdqu 1056(%rdi), %ymm11
+vpaddw 2656(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1056(%rdi)
+vmovdqu 440(%rdi), %ymm11
+vpaddw 2176(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 440(%rdi)
+vmovdqu 792(%rdi), %ymm11
+vpaddw 2432(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 792(%rdi)
+vmovdqu 1144(%rdi), %ymm11
+vpaddw 2688(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1144(%rdi)
+vmovdqu 528(%rdi), %ymm11
+vpaddw 2208(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 528(%rdi)
+vmovdqu 880(%rdi), %ymm11
+vpaddw 2464(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 880(%rdi)
+vmovdqu 1232(%rdi), %ymm11
+vpaddw 2720(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1232(%rdi)
+vmovdqu 616(%rdi), %ymm11
+vpaddw 2240(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 616(%rdi)
+vmovdqu 968(%rdi), %ymm11
+vpaddw 2496(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 968(%rdi)
+vmovdqu 1320(%rdi), %ymm11
+vpaddw 2752(%rsp), %ymm11, %ymm11
+vpand mask_mod8192(%rip), %ymm11, %ymm11
+vmovdqu %ymm11, 1320(%rdi)
+mov %r8, %rsp
+pop %r12
+pop %rbp
+ret
+.cfi_endproc
+
+#endif
diff --git a/crypto/hrss/hrss.c b/crypto/hrss/hrss.c
new file mode 100644
index 0000000..c059b83
--- /dev/null
+++ b/crypto/hrss/hrss.c
@@ -0,0 +1,2265 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/hrss.h>
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <openssl/bn.h>
+#include <openssl/cpu.h>
+#include <openssl/hmac.h>
+#include <openssl/mem.h>
+#include <openssl/sha.h>
+
+#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64)
+#include <emmintrin.h>
+#endif
+
+#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+#include <arm_neon.h>
+#endif
+
+#if defined(_MSC_VER)
+#define RESTRICT
+#else
+#define RESTRICT restrict
+#endif
+
+#include "../internal.h"
+#include "internal.h"
+
+// This is an implementation of [HRSS], but with a KEM transformation based on
+// [SXY]. The primary references are:
+
+// HRSS: https://eprint.iacr.org/2017/667.pdf
+// HRSSNIST:
+// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip
+// SXY: https://eprint.iacr.org/2017/1005.pdf
+// NTRUTN14:
+// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf
+
+
+// Vector operations.
+//
+// A couple of functions in this file can use vector operations to meaningful
+// effect. If we're building for a target that has a supported vector unit,
+// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a
+// 128-bit vector. The following functions abstract over the differences between
+// NEON and SSE2 for implementing some vector operations.
+
+// TODO: MSVC can likely also be made to work with vector operations.
+#if (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) && \
+    (defined(__clang__) || !defined(_MSC_VER))
+
+#define HRSS_HAVE_VECTOR_UNIT
+typedef __m128i vec_t;
+
+// vec_capable returns one iff the current platform supports SSE2.
+static int vec_capable(void) {
+#if defined(__SSE2__)
+  return 1;
+#else
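+  // Bit 26 of |OPENSSL_ia32cap_P[0]| (EDX from CPUID leaf 1) is the SSE2
+  // feature bit.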
+  int has_sse2 = (OPENSSL_ia32cap_P[0] & (1 << 26)) != 0;
+  return has_sse2;
+#endif
+}
+
+// vec_add performs a pair-wise addition of the eight uint16s in |a| and |b|.
+static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); }
+
+// vec_sub performs a pair-wise subtraction of the eight uint16s in |b| from
+// those in |a|.
+static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); }
+
+// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting
+// vector.
+static inline vec_t vec_mul(vec_t a, uint16_t b) {
+  return _mm_mullo_epi16(a, _mm_set1_epi16(b));
+}
+
+// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and
+// returns the resulting vector.
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+  return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c)));
+}
+
+// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16.
+static inline void vec3_rshift_word(vec_t v[3]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
+  const __m128i carry0 = _mm_srli_si128(v[0], 14);
+  v[0] = _mm_slli_si128(v[0], 2);
+
+  const __m128i carry1 = _mm_srli_si128(v[1], 14);
+  v[1] = _mm_slli_si128(v[1], 2);
+  v[1] |= carry0;
+
+  v[2] = _mm_slli_si128(v[2], 2);
+  v[2] |= carry1;
+}
+
+// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16.
+static inline void vec4_rshift_word(vec_t v[4]) {
+  // Intel's left and right shifting is backwards compared to the order in
+  // memory because they're based on little-endian order of words (and not just
+  // bytes). So the shifts in this function will be backwards from what one
+  // might expect.
+  const __m128i carry0 = _mm_srli_si128(v[0], 14);
+  v[0] = _mm_slli_si128(v[0], 2);
+
+  const __m128i carry1 = _mm_srli_si128(v[1], 14);
+  v[1] = _mm_slli_si128(v[1], 2);
+  v[1] |= carry0;
+
+  const __m128i carry2 = _mm_srli_si128(v[2], 14);
+  v[2] = _mm_slli_si128(v[2], 2);
+  v[2] |= carry1;
+
+  v[3] = _mm_slli_si128(v[3], 2);
+  v[3] |= carry2;
+}
+
+// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first
+// five from |right|, and returns the resulting vector.
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+  return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6);
+}
+
+// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+
+  for (int i = 0; i < 6; i++) {
+    vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63);
+    a_s[i] = _mm_slli_epi64(a_s[i], 1);
+    a_s[i] |= _mm_slli_si128(next_carry_s, 8);
+    a_s[i] |= carry_s;
+    carry_s = _mm_srli_si128(next_carry_s, 8);
+
+    vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63);
+    a_a[i] = _mm_slli_epi64(a_a[i], 1);
+    a_a[i] |= _mm_slli_si128(next_carry_a, 8);
+    a_a[i] |= carry_a;
+    carry_a = _mm_srli_si128(next_carry_a, 8);
+  }
+}
+
+// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one
+// bit.
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+
+  for (int i = 5; i >= 0; i--) {
+    const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63);
+    a_s[i] = _mm_srli_epi64(a_s[i], 1);
+    a_s[i] |= _mm_srli_si128(next_carry_s, 8);
+    a_s[i] |= carry_s;
+    carry_s = _mm_slli_si128(next_carry_s, 8);
+
+    const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63);
+    a_a[i] = _mm_srli_epi64(a_a[i], 1);
+    a_a[i] |= _mm_srli_si128(next_carry_a, 8);
+    a_a[i] |= carry_a;
+    carry_a = _mm_slli_si128(next_carry_a, 8);
+  }
+}
+
+// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in
+// a vector and returns the result.
+static inline vec_t vec_broadcast_bit(vec_t a) {
+  return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31),
+                           0b01010101);
+}
+
+// vec_broadcast_bit15 duplicates the most-significant bit of the first word in
+// |a| to all bits in a vector and returns the result.
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+  return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63 - 15), 31),
+                           0b01010101);
+}
+
+// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the
+// compiler requires that |i| be a compile-time constant.)
+#define vec_get_word(v, i) _mm_extract_epi16(v, i)
+
+#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \
+    (defined(__ARM_NEON__) || defined(__ARM_NEON))
+
+#define HRSS_HAVE_VECTOR_UNIT
+typedef uint16x8_t vec_t;
+
+// These functions perform the same actions as the SSE2 function of the same
+// name, above.
+
+static int vec_capable(void) { return CRYPTO_is_NEON_capable(); }
+
+static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; }
+
+static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; }
+
+static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); }
+
+static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) {
+  return vmlaq_n_u16(a, b, c);
+}
+
+static inline void vec3_rshift_word(vec_t v[3]) {
+  const uint16x8_t kZero = {0};
+  v[2] = vextq_u16(v[1], v[2], 7);
+  v[1] = vextq_u16(v[0], v[1], 7);
+  v[0] = vextq_u16(kZero, v[0], 7);
+}
+
+static inline void vec4_rshift_word(vec_t v[4]) {
+  const uint16x8_t kZero = {0};
+  v[3] = vextq_u16(v[2], v[3], 7);
+  v[2] = vextq_u16(v[1], v[2], 7);
+  v[1] = vextq_u16(v[0], v[1], 7);
+  v[0] = vextq_u16(kZero, v[0], 7);
+}
+
+static inline vec_t vec_merge_3_5(vec_t left, vec_t right) {
+  return vextq_u16(left, right, 5);
+}
+
+static inline uint16_t vec_get_word(vec_t v, unsigned i) {
+  return v[i];
+}
+
+#if !defined(OPENSSL_AARCH64)
+
+static inline vec_t vec_broadcast_bit(vec_t a) {
+  a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15);
+  return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+
+static inline vec_t vec_broadcast_bit15(vec_t a) {
+  a = (vec_t)vshrq_n_s16((int16x8_t)a, 15);
+  return vdupq_lane_u16(vget_low_u16(a), 0);
+}
+
+static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  const vec_t kZero = {0};
+
+  for (int i = 0; i < 6; i++) {
+    vec_t next_carry_s = a_s[i] >> 15;
+    a_s[i] <<= 1;
+    a_s[i] |= vextq_u16(kZero, next_carry_s, 7);
+    a_s[i] |= carry_s;
+    carry_s = vextq_u16(next_carry_s, kZero, 7);
+
+    vec_t next_carry_a = a_a[i] >> 15;
+    a_a[i] <<= 1;
+    a_a[i] |= vextq_u16(kZero, next_carry_a, 7);
+    a_a[i] |= carry_a;
+    carry_a = vextq_u16(next_carry_a, kZero, 7);
+  }
+}
+
+static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) {
+  vec_t carry_s = {0};
+  vec_t carry_a = {0};
+  const vec_t kZero = {0};
+
+  for (int i = 5; i >= 0; i--) {
+    vec_t next_carry_s = a_s[i] << 15;
+    a_s[i] >>= 1;
+    a_s[i] |= vextq_u16(next_carry_s, kZero, 1);
+    a_s[i] |= carry_s;
+    carry_s = vextq_u16(kZero, next_carry_s, 1);
+
+    vec_t next_carry_a = a_a[i] << 15;
+    a_a[i] >>= 1;
+    a_a[i] |= vextq_u16(next_carry_a, kZero, 1);
+    a_a[i] |= carry_a;
+    carry_a = vextq_u16(kZero, next_carry_a, 1);
+  }
+}
+
+#endif  // !OPENSSL_AARCH64
+
+#endif  // (ARM || AARCH64) && NEON
+
+// Polynomials in this scheme have N terms.
+// #define N 701
+
+// Underlying data types and arithmetic operations.
+// ------------------------------------------------
+
+// Binary polynomials.
+
+// poly2 represents a degree-N polynomial over GF(2). The words are in little-
+// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The
+// final word is only partially used since N is not a multiple of the word size.
+
+// Defined in internal.h:
+// struct poly2 {
+//  crypto_word_t v[WORDS_PER_POLY];
+// };
+
+OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) {
+  const uint8_t *in = (const uint8_t *)void_in;
+  for (size_t i = 0; i < len; i++) {
+    printf("%02x", in[i]);
+  }
+  printf("\n");
+}
+
+static void poly2_zero(struct poly2 *p) {
+  OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY);
+}
+
+// poly2_cmov sets |out| to |in| iff |mov| is all ones.
+static void poly2_cmov(struct poly2 *out, const struct poly2 *in,
+                       crypto_word_t mov) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    out->v[i] = (out->v[i] & ~mov) | (in->v[i] & mov);
+  }
+}
+
+// poly2_rotr_words performs a right-rotate on |in|, writing the result to
+// |out|. The shift count, |bits|, must be a non-zero multiple of the word size.
+static void poly2_rotr_words(struct poly2 *out, const struct poly2 *in,
+                             size_t bits) {
+  assert(bits >= BITS_PER_WORD && bits % BITS_PER_WORD == 0);
+  assert(out != in);
+
+  const size_t start = bits / BITS_PER_WORD;
+  const size_t n = (N - bits) / BITS_PER_WORD;
+
+  // The rotate is by a whole number of words so the first few words are easy:
+  // just move them down.
+  for (size_t i = 0; i < n; i++) {
+    out->v[i] = in->v[start + i];
+  }
+
+  // Since the last word is only partially filled, however, the remainder needs
+  // shifting and merging of words to take care of that.
+  crypto_word_t carry = in->v[WORDS_PER_POLY - 1];
+
+  for (size_t i = 0; i < start; i++) {
+    out->v[n + i] = carry | in->v[i] << BITS_IN_LAST_WORD;
+    carry = in->v[i] >> (BITS_PER_WORD - BITS_IN_LAST_WORD);
+  }
+
+  out->v[WORDS_PER_POLY - 1] = carry;
+}
+
+// poly2_rotr_bits performs a right-rotate on |in|, writing the result to |out|.
+// The shift count, |bits|, must be a power of two that is less than
+// |BITS_PER_WORD|.
+static void poly2_rotr_bits(struct poly2 *out, const struct poly2 *in,
+                            size_t bits) {
+  assert(bits <= BITS_PER_WORD / 2);
+  assert(bits != 0);
+  assert((bits & (bits - 1)) == 0);
+  assert(out != in);
+
+  // BITS_PER_WORD/2 is the greatest legal value of |bits|. If
+  // |BITS_IN_LAST_WORD| is smaller than this then the code below doesn't work
+  // because the wrapped-around bits wouldn't all fit in the last word and
+  // would need to spill into the previous one, and so on.
+  OPENSSL_STATIC_ASSERT(
+      BITS_IN_LAST_WORD >= BITS_PER_WORD / 2,
+      "there are more carry bits than fit in BITS_IN_LAST_WORD");
+
+  crypto_word_t carry = in->v[WORDS_PER_POLY - 1] << (BITS_PER_WORD - bits);
+
+  for (size_t i = WORDS_PER_POLY - 2; i < WORDS_PER_POLY; i--) {
+    out->v[i] = carry | in->v[i] >> bits;
+    carry = in->v[i] << (BITS_PER_WORD - bits);
+  }
+
+  crypto_word_t last_word = carry >> (BITS_PER_WORD - BITS_IN_LAST_WORD) |
+                            in->v[WORDS_PER_POLY - 1] >> bits;
+  last_word &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  out->v[WORDS_PER_POLY - 1] = last_word;
+}
+
+// HRSS_poly2_rotr_consttime right-rotates |p| by |bits| in constant-time.
+void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits) {
+  assert(bits <= N);
+  assert(p->v[WORDS_PER_POLY-1] >> BITS_IN_LAST_WORD == 0);
+
+  // Constant-time rotation is implemented by calculating the rotations of
+  // powers-of-two bits and throwing away the unneeded values. 2^9 (i.e. 512) is
+  // the largest power-of-two shift that we need to consider because 2^10 > N.
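+  //
+  // For example, a rotation by 300 = 256 + 32 + 8 + 4 keeps the rotations by
+  // 256, 32, 8 and 4, while the rotations by 512, 128, 64, 16, 2 and 1 are
+  // computed but discarded, with each selection made in constant time from
+  // the corresponding bit of |bits|.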
+#define HRSS_POLY2_MAX_SHIFT 9
+  size_t shift = HRSS_POLY2_MAX_SHIFT;
+  OPENSSL_STATIC_ASSERT((1 << (HRSS_POLY2_MAX_SHIFT + 1)) > N,
+                        "maximum shift is too small");
+  OPENSSL_STATIC_ASSERT((1 << HRSS_POLY2_MAX_SHIFT) <= N,
+                        "maximum shift is too large");
+  struct poly2 shifted;
+
+  for (; (UINT64_C(1) << shift) >= BITS_PER_WORD; shift--) {
+    poly2_rotr_words(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+
+  for (; shift < HRSS_POLY2_MAX_SHIFT; shift--) {
+    poly2_rotr_bits(&shifted, p, UINT64_C(1) << shift);
+    poly2_cmov(p, &shifted, ~((1 & (bits >> shift)) - 1));
+  }
+#undef HRSS_POLY2_MAX_SHIFT
+}
+
+// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones.
+static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]);
+    a->v[i] ^= sum;
+    b->v[i] ^= sum;
+  }
+}
+
+// poly2_fmadd sets |out| to |out| + |in| * m, where m is either
+// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|.
+static void poly2_fmadd(struct poly2 *out, const struct poly2 *in,
+                        crypto_word_t m) {
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    out->v[i] ^= in->v[i] & m;
+  }
+}
+
+// poly2_lshift1 left-shifts |p| by one bit.
+static void poly2_lshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1);
+    p->v[i] <<= 1;
+    p->v[i] |= carry;
+    carry = next_carry;
+  }
+}
+
+// poly2_rshift1 right-shifts |p| by one bit.
+static void poly2_rshift1(struct poly2 *p) {
+  crypto_word_t carry = 0;
+  for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) {
+    const crypto_word_t next_carry = p->v[i] & 1;
+    p->v[i] >>= 1;
+    p->v[i] |= carry << (BITS_PER_WORD - 1);
+    carry = next_carry;
+  }
+}
+
+// poly2_clear_top_bits clears the bits in the final word that are only for
+// alignment.
+static void poly2_clear_top_bits(struct poly2 *p) {
+  p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+}
+
+// poly2_top_bits_are_clear returns one iff the extra bits in the final word of
+// |p| are zero.
+static int poly2_top_bits_are_clear(const struct poly2 *p) {
+  return (p->v[WORDS_PER_POLY - 1] &
+          ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0;
+}
+
+// Ternary polynomials.
+
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+//   s  |  a  | value
+//  -----------------
+//   0  |  0  | 0
+//   0  |  1  | 1
+//   1  |  0  | 2 (aka -1)
+//   1  |  1  | <invalid>
+//
+// ('s' is for sign, and 'a' is just a letter.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+//   (s3, a3) = (s1, a1) × (s2, a2)
+//   s3 = (a1 ∧ s2) ⊕ (s1 ∧ a2)
+//   a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2)
+//
+//   (s3, a3) = (s1, a1) + (s2, a2)
+//   x = (a1 ⊕ a2)
+//   y = (s1 ⊕ s2) ⊕ (a1 ∧ a2)
+//   z = (s1 ∧ s2)
+//   s3 = y ∧ ¬x
+//   a3 = z ∨ (x ∧ ¬y)
+//
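+// For example, multiplying 1 = (s=0, a=1) by -1 = (s=1, a=0) with the
+// multiplication circuit gives s3 = (1 ∧ 1) ⊕ (0 ∧ 0) = 1 and
+// a3 = (0 ∧ 1) ⊕ (1 ∧ 0) = 0, i.e. (1, 0), which encodes -1, as expected.
+//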
+// Negating a value just involves swapping s and a.
+// struct poly3 {
+//   struct poly2 s, a;
+// };
+
+OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) {
+  struct poly3 p;
+  OPENSSL_memcpy(&p, in, sizeof(p));
+  p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+  p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1;
+
+  printf("{[");
+  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf(BN_HEX_FMT2, p.s.v[i]);
+  }
+  printf("] [");
+  for (unsigned i = 0; i < WORDS_PER_POLY; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf(BN_HEX_FMT2, p.a.v[i]);
+  }
+  printf("]}\n");
+}
+
+static void poly3_zero(struct poly3 *p) {
+  poly2_zero(&p->s);
+  poly2_zero(&p->a);
+}
+
+// lsb_to_all replicates the least-significant bit of |v| to all bits of the
+// word. This is used in bit-slicing operations to make a vector from a fixed
+// value.
+static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); }
+
+// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma).
+static void poly3_mul_const(struct poly3 *p, crypto_word_t ms,
+                            crypto_word_t ma) {
+  ms = lsb_to_all(ms);
+  ma = lsb_to_all(ma);
+
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t s = p->s.v[i];
+    const crypto_word_t a = p->a.v[i];
+    p->s.v[i] = (s & ma) ^ (ms & a);
+    p->a.v[i] = (ms & s) ^ (ma & a);
+  }
+}
+
+// poly3_rotr_consttime right-rotates |p| by |bits| in constant-time.
+static void poly3_rotr_consttime(struct poly3 *p, size_t bits) {
+  assert(bits <= N);
+  HRSS_poly2_rotr_consttime(&p->s, bits);
+  HRSS_poly2_rotr_consttime(&p->a, bits);
+}
+
+// poly3_fmadd sets |out| to |out| + |in|×m, where m is (ms, ma).
+static void poly3_fmadd(struct poly3 *RESTRICT out,
+                        const struct poly3 *RESTRICT in, crypto_word_t ms,
+                        crypto_word_t ma) {
+  // (See the multiplication and addition circuits given above.)
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t s = in->s.v[i];
+    const crypto_word_t a = in->a.v[i];
+    const crypto_word_t product_s = (s & ma) ^ (ms & a);
+    const crypto_word_t product_a = (ms & s) ^ (ma & a);
+
+    const crypto_word_t x = out->a.v[i] ^ product_a;
+    const crypto_word_t y =
+        (out->s.v[i] ^ product_s) ^ (out->a.v[i] & product_a);
+    const crypto_word_t z = (out->s.v[i] & product_s);
+    out->s.v[i] = y & ~x;
+    out->a.v[i] = z | (x & ~y);
+  }
+}
+
+// final_bit_to_all replicates the bit in the final position of the last word to
+// all the bits in the word.
+static crypto_word_t final_bit_to_all(crypto_word_t v) {
+  return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1));
+}
+
+// poly3_top_bits_are_clear returns one iff the extra bits in the final words of
+// |p| are zero.
+OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) {
+  return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a);
+}
+
+// poly3_mod_phiN reduces |p| by Φ(N).
+static void poly3_mod_phiN(struct poly3 *p) {
+  // In order to reduce by Φ(N) we subtract the value of the greatest
+  // coefficient. That's the same as adding the negative of its value. The
+  // negative of (s, a) is (a, s), so the arguments are swapped in the following
+  // two lines.
+  const crypto_word_t factor_s = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]);
+  const crypto_word_t factor_a = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]);
+
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    const crypto_word_t s = p->s.v[i];
+    const crypto_word_t a = p->a.v[i];
+    const crypto_word_t x = a ^ factor_a;
+    const crypto_word_t y = (s ^ factor_s) ^ (a & factor_a);
+    const crypto_word_t z = (s & factor_s);
+    p->s.v[i] = y & ~x;
+    p->a.v[i] = z | (x & ~y);
+  }
+
+  poly2_clear_top_bits(&p->s);
+  poly2_clear_top_bits(&p->a);
+}
+
+static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) {
+  poly2_cswap(&a->s, &b->s, swap);
+  poly2_cswap(&a->a, &b->a, swap);
+}
+
+static void poly3_lshift1(struct poly3 *p) {
+  poly2_lshift1(&p->s);
+  poly2_lshift1(&p->a);
+}
+
+static void poly3_rshift1(struct poly3 *p) {
+  poly2_rshift1(&p->s);
+  poly2_rshift1(&p->a);
+}
+
+// poly3_span represents a pointer into a poly3.
+struct poly3_span {
+  crypto_word_t *s;
+  crypto_word_t *a;
+};
+
+// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|).
+static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a,
+                           const crypto_word_t s1, const crypto_word_t a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  const crypto_word_t x = a1 ^ a2;
+  const crypto_word_t y = (s1 ^ s2) ^ (a1 & a2);
+  const crypto_word_t z = s1 & s2;
+  *out_s = y & ~x;
+  *out_a = z | (x & ~y);
+}
+
+// poly3_span_add adds |n| words of values from |a| and |b| and writes the
+// result to |out|.
+static void poly3_span_add(const struct poly3_span *out,
+                           const struct poly3_span *a,
+                           const struct poly3_span *b, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]);
+  }
+}
+
+// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|.
+static void poly3_span_sub(const struct poly3_span *a,
+                           const struct poly3_span *b, size_t n) {
+  for (size_t i = 0; i < n; i++) {
+    // Swapping |b->s| and |b->a| negates the value being added.
+    poly3_word_add(&a->s[i], &a->a[i], a->s[i], a->a[i], b->a[i], b->s[i]);
+  }
+}
+
+// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and
+// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of
+// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't
+// used and the recursion stops. For |n| in {11, 22}, the transitive total
+// amount of |scratch| needed happens to be 2n+2.
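+// For example, with 64-bit words WORDS_PER_POLY is 11 and the recursion uses
+// 2×6 + 2×3 + 2×2 + 2×1 = 24 = 2×11 + 2 words from each of the scratch
+// arrays.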
+static void poly3_mul_aux(const struct poly3_span *out,
+                          const struct poly3_span *scratch,
+                          const struct poly3_span *a,
+                          const struct poly3_span *b, size_t n) {
+  if (n == 1) {
+    crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0;
+    crypto_word_t b_s = b->s[0], b_a = b->a[0];
+    const crypto_word_t a_s = a->s[0], a_a = a->a[0];
+
+    for (size_t i = 0; i < BITS_PER_WORD; i++) {
+      // Multiply (s, a) by the next value from (b_s, b_a).
+      const crypto_word_t v_s = lsb_to_all(b_s);
+      const crypto_word_t v_a = lsb_to_all(b_a);
+      b_s >>= 1;
+      b_a >>= 1;
+
+      const crypto_word_t m_s = (v_s & a_a) ^ (a_s & v_a);
+      const crypto_word_t m_a = (a_s & v_s) ^ (a_a & v_a);
+
+      if (i == 0) {
+        // Special case otherwise the code tries to shift by BITS_PER_WORD
+        // below, which is undefined.
+        r_s_low = m_s;
+        r_a_low = m_a;
+        continue;
+      }
+
+      // Shift the multiplication result to the correct position.
+      const crypto_word_t m_s_low = m_s << i;
+      const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i);
+      const crypto_word_t m_a_low = m_a << i;
+      const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i);
+
+      // Add into the result.
+      poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low);
+      poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high,
+                     m_a_high);
+    }
+
+    out->s[0] = r_s_low;
+    out->s[1] = r_s_high;
+    out->a[0] = r_a_low;
+    out->a[1] = r_a_high;
+    return;
+  }
+
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
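+  //
+  // Writing a = a_1×X + a_0 and b = b_1×X + b_0 (split at |low_len| words),
+  // the product is
+  //   a_1×b_1×X^2 + ((a_1 + a_0)×(b_1 + b_0) - a_1×b_1 - a_0×b_0)×X + a_0×b_0
+  // so only three multiplications of half-sized values are needed.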
+
+  // When |n| is odd, the two "halves" will have different lengths. The first
+  // is always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]};
+  const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]};
+
+  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+  // half.
+  const struct poly3_span a_cross_sum = *out;
+  const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]};
+  poly3_span_add(&a_cross_sum, a, &a_high, low_len);
+  poly3_span_add(&b_cross_sum, b, &b_high, low_len);
+  if (high_len != low_len) {
+    a_cross_sum.s[low_len] = a_high.s[low_len];
+    a_cross_sum.a[low_len] = a_high.a[low_len];
+    b_cross_sum.s[low_len] = b_high.s[low_len];
+    b_cross_sum.a[low_len] = b_high.a[low_len];
+  }
+
+  const struct poly3_span child_scratch = {&scratch->s[2 * high_len],
+                                           &scratch->a[2 * high_len]};
+  const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]};
+  const struct poly3_span out_high = {&out->s[2 * low_len],
+                                      &out->a[2 * low_len]};
+
+  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+  poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len);
+  // Calculate a_1 × b_1.
+  poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len);
+  // Calculate a_0 × b_0.
+  poly3_mul_aux(out, &child_scratch, a, b, low_len);
+
+  // Subtract those last two products from the first.
+  poly3_span_sub(scratch, out, low_len * 2);
+  poly3_span_sub(scratch, &out_high, high_len * 2);
+
+  // Add the middle product into the output.
+  poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2);
+}
+
+// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N).
+void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
+                    const struct poly3 *y) {
+  crypto_word_t prod_s[WORDS_PER_POLY * 2];
+  crypto_word_t prod_a[WORDS_PER_POLY * 2];
+  crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2];
+  crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2];
+  const struct poly3_span prod_span = {prod_s, prod_a};
+  const struct poly3_span scratch_span = {scratch_s, scratch_a};
+  const struct poly3_span x_span = {(crypto_word_t *)x->s.v,
+                                    (crypto_word_t *)x->a.v};
+  const struct poly3_span y_span = {(crypto_word_t *)y->s.v,
+                                    (crypto_word_t *)y->a.v};
+
+  poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY);
+
+  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
+  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
+  // BITS_PER_WORD, so the upper-half words all have to be shifted before
+  // being added to the lower-half.
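+  //
+  // For example, with 64-bit words BITS_IN_LAST_WORD is 701 - 10×64 = 61, so
+  // word i of the shifted upper-half combines the top three bits of
+  // prod_s[10 + i] with the bottom 61 bits of prod_s[11 + i] (and likewise
+  // for prod_a).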
+  for (size_t i = 0; i < WORDS_PER_POLY; i++) {
+    crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+    v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+    crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD;
+    v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD);
+
+    poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a);
+  }
+
+  poly3_mod_phiN(out);
+}
+
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+
+// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is
+// |0xff..ff|. Otherwise, |swap| must be zero.
+static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+                                   vec_t b_a[6], const vec_t swap) {
+  for (int i = 0; i < 6; i++) {
+    const vec_t sum_s = swap & (a_s[i] ^ b_s[i]);
+    a_s[i] ^= sum_s;
+    b_s[i] ^= sum_s;
+
+    const vec_t sum_a = swap & (a_a[i] ^ b_a[i]);
+    a_a[i] ^= sum_a;
+    b_a[i] ^= sum_a;
+  }
+}
+
+// poly3_vec_fmadd adds (|ms|, |ma|) × (|b_s|, |b_a|) to (|a_s|, |a_a|).
+static inline void poly3_vec_fmadd(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6],
+                                   vec_t b_a[6], const vec_t ms,
+                                   const vec_t ma) {
+  for (int i = 0; i < 6; i++) {
+    const vec_t s = b_s[i];
+    const vec_t a = b_a[i];
+    const vec_t product_s = (s & ma) ^ (ms & a);
+    const vec_t product_a = (ms & s) ^ (ma & a);
+
+    const vec_t x = a_a[i] ^ product_a;
+    const vec_t y = (a_s[i] ^ product_s) ^ (a_a[i] & product_a);
+    const vec_t z = (a_s[i] & product_s);
+    a_s[i] = y & ~x;
+    a_a[i] = z | (x & ~y);
+  }
+}
+
+// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) {
+  // See the comment in |HRSS_poly3_invert| about this algorithm. In addition to
+  // the changes described there, this implementation attempts to use vector
+  // registers to speed up the computation. Even non-poly3 variables are held in
+  // vectors where possible to minimise the amount of data movement between
+  // the vector and general-purpose registers.
+
+  vec_t b_s[6], b_a[6], c_s[6], c_a[6], f_s[6], f_a[6], g_s[6], g_a[6];
+  const vec_t kZero = {0};
+  const vec_t kOne = {1};
+  static const uint8_t kOneBytes[sizeof(vec_t)] = {1};
+  static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = {
+      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f};
+
+  memset(b_s, 0, sizeof(b_s));
+  memcpy(b_a, kOneBytes, sizeof(kOneBytes));
+  memset(&b_a[1], 0, 5 * sizeof(vec_t));
+
+  memset(c_s, 0, sizeof(c_s));
+  memset(c_a, 0, sizeof(c_a));
+
+  f_s[5] = kZero;
+  memcpy(f_s, in->s.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+  f_a[5] = kZero;
+  memcpy(f_a, in->a.v, WORDS_PER_POLY * sizeof(crypto_word_t));
+
+  // Set g to all ones.
+  memset(g_s, 0, sizeof(g_s));
+  memset(g_a, 0xff, 5 * sizeof(vec_t));
+  memcpy(&g_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne));
+
+  vec_t deg_f = {N - 1}, deg_g = {N - 1}, rotation = kZero;
+  vec_t k = kOne;
+  vec_t f0s = {0}, f0a = {0};
+  vec_t still_going;
+  memset(&still_going, 0xff, sizeof(still_going));
+
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const vec_t s_a = vec_broadcast_bit(
+        still_going & ((f_a[0] & g_s[0]) ^ (f_s[0] & g_a[0])));
+    const vec_t s_s = vec_broadcast_bit(
+        still_going & ((f_a[0] & g_a[0]) ^ (f_s[0] & g_s[0])));
+    const vec_t should_swap =
+        (s_s | s_a) & vec_broadcast_bit15(deg_f - deg_g);
+
+    poly3_vec_cswap(f_s, f_a, g_s, g_a, should_swap);
+    poly3_vec_fmadd(f_s, f_a, g_s, g_a, s_s, s_a);
+    poly3_vec_rshift1(f_s, f_a);
+
+    poly3_vec_cswap(b_s, b_a, c_s, c_a, should_swap);
+    poly3_vec_fmadd(b_s, b_a, c_s, c_a, s_s, s_a);
+    poly3_vec_lshift1(c_s, c_a);
+
+    const vec_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+
+    deg_f -= kOne;
+    still_going &= ~vec_broadcast_bit15(deg_f - kOne);
+
+    const vec_t f0_is_nonzero = vec_broadcast_bit(f_s[0] | f_a[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    rotation ^= f0_is_nonzero & (k ^ rotation);
+    k += kOne;
+
+    const vec_t f0s_sum = f0_is_nonzero & (f_s[0] ^ f0s);
+    f0s ^= f0s_sum;
+    const vec_t f0a_sum = f0_is_nonzero & (f_a[0] ^ f0a);
+    f0a ^= f0a_sum;
+  }
+
+  crypto_word_t rotation_word = vec_get_word(rotation, 0);
+  rotation_word -= N & constant_time_lt_w(N, rotation_word);
+  memcpy(out->s.v, b_s, WORDS_PER_POLY * sizeof(crypto_word_t));
+  memcpy(out->a.v, b_a, WORDS_PER_POLY * sizeof(crypto_word_t));
+  assert(poly3_top_bits_are_clear(out));
+  poly3_rotr_consttime(out, rotation_word);
+  poly3_mul_const(out, vec_get_word(f0s, 0), vec_get_word(f0a, 0));
+  poly3_mod_phiN(out);
+}
+
+#endif  // HRSS_HAVE_VECTOR_UNIT
+
+// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod
+// Φ(N).
+void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) {
+  // The vector version of this function seems slightly slower on AArch64, but
+  // is useful on ARMv7 and x86-64.
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64)
+  if (vec_capable()) {
+    poly3_invert_vec(out, in);
+    return;
+  }
+#endif
+
+  // This algorithm mostly follows algorithm 10 in the paper. Some changes:
+  //   1) k should start at zero, not one. In the code below k is omitted and
+  //      the loop counter, |i|, is used instead.
+  //   2) The rotation count is conditionally updated to handle trailing zero
+  //      coefficients.
+  // The best explanation for why it works is in the "Why it works" section of
+  // [NTRUTN14].
+
+  struct poly3 c, f, g;
+  OPENSSL_memcpy(&f, in, sizeof(f));
+
+  // Set g to all ones.
+  OPENSSL_memset(&g.s, 0, sizeof(struct poly2));
+  OPENSSL_memset(&g.a, 0xff, sizeof(struct poly2));
+  g.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+
+  struct poly3 *b = out;
+  poly3_zero(b);
+  poly3_zero(&c);
+  // Set b to one.
+  b->a.v[0] = 1;
+
+  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+  crypto_word_t f0s = 0, f0a = 0;
+  crypto_word_t still_going = CONSTTIME_TRUE_W;
+
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const crypto_word_t s_a = lsb_to_all(
+        still_going & ((f.a.v[0] & g.s.v[0]) ^ (f.s.v[0] & g.a.v[0])));
+    const crypto_word_t s_s = lsb_to_all(
+        still_going & ((f.a.v[0] & g.a.v[0]) ^ (f.s.v[0] & g.s.v[0])));
+    const crypto_word_t should_swap =
+        (s_s | s_a) & constant_time_lt_w(deg_f, deg_g);
+
+    poly3_cswap(&f, &g, should_swap);
+    poly3_cswap(b, &c, should_swap);
+
+    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    assert(deg_g >= 1);
+
+    poly3_fmadd(&f, &g, s_s, s_a);
+    poly3_fmadd(b, &c, s_s, s_a);
+    poly3_rshift1(&f);
+    poly3_lshift1(&c);
+
+    deg_f--;
+    const crypto_word_t f0_is_nonzero =
+        lsb_to_all(f.s.v[0]) | lsb_to_all(f.a.v[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    assert(!(f0_is_nonzero && !still_going));
+    still_going &= ~constant_time_is_zero_w(deg_f);
+
+    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+    f0s = constant_time_select_w(f0_is_nonzero, f.s.v[0], f0s);
+    f0a = constant_time_select_w(f0_is_nonzero, f.a.v[0], f0a);
+  }
+
+  rotation++;
+  rotation -= N & constant_time_lt_w(N, rotation);
+  assert(poly3_top_bits_are_clear(out));
+  poly3_rotr_consttime(out, rotation);
+  poly3_mul_const(out, f0s, f0a);
+  poly3_mod_phiN(out);
+}
+
+// Polynomials in Q.
+
+// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the
+// coefficients do not form a field.)
+#define Q 8192
+
+// VECS_PER_POLY is the number of 128-bit vectors needed to represent a
+// polynomial.
+#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t))
+#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC)
+
+// poly represents a polynomial with coefficients mod Q. Note that, while Q is a
+// power of two, this does not operate in GF(Q). That would be a binary field
+// but this is simply mod Q. Thus the coefficients are not a field.
+//
+// Coefficients are ordered little-endian, thus the coefficient of x^0 is the
+// first element of the array.
+struct poly {
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+  union {
+    // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for
+    // the vector code.
+    uint16_t v[N + 3];
+    vec_t vectors[VECS_PER_POLY];
+  };
+#else
+  uint16_t v[N + 3];
+#endif
+};
+
+OPENSSL_UNUSED static void poly_print(const struct poly *p) {
+  printf("[");
+  for (unsigned i = 0; i < N; i++) {
+    if (i) {
+      printf(" ");
+    }
+    printf("%d", p->v[i]);
+  }
+  printf("]\n");
+}
+
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+
+// poly_mul_vec_aux is a recursive function that multiplies |n| vectors from
+// |a| and |b| and writes 2×|n| vectors to |out|. Each call uses 2*ceil(n/2)
+// elements of |scratch| and the function recurses, except if |n| <= 3, when
+// |scratch| isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then
+// |scratch| needs 172 elements.
+static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch,
+                             const vec_t *restrict a, const vec_t *restrict b,
+                             const size_t n) {
+  // In [HRSS], the technique they used for polynomial multiplication is
+  // described: they start with Toom-4 at the top level and then two layers of
+  // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook
+  // decomposition, which splits an input n-ways and produces 2n-1
+  // multiplications of those parts. So, starting with 704 coefficients (rounded
+  // up from 701 to have more factors of two), Toom-4 gives seven
+  // multiplications of degree-174 polynomials. Each round of Karatsuba (which
+  // is Toom-2) increases the number of multiplications by a factor of three
+  // while halving the size of the values being multiplied. So two rounds gives
+  // 63 multiplications of degree-44 polynomials. Then they (I think) form
+  // vectors by gathering all 63 coefficients of each power together, for each
+  // input, and doing more rounds of Karatsuba on the vectors until they bottom-
+  // out somewhere with schoolbook multiplication.
+  //
+  // I tried something like that for NEON. NEON vectors are 128 bits so hold
+  // eight coefficients. I wrote a function that did Karatsuba on eight
+  // multiplications at the same time, using such vectors, and a Go script that
+  // decomposed from degree-704, with Karatsuba in non-transposed form, until it
+  // reached multiplications of degree-44. It batched up those 81
+  // multiplications into lots of eight with a single one left over (which was
+  // handled directly).
+  //
+  // It worked, but it was significantly slower than the dumb algorithm used
+  // below. Potentially that was because I misunderstood how [HRSS] did it, or
+  // because Clang is bad at generating good code from NEON intrinsics on ARMv7.
+  // (Which is true: the code generated by Clang for the below is pretty crap.)
+  //
+  // This algorithm is much simpler. It just does Karatsuba decomposition all
+  // the way down and never transposes. When it gets down to degree-16 or
+  // degree-24 values, they are multiplied using schoolbook multiplication and
+  // vector intrinsics. The vector operations form each of the eight phase-
+  // shifts of one of the inputs, point-wise multiply, and then add into the
+  // result at the correct place. This means that 33% (degree-16) or 25%
+  // (degree-24) of the multiplies and adds are wasted, but it does ok.
+  if (n == 2) {
+    vec_t result[4];
+    vec_t vec_a[3];
+    static const vec_t kZero = {0};
+    vec_a[0] = a[0];
+    vec_a[1] = a[1];
+    vec_a[2] = kZero;
+
+    result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0));
+    result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0));
+
+    result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0));
+    result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0));
+    result[3] = kZero;
+
+    vec3_rshift_word(vec_a);
+
+#define BLOCK(x, y)                                                      \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] =                                                      \
+        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+  } while (0)
+
+    BLOCK(0, 1);
+    BLOCK(1, 9);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 2);
+    BLOCK(1, 10);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 3);
+    BLOCK(1, 11);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 4);
+    BLOCK(1, 12);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 5);
+    BLOCK(1, 13);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 6);
+    BLOCK(1, 14);
+
+    vec3_rshift_word(vec_a);
+
+    BLOCK(0, 7);
+    BLOCK(1, 15);
+
+#undef BLOCK
+
+    memcpy(out, result, sizeof(result));
+    return;
+  }
+
+  if (n == 3) {
+    vec_t result[6];
+    vec_t vec_a[4];
+    static const vec_t kZero = {0};
+    vec_a[0] = a[0];
+    vec_a[1] = a[1];
+    vec_a[2] = a[2];
+    vec_a[3] = kZero;
+
+    result[0] = vec_mul(a[0], vec_get_word(b[0], 0));
+    result[1] = vec_mul(a[1], vec_get_word(b[0], 0));
+    result[2] = vec_mul(a[2], vec_get_word(b[0], 0));
+
+#define BLOCK_PRE(x, y)                                                  \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8));    \
+  } while (0)
+
+    BLOCK_PRE(1, 8);
+    BLOCK_PRE(2, 16);
+
+    result[5] = kZero;
+
+    vec4_rshift_word(vec_a);
+
+#define BLOCK(x, y)                                                      \
+  do {                                                                   \
+    result[x + 0] =                                                      \
+        vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 1] =                                                      \
+        vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 2] =                                                      \
+        vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \
+    result[x + 3] =                                                      \
+        vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \
+  } while (0)
+
+    BLOCK(0, 1);
+    BLOCK(1, 9);
+    BLOCK(2, 17);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 2);
+    BLOCK(1, 10);
+    BLOCK(2, 18);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 3);
+    BLOCK(1, 11);
+    BLOCK(2, 19);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 4);
+    BLOCK(1, 12);
+    BLOCK(2, 20);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 5);
+    BLOCK(1, 13);
+    BLOCK(2, 21);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 6);
+    BLOCK(1, 14);
+    BLOCK(2, 22);
+
+    vec4_rshift_word(vec_a);
+
+    BLOCK(0, 7);
+    BLOCK(1, 15);
+    BLOCK(2, 23);
+
+#undef BLOCK
+#undef BLOCK_PRE
+
+    memcpy(out, result, sizeof(result));
+
+    return;
+  }
+
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+
+  // When |n| is odd, the two "halves" will have different lengths. The first is
+  // always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const vec_t *a_high = &a[low_len];
+  const vec_t *b_high = &b[low_len];
+
+  // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second
+  // half.
+  for (size_t i = 0; i < low_len; i++) {
+    out[i] = vec_add(a_high[i], a[i]);
+    out[high_len + i] = vec_add(b_high[i], b[i]);
+  }
+  if (high_len != low_len) {
+    out[low_len] = a_high[low_len];
+    out[high_len + low_len] = b_high[low_len];
+  }
+
+  vec_t *const child_scratch = &scratch[2 * high_len];
+  // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer.
+  poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+  // Calculate a_1 × b_1.
+  poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len);
+  // Calculate a_0 × b_0.
+  poly_mul_vec_aux(out, child_scratch, a, b, low_len);
+
+  // Subtract those last two products from the first.
+  for (size_t i = 0; i < low_len * 2; i++) {
+    scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i]));
+  }
+  if (low_len != high_len) {
+    scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]);
+    scratch[low_len * 2 + 1] =
+        vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]);
+  }
+
+  // Add the middle product into the output.
+  for (size_t i = 0; i < high_len * 2; i++) {
+    out[low_len + i] = vec_add(out[low_len + i], scratch[i]);
+  }
+}
+
+// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_vec(struct poly *out, const struct poly *x,
+                         const struct poly *y) {
+  OPENSSL_memset((uint16_t *)&x->v[N], 0, 3 * sizeof(uint16_t));
+  OPENSSL_memset((uint16_t *)&y->v[N], 0, 3 * sizeof(uint16_t));
+
+  OPENSSL_STATIC_ASSERT(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY,
+                        "struct poly is the wrong size");
+  OPENSSL_STATIC_ASSERT(alignof(struct poly) == alignof(vec_t),
+                        "struct poly has incorrect alignment");
+
+  vec_t prod[VECS_PER_POLY * 2];
+  vec_t scratch[172];
+  poly_mul_vec_aux(prod, scratch, x->vectors, y->vectors, VECS_PER_POLY);
+
+  // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the
+  // upper-half to the lower-half. However, N is 701, which isn't a multiple of
+  // the vector size, so the upper-half vectors all have to be shifted before
+  // being added to the lower-half.
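+  //
+  // Concretely, 701 = 87×8 + 5, so coefficient N of |prod| sits at offset five
+  // within vector 87. Each realigned vector therefore takes the final three
+  // lanes of prod[VECS_PER_POLY - 1 + i] and the first five lanes of
+  // prod[VECS_PER_POLY + i], which is what |vec_merge_3_5| provides.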
+  vec_t *out_vecs = (vec_t *)out->v;
+
+  for (size_t i = 0; i < VECS_PER_POLY; i++) {
+    const vec_t prev = prod[VECS_PER_POLY - 1 + i];
+    const vec_t this = prod[VECS_PER_POLY + i];
+    out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this));
+  }
+
+  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+
+#endif  // HRSS_HAVE_VECTOR_UNIT
+
+// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using
+// |scratch| as scratch space. It'll use Karatsuba if the inputs are large
+// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and
+// the function recurses, except if |n| < 64, when |scratch| isn't used and the
+// recursion stops. If |n| == |N| then |scratch| needs 1318 elements.
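+// (For |n| = N = 701 the recursion splits 701 into 351, then 176, 88 and
+// finally 44, which is handled by the schoolbook case, so the scratch needed
+// is 2×351 + 2×176 + 2×88 + 2×44 = 1318.)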
+static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch,
+                               const uint16_t *a, const uint16_t *b, size_t n) {
+  static const size_t kSchoolbookLimit = 64;
+  if (n < kSchoolbookLimit) {
+    OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2);
+    for (size_t i = 0; i < n; i++) {
+      for (size_t j = 0; j < n; j++) {
+        out[i + j] += (unsigned) a[i] * b[j];
+      }
+    }
+
+    return;
+  }
+
+  // Karatsuba multiplication.
+  // https://en.wikipedia.org/wiki/Karatsuba_algorithm
+
+  // When |n| is odd, the two "halves" will have different lengths. The
+  // first is always the smaller.
+  const size_t low_len = n / 2;
+  const size_t high_len = n - low_len;
+  const uint16_t *const a_high = &a[low_len];
+  const uint16_t *const b_high = &b[low_len];
+
+  for (size_t i = 0; i < low_len; i++) {
+    out[i] = a_high[i] + a[i];
+    out[high_len + i] = b_high[i] + b[i];
+  }
+  if (high_len != low_len) {
+    out[low_len] = a_high[low_len];
+    out[high_len + low_len] = b_high[low_len];
+  }
+
+  uint16_t *const child_scratch = &scratch[2 * high_len];
+  poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len);
+  poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high,
+                     high_len);
+  poly_mul_novec_aux(out, child_scratch, a, b, low_len);
+
+  for (size_t i = 0; i < low_len * 2; i++) {
+    scratch[i] -= out[i] + out[low_len * 2 + i];
+  }
+  if (low_len != high_len) {
+    scratch[low_len * 2] -= out[low_len * 4];
+    assert(out[low_len * 4 + 1] == 0);
+  }
+
+  for (size_t i = 0; i < high_len * 2; i++) {
+    out[low_len + i] += scratch[i];
+  }
+}
+
+// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1).
+static void poly_mul_novec(struct poly *out, const struct poly *x,
+                           const struct poly *y) {
+  uint16_t prod[2 * N];
+  uint16_t scratch[1318];
+  poly_mul_novec_aux(prod, scratch, x->v, y->v, N);
+
+  for (size_t i = 0; i < N; i++) {
+    out->v[i] = prod[i] + prod[i + N];
+  }
+  OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t));
+}
+
+// On x86-64, we can use the AVX2 code from [HRSS]. (The authors have given
+// explicit permission for this and signed a CLA.) However it's 57KB of object
+// code, so it's not used if |OPENSSL_SMALL| is defined.
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \
+    defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX)
+// poly_Rq_mul is defined in assembly.
+extern void poly_Rq_mul(struct poly *r, const struct poly *a,
+                        const struct poly *b);
+#endif
+
+// The file cannot always be built with -mfpu=neon on ARMv7 because that would
+// enable NEON instructions everywhere, not just in functions guarded by a
+// runtime check for NEON capability. Therefore on ARMv7, if -mfpu=neon isn't
+// used, a version of the vector code that has been precompiled and checked-in
+// as assembly sources is used. (For AArch64, NEON is assumed to be provided.)
+#if defined(OPENSSL_ARM) && !defined(HRSS_HAVE_VECTOR_UNIT)
+// poly_mul_vec is defined in assembly.
+extern void poly_mul_vec(struct poly *out, const struct poly *x,
+                         const struct poly *y);
+#endif
+
+static void poly_mul(struct poly *r, const struct poly *a,
+                     const struct poly *b) {
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \
+    defined(OPENSSL_X86_64) && defined(OPENSSL_LINUX)
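+  // Bit 5 of |OPENSSL_ia32cap_P[2]| (EBX from CPUID leaf 7) is the AVX2
+  // feature bit.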
+  const int has_avx2 = (OPENSSL_ia32cap_P[2] & (1 << 5)) != 0;
+  if (has_avx2) {
+    poly_Rq_mul(r, a, b);
+    return;
+  }
+#endif
+
+#if defined(HRSS_HAVE_VECTOR_UNIT)
+  if (vec_capable()) {
+    poly_mul_vec(r, a, b);
+    return;
+  }
+#endif
+
+#if defined(OPENSSL_ARM) && !defined(HRSS_HAVE_VECTOR_UNIT)
+  // See above about this call.
+  if (CRYPTO_is_NEON_capable()) {
+    poly_mul_vec(r, a, b);
+    return;
+  }
+#endif
+
+  // Fallback, non-vector case.
+  poly_mul_novec(r, a, b);
+}
+
+// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1).
+static void poly_mul_x_minus_1(struct poly *p) {
+  // Multiplying by (𝑥 - 1) means negating each coefficient and adding in
+  // the value of the previous one.
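+  // That is, mod (𝑥^n - 1), coefficient i of the result is p[i-1] - p[i],
+  // with the i = 0 case wrapping around to use p[N-1].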
+  const uint16_t orig_final_coefficient = p->v[N - 1];
+
+  for (size_t i = N - 1; i > 0; i--) {
+    p->v[i] = p->v[i - 1] - p->v[i];
+  }
+  p->v[0] = orig_final_coefficient - p->v[0];
+}
+
+// poly_mod_phiN sets |p| to |p| mod Φ(N).
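+// Since Φ(N) = 𝑥^(N-1) + ... + 𝑥 + 1, subtracting the top coefficient's value
+// from every coefficient subtracts a multiple of Φ(N), leaving the value
+// unchanged mod Φ(N) while zeroing the 𝑥^(N-1) term.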
+static void poly_mod_phiN(struct poly *p) {
+  const uint16_t coeff700 = p->v[N - 1];
+
+  for (unsigned i = 0; i < N; i++) {
+    p->v[i] -= coeff700;
+  }
+}
+
+// poly_clamp reduces each coefficient mod Q.
+static void poly_clamp(struct poly *p) {
+  for (unsigned i = 0; i < N; i++) {
+    p->v[i] &= Q - 1;
+  }
+}
+
+
+// Conversion functions
+// --------------------
+
+// poly2_from_poly sets |*out| to |in| mod 2.
+static void poly2_from_poly(struct poly2 *out, const struct poly *in) {
+  crypto_word_t *words = out->v;
+  unsigned shift = 0;
+  crypto_word_t word = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    word >>= 1;
+    word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words = word;
+      words++;
+      word = 0;
+      shift = 0;
+    }
+  }
+
+  word >>= BITS_PER_WORD - shift;
+  *words = word;
+}
+
+// mod3 treats |a| as a signed number and returns |a| mod 3.
+static uint16_t mod3(int16_t a) {
+  const int16_t q = ((int32_t)a * 21845) >> 16;
+  int16_t ret = a - 3 * q;
+  // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0,
+  // 1, 2, 0}.
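+  // When |ret| is three, |ret & (ret >> 1)| is one so the mask below is zero
+  // and the result is zero. Otherwise |ret & (ret >> 1)| is zero, the mask is
+  // all ones, and |ret| is returned unchanged.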
+  return ret & ((ret & (ret >> 1)) - 1);
+}
+
+// poly3_from_poly sets |*out| to |in|.
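+//
+// Each coefficient is spread across the two bitmaps |s| and |a|: zero is
+// (s, a) = (0, 0), one is (0, 1), and two (i.e. -1) is (1, 0). The pair
+// (1, 1) is never generated here.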
+static void poly3_from_poly(struct poly3 *out, const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    // This duplicates the 13th bit upwards to the top of the uint16,
+    // essentially treating it as a sign bit and converting into a signed int16.
+    // The signed value is reduced mod 3, yielding {0, 1, 2}.
+    const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3);
+    s >>= 1;
+    s |= (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2);
+    a >>= 1;
+    a |= (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+}
+
+// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1,
+// Q-1}. It returns a mask indicating whether all coefficients were found to be
+// in that set.
+static crypto_word_t poly3_from_poly_checked(struct poly3 *out,
+                                             const struct poly *in) {
+  crypto_word_t *words_s = out->s.v;
+  crypto_word_t *words_a = out->a.v;
+  crypto_word_t s = 0;
+  crypto_word_t a = 0;
+  unsigned shift = 0;
+  crypto_word_t ok = CONSTTIME_TRUE_W;
+
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t v = in->v[i];
+    // Maps {0, 1, Q-1} to {0, 1, 2}.
+    uint16_t mod3 = v & 3;
+    mod3 ^= mod3 >> 1;
+    const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q;
+    ok &= constant_time_eq_w(v, expected);
+
+    s >>= 1;
+    s |= (crypto_word_t)(mod3 & 2) << (BITS_PER_WORD - 2);
+    a >>= 1;
+    a |= (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words_s = s;
+      words_s++;
+      *words_a = a;
+      words_a++;
+      s = a = 0;
+      shift = 0;
+    }
+  }
+
+  s >>= BITS_PER_WORD - shift;
+  a >>= BITS_PER_WORD - shift;
+  *words_s = s;
+  *words_a = a;
+
+  return ok;
+}
+
+static void poly_from_poly2(struct poly *out, const struct poly2 *in) {
+  const crypto_word_t *words = in->v;
+  unsigned shift = 0;
+  crypto_word_t word = *words;
+
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = word & 1;
+    word >>= 1;
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      words++;
+      word = *words;
+      shift = 0;
+    }
+  }
+}
+
+static void poly_from_poly3(struct poly *out, const struct poly3 *in) {
+  const crypto_word_t *words_s = in->s.v;
+  const crypto_word_t *words_a = in->a.v;
+  crypto_word_t word_s = ~(*words_s);
+  crypto_word_t word_a = *words_a;
+  unsigned shift = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    out->v[i] = (uint16_t)(word_s & 1) - 1;
+    out->v[i] |= word_a & 1;
+    word_s >>= 1;
+    word_a >>= 1;
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      words_s++;
+      words_a++;
+      word_s = ~(*words_s);
+      word_a = *words_a;
+      shift = 0;
+    }
+  }
+}
+
+// Polynomial inversion
+// --------------------
+
+// poly_invert_mod2 sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod
+// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion
+// mod Q.
+static void poly_invert_mod2(struct poly *out, const struct poly *in) {
+  // This algorithm follows algorithm 10 in the paper. (Although, in contrast to
+  // the paper, k should start at zero, not one, and the rotation count needs
+  // to handle trailing zero coefficients.) The best explanation for why it
+  // works is in the "Why it works" section of [NTRUTN14].
+
+  struct poly2 b, c, f, g;
+  poly2_from_poly(&f, in);
+  OPENSSL_memset(&b, 0, sizeof(b));
+  b.v[0] = 1;
+  OPENSSL_memset(&c, 0, sizeof(c));
+
+  // Set g to all ones.
+  OPENSSL_memset(&g, 0xff, sizeof(struct poly2));
+  g.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD;
+
+  crypto_word_t deg_f = N - 1, deg_g = N - 1, rotation = 0;
+  crypto_word_t still_going = CONSTTIME_TRUE_W;
+
+  for (unsigned i = 0; i < 2 * (N - 1) - 1; i++) {
+    const crypto_word_t s = still_going & lsb_to_all(f.v[0]);
+    const crypto_word_t should_swap = s & constant_time_lt_w(deg_f, deg_g);
+    poly2_cswap(&f, &g, should_swap);
+    poly2_cswap(&b, &c, should_swap);
+    const crypto_word_t deg_sum = should_swap & (deg_f ^ deg_g);
+    deg_f ^= deg_sum;
+    deg_g ^= deg_sum;
+    assert(deg_g >= 1);
+    poly2_fmadd(&f, &g, s);
+    poly2_fmadd(&b, &c, s);
+
+    poly2_rshift1(&f);
+    poly2_lshift1(&c);
+
+    deg_f--;
+    const crypto_word_t f0_is_nonzero = lsb_to_all(f.v[0]);
+    // |f0_is_nonzero| implies |still_going|.
+    assert(!(f0_is_nonzero && !still_going));
+    rotation = constant_time_select_w(f0_is_nonzero, i, rotation);
+    still_going &= ~constant_time_is_zero_w(deg_f);
+  }
+
+  rotation++;
+  rotation -= N & constant_time_lt_w(N, rotation);
+  assert(poly2_top_bits_are_clear(&b));
+  HRSS_poly2_rotr_consttime(&b, rotation);
+  poly_from_poly2(out, &b);
+}
+
+// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)).
+static void poly_invert(struct poly *out, const struct poly *in) {
+  // Inversion mod Q, which is done based on the result of inverting mod
+  // 2. See [NTRUTN14] paper, bottom of page two.
+  struct poly a, *b, tmp;
+
+  // a = -in.
+  for (unsigned i = 0; i < N; i++) {
+    a.v[i] = -in->v[i];
+  }
+
+  // b = in^-1 mod 2.
+  b = out;
+  poly_invert_mod2(b, in);
+
+  // We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+  // times, which is four.
+  for (unsigned i = 0; i < 4; i++) {
+    poly_mul(&tmp, &a, b);
+    tmp.v[0] += 2;
+    poly_mul(b, b, &tmp);
+  }
+}
+
+// Marshal and unmarshal functions for various basic types.
+// --------------------------------------------------------
+
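+// POLY_BYTES is the number of bytes needed to marshal a mod Q polynomial. Only
+// 700 of the 701 coefficients are stored (the final one is recomputed when
+// parsing) and each takes 13 bits since Q = 2**13, so 700 × 13 = 9100 bits,
+// which rounds up to 1138 bytes.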
+#define POLY_BYTES 1138
+
+static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) {
+  const uint16_t *p = in->v;
+
+  for (size_t i = 0; i < N / 8; i++) {
+    out[0] = p[0];
+    out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+    out[2] = p[1] >> 3;
+    out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+    out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+    out[5] = p[3] >> 1;
+    out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4);
+    out[7] = p[4] >> 4;
+    out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1);
+    out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6);
+    out[10] = p[6] >> 2;
+    out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3);
+    out[12] = p[7] >> 5;
+
+    p += 8;
+    out += 13;
+  }
+
+  // There are four remaining values.
+  out[0] = p[0];
+  out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5);
+  out[2] = p[1] >> 3;
+  out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2);
+  out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7);
+  out[5] = p[3] >> 1;
+  out[6] = 0xf & (p[3] >> 9);
+}
+
+static void poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) {
+  uint16_t *p = out->v;
+
+  for (size_t i = 0; i < N / 8; i++) {
+    p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+    p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+           (uint16_t)(in[3] & 3) << 11;
+    p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+    p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+           (uint16_t)(in[6] & 0xf) << 9;
+    p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 |
+           (uint16_t)(in[8] & 1) << 12;
+    p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7;
+    p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 |
+           (uint16_t)(in[11] & 7) << 10;
+    p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5;
+
+    p += 8;
+    in += 13;
+  }
+
+  // There are four coefficients remaining.
+  p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8;
+  p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 |
+         (uint16_t)(in[3] & 3) << 11;
+  p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6;
+  p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 |
+         (uint16_t)(in[6] & 0xf) << 9;
+
+  for (unsigned i = 0; i < N - 1; i++) {
+    out->v[i] = (int16_t)(out->v[i] << 3) >> 3;
+  }
+
+  // There are four unused bits at the top of the final byte. They are always
+  // marshaled as zero by this code but we allow them to take any value when
+  // parsing in order to support future extension.
+
+  // Set the final coefficient as specified in [HRSSNIST] 1.9.2 step 6.
+  uint32_t sum = 0;
+  for (size_t i = 0; i < N - 1; i++) {
+    sum += out->v[i];
+  }
+
+  out->v[N - 1] = (uint16_t)(0u - sum);
+}
+
+// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may
+// have an invalid value when processing attacker-controlled inputs.
+static uint16_t mod3_from_modQ(uint16_t v) {
+  v &= 3;
+  return v ^ (v >> 1);
+}
+
+// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are
+// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). (Note that coefficients may
+// have invalid values when processing attacker-controlled inputs.)
+static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES],
+                              const struct poly *in) {
+  const uint16_t *coeffs = in->v;
+
+  // Only 700 coefficients are marshaled because in[700] must be zero.
+  assert(coeffs[N-1] == 0);
+
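+  // Each output byte holds five coefficients in base three (3^5 = 243 fits in
+  // a byte), and HRSS_POLY3_BYTES × 5 = 700 covers every marshaled
+  // coefficient.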
+  for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) {
+    const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]);
+    const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]);
+    const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]);
+    const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]);
+    const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]);
+    out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81;
+    coeffs += 5;
+  }
+}
+
+// HRSS-specific functions
+// -----------------------
+
+// poly_short_sample implements the sampling algorithm given in [HRSSNIST]
+// section 1.8.1. The output coefficients are in {0, 1, 0xffff} which makes some
+// later computation easier.
+static void poly_short_sample(struct poly *out,
+                              const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  // We wish to calculate the difference (mod 3) between two, two-bit numbers.
+  // Here is a table of results for a - b. Negative one is written as 0b11 so
+  // that a couple of shifts can be used to sign-extend it. Any input value of
+  // 0b11 is invalid and a convention is adopted that an invalid input results
+  // in an invalid output (0b10).
+  //
+  //  b  a result
+  // 00 00 00
+  // 00 01 01
+  // 00 10 11
+  // 00 11 10
+  // 01 00 11
+  // 01 01 00
+  // 01 10 01
+  // 01 11 10
+  // 10 00 01
+  // 10 01 11
+  // 10 10 00
+  // 10 11 10
+  // 11 00 10
+  // 11 01 10
+  // 11 10 10
+  // 11 11 10
+  //
+  // The result column is encoded in a single-word lookup-table:
+  // 0001 1110 1100 0110 0111 0010 1010 1010
+  //   1    e    c    6    7    2    a    a
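+  //
+  // For example, the index 0b0110 (b = 01, a = 10 in the table above) shifts
+  // the word left by twelve so that the arithmetic shift right by 30 extracts
+  // bits 19 and 18, which are 0b01, i.e. a - b = 1. An entry of 0b11
+  // sign-extends to 0xffff, which is why the outputs are in {0, 1, 0xffff}.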
+  static const uint32_t kLookup = 0x1ec672aa;
+
+  // In order to generate pairs of numbers mod 3 (non-uniformly) we treat pairs
+  // of bits in a uint32 as separate values and sum two random vectors of 1-bit
+  // numbers. This works because the pairs are isolated: no carry can spread
+  // between them.
+
+  uint16_t *p = out->v;
+  for (size_t i = 0; i < N / 8; i++) {
+    uint32_t v;
+    OPENSSL_memcpy(&v, in, sizeof(v));
+    in += sizeof(v);
+
+    uint32_t sums = (v & 0x55555555) + ((v >> 1) & 0x55555555);
+    for (unsigned j = 0; j < 8; j++) {
+      p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+      sums >>= 4;
+    }
+    p += 8;
+  }
+
+  // There are four values remaining.
+  uint16_t v;
+  OPENSSL_memcpy(&v, in, sizeof(v));
+
+  uint16_t sums = (v & 0x5555) + ((v >> 1) & 0x5555);
+  for (unsigned j = 0; j < 4; j++) {
+    p[j] = (int32_t)(kLookup << ((sums & 15) << 1)) >> 30;
+    sums >>= 4;
+  }
+
+  out->v[N - 1] = 0;
+}
+
+// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST],
+// section 1.8.2.
+static void poly_short_sample_plus(struct poly *out,
+                                   const uint8_t in[HRSS_SAMPLE_BYTES]) {
+  poly_short_sample(out, in);
+
+  // sum (and the product in the for loop) will overflow. But that's fine
+  // because |sum| is bounded by +/- (N-2), and N < 2^15 so it works out.
+  uint16_t sum = 0;
+  for (unsigned i = 0; i < N - 2; i++) {
+    sum += (unsigned) out->v[i] * out->v[i + 1];
+  }
+
+  // If the sum is negative, flip the sign of even-positioned coefficients. (See
+  // page 8 of [HRSS].)
+  sum = ((int16_t) sum) >> 15;
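+  // |sum| is now either zero or 0xffff, so |scale| below is either one or
+  // 0xffff (i.e. -1).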
+  const uint16_t scale = sum | (~sum & 1);
+  for (unsigned i = 0; i < N; i += 2) {
+    out->v[i] = (unsigned) out->v[i] * scale;
+  }
+}
+
+// poly_lift computes the function discussed in [HRSS], appendix B.
+static void poly_lift(struct poly *out, const struct poly *a) {
+  // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+  // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+  // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+  //
+  // R.<x> = PolynomialRing(GF(3)…)
+  // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+  // list(inv)[:15]
+  //   [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+  //
+  // This three-element pattern of coefficients repeats for the whole
+  // polynomial.
+  //
+  // Next define the overbar operator such that z̅ = z[0] +
+  // reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+  // of the constant term. So index one is the coefficient of 𝑥 and so
+  // on.)
+  //
+  // A less odd way to define this is to see that z̅ negates the indexes,
+  // so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+  //
+  // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+  // z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+  // of the point-wise products.) Although we calculated the inverse mod
+  // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+  // (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+  //
+  // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+  // of the list of coefficients.
+  //
+  // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+  //
+  // def reverse(xs):
+  //   suffix = list(xs[1:])
+  //   suffix.reverse()
+  //   return [xs[0]] + suffix
+  //
+  // def rotate(xs):
+  //   return [xs[-1]] + xs[:-1]
+  //
+  // zoverbar = reverse(list(inv) + [0])
+  // xzoverbar = rotate(reverse(list(inv) + [0]))
+  // x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+  //
+  // zoverbar[:15]
+  //   [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+  // xzoverbar[:15]
+  //   [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+  // x2zoverbar[:15]
+  //   [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+  //
+  // (For a formula for z̅, see lemma two of appendix B.)
+  //
+  // After the first three elements have been taken care of, all then have
+  // a repeating three-element cycle. The next value (𝑥^3z̅) involves
+  // three rotations of the first pattern, thus the three-element cycle
+  // lines up. However, the discontinuity in the first three elements
+  // obviously moves to a different position. Consider the difference
+  // between 𝑥^3z̅ and z̅:
+  //
+  // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+  //    [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+  //
+  // This pattern of differences is the same for all elements, although it
+  // obviously moves right with the rotations.
+  //
+  // From this, we reach algorithm eight of appendix B.
+
+  // Handle the first three elements of the inner products.
+  out->v[0] = a->v[0] + a->v[2];
+  out->v[1] = a->v[1];
+  out->v[2] = -a->v[0] + a->v[2];
+
+  // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2],
+  // respectively. We do not compute s1 because it's just -(s0 + s2).
+  uint16_t s0 = 0, s2 = 0;
+  for (size_t i = 3; i < 699; i += 3) {
+    s0 += -a->v[i] + a->v[i + 2];
+    // s1 += a->v[i] - a->v[i + 1];
+    s2 += a->v[i + 1] - a->v[i + 2];
+  }
+
+  // Handle the fact that the three-element pattern doesn't fill the
+  // polynomial exactly (since 701 isn't a multiple of three).
+  s0 -= a->v[699];
+  // s1 += a->v[699] - a->v[700];
+  s2 += a->v[700];
+
+  // Note that s0 + s1 + s2 = 0.
+  out->v[0] += s0;
+  out->v[1] -= (s0 + s2); // = s1
+  out->v[2] += s2;
+
+  // Calculate the remaining inner products by taking advantage of the
+  // fact that the pattern repeats every three cycles and the pattern of
+  // differences moves with the rotation.
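+  // (For example, taking i = 3 gives out[3] = out[0] - (a[1] + a[2] + a[3]),
+  // matching the difference pattern shown above.)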
+  for (size_t i = 3; i < N; i++) {
+    out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i]));
+  }
+
+  // Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+  // element and convert to mod Q. (See above about adding twice as
+  // subtraction.)
+  const crypto_word_t v = out->v[700];
+  for (unsigned i = 0; i < N; i++) {
+    const uint16_t vi_mod3 = mod3(out->v[i] - v);
+    // Map {0, 1, 2} to {0, 1, 0xffff}.
+    out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3;
+  }
+
+  poly_mul_x_minus_1(out);
+}
+
+struct public_key {
+  struct poly ph;
+};
+
+struct private_key {
+  struct poly3 f, f_inverse;
+  struct poly ph_inverse;
+  uint8_t hmac_key[32];
+};
+
+// public_key_from_external converts an external public key pointer into an
+// internal one. Externally the alignment is only specified to be eight bytes
+// but we need 16-byte alignment. We could annotate the external struct with
+// that alignment but we can only assume that malloced pointers are 8-byte
+// aligned in any case. (Even if the underlying malloc returns values with
+// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess
+// that up.)
+static struct public_key *public_key_from_external(
+    struct HRSS_public_key *ext) {
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15,
+      "HRSS public key too small");
+
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct public_key *)p;
+}
+
+// private_key_from_external does the same thing as |public_key_from_external|,
+// but for private keys. See the comment on that function about alignment
+// issues.
+static struct private_key *private_key_from_external(
+    struct HRSS_private_key *ext) {
+  OPENSSL_STATIC_ASSERT(
+      sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15,
+      "HRSS private key too small");
+
+  uintptr_t p = (uintptr_t)ext;
+  p = (p + 15) & ~15;
+  return (struct private_key *)p;
+}
+
+void HRSS_generate_key(
+    struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+    const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) {
+  struct public_key *pub = public_key_from_external(out_pub);
+  struct private_key *priv = private_key_from_external(out_priv);
+
+  OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES,
+                 sizeof(priv->hmac_key));
+
+  struct poly f;
+  poly_short_sample_plus(&f, in);
+  poly3_from_poly(&priv->f, &f);
+  HRSS_poly3_invert(&priv->f_inverse, &priv->f);
+
+  // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1).
+  struct poly pg_phi1;
+  poly_short_sample_plus(&pg_phi1, in + HRSS_SAMPLE_BYTES);
+  for (unsigned i = 0; i < N; i++) {
+    pg_phi1.v[i] *= 3;
+  }
+  poly_mul_x_minus_1(&pg_phi1);
+
+  struct poly pfg_phi1;
+  poly_mul(&pfg_phi1, &f, &pg_phi1);
+
+  struct poly pfg_phi1_inverse;
+  poly_invert(&pfg_phi1_inverse, &pfg_phi1);
+
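+  // Since pfg_phi1_inverse = (p × f × g × Φ(1))^-1, multiplying it by pg_phi1
+  // twice gives ph = p × g × Φ(1) / f, and multiplying it by f twice (below)
+  // gives ph_inverse = f / (p × g × Φ(1)), so ph × ph_inverse = 1.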
+  poly_mul(&pub->ph, &pfg_phi1_inverse, &pg_phi1);
+  poly_mul(&pub->ph, &pub->ph, &pg_phi1);
+  poly_clamp(&pub->ph);
+
+  poly_mul(&priv->ph_inverse, &pfg_phi1_inverse, &f);
+  poly_mul(&priv->ph_inverse, &priv->ph_inverse, &f);
+  poly_clamp(&priv->ph_inverse);
+}
+
+static void owf(uint8_t out[POLY_BYTES], const struct public_key *pub,
+                const struct poly *m_lifted, const struct poly *r) {
+  struct poly prh_plus_m;
+  poly_mul(&prh_plus_m, r, &pub->ph);
+  for (unsigned i = 0; i < N; i++) {
+    prh_plus_m.v[i] += m_lifted->v[i];
+  }
+
+  poly_marshal(out, &prh_plus_m);
+}
+
+static const char kConfirmationHash[] = "confirmation hash";
+static const char kSharedKey[] = "shared key";
+
+void HRSS_encap(uint8_t out_ciphertext[POLY_BYTES + 32],
+                uint8_t out_shared_key[32],
+                const struct HRSS_public_key *in_pub,
+                const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  struct poly m, r, m_lifted;
+  poly_short_sample(&m, in);
+  poly_short_sample(&r, in + HRSS_SAMPLE_BYTES);
+  poly_lift(&m_lifted, &m);
+  owf(out_ciphertext, pub, &m_lifted, &r);
+
+  uint8_t m_bytes[HRSS_POLY3_BYTES], r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &r);
+
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kConfirmationHash, sizeof(kConfirmationHash));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Final(out_ciphertext + POLY_BYTES, &hash_ctx);
+
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, out_ciphertext, POLY_BYTES + 32);
+  SHA256_Final(out_shared_key, &hash_ctx);
+}
+
+void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+                const struct HRSS_public_key *in_pub,
+                const struct HRSS_private_key *in_priv,
+                const uint8_t *ciphertext, size_t ciphertext_len) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  const struct private_key *priv =
+      private_key_from_external((struct HRSS_private_key *)in_priv);
+
+  // This is HMAC, expanded inline rather than using the |HMAC| function so that
+  // we can avoid dealing with possible allocation failures and so keep this
+  // function infallible.
+  uint8_t masked_key[SHA256_CBLOCK];
+  OPENSSL_STATIC_ASSERT(sizeof(priv->hmac_key) <= sizeof(masked_key),
+                        "HRSS HMAC key larger than SHA-256 block size");
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] = priv->hmac_key[i] ^ 0x36;
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x36,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+
+  SHA256_CTX hash_ctx;
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, ciphertext, ciphertext_len);
+  uint8_t inner_digest[SHA256_DIGEST_LENGTH];
+  SHA256_Final(inner_digest, &hash_ctx);
+
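+  // Convert the key block from the 0x36 (ipad) masking to the 0x5c (opad)
+  // masking: XORing with (0x5c ^ 0x36) leaves the key bytes XORed with 0x5c.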
+  for (size_t i = 0; i < sizeof(priv->hmac_key); i++) {
+    masked_key[i] ^= (0x5c ^ 0x36);
+  }
+  OPENSSL_memset(masked_key + sizeof(priv->hmac_key), 0x5c,
+                 sizeof(masked_key) - sizeof(priv->hmac_key));
+
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, masked_key, sizeof(masked_key));
+  SHA256_Update(&hash_ctx, inner_digest, sizeof(inner_digest));
+  OPENSSL_STATIC_ASSERT(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH,
+                        "HRSS shared key length incorrect");
+  SHA256_Final(out_shared_key, &hash_ctx);
+
+  // If the ciphertext is publicly invalid then a random shared key is still
+  // returned to simplify the logic of the caller, but this path is not constant
+  // time.
+  if (ciphertext_len != POLY_BYTES + 32) {
+    return;
+  }
+
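+  // The ciphertext encodes r×ph + Lift(m) (see |owf|). Multiplying by f and
+  // working mod 3 removes the r×ph term, since ph contains a factor of p = 3,
+  // and Lift(m) ≡ m mod (3, Φ(N)), so multiplying by f^-1 mod 3 recovers m.
+  // Subtracting Lift(m) and multiplying by ph^-1 then recovers r, and the
+  // ciphertext is recomputed from (m, r) and compared in constant time.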
+  struct poly c;
+  poly_unmarshal(&c, ciphertext);
+
+  struct poly f;
+  poly_from_poly3(&f, &priv->f);
+
+  struct poly cf;
+  poly_mul(&cf, &c, &f);
+
+  struct poly3 cf3;
+  poly3_from_poly(&cf3, &cf);
+  // Note that cf3 is not reduced mod Φ(N). That reduction is deferred.
+
+  struct poly3 m3;
+  HRSS_poly3_mul(&m3, &cf3, &priv->f_inverse);
+
+  struct poly m, m_lifted;
+  poly_from_poly3(&m, &m3);
+  poly_lift(&m_lifted, &m);
+
+  for (unsigned i = 0; i < N; i++) {
+    c.v[i] -= m_lifted.v[i];
+  }
+  poly_mul(&c, &c, &priv->ph_inverse);
+  poly_mod_phiN(&c);
+  poly_clamp(&c);
+
+  struct poly3 r3;
+  crypto_word_t ok = poly3_from_poly_checked(&r3, &c);
+
+  uint8_t expected_ciphertext[POLY_BYTES + 32];
+  assert(ciphertext_len == sizeof(expected_ciphertext));
+  owf(expected_ciphertext, pub, &m_lifted, &c);
+
+  uint8_t m_bytes[HRSS_POLY3_BYTES];
+  uint8_t r_bytes[HRSS_POLY3_BYTES];
+  poly_marshal_mod3(m_bytes, &m);
+  poly_marshal_mod3(r_bytes, &c);
+
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kConfirmationHash, sizeof(kConfirmationHash));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Final(expected_ciphertext + POLY_BYTES, &hash_ctx);
+
+  ok &= constant_time_is_zero_w(CRYPTO_memcmp(ciphertext, expected_ciphertext,
+                                              sizeof(expected_ciphertext)));
+
+  uint8_t shared_key[32];
+  SHA256_Init(&hash_ctx);
+  SHA256_Update(&hash_ctx, kSharedKey, sizeof(kSharedKey));
+  SHA256_Update(&hash_ctx, m_bytes, sizeof(m_bytes));
+  SHA256_Update(&hash_ctx, r_bytes, sizeof(r_bytes));
+  SHA256_Update(&hash_ctx, expected_ciphertext, sizeof(expected_ciphertext));
+  SHA256_Final(shared_key, &hash_ctx);
+
+  for (unsigned i = 0; i < sizeof(shared_key); i++) {
+    out_shared_key[i] =
+        constant_time_select_8(ok, shared_key[i], out_shared_key[i]);
+  }
+}
+
+void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES],
+                             const struct HRSS_public_key *in_pub) {
+  const struct public_key *pub =
+      public_key_from_external((struct HRSS_public_key *)in_pub);
+  poly_marshal(out, &pub->ph);
+}
+
+int HRSS_parse_public_key(struct HRSS_public_key *out,
+                          const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) {
+  struct public_key *pub = public_key_from_external(out);
+  poly_unmarshal(&pub->ph, in);
+  OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t));
+  return 1;
+}
diff --git a/crypto/hrss/hrss_test.cc b/crypto/hrss/hrss_test.cc
new file mode 100644
index 0000000..d23e68e
--- /dev/null
+++ b/crypto/hrss/hrss_test.cc
@@ -0,0 +1,475 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <gtest/gtest.h>
+
+#include <openssl/hrss.h>
+#include <openssl/rand.h>
+
+#include "../test/test_util.h"
+#include "internal.h"
+
+// poly2_from_bits takes the least-significant bit from each byte of |in| and
+// sets the bits of |*out| to match.
+static void poly2_from_bits(struct poly2 *out, const uint8_t in[N]) {
+  crypto_word_t *words = out->v;
+  unsigned shift = 0;
+  crypto_word_t word = 0;
+
+  for (unsigned i = 0; i < N; i++) {
+    word >>= 1;
+    word |= (crypto_word_t)(in[i] & 1) << (BITS_PER_WORD - 1);
+    shift++;
+
+    if (shift == BITS_PER_WORD) {
+      *words = word;
+      words++;
+      word = 0;
+      shift = 0;
+    }
+  }
+
+  word >>= BITS_PER_WORD - shift;
+  *words = word;
+}
+
+TEST(HRSS, Poly2RotateRight) {
+  uint8_t bits[N];
+  RAND_bytes(bits, sizeof(bits));
+  for (size_t i = 0; i < N; i++) {
+    bits[i] &= 1;
+  }
+
+  struct poly2 p, orig, shifted;
+  poly2_from_bits(&p, bits);
+  OPENSSL_memcpy(&orig, &p, sizeof(orig));
+
+  // Test |HRSS_poly2_rotr_consttime| by manually rotating |bits| step-by-step
+  // and testing every possible shift to ensure that it produces the correct
+  // answer.
+  for (size_t shift = 0; shift <= N; shift++) {
+    SCOPED_TRACE(shift);
+
+    OPENSSL_memcpy(&p, &orig, sizeof(orig));
+    HRSS_poly2_rotr_consttime(&p, shift);
+    poly2_from_bits(&shifted, bits);
+    ASSERT_EQ(
+        Bytes(reinterpret_cast<const uint8_t *>(&shifted), sizeof(shifted)),
+        Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)));
+
+    const uint8_t least_significant_bit = bits[0];
+    OPENSSL_memmove(bits, &bits[1], N-1);
+    bits[N-1] = least_significant_bit;
+  }
+}
+
+// poly3_rand sets |p| to a random value (albeit with bias).
+static void poly3_rand(poly3 *p) {
+  RAND_bytes(reinterpret_cast<uint8_t *>(p), sizeof(poly3));
+  p->s.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  p->a.v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1;
+  // (s, a) = (1, 1) is invalid. Map those to one.
+  for (size_t j = 0; j < WORDS_PER_POLY; j++) {
+    p->s.v[j] ^= p->s.v[j] & p->a.v[j];
+  }
+}
+
+// poly3_word_add sets (|s1|, |a1|) += (|s2|, |a2|).
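+// Each bit position is treated as an independent GF(3) value in the (s, a)
+// encoding used by |poly3|.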
+static void poly3_word_add(crypto_word_t *s1, crypto_word_t *a1,
+                           const crypto_word_t s2, const crypto_word_t a2) {
+  const crypto_word_t x = *a1 ^ a2;
+  const crypto_word_t y = (*s1 ^ s2) ^ (*a1 & a2);
+  const crypto_word_t z = *s1 & s2;
+  *s1 = y & ~x;
+  *a1 = z | (x & ~y);
+}
+
+TEST(HRSS, Poly3Invert) {
+  poly3 p, inverse, result;
+  memset(&p, 0, sizeof(p));
+  memset(&inverse, 0, sizeof(inverse));
+  memset(&result, 0, sizeof(result));
+
+  // The inverse of -1 is -1.
+  p.s.v[0] = 1;
+  HRSS_poly3_invert(&inverse, &p);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)),
+            Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse)));
+
+  // The inverse of 1 is 1.
+  p.s.v[0] = 0;
+  p.a.v[0] = 1;
+  HRSS_poly3_invert(&inverse, &p);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t*>(&p), sizeof(p)),
+            Bytes(reinterpret_cast<const uint8_t*>(&inverse), sizeof(inverse)));
+
+  for (size_t i = 0; i < 500; i++) {
+    poly3 r;
+    poly3_rand(&r);
+    HRSS_poly3_invert(&inverse, &r);
+    HRSS_poly3_mul(&result, &inverse, &r);
+    // r×r⁻¹ = 1, and |p| contains 1.
+    EXPECT_EQ(
+        Bytes(reinterpret_cast<const uint8_t *>(&p), sizeof(p)),
+        Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+  }
+}
+
+TEST(HRSS, Poly3UnreducedInput) {
+  // Check that |HRSS_poly3_mul| works correctly with inputs that aren't
+  // reduced mod Φ(N).
+  poly3 r, inverse, result, one;
+  poly3_rand(&r);
+  HRSS_poly3_invert(&inverse, &r);
+  HRSS_poly3_mul(&result, &inverse, &r);
+
+  memset(&one, 0, sizeof(one));
+  one.a.v[0] = 1;
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+            Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+  // |r| is probably already not reduced mod Φ(N), but add x^701 - 1 and
+  // recompute to ensure that we get the same answer. (Since (x^701 - 1) ≡ 0 mod
+  // Φ(N).)
+  poly3_word_add(&r.s.v[0], &r.a.v[0], 1, 0);
+  poly3_word_add(&r.s.v[WORDS_PER_POLY - 1], &r.a.v[WORDS_PER_POLY - 1], 0,
+                 UINT64_C(1) << BITS_IN_LAST_WORD);
+
+  HRSS_poly3_mul(&result, &inverse, &r);
+  EXPECT_EQ(Bytes(reinterpret_cast<const uint8_t *>(&one), sizeof(one)),
+            Bytes(reinterpret_cast<const uint8_t *>(&result), sizeof(result)));
+
+  // Check that x^700 × 1 gives -x^699 - x^698 … -1.
+  poly3 x700;
+  memset(&x700, 0, sizeof(x700));
+  x700.a.v[WORDS_PER_POLY-1] = UINT64_C(1) << (BITS_IN_LAST_WORD - 1);
+  HRSS_poly3_mul(&result, &one, &x700);
+
+  for (size_t i = 0; i < WORDS_PER_POLY-1; i++) {
+    EXPECT_EQ(CONSTTIME_TRUE_W, result.s.v[i]);
+    EXPECT_EQ(0u, result.a.v[i]);
+  }
+  EXPECT_EQ((UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1,
+            result.s.v[WORDS_PER_POLY - 1]);
+  EXPECT_EQ(0u, result.a.v[WORDS_PER_POLY - 1]);
+}
+
+TEST(HRSS, Basic) {
+  uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+  for (unsigned i = 0; i < sizeof(generate_key_entropy); i++) {
+    generate_key_entropy[i] = i;
+  }
+
+  HRSS_public_key pub;
+  HRSS_private_key priv;
+  HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+  uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+  for (unsigned i = 0; i < sizeof(encap_entropy); i++) {
+    encap_entropy[i] = i;
+  }
+
+  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+  uint8_t shared_key[HRSS_KEY_BYTES];
+  HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);
+
+  HRSS_public_key pub2;
+  uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES];
+  HRSS_marshal_public_key(pub_bytes, &pub);
+  ASSERT_TRUE(HRSS_parse_public_key(&pub2, pub_bytes));
+
+  uint8_t shared_key2[HRSS_KEY_BYTES];
+  HRSS_decap(shared_key2, &pub2, &priv, ciphertext, sizeof(ciphertext));
+
+  EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2));
+}
+
+TEST(HRSS, Random) {
+  for (unsigned i = 0; i < 10; i++) {
+    uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+    RAND_bytes(generate_key_entropy, sizeof(generate_key_entropy));
+    SCOPED_TRACE(Bytes(generate_key_entropy));
+
+    HRSS_public_key pub;
+    HRSS_private_key priv;
+    HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+    for (unsigned j = 0; j < 10; j++) {
+      uint8_t encap_entropy[HRSS_ENCAP_BYTES];
+      RAND_bytes(encap_entropy, sizeof(encap_entropy));
+      SCOPED_TRACE(Bytes(generate_key_entropy));
+
+      uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+      uint8_t shared_key[HRSS_KEY_BYTES];
+      HRSS_encap(ciphertext, shared_key, &pub, encap_entropy);
+
+      uint8_t shared_key2[HRSS_KEY_BYTES];
+      HRSS_decap(shared_key2, &pub, &priv, ciphertext, sizeof(ciphertext));
+
+      EXPECT_EQ(Bytes(shared_key), Bytes(shared_key2));
+    }
+  }
+}
+
+TEST(HRSS, Golden) {
+  uint8_t generate_key_entropy[HRSS_GENERATE_KEY_BYTES];
+  for (unsigned i = 0; i < HRSS_SAMPLE_BYTES; i++) {
+    generate_key_entropy[i] = i;
+  }
+  for (unsigned i = HRSS_SAMPLE_BYTES; i < 2 * HRSS_SAMPLE_BYTES; i++) {
+    generate_key_entropy[i] = 2 + i;
+  }
+  for (unsigned i = 2 * HRSS_SAMPLE_BYTES; i < sizeof(generate_key_entropy);
+       i++) {
+    generate_key_entropy[i] = 4 + i;
+  }
+
+  HRSS_public_key pub;
+  HRSS_private_key priv;
+  OPENSSL_memset(&pub, 0, sizeof(pub));
+  OPENSSL_memset(&priv, 0, sizeof(priv));
+  HRSS_generate_key(&pub, &priv, generate_key_entropy);
+
+  static const uint8_t kExpectedPub[HRSS_PUBLIC_KEY_BYTES] = {
+      0xf8, 0x9f, 0xa0, 0xfc, 0xf1, 0xd4, 0xfa, 0x4d, 0x8f, 0x35, 0x28, 0x73,
+      0x0e, 0x37, 0x18, 0x1d, 0x09, 0xf3, 0x9e, 0x16, 0x0d, 0x7f, 0x9c, 0x82,
+      0x17, 0xa1, 0xa1, 0x88, 0x6b, 0x29, 0x5b, 0x3a, 0x30, 0xcd, 0x6f, 0x8e,
+      0x0c, 0xd3, 0x38, 0x0c, 0x05, 0x68, 0x6e, 0x4c, 0xcc, 0x20, 0xd4, 0x06,
+      0x77, 0x0c, 0xac, 0x1c, 0x49, 0x14, 0x00, 0xd6, 0x9b, 0x1c, 0xde, 0x43,
+      0x0a, 0x59, 0x37, 0xd6, 0x46, 0x68, 0x1f, 0x04, 0xcb, 0x73, 0x92, 0x37,
+      0x2d, 0x7f, 0x57, 0x70, 0x16, 0xe8, 0x06, 0x48, 0x3b, 0x66, 0xb3, 0x63,
+      0x02, 0x5a, 0x71, 0x46, 0xdd, 0xa4, 0xee, 0xb8, 0x78, 0x44, 0xfd, 0x9e,
+      0xd0, 0x71, 0x16, 0x00, 0xbd, 0x01, 0x1e, 0x27, 0x2e, 0xa0, 0xc6, 0x8d,
+      0x55, 0x89, 0x7c, 0x2a, 0x01, 0x2b, 0x1b, 0x75, 0xa2, 0xc2, 0xd1, 0x5a,
+      0x67, 0xfa, 0xdd, 0x3b, 0x70, 0x9d, 0xdb, 0xcd, 0x73, 0x32, 0x5e, 0x24,
+      0xb1, 0xcf, 0x23, 0xbe, 0x3c, 0x56, 0xcc, 0xbe, 0x61, 0xdb, 0xe7, 0x3c,
+      0xc7, 0xf5, 0x09, 0xe6, 0x87, 0xa0, 0x09, 0x52, 0x9d, 0x61, 0x5b, 0xc6,
+      0xd4, 0xc5, 0x2e, 0xc2, 0x6c, 0x87, 0x30, 0x36, 0x49, 0x6f, 0x04, 0xaa,
+      0xb3, 0x26, 0xd5, 0x63, 0xcf, 0xd4, 0x74, 0x1e, 0xc7, 0x79, 0xb3, 0xfc,
+      0x8c, 0x41, 0x36, 0x79, 0xaa, 0xd5, 0xba, 0x64, 0x49, 0x48, 0xdb, 0xeb,
+      0xe8, 0x33, 0x7d, 0xbe, 0x3b, 0x67, 0xd7, 0xfd, 0x93, 0x1e, 0x80, 0x8d,
+      0x17, 0xab, 0x6f, 0xfd, 0x1c, 0x4b, 0x2d, 0x5b, 0x90, 0xf0, 0xf0, 0x5d,
+      0xbe, 0x8f, 0x81, 0x18, 0x29, 0x08, 0x9a, 0x47, 0x1b, 0xc2, 0x2d, 0xa2,
+      0x22, 0x5a, 0x4f, 0xe9, 0x81, 0x64, 0xdd, 0x53, 0x2e, 0x67, 0xe5, 0x07,
+      0x1a, 0xf0, 0x0c, 0x54, 0x9b, 0xe2, 0xf8, 0xe6, 0xb3, 0xb6, 0xe0, 0x5a,
+      0x74, 0xfa, 0x8d, 0x9c, 0xa5, 0x7c, 0x6e, 0x73, 0xba, 0xee, 0x6e, 0x6e,
+      0x31, 0xcb, 0x59, 0xd7, 0xfd, 0x94, 0x1c, 0x4d, 0x62, 0xc6, 0x87, 0x0b,
+      0x38, 0x54, 0xc6, 0x35, 0xac, 0xc8, 0x8c, 0xc0, 0xd9, 0x99, 0xee, 0xfc,
+      0xa9, 0xde, 0xc4, 0x50, 0x88, 0x8e, 0x24, 0xf6, 0xd6, 0x04, 0x54, 0x3e,
+      0x81, 0xc4, 0x96, 0x9a, 0x40, 0xe5, 0xef, 0x8b, 0xec, 0x41, 0x50, 0x1d,
+      0x14, 0xae, 0xa4, 0x5a, 0xac, 0xd4, 0x73, 0x31, 0xc3, 0x1d, 0xc1, 0x96,
+      0x89, 0xd8, 0x62, 0x97, 0x60, 0x3f, 0x58, 0x2a, 0x5f, 0xcf, 0xcb, 0x26,
+      0x99, 0x69, 0x81, 0x13, 0x9c, 0xaf, 0x17, 0x91, 0xa8, 0xeb, 0x9a, 0xf9,
+      0xd3, 0x83, 0x47, 0x66, 0xc7, 0xf8, 0xd8, 0xe3, 0xd2, 0x7e, 0x58, 0xa9,
+      0xf5, 0xb2, 0x03, 0xbe, 0x7e, 0xa5, 0x29, 0x9d, 0xff, 0xd1, 0xd8, 0x55,
+      0x39, 0xc7, 0x2c, 0xce, 0x03, 0x64, 0xdc, 0x18, 0xe7, 0xb0, 0x60, 0x46,
+      0x26, 0xeb, 0xb7, 0x61, 0x4b, 0x91, 0x2c, 0xd8, 0xa2, 0xee, 0x63, 0x2e,
+      0x15, 0x0a, 0x58, 0x88, 0x04, 0xb1, 0xed, 0x6d, 0xf1, 0x5c, 0xc7, 0xee,
+      0x60, 0x38, 0x26, 0xc9, 0x31, 0x7e, 0x69, 0xe4, 0xac, 0x3c, 0x72, 0x09,
+      0x3e, 0xe6, 0x24, 0x30, 0x44, 0x6e, 0x66, 0x83, 0xb9, 0x2a, 0x22, 0xaf,
+      0x26, 0x1e, 0xaa, 0xa3, 0xf4, 0xb1, 0xa1, 0x5c, 0xfa, 0x5f, 0x0d, 0x71,
+      0xac, 0xe3, 0xe0, 0xc3, 0xdd, 0x4f, 0x96, 0x57, 0x8b, 0x58, 0xac, 0xe3,
+      0x42, 0x8e, 0x47, 0x72, 0xb1, 0xe4, 0x19, 0x68, 0x3e, 0xbb, 0x19, 0x14,
+      0xdf, 0x16, 0xb5, 0xde, 0x7f, 0x37, 0xaf, 0xd8, 0xd3, 0x3d, 0x6a, 0x16,
+      0x1b, 0x26, 0xd3, 0xcc, 0x53, 0x82, 0x57, 0x90, 0x89, 0xc5, 0x7e, 0x6d,
+      0x7e, 0x99, 0x5b, 0xcd, 0xd3, 0x18, 0xbb, 0x89, 0xef, 0x76, 0xbd, 0xd2,
+      0x62, 0xf0, 0xe8, 0x25, 0x2a, 0x8d, 0xe2, 0x21, 0xea, 0xde, 0x6e, 0xa5,
+      0xa4, 0x3d, 0x58, 0xee, 0xdf, 0x90, 0xc1, 0xa1, 0x38, 0x5d, 0x11, 0x50,
+      0xb5, 0xac, 0x9d, 0xb4, 0xfd, 0xef, 0x53, 0xe8, 0xc0, 0x17, 0x6c, 0x4f,
+      0x31, 0xe0, 0xcc, 0x8f, 0x80, 0x7a, 0x84, 0x14, 0xde, 0xee, 0xec, 0xdd,
+      0x6a, 0xad, 0x29, 0x65, 0xa5, 0x72, 0xc3, 0x73, 0x5f, 0xe3, 0x6f, 0x60,
+      0xb1, 0xfb, 0x0f, 0xaa, 0xc6, 0xda, 0x53, 0x4a, 0xb1, 0x92, 0x2a, 0xb7,
+      0x02, 0xbe, 0xf9, 0xdf, 0x37, 0x16, 0xe7, 0x5c, 0x38, 0x0b, 0x3c, 0xe2,
+      0xdd, 0x90, 0xb8, 0x7b, 0x48, 0x69, 0x79, 0x81, 0xc5, 0xae, 0x9a, 0x0d,
+      0x78, 0x95, 0x52, 0x63, 0x80, 0xda, 0x46, 0x69, 0x20, 0x57, 0x9b, 0x27,
+      0xe2, 0xe8, 0xbd, 0x2f, 0x45, 0xe6, 0x46, 0x40, 0xae, 0x50, 0xd5, 0xa2,
+      0x53, 0x93, 0xe1, 0x99, 0xfd, 0x13, 0x7c, 0xf6, 0x22, 0xc4, 0x6c, 0xab,
+      0xe3, 0xc9, 0x55, 0x0a, 0x16, 0x67, 0x68, 0x26, 0x6b, 0xd6, 0x7d, 0xde,
+      0xd3, 0xae, 0x71, 0x32, 0x02, 0xf1, 0x27, 0x67, 0x47, 0x74, 0xd9, 0x40,
+      0x35, 0x1d, 0x25, 0x72, 0x32, 0xdf, 0x75, 0xd5, 0x60, 0x26, 0xab, 0x90,
+      0xfa, 0xeb, 0x26, 0x11, 0x4b, 0xb4, 0xc5, 0xc2, 0x3e, 0xa9, 0x23, 0x3a,
+      0x4e, 0x6a, 0xb1, 0xbb, 0xb3, 0xea, 0xf9, 0x1e, 0xe4, 0x10, 0xf5, 0xdc,
+      0x35, 0xde, 0xb5, 0xee, 0xf0, 0xde, 0xa1, 0x18, 0x80, 0xc7, 0x13, 0x68,
+      0x46, 0x94, 0x0e, 0x2a, 0x8e, 0xf8, 0xe9, 0x26, 0x84, 0x42, 0x0f, 0x56,
+      0xed, 0x67, 0x7f, 0xeb, 0x7d, 0x35, 0x07, 0x01, 0x11, 0x81, 0x8b, 0x56,
+      0x88, 0xc6, 0x58, 0x61, 0x65, 0x3c, 0x5d, 0x9c, 0x58, 0x25, 0xd6, 0xdf,
+      0x4e, 0x3b, 0x93, 0xbf, 0x82, 0xe1, 0x19, 0xb8, 0xda, 0xde, 0x26, 0x38,
+      0xf2, 0xd9, 0x95, 0x24, 0x98, 0xde, 0x58, 0xf7, 0x0c, 0xe9, 0x32, 0xbb,
+      0xcc, 0xf7, 0x92, 0x69, 0xa2, 0xf0, 0xc3, 0xfa, 0xd2, 0x31, 0x8b, 0x43,
+      0x4e, 0x03, 0xe2, 0x13, 0x79, 0x6e, 0x73, 0x63, 0x3b, 0x45, 0xde, 0x80,
+      0xf4, 0x26, 0xb1, 0x38, 0xed, 0x62, 0x55, 0xc6, 0x6a, 0x67, 0x00, 0x2d,
+      0xba, 0xb2, 0xc5, 0xb6, 0x97, 0x62, 0x28, 0x64, 0x30, 0xb9, 0xfb, 0x3f,
+      0x94, 0x03, 0x48, 0x36, 0x2c, 0x5d, 0xfd, 0x08, 0x96, 0x40, 0xd1, 0x6c,
+      0xe5, 0xd0, 0xf8, 0x99, 0x40, 0x82, 0x87, 0xd7, 0xdc, 0x2f, 0x8b, 0xaa,
+      0x31, 0x96, 0x0a, 0x34, 0x33, 0xa6, 0xf1, 0x84, 0x6e, 0x33, 0x73, 0xc5,
+      0xe3, 0x26, 0xad, 0xd0, 0xcb, 0x62, 0x71, 0x82, 0xab, 0xd1, 0x82, 0x33,
+      0xe6, 0xca, 0xd0, 0x3e, 0xf5, 0x4d, 0x12, 0x6e, 0xf1, 0x83, 0xbd, 0xdc,
+      0x4d, 0xdf, 0x49, 0xbc, 0x63, 0xae, 0x7e, 0x59, 0xe8, 0x3c, 0x0d, 0xd6,
+      0x1d, 0x41, 0x89, 0x72, 0x52, 0xc0, 0xae, 0xd1, 0x2f, 0x0a, 0x8a, 0xce,
+      0x26, 0xd0, 0x3e, 0x0c, 0x71, 0x32, 0x52, 0xb2, 0xe4, 0xee, 0xa2, 0xe5,
+      0x28, 0xb6, 0x33, 0x69, 0x97, 0x5a, 0x53, 0xdb, 0x56, 0x63, 0xe9, 0xb3,
+      0x6d, 0x60, 0xf4, 0x7a, 0xce, 0xec, 0x36, 0x65, 0xd5, 0xca, 0x63, 0x2a,
+      0x19, 0x90, 0x14, 0x7b, 0x02, 0x33, 0xfa, 0x11, 0x58, 0x5a, 0xd9, 0xc5,
+      0x54, 0xf3, 0x28, 0xd5, 0x6e, 0xea, 0x85, 0xf5, 0x09, 0xbb, 0x81, 0x44,
+      0x1c, 0x63, 0x66, 0x81, 0xc5, 0x96, 0x2d, 0x7c, 0x0e, 0x75, 0x7b, 0xb4,
+      0x7e, 0x4e, 0x0c, 0xfd, 0x3c, 0xc5, 0x5a, 0x22, 0x85, 0x5c, 0xc8, 0xf3,
+      0x97, 0x98, 0x2c, 0xe9, 0x46, 0xb4, 0x02, 0xcf, 0x7d, 0xa4, 0xf2, 0x44,
+      0x7a, 0x89, 0x71, 0xa0, 0xfa, 0xb6, 0xa3, 0xaf, 0x13, 0x25, 0x46, 0xe2,
+      0x64, 0xe3, 0x69, 0xba, 0xf9, 0x68, 0x5c, 0xc0, 0xb7, 0xa8, 0xa6, 0x4b,
+      0xe1, 0x42, 0xe9, 0xb5, 0xc7, 0x84, 0xbb, 0xa6, 0x4b, 0x10, 0x4e, 0xd4,
+      0x68, 0x70, 0x0a, 0x75, 0x2a, 0xbb, 0x9d, 0xa0, 0xcb, 0xf0, 0x36, 0x4c,
+      0x70, 0x6c, 0x60, 0x4d, 0xfe, 0xe8, 0xc8, 0x66, 0x80, 0x1b, 0xf7, 0xcc,
+      0x1a, 0xdd, 0x6b, 0xa7, 0xa7, 0x25, 0x61, 0x0c, 0x31, 0xf0, 0x34, 0x63,
+      0x00, 0x0e, 0x48, 0x6a, 0x5a, 0x8d, 0x47, 0x94, 0x3f, 0x14, 0x16, 0xa8,
+      0x8a, 0x49, 0xbb, 0x0c, 0x43, 0x21, 0xda, 0xf2, 0xc5, 0xd0, 0xff, 0x19,
+      0x3e, 0x36, 0x64, 0x20, 0xb3, 0x70, 0xae, 0x54, 0xca, 0x73, 0x05, 0x56,
+      0x7a, 0x49, 0x45, 0xe9, 0x46, 0xbc, 0xc2, 0x61, 0x70, 0x40, 0x7c, 0xb0,
+      0xf7, 0xea, 0xc0, 0xd1, 0xb0, 0x77, 0x2c, 0xc7, 0xdd, 0x88, 0xcb, 0x9d,
+      0xea, 0x55, 0x6c, 0x5c, 0x28, 0xb8, 0x84, 0x1c, 0x2c, 0x06,
+  };
+  uint8_t pub_bytes[HRSS_PUBLIC_KEY_BYTES];
+  HRSS_marshal_public_key(pub_bytes, &pub);
+  EXPECT_EQ(Bytes(pub_bytes), Bytes(kExpectedPub));
+
+  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+  uint8_t shared_key[HRSS_KEY_BYTES];
+  OPENSSL_STATIC_ASSERT(
+      sizeof(kExpectedPub) >= HRSS_ENCAP_BYTES,
+      "Private key too small to use as input to HRSS encapsulation");
+  HRSS_encap(ciphertext, shared_key, &pub, kExpectedPub);
+
+  static const uint8_t kExpectedCiphertext[HRSS_CIPHERTEXT_BYTES] = {
+      0x8e, 0x6b, 0x46, 0x9d, 0x4a, 0xef, 0xa6, 0x8c, 0x28, 0x7b, 0xec, 0x6f,
+      0x13, 0x2d, 0x7f, 0x6c, 0xca, 0x7d, 0x9e, 0x6b, 0x54, 0x62, 0xa3, 0x13,
+      0xe1, 0x1e, 0x8f, 0x5f, 0x71, 0x67, 0xc4, 0x85, 0xdf, 0xd5, 0x6b, 0xbd,
+      0x86, 0x0f, 0x98, 0xec, 0xa5, 0x04, 0xf7, 0x7b, 0x2a, 0xbe, 0xcb, 0xac,
+      0x29, 0xbe, 0xe1, 0x0f, 0xbc, 0x62, 0x87, 0x85, 0x7f, 0x05, 0xae, 0xe4,
+      0x3f, 0x87, 0xfc, 0x1f, 0xf7, 0x45, 0x1e, 0xa3, 0xdb, 0xb1, 0xa0, 0x25,
+      0xba, 0x82, 0xec, 0xca, 0x8d, 0xab, 0x7a, 0x20, 0x03, 0xeb, 0xe5, 0x5c,
+      0x9f, 0xd0, 0x46, 0x78, 0xf1, 0x5a, 0xc7, 0x9e, 0xb4, 0x10, 0x6d, 0x37,
+      0xc0, 0x75, 0x08, 0xfb, 0xeb, 0xcb, 0xd8, 0x35, 0x21, 0x9b, 0x89, 0xa0,
+      0xaa, 0x87, 0x00, 0x66, 0x38, 0x37, 0x68, 0xa4, 0xa3, 0x93, 0x8e, 0x2b,
+      0xca, 0xf7, 0x7a, 0x43, 0xb2, 0x15, 0x79, 0x81, 0xce, 0xa9, 0x09, 0xcb,
+      0x29, 0xd4, 0xcc, 0xef, 0xf1, 0x9b, 0xbd, 0xe6, 0x63, 0xd5, 0x26, 0x0f,
+      0xe8, 0x8b, 0xdf, 0xf1, 0xc3, 0xb4, 0x18, 0x0e, 0xf2, 0x1d, 0x5d, 0x82,
+      0x9b, 0x1f, 0xf3, 0xca, 0x36, 0x2a, 0x26, 0x0a, 0x7f, 0xc4, 0x0d, 0xbd,
+      0x5b, 0x15, 0x1c, 0x18, 0x6c, 0x11, 0x4e, 0xec, 0x36, 0x01, 0xc1, 0x15,
+      0xab, 0xf7, 0x0b, 0x1a, 0xd3, 0xa1, 0xbd, 0x68, 0xc8, 0x59, 0xe7, 0x49,
+      0x5c, 0xd5, 0x4b, 0x8c, 0x31, 0xdb, 0xb3, 0xea, 0x88, 0x09, 0x2f, 0xb9,
+      0x8b, 0xfd, 0x96, 0x35, 0x88, 0x53, 0x72, 0x40, 0xcd, 0x89, 0x75, 0xb4,
+      0x20, 0xf6, 0xf6, 0xe5, 0x74, 0x19, 0x48, 0xaf, 0x4b, 0xaa, 0x42, 0xa4,
+      0xc8, 0x90, 0xee, 0xf3, 0x12, 0x04, 0x63, 0x90, 0x92, 0x8a, 0x89, 0xc3,
+      0xa0, 0x7e, 0xfe, 0x19, 0xb3, 0x54, 0x53, 0x83, 0xe9, 0xc1, 0x6c, 0xe3,
+      0x97, 0xa6, 0x27, 0xc3, 0x20, 0x9a, 0x79, 0x35, 0xc9, 0xb5, 0xc0, 0x90,
+      0xe1, 0x56, 0x84, 0x69, 0xc2, 0x54, 0x77, 0x52, 0x48, 0x55, 0x71, 0x3e,
+      0xcd, 0xa7, 0xd6, 0x25, 0x5d, 0x49, 0x13, 0xd2, 0x59, 0xd7, 0xe1, 0xd1,
+      0x70, 0x46, 0xa0, 0xd4, 0xee, 0x59, 0x13, 0x1f, 0x1a, 0xd3, 0x39, 0x7d,
+      0xb0, 0x79, 0xf7, 0xc0, 0x73, 0x5e, 0xbb, 0x08, 0xf7, 0x5c, 0xb0, 0x31,
+      0x41, 0x3d, 0x7b, 0x1e, 0xf0, 0xe6, 0x47, 0x5c, 0x37, 0xd5, 0x54, 0xf1,
+      0xbb, 0x64, 0xd7, 0x41, 0x8b, 0x34, 0x55, 0xaa, 0xc3, 0x5a, 0x9c, 0xa0,
+      0xcc, 0x29, 0x8e, 0x5a, 0x1a, 0x93, 0x5a, 0x49, 0xd3, 0xd0, 0xa0, 0x56,
+      0xda, 0x32, 0xa2, 0xa9, 0xa7, 0x13, 0x42, 0x93, 0x9b, 0x20, 0x32, 0x37,
+      0x5c, 0x3e, 0x03, 0xa5, 0x28, 0x10, 0x93, 0xdd, 0xa0, 0x04, 0x7b, 0x2a,
+      0xbd, 0x31, 0xc3, 0x6a, 0x89, 0x58, 0x6e, 0x55, 0x0e, 0xc9, 0x5c, 0x70,
+      0x07, 0x10, 0xf1, 0x9a, 0xbd, 0xfb, 0xd2, 0xb7, 0x94, 0x5b, 0x4f, 0x8d,
+      0x90, 0xfa, 0xee, 0xae, 0x37, 0x48, 0xc5, 0xf8, 0x16, 0xa1, 0x3b, 0x70,
+      0x03, 0x1f, 0x0e, 0xb8, 0xbd, 0x8d, 0x30, 0x4f, 0x95, 0x31, 0x0b, 0x9f,
+      0xfc, 0x80, 0xf8, 0xef, 0xa3, 0x3c, 0xbc, 0xe2, 0x23, 0x23, 0x3e, 0x2a,
+      0x55, 0x11, 0xe8, 0x2c, 0x17, 0xea, 0x1c, 0xbd, 0x1d, 0x2d, 0x1b, 0xd5,
+      0x16, 0x9e, 0x05, 0xfc, 0x89, 0x64, 0x50, 0x4d, 0x9a, 0x22, 0x50, 0xc6,
+      0x5a, 0xd9, 0x58, 0x99, 0x8f, 0xbd, 0xf2, 0x4f, 0x2c, 0xdb, 0x51, 0x6a,
+      0x86, 0xe2, 0xc6, 0x64, 0x8f, 0x54, 0x1a, 0xf2, 0xcb, 0x34, 0x88, 0x08,
+      0xbd, 0x2a, 0x8f, 0xec, 0x29, 0xf5, 0x22, 0x36, 0x83, 0x99, 0xb9, 0x71,
+      0x8c, 0x99, 0x5c, 0xec, 0x91, 0x78, 0xc1, 0xe2, 0x2d, 0xe9, 0xd1, 0x4d,
+      0xf5, 0x15, 0x93, 0x4d, 0x93, 0x92, 0x9f, 0x0f, 0x33, 0x5e, 0xcd, 0x58,
+      0x5f, 0x3d, 0x52, 0xb9, 0x38, 0x6a, 0x85, 0x63, 0x8b, 0x63, 0x29, 0xcb,
+      0x67, 0x12, 0x25, 0xc2, 0x44, 0xd7, 0xab, 0x1a, 0x24, 0xca, 0x3d, 0xca,
+      0x77, 0xce, 0x28, 0x68, 0x1a, 0x91, 0xed, 0x7b, 0xc9, 0x70, 0x84, 0xab,
+      0xe2, 0xd4, 0xf4, 0xac, 0x58, 0xf6, 0x70, 0x99, 0xfc, 0x99, 0x4d, 0xbd,
+      0xb4, 0x1b, 0x4f, 0x15, 0x86, 0x95, 0x08, 0xd1, 0x4e, 0x73, 0xa9, 0xbc,
+      0x6a, 0x8c, 0xbc, 0xb5, 0x4b, 0xe0, 0xee, 0x35, 0x24, 0xf9, 0x12, 0xf5,
+      0x88, 0x70, 0x50, 0x6c, 0xfe, 0x0d, 0x35, 0xbd, 0xf7, 0xc4, 0x2e, 0x39,
+      0x16, 0x30, 0x6c, 0xf3, 0xb2, 0x19, 0x44, 0xaa, 0xcb, 0x4a, 0xf6, 0x75,
+      0xb7, 0x09, 0xb9, 0xe1, 0x47, 0x71, 0x70, 0x5c, 0x05, 0x5f, 0x50, 0x50,
+      0x9c, 0xd0, 0xe3, 0xc7, 0x91, 0xee, 0x6b, 0xc7, 0x0f, 0x71, 0x1b, 0xc3,
+      0x48, 0x8b, 0xed, 0x15, 0x26, 0x8c, 0xc3, 0xd5, 0x54, 0x08, 0xcc, 0x33,
+      0x79, 0xc0, 0x9f, 0x49, 0xc8, 0x75, 0xef, 0xb6, 0xf3, 0x29, 0x89, 0xfd,
+      0x75, 0xd1, 0xda, 0x92, 0xc3, 0x13, 0xc6, 0x76, 0x51, 0x11, 0x40, 0x7b,
+      0x82, 0xf7, 0x30, 0x79, 0x49, 0x04, 0xe3, 0xbb, 0x61, 0x34, 0xa6, 0x58,
+      0x0b, 0x7d, 0xef, 0x3e, 0xf9, 0xb3, 0x8d, 0x2a, 0xba, 0xe9, 0xbc, 0xc0,
+      0xa7, 0xe6, 0x6c, 0xda, 0xf8, 0x8c, 0xdf, 0x8d, 0x96, 0x83, 0x2d, 0x80,
+      0x4f, 0x21, 0x81, 0xde, 0x57, 0x9d, 0x0a, 0x3c, 0xcc, 0xec, 0x3b, 0xb2,
+      0x25, 0x96, 0x3c, 0xea, 0xfd, 0x46, 0x26, 0xbe, 0x1c, 0x79, 0x82, 0x1d,
+      0xe0, 0x14, 0x22, 0x7c, 0x80, 0x3d, 0xbd, 0x05, 0x90, 0xfa, 0xaf, 0x7d,
+      0x70, 0x13, 0x43, 0x0f, 0x3d, 0xa0, 0x7f, 0x92, 0x3a, 0x53, 0x69, 0xe4,
+      0xb0, 0x10, 0x0d, 0xa7, 0x73, 0xa8, 0x8c, 0x74, 0xab, 0xd7, 0x78, 0x15,
+      0x45, 0xec, 0x6e, 0xc8, 0x8b, 0xa0, 0xba, 0x21, 0x6f, 0xf3, 0x08, 0xb8,
+      0xc7, 0x4f, 0x14, 0xf5, 0xcc, 0xfd, 0x39, 0xbc, 0x11, 0xf5, 0xb9, 0x11,
+      0xba, 0xf3, 0x11, 0x24, 0x74, 0x3e, 0x0c, 0x07, 0x4f, 0xac, 0x2a, 0xb2,
+      0xb1, 0x3c, 0x00, 0xfa, 0xbb, 0x8c, 0xd8, 0x7d, 0x17, 0x5b, 0x8d, 0x39,
+      0xc6, 0x23, 0x31, 0x32, 0x7d, 0x6e, 0x20, 0x38, 0xd0, 0xc3, 0x58, 0xe2,
+      0xb1, 0xfe, 0x53, 0x6b, 0xc7, 0x10, 0x13, 0x7e, 0xc6, 0x7c, 0x67, 0x59,
+      0x43, 0x70, 0x4a, 0x2d, 0x7f, 0x76, 0xde, 0xbd, 0x45, 0x43, 0x56, 0x60,
+      0xcd, 0xe9, 0x24, 0x7b, 0xb7, 0x41, 0xce, 0x56, 0xed, 0xd3, 0x74, 0x75,
+      0xcc, 0x9d, 0x48, 0x61, 0xc8, 0x19, 0x66, 0x08, 0xfb, 0x28, 0x60, 0x1f,
+      0x83, 0x11, 0xc0, 0x9b, 0xbd, 0x71, 0x53, 0x36, 0x01, 0x76, 0xa8, 0xc0,
+      0xdc, 0x1d, 0x18, 0x85, 0x19, 0x65, 0xce, 0xcf, 0x14, 0x2e, 0x6c, 0x32,
+      0x15, 0xbc, 0x2c, 0x5e, 0x8f, 0xfc, 0x3c, 0xf0, 0x2d, 0xf5, 0x5c, 0x04,
+      0xc9, 0x22, 0xf4, 0xc3, 0xb8, 0x57, 0x79, 0x52, 0x41, 0xfd, 0xff, 0xcd,
+      0x26, 0xa8, 0xc0, 0xd2, 0xe1, 0x71, 0xd6, 0xf1, 0xf4, 0x0c, 0xa8, 0xeb,
+      0x0c, 0x33, 0x40, 0x25, 0x73, 0xbb, 0x31, 0xda, 0x0c, 0xa6, 0xee, 0x0c,
+      0x41, 0x51, 0x94, 0x3c, 0x24, 0x27, 0x65, 0xe9, 0xb5, 0xc4, 0xe2, 0x88,
+      0xc0, 0x82, 0xd0, 0x72, 0xd9, 0x10, 0x4d, 0x7f, 0xc0, 0x88, 0x94, 0x41,
+      0x2d, 0x05, 0x09, 0xfb, 0x97, 0x31, 0x6e, 0xc1, 0xe9, 0xf4, 0x50, 0x70,
+      0xdc, 0x3f, 0x0a, 0x90, 0x46, 0x37, 0x60, 0x8c, 0xfb, 0x06, 0x6e, 0xde,
+      0x6f, 0xa7, 0x6b, 0xa3, 0x88, 0x18, 0x96, 0x93, 0x19, 0x87, 0xe7, 0x0a,
+      0x98, 0xf0, 0x13, 0x01, 0xab, 0x7c, 0xeb, 0x25, 0xa5, 0xe2, 0x98, 0x44,
+      0x7d, 0x09, 0xe2, 0x42, 0x33, 0xd4, 0xeb, 0xcc, 0x9b, 0x70, 0xf6, 0x0f,
+      0xf0, 0xb2, 0x99, 0xcc, 0x4f, 0x64, 0xc4, 0x69, 0x12, 0xea, 0x56, 0xfe,
+      0x50, 0x0e, 0x02, 0x1f, 0x6d, 0x7a, 0x79, 0x62, 0xaa, 0x2e, 0x52, 0xaf,
+      0xa3, 0xed, 0xcd, 0xa7, 0x45, 0xe6, 0x86, 0xed, 0xa1, 0x73, 0x5b, 0x1e,
+      0x49, 0x4f, 0x92, 0x50, 0x83, 0x99, 0x3c, 0xf4, 0xf6, 0xa8, 0x49, 0xd7,
+      0x08, 0xf7, 0xdc, 0x28, 0x2c, 0xe6, 0x22, 0x6f, 0xf8, 0xfa, 0xba, 0x9e,
+      0x0a, 0xcf, 0x72, 0x74, 0x76, 0x75, 0x99, 0x4d, 0x3d, 0x9a, 0x4c, 0x54,
+      0xcd, 0xf8, 0x54, 0xf0, 0xbd, 0x73, 0xe9, 0x4f, 0x29, 0xd0, 0xe1, 0x24,
+      0x94, 0x52, 0xd6, 0x60, 0x80, 0x71, 0x24, 0x95, 0x92, 0x01, 0x0e, 0xa9,
+      0x7e, 0x64, 0x2e, 0xed, 0x51, 0xcc, 0xd2, 0xff, 0xfd, 0x0b, 0xf4, 0x1d,
+      0x25, 0x5d, 0x10, 0x87, 0x09, 0x55, 0x06, 0x95, 0xae, 0xb3, 0xef, 0xe9,
+      0xaa, 0x36, 0x15, 0x97, 0xe6, 0xf2, 0x24, 0xcf, 0x7d, 0xcd, 0x55, 0x11,
+      0xba, 0x20, 0xd0, 0xd7, 0xdc, 0xa6,
+  };
+  EXPECT_EQ(Bytes(ciphertext), Bytes(kExpectedCiphertext));
+
+  static const uint8_t kExpectedSharedKey[HRSS_KEY_BYTES] = {
+      0x04, 0x5a, 0x1a, 0xbc, 0x4c, 0x76, 0x47, 0x1f, 0xbf, 0xc9, 0x23,
+      0xec, 0xcb, 0x6e, 0x4d, 0x59, 0x8d, 0x3f, 0x90, 0x3e, 0x53, 0x73,
+      0x3c, 0x2c, 0x71, 0xcc, 0xac, 0xc5, 0xe0, 0xf2, 0xbc, 0xe8,
+  };
+  EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedSharedKey));
+
+  HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+  EXPECT_EQ(Bytes(shared_key, sizeof(shared_key)),
+            Bytes(kExpectedSharedKey, sizeof(kExpectedSharedKey)));
+
+  // Corrupt the ciphertext and ensure that the failure key is constant.
+  ciphertext[50] ^= 4;
+  HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+
+  static const uint8_t kExpectedFailureKey[HRSS_KEY_BYTES] = {
+      0x3a, 0xec, 0xc0, 0x38, 0x4f, 0xa7, 0x17, 0xb2, 0x77, 0x61, 0xb1,
+      0xf8, 0x12, 0x7f, 0xd9, 0x61, 0x67, 0x70, 0x63, 0xbe, 0xa2, 0x72,
+      0xfe, 0x1a, 0x82, 0x8d, 0x1d, 0x90, 0xe0, 0x36, 0x69, 0x2d,
+  };
+  EXPECT_EQ(Bytes(shared_key), Bytes(kExpectedFailureKey));
+}
diff --git a/crypto/hrss/internal.h b/crypto/hrss/internal.h
new file mode 100644
index 0000000..70218b8
--- /dev/null
+++ b/crypto/hrss/internal.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_HRSS_INTERNAL_H
+#define OPENSSL_HEADER_HRSS_INTERNAL_H
+
+#include <openssl/base.h>
+#include "../internal.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+#define N 701
+#define BITS_PER_WORD (sizeof(crypto_word_t) * 8)
+#define WORDS_PER_POLY ((N + BITS_PER_WORD - 1) / BITS_PER_WORD)
+#define BITS_IN_LAST_WORD (N % BITS_PER_WORD)
+
+struct poly2 {
+  crypto_word_t v[WORDS_PER_POLY];
+};
+
+struct poly3 {
+  struct poly2 s, a;
+};
+
+OPENSSL_EXPORT void HRSS_poly2_rotr_consttime(struct poly2 *p, size_t bits);
+OPENSSL_EXPORT void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x,
+                                   const struct poly3 *y);
+OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out,
+                                      const struct poly3 *in);
+
+
+#if defined(__cplusplus)
+}  // extern "C"
+#endif
+
+#endif  // !OPENSSL_HEADER_HRSS_INTERNAL_H
diff --git a/crypto/obj/obj_dat.h b/crypto/obj/obj_dat.h
index 0f5a3fa..0313a08 100644
--- a/crypto/obj/obj_dat.h
+++ b/crypto/obj/obj_dat.h
@@ -57,7 +57,7 @@
 /* This file is generated by crypto/obj/objects.go. */
 
 
-#define NUM_NID 959
+#define NUM_NID 960
 
 static const uint8_t kObjectData[] = {
     /* NID_rsadsi */
@@ -8755,6 +8755,7 @@
     {"AuthPSK", "auth-psk", NID_auth_psk, 0, NULL, 0},
     {"KxANY", "kx-any", NID_kx_any, 0, NULL, 0},
     {"AuthANY", "auth-any", NID_auth_any, 0, NULL, 0},
+    {"CECPQ2", "CECPQ2", NID_CECPQ2, 0, NULL, 0},
 };
 
 static const unsigned kNIDsInShortNameOrder[] = {
@@ -8816,6 +8817,7 @@
     110 /* CAST5-CFB */,
     109 /* CAST5-ECB */,
     111 /* CAST5-OFB */,
+    959 /* CECPQ2 */,
     894 /* CMAC */,
     13 /* CN */,
     141 /* CRLReason */,
@@ -9720,6 +9722,7 @@
     285 /* Biometric Info */,
     179 /* CA Issuers */,
     785 /* CA Repository */,
+    959 /* CECPQ2 */,
     131 /* Code Signing */,
     783 /* Diffie-Hellman based MAC */,
     382 /* Directory */,
diff --git a/crypto/obj/obj_mac.num b/crypto/obj/obj_mac.num
index 6dbc0f1..5fa839d 100644
--- a/crypto/obj/obj_mac.num
+++ b/crypto/obj/obj_mac.num
@@ -947,3 +947,4 @@
 auth_psk		956
 kx_any		957
 auth_any		958
+CECPQ2		959
diff --git a/crypto/obj/objects.txt b/crypto/obj/objects.txt
index 0c48e3c..6dbb7ad 100644
--- a/crypto/obj/objects.txt
+++ b/crypto/obj/objects.txt
@@ -559,7 +559,7 @@
 id-cmc 21		: id-cmc-queryPending
 id-cmc 22		: id-cmc-popLinkRandom
 id-cmc 23		: id-cmc-popLinkWitness
-id-cmc 24		: id-cmc-confirmCertAcceptance 
+id-cmc 24		: id-cmc-confirmCertAcceptance
 
 # other names
 id-on 1			: id-on-personalData
@@ -1239,7 +1239,7 @@
 # Definitions for Camellia cipher - ECB, CFB, OFB MODE
 
 !Alias ntt-ds 0 3 4401 5
-!Alias camellia ntt-ds 3 1 9 
+!Alias camellia ntt-ds 3 1 9
 
 camellia 1		: CAMELLIA-128-ECB		: camellia-128-ecb
 !Cname camellia-128-ofb128
@@ -1310,7 +1310,7 @@
 1 3 36 3 3 2 8 1 1 11 : brainpoolP384r1
 1 3 36 3 3 2 8 1 1 12 : brainpoolP384t1
 1 3 36 3 3 2 8 1 1 13 : brainpoolP512r1
-1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1            
+1 3 36 3 3 2 8 1 1 14 : brainpoolP512t1
 
 # ECDH schemes from RFC5753
 !Alias x9-63-scheme 1 3 133 16 840 63 0
@@ -1334,6 +1334,9 @@
 # NID for X25519 (no corresponding OID).
  : X25519
 
+# NID for CECPQ2 (no corresponding OID).
+ : CECPQ2
+
 # See RFC 8410.
 1 3 101 112 : ED25519
 
diff --git a/include/openssl/hrss.h b/include/openssl/hrss.h
new file mode 100644
index 0000000..4e1c73f
--- /dev/null
+++ b/include/openssl/hrss.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2018, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#ifndef OPENSSL_HEADER_HRSS_H
+#define OPENSSL_HEADER_HRSS_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+// HRSS
+//
+// HRSS is a structured-lattice-based post-quantum key encapsulation mechanism.
+// The best exposition is https://eprint.iacr.org/2017/667.pdf although this
+// implementation uses a different KEM construction based on
+// https://eprint.iacr.org/2017/1005.pdf.
+
+struct HRSS_private_key {
+  uint8_t opaque[1808];
+};
+
+struct HRSS_public_key {
+  uint8_t opaque[1424];
+};
+
+// HRSS_SAMPLE_BYTES is the number of bytes of entropy needed to generate a
+// short vector. There are 701 coefficients, but the final one is always set to
+// zero when sampling, and one byte of input is enough to generate two of the
+// remaining coefficients.
+#define HRSS_SAMPLE_BYTES ((701 - 1) / 2)
+// HRSS_GENERATE_KEY_BYTES is the number of bytes of entropy needed to generate
+// an HRSS key pair.
+#define HRSS_GENERATE_KEY_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32)
+// HRSS_ENCAP_BYTES is the number of bytes of entropy needed to encapsulate a
+// session key.
+#define HRSS_ENCAP_BYTES (HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES)
+// HRSS_PUBLIC_KEY_BYTES is the number of bytes in a public key.
+#define HRSS_PUBLIC_KEY_BYTES 1138
+// HRSS_CIPHERTEXT_BYTES is the number of bytes in a ciphertext.
+#define HRSS_CIPHERTEXT_BYTES (1138 + 32)
+// HRSS_KEY_BYTES is the number of bytes in a shared key.
+#define HRSS_KEY_BYTES 32
+// HRSS_POLY3_BYTES is the number of bytes needed to serialise a mod 3
+// polynomial.
+#define HRSS_POLY3_BYTES 140
+#define HRSS_PRIVATE_KEY_BYTES \
+  (HRSS_POLY3_BYTES * 2 + HRSS_PUBLIC_KEY_BYTES + 2 + 32)
+
+// HRSS_generate_key is a deterministic function that outputs a public and
+// private key based on the given entropy.
+OPENSSL_EXPORT void HRSS_generate_key(
+    struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv,
+    const uint8_t input[HRSS_GENERATE_KEY_BYTES]);
+
+// HRSS_encap is a deterministic function that generates and encrypts a random
+// session key from the given entropy, writing those values to |out_shared_key|
+// and |out_ciphertext|, respectively.
+OPENSSL_EXPORT void HRSS_encap(uint8_t out_ciphertext[HRSS_CIPHERTEXT_BYTES],
+                               uint8_t out_shared_key[HRSS_KEY_BYTES],
+                               const struct HRSS_public_key *in_pub,
+                               const uint8_t in[HRSS_ENCAP_BYTES]);
+
+// HRSS_decap decrypts a session key from |ciphertext_len| bytes of
+// |ciphertext|. If the ciphertext is valid, the decrypted key is written to
+// |out_shared_key|. Otherwise the HMAC of |ciphertext| under a secret key (kept
+// in |in_priv|) is written. If the ciphertext is the wrong length then which of
+// these two actions was taken will leak via side-channels. Otherwise it should
+// perform either action in constant time.
+OPENSSL_EXPORT void HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES],
+                               const struct HRSS_public_key *in_pub,
+                               const struct HRSS_private_key *in_priv,
+                               const uint8_t *ciphertext,
+                               size_t ciphertext_len);
+
+// HRSS_marshal_public_key serialises |in_pub| to |out|.
+OPENSSL_EXPORT void HRSS_marshal_public_key(
+    uint8_t out[HRSS_PUBLIC_KEY_BYTES], const struct HRSS_public_key *in_pub);
+
+// HRSS_parse_public_key sets |*out| to the public key encoded in |in|. It
+// returns one on success and zero on error.
+OPENSSL_EXPORT int HRSS_parse_public_key(
+    struct HRSS_public_key *out, const uint8_t in[HRSS_PUBLIC_KEY_BYTES]);
+
+
+#if defined(__cplusplus)
+}  // extern C
+#endif
+
+#endif  // OPENSSL_HEADER_HRSS_H
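As a usage sketch of the KEM flow described by this header, the Go reference implementation added later in this change (ssl/test/runner/hrss) exposes the same operations. The program below is illustrative only; the main wrapper is not part of this change:

	package main

	import (
		"bytes"
		"crypto/rand"
		"fmt"

		"boringssl.googlesource.com/boringssl/ssl/test/runner/hrss"
	)

	func main() {
		// Generate a key pair from the process's RNG.
		priv := hrss.GenerateKey(rand.Reader)

		// Serialise and re-parse the public key, as a TLS peer would.
		pub, ok := hrss.ParsePublicKey(priv.PublicKey.Marshal())
		if !ok {
			panic("bad public key")
		}

		// Encapsulate to the public key, then decapsulate with the private key.
		ciphertext, sharedKeyA := pub.Encap(rand.Reader)
		sharedKeyB, ok := priv.Decap(ciphertext)
		if !ok || !bytes.Equal(sharedKeyA, sharedKeyB) {
			panic("KEM round trip failed")
		}
		fmt.Printf("%d-byte ciphertext, %d-byte shared key\n", len(ciphertext), len(sharedKeyB))
	}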
diff --git a/include/openssl/nid.h b/include/openssl/nid.h
index afeb2de..270d443 100644
--- a/include/openssl/nid.h
+++ b/include/openssl/nid.h
@@ -4234,6 +4234,9 @@
 #define LN_auth_any "auth-any"
 #define NID_auth_any 958
 
+#define SN_CECPQ2 "CECPQ2"
+#define NID_CECPQ2 959
+
 
 #if defined(__cplusplus)
 } /* extern C */
diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h
index 17c5592..2f8163a 100644
--- a/include/openssl/ssl.h
+++ b/include/openssl/ssl.h
@@ -2177,6 +2177,7 @@
 #define SSL_CURVE_SECP384R1 24
 #define SSL_CURVE_SECP521R1 25
 #define SSL_CURVE_X25519 29
+#define SSL_CURVE_CECPQ2 16696
 
 // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently
 // completed handshake or 0 if not applicable.
diff --git a/ssl/handoff.cc b/ssl/handoff.cc
index 4cca981..f9dbd13 100644
--- a/ssl/handoff.cc
+++ b/ssl/handoff.cc
@@ -307,7 +307,7 @@
     return false;
   }
   if (type == handback_after_ecdhe &&
-      !s3->hs->key_share->Serialize(&key_share)) {
+      !s3->hs->key_shares[0]->Serialize(&key_share)) {
     return false;
   }
   return CBB_flush(out);
@@ -471,7 +471,7 @@
     return false;
   }
   if (type == handback_after_ecdhe &&
-      (s3->hs->key_share = SSLKeyShare::Create(&key_share)) == nullptr) {
+      (s3->hs->key_shares[0] = SSLKeyShare::Create(&key_share)) == nullptr) {
     return false;
   }
 
diff --git a/ssl/handshake_client.cc b/ssl/handshake_client.cc
index c1d54bd..0274dc2 100644
--- a/ssl/handshake_client.cc
+++ b/ssl/handshake_client.cc
@@ -590,7 +590,8 @@
   }
 
   // Clear some TLS 1.3 state that no longer needs to be retained.
-  hs->key_share.reset();
+  hs->key_shares[0].reset();
+  hs->key_shares[1].reset();
   hs->key_share_bytes.Reset();
 
   // A TLS 1.2 server would not know to skip the early data we offered. Report
@@ -1006,8 +1007,8 @@
     }
 
     // Initialize ECDH and save the peer public key for later.
-    hs->key_share = SSLKeyShare::Create(group_id);
-    if (!hs->key_share ||
+    hs->key_shares[0] = SSLKeyShare::Create(group_id);
+    if (!hs->key_shares[0] ||
         !hs->peer_key.CopyFrom(point)) {
       return ssl_hs_error;
     }
@@ -1324,7 +1325,7 @@
 
     // Compute the premaster.
     uint8_t alert = SSL_AD_DECODE_ERROR;
-    if (!hs->key_share->Accept(&child, &pms, &alert, hs->peer_key)) {
+    if (!hs->key_shares[0]->Accept(&child, &pms, &alert, hs->peer_key)) {
       ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
       return ssl_hs_error;
     }
@@ -1333,7 +1334,8 @@
     }
 
     // The key exchange state may now be discarded.
-    hs->key_share.reset();
+    hs->key_shares[0].reset();
+    hs->key_shares[1].reset();
     hs->peer_key.Reset();
   } else if (alg_k & SSL_kPSK) {
     // For plain PSK, other_secret is a block of 0s with the same length as
diff --git a/ssl/handshake_server.cc b/ssl/handshake_server.cc
index c4f3b75..8b3b942 100644
--- a/ssl/handshake_server.cc
+++ b/ssl/handshake_server.cc
@@ -932,12 +932,12 @@
       hs->new_session->group_id = group_id;
 
       // Set up ECDH, generate a key, and emit the public half.
-      hs->key_share = SSLKeyShare::Create(group_id);
-      if (!hs->key_share ||
+      hs->key_shares[0] = SSLKeyShare::Create(group_id);
+      if (!hs->key_shares[0] ||
           !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) ||
           !CBB_add_u16(cbb.get(), group_id) ||
           !CBB_add_u8_length_prefixed(cbb.get(), &child) ||
-          !hs->key_share->Offer(&child)) {
+          !hs->key_shares[0]->Offer(&child)) {
         return ssl_hs_error;
       }
     } else {
@@ -1275,13 +1275,14 @@
 
     // Compute the premaster.
     uint8_t alert = SSL_AD_DECODE_ERROR;
-    if (!hs->key_share->Finish(&premaster_secret, &alert, peer_key)) {
+    if (!hs->key_shares[0]->Finish(&premaster_secret, &alert, peer_key)) {
       ssl_send_alert(ssl, SSL3_AL_FATAL, alert);
       return ssl_hs_error;
     }
 
     // The key exchange state may now be discarded.
-    hs->key_share.reset();
+    hs->key_shares[0].reset();
+    hs->key_shares[1].reset();
   } else if (!(alg_k & SSL_kPSK)) {
     OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR);
     ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_HANDSHAKE_FAILURE);
diff --git a/ssl/internal.h b/ssl/internal.h
index f8a2ea7..bbce7ec 100644
--- a/ssl/internal.h
+++ b/ssl/internal.h
@@ -974,10 +974,10 @@
   // |out_public_key|. It returns true on success and false on error.
   virtual bool Offer(CBB *out_public_key) PURE_VIRTUAL;
 
-  // Accept performs a key exchange against the |peer_key| generated by |offer|.
+  // Accept performs a key exchange against the |peer_key| generated by |Offer|.
   // On success, it returns true, writes the public value to |out_public_key|,
-  // and sets |*out_secret| the shared secret. On failure, it returns false and
-  // sets |*out_alert| to an alert to send to the peer.
+  // and sets |*out_secret| to the shared secret. On failure, it returns false
+  // and sets |*out_alert| to an alert to send to the peer.
   //
   // The default implementation calls |Offer| and then |Finish|, assuming a key
   // exchange protocol where the peers are symmetric.
@@ -986,7 +986,7 @@
 
   // Finish performs a key exchange against the |peer_key| generated by
   // |Accept|. On success, it returns true and sets |*out_secret| to the shared
-  // secret. On failure, it returns zero and sets |*out_alert| to an alert to
+  // secret. On failure, it returns false and sets |*out_alert| to an alert to
   // send to the peer.
   virtual bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert,
                       Span<const uint8_t> peer_key) PURE_VIRTUAL;
@@ -1436,8 +1436,10 @@
   // error, if |wait| is |ssl_hs_error|, is the error the handshake failed on.
   UniquePtr<ERR_SAVE_STATE> error;
 
-  // key_share is the current key exchange instance.
-  UniquePtr<SSLKeyShare> key_share;
+  // key_shares are the current key exchange instances. The second is only used
+  // by a client that believes it should offer two key shares in a ClientHello.
+  UniquePtr<SSLKeyShare> key_shares[2];
 
   // transcript is the current handshake transcript.
   SSLTranscript transcript;
diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc
index 55c7463..108ea6a 100644
--- a/ssl/ssl_key_share.cc
+++ b/ssl/ssl_key_share.cc
@@ -24,8 +24,10 @@
 #include <openssl/curve25519.h>
 #include <openssl/ec.h>
 #include <openssl/err.h>
+#include <openssl/hrss.h>
 #include <openssl/mem.h>
 #include <openssl/nid.h>
+#include <openssl/rand.h>
 
 #include "internal.h"
 #include "../crypto/internal.h"
@@ -207,12 +209,104 @@
   uint8_t private_key_[32];
 };
 
+class CECPQ2KeyShare : public SSLKeyShare {
+ public:
+  CECPQ2KeyShare() {}
+
+  uint16_t GroupID() const override { return SSL_CURVE_CECPQ2; }
+
+  bool Offer(CBB *out) override {
+    uint8_t x25519_public_key[32];
+    X25519_keypair(x25519_public_key, x25519_private_key_);
+
+    uint8_t hrss_entropy[HRSS_GENERATE_KEY_BYTES];
+    RAND_bytes(hrss_entropy, sizeof(hrss_entropy));
+    HRSS_generate_key(&hrss_public_key_, &hrss_private_key_, hrss_entropy);
+
+    uint8_t hrss_public_key_bytes[HRSS_PUBLIC_KEY_BYTES];
+    HRSS_marshal_public_key(hrss_public_key_bytes, &hrss_public_key_);
+
+    if (!CBB_add_bytes(out, x25519_public_key, sizeof(x25519_public_key)) ||
+        !CBB_add_bytes(out, hrss_public_key_bytes,
+                       sizeof(hrss_public_key_bytes))) {
+      return false;
+    }
+
+    return true;
+  }
+
+  bool Accept(CBB *out_public_key, Array<uint8_t> *out_secret,
+              uint8_t *out_alert, Span<const uint8_t> peer_key) override {
+    Array<uint8_t> secret;
+    if (!secret.Init(32 + HRSS_KEY_BYTES)) {
+      OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+      return false;
+    }
+
+    uint8_t x25519_public_key[32];
+    X25519_keypair(x25519_public_key, x25519_private_key_);
+
+    HRSS_public_key peer_public_key;
+    if (peer_key.size() != 32 + HRSS_PUBLIC_KEY_BYTES ||
+        !HRSS_parse_public_key(&peer_public_key, peer_key.data() + 32) ||
+        !X25519(secret.data(), x25519_private_key_, peer_key.data())) {
+      *out_alert = SSL_AD_DECODE_ERROR;
+      OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
+      return false;
+    }
+
+    uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+    uint8_t entropy[HRSS_ENCAP_BYTES];
+    RAND_bytes(entropy, sizeof(entropy));
+    HRSS_encap(ciphertext, secret.data() + 32, &peer_public_key, entropy);
+
+    if (!CBB_add_bytes(out_public_key, x25519_public_key,
+                       sizeof(x25519_public_key)) ||
+        !CBB_add_bytes(out_public_key, ciphertext, sizeof(ciphertext))) {
+      return false;
+    }
+
+    *out_secret = std::move(secret);
+    return true;
+  }
+
+  bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert,
+              Span<const uint8_t> peer_key) override {
+    *out_alert = SSL_AD_INTERNAL_ERROR;
+
+    Array<uint8_t> secret;
+    if (!secret.Init(32 + HRSS_KEY_BYTES)) {
+      OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+      return false;
+    }
+
+    if (peer_key.size() != 32 + HRSS_CIPHERTEXT_BYTES ||
+        !X25519(secret.data(), x25519_private_key_, peer_key.data())) {
+      *out_alert = SSL_AD_DECODE_ERROR;
+      OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
+      return false;
+    }
+
+    HRSS_decap(secret.data() + 32, &hrss_public_key_, &hrss_private_key_,
+               peer_key.data() + 32, peer_key.size() - 32);
+
+    *out_secret = std::move(secret);
+    return true;
+  }
+
+ private:
+  uint8_t x25519_private_key_[32];
+  HRSS_public_key hrss_public_key_;
+  HRSS_private_key hrss_private_key_;
+};
+
 CONSTEXPR_ARRAY NamedGroup kNamedGroups[] = {
     {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"},
     {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"},
     {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"},
     {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"},
     {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"},
+    {NID_CECPQ2, SSL_CURVE_CECPQ2, "CECPQ2", "CECPQ2"},
 };
 
 }  // namespace
@@ -237,6 +331,8 @@
           New<ECKeyShare>(NID_secp521r1, SSL_CURVE_SECP521R1));
     case SSL_CURVE_X25519:
       return UniquePtr<SSLKeyShare>(New<X25519KeyShare>());
+    case SSL_CURVE_CECPQ2:
+      return UniquePtr<SSLKeyShare>(New<CECPQ2KeyShare>());
     default:
       return nullptr;
   }
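For reference, the sizes implied by this key share and the constants in hrss.h: the CECPQ2 ClientHello key share is 32 + 1138 = 1170 bytes (the X25519 public value followed by the marshalled HRSS public key), the server's response is 32 + (1138 + 32) = 1202 bytes (the X25519 public value followed by the HRSS ciphertext), and the resulting shared secret is 32 + 32 = 64 bytes (the X25519 shared secret followed by the HRSS shared key).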
diff --git a/ssl/ssl_test.cc b/ssl/ssl_test.cc
index 470379c..705528b 100644
--- a/ssl/ssl_test.cc
+++ b/ssl/ssl_test.cc
@@ -395,6 +395,11 @@
     { SSL_CURVE_SECP256R1 },
   },
   {
+    "P-256:CECPQ2",
+    { SSL_CURVE_SECP256R1, SSL_CURVE_CECPQ2 },
+  },
+
+  {
     "P-256:P-384:P-521:X25519",
     {
       SSL_CURVE_SECP256R1,
diff --git a/ssl/t1_lib.cc b/ssl/t1_lib.cc
index 00c796a..5e65f81 100644
--- a/ssl/t1_lib.cc
+++ b/ssl/t1_lib.cc
@@ -292,10 +292,23 @@
     SSL_CURVE_SECP384R1,
 };
 
+// TLS 1.3 servers will pick CECPQ2 if a client offers it, but it is not enabled
+// by default for clients. (It is also still commented out in the list below.)
+static const uint16_t kDefaultGroupsServer[] = {
+    // CECPQ2 is not yet enabled by default.
+    // SSL_CURVE_CECPQ2,
+    SSL_CURVE_X25519,
+    SSL_CURVE_SECP256R1,
+    SSL_CURVE_SECP384R1,
+};
+
 Span<const uint16_t> tls1_get_grouplist(const SSL_HANDSHAKE *hs) {
   if (!hs->config->supported_group_list.empty()) {
     return hs->config->supported_group_list;
   }
+  if (hs->ssl->server) {
+    return Span<const uint16_t>(kDefaultGroupsServer);
+  }
   return Span<const uint16_t>(kDefaultGroups);
 }
 
@@ -324,7 +337,11 @@
 
   for (uint16_t pref_group : pref) {
     for (uint16_t supp_group : supp) {
-      if (pref_group == supp_group) {
+      if (pref_group == supp_group &&
+          // CECPQ2 doesn't fit in the u8-length-prefixed ECPoint field in TLS
+          // 1.2 and below.
+          (ssl_protocol_version(ssl) >= TLS1_3_VERSION ||
+           pref_group != SSL_CURVE_CECPQ2)) {
         *out_group_id = pref_group;
         return true;
       }
@@ -386,6 +403,12 @@
 }
 
 bool tls1_check_group_id(const SSL_HANDSHAKE *hs, uint16_t group_id) {
+  if (group_id == SSL_CURVE_CECPQ2 &&
+      ssl_protocol_version(hs->ssl) < TLS1_3_VERSION) {
+    // CECPQ2 requires TLS 1.3.
+    return false;
+  }
+
   for (uint16_t supported : tls1_get_grouplist(hs)) {
     if (supported == group_id) {
       return true;
@@ -2144,6 +2167,7 @@
   }
 
   uint16_t group_id = hs->retry_group;
+  uint16_t second_group_id = 0;
   if (hs->received_hello_retry_request) {
     // We received a HelloRetryRequest without a new curve, so there is no new
     // share to append. Leave |hs->key_share| as-is.
@@ -2174,19 +2198,38 @@
     }
 
     group_id = groups[0];
+
+    if (group_id == SSL_CURVE_CECPQ2 && groups.size() >= 2) {
+      // CECPQ2 isn't sent as the only initial key share, so we also include
+      // the second preference group to avoid a round trip.
+      second_group_id = groups[1];
+      assert(second_group_id != group_id);
+    }
   }
 
-  hs->key_share = SSLKeyShare::Create(group_id);
   CBB key_exchange;
-  if (!hs->key_share ||
+  hs->key_shares[0] = SSLKeyShare::Create(group_id);
+  if (!hs->key_shares[0] ||
       !CBB_add_u16(&kse_bytes, group_id) ||
       !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) ||
-      !hs->key_share->Offer(&key_exchange) ||
+      !hs->key_shares[0]->Offer(&key_exchange) ||
       !CBB_flush(&kse_bytes)) {
     return false;
   }
 
-  // Save the contents of the extension to repeat it in the second ClientHello.
+  if (second_group_id != 0) {
+    hs->key_shares[1] = SSLKeyShare::Create(second_group_id);
+    if (!hs->key_shares[1] ||
+        !CBB_add_u16(&kse_bytes, second_group_id) ||
+        !CBB_add_u16_length_prefixed(&kse_bytes, &key_exchange) ||
+        !hs->key_shares[1]->Offer(&key_exchange) ||
+        !CBB_flush(&kse_bytes)) {
+      return false;
+    }
+  }
+
+  // Save the contents of the extension to repeat it in the second
+  // ClientHello.
   if (!hs->received_hello_retry_request &&
       !hs->key_share_bytes.CopyFrom(
           MakeConstSpan(CBB_data(&kse_bytes), CBB_len(&kse_bytes)))) {
@@ -2209,19 +2252,24 @@
     return false;
   }
 
-  if (hs->key_share->GroupID() != group_id) {
-    *out_alert = SSL_AD_ILLEGAL_PARAMETER;
-    OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
-    return false;
+  SSLKeyShare *key_share = hs->key_shares[0].get();
+  if (key_share->GroupID() != group_id) {
+    if (!hs->key_shares[1] || hs->key_shares[1]->GroupID() != group_id) {
+      *out_alert = SSL_AD_ILLEGAL_PARAMETER;
+      OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
+      return false;
+    }
+    key_share = hs->key_shares[1].get();
   }
 
-  if (!hs->key_share->Finish(out_secret, out_alert, peer_key)) {
+  if (!key_share->Finish(out_secret, out_alert, peer_key)) {
     *out_alert = SSL_AD_INTERNAL_ERROR;
     return false;
   }
 
   hs->new_session->group_id = group_id;
-  hs->key_share.reset();
+  hs->key_shares[0].reset();
+  hs->key_shares[1].reset();
   return true;
 }
 
@@ -2389,6 +2437,10 @@
   }
 
   for (uint16_t group : tls1_get_grouplist(hs)) {
+    if (group == SSL_CURVE_CECPQ2 &&
+        hs->max_version < TLS1_3_VERSION) {
+      continue;
+    }
     if (!CBB_add_u16(&groups_bytes, group)) {
       return false;
     }
diff --git a/ssl/test/bssl_shim.cc b/ssl/test/bssl_shim.cc
index 675a08a..3632fc5 100644
--- a/ssl/test/bssl_shim.cc
+++ b/ssl/test/bssl_shim.cc
@@ -649,7 +649,6 @@
     SSL_set_connect_state(ssl.get());
   }
 
-
   int sock = Connect(config->port);
   if (sock == -1) {
     return false;
diff --git a/ssl/test/runner/cipher_suites.go b/ssl/test/runner/cipher_suites.go
index f4c5900..3246f0b 100644
--- a/ssl/test/runner/cipher_suites.go
+++ b/ssl/test/runner/cipher_suites.go
@@ -26,7 +26,7 @@
 	// In the case that the key agreement protocol doesn't use a
 	// ServerKeyExchange message, generateServerKeyExchange can return nil,
 	// nil.
-	generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg) (*serverKeyExchangeMsg, error)
+	generateServerKeyExchange(*Config, *Certificate, *clientHelloMsg, *serverHelloMsg, uint16) (*serverKeyExchangeMsg, error)
 	processClientKeyExchange(*Config, *Certificate, *clientKeyExchangeMsg, uint16) ([]byte, error)
 
 	// On the client side, the next two methods are called in order.
diff --git a/ssl/test/runner/common.go b/ssl/test/runner/common.go
index 73b8889..d99518c 100644
--- a/ssl/test/runner/common.go
+++ b/ssl/test/runner/common.go
@@ -163,6 +163,7 @@
 	CurveP384   CurveID = 24
 	CurveP521   CurveID = 25
 	CurveX25519 CurveID = 29
+	CurveCECPQ2 CurveID = 16696
 )
 
 // TLS Elliptic Curve Point Formats
@@ -1645,6 +1646,18 @@
 	// ExpectJDK11DowngradeRandom is whether the client should expect the
 	// server to send the JDK 11 downgrade signal.
 	ExpectJDK11DowngradeRandom bool
+
+	// FailIfHelloRetryRequested causes a handshake failure if the server sends
+	// a HelloRetryRequest.
+	FailIfHelloRetryRequested bool
+
+	// FailIfCECPQ2Offered will cause a server to reject a ClientHello if CECPQ2
+	// is among the offered curves.
+	FailIfCECPQ2Offered bool
+
+	// ExpectedKeyShares, if not nil, lists (in order) the curves that a
+	// ClientHello should have key shares for.
+	ExpectedKeyShares []CurveID
 }
 
 func (c *Config) serverInit() {
@@ -1724,7 +1737,7 @@
 	return ret
 }
 
-var defaultCurvePreferences = []CurveID{CurveX25519, CurveP256, CurveP384, CurveP521}
+var defaultCurvePreferences = []CurveID{CurveCECPQ2, CurveX25519, CurveP256, CurveP384, CurveP521}
 
 func (c *Config) curvePreferences() []CurveID {
 	if c == nil || len(c.CurvePreferences) == 0 {
diff --git a/ssl/test/runner/handshake_client.go b/ssl/test/runner/handshake_client.go
index ab1f4dd..5234462 100644
--- a/ssl/test/runner/handshake_client.go
+++ b/ssl/test/runner/handshake_client.go
@@ -549,6 +549,9 @@
 	helloRetryRequest, haveHelloRetryRequest := msg.(*helloRetryRequestMsg)
 	var secondHelloBytes []byte
 	if haveHelloRetryRequest {
+		if c.config.Bugs.FailIfHelloRetryRequested {
+			return errors.New("tls: unexpected HelloRetryRequest")
+		}
 		// Explicitly read the ChangeCipherSpec now; it should
 		// be attached to the first flight, not the second flight.
 		if err := c.readTLS13ChangeCipherSpec(); err != nil {
diff --git a/ssl/test/runner/handshake_server.go b/ssl/test/runner/handshake_server.go
index 6a75242..5486342 100644
--- a/ssl/test/runner/handshake_server.go
+++ b/ssl/test/runner/handshake_server.go
@@ -208,6 +208,26 @@
 		}
 	}
 
+	if config.Bugs.FailIfCECPQ2Offered {
+		for _, offeredCurve := range hs.clientHello.supportedCurves {
+			if offeredCurve == CurveCECPQ2 {
+				return errors.New("tls: CECPQ2 was offered")
+			}
+		}
+	}
+
+	if expected := config.Bugs.ExpectedKeyShares; expected != nil {
+		if len(expected) != len(hs.clientHello.keyShares) {
+			return fmt.Errorf("tls: expected %d key shares, but found %d", len(expected), len(hs.clientHello.keyShares))
+		}
+
+		for i, group := range expected {
+			if found := hs.clientHello.keyShares[i].group; found != group {
+				return fmt.Errorf("tls: key share #%d is for group %d, not %d", i, found, group)
+			}
+		}
+	}
+
 	c.clientVersion = hs.clientHello.vers
 
 	// Use the versions extension if supplied, otherwise use the legacy ClientHello version.
@@ -1212,6 +1232,11 @@
 	preferredCurves := config.curvePreferences()
 Curves:
 	for _, curve := range hs.clientHello.supportedCurves {
+		if curve == CurveCECPQ2 && c.vers < VersionTLS13 {
+			// CECPQ2 is TLS 1.3-only.
+			continue
+		}
+
 		for _, supported := range preferredCurves {
 			if supported == curve {
 				supportedCurve = true
@@ -1621,7 +1646,7 @@
 	}
 
 	keyAgreement := hs.suite.ka(c.vers)
-	skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello)
+	skx, err := keyAgreement.generateServerKeyExchange(config, hs.cert, hs.clientHello, hs.hello, c.vers)
 	if err != nil {
 		c.sendAlert(alertHandshakeFailure)
 		return err
diff --git a/ssl/test/runner/hrss/hrss.go b/ssl/test/runner/hrss/hrss.go
new file mode 100644
index 0000000..ebda656
--- /dev/null
+++ b/ssl/test/runner/hrss/hrss.go
@@ -0,0 +1,1230 @@
+package hrss
+
+import (
+	"crypto/hmac"
+	"crypto/sha256"
+	"crypto/subtle"
+	"encoding/binary"
+	"io"
+	"math/bits"
+)
+
+const (
+	PublicKeySize  = modQBytes
+	CiphertextSize = modQBytes + 32
+)
+
+const (
+	N         = 701
+	Q         = 8192
+	mod3Bytes = 140
+	modQBytes = 1138
+)
+
+const (
+	bitsPerWord      = bits.UintSize
+	wordsPerPoly     = (N + bitsPerWord - 1) / bitsPerWord
+	fullWordsPerPoly = N / bitsPerWord
+	bitsInLastWord   = N % bitsPerWord
+)
+
+// poly3 represents a degree-N polynomial over GF(3). Each coefficient is
+// bitsliced across the |s| and |a| arrays, like this:
+//
+//   s  |  a  | value
+//  -----------------
+//   0  |  0  | 0
+//   0  |  1  | 1
+//   1  |  0  | 2 (aka -1)
+//   1  |  1  | <invalid>
+//
+// ('s' is for sign, and 'a' is just a letter.)
+//
+// Once bitsliced as such, the following circuits can be used to implement
+// addition and multiplication mod 3:
+//
+//   (s3, a3) = (s1, a1) × (s2, a2)
+//   s3 = (s2 ∧ a1) ⊕ (s1 ∧ a2)
+//   a3 = (s1 ∧ s2) ⊕ (a1 ∧ a2)
+//
+//   (s3, a3) = (s1, a1) + (s2, a2)
+//   t1 = ~(s1 ∨ a1)
+//   t2 = ~(s2 ∨ a2)
+//   s3 = (a1 ∧ a2) ⊕ (t1 ∧ s2) ⊕ (t2 ∧ s1)
+//   a3 = (s1 ∧ s2) ⊕ (t1 ∧ a2) ⊕ (t2 ∧ a1)
+//
+// Negating a value just involves swapping s and a.
+type poly3 struct {
+	s [wordsPerPoly]uint
+	a [wordsPerPoly]uint
+}
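As a single-coefficient (bit-width one) check of the multiplication circuit above, the helper below mirrors the bsMul function defined later in this file; the name mulMod3Bit is only for illustration:

	func mulMod3Bit(s1, a1, s2, a2 uint) (s3, a3 uint) {
		return (a1 & s2) ^ (s1 & a2), (a1 & a2) ^ (s1 & s2)
	}

	// mulMod3Bit(1, 0, 1, 0) == (0, 1): (-1) × (-1) == 1 (mod 3).
	// mulMod3Bit(0, 1, 1, 0) == (1, 0): 1 × (-1) == -1 (mod 3).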
+
+func (p *poly3) trim() {
+	p.s[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+	p.a[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+}
+
+func (p *poly3) zero() {
+	for i := range p.a {
+		p.s[i] = 0
+		p.a[i] = 0
+	}
+}
+
+func (p *poly3) fromDiscrete(in *poly) {
+	var shift uint
+	s := p.s[:]
+	a := p.a[:]
+	s[0] = 0
+	a[0] = 0
+
+	for _, v := range in {
+		s[0] >>= 1
+		s[0] |= uint((v>>1)&1) << (bitsPerWord - 1)
+		a[0] >>= 1
+		a[0] |= uint(v&1) << (bitsPerWord - 1)
+		shift++
+		if shift == bitsPerWord {
+			s = s[1:]
+			a = a[1:]
+			s[0] = 0
+			a[0] = 0
+			shift = 0
+		}
+	}
+
+	a[0] >>= bitsPerWord - shift
+	s[0] >>= bitsPerWord - shift
+}
+
+func (p *poly3) fromModQ(in *poly) int {
+	var shift uint
+	s := p.s[:]
+	a := p.a[:]
+	s[0] = 0
+	a[0] = 0
+	ok := 1
+
+	for _, v := range in {
+		vMod3, vOk := modQToMod3(v)
+		ok &= vOk
+
+		s[0] >>= 1
+		s[0] |= uint((vMod3>>1)&1) << (bitsPerWord - 1)
+		a[0] >>= 1
+		a[0] |= uint(vMod3&1) << (bitsPerWord - 1)
+		shift++
+		if shift == bitsPerWord {
+			s = s[1:]
+			a = a[1:]
+			s[0] = 0
+			a[0] = 0
+			shift = 0
+		}
+	}
+
+	a[0] >>= bitsPerWord - shift
+	s[0] >>= bitsPerWord - shift
+
+	return ok
+}
+
+func (p *poly3) fromDiscreteMod3(in *poly) {
+	var shift uint
+	s := p.s[:]
+	a := p.a[:]
+	s[0] = 0
+	a[0] = 0
+
+	for _, v := range in {
+		// This duplicates the 13th bit upwards to the top of the
+		// uint16, essentially treating it as a sign bit and converting
+		// into a signed int16. The signed value is reduced mod 3,
+		// yielding {-2, -1, 0, 1, 2}.
+		v = uint16((int16(v<<3)>>3)%3) & 7
+
+		// We want to map v thus:
+		// {-2, -1, 0, 1, 2} -> {1, 2, 0, 1, 2}. We take the bottom
+		// three bits and then the constants below, when shifted by
+		// those three bits, perform the required mapping.
+		s[0] >>= 1
+		s[0] |= (0xbc >> v) << (bitsPerWord - 1)
+		a[0] >>= 1
+		a[0] |= (0x7a >> v) << (bitsPerWord - 1)
+		shift++
+		if shift == bitsPerWord {
+			s = s[1:]
+			a = a[1:]
+			s[0] = 0
+			a[0] = 0
+			shift = 0
+		}
+	}
+
+	a[0] >>= bitsPerWord - shift
+	s[0] >>= bitsPerWord - shift
+}
+
+func (p *poly3) marshal(out []byte) {
+	s := p.s[:]
+	a := p.a[:]
+	sw := s[0]
+	aw := a[0]
+	var shift int
+
+	for i := 0; i < 700; i += 5 {
+		acc, scale := 0, 1
+		for j := 0; j < 5; j++ {
+			v := int(aw&1) | int(sw&1)<<1
+			acc += scale * v
+			scale *= 3
+
+			shift++
+			if shift == bitsPerWord {
+				s = s[1:]
+				a = a[1:]
+				sw = s[0]
+				aw = a[0]
+				shift = 0
+			} else {
+				sw >>= 1
+				aw >>= 1
+			}
+		}
+
+		out[0] = byte(acc)
+		out = out[1:]
+	}
+}
+
+func (p *poly) fromMod2(in *poly2) {
+	var shift uint
+	words := in[:]
+	word := words[0]
+
+	for i := range p {
+		p[i] = uint16(word & 1)
+		word >>= 1
+		shift++
+		if shift == bitsPerWord {
+			words = words[1:]
+			word = words[0]
+			shift = 0
+		}
+	}
+}
+
+func (p *poly) fromMod3(in *poly3) {
+	var shift uint
+	s := in.s[:]
+	a := in.a[:]
+	sw := s[0]
+	aw := a[0]
+
+	for i := range p {
+		p[i] = uint16(aw&1 | (sw&1)<<1)
+		aw >>= 1
+		sw >>= 1
+		shift++
+		if shift == bitsPerWord {
+			a = a[1:]
+			s = s[1:]
+			aw = a[0]
+			sw = s[0]
+			shift = 0
+		}
+	}
+}
+
+func (p *poly) fromMod3ToModQ(in *poly3) {
+	var shift uint
+	s := in.s[:]
+	a := in.a[:]
+	sw := s[0]
+	aw := a[0]
+
+	for i := range p {
+		p[i] = mod3ToModQ(uint16(aw&1 | (sw&1)<<1))
+		aw >>= 1
+		sw >>= 1
+		shift++
+		if shift == bitsPerWord {
+			a = a[1:]
+			s = s[1:]
+			aw = a[0]
+			sw = s[0]
+			shift = 0
+		}
+	}
+}
+
+func lsbToAll(v uint) uint {
+	return uint(int(v<<(bitsPerWord-1)) >> (bitsPerWord - 1))
+}
+
+func (p *poly3) mulConst(ms, ma uint) {
+	ms = lsbToAll(ms)
+	ma = lsbToAll(ma)
+
+	for i := range p.a {
+		p.s[i], p.a[i] = (ma&p.s[i])^(ms&p.a[i]), (ma&p.a[i])^(ms&p.s[i])
+	}
+}
+
+func cmovWords(out, in *[wordsPerPoly]uint, mov uint) {
+	for i := range out {
+		out[i] = (out[i] & ^mov) | (in[i] & mov)
+	}
+}
+
+func rotWords(out, in *[wordsPerPoly]uint, bits uint) {
+	start := bits / bitsPerWord
+	n := (N - bits) / bitsPerWord
+
+	for i := uint(0); i < n; i++ {
+		out[i] = in[start+i]
+	}
+
+	carry := in[wordsPerPoly-1]
+
+	for i := uint(0); i < start; i++ {
+		out[n+i] = carry | in[i]<<bitsInLastWord
+		carry = in[i] >> (bitsPerWord - bitsInLastWord)
+	}
+
+	out[wordsPerPoly-1] = carry
+}
+
+// rotBits right-rotates the bits in |in|. bits must be a non-zero power of two
+// that is at most bitsPerWord/2.
+func rotBits(out, in *[wordsPerPoly]uint, bits uint) {
+	if bits == 0 || bits&(bits-1) != 0 || bits > bitsPerWord/2 || bitsInLastWord < bitsPerWord/2 {
+		panic("internal error")
+	}
+
+	carry := in[wordsPerPoly-1] << (bitsPerWord - bits)
+
+	for i := wordsPerPoly - 2; i >= 0; i-- {
+		out[i] = carry | in[i]>>bits
+		carry = in[i] << (bitsPerWord - bits)
+	}
+
+	out[wordsPerPoly-1] = carry>>(bitsPerWord-bitsInLastWord) | in[wordsPerPoly-1]>>bits
+}
+
+func (p *poly3) rotWords(bits uint, in *poly3) {
+	rotWords(&p.s, &in.s, bits)
+	rotWords(&p.a, &in.a, bits)
+}
+
+func (p *poly3) rotBits(bits uint, in *poly3) {
+	rotBits(&p.s, &in.s, bits)
+	rotBits(&p.a, &in.a, bits)
+}
+
+func (p *poly3) cmov(in *poly3, mov uint) {
+	cmovWords(&p.s, &in.s, mov)
+	cmovWords(&p.a, &in.a, mov)
+}
+
+func (p *poly3) rot(bits uint) {
+	if bits > N {
+		panic("invalid")
+	}
+	var shifted poly3
+
+	shift := uint(9)
+	for ; (1 << shift) >= bitsPerWord; shift-- {
+		shifted.rotWords(1<<shift, p)
+		p.cmov(&shifted, lsbToAll(bits>>shift))
+	}
+	for ; shift < 9; shift-- {
+		shifted.rotBits(1<<shift, p)
+		p.cmov(&shifted, lsbToAll(bits>>shift))
+	}
+}
+
+func (p *poly3) fmadd(ms, ma uint, in *poly3) {
+	ms = lsbToAll(ms)
+	ma = lsbToAll(ma)
+
+	for i := range p.a {
+		products := (ma & in.s[i]) ^ (ms & in.a[i])
+		producta := (ma & in.a[i]) ^ (ms & in.s[i])
+
+		ns1Ana1 := ^p.s[i] & ^p.a[i]
+		ns2Ana2 := ^products & ^producta
+
+		p.s[i], p.a[i] = (p.a[i]&producta)^(ns1Ana1&products)^(p.s[i]&ns2Ana2), (p.s[i]&products)^(ns1Ana1&producta)^(p.a[i]&ns2Ana2)
+	}
+}
+
+func (p *poly3) modPhiN() {
+	factora := uint(int(p.s[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1))
+	factors := uint(int(p.a[wordsPerPoly-1]<<(bitsPerWord-bitsInLastWord)) >> (bitsPerWord - 1))
+	ns2Ana2 := ^factors & ^factora
+
+	for i := range p.s {
+		ns1Ana1 := ^p.s[i] & ^p.a[i]
+		p.s[i], p.a[i] = (p.a[i]&factora)^(ns1Ana1&factors)^(p.s[i]&ns2Ana2), (p.s[i]&factors)^(ns1Ana1&factora)^(p.a[i]&ns2Ana2)
+	}
+}
+
+func (p *poly3) cswap(other *poly3, swap uint) {
+	for i := range p.s {
+		sums := swap & (p.s[i] ^ other.s[i])
+		p.s[i] ^= sums
+		other.s[i] ^= sums
+
+		suma := swap & (p.a[i] ^ other.a[i])
+		p.a[i] ^= suma
+		other.a[i] ^= suma
+	}
+}
+
+func (p *poly3) mulx() {
+	carrys := (p.s[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1
+	carrya := (p.a[wordsPerPoly-1] >> (bitsInLastWord - 1)) & 1
+
+	for i := range p.s {
+		outCarrys := p.s[i] >> (bitsPerWord - 1)
+		outCarrya := p.a[i] >> (bitsPerWord - 1)
+		p.s[i] <<= 1
+		p.a[i] <<= 1
+		p.s[i] |= carrys
+		p.a[i] |= carrya
+		carrys = outCarrys
+		carrya = outCarrya
+	}
+}
+
+func (p *poly3) divx() {
+	var carrys, carrya uint
+
+	for i := len(p.s) - 1; i >= 0; i-- {
+		outCarrys := p.s[i] & 1
+		outCarrya := p.a[i] & 1
+		p.s[i] >>= 1
+		p.a[i] >>= 1
+		p.s[i] |= carrys << (bitsPerWord - 1)
+		p.a[i] |= carrya << (bitsPerWord - 1)
+		carrys = outCarrys
+		carrya = outCarrya
+	}
+}
+
+type poly2 [wordsPerPoly]uint
+
+func (p *poly2) fromDiscrete(in *poly) {
+	var shift uint
+	words := p[:]
+	words[0] = 0
+
+	for _, v := range in {
+		words[0] >>= 1
+		words[0] |= uint(v&1) << (bitsPerWord - 1)
+		shift++
+		if shift == bitsPerWord {
+			words = words[1:]
+			words[0] = 0
+			shift = 0
+		}
+	}
+
+	words[0] >>= bitsPerWord - shift
+}
+
+func (p *poly2) setPhiN() {
+	for i := range p {
+		p[i] = ^uint(0)
+	}
+	p[wordsPerPoly-1] &= (1 << bitsInLastWord) - 1
+}
+
+func (p *poly2) cswap(other *poly2, swap uint) {
+	for i := range p {
+		sum := swap & (p[i] ^ other[i])
+		p[i] ^= sum
+		other[i] ^= sum
+	}
+}
+
+func (p *poly2) fmadd(m uint, in *poly2) {
+	m = ^(m - 1)
+
+	for i := range p {
+		p[i] ^= in[i] & m
+	}
+}
+
+func (p *poly2) lshift1() {
+	var carry uint
+	for i := range p {
+		nextCarry := p[i] >> (bitsPerWord - 1)
+		p[i] <<= 1
+		p[i] |= carry
+		carry = nextCarry
+	}
+}
+
+func (p *poly2) rshift1() {
+	var carry uint
+	for i := len(p) - 1; i >= 0; i-- {
+		nextCarry := p[i] & 1
+		p[i] >>= 1
+		p[i] |= carry << (bitsPerWord - 1)
+		carry = nextCarry
+	}
+}
+
+func (p *poly2) rot(bits uint) {
+	if bits > N {
+		panic("invalid")
+	}
+	var shifted [wordsPerPoly]uint
+	out := (*[wordsPerPoly]uint)(p)
+
+	shift := uint(9)
+	for ; (1 << shift) >= bitsPerWord; shift-- {
+		rotWords(&shifted, out, 1<<shift)
+		cmovWords(out, &shifted, lsbToAll(bits>>shift))
+	}
+	for ; shift < 9; shift-- {
+		rotBits(&shifted, out, 1<<shift)
+		cmovWords(out, &shifted, lsbToAll(bits>>shift))
+	}
+}
+
+type poly [N]uint16
+
+func (in *poly) marshal(out []byte) {
+	p := in[:]
+
+	for len(p) >= 8 {
+		out[0] = byte(p[0])
+		out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5)
+		out[2] = byte(p[1] >> 3)
+		out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2)
+		out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7)
+		out[5] = byte(p[3] >> 1)
+		out[6] = byte(p[3]>>9) | byte((p[4]&0x0f)<<4)
+		out[7] = byte(p[4] >> 4)
+		out[8] = byte(p[4]>>12) | byte((p[5]&0x7f)<<1)
+		out[9] = byte(p[5]>>7) | byte((p[6]&0x03)<<6)
+		out[10] = byte(p[6] >> 2)
+		out[11] = byte(p[6]>>10) | byte((p[7]&0x1f)<<3)
+		out[12] = byte(p[7] >> 5)
+
+		p = p[8:]
+		out = out[13:]
+	}
+
+	// There are four remaining values.
+	out[0] = byte(p[0])
+	out[1] = byte(p[0]>>8) | byte((p[1]&0x07)<<5)
+	out[2] = byte(p[1] >> 3)
+	out[3] = byte(p[1]>>11) | byte((p[2]&0x3f)<<2)
+	out[4] = byte(p[2]>>6) | byte((p[3]&0x01)<<7)
+	out[5] = byte(p[3] >> 1)
+	out[6] = byte(p[3] >> 9)
+}
+
+func (out *poly) unmarshal(in []byte) bool {
+	p := out[:]
+	for i := 0; i < 87; i++ {
+		p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8
+		p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11
+		p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6
+		p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9
+		p[4] = uint16(in[6]>>4) | uint16(in[7])<<4 | uint16(in[8]&1)<<12
+		p[5] = uint16(in[8]>>1) | uint16(in[9]&0x3f)<<7
+		p[6] = uint16(in[9]>>6) | uint16(in[10])<<2 | uint16(in[11]&7)<<10
+		p[7] = uint16(in[11]>>3) | uint16(in[12])<<5
+
+		p = p[8:]
+		in = in[13:]
+	}
+
+	// There are four coefficients left over
+	p[0] = uint16(in[0]) | uint16(in[1]&0x1f)<<8
+	p[1] = uint16(in[1]>>5) | uint16(in[2])<<3 | uint16(in[3]&3)<<11
+	p[2] = uint16(in[3]>>2) | uint16(in[4]&0x7f)<<6
+	p[3] = uint16(in[4]>>7) | uint16(in[5])<<1 | uint16(in[6]&0xf)<<9
+
+	if in[6]&0xf0 != 0 {
+		return false
+	}
+
+	out[N-1] = 0
+	var top int
+	for _, v := range out {
+		top += int(v)
+	}
+
+	out[N-1] = uint16(-top) % Q
+	return true
+}
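The packing arithmetic above: eight 13-bit coefficients occupy 104 bits, i.e. exactly 13 bytes; 87 such blocks cover 696 coefficients, the 7-byte tail carries four more, and the 701st coefficient is never transmitted because unmarshal reconstructs it from the property (which holds for the HRSS public key and ciphertext) that the coefficients sum to zero mod Q. That gives modQBytes = 87×13 + 7 = 1138, matching HRSS_PUBLIC_KEY_BYTES in hrss.h.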
+
+func (in *poly) marshalS3(out []byte) {
+	p := in[:]
+	for len(p) >= 5 {
+		out[0] = byte(p[0] + p[1]*3 + p[2]*9 + p[3]*27 + p[4]*81)
+		out = out[1:]
+		p = p[5:]
+	}
+}
+
+func (out *poly) unmarshalS3(in []byte) bool {
+	p := out[:]
+	for i := 0; i < 140; i++ {
+		c := in[0]
+		if c >= 243 {
+			return false
+		}
+		p[0] = uint16(c % 3)
+		p[1] = uint16((c / 3) % 3)
+		p[2] = uint16((c / 9) % 3)
+		p[3] = uint16((c / 27) % 3)
+		p[4] = uint16((c / 81) % 3)
+
+		p = p[5:]
+		in = in[1:]
+	}
+
+	out[N-1] = 0
+	return true
+}
+
+func (p *poly) modPhiN() {
+	for i := range p {
+		p[i] = (p[i] + Q - p[N-1]) % Q
+	}
+}
+
+func (out *poly) shortSample(in []byte) {
+	//  b  a  result
+	// 00 00 00
+	// 00 01 01
+	// 00 10 10
+	// 00 11 11
+	// 01 00 10
+	// 01 01 00
+	// 01 10 01
+	// 01 11 11
+	// 10 00 01
+	// 10 01 10
+	// 10 10 00
+	// 10 11 11
+	// 11 00 11
+	// 11 01 11
+	// 11 10 11
+	// 11 11 11
+
+	// 1111 1111 1100 1001 1101 0010 1110 0100
+	//   f    f    c    9    d    2    e    4
+	const lookup = uint32(0xffc9d2e4)
+
+	p := out[:]
+	for i := 0; i < 87; i++ {
+		v := binary.LittleEndian.Uint32(in)
+		v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555)
+		for j := 0; j < 8; j++ {
+			p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3)
+			v2 >>= 4
+		}
+		p = p[8:]
+		in = in[4:]
+	}
+
+	// There are four values remaining.
+	v := binary.LittleEndian.Uint32(in)
+	v2 := (v & 0x55555555) + ((v >> 1) & 0x55555555)
+	for j := 0; j < 4; j++ {
+		p[j] = uint16(lookup >> ((v2 & 15) << 1) & 3)
+		v2 >>= 4
+	}
+
+	out[N-1] = 0
+}
+
+func (out *poly) shortSamplePlus(in []byte) {
+	out.shortSample(in)
+
+	var sum uint16
+	for i := 0; i < N-1; i++ {
+		sum += mod3ResultToModQ(out[i] * out[i+1])
+	}
+
+	scale := 1 + (1 & (sum >> 12))
+	for i := 0; i < len(out); i += 2 {
+		out[i] = (out[i] * scale) % 3
+	}
+}
+
+func mul(out, scratch, a, b []uint16) {
+	const schoolbookLimit = 32
+	if len(a) < schoolbookLimit {
+		for i := 0; i < len(a)*2; i++ {
+			out[i] = 0
+		}
+		for i := range a {
+			for j := range b {
+				out[i+j] += a[i] * b[j]
+			}
+		}
+		return
+	}
+
+	lowLen := len(a) / 2
+	highLen := len(a) - lowLen
+	aLow, aHigh := a[:lowLen], a[lowLen:]
+	bLow, bHigh := b[:lowLen], b[lowLen:]
+
+	for i := 0; i < lowLen; i++ {
+		out[i] = aHigh[i] + aLow[i]
+	}
+	if highLen != lowLen {
+		out[lowLen] = aHigh[lowLen]
+	}
+
+	for i := 0; i < lowLen; i++ {
+		out[highLen+i] = bHigh[i] + bLow[i]
+	}
+	if highLen != lowLen {
+		out[highLen+lowLen] = bHigh[lowLen]
+	}
+
+	mul(scratch, scratch[2*highLen:], out[:highLen], out[highLen:highLen*2])
+	mul(out[lowLen*2:], scratch[2*highLen:], aHigh, bHigh)
+	mul(out, scratch[2*highLen:], aLow, bLow)
+
+	for i := 0; i < lowLen*2; i++ {
+		scratch[i] -= out[i] + out[lowLen*2+i]
+	}
+	if lowLen != highLen {
+		scratch[lowLen*2] -= out[lowLen*4]
+	}
+
+	for i := 0; i < 2*highLen; i++ {
+		out[lowLen+i] += scratch[i]
+	}
+}
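The recursion above is Karatsuba multiplication: writing a = aLow + aHigh·x^lowLen and b likewise, it applies a·b = aLow·bLow + ((aLow+aHigh)(bLow+bHigh) - aLow·bLow - aHigh·bHigh)·x^lowLen + aHigh·bHigh·x^(2·lowLen). The uint16 coefficient arithmetic may wrap mod 2^16; that is harmless because the caller reduces mod Q = 2^13, which divides 2^16.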
+
+func (out *poly) mul(a, b *poly) {
+	var prod, scratch [2 * N]uint16
+	mul(prod[:], scratch[:], a[:], b[:])
+	for i := range out {
+		out[i] = (prod[i] + prod[i+N]) % Q
+	}
+}
+
+func (p3 *poly3) mulMod3(x, y *poly3) {
+	// (𝑥^N - 1) is a multiple of Φ(N), so we can work mod (𝑥^N - 1) here and
+	// reduce mod Φ(N) afterwards.
+	x3 := *x
+	y3 := *y
+	s := x3.s[:]
+	a := x3.a[:]
+	sw := s[0]
+	aw := a[0]
+	p3.zero()
+	var shift uint
+	for i := 0; i < N; i++ {
+		p3.fmadd(sw, aw, &y3)
+		sw >>= 1
+		aw >>= 1
+		shift++
+		if shift == bitsPerWord {
+			s = s[1:]
+			a = a[1:]
+			sw = s[0]
+			aw = a[0]
+			shift = 0
+		}
+		y3.mulx()
+	}
+	p3.modPhiN()
+}
+
+// mod3ToModQ maps {0, 1, 2, 3} to {0, 1, Q-1, 0xffff}
+// The case of n == 3 should never happen but is included so that modQToMod3
+// can easily catch invalid inputs.
+func mod3ToModQ(n uint16) uint16 {
+	return uint16(uint64(0xffff1fff00010000) >> (16 * n))
+}
+
+// modQToMod3 maps {0, 1, Q-1} to {(0, 0), (0, 1), (1, 0)} and also returns an int
+// which is one if the input is in range and zero otherwise.
+func modQToMod3(n uint16) (uint16, int) {
+	result := (n&3 - (n>>1)&1)
+	return result, subtle.ConstantTimeEq(int32(mod3ToModQ(result)), int32(n))
+}
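A couple of spot checks of these mappings, written as a hypothetical test; the test name and the use of the testing package are illustrative, not part of this change:

	func TestModQMod3Mappings(t *testing.T) {
		if mod3ToModQ(0) != 0 || mod3ToModQ(1) != 1 || mod3ToModQ(2) != Q-1 {
			t.Error("mod3ToModQ mapping is wrong")
		}
		if r, ok := modQToMod3(Q - 1); r != 2 || ok != 1 {
			t.Error("modQToMod3(Q-1) should be 2 and in range")
		}
		if _, ok := modQToMod3(5); ok != 0 {
			t.Error("5 is not a valid mod-3 representative")
		}
	}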
+
+// mod3ResultToModQ maps {0, 1, 2, 4} to {0, 1, Q-1, 1}
+func mod3ResultToModQ(n uint16) uint16 {
+	return ((((uint16(0x13) >> n) & 1) - 1) & 0x1fff) | ((uint16(0x12) >> n) & 1)
+	//shift := (uint(0x324) >> (2 * n)) & 3
+	//return uint16(uint64(0x00011fff00010000) >> (16 * shift))
+}
+
+// mulXMinus1 sets out to a×(𝑥 - 1) mod (𝑥^N - 1).
+func (out *poly) mulXMinus1() {
+	// Multiplying by (𝑥 - 1) means negating each coefficient and adding in
+	// the value of the previous one.
+	origOut700 := out[700]
+
+	for i := N - 1; i > 0; i-- {
+		out[i] = (Q - out[i] + out[i-1]) % Q
+	}
+	out[0] = (Q - out[0] + origOut700) % Q
+}
+
+func (out *poly) lift(a *poly) {
+	// We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the
+	// Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime).
+
+	// 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up:
+	//
+	// R.<x> = PolynomialRing(GF(3)…)
+	// inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n))
+	// list(inv)[:15]
+	//   [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2]
+	//
+	// This three-element pattern of coefficients repeats for the whole
+	// polynomial.
+	//
+	// Next define the overbar operator such that z̅ = z[0] +
+	// reverse(z[1:]). (Index zero of a polynomial here is the coefficient
+	// of the constant term. So index one is the coefficient of 𝑥 and so
+	// on.)
+	//
+	// A less odd way to define this is to see that z̅ negates the indexes,
+	// so z̅[0] = z[-0], z̅[1] = z[-1] and so on.
+	//
+	// The use of z̅  is that, when working mod (𝑥^701 - 1), vz[0] = <v,
+	// z̅>, vz[1] = <v, 𝑥z̅>, …. (Where <a, b> is the inner product: the sum
+	// of the point-wise products.) Although we calculated the inverse mod
+	// Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end.
+	// (That's because (𝑥^N - 1) is a multiple of Φ(N).)
+	//
+	// When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation
+	// of the list of coefficients.
+	//
+	// Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like:
+	//
+	// def reverse(xs):
+	//   suffix = list(xs[1:])
+	//   suffix.reverse()
+	//   return [xs[0]] + suffix
+	//
+	// def rotate(xs):
+	//   return [xs[-1]] + xs[:-1]
+	//
+	// zoverbar = reverse(list(inv) + [0])
+	// xzoverbar = rotate(reverse(list(inv) + [0]))
+	// x2zoverbar = rotate(rotate(reverse(list(inv) + [0])))
+	//
+	// zoverbar[:15]
+	//   [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1]
+	// xzoverbar[:15]
+	//   [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0]
+	// x2zoverbar[:15]
+	//   [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2]
+	//
+	// (For a formula for z̅, see lemma two of appendix B.)
+	//
+	// After the first three elements have been taken care of, the remainder
+	// follows a repeating three-element cycle. The next value (𝑥^3z̅) involves
+	// three rotations of the first pattern, thus the three-element cycle
+	// lines up. However, the discontinuity in the first three elements
+	// obviously moves to a different position. Consider the difference
+	// between 𝑥^3z̅ and z̅:
+	//
+	// [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15]
+	//    [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+	//
+	// This pattern of differences is the same for all elements, although it
+	// obviously moves right with the rotations.
+	//
+	// From this, we reach algorithm eight of appendix B.
+
+	// Handle the first three elements of the inner products.
+	out[0] = a[0] + a[2]
+	out[1] = a[1]
+	out[2] = 2*a[0] + a[2]
+
+	// Use the repeating pattern to complete the first three inner products.
+	for i := 3; i < 699; i += 3 {
+		out[0] += 2*a[i] + a[i+2]
+		out[1] += a[i] + 2*a[i+1]
+		out[2] += a[i+1] + 2*a[i+2]
+	}
+
+	// Handle the fact that the three-element pattern doesn't fill the
+	// polynomial exactly (since 701 isn't a multiple of three).
+	out[2] += a[700]
+	out[0] += 2 * a[699]
+	out[1] += a[699] + 2*a[700]
+
+	out[0] = out[0] % 3
+	out[1] = out[1] % 3
+	out[2] = out[2] % 3
+
+	// Calculate the remaining inner products by taking advantage of the
+	// fact that the pattern repeats every three cycles and the pattern of
+	// differences moves with the rotation.
+	for i := 3; i < N; i++ {
+		// Adding twice something is the same as subtracting when working
+		// mod 3. Doing it this way avoids underflow. Underflow is bad
+		// because "% 3" doesn't work correctly for negative numbers
+		// here since underflow will wrap to 2^16-1 and 2^16 isn't a
+		// multiple of three.
+		out[i] = (out[i-3] + 2*(a[i-2]+a[i-1]+a[i])) % 3
+	}
+
+	// Reduce mod Φ(N) by subtracting a multiple of out[700] from every
+	// element and convert to mod Q. (See above about adding twice as
+	// subtraction.)
+	v := out[700] * 2
+	for i := range out {
+		out[i] = mod3ToModQ((out[i] + v) % 3)
+	}
+
+	out.mulXMinus1()
+}
+
+func (a *poly) cswap(b *poly, swap uint16) {
+	for i := range a {
+		sum := swap & (a[i] ^ b[i])
+		a[i] ^= sum
+		b[i] ^= sum
+	}
+}
+
+func lt(a, b uint) uint {
+	if a < b {
+		return ^uint(0)
+	}
+	return 0
+}
+
+func bsMul(s1, a1, s2, a2 uint) (s3, a3 uint) {
+	s3 = (a1 & s2) ^ (s1 & a2)
+	a3 = (a1 & a2) ^ (s1 & s2)
+	return
+}
+
+func (out *poly3) invertMod3(in *poly3) {
+	// This algorithm follows algorithm 10 in the paper. (Although note that
+	// the paper appears to have a bug: k should start at zero, not one.)
+	// The best explanation for why it works is in the "Why it works"
+	// section of
+	// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+	var k uint
+	degF, degG := uint(N-1), uint(N-1)
+
+	var b, c, g poly3
+	f := *in
+
+	for i := range g.a {
+		g.a[i] = ^uint(0)
+	}
+
+	b.a[0] = 1
+
+	var f0s, f0a uint
+	stillGoing := ^uint(0)
+	for i := 0; i < 2*(N-1)-1; i++ {
+		ss, sa := bsMul(f.s[0], f.a[0], g.s[0], g.a[0])
+		ss, sa = sa&stillGoing&1, ss&stillGoing&1
+		shouldSwap := ^uint(int((ss|sa)-1)>>(bitsPerWord-1)) & lt(degF, degG)
+		f.cswap(&g, shouldSwap)
+		b.cswap(&c, shouldSwap)
+		degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap)
+		f.fmadd(ss, sa, &g)
+		b.fmadd(ss, sa, &c)
+
+		f.divx()
+		f.s[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1
+		f.a[wordsPerPoly-1] &= ((1 << bitsInLastWord) - 1) >> 1
+		c.mulx()
+		c.s[0] &= ^uint(1)
+		c.a[0] &= ^uint(1)
+
+		degF--
+		k += 1 & stillGoing
+		f0s = (stillGoing & f.s[0]) | (^stillGoing & f0s)
+		f0a = (stillGoing & f.a[0]) | (^stillGoing & f0a)
+		stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1))
+	}
+
+	k -= N & lt(N, k)
+	*out = b
+	out.rot(k)
+	out.mulConst(f0s, f0a)
+	out.modPhiN()
+}
+
+func (out *poly) invertMod2(a *poly) {
+	// This algorithm follows a mix of algorithm 10 in the paper and the first
+	// page of the PDF linked below. (Although note that the paper appears
+	// to have a bug: k should start at zero, not one.) The best explanation
+	// for why it works is in the "Why it works" section of
+	// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf.
+	var k uint
+	degF, degG := uint(N-1), uint(N-1)
+
+	var f poly2
+	f.fromDiscrete(a)
+	var b, c, g poly2
+	g.setPhiN()
+	b[0] = 1
+
+	stillGoing := ^uint(0)
+	for i := 0; i < 2*(N-1)-1; i++ {
+		s := uint(f[0]&1) & stillGoing
+		shouldSwap := ^(s - 1) & lt(degF, degG)
+		f.cswap(&g, shouldSwap)
+		b.cswap(&c, shouldSwap)
+		degF, degG = (degG&shouldSwap)|(degF & ^shouldSwap), (degF&shouldSwap)|(degG&^shouldSwap)
+		f.fmadd(s, &g)
+		b.fmadd(s, &c)
+
+		f.rshift1()
+		c.lshift1()
+
+		degF--
+		k += 1 & stillGoing
+		stillGoing = ^uint(int(degF-1) >> (bitsPerWord - 1))
+	}
+
+	k -= N & lt(N, k)
+	b.rot(k)
+	out.fromMod2(&b)
+}
+
+func (out *poly) invert(origA *poly) {
+	// Inversion mod Q, which is done based on the result of inverting mod
+	// 2. See the NTRU paper, page three.
+	var a, tmp, tmp2, b poly
+	b.invertMod2(origA)
+
+	// Negate a.
+	for i := range a {
+		a[i] = Q - origA[i]
+	}
+
+	// We are working mod Q=2**13 and we need to iterate ceil(log_2(13))
+	// times, which is four.
+	for i := 0; i < 4; i++ {
+		tmp.mul(&a, &b)
+		tmp[0] += 2
+		tmp2.mul(&b, &tmp)
+		b = tmp2
+	}
+
+	*out = b
+}
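The loop above is Newton/Hensel iteration: if b ≡ a⁻¹ (mod 2^k) then b·(2 - a·b) ≡ a⁻¹ (mod 2^2k), and the code computes this as b·((-a)·b + 2) using the negated copy of a. Each pass therefore doubles the precision, capped at 13 bits by the mod-Q reductions inside mul, so four passes (ceil(log2(13)), as the comment notes) take the mod-2 inverse to an inverse mod Q.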
+
+type PublicKey struct {
+	h poly
+}
+
+func ParsePublicKey(in []byte) (*PublicKey, bool) {
+	ret := new(PublicKey)
+	if !ret.h.unmarshal(in) {
+		return nil, false
+	}
+	return ret, true
+}
+
+func (pub *PublicKey) Marshal() []byte {
+	ret := make([]byte, modQBytes)
+	pub.h.marshal(ret)
+	return ret
+}
+
+func (pub *PublicKey) Encap(rand io.Reader) (ciphertext []byte, sharedKey []byte) {
+	var randBytes [352 + 352]byte
+	if _, err := io.ReadFull(rand, randBytes[:]); err != nil {
+		panic("rand failed")
+	}
+
+	var m, r poly
+	m.shortSample(randBytes[:352])
+	r.shortSample(randBytes[352:])
+
+	var mBytes, rBytes [mod3Bytes]byte
+	m.marshalS3(mBytes[:])
+	r.marshalS3(rBytes[:])
+
+	h := sha256.New()
+	h.Write([]byte("confirmation hash\x00"))
+	h.Write(mBytes[:])
+	h.Write(rBytes[:])
+	confirmationDigest := h.Sum(nil)
+
+	encrypted := pub.owf(&m, &r)
+	ciphertext = make([]byte, 0, len(encrypted)+len(confirmationDigest))
+	ciphertext = append(ciphertext, encrypted...)
+	ciphertext = append(ciphertext, confirmationDigest...)
+
+	h.Reset()
+	h.Write([]byte("shared key\x00"))
+	h.Write(mBytes[:])
+	h.Write(rBytes[:])
+	h.Write(ciphertext)
+	sharedKey = h.Sum(nil)
+
+	return ciphertext, sharedKey
+}
+
+func (pub *PublicKey) owf(m, r *poly) []byte {
+	for i := range r {
+		r[i] = mod3ToModQ(r[i])
+	}
+
+	var mq poly
+	mq.lift(m)
+
+	var e poly
+	e.mul(r, &pub.h)
+	for i := range e {
+		e[i] = (e[i] + mq[i]) % Q
+	}
+
+	ret := make([]byte, modQBytes)
+	e.marshal(ret[:])
+	return ret
+}
+
+type PrivateKey struct {
+	PublicKey
+	f, fp   poly3
+	hInv    poly
+	hmacKey [32]byte
+}
+
+func (priv *PrivateKey) Marshal() []byte {
+	var ret [2*mod3Bytes + modQBytes]byte
+	priv.f.marshal(ret[:])
+	priv.fp.marshal(ret[mod3Bytes:])
+	priv.h.marshal(ret[2*mod3Bytes:])
+	return ret[:]
+}
+
+func (priv *PrivateKey) Decap(ciphertext []byte) (sharedKey []byte, ok bool) {
+	if len(ciphertext) != modQBytes+32 {
+		return nil, false
+	}
+
+	var e poly
+	if !e.unmarshal(ciphertext[:modQBytes]) {
+		return nil, false
+	}
+
+	var f poly
+	f.fromMod3ToModQ(&priv.f)
+
+	var v1, m poly
+	v1.mul(&e, &f)
+
+	var v13 poly3
+	v13.fromDiscreteMod3(&v1)
+	// Note: v13 is not reduced mod phi(n).
+
+	var m3 poly3
+	m3.mulMod3(&v13, &priv.fp)
+	m3.modPhiN()
+	m.fromMod3(&m3)
+
+	var mLift, delta poly
+	mLift.lift(&m)
+	for i := range delta {
+		delta[i] = (e[i] - mLift[i] + Q) % Q
+	}
+	delta.mul(&delta, &priv.hInv)
+	delta.modPhiN()
+
+	var r poly3
+	allOk := r.fromModQ(&delta)
+
+	var mBytes, rBytes [mod3Bytes]byte
+	m.marshalS3(mBytes[:])
+	r.marshal(rBytes[:])
+
+	h := sha256.New()
+	h.Write([]byte("confirmation hash\x00"))
+	h.Write(mBytes[:])
+	h.Write(rBytes[:])
+	confirmationDigest := h.Sum(nil)
+
+	var rPoly poly
+	rPoly.fromMod3(&r)
+	encrypted := priv.PublicKey.owf(&m, &rPoly)
+	expectedCiphertext := make([]byte, 0, len(encrypted)+len(confirmationDigest))
+	expectedCiphertext = append(expectedCiphertext, encrypted...)
+	expectedCiphertext = append(expectedCiphertext, confirmationDigest...)
+
+	allOk &= subtle.ConstantTimeCompare(ciphertext, expectedCiphertext)
+
+	hmacHash := hmac.New(sha256.New, priv.hmacKey[:])
+	hmacHash.Write(ciphertext)
+	hmacDigest := hmacHash.Sum(nil)
+
+	h.Reset()
+	h.Write([]byte("shared key\x00"))
+	h.Write(mBytes[:])
+	h.Write(rBytes[:])
+	h.Write(ciphertext)
+	sharedKey = h.Sum(nil)
+
+	mask := uint8(allOk - 1)
+	for i := range sharedKey {
+		sharedKey[i] = (sharedKey[i] & ^mask) | (hmacDigest[i] & mask)
+	}
+
+	return sharedKey, true
+}
+
+func GenerateKey(rand io.Reader) PrivateKey {
+	var randBytes [352 + 352]byte
+	if _, err := io.ReadFull(rand, randBytes[:]); err != nil {
+		panic("rand failed")
+	}
+
+	var f poly
+	f.shortSamplePlus(randBytes[:352])
+	var priv PrivateKey
+	priv.f.fromDiscrete(&f)
+	priv.fp.invertMod3(&priv.f)
+
+	var g poly
+	g.shortSamplePlus(randBytes[352:])
+
+	var pgPhi1 poly
+	for i := range g {
+		pgPhi1[i] = mod3ToModQ(g[i])
+	}
+	for i := range pgPhi1 {
+		pgPhi1[i] = (pgPhi1[i] * 3) % Q
+	}
+	pgPhi1.mulXMinus1()
+
+	var fModQ poly
+	fModQ.fromMod3ToModQ(&priv.f)
+
+	var pfgPhi1 poly
+	pfgPhi1.mul(&fModQ, &pgPhi1)
+
+	var i poly
+	i.invert(&pfgPhi1)
+
+	priv.h.mul(&i, &pgPhi1)
+	priv.h.mul(&priv.h, &pgPhi1)
+
+	priv.hInv.mul(&i, &fModQ)
+	priv.hInv.mul(&priv.hInv, &fModQ)
+
+	return priv
+}
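Tracing the algebra of the multiplications above, informally and ignoring the exact quotient rings involved: with pgPhi1 = 3·g·(x - 1) and i = (f·pgPhi1)⁻¹, the code sets h = i·pgPhi1·pgPhi1 = 3·g·(x - 1)/f and hInv = i·f·f = f/(3·g·(x - 1)), so hInv is the inverse of h and both are obtained from a single polynomial inversion.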
diff --git a/ssl/test/runner/key_agreement.go b/ssl/test/runner/key_agreement.go
index 791325c..f40552d 100644
--- a/ssl/test/runner/key_agreement.go
+++ b/ssl/test/runner/key_agreement.go
@@ -17,6 +17,7 @@
 
 	"boringssl.googlesource.com/boringssl/ssl/test/runner/curve25519"
 	"boringssl.googlesource.com/boringssl/ssl/test/runner/ed25519"
+	"boringssl.googlesource.com/boringssl/ssl/test/runner/hrss"
 )
 
 type keyType int
@@ -37,7 +38,7 @@
 	exportKey     *rsa.PrivateKey
 }
 
-func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *rsaKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
 	// Save the client version for comparison later.
 	ka.clientVersion = clientHello.vers
 
@@ -347,6 +348,90 @@
 	return out[:], nil
 }
 
+// cecpq2Curve implements CECPQ2, which is HRSS+SXY combined with X25519.
+type cecpq2Curve struct {
+	x25519PrivateKey [32]byte
+	hrssPrivateKey   hrss.PrivateKey
+}
+
+func (e *cecpq2Curve) offer(rand io.Reader) (publicKey []byte, err error) {
+	if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil {
+		return nil, err
+	}
+
+	var x25519Public [32]byte
+	curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey)
+
+	e.hrssPrivateKey = hrss.GenerateKey(rand)
+	hrssPublic := e.hrssPrivateKey.PublicKey.Marshal()
+
+	var ret []byte
+	ret = append(ret, x25519Public[:]...)
+	ret = append(ret, hrssPublic...)
+	return ret, nil
+}
+
+func (e *cecpq2Curve) accept(rand io.Reader, peerKey []byte) (publicKey []byte, preMasterSecret []byte, err error) {
+	if len(peerKey) != 32+hrss.PublicKeySize {
+		return nil, nil, errors.New("tls: bad length CECPQ2 offer")
+	}
+
+	if _, err := io.ReadFull(rand, e.x25519PrivateKey[:]); err != nil {
+		return nil, nil, err
+	}
+
+	var x25519Shared, x25519PeerKey, x25519Public [32]byte
+	copy(x25519PeerKey[:], peerKey)
+	curve25519.ScalarBaseMult(&x25519Public, &e.x25519PrivateKey)
+	curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey)
+
+	// Per RFC 7748, reject the all-zero value in constant time.
+	var zeros [32]byte
+	if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 {
+		return nil, nil, errors.New("tls: X25519 value with wrong order")
+	}
+
+	hrssPublicKey, ok := hrss.ParsePublicKey(peerKey[32:])
+	if !ok {
+		return nil, nil, errors.New("tls: bad CECPQ2 offer")
+	}
+
+	hrssCiphertext, hrssShared := hrssPublicKey.Encap(rand)
+
+	publicKey = append(publicKey, x25519Public[:]...)
+	publicKey = append(publicKey, hrssCiphertext...)
+	preMasterSecret = append(preMasterSecret, x25519Shared[:]...)
+	preMasterSecret = append(preMasterSecret, hrssShared...)
+
+	return publicKey, preMasterSecret, nil
+}
+
+func (e *cecpq2Curve) finish(peerKey []byte) (preMasterSecret []byte, err error) {
+	if len(peerKey) != 32+hrss.CiphertextSize {
+		return nil, errors.New("tls: bad length CECPQ2 reply")
+	}
+
+	var x25519Shared, x25519PeerKey [32]byte
+	copy(x25519PeerKey[:], peerKey)
+	curve25519.ScalarMult(&x25519Shared, &e.x25519PrivateKey, &x25519PeerKey)
+
+	// Per RFC 7748, reject the all-zero value in constant time.
+	var zeros [32]byte
+	if subtle.ConstantTimeCompare(zeros[:], x25519Shared[:]) == 1 {
+		return nil, errors.New("tls: X25519 value with wrong order")
+	}
+
+	hrssShared, ok := e.hrssPrivateKey.Decap(peerKey[32:])
+	if !ok {
+		return nil, errors.New("tls: invalid HRSS ciphertext")
+	}
+
+	preMasterSecret = append(preMasterSecret, x25519Shared[:]...)
+	preMasterSecret = append(preMasterSecret, hrssShared...)
+
+	return preMasterSecret, nil
+}
+
 func curveForCurveID(id CurveID, config *Config) (ecdhCurve, bool) {
 	switch id {
 	case CurveP224:
@@ -359,6 +444,8 @@
 		return &ellipticECDHCurve{curve: elliptic.P521(), sendCompressed: config.Bugs.SendCompressedCoordinates}, true
 	case CurveX25519:
 		return &x25519ECDHCurve{setHighBit: config.Bugs.SetX25519HighBit}, true
+	case CurveCECPQ2:
+		return &cecpq2Curve{}, true
 	default:
 		return nil, false
 	}
@@ -501,12 +588,17 @@
 	peerKey []byte
 }
 
-func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *ecdheKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
 	var curveid CurveID
 	preferredCurves := config.curvePreferences()
 
 NextCandidate:
 	for _, candidate := range preferredCurves {
+		if candidate == CurveCECPQ2 && version < VersionTLS13 {
+			// CECPQ2 is TLS 1.3-only.
+			continue
+		}
+
 		for _, c := range clientHello.supportedCurves {
 			if candidate == c {
 				curveid = c
@@ -614,7 +706,7 @@
 // exchange.
 type nilKeyAgreement struct{}
 
-func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *nilKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
 	return nil, nil
 }
 
@@ -666,7 +758,7 @@
 	identityHint string
 }
 
-func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg) (*serverKeyExchangeMsg, error) {
+func (ka *pskKeyAgreement) generateServerKeyExchange(config *Config, cert *Certificate, clientHello *clientHelloMsg, hello *serverHelloMsg, version uint16) (*serverKeyExchangeMsg, error) {
 	// Assemble the identity hint.
 	bytes := make([]byte, 2+len(config.PreSharedKeyIdentity))
 	bytes[0] = byte(len(config.PreSharedKeyIdentity) >> 8)
@@ -675,7 +767,7 @@
 
 	// If there is one, append the base key agreement's
 	// ServerKeyExchange.
-	baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello)
+	baseSkx, err := ka.base.generateServerKeyExchange(config, cert, clientHello, hello, version)
 	if err != nil {
 		return nil, err
 	}
diff --git a/ssl/test/runner/runner.go b/ssl/test/runner/runner.go
index da81f23..cbce065 100644
--- a/ssl/test/runner/runner.go
+++ b/ssl/test/runner/runner.go
@@ -9619,7 +9619,7 @@
 			CipherSuites: []uint16{TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256},
 			Certificates: []Certificate{ecdsaP256Certificate},
 		},
-		flags:         []string{"-p384-only"},
+		flags:         []string{"-curves", strconv.Itoa(int(CurveP384))},
 		shouldFail:    true,
 		expectedError: ":BAD_ECC_CERT:",
 	})
@@ -9631,7 +9631,7 @@
 			MaxVersion:   VersionTLS13,
 			Certificates: []Certificate{ecdsaP256Certificate},
 		},
-		flags: []string{"-p384-only"},
+		flags: []string{"-curves", strconv.Itoa(int(CurveP384))},
 	})
 
 	// In TLS 1.2, the ECDSA curve is not in the signature algorithm.
@@ -10711,6 +10711,7 @@
 	{"P-384", CurveP384},
 	{"P-521", CurveP521},
 	{"X25519", CurveX25519},
+	{"CECPQ2", CurveCECPQ2},
 }
 
 const bogusCurve = 0x1234
@@ -10718,6 +10719,10 @@
 func addCurveTests() {
 	for _, curve := range testCurves {
 		for _, ver := range tlsVersions {
+			if curve.id == CurveCECPQ2 && ver.version < VersionTLS13 {
+				continue
+			}
+
 			suffix := curve.name + "-" + ver.name
 
 			testCases = append(testCases, testCase{
@@ -10758,7 +10763,7 @@
 				expectedCurveID: curve.id,
 			})
 
-			if curve.id != CurveX25519 {
+			if curve.id != CurveX25519 && curve.id != CurveCECPQ2 {
 				testCases = append(testCases, testCase{
 					name: "CurveTest-Client-Compressed-" + suffix,
 					config: Config{
@@ -10902,7 +10907,7 @@
 				IgnorePeerCurvePreferences: true,
 			},
 		},
-		flags:         []string{"-p384-only"},
+		flags:         []string{"-curves", strconv.Itoa(int(CurveP384))},
 		shouldFail:    true,
 		expectedError: ":WRONG_CURVE:",
 	})
@@ -10918,7 +10923,7 @@
 				SendCurve: CurveP256,
 			},
 		},
-		flags:         []string{"-p384-only"},
+		flags:         []string{"-curves", strconv.Itoa(int(CurveP384))},
 		shouldFail:    true,
 		expectedError: ":WRONG_CURVE:",
 	})
@@ -11169,6 +11174,112 @@
 			},
 		},
 	})
+
+	// CECPQ2 should not be offered by a TLS < 1.3 client.
+	testCases = append(testCases, testCase{
+		name: "CECPQ2NotInTLS12",
+		config: Config{
+			Bugs: ProtocolBugs{
+				FailIfCECPQ2Offered: true,
+			},
+		},
+		flags: []string{
+			"-max-version", strconv.Itoa(VersionTLS12),
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-curves", strconv.Itoa(int(CurveX25519)),
+		},
+	})
+
+	// CECPQ2 should not crash a TLS < 1.3 client if the server mistakenly
+	// selects it.
+	testCases = append(testCases, testCase{
+		name: "CECPQ2NotAcceptedByTLS12Client",
+		config: Config{
+			Bugs: ProtocolBugs{
+				SendCurve: CurveCECPQ2,
+			},
+		},
+		flags: []string{
+			"-max-version", strconv.Itoa(VersionTLS12),
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-curves", strconv.Itoa(int(CurveX25519)),
+		},
+		shouldFail:    true,
+		expectedError: ":WRONG_CURVE:",
+	})
+
+	// CECPQ2 should not be offered by default as a client.
+	testCases = append(testCases, testCase{
+		name: "CECPQ2NotEnabledByDefaultInClients",
+		config: Config{
+			MinVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				FailIfCECPQ2Offered: true,
+			},
+		},
+	})
+
+	// If CECPQ2 is offered, key shares for both CECPQ2 and X25519 should be sent.
+	testCases = append(testCases, testCase{
+		name: "NotJustCECPQ2KeyShare",
+		config: Config{
+			MinVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				ExpectedKeyShares: []CurveID{CurveCECPQ2, CurveX25519},
+			},
+		},
+		flags: []string{
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-curves", strconv.Itoa(int(CurveX25519)),
+			"-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)),
+		},
+	})
+
+	// ... but only if CECPQ2 is listed first.
+	testCases = append(testCases, testCase{
+		name: "CECPQ2KeyShareNotIncludedSecond",
+		config: Config{
+			MinVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				ExpectedKeyShares: []CurveID{CurveX25519},
+			},
+		},
+		flags: []string{
+			"-curves", strconv.Itoa(int(CurveX25519)),
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-expect-curve-id", strconv.Itoa(int(CurveX25519)),
+		},
+	})
+
+	// If CECPQ2 is the only configured curve, the key share is sent.
+	testCases = append(testCases, testCase{
+		name: "JustConfiguringCECPQ2Works",
+		config: Config{
+			MinVersion: VersionTLS13,
+			Bugs: ProtocolBugs{
+				ExpectedKeyShares: []CurveID{CurveCECPQ2},
+			},
+		},
+		flags: []string{
+			"-curves", strconv.Itoa(int(CurveCECPQ2)),
+			"-expect-curve-id", strconv.Itoa(int(CurveCECPQ2)),
+		},
+	})
+
+	// As a server, CECPQ2 is not yet enabled by default.
+	testCases = append(testCases, testCase{
+		testType: serverTest,
+		name:     "CECPQ2NotEnabledByDefaultForAServer",
+		config: Config{
+			MinVersion:       VersionTLS13,
+			CurvePreferences: []CurveID{CurveCECPQ2, CurveX25519},
+			DefaultCurves:    []CurveID{CurveCECPQ2},
+		},
+		flags: []string{
+			"-server-preference",
+			"-expect-curve-id", strconv.Itoa(int(CurveX25519)),
+		},
+	})
 }
 
 func addTLS13RecordTests() {
@@ -12706,7 +12817,7 @@
 				},
 			},
 			tls13Variant:  variant,
-			flags:         []string{"-p384-only"},
+			flags:         []string{"-curves", strconv.Itoa(int(CurveP384))},
 			shouldFail:    true,
 			expectedError: ":WRONG_CURVE:",
 		})
diff --git a/ssl/test/test_config.cc b/ssl/test/test_config.cc
index 7447d5a..9a5c9b2 100644
--- a/ssl/test/test_config.cc
+++ b/ssl/test/test_config.cc
@@ -104,7 +104,6 @@
   { "-renegotiate-ignore", &TestConfig::renegotiate_ignore },
   { "-forbid-renegotiation-after-handshake",
     &TestConfig::forbid_renegotiation_after_handshake },
-  { "-p384-only", &TestConfig::p384_only },
   { "-enable-all-curves", &TestConfig::enable_all_curves },
   { "-use-old-client-cert-callback",
     &TestConfig::use_old_client_cert_callback },
@@ -147,6 +146,7 @@
   { "-handshaker-resume", &TestConfig::handshaker_resume },
   { "-reverify-on-resume", &TestConfig::reverify_on_resume },
   { "-jdk11-workaround", &TestConfig::jdk11_workaround },
+  { "-server-preference", &TestConfig::server_preference },
 };
 
 const Flag<std::string> kStringFlags[] = {
@@ -220,10 +220,10 @@
 };
 
 const Flag<std::vector<int>> kIntVectorFlags[] = {
-  { "-signing-prefs", &TestConfig::signing_prefs },
-  { "-verify-prefs", &TestConfig::verify_prefs },
-  { "-expect-peer-verify-pref",
-    &TestConfig::expected_peer_verify_prefs },
+    {"-signing-prefs", &TestConfig::signing_prefs},
+    {"-verify-prefs", &TestConfig::verify_prefs},
+    {"-expect-peer-verify-pref", &TestConfig::expected_peer_verify_prefs},
+    {"-curves", &TestConfig::curves},
 };
 
 bool ParseFlag(char *flag, int argc, char **argv, int *i,
@@ -1294,7 +1294,6 @@
     return nullptr;
   }
 
-
   if (install_cert_compression_algs &&
       (!SSL_CTX_add_cert_compression_alg(
            ssl_ctx.get(), 0xff02,
@@ -1341,6 +1340,10 @@
     abort();
   }
 
+  if (server_preference) {
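+    // Let the shim's own preference order take precedence over the client's.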
+    SSL_CTX_set_options(ssl_ctx.get(), SSL_OP_CIPHER_SERVER_PREFERENCE);
+  }
+
   return ssl_ctx;
 }
 
@@ -1589,16 +1592,43 @@
   if (!check_close_notify) {
     SSL_set_quiet_shutdown(ssl.get(), 1);
   }
-  if (p384_only) {
-    int nid = NID_secp384r1;
-    if (!SSL_set1_curves(ssl.get(), &nid, 1)) {
-      return nullptr;
+  if (!curves.empty()) {
+    std::vector<int> nids;
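+    // Map each TLS group ID from -curves to the corresponding OpenSSL NID
+    // before configuring the connection with SSL_set1_curves.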
+    for (auto curve : curves) {
+      switch (curve) {
+        case SSL_CURVE_SECP224R1:
+          nids.push_back(NID_secp224r1);
+          break;
+
+        case SSL_CURVE_SECP256R1:
+          nids.push_back(NID_X9_62_prime256v1);
+          break;
+
+        case SSL_CURVE_SECP384R1:
+          nids.push_back(NID_secp384r1);
+          break;
+
+        case SSL_CURVE_SECP521R1:
+          nids.push_back(NID_secp521r1);
+          break;
+
+        case SSL_CURVE_X25519:
+          nids.push_back(NID_X25519);
+          break;
+
+        case SSL_CURVE_CECPQ2:
+          nids.push_back(NID_CECPQ2);
+          break;
+      }
+      if (!SSL_set1_curves(ssl.get(), &nids[0], nids.size())) {
+        return nullptr;
+      }
     }
   }
   if (enable_all_curves) {
     static const int kAllCurves[] = {
         NID_secp224r1, NID_X9_62_prime256v1, NID_secp384r1,
-        NID_secp521r1, NID_X25519,
+        NID_secp521r1, NID_X25519,           NID_CECPQ2,
     };
     if (!SSL_set1_curves(ssl.get(), kAllCurves,
                          OPENSSL_ARRAY_SIZE(kAllCurves))) {
diff --git a/ssl/test/test_config.h b/ssl/test/test_config.h
index bffe911..0e842c0 100644
--- a/ssl/test/test_config.h
+++ b/ssl/test/test_config.h
@@ -33,6 +33,7 @@
   std::vector<int> signing_prefs;
   std::vector<int> verify_prefs;
   std::vector<int> expected_peer_verify_prefs;
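+  // Each -curves flag appends one TLS group ID; together they form the
+  // shim's curve preference list.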
+  std::vector<int> curves;
   std::string key_file;
   std::string cert_file;
   std::string expected_server_name;
@@ -122,7 +123,6 @@
   bool renegotiate_ignore = false;
   bool forbid_renegotiation_after_handshake = false;
   int expect_peer_signature_algorithm = 0;
-  bool p384_only = false;
   bool enable_all_curves = false;
   int expect_curve_id = 0;
   bool use_old_client_cert_callback = false;
@@ -170,6 +170,7 @@
   bool handshaker_resume = false;
   std::string handshaker_path;
   bool jdk11_workaround = false;
+  bool server_preference = false;
 
   int argc;
   char **argv;
diff --git a/ssl/tls13_client.cc b/ssl/tls13_client.cc
index 0d77896..40913dc 100644
--- a/ssl/tls13_client.cc
+++ b/ssl/tls13_client.cc
@@ -165,15 +165,17 @@
       return ssl_hs_error;
     }
 
-    // Check that the HelloRetryRequest does not request the key share that
-    // was provided in the initial ClientHello.
-    if (hs->key_share->GroupID() == group_id) {
+    // Check that the HelloRetryRequest does not request a key share that was
+    // provided in the initial ClientHello.
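+    // (When CECPQ2 is offered, the initial ClientHello can carry a second key
+    // share, so both slots are checked.)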
+    if (hs->key_shares[0]->GroupID() == group_id ||
+        (hs->key_shares[1] && hs->key_shares[1]->GroupID() == group_id)) {
       ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_ILLEGAL_PARAMETER);
       OPENSSL_PUT_ERROR(SSL, SSL_R_WRONG_CURVE);
       return ssl_hs_error;
     }
 
-    hs->key_share.reset();
+    hs->key_shares[0].reset();
+    hs->key_shares[1].reset();
     hs->retry_group = group_id;
   }
 
diff --git a/tool/speed.cc b/tool/speed.cc
index 2175baa..975fb53 100644
--- a/tool/speed.cc
+++ b/tool/speed.cc
@@ -32,6 +32,7 @@
 #include <openssl/ecdsa.h>
 #include <openssl/ec_key.h>
 #include <openssl/evp.h>
+#include <openssl/hrss.h>
 #include <openssl/nid.h>
 #include <openssl/rand.h>
 #include <openssl/rsa.h>
@@ -744,6 +745,61 @@
   return true;
 }
 
+static bool SpeedHRSS(const std::string &selected) {
+  if (!selected.empty() && selected != "HRSS") {
+    return true;
+  }
+
+  TimeResults results;
+
+  if (!TimeFunction(&results, []() -> bool {
+    struct HRSS_public_key pub;
+    struct HRSS_private_key priv;
+    uint8_t entropy[HRSS_GENERATE_KEY_BYTES];
+    RAND_bytes(entropy, sizeof(entropy));
+    HRSS_generate_key(&pub, &priv, entropy);
+    return true;
+  })) {
+    fprintf(stderr, "Failed to time HRSS_generate_key.\n");
+    return false;
+  }
+
+  results.Print("HRSS generate");
+
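+  // Generate a single key pair to reuse across the encapsulation and
+  // decapsulation measurements below.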
+  struct HRSS_public_key pub;
+  struct HRSS_private_key priv;
+  uint8_t key_entropy[HRSS_GENERATE_KEY_BYTES];
+  RAND_bytes(key_entropy, sizeof(key_entropy));
+  HRSS_generate_key(&pub, &priv, key_entropy);
+
+  uint8_t ciphertext[HRSS_CIPHERTEXT_BYTES];
+  if (!TimeFunction(&results, [&pub, &ciphertext]() -> bool {
+    uint8_t entropy[HRSS_ENCAP_BYTES];
+    uint8_t shared_key[HRSS_KEY_BYTES];
+    RAND_bytes(entropy, sizeof(entropy));
+    HRSS_encap(ciphertext, shared_key, &pub, entropy);
+    return true;
+  })) {
+    fprintf(stderr, "Failed to time HRSS_encap.\n");
+    return false;
+  }
+
+  results.Print("HRSS encap");
+
+  if (!TimeFunction(&results, [&pub, &priv, &ciphertext]() -> bool {
+    uint8_t shared_key[HRSS_KEY_BYTES];
+    HRSS_decap(shared_key, &pub, &priv, ciphertext, sizeof(ciphertext));
+    return true;
+  })) {
+    fprintf(stderr, "Failed to time HRSS_decap.\n");
+    return false;
+  }
+
+  results.Print("HRSS decap");
+
+  return true;
+}
+
 static const struct argument kArguments[] = {
     {
      "-filter", kOptionalArgument,
@@ -817,7 +873,8 @@
       !Speed25519(selected) ||
       !SpeedSPAKE2(selected) ||
       !SpeedScrypt(selected) ||
-      !SpeedRSAKeyGen(selected)) {
+      !SpeedRSAKeyGen(selected) ||
+      !SpeedHRSS(selected)) {
     return false;
   }
 
diff --git a/util/generate_build_files.py b/util/generate_build_files.py
index 9c635dc..44db7f5 100644
--- a/util/generate_build_files.py
+++ b/util/generate_build_files.py
@@ -43,6 +43,10 @@
     ('linux', 'arm'): [
         'src/crypto/curve25519/asm/x25519-asm-arm.S',
         'src/crypto/poly1305/poly1305_arm_asm.S',
+        'src/crypto/hrss/asm/poly_mul_vec_armv7_neon.S',
+    ],
+    ('linux', 'x86_64'): [
+        'src/crypto/hrss/asm/poly_rq_mul.S',
     ],
 }