Fix ChaCha20-Poly1305 x86-64 asm on Windows

The assembly assumed the SysV ABI: it clobbered xmm6 through xmm15,
which the Windows x64 calling convention treats as callee-saved, and it
relied on numeric and other non-.L labels that do not survive the
Windows perlasm translation. Save and restore xmm6-xmm15 on win64,
switch the labels to the local .L form, tighten the CFI annotations,
and add a CHECK_ABI test covering the open and seal entry points.

Current:

Did 2916000 ChaCha20-Poly1305 (16 bytes) seal operations in 1015000us (2872906.4 ops/sec): 46.0 MB/s
Did 1604750 ChaCha20-Poly1305 (256 bytes) seal operations in 1016000us (1579478.3 ops/sec): 404.3 MB/s
Did 516750 ChaCha20-Poly1305 (1350 bytes) seal operations in 1015000us (509113.3 ops/sec): 687.3 MB/s
Did 99750 ChaCha20-Poly1305 (8192 bytes) seal operations in 1016000us (98179.1 ops/sec): 804.3 MB/s
Did 50500 ChaCha20-Poly1305 (16384 bytes) seal operations in 1016000us (49704.7 ops/sec): 814.4 MB/s

With fix:

Did 6366750 ChaCha20-Poly1305 (16 bytes) seal operations in 1016000us (6266486.2 ops/sec): 100.3 MB/s
Did 3938000 ChaCha20-Poly1305 (256 bytes) seal operations in 1016000us (3875984.3 ops/sec): 992.3 MB/s
Did 1207750 ChaCha20-Poly1305 (1350 bytes) seal operations in 1015000us (1189901.5 ops/sec): 1606.4 MB/s
Did 258500 ChaCha20-Poly1305 (8192 bytes) seal operations in 1016000us (254429.1 ops/sec): 2084.3 MB/s
Did 131500 ChaCha20-Poly1305 (16384 bytes) seal operations in 1016000us (129429.1 ops/sec): 2120.6 MB/s

Change-Id: Iec6417b9855b9d3d1d5154c93a370f80f219c65f
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/44347
Reviewed-by: David Benjamin <davidben@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/cipher_extra/aead_test.cc b/crypto/cipher_extra/aead_test.cc
index d0c266a..bf02e78 100644
--- a/crypto/cipher_extra/aead_test.cc
+++ b/crypto/cipher_extra/aead_test.cc
@@ -24,6 +24,7 @@
 #include <openssl/err.h>
 
 #include "../fipsmodule/cipher/internal.h"
+#include "internal.h"
 #include "../internal.h"
 #include "../test/abi_test.h"
 #include "../test/file_test.h"
@@ -827,6 +828,27 @@
   EXPECT_EQ(Bytes(plaintext + 1, sizeof(plaintext) - 1),
             Bytes(plaintext2 + 1, plaintext2_len));
 }
+
+TEST(ChaChaPoly1305Test, ABI) {
+  if (!chacha20_poly1305_asm_capable()) {
+    return;
+  }
+
+  std::unique_ptr<uint8_t[]> buf(new uint8_t[1024]);
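+  // Step through many lengths so the vectorized main loops and the
+  // byte-at-a-time tail paths are all exercised under CHECK_ABI.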
+  for (size_t len = 0; len <= 1024; len += 5) {
+    SCOPED_TRACE(len);
+    union chacha20_poly1305_open_data open_ctx = {};
+    CHECK_ABI(chacha20_poly1305_open, buf.get(), buf.get(), len, buf.get(),
+              len % 128, &open_ctx);
+  }
+
+  for (size_t len = 0; len <= 1024; len += 5) {
+    SCOPED_TRACE(len);
+    union chacha20_poly1305_seal_data seal_ctx = {};
+    CHECK_ABI(chacha20_poly1305_seal, buf.get(), buf.get(), len, buf.get(),
+              len % 128, &seal_ctx);
+  }
+}
 #endif  // SUPPORTS_ABI_TEST
 
 TEST(AEADTest, AESCCMLargeAD) {
diff --git a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
index b748c23..3826cb7 100644
--- a/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
+++ b/crypto/cipher_extra/asm/chacha20_poly1305_x86_64.pl
@@ -43,26 +43,26 @@
 chacha20_poly1305_constants:
 
 .align 64
-.chacha20_consts:
+.Lchacha20_consts:
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
 .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
-.rol8:
+.Lrol8:
 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
 .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
-.rol16:
+.Lrol16:
 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
 .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
-.avx2_init:
+.Lavx2_init:
 .long 0,0,0,0
-.sse_inc:
+.Lsse_inc:
 .long 1,0,0,0
-.avx2_inc:
+.Lavx2_inc:
 .long 2,0,0,0,2,0,0,0
-.clamp:
+.Lclamp:
 .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
 .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
 .align 16
-.and_masks:
+.Land_masks:
 .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
 .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
@@ -81,28 +81,33 @@
 .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
 ___
 
-my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
+my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2,$adl)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8","%r8");
 my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
 my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
 my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
 my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
-my $r_store="0*16(%rbp)";
-my $s_store="1*16(%rbp)";
-my $len_store="2*16(%rbp)";
-my $state1_store="3*16(%rbp)";
-my $state2_store="4*16(%rbp)";
-my $tmp_store="5*16(%rbp)";
-my $ctr0_store="6*16(%rbp)";
-my $ctr1_store="7*16(%rbp)";
-my $ctr2_store="8*16(%rbp)";
-my $ctr3_store="9*16(%rbp)";
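+# On win64, reserve stack space to spill xmm6-xmm15, which are callee-saved there.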
+my $xmm_storage = 0;
+if ($win64) {
+    $xmm_storage = 10*16;
+}
+my $xmm_store="0*16(%rbp)";
+my $r_store="$xmm_storage+0*16(%rbp)";
+my $s_store="$xmm_storage+1*16(%rbp)";
+my $len_store="$xmm_storage+2*16(%rbp)";
+my $state1_store="$xmm_storage+3*16(%rbp)";
+my $state2_store="$xmm_storage+4*16(%rbp)";
+my $tmp_store="$xmm_storage+5*16(%rbp)";
+my $ctr0_store="$xmm_storage+6*16(%rbp)";
+my $ctr1_store="$xmm_storage+7*16(%rbp)";
+my $ctr2_store="$xmm_storage+8*16(%rbp)";
+my $ctr3_store="$xmm_storage+9*16(%rbp)";
 
 sub chacha_qr {
 my ($a,$b,$c,$d,$t,$dir)=@_;
 $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
 $code.="paddd $b, $a
         pxor $a, $d
-        pshufb .rol16(%rip), $d
+        pshufb .Lrol16(%rip), $d
         paddd $d, $c
         pxor $c, $b
         movdqa $b, $t
@@ -111,7 +116,7 @@
         pxor $t, $b
         paddd $b, $a
         pxor $a, $d
-        pshufb .rol8(%rip), $d
+        pshufb .Lrol8(%rip), $d
         paddd $d, $c
         pxor $c, $b
         movdqa $b, $t
@@ -129,7 +134,7 @@
 
 sub poly_add {
 my ($src)=@_;
-$code.="add $src, $acc0
+$code.="add 0+$src, $acc0
         adc 8+$src, $acc1
         adc \$1, $acc2\n";
 }
@@ -166,22 +171,26 @@
         adc %rdx, $t3\n";
 }
 
+# At the beginning of the reduce stage, t = [t3:t2:t1:t0] is the product of
+# r = [r1:r0] and acc = [acc2:acc1:acc0].
+# r is at most 124 bits (due to clamping) and acc is at most 131 bits
+# (acc2 is at most 4 before the addition, and at most 6 once the next block
+# is added in), so t is at most 255 bits and t3 is at most 63 bits.
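+# The reduction uses 2^130 ≡ 5 (mod 2^130 - 5): writing t = t_hi*2^130 + t_lo
+# gives t ≡ t_lo + 4*t_hi + t_hi, which is exactly what the masks, shifts and
+# adds below compute.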
 sub poly_reduce_stage {
 $code.="mov $t0, $acc0
         mov $t1, $acc1
         mov $t2, $acc2
-        and \$3, $acc2
+        and \$3, $acc2 # acc2 is now at most 2 bits (value at most 3)
         mov $t2, $t0
         and \$-4, $t0
         mov $t3, $t1
         shrd \$2, $t3, $t2
         shr \$2, $t3
-        add $t0, $acc0
-        adc $t1, $acc1
-        adc \$0, $acc2
+        add $t0, $t2
+        adc $t1, $t3 # No carry out, since t3 is at most 61 bits and t1 at most 63 bits
         add $t2, $acc0
         adc $t3, $acc1
-        adc \$0, $acc2\n";
+        adc \$0, $acc2\n"; # acc2 is now at most 4
 }
 
 sub poly_mul {
@@ -193,7 +202,7 @@
 
 sub prep_state {
 my ($n)=@_;
-$code.="movdqa .chacha20_consts(%rip), $A0
+$code.="movdqa .Lchacha20_consts(%rip), $A0
         movdqa $state1_store, $B0
         movdqa $state2_store, $C0\n";
 $code.="movdqa $A0, $A1
@@ -206,31 +215,31 @@
         movdqa $B0, $B3
         movdqa $C0, $C3\n" if ($n ge 4);
 $code.="movdqa $ctr0_store, $D0
-        paddd .sse_inc(%rip), $D0
+        paddd .Lsse_inc(%rip), $D0
         movdqa $D0, $ctr0_store\n" if ($n eq 1);
 $code.="movdqa $ctr0_store, $D1
-        paddd .sse_inc(%rip), $D1
+        paddd .Lsse_inc(%rip), $D1
         movdqa $D1, $D0
-        paddd .sse_inc(%rip), $D0
+        paddd .Lsse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store\n" if ($n eq 2);
 $code.="movdqa $ctr0_store, $D2
-        paddd .sse_inc(%rip), $D2
+        paddd .Lsse_inc(%rip), $D2
         movdqa $D2, $D1
-        paddd .sse_inc(%rip), $D1
+        paddd .Lsse_inc(%rip), $D1
         movdqa $D1, $D0
-        paddd .sse_inc(%rip), $D0
+        paddd .Lsse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store
         movdqa $D2, $ctr2_store\n" if ($n eq 3);
 $code.="movdqa $ctr0_store, $D3
-        paddd .sse_inc(%rip), $D3
+        paddd .Lsse_inc(%rip), $D3
         movdqa $D3, $D2
-        paddd .sse_inc(%rip), $D2
+        paddd .Lsse_inc(%rip), $D2
         movdqa $D2, $D1
-        paddd .sse_inc(%rip), $D1
+        paddd .Lsse_inc(%rip), $D1
         movdqa $D1, $D0
-        paddd .sse_inc(%rip), $D0
+        paddd .Lsse_inc(%rip), $D0
         movdqa $D0, $ctr0_store
         movdqa $D1, $ctr1_store
         movdqa $D2, $ctr2_store
@@ -239,19 +248,19 @@
 
 sub finalize_state {
 my ($n)=@_;
-$code.="paddd .chacha20_consts(%rip), $A3
+$code.="paddd .Lchacha20_consts(%rip), $A3
         paddd $state1_store, $B3
         paddd $state2_store, $C3
         paddd $ctr3_store, $D3\n" if ($n eq 4);
-$code.="paddd .chacha20_consts(%rip), $A2
+$code.="paddd .Lchacha20_consts(%rip), $A2
         paddd $state1_store, $B2
         paddd $state2_store, $C2
         paddd $ctr2_store, $D2\n" if ($n ge 3);
-$code.="paddd .chacha20_consts(%rip), $A1
+$code.="paddd .Lchacha20_consts(%rip), $A1
         paddd $state1_store, $B1
         paddd $state2_store, $C1
         paddd $ctr1_store, $D1\n" if ($n ge 2);
-$code.="paddd .chacha20_consts(%rip), $A0
+$code.="paddd .Lchacha20_consts(%rip), $A0
         paddd $state1_store, $B0
         paddd $state2_store, $C0
         paddd $ctr0_store, $D0\n";
@@ -352,10 +361,10 @@
 return $round;
 };
 
-$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
-               &gen_chacha_round(25, ".rol8(%rip)", "left") .
-               &gen_chacha_round(20, ".rol16(%rip)") .
-               &gen_chacha_round(25, ".rol8(%rip)", "right");
+$chacha_body = &gen_chacha_round(20, ".Lrol16(%rip)") .
+               &gen_chacha_round(25, ".Lrol8(%rip)", "left") .
+               &gen_chacha_round(20, ".Lrol16(%rip)") .
+               &gen_chacha_round(25, ".Lrol8(%rip)", "right");
 
 my @loop_body = split /\n/, $chacha_body;
 
@@ -370,16 +379,17 @@
 ################################################################################
 # void poly_hash_ad_internal();
 $code.="
-.type poly_hash_ad_internal,\@function,2
+.type poly_hash_ad_internal,\@abi-omnipotent
 .align 64
 poly_hash_ad_internal:
 .cfi_startproc
+.cfi_def_cfa rsp, 8
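+# Entered via call, so the CFA is rsp+8 (just the return address) here.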
     xor $acc0, $acc0
     xor $acc1, $acc1
     xor $acc2, $acc2
     cmp \$13,  $itr2
-    jne hash_ad_loop
-poly_fast_tls_ad:
+    jne .Lhash_ad_loop
+.Lpoly_fast_tls_ad:
     # Special treatment for the TLS case of 13 bytes
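+    # The 13 bytes are read as two overlapping 8-byte loads; the second is
+    # shifted right by 24 bits so that bytes 5-7 are not counted twice.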
     mov ($adp), $acc0
    mov 5($adp), $acc1
    shr \$24, $acc1
@@ -387,38 +397,38 @@
     mov \$1, $acc2\n";
     &poly_mul(); $code.="
     ret
-hash_ad_loop:
+.Lhash_ad_loop:
         # Hash in 16 byte chunk
         cmp \$16, $itr2
-        jb hash_ad_tail\n";
+        jb .Lhash_ad_tail\n";
         &poly_add("0($adp)");
         &poly_mul(); $code.="
         lea 1*16($adp), $adp
         sub \$16, $itr2
-    jmp hash_ad_loop
-hash_ad_tail:
+    jmp .Lhash_ad_loop
+.Lhash_ad_tail:
     cmp \$0, $itr2
-    je 1f
+    je .Lhash_ad_done
     # Hash last < 16 byte tail
     xor $t0, $t0
     xor $t1, $t1
     xor $t2, $t2
     add $itr2, $adp
-hash_ad_tail_loop:
+.Lhash_ad_tail_loop:
         shld \$8, $t0, $t1
         shl \$8, $t0
         movzxb -1($adp), $t2
         xor $t2, $t0
         dec $adp
         dec $itr2
-    jne hash_ad_tail_loop
+    jne .Lhash_ad_tail_loop
 
     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     &poly_mul(); $code.="
     # Finished AD
-1:
+.Lhash_ad_done:
     ret
 .cfi_endproc
 .size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
@@ -426,86 +436,98 @@
 
 {
 ################################################################################
-# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+# extern void chacha20_poly1305_open(uint8_t *out_plaintext,
+#                                    const uint8_t *ciphertext,
+#                                    size_t plaintext_len, const uint8_t *ad,
+#                                    size_t ad_len, union open_data *aead_data)
+#
 $code.="
 .globl chacha20_poly1305_open
-.type chacha20_poly1305_open,\@function,2
+.type chacha20_poly1305_open,\@function,6
 .align 64
 chacha20_poly1305_open:
 .cfi_startproc
     push %rbp
-.cfi_adjust_cfa_offset 8
+.cfi_push %rbp
     push %rbx
-.cfi_adjust_cfa_offset 8
+.cfi_push %rbx
     push %r12
-.cfi_adjust_cfa_offset 8
+.cfi_push %r12
     push %r13
-.cfi_adjust_cfa_offset 8
+.cfi_push %r13
     push %r14
-.cfi_adjust_cfa_offset 8
+.cfi_push %r14
     push %r15
-.cfi_adjust_cfa_offset 8
+.cfi_push %r15
     # We write the calculated authenticator back to keyp at the end, so save
     # the pointer on the stack too.
     push $keyp
-.cfi_adjust_cfa_offset 8
-    sub \$288 + 32, %rsp
+.cfi_push $keyp
+    sub \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
-.cfi_offset rbp, -16
-.cfi_offset rbx, -24
-.cfi_offset r12, -32
-.cfi_offset r13, -40
-.cfi_offset r14, -48
-.cfi_offset r15, -56
+
     lea 32(%rsp), %rbp
-    and \$-32, %rbp
-    mov %rdx, 8+$len_store
-    mov %r8, 0+$len_store
-    mov %rdx, $inl\n"; $code.="
+    and \$-32, %rbp\n";
+$code.="
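+    # xmm6-xmm15 are non-volatile in the Windows x64 ABI, so spill them to the frame.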
+    movaps %xmm6,16*0+$xmm_store
+    movaps %xmm7,16*1+$xmm_store
+    movaps %xmm8,16*2+$xmm_store
+    movaps %xmm9,16*3+$xmm_store
+    movaps %xmm10,16*4+$xmm_store
+    movaps %xmm11,16*5+$xmm_store
+    movaps %xmm12,16*6+$xmm_store
+    movaps %xmm13,16*7+$xmm_store
+    movaps %xmm14,16*8+$xmm_store
+    movaps %xmm15,16*9+$xmm_store\n" if ($win64);
+$code.="
+    mov %rdx, $inl
+    mov $adl, 0+$len_store
+    mov $inl, 8+$len_store\n";
+$code.="
     mov OPENSSL_ia32cap_P+8(%rip), %eax
     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
     xor \$`(1<<5) + (1<<8)`, %eax
-    jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
+    jz chacha20_poly1305_open_avx2\n" if ($avx>1);
 $code.="
-1:
     cmp \$128, $inl
-    jbe open_sse_128
+    jbe .Lopen_sse_128
     # For long buffers, prepare the poly key first
-    movdqa .chacha20_consts(%rip), $A0
+    movdqa .Lchacha20_consts(%rip), $A0
     movdqu 0*16($keyp), $B0
     movdqu 1*16($keyp), $C0
     movdqu 2*16($keyp), $D0
+
     movdqa $D0, $T1
     # Store on stack, to free keyp
     movdqa $B0, $state1_store
     movdqa $C0, $state2_store
     movdqa $D0, $ctr0_store
     mov \$10, $acc0
-1:  \n";
+.Lopen_sse_init_rounds:\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
+    jne .Lopen_sse_init_rounds
     # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
-    paddd .chacha20_consts(%rip), $A0
+    paddd .Lchacha20_consts(%rip), $A0
     paddd $state1_store, $B0
     # Clamp and store the key
-    pand .clamp(%rip), $A0
+    pand .Lclamp(%rip), $A0
     movdqa $A0, $r_store
     movdqa $B0, $s_store
     # Hash
-    mov %r8, $itr2
+    mov $adl, $itr2
     call poly_hash_ad_internal
-open_sse_main_loop:
+.Lopen_sse_main_loop:
         cmp \$16*16, $inl
-        jb 2f
+        jb .Lopen_sse_tail
         # Load state, increment counter blocks\n";
         &prep_state(4); $code.="
         # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
         # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
         mov \$4, $itr1
         mov $inp, $itr2
-1:  \n";
+.Lopen_sse_main_loop_rounds:\n";
             &emit_body(20);
             &poly_add("0($itr2)"); $code.="
             lea 2*8($itr2), $itr2\n";
@@ -520,12 +542,12 @@
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             dec $itr1
-        jge 1b\n";
+        jge .Lopen_sse_main_loop_rounds\n";
             &poly_add("0($itr2)");
             &poly_mul(); $code.="
             lea 2*8($itr2), $itr2
             cmp \$-6, $itr1
-        jg 1b\n";
+        jg .Lopen_sse_main_loop_rounds\n";
         &finalize_state(4);
         &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
         &xor_stream($A2, $B2, $C2, $D2, "4*16");
@@ -534,66 +556,66 @@
         lea 16*16($inp), $inp
         lea 16*16($oup), $oup
         sub \$16*16, $inl
-    jmp open_sse_main_loop
-2:
+    jmp .Lopen_sse_main_loop
+.Lopen_sse_tail:
     # Handle the various tail sizes efficiently
     test $inl, $inl
-    jz open_sse_finalize
+    jz .Lopen_sse_finalize
+    cmp \$12*16, $inl
+    ja .Lopen_sse_tail_256
+    cmp \$8*16, $inl
+    ja .Lopen_sse_tail_192
     cmp \$4*16, $inl
-    ja 3f\n";
+    ja .Lopen_sse_tail_128\n";
 ###############################################################################
     # At most 64 bytes are left
     &prep_state(1); $code.="
     xor $itr2, $itr2
     mov $inl, $itr1
     cmp \$16, $itr1
-    jb 2f
-1:  \n";
-        &poly_add("0($inp, $itr2)");
+    jb .Lopen_sse_tail_64_rounds
+.Lopen_sse_tail_64_rounds_and_x1hash: \n";
+        &poly_add("0($inp,$itr2)");
         &poly_mul(); $code.="
         sub \$16, $itr1
-2:
+.Lopen_sse_tail_64_rounds:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp \$16, $itr1
-    jae 1b
+    jae .Lopen_sse_tail_64_rounds_and_x1hash
         cmp \$10*16, $itr2
-    jne 2b\n";
+    jne .Lopen_sse_tail_64_rounds\n";
     &finalize_state(1); $code.="
-    jmp open_sse_tail_64_dec_loop
-3:
-    cmp \$8*16, $inl
-    ja 3f\n";
+    jmp .Lopen_sse_tail_64_dec_loop
 ###############################################################################
+.Lopen_sse_tail_128:\n";
     # 65 - 128 bytes are left
     &prep_state(2); $code.="
     mov $inl, $itr1
     and \$-16, $itr1
     xor $itr2, $itr2
-1:  \n";
-        &poly_add("0($inp, $itr2)");
+.Lopen_sse_tail_128_rounds_and_x1hash: \n";
+        &poly_add("0($inp,$itr2)");
         &poly_mul(); $code.="
-2:
+.Lopen_sse_tail_128_rounds:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
         cmp $itr1, $itr2
-    jb 1b
+    jb .Lopen_sse_tail_128_rounds_and_x1hash
         cmp \$10*16, $itr2
-    jne 2b\n";
+    jne .Lopen_sse_tail_128_rounds\n";
     &finalize_state(2);
     &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
     sub \$4*16, $inl
     lea 4*16($inp), $inp
     lea 4*16($oup), $oup
-    jmp open_sse_tail_64_dec_loop
-3:
-    cmp \$12*16, $inl
-    ja 3f\n";
+    jmp .Lopen_sse_tail_64_dec_loop
 ###############################################################################
+.Lopen_sse_tail_192:\n";
     # 129 - 192 bytes are left
     &prep_state(3); $code.="
     mov $inl, $itr1
@@ -602,10 +624,10 @@
     cmovg $itr2, $itr1
     and \$-16, $itr1
     xor $itr2, $itr2
-1:  \n";
-        &poly_add("0($inp, $itr2)");
+.Lopen_sse_tail_192_rounds_and_x1hash: \n";
+        &poly_add("0($inp,$itr2)");
         &poly_mul(); $code.="
-2:
+.Lopen_sse_tail_192_rounds:
         add \$16, $itr2\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
@@ -614,32 +636,32 @@
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         cmp $itr1, $itr2
-    jb 1b
+    jb .Lopen_sse_tail_192_rounds_and_x1hash
         cmp \$10*16, $itr2
-    jne 2b
+    jne .Lopen_sse_tail_192_rounds
     cmp \$11*16, $inl
-    jb 1f\n";
+    jb .Lopen_sse_tail_192_finish\n";
     &poly_add("10*16($inp)");
     &poly_mul(); $code.="
     cmp \$12*16, $inl
-    jb 1f\n";
+    jb .Lopen_sse_tail_192_finish\n";
     &poly_add("11*16($inp)");
     &poly_mul(); $code.="
-1:  \n";
+.Lopen_sse_tail_192_finish: \n";
     &finalize_state(3);
     &xor_stream($A2, $B2, $C2, $D2, "0*16");
     &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
     sub \$8*16, $inl
     lea 8*16($inp), $inp
     lea 8*16($oup), $oup
-    jmp open_sse_tail_64_dec_loop
-3:
-###############################################################################\n";
+    jmp .Lopen_sse_tail_64_dec_loop
+###############################################################################
+.Lopen_sse_tail_256:\n";
     # 193 - 255 bytes are left
     &prep_state(4); $code.="
     xor $itr2, $itr2
-1:  \n";
-        &poly_add("0($inp, $itr2)");
+.Lopen_sse_tail_256_rounds_and_x1hash: \n";
+        &poly_add("0($inp,$itr2)");
         &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
         &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
@@ -654,15 +676,16 @@
         &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
         add \$16, $itr2
         cmp \$10*16, $itr2
-    jb 1b
+    jb .Lopen_sse_tail_256_rounds_and_x1hash
+
     mov $inl, $itr1
     and \$-16, $itr1
-1:  \n";
-        &poly_add("0($inp, $itr2)");
+.Lopen_sse_tail_256_hash: \n";
+        &poly_add("0($inp,$itr2)");
         &poly_mul(); $code.="
         add \$16, $itr2
         cmp $itr1, $itr2
-    jb 1b\n";
+    jb .Lopen_sse_tail_256_hash\n";
     &finalize_state(4);
     &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
     &xor_stream($A2, $B2, $C2, $D2, "4*16");
@@ -673,9 +696,9 @@
     lea 12*16($oup), $oup
 ###############################################################################
     # Decrypt the remaining data, 16B at a time, using existing stream
-open_sse_tail_64_dec_loop:
+.Lopen_sse_tail_64_dec_loop:
     cmp \$16, $inl
-    jb 1f
+    jb .Lopen_sse_tail_16_init
         sub \$16, $inl
         movdqu ($inp), $T0
         pxor $T0, $A0
@@ -685,47 +708,46 @@
         movdqa $B0, $A0
         movdqa $C0, $B0
         movdqa $D0, $C0
-    jmp open_sse_tail_64_dec_loop
-1:
+    jmp .Lopen_sse_tail_64_dec_loop
+.Lopen_sse_tail_16_init:
     movdqa $A0, $A1
 
     # Decrypt up to 16 bytes at the end.
-open_sse_tail_16:
+.Lopen_sse_tail_16:
     test $inl, $inl
-    jz open_sse_finalize
+    jz .Lopen_sse_finalize
 
     # Read the final bytes into $T0. They need to be read in reverse order so
     # that they end up in the correct order in $T0.
     pxor $T0, $T0
-    lea -1($inp, $inl), $inp
+    lea -1($inp,$inl), $inp
     movq $inl, $itr2
-2:
+.Lopen_sse_tail_16_compose:
         pslldq \$1, $T0
         pinsrb \$0, ($inp), $T0
         sub \$1, $inp
         sub \$1, $itr2
-        jnz 2b
+        jnz .Lopen_sse_tail_16_compose
 
-3:
     movq $T0, $t0
     pextrq \$1, $T0, $t1
     # The final bytes of keystream are in $A1.
     pxor $A1, $T0
 
     # Copy the plaintext bytes out.
-2:
+.Lopen_sse_tail_16_extract:
         pextrb \$0, $T0, ($oup)
         psrldq \$1, $T0
         add \$1, $oup
         sub \$1, $inl
-    jne 2b
+    jne .Lopen_sse_tail_16_extract
 
     add $t0, $acc0
     adc $t1, $acc1
     adc \$1, $acc2\n";
     &poly_mul(); $code.="
 
-open_sse_finalize:\n";
+.Lopen_sse_finalize:\n";
     &poly_add($len_store);
     &poly_mul(); $code.="
     # Final reduce
@@ -740,40 +762,54 @@
     cmovc $t2, $acc2
     # Add in s part of the key
     add 0+$s_store, $acc0
-    adc 8+$s_store, $acc1
+    adc 8+$s_store, $acc1\n";
 
-    add \$288 + 32, %rsp
+$code.="
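+    # Reload the callee-saved xmm registers spilled in the prologue.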
+    movaps 16*0+$xmm_store, %xmm6
+    movaps 16*1+$xmm_store, %xmm7
+    movaps 16*2+$xmm_store, %xmm8
+    movaps 16*3+$xmm_store, %xmm9
+    movaps 16*4+$xmm_store, %xmm10
+    movaps 16*5+$xmm_store, %xmm11
+    movaps 16*6+$xmm_store, %xmm12
+    movaps 16*7+$xmm_store, %xmm13
+    movaps 16*8+$xmm_store, %xmm14
+    movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
+$code.="
+.cfi_remember_state
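+    # Snapshot the CFI state: .Lopen_sse_128 below is reached with the frame still live.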
+    add \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset -(288 + 32)
+    # The tag replaces the key on return
     pop $keyp
-.cfi_adjust_cfa_offset -8
-    movq $acc0, ($keyp)
-    movq $acc1, 8($keyp)
-
+.cfi_pop $keyp
+    mov $acc0, ($keyp)
+    mov $acc1, 8($keyp)
     pop %r15
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r15
     pop %r14
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r14
     pop %r13
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r13
     pop %r12
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r12
     pop %rbx
-.cfi_adjust_cfa_offset -8
+.cfi_pop %rbx
     pop %rbp
-.cfi_adjust_cfa_offset -8
+.cfi_pop %rbp
     ret
-.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
 ###############################################################################
-open_sse_128:
-    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+.Lopen_sse_128:
+.cfi_restore_state
+    movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
     movdqu 2*16($keyp), $D0
-    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
-    movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
+    movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
+    movdqa $D1, $D2\npaddd .Lsse_inc(%rip), $D2
     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
     mov \$10, $acc0
-1:  \n";
+
+.Lopen_sse_128_rounds:  \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
@@ -781,25 +817,25 @@
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
     dec $acc0
-    jnz 1b
-    paddd .chacha20_consts(%rip), $A0
-    paddd .chacha20_consts(%rip), $A1
-    paddd .chacha20_consts(%rip), $A2
+    jnz .Lopen_sse_128_rounds
+    paddd .Lchacha20_consts(%rip), $A0
+    paddd .Lchacha20_consts(%rip), $A1
+    paddd .Lchacha20_consts(%rip), $A2
     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
     paddd $T2, $C1\npaddd $T2, $C2
     paddd $T3, $D1
-    paddd .sse_inc(%rip), $T3
+    paddd .Lsse_inc(%rip), $T3
     paddd $T3, $D2
     # Clamp and store the key
-    pand .clamp(%rip), $A0
+    pand .Lclamp(%rip), $A0
     movdqa $A0, $r_store
     movdqa $B0, $s_store
     # Hash
-    mov %r8, $itr2
+    mov $adl, $itr2
     call poly_hash_ad_internal
-1:
+.Lopen_sse_128_xor_hash:
         cmp \$16, $inl
-        jb open_sse_tail_16
+        jb .Lopen_sse_tail_16
         sub \$16, $inl\n";
         # Load for hashing
         &poly_add("0*8($inp)"); $code.="
@@ -818,62 +854,69 @@
         movdqa $B2, $A2
         movdqa $C2, $B2
         movdqa $D2, $C2
-    jmp 1b
-    jmp open_sse_tail_16
+    jmp .Lopen_sse_128_xor_hash
 .size chacha20_poly1305_open, .-chacha20_poly1305_open
 .cfi_endproc
 
 ################################################################################
 ################################################################################
-# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
+# extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, const uint8_t *ad, size_t ad_len, union seal_data *aead_data)
 .globl  chacha20_poly1305_seal
-.type chacha20_poly1305_seal,\@function,2
+.type chacha20_poly1305_seal,\@function,6
 .align 64
 chacha20_poly1305_seal:
 .cfi_startproc
     push %rbp
-.cfi_adjust_cfa_offset 8
+.cfi_push %rbp
     push %rbx
-.cfi_adjust_cfa_offset 8
+.cfi_push %rbx
     push %r12
-.cfi_adjust_cfa_offset 8
+.cfi_push %r12
     push %r13
-.cfi_adjust_cfa_offset 8
+.cfi_push %r13
     push %r14
-.cfi_adjust_cfa_offset 8
+.cfi_push %r14
     push %r15
-.cfi_adjust_cfa_offset 8
-    # We write the calculated authenticator back to keyp at the end, so save
-    # the pointer on the stack too.
+.cfi_push %r15
+    # We write the calculated authenticator back to keyp at the end, so save
+    # the pointer on the stack too.
     push $keyp
-.cfi_adjust_cfa_offset 8
-    sub \$288 + 32, %rsp
+.cfi_push $keyp
+    sub \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset 288 + 32
-.cfi_offset rbp, -16
-.cfi_offset rbx, -24
-.cfi_offset r12, -32
-.cfi_offset r13, -40
-.cfi_offset r14, -48
-.cfi_offset r15, -56
     lea 32(%rsp), %rbp
-    and \$-32, %rbp
+    and \$-32, %rbp\n";
+$code.="
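+    # As in the open path, spill the non-volatile xmm registers for win64.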
+    movaps %xmm6,16*0+$xmm_store
+    movaps %xmm7,16*1+$xmm_store
+    movaps %xmm8,16*2+$xmm_store
+    movaps %xmm9,16*3+$xmm_store
+    movaps %xmm10,16*4+$xmm_store
+    movaps %xmm11,16*5+$xmm_store
+    movaps %xmm12,16*6+$xmm_store
+    movaps %xmm13,16*7+$xmm_store
+    movaps %xmm14,16*8+$xmm_store
+    movaps %xmm15,16*9+$xmm_store\n" if ($win64);
+$code.="
     mov 56($keyp), $inl  # extra_in_len
     addq %rdx, $inl
+    mov $adl, 0+$len_store
     mov $inl, 8+$len_store
-    mov %r8, 0+$len_store
-    mov %rdx, $inl\n"; $code.="
+    mov %rdx, $inl\n";
+$code.="
     mov OPENSSL_ia32cap_P+8(%rip), %eax
     and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
     xor \$`(1<<5) + (1<<8)`, %eax
-    jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
+    jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
 $code.="
     cmp \$128, $inl
-    jbe seal_sse_128
+    jbe .Lseal_sse_128
     # For longer buffers, prepare the poly key + some stream
-    movdqa .chacha20_consts(%rip), $A0
+    movdqa .Lchacha20_consts(%rip), $A0
     movdqu 0*16($keyp), $B0
     movdqu 1*16($keyp), $C0
     movdqu 2*16($keyp), $D0
+
     movdqa $A0, $A1
     movdqa $A0, $A2
     movdqa $A0, $A3
@@ -884,11 +927,11 @@
     movdqa $C0, $C2
     movdqa $C0, $C3
     movdqa $D0, $D3
-    paddd .sse_inc(%rip), $D0
+    paddd .Lsse_inc(%rip), $D0
     movdqa $D0, $D2
-    paddd .sse_inc(%rip), $D0
+    paddd .Lsse_inc(%rip), $D0
     movdqa $D0, $D1
-    paddd .sse_inc(%rip), $D0
+    paddd .Lsse_inc(%rip), $D0
     # Store on stack
     movdqa $B0, $state1_store
     movdqa $C0, $state2_store
@@ -897,28 +940,28 @@
     movdqa $D2, $ctr2_store
     movdqa $D3, $ctr3_store
     mov \$10, $acc0
-1:  \n";
+.Lseal_sse_init_rounds:  \n";
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         dec $acc0
-    jnz 1b\n";
+    jnz .Lseal_sse_init_rounds\n";
     &finalize_state(4); $code.="
     # Clamp and store the key
-    pand .clamp(%rip), $A3
+    pand .Lclamp(%rip), $A3
     movdqa $A3, $r_store
     movdqa $B3, $s_store
     # Hash
-    mov %r8, $itr2
+    mov $adl, $itr2
     call poly_hash_ad_internal\n";
     &xor_stream($A2,$B2,$C2,$D2,"0*16");
     &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
     cmp \$12*16, $inl
-    ja 1f
+    ja .Lseal_sse_main_init
     mov \$8*16, $itr1
     sub \$8*16, $inl
     lea 8*16($inp), $inp
-    jmp seal_sse_128_seal_hash
-1:  \n";
+    jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_init:\n";
     &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
     mov \$12*16, $itr1
     sub \$12*16, $inl
@@ -926,16 +969,17 @@
     mov \$2, $itr1
     mov \$8, $itr2
     cmp \$4*16, $inl
-    jbe seal_sse_tail_64
+    jbe .Lseal_sse_tail_64
     cmp \$8*16, $inl
-    jbe seal_sse_tail_128
+    jbe .Lseal_sse_tail_128
     cmp \$12*16, $inl
-    jbe seal_sse_tail_192
+    jbe .Lseal_sse_tail_192
 
-1:  \n";
+.Lseal_sse_main_loop: \n";
     # The main loop
         &prep_state(4); $code.="
-2:  \n";
+.align 32
+.Lseal_sse_main_rounds: \n";
             &emit_body(20);
             &poly_add("0($oup)");
             &emit_body(20);
@@ -950,12 +994,12 @@
             @loop_body = split /\n/, $chacha_body; $code.="
             lea 16($oup), $oup
             dec $itr2
-        jge 2b\n";
+        jge .Lseal_sse_main_rounds\n";
             &poly_add("0*8($oup)");
             &poly_mul(); $code.="
             lea 16($oup), $oup
             dec $itr1
-        jg 2b\n";
+        jg .Lseal_sse_main_rounds\n";
 
         &finalize_state(4);$code.="
         movdqa $D2, $tmp_store\n";
@@ -964,56 +1008,55 @@
         &xor_stream($A2,$B2,$C2,$D2, 4*16);
         &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
         cmp \$16*16, $inl
-        ja 3f
+        ja .Lseal_sse_main_loop_xor
 
         mov \$12*16, $itr1
         sub \$12*16, $inl
         lea 12*16($inp), $inp
-        jmp seal_sse_128_seal_hash
-3:  \n";
+        jmp .Lseal_sse_128_tail_hash
+.Lseal_sse_main_loop_xor: \n";
         &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
         lea 16*16($inp), $inp
         sub \$16*16, $inl
         mov \$6, $itr1
         mov \$4, $itr2
         cmp \$12*16, $inl
-    jg 1b
+    jg .Lseal_sse_main_loop
     mov $inl, $itr1
     test $inl, $inl
-    je seal_sse_128_seal_hash
+    je .Lseal_sse_128_tail_hash
     mov \$6, $itr1
+    cmp \$8*16, $inl
+    ja .Lseal_sse_tail_192
     cmp \$4*16, $inl
-    jg 3f
+    ja .Lseal_sse_tail_128
 ###############################################################################
-seal_sse_tail_64:\n";
+.Lseal_sse_tail_64: \n";
     &prep_state(1); $code.="
-1:  \n";
+.Lseal_sse_tail_64_rounds_and_x2hash: \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
-2:  \n";
+.Lseal_sse_tail_64_rounds_and_x1hash: \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
     dec $itr1
-    jg 1b
+    jg .Lseal_sse_tail_64_rounds_and_x2hash
     dec $itr2
-    jge 2b\n";
+    jge .Lseal_sse_tail_64_rounds_and_x1hash\n";
     &finalize_state(1); $code.="
-    jmp seal_sse_128_seal
-3:
-    cmp \$8*16, $inl
-    jg 3f
+    jmp .Lseal_sse_128_tail_xor
 ###############################################################################
-seal_sse_tail_128:\n";
+.Lseal_sse_tail_128:\n";
     &prep_state(2); $code.="
-1:  \n";
+.Lseal_sse_tail_128_rounds_and_x2hash: \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
-2:  \n";
+.Lseal_sse_tail_128_rounds_and_x1hash: \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0($oup)");
@@ -1022,24 +1065,23 @@
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         lea 16($oup), $oup
     dec $itr1
-    jg 1b
+    jg .Lseal_sse_tail_128_rounds_and_x2hash
     dec $itr2
-    jge 2b\n";
+    jge .Lseal_sse_tail_128_rounds_and_x1hash\n";
     &finalize_state(2);
     &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
     mov \$4*16, $itr1
     sub \$4*16, $inl
     lea 4*16($inp), $inp
-    jmp seal_sse_128_seal_hash
-3:
+    jmp .Lseal_sse_128_tail_hash
 ###############################################################################
-seal_sse_tail_192:\n";
+.Lseal_sse_tail_192:\n";
     &prep_state(3); $code.="
-1:  \n";
+.Lseal_sse_tail_192_rounds_and_x2hash: \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 16($oup), $oup
-2:  \n";
+.Lseal_sse_tail_192_rounds_and_x1hash: \n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
@@ -1050,9 +1092,9 @@
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         lea 16($oup), $oup
     dec $itr1
-    jg 1b
+    jg .Lseal_sse_tail_192_rounds_and_x2hash
     dec $itr2
-    jge 2b\n";
+    jge .Lseal_sse_tail_192_rounds_and_x1hash\n";
     &finalize_state(3);
     &xor_stream($A2,$B2,$C2,$D2,0*16);
     &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
@@ -1060,18 +1102,18 @@
     sub \$8*16, $inl
     lea 8*16($inp), $inp
 ###############################################################################
-seal_sse_128_seal_hash:
+.Lseal_sse_128_tail_hash:
         cmp \$16, $itr1
-        jb seal_sse_128_seal\n";
+        jb .Lseal_sse_128_tail_xor\n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         sub \$16, $itr1
         lea 16($oup), $oup
-    jmp seal_sse_128_seal_hash
+    jmp .Lseal_sse_128_tail_hash
 
-seal_sse_128_seal:
+.Lseal_sse_128_tail_xor:
         cmp \$16, $inl
-        jb seal_sse_tail_16
+        jb .Lseal_sse_tail_16
         sub \$16, $inl
         # Load for decryption
         movdqu 0*16($inp), $T0
@@ -1092,22 +1134,22 @@
         movdqa $B1, $A1
         movdqa $C1, $B1
         movdqa $D1, $C1
-    jmp seal_sse_128_seal
+    jmp .Lseal_sse_128_tail_xor
 
-seal_sse_tail_16:
+.Lseal_sse_tail_16:
     test $inl, $inl
-    jz process_blocks_of_extra_in
+    jz .Lprocess_blocks_of_extra_in
     # We can only load the PT one byte at a time to avoid buffer overread
     mov $inl, $itr2
     mov $inl, $itr1
-    lea -1($inp, $inl), $inp
+    lea -1($inp,$inl), $inp
     pxor $T3, $T3
-1:
+.Lseal_sse_tail_16_compose:
         pslldq \$1, $T3
         pinsrb \$0, ($inp), $T3
         lea -1($inp), $inp
         dec $itr1
-        jne 1b
+        jne .Lseal_sse_tail_16_compose
 
     # XOR the keystream with the plaintext.
     pxor $A0, $T3
@@ -1115,12 +1157,12 @@
     # Write ciphertext out, byte-by-byte.
     movq $inl, $itr1
     movdqu $T3, $A0
-2:
+.Lseal_sse_tail_16_extract:
         pextrb \$0, $A0, ($oup)
         psrldq \$1, $A0
         add \$1, $oup
         sub \$1, $itr1
-        jnz 2b
+        jnz .Lseal_sse_tail_16_extract
 
     # $T3 contains the final (partial, non-empty) block of ciphertext which
     # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
@@ -1129,23 +1171,23 @@
     #
     # $keyp points to the tag output, which is actually a struct with the
     # extra_in pointer and length at offset 48.
-    movq 288+32(%rsp), $keyp
+    movq 288 + $xmm_storage + 32(%rsp), $keyp
     movq 56($keyp), $t1  # extra_in_len
     movq 48($keyp), $t0  # extra_in
     test $t1, $t1
-    jz process_partial_block  # Common case: no bytes of extra_in
+    jz .Lprocess_partial_block  # Common case: no bytes of extra_in
 
     movq \$16, $t2
     subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
     cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
                     # (note that AT&T syntax reverses the arguments)
-    jge load_extra_in
+    jge .Lload_extra_in
     movq $t1, $t2
 
-load_extra_in:
+.Lload_extra_in:
     # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
     # into $T3. They are loaded in reverse order.
-    leaq -1($t0, $t2), $inp
+    leaq -1($t0,$t2), $inp
     # Update extra_in and extra_in_len to reflect the bytes that are about to
     # be read.
     addq $t2, $t0
@@ -1159,29 +1201,29 @@
 
     # Load $t2 bytes of extra_in into $T2.
     pxor $T2, $T2
-3:
+.Lload_extra_load_loop:
         pslldq \$1, $T2
         pinsrb \$0, ($inp), $T2
         lea -1($inp), $inp
         sub \$1, $t2
-        jnz 3b
+        jnz .Lload_extra_load_loop
 
     # Shift $T2 up the length of the remainder from the main encryption. Sadly,
     # the shift for an XMM register has to be a constant, thus we loop to do
     # this.
     movq $inl, $t2
 
-4:
+.Lload_extra_shift_loop:
         pslldq \$1, $T2
         sub \$1, $t2
-        jnz 4b
+        jnz .Lload_extra_shift_loop
 
     # Mask $T3 (the remainder from the main encryption) so that superfluous
     # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
     # disjoint and so we can merge them with an OR.
-    lea .and_masks(%rip), $t2
+    lea .Land_masks(%rip), $t2
     shl \$4, $inl
-    pand -16($t2, $inl), $T3
+    pand -16($t2,$inl), $T3
 
     # Merge $T2 into $T3, forming the remainder block.
     por $T2, $T3
@@ -1195,40 +1237,39 @@
     adc \$1, $acc2\n";
     &poly_mul(); $code.="
 
-process_blocks_of_extra_in:
+.Lprocess_blocks_of_extra_in:
     # There may be additional bytes of extra_in to process.
-    movq 288+32(%rsp), $keyp
+    movq 288 + $xmm_storage + 32(%rsp), $keyp
     movq 48($keyp), $inp   # extra_in
     movq 56($keyp), $itr2  # extra_in_len
     movq $itr2, $itr1
     shr \$4, $itr2         # number of blocks
 
-5:
+.Lprocess_extra_hash_loop:
-        jz process_extra_in_trailer\n";
+        jz .Lprocess_extra_in_trailer\n";
         &poly_add("0($inp)");
         &poly_mul(); $code.="
         leaq 16($inp), $inp
         subq \$1, $itr2
-        jmp 5b
-
+        jmp .Lprocess_extra_hash_loop
-process_extra_in_trailer:
+.Lprocess_extra_in_trailer:
     andq \$15, $itr1       # remaining num bytes (<16) of extra_in
     movq $itr1, $inl
-    jz do_length_block
-    leaq -1($inp, $itr1), $inp
+    jz .Ldo_length_block
+    leaq -1($inp,$itr1), $inp
 
-6:
+.Lprocess_extra_in_trailer_load:
         pslldq \$1, $T3
         pinsrb \$0, ($inp), $T3
         lea -1($inp), $inp
         sub \$1, $itr1
-        jnz 6b
+        jnz .Lprocess_extra_in_trailer_load
 
-process_partial_block:
+.Lprocess_partial_block:
     # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
-    lea .and_masks(%rip), $t2
+    lea .Land_masks(%rip), $t2
     shl \$4, $inl
-    pand -16($t2, $inl), $T3
+    pand -16($t2,$inl), $T3
     movq $T3, $t0
     pextrq \$1, $T3, $t1
     add $t0, $acc0
@@ -1236,7 +1277,7 @@
     adc \$1, $acc2\n";
     &poly_mul(); $code.="
 
-do_length_block:\n";
+.Ldo_length_block:\n";
     &poly_add($len_store);
     &poly_mul(); $code.="
     # Final reduce
@@ -1251,40 +1292,54 @@
     cmovc $t2, $acc2
     # Add in s part of the key
     add 0+$s_store, $acc0
-    adc 8+$s_store, $acc1
+    adc 8+$s_store, $acc1\n";
 
-    add \$288 + 32, %rsp
+$code.="
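+    # Reload the xmm registers saved in the seal prologue.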
+    movaps 16*0+$xmm_store, %xmm6
+    movaps 16*1+$xmm_store, %xmm7
+    movaps 16*2+$xmm_store, %xmm8
+    movaps 16*3+$xmm_store, %xmm9
+    movaps 16*4+$xmm_store, %xmm10
+    movaps 16*5+$xmm_store, %xmm11
+    movaps 16*6+$xmm_store, %xmm12
+    movaps 16*7+$xmm_store, %xmm13
+    movaps 16*8+$xmm_store, %xmm14
+    movaps 16*9+$xmm_store, %xmm15\n" if ($win64);
+$code.="
+.cfi_remember_state
+    add \$288 + $xmm_storage + 32, %rsp
 .cfi_adjust_cfa_offset -(288 + 32)
+    # The tag replaces the key on return
     pop $keyp
-.cfi_adjust_cfa_offset -8
-    mov $acc0, 0*8($keyp)
-    mov $acc1, 1*8($keyp)
-
+.cfi_pop $keyp
+    mov $acc0, ($keyp)
+    mov $acc1, 8($keyp)
     pop %r15
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r15
     pop %r14
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r14
     pop %r13
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r13
     pop %r12
-.cfi_adjust_cfa_offset -8
+.cfi_pop %r12
     pop %rbx
-.cfi_adjust_cfa_offset -8
+.cfi_pop %rbx
     pop %rbp
-.cfi_adjust_cfa_offset -8
+.cfi_pop %rbp
     ret
-.cfi_adjust_cfa_offset (8 * 7) + 288 + 32
 ################################################################################
-seal_sse_128:
-    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
+.Lseal_sse_128:
+.cfi_restore_state
+    movdqu .Lchacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
     movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
     movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
     movdqu 2*16($keyp), $D2
-    movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
-    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
+    movdqa $D2, $D0\npaddd .Lsse_inc(%rip), $D0
+    movdqa $D0, $D1\npaddd .Lsse_inc(%rip), $D1
     movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
     mov \$10, $acc0
-1:\n";
+
+.Lseal_sse_128_rounds:\n";
         &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
@@ -1292,43 +1347,39 @@
         &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
-    jnz 1b
-    paddd .chacha20_consts(%rip), $A0
-    paddd .chacha20_consts(%rip), $A1
-    paddd .chacha20_consts(%rip), $A2
+    jnz .Lseal_sse_128_rounds
+    paddd .Lchacha20_consts(%rip), $A0
+    paddd .Lchacha20_consts(%rip), $A1
+    paddd .Lchacha20_consts(%rip), $A2
     paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
     paddd $T2, $C0\npaddd $T2, $C1
     paddd $T3, $D0
-    paddd .sse_inc(%rip), $T3
+    paddd .Lsse_inc(%rip), $T3
     paddd $T3, $D1
     # Clamp and store the key
-    pand .clamp(%rip), $A2
+    pand .Lclamp(%rip), $A2
     movdqa $A2, $r_store
     movdqa $B2, $s_store
     # Hash
     mov %r8, $itr2
     call poly_hash_ad_internal
-    jmp seal_sse_128_seal
-.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
+    jmp .Lseal_sse_128_tail_xor
+.size chacha20_poly1305_seal, .-chacha20_poly1305_seal
+.cfi_endproc\n";
 }
 
-# There should have been a cfi_endproc at the end of that function, but the two
-# following blocks of code are jumped to without a stack frame and the CFI
-# context which they are used in happens to match the CFI context at the end of
-# the previous function. So the CFI table is just extended to the end of them.
-
 if ($avx>1) {
 
 ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
 my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
 ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
-$state1_store="2*32(%rbp)";
-$state2_store="3*32(%rbp)";
-$tmp_store="4*32(%rbp)";
-$ctr0_store="5*32(%rbp)";
-$ctr1_store="6*32(%rbp)";
-$ctr2_store="7*32(%rbp)";
-$ctr3_store="8*32(%rbp)";
+$state1_store="$xmm_storage+2*32(%rbp)";
+$state2_store="$xmm_storage+3*32(%rbp)";
+$tmp_store="$xmm_storage+4*32(%rbp)";
+$ctr0_store="$xmm_storage+5*32(%rbp)";
+$ctr1_store="$xmm_storage+6*32(%rbp)";
+$ctr2_store="$xmm_storage+7*32(%rbp)";
+$ctr3_store="$xmm_storage+8*32(%rbp)";
 
 sub chacha_qr_avx2 {
 my ($a,$b,$c,$d,$t,$dir)=@_;
@@ -1338,7 +1389,7 @@
 $code.=<<___;
     vpaddd $b, $a, $a
     vpxor $a, $d, $d
-    vpshufb .rol16(%rip), $d, $d
+    vpshufb .Lrol16(%rip), $d, $d
     vpaddd $d, $c, $c
     vpxor $c, $b, $b
     vpsrld \$20, $b, $t
@@ -1346,7 +1397,7 @@
     vpxor $t, $b, $b
     vpaddd $b, $a, $a
     vpxor $a, $d, $d
-    vpshufb .rol8(%rip), $d, $d
+    vpshufb .Lrol8(%rip), $d, $d
     vpaddd $d, $c, $c
     vpxor $c, $b, $b
     vpslld \$7, $b, $t
@@ -1371,7 +1422,7 @@
 sub prep_state_avx2 {
 my ($n)=@_;
 $code.=<<___;
-    vmovdqa .chacha20_consts(%rip), $A0
+    vmovdqa .Lchacha20_consts(%rip), $A0
     vmovdqa $state1_store, $B0
     vmovdqa $state2_store, $C0
 ___
@@ -1391,19 +1442,19 @@
     vmovdqa $C0, $C3
 ___
 $code.=<<___ if ($n eq 1);
-    vmovdqa .avx2_inc(%rip), $D0
+    vmovdqa .Lavx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D0
     vmovdqa $D0, $ctr0_store
 ___
 $code.=<<___ if ($n eq 2);
-    vmovdqa .avx2_inc(%rip), $D0
+    vmovdqa .Lavx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D1
     vpaddd $D1, $D0, $D0
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
 ___
 $code.=<<___ if ($n eq 3);
-    vmovdqa .avx2_inc(%rip), $D0
+    vmovdqa .Lavx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D2
     vpaddd $D2, $D0, $D1
     vpaddd $D1, $D0, $D0
@@ -1412,7 +1463,7 @@
     vmovdqa $D2, $ctr2_store
 ___
 $code.=<<___ if ($n eq 4);
-    vmovdqa .avx2_inc(%rip), $D0
+    vmovdqa .Lavx2_inc(%rip), $D0
     vpaddd $ctr0_store, $D0, $D3
     vpaddd $D3, $D0, $D2
     vpaddd $D2, $D0, $D1
@@ -1427,25 +1478,25 @@
 sub finalize_state_avx2 {
 my ($n)=@_;
 $code.=<<___ if ($n eq 4);
-    vpaddd .chacha20_consts(%rip), $A3, $A3
+    vpaddd .Lchacha20_consts(%rip), $A3, $A3
     vpaddd $state1_store, $B3, $B3
     vpaddd $state2_store, $C3, $C3
     vpaddd $ctr3_store, $D3, $D3
 ___
 $code.=<<___ if ($n ge 3);
-    vpaddd .chacha20_consts(%rip), $A2, $A2
+    vpaddd .Lchacha20_consts(%rip), $A2, $A2
     vpaddd $state1_store, $B2, $B2
     vpaddd $state2_store, $C2, $C2
     vpaddd $ctr2_store, $D2, $D2
 ___
 $code.=<<___ if ($n ge 2);
-    vpaddd .chacha20_consts(%rip), $A1, $A1
+    vpaddd .Lchacha20_consts(%rip), $A1, $A1
     vpaddd $state1_store, $B1, $B1
     vpaddd $state2_store, $C1, $C1
     vpaddd $ctr1_store, $D1, $D1
 ___
 $code.=<<___;
-    vpaddd .chacha20_consts(%rip), $A0, $A0
+    vpaddd .Lchacha20_consts(%rip), $A0, $A0
     vpaddd $state1_store, $B0, $B0
     vpaddd $state2_store, $C0, $C0
     vpaddd $ctr0_store, $D0, $D0
@@ -1536,11 +1587,10 @@
                 vpshufb $C0, $D2, $D2
                 vpshufb $C0, $D1, $D1
                 vpshufb $C0, $D0, $D0
-                vmovdqa $tmp_store, $C0
                 vpaddd $D3, $C3, $C3
                 vpaddd $D2, $C2, $C2
                 vpaddd $D1, $C1, $C1
-                vpaddd $D0, $C0, $C0
+                vpaddd $tmp_store, $D0, $C0
                 vpxor $C3, $B3, $B3
                 vpxor $C2, $B2, $B2
                 vpxor $C1, $B1, $B1
@@ -1577,77 +1627,90 @@
 return $round;
 };
 
-$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
-               &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
-               &gen_chacha_round_avx2(20, ".rol16(%rip)") .
-               &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
+$chacha_body = &gen_chacha_round_avx2(20, ".Lrol16(%rip)") .
+               &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "left") .
+               &gen_chacha_round_avx2(20, ".Lrol16(%rip)") .
+               &gen_chacha_round_avx2(25, ".Lrol8(%rip)", "right");
 
 @loop_body = split /\n/, $chacha_body;
 
 $code.="
 ###############################################################################
-.type chacha20_poly1305_open_avx2,\@function,2
+.type chacha20_poly1305_open_avx2,\@abi-omnipotent
 .align 64
 chacha20_poly1305_open_avx2:
+.cfi_startproc
+
+# The AVX2 function operates in the stack frame of the SSE entry point, so the
+# CFI frame state is duplicated here.
+.cfi_push %rbp
+.cfi_push %rbx
+.cfi_push %r12
+.cfi_push %r13
+.cfi_push %r14
+.cfi_push %r15
+.cfi_push $keyp
+.cfi_adjust_cfa_offset 288 + 32
+
     vzeroupper
-    vmovdqa .chacha20_consts(%rip), $A0
+    vmovdqa .Lchacha20_consts(%rip), $A0
     vbroadcasti128 0*16($keyp), $B0
     vbroadcasti128 1*16($keyp), $C0
     vbroadcasti128 2*16($keyp), $D0
-    vpaddd .avx2_init(%rip), $D0, $D0
+    vpaddd .Lavx2_init(%rip), $D0, $D0
     cmp \$6*32, $inl
-    jbe open_avx2_192
+    jbe .Lopen_avx2_192
     cmp \$10*32, $inl
-    jbe open_avx2_320
+    jbe .Lopen_avx2_320
 
     vmovdqa $B0, $state1_store
     vmovdqa $C0, $state2_store
     vmovdqa $D0, $ctr0_store
     mov \$10, $acc0
-1:  \n";
+.Lopen_avx2_init_rounds:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
-    vpaddd .chacha20_consts(%rip), $A0, $A0
+    jne .Lopen_avx2_init_rounds
+    vpaddd .Lchacha20_consts(%rip), $A0, $A0
     vpaddd $state1_store, $B0, $B0
     vpaddd $state2_store, $C0, $C0
     vpaddd $ctr0_store, $D0, $D0
 
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store key
-    vpand .clamp(%rip), $T0, $T0
+    vpand .Lclamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for the first 64 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
     vperm2i128 \$0x13, $C0, $D0, $B0
     # Hash AD + first 64 bytes
-    mov %r8, $itr2
+    mov $adl, $itr2
     call poly_hash_ad_internal
-    xor $itr1, $itr1
     # Hash first 64 bytes
-1:  \n";
-       &poly_add("0($inp, $itr1)");
+    xor $itr1, $itr1
+.Lopen_avx2_init_hash: \n";
+       &poly_add("0($inp,$itr1)");
        &poly_mul(); $code.="
        add \$16, $itr1
        cmp \$2*32, $itr1
-    jne 1b
+    jne .Lopen_avx2_init_hash
     # Decrypt first 64 bytes
     vpxor 0*32($inp), $A0, $A0
     vpxor 1*32($inp), $B0, $B0
+    # Store first 64 bytes of decrypted data
     vmovdqu $A0, 0*32($oup)
     vmovdqu $B0, 1*32($oup)
     lea 2*32($inp), $inp
     lea 2*32($oup), $oup
     sub \$2*32, $inl
-1:
+.Lopen_avx2_main_loop:
         # Hash and decrypt 512 bytes each iteration
         cmp \$16*32, $inl
-        jb 3f\n";
-        &prep_state_avx2(4); $code.="
+        jb .Lopen_avx2_main_loop_done\n";
+        &prep_state_avx2(4); $code.="
         xor $itr1, $itr1
-2:  \n";
-            &poly_add("0*8($inp, $itr1)");
+.Lopen_avx2_main_loop_rounds: \n";
+            &poly_add("0*8($inp,$itr1)");
             &emit_body(10);
             &poly_stage1_mulx();
             &emit_body(9);
@@ -1657,7 +1720,7 @@
             &emit_body(10);
             &poly_reduce_stage();
             &emit_body(9);
-            &poly_add("2*8($inp, $itr1)");
+            &poly_add("2*8($inp,$itr1)");
             &emit_body(8);
             &poly_stage1_mulx();
             &emit_body(18);
@@ -1667,7 +1730,7 @@
             &emit_body(9);
             &poly_reduce_stage();
             &emit_body(8);
-            &poly_add("4*8($inp, $itr1)"); $code.="
+            &poly_add("4*8($inp,$itr1)"); $code.="
             lea 6*8($itr1), $itr1\n";
             &emit_body(18);
             &poly_stage1_mulx();
@@ -1680,7 +1743,7 @@
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             cmp \$10*6*8, $itr1
-        jne 2b\n";
+        jne .Lopen_avx2_main_loop_rounds\n";
         &finalize_state_avx2(4); $code.="
         vmovdqa $A0, $tmp_store\n";
         &poly_add("10*6*8($inp)");
@@ -1695,14 +1758,18 @@
         lea 16*32($inp), $inp
         lea 16*32($oup), $oup
         sub \$16*32, $inl
-    jmp 1b
-3:
+    jmp .Lopen_avx2_main_loop
+.Lopen_avx2_main_loop_done:
     test $inl, $inl
     vzeroupper
-    je open_sse_finalize
-3:
+    je .Lopen_sse_finalize
+
+    cmp \$12*32, $inl
+    ja .Lopen_avx2_tail_512
+    cmp \$8*32, $inl
+    ja .Lopen_avx2_tail_384
     cmp \$4*32, $inl
-    ja 3f\n";
+    ja .Lopen_avx2_tail_256\n";
 ###############################################################################
     # 1-128 bytes left
     &prep_state_avx2(1); $code.="
@@ -1710,25 +1777,23 @@
     mov $inl, $itr1
     and \$-16, $itr1
     test $itr1, $itr1
-    je 2f
-1:  \n";
-        &poly_add("0*8($inp, $itr2)");
+    je .Lopen_avx2_tail_128_rounds # Have nothing to hash
+.Lopen_avx2_tail_128_rounds_and_x1hash: \n";
+        &poly_add("0*8($inp,$itr2)");
         &poly_mul(); $code.="
-2:
+.Lopen_avx2_tail_128_rounds:
         add \$16, $itr2\n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp $itr1, $itr2
-    jb 1b
+    jb .Lopen_avx2_tail_128_rounds_and_x1hash
         cmp \$160, $itr2
-    jne 2b\n";
+    jne .Lopen_avx2_tail_128_rounds\n";
     &finalize_state_avx2(1);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-    jmp open_avx2_tail_loop
-3:
-    cmp \$8*32, $inl
-    ja 3f\n";
+    jmp .Lopen_avx2_tail_128_xor
 ###############################################################################
+.Lopen_avx2_tail_256: \n";
     # 129-256 bytes left
     &prep_state_avx2(2); $code.="
     mov $inl, $tmp_store
@@ -1740,11 +1805,11 @@
     cmovg $itr2, $itr1
     mov $inp, $inl
     xor $itr2, $itr2
-1:  \n";
+.Lopen_avx2_tail_256_rounds_and_x1hash: \n";
         &poly_add("0*8($inl)");
         &poly_mul_mulx(); $code.="
         lea 16($inl), $inl
-2:  \n";
+.Lopen_avx2_tail_256_rounds: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
         inc $itr2\n";
@@ -1752,33 +1817,31 @@
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         cmp $itr1, $itr2
-    jb 1b
+    jb .Lopen_avx2_tail_256_rounds_and_x1hash
         cmp \$10, $itr2
-    jne 2b
+    jne .Lopen_avx2_tail_256_rounds
     mov $inl, $itr2
     sub $inp, $inl
     mov $inl, $itr1
     mov $tmp_store, $inl
-1:
+.Lopen_avx2_tail_256_hash:
         add \$16, $itr1
         cmp $inl, $itr1
-        jg 1f\n";
+        jg .Lopen_avx2_tail_256_done\n";
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 16($itr2), $itr2
-    jmp 1b
-1:  \n";
+    jmp .Lopen_avx2_tail_256_hash
+.Lopen_avx2_tail_256_done: \n";
     &finalize_state_avx2(2);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
     &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
     lea 4*32($inp), $inp
     lea 4*32($oup), $oup
     sub \$4*32, $inl
-    jmp open_avx2_tail_loop
-3:
-    cmp \$12*32, $inl
-    ja 3f\n";
+    jmp .Lopen_avx2_tail_128_xor
 ###############################################################################
+.Lopen_avx2_tail_384: \n";
     # 257-383 bytes left
     &prep_state_avx2(3); $code.="
     mov $inl, $tmp_store
@@ -1791,11 +1854,11 @@
     cmovg $itr2, $itr1
     mov $inp, $inl
     xor $itr2, $itr2
-1:  \n";
+.Lopen_avx2_tail_384_rounds_and_x2hash: \n";
         &poly_add("0*8($inl)");
         &poly_mul_mulx(); $code.="
         lea 16($inl), $inl
-2:  \n";
+.Lopen_avx2_tail_384_rounds_and_x1hash: \n";
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
@@ -1807,22 +1870,22 @@
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
         cmp $itr1, $itr2
-    jb 1b
+    jb  .Lopen_avx2_tail_384_rounds_and_x2hash
         cmp \$10, $itr2
-    jne 2b
+    jne .Lopen_avx2_tail_384_rounds_and_x1hash
     mov $inl, $itr2
     sub $inp, $inl
     mov $inl, $itr1
     mov $tmp_store, $inl
-1:
+.Lopen_avx2_384_tail_hash:
         add \$16, $itr1
         cmp $inl, $itr1
-        jg 1f\n";
+        jg .Lopen_avx2_384_tail_done\n";
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 16($itr2), $itr2
-    jmp 1b
-1:  \n";
+    jmp .Lopen_avx2_384_tail_hash
+.Lopen_avx2_384_tail_done: \n";
     &finalize_state_avx2(3);
     &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
     &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
@@ -1830,18 +1893,18 @@
     lea 8*32($inp), $inp
     lea 8*32($oup), $oup
     sub \$8*32, $inl
-    jmp open_avx2_tail_loop
-3:  \n";
+    jmp .Lopen_avx2_tail_128_xor
 ###############################################################################
+.Lopen_avx2_tail_512: \n";
     # 384-512 bytes left
     &prep_state_avx2(4); $code.="
     xor $itr1, $itr1
     mov $inp, $itr2
-1:  \n";
+.Lopen_avx2_tail_512_rounds_and_x2hash: \n";
         &poly_add("0*8($itr2)");
         &poly_mul(); $code.="
         lea 2*8($itr2), $itr2
-2:  \n";
+.Lopen_avx2_tail_512_rounds_and_x1hash: \n";
         &emit_body(37);
         &poly_add("0*8($itr2)");
         &poly_mul_mulx();
@@ -1853,21 +1916,21 @@
         @loop_body = split /\n/, $chacha_body; $code.="
         inc $itr1
         cmp \$4, $itr1
-    jl  1b
+    jl  .Lopen_avx2_tail_512_rounds_and_x2hash
         cmp \$10, $itr1
-    jne 2b
+    jne .Lopen_avx2_tail_512_rounds_and_x1hash
     mov $inl, $itr1
     sub \$12*32, $itr1
     and \$-16, $itr1
-1:
+.Lopen_avx2_tail_512_hash:
         test $itr1, $itr1
-        je 1f\n";
+        je .Lopen_avx2_tail_512_done\n";
         &poly_add("0*8($itr2)");
         &poly_mul_mulx(); $code.="
         lea 2*8($itr2), $itr2
         sub \$2*8, $itr1
-    jmp 1b
-1:  \n";
+    jmp .Lopen_avx2_tail_512_hash
+.Lopen_avx2_tail_512_done: \n";
     &finalize_state_avx2(4); $code.="
     vmovdqa $A0, $tmp_store\n";
     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
@@ -1878,9 +1941,9 @@
     lea 12*32($inp), $inp
     lea 12*32($oup), $oup
     sub \$12*32, $inl
-open_avx2_tail_loop:
+.Lopen_avx2_tail_128_xor:
     cmp \$32, $inl
-    jb open_avx2_tail
+    jb .Lopen_avx2_tail_32_xor
         sub \$32, $inl
         vpxor ($inp), $A0, $A0
         vmovdqu $A0, ($oup)
@@ -1889,11 +1952,11 @@
         vmovdqa $B0, $A0
         vmovdqa $C0, $B0
         vmovdqa $D0, $C0
-    jmp open_avx2_tail_loop
-open_avx2_tail:
+    jmp .Lopen_avx2_tail_128_xor
+.Lopen_avx2_tail_32_xor:
     cmp \$16, $inl
     vmovdqa $A0x, $A1x
-    jb 1f
+    jb .Lopen_avx2_exit
     sub \$16, $inl
     #load for decryption
     vpxor ($inp), $A0x, $A1x
@@ -1902,28 +1965,28 @@
     lea 1*16($oup), $oup
     vperm2i128 \$0x11, $A0, $A0, $A0
     vmovdqa $A0x, $A1x
-1:
+.Lopen_avx2_exit:
     vzeroupper
-    jmp open_sse_tail_16
+    jmp .Lopen_sse_tail_16
 ###############################################################################
-open_avx2_192:
+.Lopen_avx2_192:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
-    vpaddd .avx2_inc(%rip), $D0, $D1
+    vpaddd .Lavx2_inc(%rip), $D0, $D1
     vmovdqa $D0, $T2
     vmovdqa $D1, $T3
     mov \$10, $acc0
-1:  \n";
+.Lopen_avx2_192_rounds: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
+    jne .Lopen_avx2_192_rounds
     vpaddd $A2, $A0, $A0
     vpaddd $A2, $A1, $A1
     vpaddd $B2, $B0, $B0
@@ -1934,7 +1997,7 @@
     vpaddd $T3, $D1, $D1
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
-    vpand .clamp(%rip), $T0, $T0
+    vpand .Lclamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 192 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
@@ -1943,12 +2006,12 @@
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
-open_avx2_short:
-    mov %r8, $itr2
+.Lopen_avx2_short:
+    mov $adl, $itr2
     call poly_hash_ad_internal
-open_avx2_hash_and_xor_loop:
+.Lopen_avx2_short_hash_and_xor_loop:
         cmp \$32, $inl
-        jb open_avx2_short_tail_32
+        jb .Lopen_avx2_short_tail_32
         sub \$32, $inl\n";
         # Load + hash
         &poly_add("0*8($inp)");
@@ -1970,11 +2033,11 @@
         vmovdqa $D1, $C1
         vmovdqa $A2, $D1
         vmovdqa $B2, $A2
-    jmp open_avx2_hash_and_xor_loop
-open_avx2_short_tail_32:
+    jmp .Lopen_avx2_short_hash_and_xor_loop
+.Lopen_avx2_short_tail_32:
     cmp \$16, $inl
     vmovdqa $A0x, $A1x
-    jb 1f
+    jb .Lopen_avx2_short_tail_32_exit
     sub \$16, $inl\n";
     &poly_add("0*8($inp)");
     &poly_mul(); $code.="
@@ -1983,26 +2046,26 @@
     lea 1*16($inp), $inp
     lea 1*16($oup), $oup
     vextracti128 \$1, $A0, $A1x
-1:
+.Lopen_avx2_short_tail_32_exit:
     vzeroupper
-    jmp open_sse_tail_16
+    jmp .Lopen_sse_tail_16
 ###############################################################################
-open_avx2_320:
+.Lopen_avx2_320:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
-    vpaddd .avx2_inc(%rip), $D0, $D1
-    vpaddd .avx2_inc(%rip), $D1, $D2
+    vpaddd .Lavx2_inc(%rip), $D0, $D1
+    vpaddd .Lavx2_inc(%rip), $D1, $D2
     vmovdqa $B0, $T1
     vmovdqa $C0, $T2
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     mov \$10, $acc0
-1:  \n";
+.Lopen_avx2_320_rounds:  \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
@@ -2010,10 +2073,10 @@
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
-    vpaddd .chacha20_consts(%rip), $A0, $A0
-    vpaddd .chacha20_consts(%rip), $A1, $A1
-    vpaddd .chacha20_consts(%rip), $A2, $A2
+    jne .Lopen_avx2_320_rounds
+    vpaddd .Lchacha20_consts(%rip), $A0, $A0
+    vpaddd .Lchacha20_consts(%rip), $A1, $A1
+    vpaddd .Lchacha20_consts(%rip), $A2, $A2
     vpaddd $T1, $B0, $B0
     vpaddd $T1, $B1, $B1
     vpaddd $T1, $B2, $B2
@@ -2025,7 +2088,7 @@
     vpaddd $ctr2_store, $D2, $D2
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
-    vpand .clamp(%rip), $T0, $T0
+    vpand .Lclamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 320 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
@@ -2038,23 +2101,36 @@
     vperm2i128 \$0x02, $C2, $D2, $D1
     vperm2i128 \$0x13, $A2, $B2, $A2
     vperm2i128 \$0x13, $C2, $D2, $B2
-    jmp open_avx2_short
+    jmp .Lopen_avx2_short
 .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
+.cfi_endproc
 ###############################################################################
 ###############################################################################
-.type chacha20_poly1305_seal_avx2,\@function,2
+.type chacha20_poly1305_seal_avx2,\@abi-omnipotent
 .align 64
 chacha20_poly1305_seal_avx2:
+.cfi_startproc
+
+# Since the AVX2 function operates in the frame of the SSE function, we just copy the frame state over here
+.cfi_push %rbp
+.cfi_push %rbx
+.cfi_push %r12
+.cfi_push %r13
+.cfi_push %r14
+.cfi_push %r15
+.cfi_push $keyp
+.cfi_adjust_cfa_offset 288 + 32
+
     vzeroupper
-    vmovdqa .chacha20_consts(%rip), $A0
+    vmovdqa .Lchacha20_consts(%rip), $A0
     vbroadcasti128 0*16($keyp), $B0
     vbroadcasti128 1*16($keyp), $C0
     vbroadcasti128 2*16($keyp), $D0
-    vpaddd .avx2_init(%rip), $D0, $D0
+    vpaddd .Lavx2_init(%rip), $D0, $D0
     cmp \$6*32, $inl
-    jbe seal_avx2_192
+    jbe .Lseal_avx2_192
     cmp \$10*32, $inl
-    jbe seal_avx2_320
+    jbe .Lseal_avx2_320
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $A0, $A3
@@ -2067,26 +2143,26 @@
     vmovdqa $C0, $C3
     vmovdqa $C0, $state2_store
     vmovdqa $D0, $D3
-    vpaddd .avx2_inc(%rip), $D3, $D2
-    vpaddd .avx2_inc(%rip), $D2, $D1
-    vpaddd .avx2_inc(%rip), $D1, $D0
+    vpaddd .Lavx2_inc(%rip), $D3, $D2
+    vpaddd .Lavx2_inc(%rip), $D2, $D1
+    vpaddd .Lavx2_inc(%rip), $D1, $D0
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     vmovdqa $D3, $ctr3_store
     mov \$10, $acc0
-1:  \n";
+.Lseal_avx2_init_rounds: \n";
         foreach $l (@loop_body) {$code.=$l."\n";}
         @loop_body = split /\n/, $chacha_body; $code.="
         dec $acc0
-        jnz 1b\n";
+        jnz .Lseal_avx2_init_rounds\n";
     &finalize_state_avx2(4); $code.="
     vperm2i128 \$0x13, $C3, $D3, $C3
     vperm2i128 \$0x02, $A3, $B3, $D3
     vperm2i128 \$0x13, $A3, $B3, $A3
-    vpand .clamp(%rip), $D3, $D3
+    vpand .Lclamp(%rip), $D3, $D3
     vmovdqa $D3, $r_store
-    mov %r8, $itr2
+    mov $adl, $itr2
     call poly_hash_ad_internal
     # Safely store 320 bytes (otherwise would handle with optimized call)
     vpxor 0*32($inp), $A3, $A3
@@ -2100,7 +2176,7 @@
     sub \$10*32, $inl
     mov \$10*32, $itr1
     cmp \$4*32, $inl
-    jbe seal_avx2_hash
+    jbe .Lseal_avx2_short_hash_remainder
     vpxor 0*32($inp), $A0, $A0
     vpxor 1*32($inp), $B0, $B0
     vpxor 2*32($inp), $C0, $C0
@@ -2114,13 +2190,13 @@
     mov \$8, $itr1
     mov \$2, $itr2
     cmp \$4*32, $inl
-    jbe seal_avx2_tail_128
+    jbe .Lseal_avx2_tail_128
     cmp \$8*32, $inl
-    jbe seal_avx2_tail_256
+    jbe .Lseal_avx2_tail_256
     cmp \$12*32, $inl
-    jbe seal_avx2_tail_384
+    jbe .Lseal_avx2_tail_384
     cmp \$16*32, $inl
-    jbe seal_avx2_tail_512\n";
+    jbe .Lseal_avx2_tail_512\n";
     # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
     &prep_state_avx2(4);
     foreach $l (@loop_body) {$code.=$l."\n";}
@@ -2129,11 +2205,13 @@
     @loop_body = split /\n/, $chacha_body; $code.="
     sub \$16, $oup
     mov \$9, $itr1
-    jmp 4f
-1:  \n";
+    jmp .Lseal_avx2_main_loop_rounds_entry
+.align 32
+.Lseal_avx2_main_loop: \n";
         &prep_state_avx2(4); $code.="
         mov \$10, $itr1
-2:  \n";
+.align 32
+.Lseal_avx2_main_loop_rounds: \n";
             &poly_add("0*8($oup)");
             &emit_body(10);
             &poly_stage1_mulx();
@@ -2143,7 +2221,7 @@
             &poly_stage3_mulx();
             &emit_body(10);
             &poly_reduce_stage(); $code.="
-4:  \n";
+.Lseal_avx2_main_loop_rounds_entry: \n";
             &emit_body(9);
             &poly_add("2*8($oup)");
             &emit_body(8);
@@ -2168,65 +2246,68 @@
             foreach $l (@loop_body) {$code.=$l."\n";}
             @loop_body = split /\n/, $chacha_body; $code.="
             dec $itr1
-        jne 2b\n";
+        jne .Lseal_avx2_main_loop_rounds\n";
         &finalize_state_avx2(4); $code.="
-        lea 4*8($oup), $oup
         vmovdqa $A0, $tmp_store\n";
-        &poly_add("-4*8($oup)");
+        &poly_add("0*8($oup)");
+        &poly_mul_mulx();
+        &poly_add("2*8($oup)");
+        &poly_mul_mulx(); $code.="
+        lea 4*8($oup), $oup\n";
         &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
         vmovdqa $tmp_store, $A0\n";
-        &poly_mul();
         &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
-        &poly_add("-2*8($oup)");
         &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
-        &poly_mul();
         &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
         lea 16*32($inp), $inp
         sub \$16*32, $inl
         cmp \$16*32, $inl
-    jg 1b\n";
+    jg .Lseal_avx2_main_loop
+\n";
     &poly_add("0*8($oup)");
-    &poly_mul();
+    &poly_mul_mulx();
     &poly_add("2*8($oup)");
-    &poly_mul(); $code.="
+    &poly_mul_mulx(); $code.="
     lea 4*8($oup), $oup
     mov \$10, $itr1
     xor $itr2, $itr2
+
+    cmp \$12*32, $inl
+    ja  .Lseal_avx2_tail_512
+    cmp \$8*32, $inl
+    ja  .Lseal_avx2_tail_384
     cmp \$4*32, $inl
-    ja 3f
+    ja  .Lseal_avx2_tail_256
 ###############################################################################
-seal_avx2_tail_128:\n";
+.Lseal_avx2_tail_128:\n";
     &prep_state_avx2(1); $code.="
-1:  \n";
+.Lseal_avx2_tail_128_rounds_and_3xhash: \n";
         &poly_add("0($oup)");
-        &poly_mul(); $code.="
+        &poly_mul_mulx(); $code.="
         lea 2*8($oup), $oup
-2:  \n";
+.Lseal_avx2_tail_128_rounds_and_2xhash: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &poly_add("0*8($oup)");
-        &poly_mul();
+        &poly_mul_mulx();
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &poly_add("2*8($oup)");
-        &poly_mul(); $code.="
+        &poly_mul_mulx(); $code.="
         lea 4*8($oup), $oup
         dec $itr1
-    jg 1b
+    jg  .Lseal_avx2_tail_128_rounds_and_3xhash
         dec $itr2
-    jge 2b\n";
+    jge .Lseal_avx2_tail_128_rounds_and_2xhash\n";
     &finalize_state_avx2(1);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
-    jmp seal_avx2_short_loop
-3:
-    cmp \$8*32, $inl
-    ja 3f
+    jmp .Lseal_avx2_short_loop
 ###############################################################################
-seal_avx2_tail_256:\n";
+.Lseal_avx2_tail_256:\n";
     &prep_state_avx2(2); $code.="
-1:  \n";
+.Lseal_avx2_tail_256_rounds_and_3xhash: \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 2*8($oup), $oup
-2:  \n";
+.Lseal_avx2_tail_256_rounds_and_2xhash: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0*8($oup)");
@@ -2237,27 +2318,24 @@
         &poly_mul(); $code.="
         lea 4*8($oup), $oup
         dec $itr1
-    jg 1b
+    jg  .Lseal_avx2_tail_256_rounds_and_3xhash
         dec $itr2
-    jge 2b\n";
+    jge .Lseal_avx2_tail_256_rounds_and_2xhash\n";
     &finalize_state_avx2(2);
     &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
     &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
     mov \$4*32, $itr1
     lea 4*32($inp), $inp
     sub \$4*32, $inl
-    jmp seal_avx2_hash
-3:
-    cmp \$12*32, $inl
-    ja seal_avx2_tail_512
+    jmp .Lseal_avx2_short_hash_remainder
 ###############################################################################
-seal_avx2_tail_384:\n";
+.Lseal_avx2_tail_384:\n";
     &prep_state_avx2(3); $code.="
-1:  \n";
+.Lseal_avx2_tail_384_rounds_and_3xhash: \n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         lea 2*8($oup), $oup
-2:  \n";
+.Lseal_avx2_tail_384_rounds_and_2xhash: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &poly_add("0*8($oup)");
@@ -2270,9 +2348,9 @@
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         lea 4*8($oup), $oup
         dec $itr1
-    jg 1b
+    jg  .Lseal_avx2_tail_384_rounds_and_3xhash
         dec $itr2
-    jge 2b\n";
+    jge .Lseal_avx2_tail_384_rounds_and_2xhash\n";
     &finalize_state_avx2(3);
     &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
     &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
@@ -2280,15 +2358,15 @@
     mov \$8*32, $itr1
     lea 8*32($inp), $inp
     sub \$8*32, $inl
-    jmp seal_avx2_hash
+    jmp .Lseal_avx2_short_hash_remainder
 ###############################################################################
-seal_avx2_tail_512:\n";
+.Lseal_avx2_tail_512:\n";
     &prep_state_avx2(4); $code.="
-1:  \n";
+.Lseal_avx2_tail_512_rounds_and_3xhash: \n";
         &poly_add("0($oup)");
         &poly_mul_mulx(); $code.="
         lea 2*8($oup), $oup
-2:  \n";
+.Lseal_avx2_tail_512_rounds_and_2xhash: \n";
         &emit_body(20);
         &poly_add("0*8($oup)");
         &emit_body(20);
@@ -2313,9 +2391,9 @@
         @loop_body = split /\n/, $chacha_body; $code.="
         lea 4*8($oup), $oup
         dec $itr1
-    jg 1b
+    jg .Lseal_avx2_tail_512_rounds_and_3xhash
         dec $itr2
-    jge 2b\n";
+    jge .Lseal_avx2_tail_512_rounds_and_2xhash\n";
     &finalize_state_avx2(4); $code.="
     vmovdqa $A0, $tmp_store\n";
     &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
@@ -2326,24 +2404,24 @@
     mov \$12*32, $itr1
     lea 12*32($inp), $inp
     sub \$12*32, $inl
-    jmp seal_avx2_hash
+    jmp .Lseal_avx2_short_hash_remainder
 ################################################################################
-seal_avx2_320:
+.Lseal_avx2_320:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
-    vpaddd .avx2_inc(%rip), $D0, $D1
-    vpaddd .avx2_inc(%rip), $D1, $D2
+    vpaddd .Lavx2_inc(%rip), $D0, $D1
+    vpaddd .Lavx2_inc(%rip), $D1, $D2
     vmovdqa $B0, $T1
     vmovdqa $C0, $T2
     vmovdqa $D0, $ctr0_store
     vmovdqa $D1, $ctr1_store
     vmovdqa $D2, $ctr2_store
     mov \$10, $acc0
-1:  \n";
+.Lseal_avx2_320_rounds: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
@@ -2351,10 +2429,10 @@
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
         &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
-    vpaddd .chacha20_consts(%rip), $A0, $A0
-    vpaddd .chacha20_consts(%rip), $A1, $A1
-    vpaddd .chacha20_consts(%rip), $A2, $A2
+    jne .Lseal_avx2_320_rounds
+    vpaddd .Lchacha20_consts(%rip), $A0, $A0
+    vpaddd .Lchacha20_consts(%rip), $A1, $A1
+    vpaddd .Lchacha20_consts(%rip), $A2, $A2
     vpaddd $T1, $B0, $B0
     vpaddd $T1, $B1, $B1
     vpaddd $T1, $B2, $B2
@@ -2366,7 +2444,7 @@
     vpaddd $ctr2_store, $D2, $D2
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
-    vpand .clamp(%rip), $T0, $T0
+    vpand .Lclamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 320 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
@@ -2379,26 +2457,26 @@
     vperm2i128 \$0x02, $C2, $D2, $D1
     vperm2i128 \$0x13, $A2, $B2, $A2
     vperm2i128 \$0x13, $C2, $D2, $B2
-    jmp seal_avx2_short
+    jmp .Lseal_avx2_short
 ################################################################################
-seal_avx2_192:
+.Lseal_avx2_192:
     vmovdqa $A0, $A1
     vmovdqa $A0, $A2
     vmovdqa $B0, $B1
     vmovdqa $B0, $B2
     vmovdqa $C0, $C1
     vmovdqa $C0, $C2
-    vpaddd .avx2_inc(%rip), $D0, $D1
+    vpaddd .Lavx2_inc(%rip), $D0, $D1
     vmovdqa $D0, $T2
     vmovdqa $D1, $T3
     mov \$10, $acc0
-1:  \n";
+.Lseal_avx2_192_rounds: \n";
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
         &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
         &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
         dec $acc0
-    jne 1b
+    jne .Lseal_avx2_192_rounds
     vpaddd $A2, $A0, $A0
     vpaddd $A2, $A1, $A1
     vpaddd $B2, $B0, $B0
@@ -2409,7 +2487,7 @@
     vpaddd $T3, $D1, $D1
     vperm2i128 \$0x02, $A0, $B0, $T0
     # Clamp and store the key
-    vpand .clamp(%rip), $T0, $T0
+    vpand .Lclamp(%rip), $T0, $T0
     vmovdqa $T0, $r_store
     # Stream for up to 192 bytes
     vperm2i128 \$0x13, $A0, $B0, $A0
@@ -2418,21 +2496,21 @@
     vperm2i128 \$0x02, $C1, $D1, $D0
     vperm2i128 \$0x13, $A1, $B1, $A1
     vperm2i128 \$0x13, $C1, $D1, $B1
-seal_avx2_short:
-    mov %r8, $itr2
+.Lseal_avx2_short:
+    mov $adl, $itr2
     call poly_hash_ad_internal
     xor $itr1, $itr1
-seal_avx2_hash:
+.Lseal_avx2_short_hash_remainder:
         cmp \$16, $itr1
-        jb seal_avx2_short_loop\n";
+        jb .Lseal_avx2_short_loop\n";
         &poly_add("0($oup)");
         &poly_mul(); $code.="
         sub \$16, $itr1
         add \$16, $oup
-    jmp seal_avx2_hash
-seal_avx2_short_loop:
+    jmp .Lseal_avx2_short_hash_remainder
+.Lseal_avx2_short_loop:
         cmp \$32, $inl
-        jb seal_avx2_short_tail
+        jb .Lseal_avx2_short_tail
         sub \$32, $inl
         # Encrypt
         vpxor ($inp), $A0, $A0
@@ -2454,10 +2532,10 @@
         vmovdqa $D1, $C1
         vmovdqa $A2, $D1
         vmovdqa $B2, $A2
-    jmp seal_avx2_short_loop
-seal_avx2_short_tail:
+    jmp .Lseal_avx2_short_loop
+.Lseal_avx2_short_tail:
     cmp \$16, $inl
-    jb 1f
+    jb .Lseal_avx2_exit
     sub \$16, $inl
     vpxor ($inp), $A0x, $A3x
     vmovdqu $A3x, ($oup)
@@ -2466,24 +2544,16 @@
     &poly_mul(); $code.="
     lea 1*16($oup), $oup
     vextracti128 \$1, $A0, $A0x
-1:
+.Lseal_avx2_exit:
     vzeroupper
-    jmp seal_sse_tail_16
+    jmp .Lseal_sse_tail_16
 .cfi_endproc
+.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2
 ";
 }
 
-if (!$win64) {
-  $code =~ s/\`([^\`]*)\`/eval $1/gem;
-  print $code;
-} else {
-  print <<___;
-.text
-.globl dummy_chacha20_poly1305_asm
-.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
-dummy_chacha20_poly1305_asm:
-    ret
-___
-}
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+
+print $code;
 
 close STDOUT or die "error closing STDOUT";
diff --git a/crypto/cipher_extra/e_chacha20poly1305.c b/crypto/cipher_extra/e_chacha20poly1305.c
index 1c175e9..1650188 100644
--- a/crypto/cipher_extra/e_chacha20poly1305.c
+++ b/crypto/cipher_extra/e_chacha20poly1305.c
@@ -18,18 +18,15 @@
 
 #include <openssl/chacha.h>
 #include <openssl/cipher.h>
-#include <openssl/cpu.h>
 #include <openssl/err.h>
 #include <openssl/mem.h>
 #include <openssl/poly1305.h>
 #include <openssl/type_check.h>
 
+#include "internal.h"
+#include "../chacha/internal.h"
 #include "../fipsmodule/cipher/internal.h"
 #include "../internal.h"
-#include "../chacha/internal.h"
-
-
-#define POLY1305_TAG_LEN 16
 
 struct aead_chacha20_poly1305_ctx {
   uint8_t key[32];
@@ -44,78 +41,6 @@
                       "AEAD state has insufficient alignment");
 #endif
 
-// For convenience (the x86_64 calling convention allows only six parameters in
-// registers), the final parameter for the assembly functions is both an input
-// and output parameter.
-union open_data {
-  struct {
-    alignas(16) uint8_t key[32];
-    uint32_t counter;
-    uint8_t nonce[12];
-  } in;
-  struct {
-    uint8_t tag[POLY1305_TAG_LEN];
-  } out;
-};
-
-union seal_data {
-  struct {
-    alignas(16) uint8_t key[32];
-    uint32_t counter;
-    uint8_t nonce[12];
-    const uint8_t *extra_ciphertext;
-    size_t extra_ciphertext_len;
-  } in;
-  struct {
-    uint8_t tag[POLY1305_TAG_LEN];
-  } out;
-};
-
-#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \
-    !defined(OPENSSL_WINDOWS)
-static int asm_capable(void) {
-  const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0;
-  return sse41_capable;
-}
-
-OPENSSL_STATIC_ASSERT(sizeof(union open_data) == 48, "wrong open_data size");
-OPENSSL_STATIC_ASSERT(sizeof(union seal_data) == 48 + 8 + 8,
-                      "wrong seal_data size");
-
-// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts
-// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|.
-// Additional input parameters are passed in |aead_data->in|. On exit, it will
-// write calculated tag value to |aead_data->out.tag|, which the caller must
-// check.
-extern void chacha20_poly1305_open(uint8_t *out_plaintext,
-                                   const uint8_t *ciphertext,
-                                   size_t plaintext_len, const uint8_t *ad,
-                                   size_t ad_len, union open_data *aead_data);
-
-// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It encrypts
-// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|.
-// Additional input parameters are passed in |aead_data->in|. The calculated tag
-// value is over the computed ciphertext concatenated with |extra_ciphertext|
-// and written to |aead_data->out.tag|.
-extern void chacha20_poly1305_seal(uint8_t *out_ciphertext,
-                                   const uint8_t *plaintext,
-                                   size_t plaintext_len, const uint8_t *ad,
-                                   size_t ad_len, union seal_data *aead_data);
-#else
-static int asm_capable(void) { return 0; }
-
-
-static void chacha20_poly1305_open(uint8_t *out_plaintext,
-                                   const uint8_t *ciphertext,
-                                   size_t plaintext_len, const uint8_t *ad,
-                                   size_t ad_len, union open_data *aead_data) {}
-
-static void chacha20_poly1305_seal(uint8_t *out_ciphertext,
-                                   const uint8_t *plaintext,
-                                   size_t plaintext_len, const uint8_t *ad,
-                                   size_t ad_len, union seal_data *aead_data) {}
-#endif
-
 static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key,
                                        size_t key_len, size_t tag_len) {
   struct aead_chacha20_poly1305_ctx *c20_ctx =
@@ -238,8 +163,8 @@
     }
   }
 
-  union seal_data data;
-  if (asm_capable()) {
+  union chacha20_poly1305_seal_data data;
+  if (chacha20_poly1305_asm_capable()) {
     OPENSSL_memcpy(data.in.key, key, 32);
     data.in.counter = 0;
     OPENSSL_memcpy(data.in.nonce, nonce, 12);
@@ -321,8 +246,8 @@
     return 0;
   }
 
-  union open_data data;
-  if (asm_capable()) {
+  union chacha20_poly1305_open_data data;
+  if (chacha20_poly1305_asm_capable()) {
     OPENSSL_memcpy(data.in.key, key, 32);
     data.in.counter = 0;
     OPENSSL_memcpy(data.in.nonce, nonce, 12);
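
(For reviewers: the calling pattern both asm call sites above follow is easiest
to see end to end. Below is a minimal sketch, not part of the patch, assuming a
hypothetical wrapper name and a separate 16-byte tag output; OPENSSL_memcpy,
POLY1305_TAG_LEN, and the union come from the headers as patched below.)

// Hypothetical sketch, not part of the patch. The final |data| parameter is
// both input (key, counter, nonce, extra ciphertext) and output (tag), so
// |data.in| is filled before the call and |data.out.tag| is read after it.
static void seal_sketch(uint8_t *out_ciphertext, const uint8_t *plaintext,
                        size_t plaintext_len, const uint8_t *ad, size_t ad_len,
                        const uint8_t key[32], const uint8_t nonce[12],
                        uint8_t out_tag[POLY1305_TAG_LEN]) {
  union chacha20_poly1305_seal_data data;
  OPENSSL_memcpy(data.in.key, key, 32);
  data.in.counter = 0;
  OPENSSL_memcpy(data.in.nonce, nonce, 12);
  data.in.extra_ciphertext = NULL;
  data.in.extra_ciphertext_len = 0;
  chacha20_poly1305_seal(out_ciphertext, plaintext, plaintext_len, ad, ad_len,
                         &data);
  OPENSSL_memcpy(out_tag, data.out.tag, POLY1305_TAG_LEN);
}

Packing the input and output state into one union keeps the assembly entry
points at six parameters, all passed in registers on x86-64.
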
diff --git a/crypto/cipher_extra/internal.h b/crypto/cipher_extra/internal.h
index 1d2c4e1..c2af48e 100644
--- a/crypto/cipher_extra/internal.h
+++ b/crypto/cipher_extra/internal.h
@@ -57,7 +57,11 @@
 #ifndef OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H
 #define OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H
 
+#include <stdlib.h>
+
 #include <openssl/base.h>
+#include <openssl/cpu.h>
+#include <openssl/type_check.h>
 
 #include "../internal.h"
 
@@ -120,6 +124,89 @@
                               const uint8_t *mac_secret,
                               unsigned mac_secret_length);
 
+#define POLY1305_TAG_LEN 16
+
+// For convenience (the x86_64 calling convention allows only six parameters in
+// registers), the final parameter for the assembly functions is both an input
+// and output parameter.
+union chacha20_poly1305_open_data {
+  struct {
+    alignas(16) uint8_t key[32];
+    uint32_t counter;
+    uint8_t nonce[12];
+  } in;
+  struct {
+    uint8_t tag[POLY1305_TAG_LEN];
+  } out;
+};
+
+union chacha20_poly1305_seal_data {
+  struct {
+    alignas(16) uint8_t key[32];
+    uint32_t counter;
+    uint8_t nonce[12];
+    const uint8_t *extra_ciphertext;
+    size_t extra_ciphertext_len;
+  } in;
+  struct {
+    uint8_t tag[POLY1305_TAG_LEN];
+  } out;
+};
+
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM)
+
+OPENSSL_STATIC_ASSERT(sizeof(union chacha20_poly1305_open_data) == 48,
+                      "wrong chacha20_poly1305_open_data size");
+OPENSSL_STATIC_ASSERT(sizeof(union chacha20_poly1305_seal_data) == 48 + 8 + 8,
+                      "wrong chacha20_poly1305_seal_data size");
+
+OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) {
+  const int sse41_capable = (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0;
+  return sse41_capable;
+}
+
+// chacha20_poly1305_open is defined in chacha20_poly1305_x86_64.pl. It decrypts
+// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|.
+// Additional input parameters are passed in |aead_data->in|. On exit, it will
+// write the calculated tag value to |aead_data->out.tag|, which the caller must
+// check.
+extern void chacha20_poly1305_open(uint8_t *out_plaintext,
+                                   const uint8_t *ciphertext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len,
+                                   union chacha20_poly1305_open_data *data);
+
+// chacha20_poly1305_seal is defined in chacha20_poly1305_x86_64.pl. It encrypts
+// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|.
+// Additional input parameters are passed in |aead_data->in|. The calculated tag
+// value is over the computed ciphertext concatenated with |extra_ciphertext|
+// and written to |aead_data->out.tag|.
+extern void chacha20_poly1305_seal(uint8_t *out_ciphertext,
+                                   const uint8_t *plaintext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len,
+                                   union chacha20_poly1305_seal_data *data);
+#else
+
+OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; }
+
+OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext,
+                                   const uint8_t *ciphertext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len,
+                                   union chacha20_poly1305_open_data *data) {
+  abort();
+}
+
+OPENSSL_INLINE void chacha20_poly1305_seal(uint8_t *out_ciphertext,
+                                   const uint8_t *plaintext,
+                                   size_t plaintext_len, const uint8_t *ad,
+                                   size_t ad_len,
+                                   union chacha20_poly1305_seal_data *data) {
+  abort();
+}
+#endif
+
 
 #if defined(__cplusplus)
 }  // extern C
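
(For reviewers: to make the "which the caller must check" contract above
concrete, here is a minimal sketch of the open path, not part of the patch.
The wrapper name is hypothetical; CRYPTO_memcmp is BoringSSL's constant-time
comparator from ../internal.h.)

// Hypothetical sketch, not part of the patch. Returns 1 on success, 0 on
// authentication failure. Because |data| is a union, |data.out.tag| overlays
// |data.in|, so the inputs must not be read back after the call.
static int open_sketch(uint8_t *out_plaintext, const uint8_t *ciphertext,
                       size_t ciphertext_len, const uint8_t *ad, size_t ad_len,
                       const uint8_t key[32], const uint8_t nonce[12],
                       const uint8_t tag[POLY1305_TAG_LEN]) {
  union chacha20_poly1305_open_data data;
  OPENSSL_memcpy(data.in.key, key, 32);
  data.in.counter = 0;
  OPENSSL_memcpy(data.in.nonce, nonce, 12);
  chacha20_poly1305_open(out_plaintext, ciphertext, ciphertext_len, ad, ad_len,
                         &data);
  return CRYPTO_memcmp(data.out.tag, tag, POLY1305_TAG_LEN) == 0;
}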