Fix the order of Windows unwind codes.

The unwind tester suggests Windows doesn't care, but the documentation
says that unwind codes should be sorted in descending order of prolog
offset, which means the code for the last prolog instruction should
come first.

https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=vs-2017#struct-unwind_code

Bug: 259
Change-Id: I21e54c362e18e0405f980005112cc3f7c417c70c
Reviewed-on: https://boringssl-review.googlesource.com/c/34785
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
index 830381b..1dd2519 100644
--- a/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
+++ b/crypto/fipsmodule/modes/asm/ghash-ssse3-x86_64.pl
@@ -374,16 +374,16 @@
 	.byte	5	# num_slots = 1 + 2 + 2
 	.byte	0	# no frame register
 
-	.byte	.Lgmult_seh_allocstack-.Lgmult_seh_begin
-	.byte	@{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
+	.byte	.Lgmult_seh_save_xmm10-.Lgmult_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (10 << 4)]}
+	.value	1
 
 	.byte	.Lgmult_seh_save_xmm6-.Lgmult_seh_begin
 	.byte	@{[$UWOP_SAVE_XMM128 | (6 << 4)]}
 	.value	0
 
-	.byte	.Lgmult_seh_save_xmm10-.Lgmult_seh_begin
-	.byte	@{[$UWOP_SAVE_XMM128 | (10 << 4)]}
-	.value	1
+	.byte	.Lgmult_seh_allocstack-.Lgmult_seh_begin
+	.byte	@{[$UWOP_ALLOC_SMALL | (((40 - 8) / 8) << 4)]}
 
 .align	8
 .Lghash_seh_info:
@@ -392,20 +392,20 @@
 	.byte	7	# num_slots = 1 + 2 + 2 + 2
 	.byte	0	# no frame register
 
-	.byte	.Lghash_seh_allocstack-.Lghash_seh_begin
-	.byte	@{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
-
-	.byte	.Lghash_seh_save_xmm6-.Lghash_seh_begin
-	.byte	@{[$UWOP_SAVE_XMM128 | (6 << 4)]}
-	.value	0
+	.byte	.Lghash_seh_save_xmm11-.Lghash_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (11 << 4)]}
+	.value	2
 
 	.byte	.Lghash_seh_save_xmm10-.Lghash_seh_begin
 	.byte	@{[$UWOP_SAVE_XMM128 | (10 << 4)]}
 	.value	1
 
-	.byte	.Lghash_seh_save_xmm11-.Lghash_seh_begin
-	.byte	@{[$UWOP_SAVE_XMM128 | (11 << 4)]}
-	.value	2
+	.byte	.Lghash_seh_save_xmm6-.Lghash_seh_begin
+	.byte	@{[$UWOP_SAVE_XMM128 | (6 << 4)]}
+	.value	0
+
+	.byte	.Lghash_seh_allocstack-.Lghash_seh_begin
+	.byte	@{[$UWOP_ALLOC_SMALL | (((56 - 8) / 8) << 4)]}
 ____
 }
 
diff --git a/crypto/test/asm/trampoline-x86_64.pl b/crypto/test/asm/trampoline-x86_64.pl
index 75aae45..8cb1410 100755
--- a/crypto/test/asm/trampoline-x86_64.pl
+++ b/crypto/test/asm/trampoline-x86_64.pl
@@ -447,26 +447,7 @@
 
   my $unwind_codes = "";
   my $num_slots = 0;
-  if ($stack_alloc_size <= 128) {
-    my $info = $UWOP_ALLOC_SMALL | ((($stack_alloc_size - 8) / 8) << 4);
-    $unwind_codes .= <<____;
-	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
-	.byte	$info
-____
-    $num_slots++;
-  } else {
-    die "stack allocation needs three unwind slots" if ($stack_alloc_size > 512 * 1024 + 8);
-    my $info = $UWOP_ALLOC_LARGE;
-    my $value = $stack_alloc_size / 8;
-    $unwind_codes .= <<____;
-	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
-	.byte	$info
-	.value	$value
-____
-    $num_slots += 2;
-  }
-
-  foreach my $reg (@caller_state) {
+  foreach my $reg (reverse @caller_state) {
     $reg = substr($reg, 1);
     die "unknown register $reg" unless exists($reg_offsets{$reg});
     if ($reg =~ /^r/) {
@@ -493,6 +474,25 @@
     }
   }
 
+  if ($stack_alloc_size <= 128) {
+    my $info = $UWOP_ALLOC_SMALL | ((($stack_alloc_size - 8) / 8) << 4);
+    $unwind_codes .= <<____;
+	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
+	.byte	$info
+____
+    $num_slots++;
+  } else {
+    die "stack allocation needs three unwind slots" if ($stack_alloc_size > 512 * 1024 + 8);
+    my $info = $UWOP_ALLOC_LARGE;
+    my $value = $stack_alloc_size / 8;
+    $unwind_codes .= <<____;
+	.byte	.Labi_test_trampoline_seh_prolog_alloc-.Labi_test_trampoline_seh_begin
+	.byte	$info
+	.value	$value
+____
+    $num_slots += 2;
+  }
+
   $code .= <<____;
 .section	.pdata
 .align	4