p256-x86_64-asm.pl: minor sqr_montx cleanup.

Drop some redundant instructions from the reduction in ecp_nistz256_sqr_montx.

(Imported from upstream's 8fc063dcc9668589fd95533d25932396d60987f9.)

I believe this is a no-op for us as we do not currently enable the
ADX-based optimizations.
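
For context, MULX takes one multiplicand implicitly from %rdx, so the
product is the same whichever operand lives there. Upstream keeps
.Lpoly+8*3 in %rdx across all four reduction steps and passes the
accumulator limb as the explicit operand, which drops the per-step
"mov $accN, %rdx" reloads. A rough sketch of the equivalence (not the
literal perlasm):

    # before: reload the limb into %rdx each step
    mov   $acc0, %rdx
    mulx  $t1, $t0, $acc0        # acc0 * .Lpoly[3]

    # after: %rdx already holds .Lpoly[3]
    mulx  $acc0, $t0, $acc0      # .Lpoly[3] * acc0

Likewise, XOR of a register with itself clears CF, so the following ADC
and SBB can be plain ADD and SUB, and the now-unneeded "xor %eax, %eax"
goes away.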

Change-Id: I34a5f5ffb965d59c67f6b9f0ca7937e49ba6e820
Reviewed-on: https://boringssl-review.googlesource.com/16884
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: Adam Langley <agl@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
diff --git a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
index 4dadd4a..1ac3d21 100755
--- a/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
+++ b/crypto/fipsmodule/ec/asm/p256-x86_64-asm.pl
@@ -863,19 +863,18 @@
 	adox	$t1, $acc5
 	.byte	0x67,0x67
 	mulx	%rdx, $t0, $t4
-	 mov	$acc0, %rdx
+	 mov	.Lpoly+8*3(%rip), %rdx
 	adox	$t0, $acc6
 	 shlx	$a_ptr, $acc0, $t0
 	adox	$t4, $acc7
 	 shrx	$a_ptr, $acc0, $t4
-	 mov	.Lpoly+8*3(%rip), $t1
+	mov	%rdx,$t1
 
 	# reduction step 1
 	add	$t0, $acc1
 	adc	$t4, $acc2
 
-	mulx	$t1, $t0, $acc0
-	 mov	$acc1, %rdx
+	mulx	$acc0, $t0, $acc0
 	adc	$t0, $acc3
 	 shlx	$a_ptr, $acc1, $t0
 	adc	\$0, $acc0
@@ -885,8 +884,7 @@
 	add	$t0, $acc2
 	adc	$t4, $acc3
 
-	mulx	$t1, $t0, $acc1
-	 mov	$acc2, %rdx
+	mulx	$acc1, $t0, $acc1
 	adc	$t0, $acc0
 	 shlx	$a_ptr, $acc2, $t0
 	adc	\$0, $acc1
@@ -896,8 +894,7 @@
 	add	$t0, $acc3
 	adc	$t4, $acc0
 
-	mulx	$t1, $t0, $acc2
-	 mov	$acc3, %rdx
+	mulx	$acc2, $t0, $acc2
 	adc	$t0, $acc1
 	 shlx	$a_ptr, $acc3, $t0
 	adc	\$0, $acc2
@@ -907,12 +904,12 @@
 	add	$t0, $acc0
 	adc	$t4, $acc1
 
-	mulx	$t1, $t0, $acc3
+	mulx	$acc3, $t0, $acc3
 	adc	$t0, $acc2
 	adc	\$0, $acc3
 
-	xor	$t3, $t3		# cf=0
-	adc	$acc0, $acc4		# accumulate upper half
+	xor	$t3, $t3
+	add	$acc0, $acc4		# accumulate upper half
 	 mov	.Lpoly+8*1(%rip), $a_ptr
 	adc	$acc1, $acc5
 	 mov	$acc4, $acc0
@@ -921,8 +918,7 @@
 	 mov	$acc5, $acc1
 	adc	\$0, $t3
 
-	xor	%eax, %eax		# cf=0
-	sbb	\$-1, $acc4		# .Lpoly[0]
+	sub	\$-1, $acc4		# .Lpoly[0]
 	 mov	$acc6, $acc2
 	sbb	$a_ptr, $acc5		# .Lpoly[1]
 	sbb	\$0, $acc6		# .Lpoly[2]