Clarify use of |$end0| in stitched x86-64 AES-GCM code.

There was some uncertainty about what the code is doing with |$end0|
and whether it was necessary for |$len| to be a multiple of 16 or 96.
Hopefully these added comments make it clear that the code is correct
except for the caveat regarding low memory addresses.
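
The bookkeeping the new comments describe can be modeled roughly as
follows (a hypothetical C sketch; |compute_end0| and
|next_block_advance| are invented names for illustration, not
functions in the code):

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    /* Models "lea -0xc0($inp,$len),$end0": |end0| points 2*96 bytes
     * before the end of the input. The assembly computes this with
     * plain 64-bit arithmetic, so it wraps around when
     * inp + len < 2*96; that is the low-address caveat. */
    static const uint8_t *compute_end0(const uint8_t *inp, size_t len) {
      return inp + len - 2 * 96;
    }

    /* Models the "xor %r12,%r12; cmp $in0,$end0" step: the advance is
     * 96 when a full 96-byte block remains beyond the one being
     * processed, and 0 otherwise, so loads at in0 + advance either
     * prefetch the next block or harmlessly re-read the current one. */
    static size_t next_block_advance(const uint8_t *in0,
                                     const uint8_t *end0) {
      return in0 <= end0 ? 96 : 0;
    }

    int main(void) {
      uint8_t buf[4 * 96];
      const uint8_t *end0 = compute_end0(buf, sizeof(buf));
      assert(next_block_advance(buf, end0) == 96);         /* more input */
      assert(next_block_advance(buf + 3 * 96, end0) == 0); /* last block */
      return 0;
    }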

Change-Id: Iea546a59dc7aeb400f50ac5d2d7b9cb88ace9027
Reviewed-on: https://boringssl-review.googlesource.com/7194
Reviewed-by: Adam Langley <agl@google.com>
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index c7402f5..5d58bbb 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -97,6 +97,30 @@
 	  vpxor		$rndkey,$inout3,$inout3
 	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
 	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
+
+	# At this point, the current block of 96 (0x60) bytes has already been
+	# loaded into registers. Concurrently with processing it, we want to
+	# load the next 96 bytes of input for the next iteration. We can only
+	# do this if there are at least 96 more bytes of input beyond the
+	# input we're currently processing, or else we'd read past the end of
+	# the input buffer. Below, we set |%r12| to 96 if there are at least 96
+	# bytes of input beyond the 96 bytes we're already processing, and we
+	# set |%r12| to 0 otherwise. When |%r12| is 96, we read in the next
+	# block so that it is in registers for the next loop iteration. When
+	# |%r12| is 0, we re-read the current block and then ignore what we
+	# re-read.
+	#
+	# Here, |$in0| points to the current (already read into registers)
+	# block, and |$end0| points to 2*96 bytes before the end of the input.
+	# Thus, |$in0| > |$end0| means that we do not have the next 96-byte
+	# block to read in, and |$in0| <= |$end0| means we do.
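+	#
+	# As a hypothetical example: if |$inp| = 0x1000 and |$len| = 0x100,
+	# then |$end0| = 0x1000 + 0x100 - 0xc0 = 0x1040. On the first
+	# iteration, |$in0| = 0x1000 <= 0x1040, so |%r12| = 96 and the next
+	# block is loaded early. Once |$in0| has advanced past 0x1040, there
+	# is no complete next block, so |%r12| = 0 and the current block is
+	# harmlessly re-read.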
 	xor		%r12,%r12
 	cmp		$in0,$end0
 
@@ -389,6 +406,9 @@
 .align	32
 aesni_gcm_decrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
+	# bytes of input.
 	cmp	\$0x60,$len			# minimal accepted length
 	jb	.Lgcm_dec_abort
 
@@ -443,7 +470,20 @@
 	vmovdqu		0x50($inp),$Z3		# I[5]
 	lea		($inp),$in0
 	vmovdqu		0x40($inp),$Z0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not a multiple of 96 or 16. XXX: This seems
+	# to require that |$inp| + |$len| >= 2*96 (0xc0); i.e., |$inp| must not
+	# be near the very beginning of the address space when |$len| < 2*96
+	# (0xc0), or else the |lea| below wraps |$end0| around.
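+	#
+	# For example (hypothetical values): |$inp| = 0x40 and |$len| = 0x60
+	# pass the length check above, but |$inp| + |$len| = 0xa0 < 0xc0, so
+	# |$end0| would wrap around to near the top of the address space and
+	# the |$in0| <= |$end0| check would spuriously pass.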
 	lea		-0xc0($inp,$len),$end0
+
 	vmovdqu		0x30($inp),$Z1
 	shr		\$4,$len
 	xor		$ret,$ret
@@ -599,6 +627,10 @@
 .align	32
 aesni_gcm_encrypt:
 	xor	$ret,$ret
+
+	# We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
+	# input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
+	# least 96 more bytes of input.
 	cmp	\$0x60*3,$len			# minimal accepted length
 	jb	.Lgcm_enc_abort
 
@@ -648,7 +692,18 @@
 .Lenc_no_key_aliasing:
 
 	lea		($out),$in0
+
+	# |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
+	# bytes before the end of the input. Note, in particular, that this is
+	# correct even if |$len| is not a multiple of 96 or 16. Unlike in the
+	# decryption case, there is no caveat that |$out| must not be near the
+	# very beginning of the address space, because the check above
+	# guarantees |$len| >= 3*96, and so |$out| + |$len| >= 2*96 (0xc0)
+	# holds regardless of |$out|.
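+	# (Concretely: even with |$out| = 0, |$out| + |$len| >= 3*96 = 288,
+	# which exceeds 0xc0 = 192, so the |lea| below cannot wrap.)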
 	lea		-0xc0($out,$len),$end0
+
 	shr		\$4,$len
 
 	call		_aesni_ctr32_6x