Fix vpaes-armv7.pl in ARM mode.
This file runs against the limit of ARMv7's ADR pseudo-instruction. ADR
expands to an ADD or SUB of the pc register to find an address. That
immediate must fit in ARM's encoding scheme: 8 bits of constant and 4
bits of rotation. This means larger values must be more aligned.
ARM additionally has two instruction encodings, ARM mode and Thumb mode. Our
assembly files may use either encoding (do we actually need to support this?).
In ARM mode, the distances get large enough to require 16-byte alignment.
Moving constants closer to their use resolves most of this, but common
constants in _vpaes_consts are used by the whole file. Affected ADR
instructions must be placed at 8 mod 16 (the pc register is 8 ahead).
Instructions with this constraint have been commented.
For details on ARM's immediate value encoding scheme, see
https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
Update-Note: See b/141080375
Change-Id: Iadac36d800bb45901b513055fcc28a3a60f9060c
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/37524
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: David Benjamin <davidben@google.com>
diff --git a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
index f9fadda..deb9a2a 100644
--- a/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
+++ b/crypto/fipsmodule/aes/asm/vpaes-armv7.pl
@@ -66,6 +66,22 @@
# We adjust the register usage from x86_64 to avoid this. (Unfortunately, the
# two-address pshufb always matched these operands, so this is common.)
#
+# This file also runs against the limit of ARMv7's ADR pseudo-instruction. ADR
+# expands to an ADD or SUB of the pc register to find an address. That immediate
+# must fit in ARM's encoding scheme: 8 bits of constant and 4 bits of rotation.
+# This means larger values must be more aligned.
+#
+# ARM additionally has two encodings, ARM and Thumb mode. Our assembly files may
+# use either encoding (do we actually need to support this?). In ARM mode, the
+# distances get large enough to require 16-byte alignment. Moving constants
+# closer to their use resolves most of this, but common constants in
+# _vpaes_consts are used by the whole file. Affected ADR instructions must be
+# placed at 8 mod 16 (the pc register is 8 ahead). Instructions with this
+# constraint have been commented.
+#
+# For details on ARM's immediate value encoding scheme, see
+# https://alisdair.mcdiarmid.org/arm-immediate-value-encoding/
+#
# Finally, a summary of armv7 and aarch64 SIMD syntax differences:
#
# * armv7 prefixes SIMD instructions with 'v', while aarch64 does not.
@@ -150,98 +166,6 @@
.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
-@
-@ Decryption stuff
-@
-.Lk_dipt: @ decryption input transform
- .quad 0x0F505B040B545F00, 0x154A411E114E451A
- .quad 0x86E383E660056500, 0x12771772F491F194
-.Lk_dsbo: @ decryption sbox final output
- .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
- .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.Lk_dsb9: @ decryption sbox output *9*u, *9*t
- .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
- .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
-.Lk_dsbd: @ decryption sbox output *D*u, *D*t
- .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
- .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
-.Lk_dsbb: @ decryption sbox output *B*u, *B*t
- .quad 0xD022649296B44200, 0x602646F6B0F2D404
- .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
-.Lk_dsbe: @ decryption sbox output *E*u, *E*t
- .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
- .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
-
-@
-@ Key schedule constants
-@
-.Lk_dksd: @ decryption key schedule: invskew x*D
- .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
- .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
-.Lk_dksb: @ decryption key schedule: invskew x*B
- .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
- .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
-.Lk_dkse: @ decryption key schedule: invskew x*E + 0x63
- .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
- .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
-.Lk_dks9: @ decryption key schedule: invskew x*9
- .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
- .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
-
-.Lk_rcon: @ rcon
- .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
-
-.Lk_opt: @ output transform
- .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
- .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
-.Lk_deskew: @ deskew tables: inverts the sbox's "skew"
- .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
- .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
-
-
-@ Additional constants for converting to bsaes.
-
-@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
-@ transform in the AES S-box. 0x63 is incorporated into the low half of the
-@ table. This was computed with the following script:
-@
-@ def u64s_to_u128(x, y):
-@ return x | (y << 64)
-@ def u128_to_u64s(w):
-@ return w & ((1<<64)-1), w >> 64
-@ def get_byte(w, i):
-@ return (w >> (i*8)) & 0xff
-@ def apply_table(table, b):
-@ lo = b & 0xf
-@ hi = b >> 4
-@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
-@ def opt(b):
-@ table = [
-@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
-@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
-@ ]
-@ return apply_table(table, b)
-@ def rot_byte(b, n):
-@ return 0xff & ((b << n) | (b >> (8-n)))
-@ def skew(x):
-@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
-@ rot_byte(x, 4))
-@ table = [0, 0]
-@ for i in range(16):
-@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
-@ table[1] |= skew(opt(i<<4)) << (i*8)
-@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
-@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
-.Lk_opt_then_skew:
- .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
- .quad 0x1f30062936192f00, 0xb49bad829db284ab
-
-@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
-@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
-@ becomes 0x22334411 and then 0x11443322.
-.Lk_decrypt_transform:
- .quad 0x0704050603000102, 0x0f0c0d0e0b08090a
-
.asciz "Vector Permutation AES for ARMv7 NEON, Mike Hamburg (Stanford University)"
.size _vpaes_consts,.-_vpaes_consts
.align 6
@@ -406,6 +330,31 @@
ldmia sp!, {r7-r11, pc} @ return
.size vpaes_encrypt,.-vpaes_encrypt
+@
+@ Decryption stuff
+@
+.type _vpaes_decrypt_consts,%object
+.align 4
+.Lk_dipt: @ decryption input transform
+ .quad 0x0F505B040B545F00, 0x154A411E114E451A
+ .quad 0x86E383E660056500, 0x12771772F491F194
+.Lk_dsbo: @ decryption sbox final output
+ .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+ .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.Lk_dsb9: @ decryption sbox output *9*u, *9*t
+ .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
+ .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd: @ decryption sbox output *D*u, *D*t
+ .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+ .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb: @ decryption sbox output *B*u, *B*t
+ .quad 0xD022649296B44200, 0x602646F6B0F2D404
+ .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe: @ decryption sbox output *E*u, *E*t
+ .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
+ .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts
+
@@
@@ Decryption core
@@
@@ -613,12 +562,42 @@
@ We pin some constants for convenience and leave q14 and q15 free to load
@ others on demand.
+@
+@ Key schedule constants
+@
+.type _vpaes_key_consts,%object
+.align 4
+_vpaes_key_consts:
+.Lk_dksd: @ decryption key schedule: invskew x*D
+ .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+ .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb: @ decryption key schedule: invskew x*B
+ .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
+ .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse: @ decryption key schedule: invskew x*E + 0x63
+ .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
+ .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9: @ decryption key schedule: invskew x*9
+ .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
+ .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+.Lk_rcon: @ rcon
+ .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_opt: @ output transform
+ .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
+ .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+.Lk_deskew: @ deskew tables: inverts the sbox's "skew"
+ .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+ .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+.size _vpaes_key_consts,.-_vpaes_key_consts
+
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
- adr r10, .Lk_inv
adr r11, .Lk_rcon
vmov.i8 $s63, #0x5b @ .Lk_s63
+ adr r10, .Lk_inv @ Must be aligned to 8 mod 16.
vmov.i8 $s0F, #0x0f @ .Lk_s0F
vld1.64 {$invlo,$invhi}, [r10] @ .Lk_inv
vld1.64 {$rcon}, [r11] @ .Lk_rcon
@@ -634,17 +613,17 @@
bl _vpaes_key_preheat @ load the tables
+ adr r11, .Lk_ipt @ Must be aligned to 8 mod 16.
vld1.64 {q0}, [$inp]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned)
@ input transform
@ Use q4 here rather than q3 so .Lschedule_am_decrypting does not
@ overlap table and destination.
vmov q4, q0 @ vmovdqa %xmm0, %xmm3
- adr r11, .Lk_ipt
bl _vpaes_schedule_transform
+ adr r10, .Lk_sr @ Must be aligned to 8 mod 16.
vmov q7, q0 @ vmovdqa %xmm0, %xmm7
- adr r10, .Lk_sr @ lea .Lk_sr(%rip),%r10
add r8, r8, r10
tst $dir, $dir
bne .Lschedule_am_decrypting
@@ -957,10 +936,10 @@
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
- adr r11, .Lk_mc_forward
- vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
- vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
tst $dir, $dir
+ vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later
+ adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5
bne .Lschedule_mangle_dec
@ encrypting
@@ -1096,6 +1075,53 @@
my ($s0F, $s63, $s63_raw, $mc_forward) = map("q$_", (9..12));
$code .= <<___;
+
+@ Additional constants for converting to bsaes.
+.type _vpaes_convert_consts,%object
+.align 4
+_vpaes_convert_consts:
+@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear
+@ transform in the AES S-box. 0x63 is incorporated into the low half of the
+@ table. This was computed with the following script:
+@
+@ def u64s_to_u128(x, y):
+@ return x | (y << 64)
+@ def u128_to_u64s(w):
+@ return w & ((1<<64)-1), w >> 64
+@ def get_byte(w, i):
+@ return (w >> (i*8)) & 0xff
+@ def apply_table(table, b):
+@ lo = b & 0xf
+@ hi = b >> 4
+@ return get_byte(table[0], lo) ^ get_byte(table[1], hi)
+@ def opt(b):
+@ table = [
+@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808),
+@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0),
+@ ]
+@ return apply_table(table, b)
+@ def rot_byte(b, n):
+@ return 0xff & ((b << n) | (b >> (8-n)))
+@ def skew(x):
+@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^
+@ rot_byte(x, 4))
+@ table = [0, 0]
+@ for i in range(16):
+@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8)
+@ table[1] |= skew(opt(i<<4)) << (i*8)
+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[0]))
+@ print("\t.quad\t0x%016x, 0x%016x" % u128_to_u64s(table[1]))
+.Lk_opt_then_skew:
+ .quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b
+ .quad 0x1f30062936192f00, 0xb49bad829db284ab
+
+@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation
+@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344
+@ becomes 0x22334411 and then 0x11443322.
+.Lk_decrypt_transform:
+ .quad 0x0704050603000102, 0x0f0c0d0e0b08090a
+.size _vpaes_convert_consts,.-_vpaes_convert_consts
+
@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes);
.globl vpaes_encrypt_key_to_bsaes
.type vpaes_encrypt_key_to_bsaes,%function
@@ -1115,12 +1141,13 @@
@ byteswapped, as a convenience for (unsupported) big-endian ARM, at the
@ cost of extra REV and VREV32 operations in little-endian ARM.
- adr r2, .Lk_mc_forward
- adr r3, .Lk_sr+0x10
- adr r11, .Lk_opt @ Input to _vpaes_schedule_transform.
- vld1.64 {$mc_forward}, [r2]
vmov.i8 $s0F, #0x0f @ Required by _vpaes_schedule_transform
+ adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16.
+ add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression)
+
+ vld1.64 {$mc_forward}, [r2]
vmov.i8 $s63, #0x5b @ .Lk_s63 from vpaes-x86_64
+ adr r11, .Lk_opt @ Must be aligned to 8 mod 16.
vmov.i8 $s63_raw, #0x63 @ .LK_s63 without .Lk_ipt applied
@ vpaes stores one fewer round count than bsaes, but the number of keys