boringssl / boringssl / 5ce12e6436c14085d9dbd86a89165b0b61ac4bd8 / . / crypto / fipsmodule / modes / asm / ghash-ssse3-x86.pl

#!/usr/bin/env perl | |

# Copyright (c) 2019, Google Inc. | |

# | |

# Permission to use, copy, modify, and/or distribute this software for any | |

# purpose with or without fee is hereby granted, provided that the above | |

# copyright notice and this permission notice appear in all copies. | |

# | |

# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES | |

# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF | |

# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY | |

# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES | |

# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION | |

# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN | |

# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. | |

# ghash-ssse3-x86.pl is a constant-time variant of the traditional 4-bit | |

# table-based GHASH implementation. It requires SSSE3 instructions. | |

# | |

# For background, the table-based strategy is a 4-bit windowed multiplication. | |

# It precomputes all 4-bit multiples of H (this is 16 128-bit rows), then loops | |

# over 4-bit windows of the input and indexes them up into the table. Visually, | |

# it multiplies as in the schoolbook multiplication diagram below, but with | |

# more terms. (Each term is 4 bits, so there are 32 terms in each row.) First | |

# it incorporates the terms labeled '1' by indexing the most significant term | |

# of X into the table. Then it shifts and repeats for '2' and so on. | |

# | |

# hhhhhh | |

# * xxxxxx | |

# ============ | |

# 666666 | |

# 555555 | |

# 444444 | |

# 333333 | |

# 222222 | |

# 111111 | |

# | |

# This implementation changes the order. We treat the table as a 16×16 matrix | |

# and transpose it. The first row is then the first byte of each multiple of H, | |

# and so on. We then reorder terms as below. Observe that the terms labeled '1' | |

# and '2' are all lookups into the first row, etc. This maps well to the SSSE3 | |

# pshufb instruction, using alternating terms of X in parallel as indices. This | |

# alternation is needed because pshufb maps 4 bits to 8 bits. Then we shift and | |

# repeat for each row. | |

# | |

# hhhhhh | |

# * xxxxxx | |

# ============ | |

# 224466 | |

# 113355 | |

# 224466 | |

# 113355 | |

# 224466 | |

# 113355 | |

# | |

# Next we account for GCM's confusing bit order. The "first" bit is the least | |

# significant coefficient, but GCM treats the most sigificant bit within a byte | |

# as first. Bytes are little-endian, and bits are big-endian. We reverse the | |

# bytes in XMM registers for a consistent bit and byte ordering, but this means | |

# the least significant bit is the most significant coefficient and vice versa. | |

# | |

# For consistency, "low", "high", "left-shift", and "right-shift" refer to the | |

# bit ordering within the XMM register, rather than the reversed coefficient | |

# ordering. Low bits are less significant bits and more significant | |

# coefficients. Right-shifts move from MSB to the LSB and correspond to | |

# increasing the power of each coefficient. | |

# | |

# Note this bit reversal enters into the table's column indices. H*1 is stored | |

# in column 0b1000 and H*x^3 is stored in column 0b0001. It also means earlier | |

# table rows contain more significant coefficients, so we iterate forwards. | |

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; | |

push(@INC,"${dir}","${dir}../../../perlasm"); | |

require "x86asm.pl"; | |

$output = pop; | |

open STDOUT, ">$output"; | |

&asm_init($ARGV[0]); | |

my ($Xi, $Htable, $in, $len) = ("edi", "esi", "edx", "ecx"); | |

&static_label("reverse_bytes"); | |

&static_label("low4_mask"); | |

my $call_counter = 0; | |

# process_rows emits assembly code to process $rows rows of the table. On | |

# input, $Htable stores the pointer to the next row. xmm0 and xmm1 store the | |

# low and high halves of the input. The result so far is passed in xmm2. xmm3 | |

# must be zero. On output, $Htable is advanced to the next row and xmm2 is | |

# updated. xmm3 remains zero. It clobbers eax, xmm4, xmm5, and xmm6. | |

sub process_rows { | |

my ($rows) = @_; | |

$call_counter++; | |

# Shifting whole XMM registers by bits is complex. psrldq shifts by | |

# bytes, and psrlq shifts the two 64-bit halves separately. Each row | |

# produces 8 bits of carry, and the reduction needs an additional 7-bit | |

# shift. This must fit in 64 bits so reduction can use psrlq. This | |

# allows up to 7 rows at a time. | |

die "Carry register would overflow 64 bits." if ($rows*8 + 7 > 64); | |

&mov("eax", $rows); | |

&set_label("loop_row_$call_counter"); | |

&movdqa("xmm4", &QWP(0, $Htable)); | |

&lea($Htable, &DWP(16, $Htable)); | |

# Right-shift xmm2 and xmm3 by 8 bytes. | |

&movdqa("xmm6", "xmm2"); | |

&palignr("xmm6", "xmm3", 1); | |

&movdqa("xmm3", "xmm6"); | |

&psrldq("xmm2", 1); | |

# Load the next table row and index the low and high bits of the input. | |

# Note the low (respectively, high) half corresponds to more | |

# (respectively, less) significant coefficients. | |

&movdqa("xmm5", "xmm4"); | |

&pshufb("xmm4", "xmm0"); | |

&pshufb("xmm5", "xmm1"); | |

# Add the high half (xmm5) without shifting. | |

&pxor("xmm2", "xmm5"); | |

# Add the low half (xmm4). This must be right-shifted by 4 bits. First, | |

# add into the carry register (xmm3). | |

&movdqa("xmm5", "xmm4"); | |

&psllq("xmm5", 60); | |

&movdqa("xmm6", "xmm5"); | |

&pslldq("xmm6", 8); | |

&pxor("xmm3", "xmm6"); | |

# Next, add into xmm2. | |

&psrldq("xmm5", 8); | |

&pxor("xmm2", "xmm5"); | |

&psrlq("xmm4", 4); | |

&pxor("xmm2", "xmm4"); | |

&sub("eax", 1); | |

&jnz(&label("loop_row_$call_counter")); | |

# Reduce the carry register. The reduction polynomial is 1 + x + x^2 + | |

# x^7, so we shift and XOR four times. | |

&pxor("xmm2", "xmm3"); # x^0 = 0 | |

&psrlq("xmm3", 1); | |

&pxor("xmm2", "xmm3"); # x^1 = x | |

&psrlq("xmm3", 1); | |

&pxor("xmm2", "xmm3"); # x^(1+1) = x^2 | |

&psrlq("xmm3", 5); | |

&pxor("xmm2", "xmm3"); # x^(1+1+5) = x^7 | |

&pxor("xmm3", "xmm3"); | |

____ | |

} | |

# gcm_gmult_ssse3 multiplies |Xi| by |Htable| and writes the result to |Xi|. | |

# |Xi| is represented in GHASH's serialized byte representation. |Htable| is | |

# formatted as described above. | |

# void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); | |

&function_begin("gcm_gmult_ssse3"); | |

&mov($Xi, &wparam(0)); | |

&mov($Htable, &wparam(1)); | |

&movdqu("xmm0", &QWP(0, $Xi)); | |

&call(&label("pic_point")); | |

&set_label("pic_point"); | |

&blindpop("eax"); | |

&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "eax")); | |

&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "eax")); | |

# Reverse input bytes to deserialize. | |

&pshufb("xmm0", "xmm7"); | |

# Split each byte into low (xmm0) and high (xmm1) halves. | |

&movdqa("xmm1", "xmm2"); | |

&pandn("xmm1", "xmm0"); | |

&psrld("xmm1", 4); | |

&pand("xmm0", "xmm2"); | |

# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note | |

# that, due to bit reversal, xmm3 contains bits that fall off when | |

# right-shifting, not left-shifting. | |

&pxor("xmm2", "xmm2"); | |

&pxor("xmm3", "xmm3"); | |

# We must reduce at least once every 7 rows, so divide into three | |

# chunks. | |

&process_rows(5); | |

&process_rows(5); | |

&process_rows(6); | |

# Store the result. Reverse bytes to serialize. | |

&pshufb("xmm2", "xmm7"); | |

&movdqu(&QWP(0, $Xi), "xmm2"); | |

# Zero any registers which contain secrets. | |

&pxor("xmm0", "xmm0"); | |

&pxor("xmm1", "xmm1"); | |

&pxor("xmm2", "xmm2"); | |

&pxor("xmm3", "xmm3"); | |

&pxor("xmm4", "xmm4"); | |

&pxor("xmm5", "xmm5"); | |

&pxor("xmm6", "xmm6"); | |

&function_end("gcm_gmult_ssse3"); | |

# gcm_ghash_ssse3 incorporates |len| bytes from |in| to |Xi|, using |Htable| as | |

# the key. It writes the result back to |Xi|. |Xi| is represented in GHASH's | |

# serialized byte representation. |Htable| is formatted as described above. | |

# void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, | |

# size_t len); | |

&function_begin("gcm_ghash_ssse3"); | |

&mov($Xi, &wparam(0)); | |

&mov($Htable, &wparam(1)); | |

&mov($in, &wparam(2)); | |

&mov($len, &wparam(3)); | |

&movdqu("xmm0", &QWP(0, $Xi)); | |

&call(&label("pic_point")); | |

&set_label("pic_point"); | |

&blindpop("ebx"); | |

&movdqa("xmm7", &QWP(&label("reverse_bytes")."-".&label("pic_point"), "ebx")); | |

# This function only processes whole blocks. | |

&and($len, -16); | |

# Reverse input bytes to deserialize. We maintain the running | |

# total in xmm0. | |

&pshufb("xmm0", "xmm7"); | |

# Iterate over each block. On entry to each iteration, xmm3 is zero. | |

&pxor("xmm3", "xmm3"); | |

&set_label("loop_ghash"); | |

&movdqa("xmm2", &QWP(&label("low4_mask")."-".&label("pic_point"), "ebx")); | |

# Incorporate the next block of input. | |

&movdqu("xmm1", &QWP(0, $in)); | |

&pshufb("xmm1", "xmm7"); # Reverse bytes. | |

&pxor("xmm0", "xmm1"); | |

# Split each byte into low (xmm0) and high (xmm1) halves. | |

&movdqa("xmm1", "xmm2"); | |

&pandn("xmm1", "xmm0"); | |

&psrld("xmm1", 4); | |

&pand("xmm0", "xmm2"); | |

# Maintain the result in xmm2 (the value) and xmm3 (carry bits). Note | |

# that, due to bit reversal, xmm3 contains bits that fall off when | |

# right-shifting, not left-shifting. | |

&pxor("xmm2", "xmm2"); | |

# xmm3 is already zero at this point. | |

# We must reduce at least once every 7 rows, so divide into three | |

# chunks. | |

&process_rows(5); | |

&process_rows(5); | |

&process_rows(6); | |

&movdqa("xmm0", "xmm2"); | |

# Rewind $Htable for the next iteration. | |

&lea($Htable, &DWP(-256, $Htable)); | |

# Advance input and continue. | |

&lea($in, &DWP(16, $in)); | |

&sub($len, 16); | |

&jnz(&label("loop_ghash")); | |

# Reverse bytes and store the result. | |

&pshufb("xmm0", "xmm7"); | |

&movdqu(&QWP(0, $Xi), "xmm0"); | |

# Zero any registers which contain secrets. | |

&pxor("xmm0", "xmm0"); | |

&pxor("xmm1", "xmm1"); | |

&pxor("xmm2", "xmm2"); | |

&pxor("xmm3", "xmm3"); | |

&pxor("xmm4", "xmm4"); | |

&pxor("xmm5", "xmm5"); | |

&pxor("xmm6", "xmm6"); | |

&function_end("gcm_ghash_ssse3"); | |

# reverse_bytes is a permutation which, if applied with pshufb, reverses the | |

# bytes in an XMM register. | |

&set_label("reverse_bytes", 16); | |

&data_byte(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); | |

# low4_mask is an XMM mask which selects the low four bits of each byte. | |

&set_label("low4_mask", 16); | |

&data_word(0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f); | |

&asm_finish(); | |

close STDOUT; |