Simpler square-root computation for Ed25519

Description:
Mark Wooden and Franck Rondepierre noted that the square-root-mod-p
operations used in the EdDSA RFC (RFC 8032) can be simplified.  For
Ed25519, instead of computing u*v^3 * (u * v^7)^((p-5)/8), we can
compute u * (u*v)^((p-5)/8).  This saves 3 multiplications and 2
squarings.  For more details (including a proof), see the following
message from the CFRG mailing list:

  https://mailarchive.ietf.org/arch/msg/cfrg/qlKpMBqxXZYmDpXXIx6LO3Oznv4/

Testing:
Build and run the Ed25519 tests:

  mkdir build
  cd build
  cmake -GNinja ..
  ninja && ./crypto/crypto_test --gtest_filter="Ed25519Test*"

Numerical testing of the square-root computation can be done using the
following sage script:

  def legendre(x,p):
      return kronecker(x,p)

  # Ed25519
  p = 2**255-19
  # -1 is a square
  if legendre(-1,p)==1:
      print("-1 is a square")
  # 2 is a non-square
  if legendre(2,p)==-1:
      print("2 is a non-square")

  # 2 is a generator
  # this can be checked by factoring p-1
  # and then showing 2**((p-1)/q) != 1 (mod p)
  # for all primes q dividing p-1.

  # suppose u/v is a square.
  # to compute one of its square roots, find x such that
  #    x**4 == (u/v)**2 .
  # this implies
  #    x**2 ==  u/v, or
  #    x**2 == -(u/v) ,
  # which implies either x or i*x is a square-root of u/v (where i is a square root of -1).
  # we can take x equal to u * (u*v)**((p-5)/8).

  g = 2
  s = p>>2  # s = (p-1)/4
  i = power_mod(g, s, p)

  t = p>>3  # t = (p-5)/8
  COUNT = 1<<18
  while COUNT > 0:
      COUNT -= 1

      r = randint(0,p-1)   # r = u/v
      v = randint(1,p-1)
      u = mod(r*v,p)

      # compute x = u * (u*v)**((p-5)/8)
      w = mod(u*v,p)
      x = mod(u*power_mod(w, t, p), p)

      # check that x**2 == r, or (i*x)**2 == r, or r is not a square
      rr = power_mod(x, 2, p)
      if rr==r:
          continue

      rr = power_mod(mod(i*x,p), 2, p)
      if rr==r:
          continue

      if legendre(r,p) != 1:
          continue

      print("failure!")
      exit()

  print("passed!")

Change-Id: Iaa284d3365dd8c9fa18a4584121013f05a3f4cc6
Reviewed-on: https://boringssl-review.googlesource.com/c/boringssl/+/50965
Reviewed-by: David Benjamin <davidben@google.com>
Reviewed-by: Adam Langley <agl@google.com>
Commit-Queue: Adam Langley <agl@google.com>
diff --git a/crypto/curve25519/curve25519.c b/crypto/curve25519/curve25519.c
index 64aa1e6..7cb0add 100644
--- a/crypto/curve25519/curve25519.c
+++ b/crypto/curve25519/curve25519.c
@@ -502,27 +502,21 @@
 int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) {
   fe u;
   fe_loose v;
-  fe v3;
+  fe w;
   fe vxx;
   fe_loose check;
 
   fe_frombytes(&h->Y, s);
   fe_1(&h->Z);
-  fe_sq_tt(&v3, &h->Y);
-  fe_mul_ttt(&vxx, &v3, &d);
-  fe_sub(&v, &v3, &h->Z);  // u = y^2-1
+  fe_sq_tt(&w, &h->Y);
+  fe_mul_ttt(&vxx, &w, &d);
+  fe_sub(&v, &w, &h->Z);  // u = y^2-1
   fe_carry(&u, &v);
   fe_add(&v, &vxx, &h->Z);  // v = dy^2+1
 
-  fe_sq_tl(&v3, &v);
-  fe_mul_ttl(&v3, &v3, &v);  // v3 = v^3
-  fe_sq_tt(&h->X, &v3);
-  fe_mul_ttl(&h->X, &h->X, &v);
-  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^7
-
-  fe_pow22523(&h->X, &h->X);  // x = (uv^7)^((q-5)/8)
-  fe_mul_ttt(&h->X, &h->X, &v3);
-  fe_mul_ttt(&h->X, &h->X, &u);  // x = uv^3(uv^7)^((q-5)/8)
+  fe_mul_ttl(&w, &u, &v);  // w = u*v
+  fe_pow22523(&h->X, &w);  // x = w^((q-5)/8)
+  fe_mul_ttt(&h->X, &h->X, &u);  // x = u*w^((q-5)/8)
 
   fe_sq_tt(&vxx, &h->X);
   fe_mul_ttl(&vxx, &vxx, &v);