Initial import.

Initial fork from f2d678e6e89b6508147086610e985d4e8416e867 (OpenSSL 1.0.2 beta).

(This change contains substantial modifications from the original and
effectively starts a new history.)
diff --git a/crypto/bn/CMakeLists.txt b/crypto/bn/CMakeLists.txt
new file mode 100644
index 0000000..5d2dfcc
--- /dev/null
+++ b/crypto/bn/CMakeLists.txt
@@ -0,0 +1,81 @@
+include_directories(. .. ../../include)
+
+if (${ARCH} STREQUAL "x86_64")
+	set(
+		BN_ARCH_SOURCES
+
+		asm/x86_64-gcc.c
+		x86_64-mont.${ASM_EXT}
+		x86_64-mont5.${ASM_EXT}
+		modexp512-x86_64.${ASM_EXT}
+		rsaz-x86_64.${ASM_EXT}
+		rsaz-avx2.${ASM_EXT}
+
+		rsaz_exp.c
+	)
+endif()
+
+if (${ARCH} STREQUAL "x86")
+	set(
+		BN_ARCH_SOURCES
+
+		bn-586.${ASM_EXT}
+		co-586.${ASM_EXT}
+		x86-mont.${ASM_EXT}
+		x86.${ASM_EXT}
+	)
+endif()
+
+if (${ARCH} STREQUAL "arm")
+	set(
+		BN_ARCH_SOURCES
+
+		armv4-mont.${ASM_EXT}
+	)
+endif()
+
+add_library(
+	bn
+
+	OBJECT
+
+	bn_error.c
+	add.c
+	bn.c
+	cmp.c
+	convert.c
+	ctx.c
+	div.c
+	exponentiation.c
+	generic.c
+	gcd.c
+	kronecker.c
+	montgomery.c
+	mul.c
+	prime.c
+	random.c
+	shift.c
+	sqrt.c
+
+	${BN_ARCH_SOURCES}
+)
+
+perlasm(x86_64-mont.${ASM_EXT} asm/x86_64-mont.pl)
+perlasm(x86_64-mont5.${ASM_EXT} asm/x86_64-mont5.pl)
+perlasm(x86_64-gf2m.${ASM_EXT} asm/x86_64-gf2m.pl)
+perlasm(modexp512-x86_64.${ASM_EXT} asm/modexp512-x86_64.pl)
+perlasm(rsaz-x86_64.${ASM_EXT} asm/rsaz-x86_64.pl)
+perlasm(rsaz-avx2.${ASM_EXT} asm/rsaz-avx2.pl)
+perlasm(bn-586.${ASM_EXT} asm/bn-586.pl)
+perlasm(co-586.${ASM_EXT} asm/co-586.pl)
+perlasm(x86-mont.${ASM_EXT} asm/x86-mont.pl)
+perlasm(x86.${ASM_EXT} asm/x86.pl)
+perlasm(armv4-mont.${ASM_EXT} asm/armv4-mont.pl)
+
+add_executable(
+	bn_test
+
+	bn_test.c
+)
+
+target_link_libraries(bn_test crypto)
diff --git a/crypto/bn/add.c b/crypto/bn/add.c
new file mode 100644
index 0000000..1c6b2d7
--- /dev/null
+++ b/crypto/bn/add.c
@@ -0,0 +1,394 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  const BIGNUM *tmp;
+  int a_neg = a->neg, ret;
+
+  /*  a +  b	a+b
+   *  a + -b	a-b
+   * -a +  b	b-a
+   * -a + -b	-(a+b)
+   */
+  if (a_neg ^ b->neg) {
+    /* only one is negative */
+    if (a_neg) {
+      tmp = a;
+      a = b;
+      b = tmp;
+    }
+
+    /* we are now a - b */
+    if (BN_ucmp(a, b) < 0) {
+      if (!BN_usub(r, b, a)) {
+        return 0;
+      }
+      r->neg = 1;
+    } else {
+      if (!BN_usub(r, a, b)) {
+        return 0;
+      }
+      r->neg = 0;
+    }
+    return 1;
+  }
+
+  ret = BN_uadd(r, a, b);
+  r->neg = a_neg;
+  return ret;
+}
+
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  int max, min, dif;
+  BN_ULONG *ap, *bp, *rp, carry, t1, t2;
+  const BIGNUM *tmp;
+
+  if (a->top < b->top) {
+    tmp = a;
+    a = b;
+    b = tmp;
+  }
+  max = a->top;
+  min = b->top;
+  dif = max - min;
+
+  if (bn_wexpand(r, max + 1) == NULL) {
+    return 0;
+  }
+
+  r->top = max;
+
+  ap = a->d;
+  bp = b->d;
+  rp = r->d;
+
+  carry = bn_add_words(rp, ap, bp, min);
+  rp += min;
+  ap += min;
+  bp += min;
+
+  if (carry) {
+    while (dif) {
+      dif--;
+      t1 = *(ap++);
+      t2 = (t1 + 1) & BN_MASK2;
+      *(rp++) = t2;
+      if (t2) {
+        carry = 0;
+        break;
+      }
+    }
+    if (carry) {
+      /* carry != 0 => dif == 0 */
+      *rp = 1;
+      r->top++;
+    }
+  }
+
+  if (dif && rp != ap) {
+    while (dif--) {
+      /* copy remaining words if ap != rp */
+      *(rp++) = *(ap++);
+    }
+  }
+
+  r->neg = 0;
+  return 1;
+}
+
+int BN_add_word(BIGNUM *a, BN_ULONG w) {
+  BN_ULONG l;
+  int i;
+
+  w &= BN_MASK2;
+
+  /* degenerate case: w is zero */
+  if (!w) {
+    return 1;
+  }
+
+  /* degenerate case: a is zero */
+  if (BN_is_zero(a)) {
+    return BN_set_word(a, w);
+  }
+
+  /* handle 'a' when negative */
+  if (a->neg) {
+    a->neg = 0;
+    i = BN_sub_word(a, w);
+    if (!BN_is_zero(a)) {
+      a->neg = !(a->neg);
+    }
+    return i;
+  }
+
+  for (i = 0; w != 0 && i < a->top; i++) {
+    a->d[i] = l = (a->d[i] + w) & BN_MASK2;
+    w = (w > l) ? 1 : 0;
+  }
+
+  if (w && i == a->top) {
+    if (bn_wexpand(a, a->top + 1) == NULL) {
+      return 0;
+    }
+    a->top++;
+    a->d[i] = w;
+  }
+
+  return 1;
+}
+
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  int max;
+  int add = 0, neg = 0;
+  const BIGNUM *tmp;
+
+  /*  a -  b	a-b
+   *  a - -b	a+b
+   * -a -  b	-(a+b)
+   * -a - -b	b-a
+   */
+  if (a->neg) {
+    if (b->neg) {
+      tmp = a;
+      a = b;
+      b = tmp;
+    } else {
+      add = 1;
+      neg = 1;
+    }
+  } else {
+    if (b->neg) {
+      add = 1;
+      neg = 0;
+    }
+  }
+
+  if (add) {
+    if (!BN_uadd(r, a, b)) {
+      return 0;
+    }
+
+    r->neg = neg;
+    return 1;
+  }
+
+  /* We are actually doing a - b :-) */
+
+  max = (a->top > b->top) ? a->top : b->top;
+  if (bn_wexpand(r, max) == NULL) {
+    return 0;
+  }
+
+  if (BN_ucmp(a, b) < 0) {
+    if (!BN_usub(r, b, a)) {
+      return 0;
+    }
+    r->neg = 1;
+  } else {
+    if (!BN_usub(r, a, b)) {
+      return 0;
+    }
+    r->neg = 0;
+  }
+
+  return 1;
+}
+
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) {
+  int max, min, dif;
+  register BN_ULONG t1, t2, *ap, *bp, *rp;
+  int i, carry;
+
+  max = a->top;
+  min = b->top;
+  dif = max - min;
+
+  if (dif < 0) {
+    /* hmm... should not be happening */
+    OPENSSL_PUT_ERROR(BN, BN_usub, BN_R_ARG2_LT_ARG3);
+    return 0;
+  }
+
+  if (bn_wexpand(r, max) == NULL) {
+    return 0;
+  }
+
+  ap = a->d;
+  bp = b->d;
+  rp = r->d;
+
+  carry = 0;
+  for (i = min; i != 0; i--) {
+    t1 = *(ap++);
+    t2 = *(bp++);
+    if (carry) {
+      carry = (t1 <= t2);
+      t1 = (t1 - t2 - 1) & BN_MASK2;
+    } else {
+      carry = (t1 < t2);
+      t1 = (t1 - t2) & BN_MASK2;
+    }
+    *(rp++) = t1 & BN_MASK2;
+  }
+
+  /* subtracted */
+  if (carry) {
+    if (!dif) {
+      /* error: a < b */
+      return 0;
+    }
+
+    while (dif) {
+      dif--;
+      t1 = *(ap++);
+      t2 = (t1 - 1) & BN_MASK2;
+      *(rp++) = t2;
+      if (t1) {
+        break;
+      }
+    }
+  }
+
+  if (rp != ap) {
+    for (;;) {
+      if (!dif--) {
+        break;
+      }
+      rp[0] = ap[0];
+      if (!dif--) {
+        break;
+      }
+      rp[1] = ap[1];
+      if (!dif--) {
+        break;
+      }
+      rp[2] = ap[2];
+      if (!dif--) {
+        break;
+      }
+      rp[3] = ap[3];
+      rp += 4;
+      ap += 4;
+    }
+  }
+
+  r->top = max;
+  r->neg = 0;
+  bn_correct_top(r);
+
+  return 1;
+}
+
+int BN_sub_word(BIGNUM *a, BN_ULONG w) {
+  int i;
+
+  w &= BN_MASK2;
+
+  /* degenerate case: w is zero */
+  if (!w) {
+    return 1;
+  }
+
+  /* degenerate case: a is zero */
+  if (BN_is_zero(a)) {
+    i = BN_set_word(a, w);
+    if (i != 0) {
+      BN_set_negative(a, 1);
+    }
+    return i;
+  }
+
+  /* handle 'a' when negative */
+  if (a->neg) {
+    a->neg = 0;
+    i = BN_add_word(a, w);
+    a->neg = 1;
+    return i;
+  }
+
+  if ((a->top == 1) && (a->d[0] < w)) {
+    a->d[0] = w - a->d[0];
+    a->neg = 1;
+    return 1;
+  }
+
+  i = 0;
+  for (;;) {
+    if (a->d[i] >= w) {
+      a->d[i] -= w;
+      break;
+    } else {
+      a->d[i] = (a->d[i] - w) & BN_MASK2;
+      i++;
+      w = 1;
+    }
+  }
+
+  if ((a->d[i] == 0) && (i == (a->top - 1))) {
+    a->top--;
+  }
+
+  return 1;
+}
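
The sign tables in BN_add() and BN_sub() above reduce every signed case to the unsigned primitives BN_uadd() and BN_usub(). A minimal usage sketch of the resulting sign conventions, with illustrative values only (error checking omitted, and the neg field read directly, as the code above does):

    #include <stdio.h>
    #include <openssl/bn.h>

    int main(void) {
      BIGNUM *a = BN_new(), *b = BN_new(), *r = BN_new();

      BN_set_word(a, 5);
      BN_set_word(b, 9);
      BN_set_negative(b, 1);   /* b = -9 */

      BN_add(r, a, b);         /* 5 + (-9): signs differ, |a| < |b|, so BN_usub(r, b, a) runs */
      printf("r->neg = %d\n", r->neg);   /* 1, i.e. r == -4 */

      BN_sub(r, b, a);         /* (-9) - 5: add = 1, neg = 1, so r = -(9 + 5) */
      printf("r->neg = %d\n", r->neg);   /* 1, i.e. r == -14 */

      BN_free(a);
      BN_free(b);
      BN_free(r);
      return 0;
    }
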
diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl
new file mode 100644
index 0000000..fe81f9b
--- /dev/null
+++ b/crypto/bn/asm/armv4-mont.pl
@@ -0,0 +1,669 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2007.
+
+# Montgomery multiplication for ARMv4.
+#
+# Performance improvement naturally varies among CPU implementations
+# and compilers. The code was observed to provide +65-35% improvement
+# [depending on key length, less for longer keys] on ARM920T, and
+# +115-80% on Intel IXP425. This is compared to pre-bn_mul_mont code
+# base and compiler generated code with in-lined umull and even umlal
+# instructions. The latter means that this code didn't really have an 
+# "advantage" of utilizing some "secret" instruction.
+#
+# The code is interoperable with Thumb ISA and is rather compact, less
+# than 1/2KB. A Windows CE port would be trivial, as it's exclusively
+# about decorations; the ABI and instruction syntax are identical.
+
+# November 2013
+#
+# Add NEON code path, which handles lengths divisible by 8. RSA/DSA
+# performance improvement on Cortex-A8 is ~45-100% depending on key
+# length, more for longer keys. On Cortex-A15 the span is ~10-105%.
+# On Snapdragon S4 improvement was measured to vary from ~70% to
+# incredible ~380%, yes, 4.8x faster, for RSA4096 sign. But this is
+# rather because original integer-only code seems to perform
+# suboptimally on S4. The situation on Cortex-A9 is unfortunately
+# different. It's being looked into, but the trouble is that
+# performance for vectors longer than 256 bits is actually a couple
+# of percent worse than for integer-only code. The code is chosen
+# for execution on all NEON-capable processors, because gain on
+# others outweighs the marginal loss on Cortex-A9.
+
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$num="r0";	# starts as num argument, but holds &tp[num-1]
+$ap="r1";
+$bp="r2"; $bi="r2"; $rp="r2";
+$np="r3";
+$tp="r4";
+$aj="r5";
+$nj="r6";
+$tj="r7";
+$n0="r8";
+###########	# r9 is reserved by ELF as platform specific, e.g. TLS pointer
+$alo="r10";	# sl, gcc uses it to keep @GOT
+$ahi="r11";	# fp
+$nlo="r12";	# ip
+###########	# r13 is stack pointer
+$nhi="r14";	# lr
+###########	# r15 is program counter
+
+#### argument block layout relative to &tp[num-1], a.k.a. $num
+$_rp="$num,#12*4";
+# ap permanently resides in r1
+$_bp="$num,#13*4";
+# np permanently resides in r3
+$_n0="$num,#14*4";
+$_num="$num,#15*4";	$_bpend=$_num;
+
+$code=<<___;
+#include "arm_arch.h"
+
+.text
+.code	32
+
+#if __ARM_ARCH__>=7
+.align	5
+.LOPENSSL_armcap:
+.word	OPENSSL_armcap_P-bn_mul_mont
+#endif
+
+.global	bn_mul_mont
+.type	bn_mul_mont,%function
+
+.align	5
+bn_mul_mont:
+	ldr	ip,[sp,#4]		@ load num
+	stmdb	sp!,{r0,r2}		@ sp points at argument block
+#if __ARM_ARCH__>=7
+	tst	ip,#7
+	bne	.Lialu
+	adr	r0,bn_mul_mont
+	ldr	r2,.LOPENSSL_armcap
+	ldr	r0,[r0,r2]
+	tst	r0,#1			@ NEON available?
+	ldmia	sp, {r0,r2}
+	beq	.Lialu
+	add	sp,sp,#8
+	b	bn_mul8x_mont_neon
+.align	4
+.Lialu:
+#endif
+	cmp	ip,#2
+	mov	$num,ip			@ load num
+	movlt	r0,#0
+	addlt	sp,sp,#2*4
+	blt	.Labrt
+
+	stmdb	sp!,{r4-r12,lr}		@ save 10 registers
+
+	mov	$num,$num,lsl#2		@ rescale $num for byte count
+	sub	sp,sp,$num		@ alloca(4*num)
+	sub	sp,sp,#4		@ +extra dword
+	sub	$num,$num,#4		@ "num=num-1"
+	add	$tp,$bp,$num		@ &bp[num-1]
+
+	add	$num,sp,$num		@ $num to point at &tp[num-1]
+	ldr	$n0,[$_n0]		@ &n0
+	ldr	$bi,[$bp]		@ bp[0]
+	ldr	$aj,[$ap],#4		@ ap[0],ap++
+	ldr	$nj,[$np],#4		@ np[0],np++
+	ldr	$n0,[$n0]		@ *n0
+	str	$tp,[$_bpend]		@ save &bp[num]
+
+	umull	$alo,$ahi,$aj,$bi	@ ap[0]*bp[0]
+	str	$n0,[$_n0]		@ save n0 value
+	mul	$n0,$alo,$n0		@ "tp[0]"*n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"t[0]"
+	mov	$tp,sp
+
+.L1st:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	mov	$alo,$ahi
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[0]
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.L1st
+
+	adds	$nlo,$nlo,$ahi
+	ldr	$tp,[$_bp]		@ restore bp
+	mov	$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+.Louter:
+	sub	$tj,$num,sp		@ "original" $num-1 value
+	sub	$ap,$ap,$tj		@ "rewind" ap to &ap[1]
+	ldr	$bi,[$tp,#4]!		@ *(++bp)
+	sub	$np,$np,$tj		@ "rewind" np to &np[1]
+	ldr	$aj,[$ap,#-4]		@ ap[0]
+	ldr	$alo,[sp]		@ tp[0]
+	ldr	$nj,[$np,#-4]		@ np[0]
+	ldr	$tj,[sp,#4]		@ tp[1]
+
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[0]*bp[i]+tp[0]
+	str	$tp,[$_bp]		@ save bp
+	mul	$n0,$alo,$n0
+	mov	$nlo,#0
+	umlal	$alo,$nlo,$nj,$n0	@ np[0]*n0+"tp[0]"
+	mov	$tp,sp
+
+.Linner:
+	ldr	$aj,[$ap],#4		@ ap[j],ap++
+	adds	$alo,$ahi,$tj		@ +=tp[j]
+	ldr	$nj,[$np],#4		@ np[j],np++
+	mov	$ahi,#0
+	umlal	$alo,$ahi,$aj,$bi	@ ap[j]*bp[i]
+	mov	$nhi,#0
+	umlal	$nlo,$nhi,$nj,$n0	@ np[j]*n0
+	adc	$ahi,$ahi,#0
+	ldr	$tj,[$tp,#8]		@ tp[j+1]
+	adds	$nlo,$nlo,$alo
+	str	$nlo,[$tp],#4		@ tp[j-1]=,tp++
+	adc	$nlo,$nhi,#0
+	cmp	$tp,$num
+	bne	.Linner
+
+	adds	$nlo,$nlo,$ahi
+	mov	$nhi,#0
+	ldr	$tp,[$_bp]		@ restore bp
+	adc	$nhi,$nhi,#0
+	ldr	$n0,[$_n0]		@ restore n0
+	adds	$nlo,$nlo,$tj
+	ldr	$tj,[$_bpend]		@ restore &bp[num]
+	adc	$nhi,$nhi,#0
+	str	$nlo,[$num]		@ tp[num-1]=
+	str	$nhi,[$num,#4]		@ tp[num]=
+
+	cmp	$tp,$tj
+	bne	.Louter
+
+	ldr	$rp,[$_rp]		@ pull rp
+	add	$num,$num,#4		@ $num to point at &tp[num]
+	sub	$aj,$num,sp		@ "original" num value
+	mov	$tp,sp			@ "rewind" $tp
+	mov	$ap,$tp			@ "borrow" $ap
+	sub	$np,$np,$aj		@ "rewind" $np to &np[0]
+
+	subs	$tj,$tj,$tj		@ "clear" carry flag
+.Lsub:	ldr	$tj,[$tp],#4
+	ldr	$nj,[$np],#4
+	sbcs	$tj,$tj,$nj		@ tp[j]-np[j]
+	str	$tj,[$rp],#4		@ rp[j]=
+	teq	$tp,$num		@ preserve carry
+	bne	.Lsub
+	sbcs	$nhi,$nhi,#0		@ upmost carry
+	mov	$tp,sp			@ "rewind" $tp
+	sub	$rp,$rp,$aj		@ "rewind" $rp
+
+	and	$ap,$tp,$nhi
+	bic	$np,$rp,$nhi
+	orr	$ap,$ap,$np		@ ap=borrow?tp:rp
+
+.Lcopy:	ldr	$tj,[$ap],#4		@ copy or in-place refresh
+	str	sp,[$tp],#4		@ zap tp
+	str	$tj,[$rp],#4
+	cmp	$tp,$num
+	bne	.Lcopy
+
+	add	sp,$num,#4		@ skip over tp[num+1]
+	ldmia	sp!,{r4-r12,lr}		@ restore registers
+	add	sp,sp,#2*4		@ skip over {r0,r2}
+	mov	r0,#1
+.Labrt:	tst	lr,#1
+	moveq	pc,lr			@ be binary compatible with V4, yet
+	bx	lr			@ interoperable with Thumb ISA:-)
+.size	bn_mul_mont,.-bn_mul_mont
+___
+{
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+my ($A0,$A1,$A2,$A3)=map("d$_",(0..3));
+my ($N0,$N1,$N2,$N3)=map("d$_",(4..7));
+my ($Z,$Temp)=("q4","q5");
+my ($A0xB,$A1xB,$A2xB,$A3xB,$A4xB,$A5xB,$A6xB,$A7xB)=map("q$_",(6..13));
+my ($Bi,$Ni,$M0)=map("d$_",(28..31));
+my $zero=&Dlo($Z);
+my $temp=&Dlo($Temp);
+
+my ($rptr,$aptr,$bptr,$nptr,$n0,$num)=map("r$_",(0..5));
+my ($tinptr,$toutptr,$inner,$outer)=map("r$_",(6..9));
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu	neon
+
+.type	bn_mul8x_mont_neon,%function
+.align	5
+bn_mul8x_mont_neon:
+	mov	ip,sp
+	stmdb	sp!,{r4-r11}
+	vstmdb	sp!,{d8-d15}		@ ABI specification says so
+	ldmia	ip,{r4-r5}		@ load rest of parameter block
+
+	sub		$toutptr,sp,#16
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	sub		$toutptr,$toutptr,$num,lsl#4
+	vld1.32		{$A0-$A3},  [$aptr]!		@ can't specify :32 :-(
+	and		$toutptr,$toutptr,#-64
+	vld1.32		{${M0}[0]}, [$n0,:32]
+	mov		sp,$toutptr			@ alloca
+	veor		$zero,$zero,$zero
+	subs		$inner,$num,#8
+	vzip.16		$Bi,$zero
+
+	vmull.u32	$A0xB,$Bi,${A0}[0]
+	vmull.u32	$A1xB,$Bi,${A0}[1]
+	vmull.u32	$A2xB,$Bi,${A1}[0]
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	vmull.u32	$A3xB,$Bi,${A1}[1]
+
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	veor		$zero,$zero,$zero
+	vmul.u32	$Ni,$temp,$M0
+
+	vmull.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+	vmull.u32	$A5xB,$Bi,${A2}[1]
+	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmull.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_1st
+
+	@ special case for num=8, everything is in register bank...
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	sub		$outer,$num,#1
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vmov		$Temp,$A0xB
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vmov		$A0xB,$A1xB
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmov		$A1xB,$A2xB
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vmov		$A2xB,$A3xB
+	vmov		$A3xB,$A4xB
+	vshr.u64	$temp,$temp,#16
+	vmov		$A4xB,$A5xB
+	vmov		$A5xB,$A6xB
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vmov		$A6xB,$A7xB
+	veor		$A7xB,$A7xB
+	vshr.u64	$temp,$temp,#16
+
+	b	.LNEON_outer8
+
+.align	4
+.LNEON_outer8:
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	veor		$zero,$zero,$zero
+	vzip.16		$Bi,$zero
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	veor		$zero,$zero,$zero
+	subs		$outer,$outer,#1
+	vmul.u32	$Ni,$temp,$M0
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vmov		$Temp,$A0xB
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vmov		$A0xB,$A1xB
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmov		$A1xB,$A2xB
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vmov		$A2xB,$A3xB
+	vmov		$A3xB,$A4xB
+	vshr.u64	$temp,$temp,#16
+	vmov		$A4xB,$A5xB
+	vmov		$A5xB,$A6xB
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vmov		$A6xB,$A7xB
+	veor		$A7xB,$A7xB
+	vshr.u64	$temp,$temp,#16
+
+	bne	.LNEON_outer8
+
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	mov		$toutptr,sp
+	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	mov		$inner,$num
+	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+	add		$tinptr,sp,#16
+	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
+	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+	b	.LNEON_tail2
+
+.align	4
+.LNEON_1st:
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	 vld1.32	{$A0-$A3}, [$aptr]!
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	subs		$inner,$inner,#8
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	 vld1.32	{$N0-$N1}, [$nptr]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	 vst1.64	{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	 vst1.64	{$A2xB-$A3xB}, [$toutptr,:256]!
+
+	vmull.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.32	{$N2-$N3}, [$nptr]!
+	vmull.u32	$A1xB,$Bi,${A0}[1]
+	 vst1.64	{$A4xB-$A5xB}, [$toutptr,:256]!
+	vmull.u32	$A2xB,$Bi,${A1}[0]
+	vmull.u32	$A3xB,$Bi,${A1}[1]
+	 vst1.64	{$A6xB-$A7xB}, [$toutptr,:256]!
+
+	vmull.u32	$A4xB,$Bi,${A2}[0]
+	vmull.u32	$A5xB,$Bi,${A2}[1]
+	vmull.u32	$A6xB,$Bi,${A3}[0]
+	vmull.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_1st
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	add		$tinptr,sp,#16
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	 vld1.64	{$Temp}, [sp,:128]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	sub		$outer,$num,#1
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vshr.u64	$temp,$temp,#16
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	veor		$Z,$Z,$Z
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vst1.64		{$Z},          [$toutptr,:128]
+	vshr.u64	$temp,$temp,#16
+
+	b		.LNEON_outer
+
+.align	4
+.LNEON_outer:
+	vld1.32		{${Bi}[0]}, [$bptr,:32]!
+	sub		$nptr,$nptr,$num,lsl#2		@ rewind $nptr
+	vld1.32		{$A0-$A3},  [$aptr]!
+	veor		$zero,$zero,$zero
+	mov		$toutptr,sp
+	vzip.16		$Bi,$zero
+	sub		$inner,$num,#8
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.64	{$A3xB-$A4xB},[$tinptr,:256]!
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	 vld1.64	{$A5xB-$A6xB},[$tinptr,:256]!
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+
+	vshl.i64	$temp,`&Dhi("$A0xB")`,#16
+	veor		$zero,$zero,$zero
+	vadd.u64	$temp,$temp,`&Dlo("$A0xB")`
+	 vld1.64	{$A7xB},[$tinptr,:128]!
+	vmul.u32	$Ni,$temp,$M0
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vzip.16		$Ni,$zero
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+.LNEON_inner:
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	 vld1.32	{$A0-$A3}, [$aptr]!
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	 subs		$inner,$inner,#8
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+
+	vmlal.u32	$A0xB,$Bi,${A0}[0]
+	 vld1.64	{$A3xB-$A4xB}, [$tinptr, :256]!
+	vmlal.u32	$A1xB,$Bi,${A0}[1]
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	vmlal.u32	$A2xB,$Bi,${A1}[0]
+	 vld1.64	{$A5xB-$A6xB}, [$tinptr, :256]!
+	vmlal.u32	$A3xB,$Bi,${A1}[1]
+	 vld1.32	{$N0-$N3}, [$nptr]!
+
+	vmlal.u32	$A4xB,$Bi,${A2}[0]
+	 vld1.64	{$A7xB},       [$tinptr, :128]!
+	vmlal.u32	$A5xB,$Bi,${A2}[1]
+	vmlal.u32	$A6xB,$Bi,${A3}[0]
+	vmlal.u32	$A7xB,$Bi,${A3}[1]
+
+	bne	.LNEON_inner
+
+	vmlal.u32	$A0xB,$Ni,${N0}[0]
+	add		$tinptr,sp,#16
+	vmlal.u32	$A1xB,$Ni,${N0}[1]
+	sub		$aptr,$aptr,$num,lsl#2		@ rewind $aptr
+	vmlal.u32	$A2xB,$Ni,${N1}[0]
+	 vld1.64	{$Temp}, [sp,:128]
+	vmlal.u32	$A3xB,$Ni,${N1}[1]
+	subs		$outer,$outer,#1
+
+	vmlal.u32	$A4xB,$Ni,${N2}[0]
+	vst1.64		{$A0xB-$A1xB}, [$toutptr,:256]!
+	vmlal.u32	$A5xB,$Ni,${N2}[1]
+	 vld1.64	{$A0xB},       [$tinptr, :128]!
+	vshr.u64	$temp,$temp,#16
+	vst1.64		{$A2xB-$A3xB}, [$toutptr,:256]!
+	vmlal.u32	$A6xB,$Ni,${N3}[0]
+	 vld1.64	{$A1xB-$A2xB}, [$tinptr, :256]!
+	vmlal.u32	$A7xB,$Ni,${N3}[1]
+
+	vst1.64		{$A4xB-$A5xB}, [$toutptr,:256]!
+	vadd.u64	$temp,$temp,`&Dhi("$Temp")`
+	vst1.64		{$A6xB-$A7xB}, [$toutptr,:256]!
+	vshr.u64	$temp,$temp,#16
+
+	bne	.LNEON_outer
+
+	mov		$toutptr,sp
+	mov		$inner,$num
+
+.LNEON_tail:
+	vadd.u64	`&Dlo("$A0xB")`,`&Dlo("$A0xB")`,$temp
+	vld1.64		{$A3xB-$A4xB}, [$tinptr, :256]!
+	vshr.u64	$temp,`&Dlo("$A0xB")`,#16
+	vadd.u64	`&Dhi("$A0xB")`,`&Dhi("$A0xB")`,$temp
+	vld1.64		{$A5xB-$A6xB}, [$tinptr, :256]!
+	vshr.u64	$temp,`&Dhi("$A0xB")`,#16
+	vld1.64		{$A7xB},       [$tinptr, :128]!
+	vzip.16		`&Dlo("$A0xB")`,`&Dhi("$A0xB")`
+
+.LNEON_tail2:
+	vadd.u64	`&Dlo("$A1xB")`,`&Dlo("$A1xB")`,$temp
+	vst1.32		{`&Dlo("$A0xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A1xB")`,#16
+	vadd.u64	`&Dhi("$A1xB")`,`&Dhi("$A1xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A1xB")`,#16
+	vzip.16		`&Dlo("$A1xB")`,`&Dhi("$A1xB")`
+
+	vadd.u64	`&Dlo("$A2xB")`,`&Dlo("$A2xB")`,$temp
+	vst1.32		{`&Dlo("$A1xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A2xB")`,#16
+	vadd.u64	`&Dhi("$A2xB")`,`&Dhi("$A2xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A2xB")`,#16
+	vzip.16		`&Dlo("$A2xB")`,`&Dhi("$A2xB")`
+
+	vadd.u64	`&Dlo("$A3xB")`,`&Dlo("$A3xB")`,$temp
+	vst1.32		{`&Dlo("$A2xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A3xB")`,#16
+	vadd.u64	`&Dhi("$A3xB")`,`&Dhi("$A3xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A3xB")`,#16
+	vzip.16		`&Dlo("$A3xB")`,`&Dhi("$A3xB")`
+
+	vadd.u64	`&Dlo("$A4xB")`,`&Dlo("$A4xB")`,$temp
+	vst1.32		{`&Dlo("$A3xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A4xB")`,#16
+	vadd.u64	`&Dhi("$A4xB")`,`&Dhi("$A4xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A4xB")`,#16
+	vzip.16		`&Dlo("$A4xB")`,`&Dhi("$A4xB")`
+
+	vadd.u64	`&Dlo("$A5xB")`,`&Dlo("$A5xB")`,$temp
+	vst1.32		{`&Dlo("$A4xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A5xB")`,#16
+	vadd.u64	`&Dhi("$A5xB")`,`&Dhi("$A5xB")`,$temp
+	vshr.u64	$temp,`&Dhi("$A5xB")`,#16
+	vzip.16		`&Dlo("$A5xB")`,`&Dhi("$A5xB")`
+
+	vadd.u64	`&Dlo("$A6xB")`,`&Dlo("$A6xB")`,$temp
+	vst1.32		{`&Dlo("$A5xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A6xB")`,#16
+	vadd.u64	`&Dhi("$A6xB")`,`&Dhi("$A6xB")`,$temp
+	vld1.64		{$A0xB}, [$tinptr, :128]!
+	vshr.u64	$temp,`&Dhi("$A6xB")`,#16
+	vzip.16		`&Dlo("$A6xB")`,`&Dhi("$A6xB")`
+
+	vadd.u64	`&Dlo("$A7xB")`,`&Dlo("$A7xB")`,$temp
+	vst1.32		{`&Dlo("$A6xB")`[0]}, [$toutptr, :32]!
+	vshr.u64	$temp,`&Dlo("$A7xB")`,#16
+	vadd.u64	`&Dhi("$A7xB")`,`&Dhi("$A7xB")`,$temp
+	vld1.64		{$A1xB-$A2xB},	[$tinptr, :256]!
+	vshr.u64	$temp,`&Dhi("$A7xB")`,#16
+	vzip.16		`&Dlo("$A7xB")`,`&Dhi("$A7xB")`
+	subs		$inner,$inner,#8
+	vst1.32		{`&Dlo("$A7xB")`[0]}, [$toutptr, :32]!
+
+	bne	.LNEON_tail
+
+	vst1.32	{${temp}[0]}, [$toutptr, :32]		@ top-most bit
+	sub	$nptr,$nptr,$num,lsl#2			@ rewind $nptr
+	subs	$aptr,sp,#0				@ clear carry flag
+	add	$bptr,sp,$num,lsl#2
+
+.LNEON_sub:
+	ldmia	$aptr!, {r4-r7}
+	ldmia	$nptr!, {r8-r11}
+	sbcs	r8, r4,r8
+	sbcs	r9, r5,r9
+	sbcs	r10,r6,r10
+	sbcs	r11,r7,r11
+	teq	$aptr,$bptr				@ preserves carry
+	stmia	$rptr!, {r8-r11}
+	bne	.LNEON_sub
+
+	ldr	r10, [$aptr]				@ load top-most bit
+	veor	q0,q0,q0
+	sub	r11,$bptr,sp				@ this is num*4
+	veor	q1,q1,q1
+	mov	$aptr,sp
+	sub	$rptr,$rptr,r11				@ rewind $rptr
+	mov	$nptr,$bptr				@ second 3/4th of frame
+	sbcs	r10,r10,#0				@ result is carry flag
+
+.LNEON_copy_n_zap:
+	ldmia	$aptr!, {r4-r7}
+	ldmia	$rptr,  {r8-r11}
+	movcc	r8, r4
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r11,r7
+	ldmia	$aptr, {r4-r7}
+	stmia	$rptr!, {r8-r11}
+	sub	$aptr,$aptr,#16
+	ldmia	$rptr, {r8-r11}
+	movcc	r8, r4
+	vst1.64	{q0-q1}, [$aptr,:256]!			@ wipe
+	movcc	r9, r5
+	movcc	r10,r6
+	vst1.64	{q0-q1}, [$nptr,:256]!			@ wipe
+	movcc	r11,r7
+	teq	$aptr,$bptr				@ preserves carry
+	stmia	$rptr!, {r8-r11}
+	bne	.LNEON_copy_n_zap
+
+	sub	sp,ip,#96
+        vldmia  sp!,{d8-d15}
+        ldmia   sp!,{r4-r11}
+	bx	lr
+.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz	"Montgomery multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.align	2
+#if __ARM_ARCH__>=7
+.comm	OPENSSL_armcap_P,4,4
+#endif
+___
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
+print $code;
+close STDOUT;
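
For orientation, bn_mul_mont() above computes rp = ap*bp*R^-1 mod np, where R = 2^(32*num) and n0 = -np[0]^-1 mod 2^32. A portable C sketch of the same word-serial, operand-scanning recurrence follows; it is only an illustration of the tp[]/n0 bookkeeping (assuming 32-bit words and num >= 1), not the code this script emits:

    #include <stdint.h>
    #include <string.h>

    /* rp[0..num-1] = ap[] * bp[] * 2^(-32*num) mod np[], n0 = -np[0]^-1 mod 2^32 */
    static void mont_mul_ref(uint32_t *rp, const uint32_t *ap, const uint32_t *bp,
                             const uint32_t *np, uint32_t n0, int num) {
      uint32_t tp[num + 2];                 /* running sum, like tp[] on the stack above */
      memset(tp, 0, sizeof(tp));

      for (int i = 0; i < num; i++) {
        /* tp += ap[] * bp[i] */
        uint64_t carry = 0;
        for (int j = 0; j < num; j++) {
          uint64_t t = (uint64_t)ap[j] * bp[i] + tp[j] + carry;
          tp[j] = (uint32_t)t;
          carry = t >> 32;
        }
        uint64_t t = (uint64_t)tp[num] + carry;
        tp[num] = (uint32_t)t;
        tp[num + 1] = (uint32_t)(t >> 32);

        /* m is chosen so that tp becomes divisible by 2^32 after adding m*np[] */
        uint32_t m = tp[0] * n0;
        carry = ((uint64_t)m * np[0] + tp[0]) >> 32;
        for (int j = 1; j < num; j++) {
          uint64_t u = (uint64_t)m * np[j] + tp[j] + carry;
          tp[j - 1] = (uint32_t)u;          /* store shifted down one word: divide by 2^32 */
          carry = u >> 32;
        }
        t = (uint64_t)tp[num] + carry;
        tp[num - 1] = (uint32_t)t;
        tp[num] = tp[num + 1] + (uint32_t)(t >> 32);
      }

      /* conditional final subtraction, like the .Lsub/.Lcopy tail above */
      uint64_t borrow = 0;
      uint32_t sub[num];
      for (int j = 0; j < num; j++) {
        uint64_t d = (uint64_t)tp[j] - np[j] - borrow;
        sub[j] = (uint32_t)d;
        borrow = (d >> 32) & 1;
      }
      int ge = (tp[num] != 0) || !borrow;   /* tp >= np ? */
      for (int j = 0; j < num; j++) {
        rp[j] = ge ? sub[j] : tp[j];
      }
    }
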
diff --git a/crypto/bn/asm/bn-586.pl b/crypto/bn/asm/bn-586.pl
new file mode 100644
index 0000000..26d9bcb
--- /dev/null
+++ b/crypto/bn/asm/bn-586.pl
@@ -0,0 +1,774 @@
+#!/usr/bin/env perl
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&bn_mul_add_words("bn_mul_add_words");
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_div_words("bn_div_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_sub_part_words("bn_sub_part_words");
+
+&asm_finish();
+
+sub bn_mul_add_words
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("maw_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry_in
+		&jmp(&label("maw_sse2_entry"));
+		
+	&set_label("maw_sse2_unrolled",16);
+		&movd("mm3",&DWP(0,$r,"",0));	# mm3 = r[0]
+		&paddq("mm1","mm3");		# mm1 = carry_in + r[0]
+		&movd("mm2",&DWP(0,$a,"",0));	# mm2 = a[0]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[0]
+		&movd("mm4",&DWP(4,$a,"",0));	# mm4 = a[1]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[1]
+		&movd("mm6",&DWP(8,$a,"",0));	# mm6 = a[2]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[2]
+		&movd("mm7",&DWP(12,$a,"",0));	# mm7 = a[3]
+		&pmuludq("mm7","mm0");		# mm7 = w*a[3]
+		&paddq("mm1","mm2");		# mm1 = carry_in + r[0] + w*a[0]
+		&movd("mm3",&DWP(4,$r,"",0));	# mm3 = r[1]
+		&paddq("mm3","mm4");		# mm3 = r[1] + w*a[1]
+		&movd("mm5",&DWP(8,$r,"",0));	# mm5 = r[2]
+		&paddq("mm5","mm6");		# mm5 = r[2] + w*a[2]
+		&movd("mm4",&DWP(12,$r,"",0));	# mm4 = r[3]
+		&paddq("mm7","mm4");		# mm7 = r[3] + w*a[3]
+		&movd(&DWP(0,$r,"",0),"mm1");
+		&movd("mm2",&DWP(16,$a,"",0));	# mm2 = a[4]
+		&pmuludq("mm2","mm0");		# mm2 = w*a[4]
+		&psrlq("mm1",32);		# mm1 = carry0
+		&movd("mm4",&DWP(20,$a,"",0));	# mm4 = a[5]
+		&pmuludq("mm4","mm0");		# mm4 = w*a[5]
+		&paddq("mm1","mm3");		# mm1 = carry0 + r[1] + w*a[1]
+		&movd("mm6",&DWP(24,$a,"",0));	# mm6 = a[6]
+		&pmuludq("mm6","mm0");		# mm6 = w*a[6]
+		&movd(&DWP(4,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry1
+		&movd("mm3",&DWP(28,$a,"",0));	# mm3 = a[7]
+		&add($a,32);
+		&pmuludq("mm3","mm0");		# mm3 = w*a[7]
+		&paddq("mm1","mm5");		# mm1 = carry1 + r[2] + w*a[2]
+		&movd("mm5",&DWP(16,$r,"",0));	# mm5 = r[4]
+		&paddq("mm2","mm5");		# mm2 = r[4] + w*a[4]
+		&movd(&DWP(8,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry2
+		&paddq("mm1","mm7");		# mm1 = carry2 + r[3] + w*a[3]
+		&movd("mm5",&DWP(20,$r,"",0));	# mm5 = r[5]
+		&paddq("mm4","mm5");		# mm4 = r[5] + w*a[5]
+		&movd(&DWP(12,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry3
+		&paddq("mm1","mm2");		# mm1 = carry3 + r[4] + w*a[4]
+		&movd("mm5",&DWP(24,$r,"",0));	# mm5 = r[6]
+		&paddq("mm6","mm5");		# mm6 = r[6] + w*a[6]
+		&movd(&DWP(16,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry4
+		&paddq("mm1","mm4");		# mm1 = carry4 + r[5] + w*a[5]
+		&movd("mm5",&DWP(28,$r,"",0));	# mm5 = r[7]
+		&paddq("mm3","mm5");		# mm3 = r[7] + w*a[7]
+		&movd(&DWP(20,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry5
+		&paddq("mm1","mm6");		# mm1 = carry5 + r[6] + w*a[6]
+		&movd(&DWP(24,$r,"",0),"mm1");
+		&psrlq("mm1",32);		# mm1 = carry6
+		&paddq("mm1","mm3");		# mm1 = carry6 + r[7] + w*a[7]
+		&movd(&DWP(28,$r,"",0),"mm1");
+		&lea($r,&DWP(32,$r));
+		&psrlq("mm1",32);		# mm1 = carry_out
+
+		&sub($c,8);
+		&jz(&label("maw_sse2_exit"));
+	&set_label("maw_sse2_entry");
+		&test($c,0xfffffff8);
+		&jnz(&label("maw_sse2_unrolled"));
+
+	&set_label("maw_sse2_loop",4);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&movd("mm3",&DWP(0,$r));	# mm3 = r[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm3");		# carry += r[i]
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("maw_sse2_loop"));
+	&set_label("maw_sse2_exit");
+		&movd("eax","mm1");		# c = carry_out
+		&emms();
+		&ret();
+
+	&set_label("maw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+
+	&mov("ecx",&wparam(2));	#
+	&mov($a,&wparam(1));	#
+
+	&and("ecx",0xfffffff8);	# num / 8
+	&mov($w,&wparam(3));	#
+
+	&push("ecx");		# Up the stack for a tmp variable
+
+	&jz(&label("maw_finish"));
+
+	&set_label("maw_loop",16);
+
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+= c
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",&DWP($i,$r));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r),"eax");	# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&sub("ecx",8);
+	&lea($a,&DWP(32,$a));
+	&lea($r,&DWP(32,$r));
+	&jnz(&label("maw_loop"));
+
+	&set_label("maw_finish",0);
+	&mov("ecx",&wparam(2));	# get num
+	&and("ecx",7);
+	&jnz(&label("maw_finish2"));	# helps branch prediction
+	&jmp(&label("maw_end"));
+
+	&set_label("maw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a));	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",&DWP($i*4,$r));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &dec("ecx") if ($i != 7-1);
+		&mov(&DWP($i*4,$r),"eax");	# *r= L(t);
+		 &mov($c,"edx");		# c=  H(t);
+		&jz(&label("maw_end")) if ($i != 7-1);
+		}
+	&set_label("maw_end",0);
+	&mov("eax",$c);
+
+	&pop("ecx");	# clear variable from
+
+	&function_end($name);
+	}
+
+sub bn_mul_words
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("mw_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+		&movd("mm0",&wparam(3));	# mm0 = w
+		&pxor("mm1","mm1");		# mm1 = carry = 0
+
+	&set_label("mw_sse2_loop",16);
+		&movd("mm2",&DWP(0,$a));	# mm2 = a[i]
+		&pmuludq("mm2","mm0");		# a[i] *= w
+		&lea($a,&DWP(4,$a));
+		&paddq("mm1","mm2");		# carry += a[i]*w
+		&movd(&DWP(0,$r),"mm1");	# r[i] = carry_low
+		&sub($c,1);
+		&psrlq("mm1",32);		# carry = carry_high
+		&lea($r,&DWP(4,$r));
+		&jnz(&label("mw_sse2_loop"));
+
+		&movd("eax","mm1");		# return carry
+		&emms();
+		&ret();
+	&set_label("mw_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ecx";
+	$r="edi";
+	$c="esi";
+	$num="ebp";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+	&mov($a,&wparam(1));	#
+	&mov($num,&wparam(2));	#
+	&mov($w,&wparam(3));	#
+
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("mw_finish"));
+
+	&set_label("mw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jz(&label("mw_finish"));
+	&jmp(&label("mw_loop"));
+
+	&set_label("mw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jnz(&label("mw_finish2"));
+	&jmp(&label("mw_end"));
+
+	&set_label("mw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		 &dec($num) if ($i != 7-1);
+		&jz(&label("mw_end")) if ($i != 7-1);
+		}
+	&set_label("mw_end",0);
+	&mov("eax",$c);
+
+	&function_end($name);
+	}
+
+sub bn_sqr_words
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,$sse2?"EXTRN\t_OPENSSL_ia32cap_P:DWORD":"");
+
+	$r="eax";
+	$a="edx";
+	$c="ecx";
+
+	if ($sse2) {
+		&picmeup("eax","OPENSSL_ia32cap_P");
+		&bt(&DWP(0,"eax"),26);
+		&jnc(&label("sqr_non_sse2"));
+
+		&mov($r,&wparam(0));
+		&mov($a,&wparam(1));
+		&mov($c,&wparam(2));
+
+	&set_label("sqr_sse2_loop",16);
+		&movd("mm0",&DWP(0,$a));	# mm0 = a[i]
+		&pmuludq("mm0","mm0");		# a[i] *= a[i]
+		&lea($a,&DWP(4,$a));		# a++
+		&movq(&QWP(0,$r),"mm0");	# r[i] = a[i]*a[i]
+		&sub($c,1);
+		&lea($r,&DWP(8,$r));		# r += 2
+		&jnz(&label("sqr_sse2_loop"));
+
+		&emms();
+		&ret();
+	&set_label("sqr_non_sse2",16);
+	}
+
+	# function_begin prologue
+	&push("ebp");
+	&push("ebx");
+	&push("esi");
+	&push("edi");
+
+	&comment("");
+	$r="esi";
+	$a="edi";
+	$num="ebx";
+
+	&mov($r,&wparam(0));	#
+	&mov($a,&wparam(1));	#
+	&mov($num,&wparam(2));	#
+
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("sw_finish"));
+
+	&set_label("sw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+		&mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*2,$r,"",0),"eax");	#
+		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,64);
+	&sub($num,8);
+	&jnz(&label("sw_loop"));
+
+	&set_label("sw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jz(&label("sw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov("eax",&DWP($i*4,$a,"",0));	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*8,$r,"",0),"eax");	#
+		 &dec($num) if ($i != 7-1);
+		&mov(&DWP($i*8+4,$r,"",0),"edx");
+		 &jz(&label("sw_end")) if ($i != 7-1);
+		}
+	&set_label("sw_end",0);
+
+	&function_end($name);
+	}
+
+sub bn_div_words
+	{
+	local($name)=@_;
+
+	&function_begin_B($name,"");
+	&mov("edx",&wparam(0));	#
+	&mov("eax",&wparam(1));	#
+	&mov("ecx",&wparam(2));	#
+	&div("ecx");
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+sub bn_sub_part_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP(0,$a,"",0));	# *a
+		 &mov($tmp2,&DWP(0,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		&mov(&DWP(0,$r,"",0),$tmp1);	# *r
+		&add($a, 4);
+		&add($b, 4);
+		&add($r, 4);
+		 &dec($num) if ($i != 6);
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+	&cmp(&wparam(4),0);
+	&je(&label("pw_end"));
+
+	&mov($num,&wparam(4));	# get dl
+	&cmp($num,0);
+	&je(&label("pw_end"));
+	&jge(&label("pw_pos"));
+
+	&comment("pw_neg");
+	&mov($tmp2,0);
+	&sub($tmp2,$num);
+	&mov($num,$tmp2);
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("pw_neg_finish"));
+
+	&set_label("pw_neg_loop",0);
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl<0 Round $i");
+
+	    &mov($tmp1,0);
+	    &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+	}
+	    
+	&comment("");
+	&add($b,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_neg_loop"));
+	    
+	&set_label("pw_neg_finish",0);
+	&mov($tmp2,&wparam(4));	# get dl
+	&mov($num,0);
+	&sub($num,$tmp2);
+	&and($num,7);
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl<0 Tail Round $i");
+	    &mov($tmp1,0);
+	    &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+	    &sub($tmp1,$c);
+	    &mov($c,0);
+	    &adc($c,$c);
+	    &sub($tmp1,$tmp2);
+	    &adc($c,0);
+	    &dec($num) if ($i != 6);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+
+	&jmp(&label("pw_end"));
+	
+	&set_label("pw_pos",0);
+	
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("pw_pos_finish"));
+
+	&set_label("pw_pos_loop",0);
+
+	for ($i=0; $i<8; $i++)
+	{
+	    &comment("dl>0 Round $i");
+
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_nc".$i));
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_pos_loop"));
+	    
+	&set_label("pw_pos_finish",0);
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &comment("dl>0 Tail Round $i");
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &sub($tmp1,$c);
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &jnc(&label("pw_tail_nc".$i));
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_end")) if ($i != 6);
+	}
+	&mov($c,1);
+	&jmp(&label("pw_end"));
+
+	&set_label("pw_nc_loop",0);
+	for ($i=0; $i<8; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_nc".$i,0);
+	}
+	    
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jnz(&label("pw_nc_loop"));
+	    
+	&mov($num,&wparam(4));	# get dl
+	&and($num,7);
+	&jz(&label("pw_nc_end"));
+	    
+	for ($i=0; $i<7; $i++)
+	{
+	    &mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+	    &mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+	    &set_label("pw_tail_nc".$i,0);
+	    &dec($num) if ($i != 6);
+	    &jz(&label("pw_nc_end")) if ($i != 6);
+	}
+
+	&set_label("pw_nc_end",0);
+	&mov($c,0);
+
+	&set_label("pw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
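
The word-level primitives above all follow the carry pattern annotated in the comments. As a reference point for both the SSE2 and integer paths, a short C sketch of what bn_mul_add_words() computes, assuming 32-bit words (an illustration, not the generated assembly):

    #include <stdint.h>

    /* rp[i] += ap[i] * w for i in [0, num); returns the final carry word. */
    static uint32_t mul_add_words_ref(uint32_t *rp, const uint32_t *ap,
                                      int num, uint32_t w) {
      uint64_t carry = 0;           /* corresponds to mm1 in the SSE2 path, $c (esi) otherwise */
      for (int i = 0; i < num; i++) {
        /* a[i]*w + r[i] + carry always fits in 64 bits */
        uint64_t t = (uint64_t)ap[i] * w + rp[i] + carry;
        rp[i] = (uint32_t)t;        /* low word back into r[i] */
        carry = t >> 32;            /* high word carried into the next position */
      }
      return (uint32_t)carry;
    }
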
diff --git a/crypto/bn/asm/co-586.pl b/crypto/bn/asm/co-586.pl
new file mode 100644
index 0000000..57101a6
--- /dev/null
+++ b/crypto/bn/asm/co-586.pl
@@ -0,0 +1,287 @@
+#!/usr/local/bin/perl
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
+
+sub mul_add_c
+	{
+	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("mul a[$ai]*b[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	&mul("edx");
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# laod next a
+	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# laod next b
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# laod next b
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
+	}
+
+sub sqr_add_c
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
+	}
+
+sub sqr_add_c2
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from next
+	# words, and 1 if load return value
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$a,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add("eax","eax");
+	 ###
+	&adc("edx","edx");
+	 ###
+	&adc($c2,0);
+	 &add($c0,"eax");
+	&adc($c1,"edx");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
+	&adc($c2,0);
+	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+	 ###
+	}
+
+sub bn_mul_comba
+	{
+	local($name,$num)=@_;
+	local($a,$b,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($tot,$end);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$b="edi";
+	
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	&push("esi");
+	 &mov($a,&wparam(1));
+	&push("edi");
+	 &mov($b,&wparam(2));
+	&push("ebp");
+	 &push("ebx");
+
+	&xor($c0,$c0);
+	 &mov("eax",&DWP(0,$a,"",0));	# load the first word 
+	&xor($c1,$c1);
+	 &mov("edx",&DWP(0,$b,"",0));	# load the first second 
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("################## Calculate word $i"); 
+
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($j+1) == $end)
+				{
+				$v=1;
+				$v=2 if (($i+1) == $tot);
+				}
+			else
+				{ $v=0; }
+			if (($j+1) != $end)
+				{
+				$na=($ai-1);
+				$nb=($bi+1);
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				# &mov("eax",&wparam(0));
+				# &mov(&DWP($i*4,"eax","",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&comment("save r[$i]");
+	# &mov("eax",&wparam(0));
+	&mov(&DWP($i*4,"eax","",0),$c0);
+
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_sqr_comba
+	{
+	local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2)=@_;
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($b,$tot,$end,$half);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$r="edi";
+
+	&push("esi");
+	 &push("edi");
+	&push("ebp");
+	 &push("ebx");
+	&mov($r,&wparam(0));
+	 &mov($a,&wparam(1));
+	&xor($c0,$c0);
+	 &xor($c1,$c1);
+	&mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("############### Calculate word $i");
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($ai-1) < ($bi+1))
+				{
+				$v=1;
+				$v=2 if ($i+1) == $tot;
+				}
+			else
+				{ $v=0; }
+			if (!$v)
+				{
+				$na=$ai-1;
+				$nb=$bi+1;
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+			if ($ai == $bi)
+				{
+				&sqr_add_c($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			else
+				{
+				&sqr_add_c2($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				#&mov(&DWP($i*4,$r,"",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				last;
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&mov(&DWP($i*4,$r,"",0),$c0);
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
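
bn_mul_comba4/bn_mul_comba8 above use product scanning (Comba) multiplication: every partial product contributing to one output word is folded into the rotating three-word accumulator $c0/$c1/$c2 (ebx/ecx/ebp) before that word is stored. A small C sketch of the 4x4 schedule, again assuming 32-bit words (illustration only):

    #include <stdint.h>

    /* r[0..7] = a[0..3] * b[0..3], product scanning with a 3-word accumulator */
    static void mul_comba4_ref(uint32_t r[8], const uint32_t a[4], const uint32_t b[4]) {
      uint32_t c0 = 0, c1 = 0, c2 = 0;          /* plays the role of ebx/ecx/ebp */
      for (int k = 0; k < 7; k++) {             /* output word index */
        for (int i = 0; i <= k; i++) {
          int j = k - i;
          if (i > 3 || j > 3) {
            continue;                           /* outside the 4x4 grid */
          }
          uint64_t t = (uint64_t)a[i] * b[j];   /* one partial product */
          uint64_t lo = (uint64_t)c0 + (uint32_t)t;
          c0 = (uint32_t)lo;
          uint64_t hi = (uint64_t)c1 + (uint32_t)(t >> 32) + (uint32_t)(lo >> 32);
          c1 = (uint32_t)hi;
          c2 += (uint32_t)(hi >> 32);
        }
        r[k] = c0;                              /* emit word k, rotate the accumulator */
        c0 = c1;
        c1 = c2;
        c2 = 0;
      }
      r[7] = c0;                                /* top word */
    }
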
diff --git a/crypto/bn/asm/modexp512-x86_64.pl b/crypto/bn/asm/modexp512-x86_64.pl
new file mode 100644
index 0000000..bfd6e97
--- /dev/null
+++ b/crypto/bn/asm/modexp512-x86_64.pl
@@ -0,0 +1,1497 @@
+#!/usr/bin/env perl
+#
+# Copyright (c) 2010-2011 Intel Corp.
+#   Author: Vinodh.Gopal@intel.com
+#           Jim Guilford
+#           Erdinc.Ozturk@intel.com
+#           Maxim.Perminov@intel.com
+#
+# More information about the algorithm used can be found at:
+#   http://www.cse.buffalo.edu/srds2009/escs2009_submission_Gopal.pdf
+#
+# ====================================================================
+# Copyright (c) 2011 The OpenSSL Project.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in
+#    the documentation and/or other materials provided with the
+#    distribution.
+#
+# 3. All advertising materials mentioning features or use of this
+#    software must display the following acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)"
+#
+# 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+#    endorse or promote products derived from this software without
+#    prior written permission. For written permission, please contact
+#    licensing@OpenSSL.org.
+#
+# 5. Products derived from this software may not be called "OpenSSL"
+#    nor may "OpenSSL" appear in their names without prior written
+#    permission of the OpenSSL Project.
+#
+# 6. Redistributions of any form whatsoever must retain the following
+#    acknowledgment:
+#    "This product includes software developed by the OpenSSL Project
+#    for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)"
+#
+# THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+# ====================================================================
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+my $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+use strict;
+my $code=".text\n\n";
+my $m=0;
+
+#
+# Define x512 macros
+#
+
+#MULSTEP_512_ADD	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src1, src2, add_src, tmp1, tmp2
+#
+# uses rax, rdx, and args
+sub MULSTEP_512_ADD
+{
+ my ($x, $DST, $SRC2, $ASRC, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	($ASRC), $X[0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 mov	(+8*$i)($ASRC), $X[$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
+
+#MULSTEP_512	MACRO	x7, x6, x5, x4, x3, x2, x1, x0, dst, src2, src1_val, tmp
+#
+# uses rax, rdx, and args
+sub MULSTEP_512
+{
+ my ($x, $DST, $SRC2, $OP, $TMP)=@_;
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	 mov	(+8*0)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 add	%rax, $X[0]
+	 adc	\$0, %rdx
+	 mov	$X[0], $DST
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $TMP
+
+	 mov	(+8*$i)($SRC2), %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i]
+	 adc	\$0, %rdx
+	 add	$TMP, $X[$i]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $X[0]
+___
+}
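+
+# Informally, each generated step computes X' = X + OP*SRC2 over eight 64-bit
+# limbs: the low half of every mul goes into the current limb, the high half
+# (rdx) is carried into the next one, the lowest limb of the sum is stored to
+# DST and the final high word replaces X[0], so the caller rotates the
+# register list between steps.  MULSTEP_512_ADD is identical except that the
+# starting limbs are first loaded from ASRC.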
+
+#
+# Swizzle Macros
+#
+
+# macro to copy data from flat space to swizzled table
+#MACRO swizzle	pDst, pSrc, tmp1, tmp2
+# pDst and pSrc are modified
+sub swizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0)=@_;
+$code.=<<___;
+	 mov	\$8, $cnt
+loop_$m:
+	 mov	($pSrc), $d0
+	 mov	$d0#w, ($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*1)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*2)($pDst)
+	 shr	\$16, $d0
+	 mov	$d0#w, (+64*3)($pDst)
+	 lea	8($pSrc), $pSrc
+	 lea	64*4($pDst), $pDst
+	 dec	$cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
+
+# macro to copy data from swizzled table to flat space
+#MACRO unswizzle	pDst, pSrc, tmp*3
+sub unswizzle
+{
+ my ($pDst, $pSrc, $cnt, $d0, $d1)=@_;
+$code.=<<___;
+	 mov	\$4, $cnt
+loop_$m:
+	 movzxw	(+64*3+256*0)($pSrc), $d0
+	 movzxw	(+64*3+256*1)($pSrc), $d1
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*2+256*0)($pSrc), $d0#w
+	 mov	(+64*2+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*1+256*0)($pSrc), $d0#w
+	 mov	(+64*1+256*1)($pSrc), $d1#w
+	 shl	\$16, $d0
+	 shl	\$16, $d1
+	 mov	(+64*0+256*0)($pSrc), $d0#w
+	 mov	(+64*0+256*1)($pSrc), $d1#w
+	 mov	$d0, (+8*0)($pDst)
+	 mov	$d1, (+8*1)($pDst)
+	 lea	256*2($pSrc), $pSrc
+	 lea	8*2($pDst), $pDst
+	 sub	\$1, $cnt
+	 jnz	loop_$m
+___
+
+ $m++;
+}
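+
+# In the swizzled table each 512-bit entry is stored as 16-bit slices: with
+# the loops above, entry e, qword q, slice c (c = 0..3) lands at byte offset
+# e*2 + c*64 + q*256, so all 32 entries interleave within every 64-byte
+# group, presumably so that a table lookup touches the same cache lines
+# regardless of which entry is selected.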
+
+#
+# Data Structures
+#
+
+# Reduce Data
+#
+#
+# Offset  Value
+# 0C0     Carries
+# 0B8     X2[10]
+# 0B0     X2[9]
+# 0A8     X2[8]
+# 0A0     X2[7]
+# 098     X2[6]
+# 090     X2[5]
+# 088     X2[4]
+# 080     X2[3]
+# 078     X2[2]
+# 070     X2[1]
+# 068     X2[0]
+# 060     X1[12]  P[10]
+# 058     X1[11]  P[9]  Z[8]
+# 050     X1[10]  P[8]  Z[7]
+# 048     X1[9]   P[7]  Z[6]
+# 040     X1[8]   P[6]  Z[5]
+# 038     X1[7]   P[5]  Z[4]
+# 030     X1[6]   P[4]  Z[3]
+# 028     X1[5]   P[3]  Z[2]
+# 020     X1[4]   P[2]  Z[1]
+# 018     X1[3]   P[1]  Z[0]
+# 010     X1[2]   P[0]  Y[2]
+# 008     X1[1]   Q[1]  Y[1]
+# 000     X1[0]   Q[0]  Y[0]
+
+my $X1_offset           =  0;			# 13 qwords
+my $X2_offset           =  $X1_offset + 13*8;			# 11 qwords
+my $Carries_offset      =  $X2_offset + 11*8;			# 1 qword
+my $Q_offset            =  0;			# 2 qwords
+my $P_offset            =  $Q_offset + 2*8;			# 11 qwords
+my $Y_offset            =  0;			# 3 qwords
+my $Z_offset            =  $Y_offset + 3*8;			# 9 qwords
+
+my $Red_Data_Size       =  $Carries_offset + 1*8;			# (25 qwords)
+
+#
+# Stack Frame
+#
+#
+# offset	value
+# ...		<old stack contents>
+# ...
+# 280		Garray
+
+# 278		tmp16[15]
+# ...		...
+# 200		tmp16[0]
+
+# 1F8		tmp[7]
+# ...		...
+# 1C0		tmp[0]
+
+# 1B8		GT[7]
+# ...		...
+# 180		GT[0]
+
+# 178		Reduce Data
+# ...		...
+# 0B8		Reduce Data
+# 0B0		reserved
+# 0A8		reserved
+# 0A0		reserved
+# 098		reserved
+# 090		reserved
+# 088		reduce result addr
+# 080		exp[8]
+
+# ...
+# 048		exp[1]
+# 040		exp[0]
+
+# 038		reserved
+# 030		loop_idx
+# 028		pg
+# 020		i
+# 018		pData	; arg 4
+# 010		pG	; arg 2
+# 008		pResult	; arg 1
+# 000		rsp	; stack pointer before subtract
+
+my $rsp_offset          =  0;
+my $pResult_offset      =  8*1 + $rsp_offset;
+my $pG_offset           =  8*1 + $pResult_offset;
+my $pData_offset        =  8*1 + $pG_offset;
+my $i_offset            =  8*1 + $pData_offset;
+my $pg_offset           =  8*1 + $i_offset;
+my $loop_idx_offset     =  8*1 + $pg_offset;
+my $reserved1_offset    =  8*1 + $loop_idx_offset;
+my $exp_offset          =  8*1 + $reserved1_offset;
+my $red_result_addr_offset=  8*9 + $exp_offset;
+my $reserved2_offset    =  8*1 + $red_result_addr_offset;
+my $Reduce_Data_offset  =  8*5 + $reserved2_offset;
+my $GT_offset           =  $Red_Data_Size + $Reduce_Data_offset;
+my $tmp_offset          =  8*8 + $GT_offset;
+my $tmp16_offset        =  8*8 + $tmp_offset;
+my $garray_offset       =  8*16 + $tmp16_offset;
+my $mem_size            =  8*8*32 + $garray_offset;
+
+#
+# Offsets within Reduce Data
+#
+#
+#	struct MODF_2FOLD_MONT_512_C1_DATA {
+#	UINT64 t[8][8];
+#	UINT64 m[8];
+#	UINT64 m1[8]; /* 2^768 % m */
+#	UINT64 m2[8]; /* 2^640 % m */
+#	UINT64 k1[2]; /* (- 1/m) % 2^128 */
+#	};
+
+my $T                   =  0;
+my $M                   =  512;			# = 8 * 8 * 8
+my $M1                  =  576;			# = 8 * 8 * 9 /* += 8 * 8 */
+my $M2                  =  640;			# = 8 * 8 * 10 /* += 8 * 8 */
+my $K1                  =  704;			# = 8 * 8 * 11 /* += 8 * 8 */
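+
+# Informally, the reduction below "folds" the 1024-bit product before a final
+# Montgomery step: since x = xh*2^768 + xl implies x == xh*m1 + xl (mod m)
+# when m1 = 2^768 % m, two such folds (with m1 and m2 = 2^640 % m) shrink the
+# operand to roughly 640 bits, and one 128-bit Montgomery reduction using
+# k1 = (-1/m) % 2^128 then divides out 2^128.  The net result is congruent
+# to x * 2^-128 (mod m); see the paper referenced at the top of this file.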
+
+#
+#   FUNCTIONS
+#
+
+{{{
+#
+# MULADD_128x512 : Function to multiply 128-bits (2 qwords) by 512-bits (8 qwords)
+#                       and add 512-bits (8 qwords)
+#                       to get 640 bits (10 qwords)
+# Input: 128-bit mul source: [rdi+8*1], rbp
+#        512-bit mul source: [rsi+8*n]
+#        512-bit add source: r15, r14, ..., r9, r8
+# Output: r9, r8, r15, r14, r13, r12, r11, r10, [rcx+8*1], [rcx+8*0]
+# Clobbers all regs except: rcx, rsi, rdi
+$code.=<<___;
+.type	MULADD_128x512,\@abi-omnipotent
+.align	16
+MULADD_128x512:
+___
+	&MULSTEP_512([map("%r$_",(8..15))], "(+8*0)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 mov	(+8*1)(%rdi), %rbp
+___
+	&MULSTEP_512([map("%r$_",(9..15,8))], "(+8*1)(%rcx)", "%rsi", "%rbp", "%rbx");
+$code.=<<___;
+	 ret
+.size	MULADD_128x512,.-MULADD_128x512
+___
+}}}
+
+{{{
+#MULADD_256x512	MACRO	pDst, pA, pB, OP, TMP, X7, X6, X5, X4, X3, X2, X1, X0
+#
+# Inputs: pDst: Destination  (768 bits, 12 qwords)
+#         pA:   Multiplicand (1024 bits, 16 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Dst = Ah * B + Al
+# where Ah is (in qwords) A[15:12] (256 bits) and Al is A[7:0] (512 bits)
+# Results in X3 X2 X1 X0 X7 X6 X5 X4 Dst[3:0]
+# Uses registers: arguments, RAX, RDX
+sub MULADD_256x512
+{
+ my ($pDst, $pA, $pB, $OP, $TMP, $X)=@_;
+$code.=<<___;
+	mov	(+8*12)($pA), $OP
+___
+	&MULSTEP_512_ADD($X, "(+8*0)($pDst)", $pB, $pA, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*13)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*1)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*14)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*2)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+
+$code.=<<___;
+	 mov	(+8*15)($pA), $OP
+___
+	&MULSTEP_512($X, "(+8*3)($pDst)", $pB, $OP, $TMP);
+	push(@$X,shift(@$X));
+}
+
+#
+# mont_reduce(UINT64 *x,  /* 1024 bits, 16 qwords */
+#	       UINT64 *m,  /*  512 bits,  8 qwords */
+#	       MODF_2FOLD_MONT_512_C1_DATA *data,
+#             UINT64 *r)  /*  512 bits,  8 qwords */
+# Input:  x (number to be reduced): tmp16 (Implicit)
+#         m (modulus):              [pM]  (Implicit)
+#         data (reduce data):       [pData] (Implicit)
+# Output: r (result):		     Address in [red_res_addr]
+#         result also in: r9, r8, r15, r14, r13, r12, r11, r10
+
+my @X=map("%r$_",(8..15));
+
+$code.=<<___;
+.type	mont_reduce,\@abi-omnipotent
+.align	16
+mont_reduce:
+___
+
+my $STACK_DEPTH         =  8;
+	#
+	# X1 = Xh * M1 + Xl
+$code.=<<___;
+	 lea	(+$Reduce_Data_offset+$X1_offset+$STACK_DEPTH)(%rsp), %rdi			# pX1 (Dst) 769 bits, 13 qwords
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rsi			# pM1 (Bsrc) 512 bits, 8 qwords
+	 add	\$$M1, %rsi
+	 lea	(+$tmp16_offset+$STACK_DEPTH)(%rsp), %rcx			# X (Asrc) 1024 bits, 16 qwords
+
+___
+
+	&MULADD_256x512("%rdi", "%rcx", "%rsi", "%rbp", "%rbx", \@X);	# rotates @X 4 times
+	# results in r11, r10, r9, r8, r15, r14, r13, r12, X1[3:0]
+
+$code.=<<___;
+	 xor	%rax, %rax
+	# X1 += xl
+	 add	(+8*8)(%rcx), $X[4]
+	 adc	(+8*9)(%rcx), $X[5]
+	 adc	(+8*10)(%rcx), $X[6]
+	 adc	(+8*11)(%rcx), $X[7]
+	 adc	\$0, %rax
+	# X1 is now rax, r11-r8, r15-r12, tmp16[3:0]
+
+	#
+	# check for carry ;; carry stored in rax
+	 mov	$X[4], (+8*8)(%rdi)			# rdi points to X1
+	 mov	$X[5], (+8*9)(%rdi)
+	 mov	$X[6], %rbp
+	 mov	$X[7], (+8*11)(%rdi)
+
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 mov	(+8*0)(%rdi), $X[4]
+	 mov	(+8*1)(%rdi), $X[5]
+	 mov	(+8*2)(%rdi), $X[6]
+	 mov	(+8*3)(%rdi), $X[7]
+
+	# X1 is now stored in: X1[11], rbp, X1[9:8], r15-r8
+	# rdi -> X1
+	# rsi -> M1
+
+	#
+	# X2 = Xh * M2 + Xl
+	# do first part (X2 = Xh * M2)
+	 add	\$8*10, %rdi			# rdi -> pXh ; 128 bits, 2 qwords
+				#        Xh is actually { [rdi+8*1], rbp }
+	 add	\$`$M2-$M1`, %rsi			# rsi -> M2
+	 lea	(+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx			# rcx -> pX2 ; 641 bits, 11 qwords
+___
+	unshift(@X,pop(@X));	unshift(@X,pop(@X));
+$code.=<<___;
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rax
+
+	# X2 += Xl
+	 add	(+8*8-8*10)(%rdi), $X[6]		# (-8*10) is to adjust rdi -> Xh to Xl
+	 adc	(+8*9-8*10)(%rdi), $X[7]
+	 mov	$X[6], (+8*8)(%rcx)
+	 mov	$X[7], (+8*9)(%rcx)
+
+	 adc	%rax, %rax
+	 mov	%rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
+
+	 lea	(+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi			# rdi -> pQ ; 128 bits, 2 qwords
+	 add	\$`$K1-$M2`, %rsi			# rsi -> pK1 ; 128 bits, 2 qwords
+
+	# MUL_128x128t128	rdi, rcx, rsi	; Q = X2 * K1 (bottom half)
+	# B1:B0 = rsi[1:0] = K1[1:0]
+	# A1:A0 = rcx[1:0] = X2[1:0]
+	# Result = rdi[1],rbp = Q[1],rbp
+	 mov	(%rsi), %r8			# B0
+	 mov	(+8*1)(%rsi), %rbx			# B1
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%r8			# B0
+	 mov	%rax, %rbp
+	 mov	%rdx, %r9
+
+	 mov	(+8*1)(%rcx), %rax			# A1
+	 mul	%r8			# B0
+	 add	%rax, %r9
+
+	 mov	(%rcx), %rax			# A0
+	 mul	%rbx			# B1
+	 add	%rax, %r9
+
+	 mov	%r9, (+8*1)(%rdi)
+	# end MUL_128x128t128
+
+	 sub	\$`$K1-$M`, %rsi
+
+	 mov	(%rcx), $X[6]
+	 mov	(+8*1)(%rcx), $X[7]			# r9:r8 = X2[1:0]
+
+	 call	MULADD_128x512			# args in rcx, rdi / rbp, rsi, r15-r8
+	# result in r9, r8, r15, r14, r13, r12, r11, r10, X2[1:0]
+
+	# load first half of m to rdx, rdi, rbx, rax
+	# moved this here for efficiency
+	 mov	(+8*0)(%rsi), %rax
+	 mov	(+8*1)(%rsi), %rbx
+	 mov	(+8*2)(%rsi), %rdi
+	 mov	(+8*3)(%rsi), %rdx
+
+	# continue with reduction
+	 mov	(+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp), %rbp
+
+	 add	(+8*8)(%rcx), $X[6]
+	 adc	(+8*9)(%rcx), $X[7]
+
+	#accumulate the final carry to rbp
+	 adc	%rbp, %rbp
+
+	# Add in overflow corrections: R = (X2>>128) += T[overflow]
+	# R = {r9, r8, r15, r14, ..., r10}
+	 shl	\$3, %rbp
+	 mov	(+$pData_offset+$STACK_DEPTH)(%rsp), %rcx			# rcx -> Data (and points to T)
+	 add	%rcx, %rbp			# pT ; 512 bits, 8 qwords, spread out
+
+	# rsi will be used to generate a mask after the addition
+	 xor	%rsi, %rsi
+
+	 add	(+8*8*0)(%rbp), $X[0]
+	 adc	(+8*8*1)(%rbp), $X[1]
+	 adc	(+8*8*2)(%rbp), $X[2]
+	 adc	(+8*8*3)(%rbp), $X[3]
+	 adc	(+8*8*4)(%rbp), $X[4]
+	 adc	(+8*8*5)(%rbp), $X[5]
+	 adc	(+8*8*6)(%rbp), $X[6]
+	 adc	(+8*8*7)(%rbp), $X[7]
+
+	# if there is a carry:	rsi = 0xFFFFFFFFFFFFFFFF
+	# if carry is clear:	rsi = 0x0000000000000000
+	 sbb	\$0, %rsi
+
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	 mov	\$1, %rbp
+	 sub	%rax, $X[0]
+	 sbb	%rbx, $X[1]
+	 sbb	%rdi, $X[2]
+	 sbb	%rdx, $X[3]
+
+	# if there is a borrow:		rbp = 0
+	# if there is no borrow:	rbp = 1
+	# this is used to save the borrows in between the first half and the 2nd half of the subtraction of m
+	 sbb	\$0, %rbp
+
+	#load second half of m to rdx, rdi, rbx, rax
+
+	 add	\$$M, %rcx
+	 mov	(+8*4)(%rcx), %rax
+	 mov	(+8*5)(%rcx), %rbx
+	 mov	(+8*6)(%rcx), %rdi
+	 mov	(+8*7)(%rcx), %rdx
+
+	# use the rsi mask as before
+	# if carry is clear, subtract 0. Otherwise, subtract 256 bits of m
+	 and	%rsi, %rax
+	 and	%rsi, %rbx
+	 and	%rsi, %rdi
+	 and	%rsi, %rdx
+
+	# if rbp = 0, there was a borrow before, it is moved to the carry flag
+	# if rbp = 1, there was not a borrow before, carry flag is cleared
+	 sub	\$1, %rbp
+
+	 sbb	%rax, $X[4]
+	 sbb	%rbx, $X[5]
+	 sbb	%rdi, $X[6]
+	 sbb	%rdx, $X[7]
+
+	# write R back to memory
+
+	 mov	(+$red_result_addr_offset+$STACK_DEPTH)(%rsp), %rsi
+	 mov	$X[0], (+8*0)(%rsi)
+	 mov	$X[1], (+8*1)(%rsi)
+	 mov	$X[2], (+8*2)(%rsi)
+	 mov	$X[3], (+8*3)(%rsi)
+	 mov	$X[4], (+8*4)(%rsi)
+	 mov	$X[5], (+8*5)(%rsi)
+	 mov	$X[6], (+8*6)(%rsi)
+	 mov	$X[7], (+8*7)(%rsi)
+
+	 ret
+.size	mont_reduce,.-mont_reduce
+___
+}}}
+
+{{{
+#MUL_512x512	MACRO	pDst, pA, pB, x7, x6, x5, x4, x3, x2, x1, x0, tmp*2
+#
+# Inputs: pDst: Destination  (1024 bits, 16 qwords)
+#         pA:   Multiplicand (512 bits, 8 qwords)
+#         pB:   Multiplicand (512 bits, 8 qwords)
+# Uses registers rax, rdx, args
+#   B operand in [pB] and also in x7...x0
+sub MUL_512x512
+{
+ my ($pDst, $pA, $pB, $x, $OP, $TMP, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+
+$code.=<<___;
+	 mov	(+8*0)($pA), $OP
+
+	 mov	$X[0], %rax
+	 mul	$OP			# rdx:rax = %OP * [0]
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $X[0]
+___
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	$X[$i], %rax
+	 mul	$OP			# rdx:rax = %OP * [$i]
+	 add	%rax, $X[$i-1]
+	 adc	\$0, %rdx
+	 mov	%rdx, $X[$i]
+___
+}
+
+for(my $i=1;$i<8;$i++) {
+$code.=<<___;
+	 mov	(+8*$i)($pA), $OP
+___
+
+	&MULSTEP_512(\@X, "(+$pDst_o+8*$i)($pDst)", $pB, $OP, $TMP);
+	push(@X,shift(@X));
+}
+
+$code.=<<___;
+	 mov	$X[0], (+$pDst_o+8*8)($pDst)
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+	 mov	$X[3], (+$pDst_o+8*11)($pDst)
+	 mov	$X[4], (+$pDst_o+8*12)($pDst)
+	 mov	$X[5], (+$pDst_o+8*13)($pDst)
+	 mov	$X[6], (+$pDst_o+8*14)($pDst)
+	 mov	$X[7], (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# mont_mul_a3b : subroutine to compute (Src1 * Src2) % M (all 512-bits)
+# Input:  src1: Address of source 1: rdi
+#         src2: Address of source 2: rsi
+# Output: dst:  Address of destination: [red_res_addr]
+#    src2 and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+# Temp:   Clobbers [tmp16], all registers
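+# (with mont_reduce as above, the value produced is congruent to
+#  src1 * src2 * 2^-128 (mod m))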
+$code.=<<___;
+.type	mont_mul_a3b,\@abi-omnipotent
+.align	16
+mont_mul_a3b:
+	#
+	# multiply tmp = src1 * src2
+	# For multiply: dst = rcx, src1 = rdi, src2 = rsi
+	# stack depth is extra 8 from call
+___
+	&MUL_512x512("%rsp+$tmp16_offset+8", "%rdi", "%rsi", [map("%r$_",(10..15,8..9))], "%rbp", "%rbx");
+$code.=<<___;
+	#
+	# Dst = tmp % m
+	# Call reduce(tmp, m, data, dst)
+
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	mont_mul_a3b,.-mont_mul_a3b
+___
+}}}
+
+{{{
+#SQR_512 MACRO pDest, pA, x7, x6, x5, x4, x3, x2, x1, x0, tmp*4
+#
+# Input in memory [pA] and also in x7...x0
+# Uses all argument registers plus rax and rdx
+#
+# This version computes all of the off-diagonal terms into memory,
+# and then it adds in the diagonal terms
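+#
+# i.e. it relies on the identity
+#	A^2 = sum_i a_i^2 * 2^(128*i) + 2 * sum_{i<j} a_i*a_j * 2^(64*(i+j)):
+# the seven passes below produce the a_i*a_j (i < j) cross terms, and the
+# "finalize" section doubles them and folds in the a_i^2 diagonal squares.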
+
+sub SQR_512
+{
+ my ($pDst, $pA, $x, $A, $tmp, $x7, $x6, $pDst_o)=@_;
+ my ($pDst,  $pDst_o) = ($pDst =~ m/([^+]*)\+?(.*)?/);
+ my @X=@$x;	# make a copy
+$code.=<<___;
+	# ------------------
+	# first pass 01...07
+	# ------------------
+	 mov	$X[0], $A
+
+	 mov	$X[1],%rax
+	 mul	$A
+	 mov	%rax, (+$pDst_o+8*1)($pDst)
+___
+for(my $i=2;$i<8;$i++) {
+$code.=<<___;
+	 mov	%rdx, $X[$i-2]
+	 mov	$X[$i],%rax
+	 mul	$A
+	 add	%rax, $X[$i-2]
+	 adc	\$0, %rdx
+___
+}
+$code.=<<___;
+	 mov	%rdx, $x7
+
+	 mov	$X[0], (+$pDst_o+8*2)($pDst)
+
+	# ------------------
+	# second pass 12...17
+	# ------------------
+
+	 mov	(+8*1)($pA), $A
+
+	 mov	(+8*2)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*3)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*4)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# third pass 23...27
+	# ------------------
+	 mov	(+8*2)($pA), $A
+
+	 mov	(+8*3)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[3]
+	 adc	\$0, %rdx
+	 mov	$X[3], (+$pDst_o+8*5)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[4]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[4]
+	 adc	\$0, %rdx
+	 mov	$X[4], (+$pDst_o+8*6)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# fourth pass 34...37
+	# ------------------
+
+	 mov	(+8*3)($pA), $A
+
+	 mov	(+8*4)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*7)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $x7
+	 adc	\$0, %rdx
+	 add	$X[0], $x7
+	 adc	\$0, %rdx
+	 mov	$x7, (+$pDst_o+8*8)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[5]
+
+	# ------------------
+	# fifth pass 45...47
+	# ------------------
+	 mov	(+8*4)($pA), $A
+
+	 mov	(+8*5)($pA),%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*9)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*10)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[5]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $X[1]
+
+	# ------------------
+	# sixth pass 56...57
+	# ------------------
+	 mov	(+8*5)($pA), $A
+
+	 mov	$X[6],%rax
+	 mul	$A
+	 add	%rax, $X[5]
+	 adc	\$0, %rdx
+	 mov	$X[5], (+$pDst_o+8*11)($pDst)
+
+	 mov	%rdx, $X[0]
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[1]
+	 adc	\$0, %rdx
+	 add	$X[0], $X[1]
+	 adc	\$0, %rdx
+	 mov	$X[1], (+$pDst_o+8*12)($pDst)
+
+	 mov	%rdx, $X[2]
+
+	# ------------------
+	# seventh pass 67
+	# ------------------
+	 mov	$X[6], $A
+
+	 mov	$X[7],%rax
+	 mul	$A
+	 add	%rax, $X[2]
+	 adc	\$0, %rdx
+	 mov	$X[2], (+$pDst_o+8*13)($pDst)
+
+	 mov	%rdx, (+$pDst_o+8*14)($pDst)
+
+	# start finalize (add	in squares, and double off-terms)
+	 mov	(+$pDst_o+8*1)($pDst), $X[0]
+	 mov	(+$pDst_o+8*2)($pDst), $X[1]
+	 mov	(+$pDst_o+8*3)($pDst), $X[2]
+	 mov	(+$pDst_o+8*4)($pDst), $X[3]
+	 mov	(+$pDst_o+8*5)($pDst), $X[4]
+	 mov	(+$pDst_o+8*6)($pDst), $X[5]
+
+	 mov	(+8*3)($pA), %rax
+	 mul	%rax
+	 mov	%rax, $x6
+	 mov	%rdx, $X[6]
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	\$0, $X[6]
+
+	 mov	(+8*0)($pA), %rax
+	 mul	%rax
+	 mov	%rax, (+$pDst_o+8*0)($pDst)
+	 mov	%rdx, $A
+
+	 mov	(+8*1)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+	 mov	$X[0], (+$pDst_o+8*1)($pDst)
+	 mov	$X[1], (+$pDst_o+8*2)($pDst)
+
+	 mov	(+8*2)($pA), %rax
+	 mul	%rax
+
+	 add	$A, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $A
+
+	 mov	$X[2], (+$pDst_o+8*3)($pDst)
+	 mov	$X[3], (+$pDst_o+8*4)($pDst)
+
+	 xor	$tmp, $tmp
+	 add	$A, $X[4]
+	 adc	$x6, $X[5]
+	 adc	\$0, $tmp
+
+	 mov	$X[4], (+$pDst_o+8*5)($pDst)
+	 mov	$X[5], (+$pDst_o+8*6)($pDst)
+
+	# tmp has 0/1 in column 7
+	# X[6] has a full value in column 7
+
+	 mov	(+$pDst_o+8*7)($pDst), $X[0]
+	 mov	(+$pDst_o+8*8)($pDst), $X[1]
+	 mov	(+$pDst_o+8*9)($pDst), $X[2]
+	 mov	(+$pDst_o+8*10)($pDst), $X[3]
+	 mov	(+$pDst_o+8*11)($pDst), $X[4]
+	 mov	(+$pDst_o+8*12)($pDst), $X[5]
+	 mov	(+$pDst_o+8*13)($pDst), $x6
+	 mov	(+$pDst_o+8*14)($pDst), $x7
+
+	 mov	$X[7], %rax
+	 mul	%rax
+	 mov	%rax, $X[7]
+	 mov	%rdx, $A
+
+	 add	$X[0], $X[0]
+	 adc	$X[1], $X[1]
+	 adc	$X[2], $X[2]
+	 adc	$X[3], $X[3]
+	 adc	$X[4], $X[4]
+	 adc	$X[5], $X[5]
+	 adc	$x6, $x6
+	 adc	$x7, $x7
+	 adc	\$0, $A
+
+	 add	$tmp, $X[0]
+
+	 mov	(+8*4)($pA), %rax
+	 mul	%rax
+
+	 add	$X[6], $X[0]
+	 adc	%rax, $X[1]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[0], (+$pDst_o+8*7)($pDst)
+	 mov	$X[1], (+$pDst_o+8*8)($pDst)
+
+	 mov	(+8*5)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[2]
+	 adc	%rax, $X[3]
+	 adc	\$0, %rdx
+
+	 mov	%rdx, $tmp
+
+	 mov	$X[2], (+$pDst_o+8*9)($pDst)
+	 mov	$X[3], (+$pDst_o+8*10)($pDst)
+
+	 mov	(+8*6)($pA), %rax
+	 mul	%rax
+
+	 add	$tmp, $X[4]
+	 adc	%rax, $X[5]
+	 adc	\$0, %rdx
+
+	 mov	$X[4], (+$pDst_o+8*11)($pDst)
+	 mov	$X[5], (+$pDst_o+8*12)($pDst)
+
+	 add	%rdx, $x6
+	 adc	$X[7], $x7
+	 adc	\$0, $A
+
+	 mov	$x6, (+$pDst_o+8*13)($pDst)
+	 mov	$x7, (+$pDst_o+8*14)($pDst)
+	 mov	$A, (+$pDst_o+8*15)($pDst)
+___
+}
+
+#
+# sqr_reduce: subroutine to compute Result = reduce(Result * Result)
+#
+# input and result also in: r9, r8, r15, r14, r13, r12, r11, r10
+#
+$code.=<<___;
+.type	sqr_reduce,\@abi-omnipotent
+.align	16
+sqr_reduce:
+	 mov	(+$pResult_offset+8)(%rsp), %rcx
+___
+	&SQR_512("%rsp+$tmp16_offset+8", "%rcx", [map("%r$_",(10..15,8..9))], "%rbx", "%rbp", "%rsi", "%rdi");
+$code.=<<___;
+	# tail recursion optimization: jmp to mont_reduce and return from there
+	 jmp	mont_reduce
+	# call	mont_reduce
+	# ret
+.size	sqr_reduce,.-sqr_reduce
+___
+}}}
+
+#
+# MAIN FUNCTION
+#
+
+#mod_exp_512(UINT64 *result, /* 512 bits, 8 qwords */
+#           UINT64 *g,   /* 512 bits, 8 qwords */
+#           UINT64 *exp, /* 512 bits, 8 qwords */
+#           struct mod_ctx_512 *data)
+
+# window size = 5
+# table size = 2^5 = 32
+#table_entries	equ	32
+#table_size	equ	table_entries * 8
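+#
+# Informally: the exponent is processed in fixed windows from the top.  The
+# result is seeded from the table entry selected by exp[511:507]; every pass
+# of main_loop_a3b below then does five Montgomery squarings (sqr_reduce)
+# followed by one Montgomery multiplication (mont_mul_a3b) by the table entry
+# selected by the next window.  The first pass enters at sqr_2 and performs
+# only two squarings, which accounts for 512 = 5 + 2 + 101*5 exponent bits.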
+$code.=<<___;
+.globl	mod_exp_512
+.type	mod_exp_512,\@function,4
+mod_exp_512:
+	 push	%rbp
+	 push	%rbx
+	 push	%r12
+	 push	%r13
+	 push	%r14
+	 push	%r15
+
+	# adjust stack down and then align it with cache boundary
+	 mov	%rsp, %r8
+	 sub	\$$mem_size, %rsp
+	 and	\$-64, %rsp
+
+	# store previous stack pointer and arguments
+	 mov	%r8, (+$rsp_offset)(%rsp)
+	 mov	%rdi, (+$pResult_offset)(%rsp)
+	 mov	%rsi, (+$pG_offset)(%rsp)
+	 mov	%rcx, (+$pData_offset)(%rsp)
+.Lbody:
+	# transform g into montgomery space
+	# GT = reduce(g * C2) = reduce(g * (2^256))
+	# reduce expects to have the input in [tmp16]
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rsi), %xmm0
+	 movdqu	(+16*1)(%rsi), %xmm1
+	 movdqu	(+16*2)(%rsi), %xmm2
+	 movdqu	(+16*3)(%rsi), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*3)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*5)(%rsp)
+
+	# load pExp before rdx gets blown away
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+
+	 lea	(+$GT_offset)(%rsp), %rbx
+	 mov	%rbx, (+$red_result_addr_offset)(%rsp)
+	 call	mont_reduce
+
+	# Initialize tmp = C
+	 lea	(+$tmp_offset)(%rsp), %rcx
+	 xor	%rax, %rax
+	 mov	%rax, (+8*0)(%rcx)
+	 mov	%rax, (+8*1)(%rcx)
+	 mov	%rax, (+8*3)(%rcx)
+	 mov	%rax, (+8*4)(%rcx)
+	 mov	%rax, (+8*5)(%rcx)
+	 mov	%rax, (+8*6)(%rcx)
+	 mov	%rax, (+8*7)(%rcx)
+	 mov	%rax, (+$exp_offset+8*8)(%rsp)
+	 movq	\$1, (+8*2)(%rcx)
+
+	 lea	(+$garray_offset)(%rsp), %rbp
+	 mov	%rcx, %rsi			# pTmp
+	 mov	%rbp, %rdi			# Garray[][0]
+___
+
+	&swizzle("%rdi", "%rcx", "%rax", "%rbx");
+
+	# for (rax = 31; rax != 0; rax--) {
+	#     tmp = reduce(tmp * G)
+	#     swizzle(pg, tmp);
+	#     pg += 2; }
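+	# (tmp was initialized above to the constant C, presumably the
+	# Montgomery form of 1, so after this loop Garray[i] should hold
+	# g^i in Montgomery form for i = 0..31)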
+$code.=<<___;
+	 mov	\$31, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	# rsi -> pTmp
+	 mov	%rsi, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rsi), %r10
+	 mov	(+8*1)(%rsi), %r11
+	 mov	(+8*2)(%rsi), %r12
+	 mov	(+8*3)(%rsi), %r13
+	 mov	(+8*4)(%rsi), %r14
+	 mov	(+8*5)(%rsi), %r15
+	 mov	(+8*6)(%rsi), %r8
+	 mov	(+8*7)(%rsi), %r9
+init_loop:
+	 lea	(+$GT_offset)(%rsp), %rdi
+	 call	mont_mul_a3b
+	 lea	(+$tmp_offset)(%rsp), %rsi
+	 mov	(+$pg_offset)(%rsp), %rbp
+	 add	\$2, %rbp
+	 mov	%rbp, (+$pg_offset)(%rsp)
+	 mov	%rsi, %rcx			# rcx = rsi = addr of tmp
+___
+
+	&swizzle("%rbp", "%rcx", "%rax", "%rbx");
+$code.=<<___;
+	 mov	(+$i_offset)(%rsp), %rax
+	 sub	\$1, %rax
+	 mov	%rax, (+$i_offset)(%rsp)
+	 jne	init_loop
+
+	#
+	# Copy exponent onto stack
+	 movdqa	%xmm0, (+$exp_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$exp_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$exp_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$exp_offset+16*3)(%rsp)
+
+
+	#
+	# Do exponentiation
+	# Initialize result to G[exp{511:507}]
+	 mov	(+$exp_offset+62)(%rsp), %eax
+	 mov	%rax, %rdx
+	 shr	\$11, %rax
+	 and	\$0x07FF, %edx
+	 mov	%edx, (+$exp_offset+62)(%rsp)
+	 lea	(+$garray_offset)(%rsp,%rax,2), %rsi
+	 mov	(+$pResult_offset)(%rsp), %rdx
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+
+	#
+	# Loop variables
+	# rcx = [loop_idx] = index: 510-5 to 0 by 5
+$code.=<<___;
+	 movq	\$505, (+$loop_idx_offset)(%rsp)
+
+	 mov	(+$pResult_offset)(%rsp), %rcx
+	 mov	%rcx, (+$red_result_addr_offset)(%rsp)
+	 mov	(+8*0)(%rcx), %r10
+	 mov	(+8*1)(%rcx), %r11
+	 mov	(+8*2)(%rcx), %r12
+	 mov	(+8*3)(%rcx), %r13
+	 mov	(+8*4)(%rcx), %r14
+	 mov	(+8*5)(%rcx), %r15
+	 mov	(+8*6)(%rcx), %r8
+	 mov	(+8*7)(%rcx), %r9
+	 jmp	sqr_2
+
+main_loop_a3b:
+	 call	sqr_reduce
+	 call	sqr_reduce
+	 call	sqr_reduce
+sqr_2:
+	 call	sqr_reduce
+	 call	sqr_reduce
+
+	#
+	# Do multiply, first look up proper value in Garray
+	 mov	(+$loop_idx_offset)(%rsp), %rcx			# bit index
+	 mov	%rcx, %rax
+	 shr	\$4, %rax			# rax is word pointer
+	 mov	(+$exp_offset)(%rsp,%rax,2), %edx
+	 and	\$15, %rcx
+	 shrq	%cl, %rdx
+	 and	\$0x1F, %rdx
+
+	 lea	(+$garray_offset)(%rsp,%rdx,2), %rsi
+	 lea	(+$tmp_offset)(%rsp), %rdx
+	 mov	%rdx, %rdi
+___
+
+	&unswizzle("%rdx", "%rsi", "%rbp", "%rbx", "%rax");
+	# rdi = tmp = pG
+
+	#
+	# Call mod_mul_a1(pDst,  pSrc1, pSrc2, pM, pData)
+	#                 result result pG     M   Data
+$code.=<<___;
+	 mov	(+$pResult_offset)(%rsp), %rsi
+	 call	mont_mul_a3b
+
+	#
+	# finish loop
+	 mov	(+$loop_idx_offset)(%rsp), %rcx
+	 sub	\$5, %rcx
+	 mov	%rcx, (+$loop_idx_offset)(%rsp)
+	 jge	main_loop_a3b
+
+	#
+
+end_main_loop_a3b:
+	# transform result out of Montgomery space
+	# result = reduce(result)
+	 mov	(+$pResult_offset)(%rsp), %rdx
+	 pxor	%xmm4, %xmm4
+	 movdqu	(+16*0)(%rdx), %xmm0
+	 movdqu	(+16*1)(%rdx), %xmm1
+	 movdqu	(+16*2)(%rdx), %xmm2
+	 movdqu	(+16*3)(%rdx), %xmm3
+	 movdqa	%xmm4, (+$tmp16_offset+16*4)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*5)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*6)(%rsp)
+	 movdqa	%xmm4, (+$tmp16_offset+16*7)(%rsp)
+	 movdqa	%xmm0, (+$tmp16_offset+16*0)(%rsp)
+	 movdqa	%xmm1, (+$tmp16_offset+16*1)(%rsp)
+	 movdqa	%xmm2, (+$tmp16_offset+16*2)(%rsp)
+	 movdqa	%xmm3, (+$tmp16_offset+16*3)(%rsp)
+	 call	mont_reduce
+
+	# If result > m, subtract m
+	# load result into r15:r8
+	 mov	(+$pResult_offset)(%rsp), %rax
+	 mov	(+8*0)(%rax), %r8
+	 mov	(+8*1)(%rax), %r9
+	 mov	(+8*2)(%rax), %r10
+	 mov	(+8*3)(%rax), %r11
+	 mov	(+8*4)(%rax), %r12
+	 mov	(+8*5)(%rax), %r13
+	 mov	(+8*6)(%rax), %r14
+	 mov	(+8*7)(%rax), %r15
+
+	# subtract m
+	 mov	(+$pData_offset)(%rsp), %rbx
+	 add	\$$M, %rbx
+
+	 sub	(+8*0)(%rbx), %r8
+	 sbb	(+8*1)(%rbx), %r9
+	 sbb	(+8*2)(%rbx), %r10
+	 sbb	(+8*3)(%rbx), %r11
+	 sbb	(+8*4)(%rbx), %r12
+	 sbb	(+8*5)(%rbx), %r13
+	 sbb	(+8*6)(%rbx), %r14
+	 sbb	(+8*7)(%rbx), %r15
+
+	# if Carry is clear, replace result with difference
+	 mov	(+8*0)(%rax), %rsi
+	 mov	(+8*1)(%rax), %rdi
+	 mov	(+8*2)(%rax), %rcx
+	 mov	(+8*3)(%rax), %rdx
+	 cmovnc	%r8, %rsi
+	 cmovnc	%r9, %rdi
+	 cmovnc	%r10, %rcx
+	 cmovnc	%r11, %rdx
+	 mov	%rsi, (+8*0)(%rax)
+	 mov	%rdi, (+8*1)(%rax)
+	 mov	%rcx, (+8*2)(%rax)
+	 mov	%rdx, (+8*3)(%rax)
+
+	 mov	(+8*4)(%rax), %rsi
+	 mov	(+8*5)(%rax), %rdi
+	 mov	(+8*6)(%rax), %rcx
+	 mov	(+8*7)(%rax), %rdx
+	 cmovnc	%r12, %rsi
+	 cmovnc	%r13, %rdi
+	 cmovnc	%r14, %rcx
+	 cmovnc	%r15, %rdx
+	 mov	%rsi, (+8*4)(%rax)
+	 mov	%rdi, (+8*5)(%rax)
+	 mov	%rcx, (+8*6)(%rax)
+	 mov	%rdx, (+8*7)(%rax)
+
+	 mov	(+$rsp_offset)(%rsp), %rsi
+	 mov	0(%rsi),%r15
+	 mov	8(%rsi),%r14
+	 mov	16(%rsi),%r13
+	 mov	24(%rsi),%r12
+	 mov	32(%rsi),%rbx
+	 mov	40(%rsi),%rbp
+	 lea	48(%rsi),%rsp
+.Lepilogue:
+	 ret
+.size mod_exp_512, . - mod_exp_512
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+my $rec="%rcx";
+my $frame="%rdx";
+my $context="%r8";
+my $disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mod_exp_512_se_handler,\@abi-omnipotent
+.align	16
+mod_exp_512_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	lea	.Lbody(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lin_prologue
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	lea	.Lepilogue(%rip),%r10
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lin_prologue
+
+	mov	$rsp_offset(%rax),%rax	# pull saved Rsp
+
+	mov	32(%rax),%rbx
+	mov	40(%rax),%rbp
+	mov	24(%rax),%r12
+	mov	16(%rax),%r13
+	mov	8(%rax),%r14
+	mov	0(%rax),%r15
+	lea	48(%rax),%rax
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lin_prologue:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mod_exp_512_se_handler,.-mod_exp_512_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_mod_exp_512
+	.rva	.LSEH_end_mod_exp_512
+	.rva	.LSEH_info_mod_exp_512
+
+.section	.xdata
+.align	8
+.LSEH_info_mod_exp_512:
+	.byte	9,0,0,0
+	.rva	mod_exp_512_se_handler
+___
+}
+
+sub reg_part {
+my ($reg,$conv)=@_;
+    if ($reg =~ /%r[0-9]+/)	{ $reg .= $conv; }
+    elsif ($conv eq "b")	{ $reg =~ s/%[er]([^x]+)x?/%$1l/;	}
+    elsif ($conv eq "w")	{ $reg =~ s/%[er](.+)/%$1/;		}
+    elsif ($conv eq "d")	{ $reg =~ s/%[er](.+)/%e$1/;		}
+    return $reg;
+}
+
+$code =~ s/(%[a-z0-9]+)#([bwd])/reg_part($1,$2)/gem;
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+$code =~ s/(\(\+[^)]+\))/eval $1/gem;
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/rsaz-avx2.pl b/crypto/bn/asm/rsaz-avx2.pl
new file mode 100644
index 0000000..3eb9556
--- /dev/null
+++ b/crypto/bn/asm/rsaz-avx2.pl
@@ -0,0 +1,1875 @@
+#!/usr/bin/env perl
+
+##############################################################################
+#                                                                            #
+#  Copyright (c) 2012, Intel Corporation                                     #
+#                                                                            #
+#  All rights reserved.                                                      #
+#                                                                            #
+#  Redistribution and use in source and binary forms, with or without        #
+#  modification, are permitted provided that the following conditions are    #
+#  met:                                                                      #
+#                                                                            #
+#  *  Redistributions of source code must retain the above copyright         #
+#     notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+#  *  Redistributions in binary form must reproduce the above copyright      #
+#     notice, this list of conditions and the following disclaimer in the    #
+#     documentation and/or other materials provided with the                 #
+#     distribution.                                                          #
+#                                                                            #
+#  *  Neither the name of the Intel Corporation nor the names of its         #
+#     contributors may be used to endorse or promote products derived from   #
+#     this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Corporation, Israel Development Center, Haifa, Israel            #
+# (2) University of Haifa, Israel                                            #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular             #
+#     Exponentiation,  Using Advanced Vector Instructions Architectures",    #
+#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,   #
+#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012              #
+# [2] S. Gueron: "Efficient Software Implementations of Modular              #
+#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).  #
+# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE         #
+#     Proceedings of 9th International Conference on Information Technology: #
+#     New Generations (ITNG 2012), pp.821-823 (2012)                         #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 1024-bit modular exponentiation, for optimizing RSA2048      #
+#     on AVX2 capable x86_64 platforms",                                     #
+#     http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
+##############################################################################
+#
+# +13% improvement over original submission by <appro@openssl.org>
+#
+# rsa2048 sign/sec	OpenSSL 1.0.1	scalar(*)	this
+# 2.3GHz Haswell	621		765/+23%	1113/+79%
+#
+# (*)	if system doesn't support AVX2, for reference purposes;
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.19) + ($1>=2.22);
+}
+
+if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$avx = ($1>=2.09) + ($1>=2.10);
+}
+
+if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$avx = ($1>=10) + ($1>=11);
+}
+
+open OUT,"| $^X $xlate $flavour $output";
+*STDOUT = *OUT;
+
+if ($avx>1) {{{
+{ # void AMS_WW(
+my $rp="%rdi";	# BN_ULONG *rp,
+my $ap="%rsi";	# const BN_ULONG *ap,
+my $np="%rdx";	# const BN_ULONG *np,
+my $n0="%ecx";	# const BN_ULONG n0,
+my $rep="%r8d";	# int repeat);
+
+# The registers that hold the accumulated redundant result
+# The AMM works on 1024 bit operands, and redundant word size is 29
+# Therefore: ceil(1024/29)/4 = 9
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
+# Registers that hold the broadcasted words of bp, currently used
+my $B1="%ymm10";
+my $B2="%ymm11";
+# Registers that hold the broadcasted words of Y, currently used
+my $Y1="%ymm12";
+my $Y2="%ymm13";
+# Helper registers
+my $TEMP1="%ymm14";
+my $AND_MASK="%ymm15";
+# alu registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d";			# loop counter
+my $tmp = "%r15";
+
+my $FrameSize=32*18+32*8;	# place for A^2 and 2*A
+
+my $aap=$r0;
+my $tp0="%rbx";
+my $tp1=$r3;
+my $tpa=$tmp;
+
+$np="%r13";			# reassigned argument
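+
+# Informally: operands are kept in a redundant base-2^29 representation, so a
+# 1024-bit number occupies ceil(1024/29) = 36 digits of 29 bits, packed four
+# per ymm register (hence the nine ACC registers above).  Digit products stay
+# well below 64 bits, which lets vpmuludq results accumulate in 64-bit lanes
+# for many iterations before carries are propagated.  The AMS/AMM naming
+# presumably stands for "Almost Montgomery Square/Multiply, Word-by-Word",
+# as in the Gueron-Krasnov papers referenced above.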
+
+$code.=<<___;
+.text
+
+.globl	rsaz_1024_sqr_avx2
+.type	rsaz_1024_sqr_avx2,\@function,5
+.align	64
+rsaz_1024_sqr_avx2:		# 702 cycles, 14% faster than rsaz_1024_mul_avx2
+	lea	(%rsp), %rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	lea	-0xa8(%rsp),%rsp
+	vmovaps	%xmm6,-0xd8(%rax)
+	vmovaps	%xmm7,-0xc8(%rax)
+	vmovaps	%xmm8,-0xb8(%rax)
+	vmovaps	%xmm9,-0xa8(%rax)
+	vmovaps	%xmm10,-0x98(%rax)
+	vmovaps	%xmm11,-0x88(%rax)
+	vmovaps	%xmm12,-0x78(%rax)
+	vmovaps	%xmm13,-0x68(%rax)
+	vmovaps	%xmm14,-0x58(%rax)
+	vmovaps	%xmm15,-0x48(%rax)
+.Lsqr_1024_body:
+___
+$code.=<<___;
+	mov	%rax,%rbp
+	mov	%rdx, $np			# reassigned argument
+	sub	\$$FrameSize, %rsp
+	mov	$np, $tmp
+	sub	\$-128, $rp			# size optimization
+	sub	\$-128, $ap
+	sub	\$-128, $np
+
+	and	\$4095, $tmp			# see if $np crosses page
+	add	\$32*10, $tmp
+	shr	\$12, $tmp
+	vpxor	$ACC9,$ACC9,$ACC9
+	jz	.Lsqr_1024_no_n_copy
+
+	# unaligned 256-bit load that crosses page boundary can
+	# cause >2x performance degradation here, so if $np does
+	# cross page boundary, copy it to stack and make sure stack
+	# frame doesn't...
+	sub		\$32*10,%rsp
+	vmovdqu		32*0-128($np), $ACC0
+	and		\$-2048, %rsp
+	vmovdqu		32*1-128($np), $ACC1
+	vmovdqu		32*2-128($np), $ACC2
+	vmovdqu		32*3-128($np), $ACC3
+	vmovdqu		32*4-128($np), $ACC4
+	vmovdqu		32*5-128($np), $ACC5
+	vmovdqu		32*6-128($np), $ACC6
+	vmovdqu		32*7-128($np), $ACC7
+	vmovdqu		32*8-128($np), $ACC8
+	lea		$FrameSize+128(%rsp),$np
+	vmovdqu		$ACC0, 32*0-128($np)
+	vmovdqu		$ACC1, 32*1-128($np)
+	vmovdqu		$ACC2, 32*2-128($np)
+	vmovdqu		$ACC3, 32*3-128($np)
+	vmovdqu		$ACC4, 32*4-128($np)
+	vmovdqu		$ACC5, 32*5-128($np)
+	vmovdqu		$ACC6, 32*6-128($np)
+	vmovdqu		$ACC7, 32*7-128($np)
+	vmovdqu		$ACC8, 32*8-128($np)
+	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero
+
+.Lsqr_1024_no_n_copy:
+	and		\$-1024, %rsp
+
+	vmovdqu		32*1-128($ap), $ACC1
+	vmovdqu		32*2-128($ap), $ACC2
+	vmovdqu		32*3-128($ap), $ACC3
+	vmovdqu		32*4-128($ap), $ACC4
+	vmovdqu		32*5-128($ap), $ACC5
+	vmovdqu		32*6-128($ap), $ACC6
+	vmovdqu		32*7-128($ap), $ACC7
+	vmovdqu		32*8-128($ap), $ACC8
+
+	lea	192(%rsp), $tp0			# 64+128=192
+	vpbroadcastq	.Land_mask(%rip), $AND_MASK
+	jmp	.LOOP_GRANDE_SQR_1024
+
+.align	32
+.LOOP_GRANDE_SQR_1024:
+	lea	32*18+128(%rsp), $aap		# size optimization
+	lea	448(%rsp), $tp1			# 64+128+256=448
+
+	# the squaring is performed as described in Variant B of
+	# "Speeding up Big-Number Squaring", so start by calculating
+	# the A*2=A+A vector
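+	# (with 2*A precomputed, each cross product a_i*(2*a_j) already carries
+	# its factor of two, so no separate doubling pass is needed; the doubled
+	# 29-bit digits are at most 30 bits, so the vpmuludq products still fit
+	# comfortably in the 64-bit accumulator lanes)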
+	vpaddq		$ACC1, $ACC1, $ACC1
+	 vpbroadcastq	32*0-128($ap), $B1
+	vpaddq		$ACC2, $ACC2, $ACC2
+	vmovdqa		$ACC1, 32*0-128($aap)
+	vpaddq		$ACC3, $ACC3, $ACC3
+	vmovdqa		$ACC2, 32*1-128($aap)
+	vpaddq		$ACC4, $ACC4, $ACC4
+	vmovdqa		$ACC3, 32*2-128($aap)
+	vpaddq		$ACC5, $ACC5, $ACC5
+	vmovdqa		$ACC4, 32*3-128($aap)
+	vpaddq		$ACC6, $ACC6, $ACC6
+	vmovdqa		$ACC5, 32*4-128($aap)
+	vpaddq		$ACC7, $ACC7, $ACC7
+	vmovdqa		$ACC6, 32*5-128($aap)
+	vpaddq		$ACC8, $ACC8, $ACC8
+	vmovdqa		$ACC7, 32*6-128($aap)
+	vpxor		$ACC9, $ACC9, $ACC9
+	vmovdqa		$ACC8, 32*7-128($aap)
+
+	vpmuludq	32*0-128($ap), $B1, $ACC0
+	 vpbroadcastq	32*1-128($ap), $B2
+	 vmovdqu	$ACC9, 32*9-192($tp0)	# zero upper half
+	vpmuludq	$B1, $ACC1, $ACC1
+	 vmovdqu	$ACC9, 32*10-448($tp1)
+	vpmuludq	$B1, $ACC2, $ACC2
+	 vmovdqu	$ACC9, 32*11-448($tp1)
+	vpmuludq	$B1, $ACC3, $ACC3
+	 vmovdqu	$ACC9, 32*12-448($tp1)
+	vpmuludq	$B1, $ACC4, $ACC4
+	 vmovdqu	$ACC9, 32*13-448($tp1)
+	vpmuludq	$B1, $ACC5, $ACC5
+	 vmovdqu	$ACC9, 32*14-448($tp1)
+	vpmuludq	$B1, $ACC6, $ACC6
+	 vmovdqu	$ACC9, 32*15-448($tp1)
+	vpmuludq	$B1, $ACC7, $ACC7
+	 vmovdqu	$ACC9, 32*16-448($tp1)
+	vpmuludq	$B1, $ACC8, $ACC8
+	 vpbroadcastq	32*2-128($ap), $B1
+	 vmovdqu	$ACC9, 32*17-448($tp1)
+
+	mov	$ap, $tpa
+	mov 	\$4, $i
+	jmp	.Lsqr_entry_1024
+___
+$TEMP0=$Y1;
+$TEMP2=$Y2;
+$code.=<<___;
+.align	32
+.LOOP_SQR_1024:
+	 vpbroadcastq	32*1-128($tpa), $B2
+	vpmuludq	32*0-128($ap), $B1, $ACC0
+	vpaddq		32*0-192($tp0), $ACC0, $ACC0
+	vpmuludq	32*0-128($aap), $B1, $ACC1
+	vpaddq		32*1-192($tp0), $ACC1, $ACC1
+	vpmuludq	32*1-128($aap), $B1, $ACC2
+	vpaddq		32*2-192($tp0), $ACC2, $ACC2
+	vpmuludq	32*2-128($aap), $B1, $ACC3
+	vpaddq		32*3-192($tp0), $ACC3, $ACC3
+	vpmuludq	32*3-128($aap), $B1, $ACC4
+	vpaddq		32*4-192($tp0), $ACC4, $ACC4
+	vpmuludq	32*4-128($aap), $B1, $ACC5
+	vpaddq		32*5-192($tp0), $ACC5, $ACC5
+	vpmuludq	32*5-128($aap), $B1, $ACC6
+	vpaddq		32*6-192($tp0), $ACC6, $ACC6
+	vpmuludq	32*6-128($aap), $B1, $ACC7
+	vpaddq		32*7-192($tp0), $ACC7, $ACC7
+	vpmuludq	32*7-128($aap), $B1, $ACC8
+	 vpbroadcastq	32*2-128($tpa), $B1
+	vpaddq		32*8-192($tp0), $ACC8, $ACC8
+.Lsqr_entry_1024:
+	vmovdqu		$ACC0, 32*0-192($tp0)
+	vmovdqu		$ACC1, 32*1-192($tp0)
+
+	vpmuludq	32*1-128($ap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC2, $ACC2
+	vpmuludq	32*1-128($aap), $B2, $TEMP1
+	vpaddq		$TEMP1, $ACC3, $ACC3
+	vpmuludq	32*2-128($aap), $B2, $TEMP2
+	vpaddq		$TEMP2, $ACC4, $ACC4
+	vpmuludq	32*3-128($aap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC5, $ACC5
+	vpmuludq	32*4-128($aap), $B2, $TEMP1
+	vpaddq		$TEMP1, $ACC6, $ACC6
+	vpmuludq	32*5-128($aap), $B2, $TEMP2
+	vpaddq		$TEMP2, $ACC7, $ACC7
+	vpmuludq	32*6-128($aap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC8, $ACC8
+	vpmuludq	32*7-128($aap), $B2, $ACC0
+	 vpbroadcastq	32*3-128($tpa), $B2
+	vpaddq		32*9-192($tp0), $ACC0, $ACC0
+
+	vmovdqu		$ACC2, 32*2-192($tp0)
+	vmovdqu		$ACC3, 32*3-192($tp0)
+
+	vpmuludq	32*2-128($ap), $B1, $TEMP2
+	vpaddq		$TEMP2, $ACC4, $ACC4
+	vpmuludq	32*2-128($aap), $B1, $TEMP0
+	vpaddq		$TEMP0, $ACC5, $ACC5
+	vpmuludq	32*3-128($aap), $B1, $TEMP1
+	vpaddq		$TEMP1, $ACC6, $ACC6
+	vpmuludq	32*4-128($aap), $B1, $TEMP2
+	vpaddq		$TEMP2, $ACC7, $ACC7
+	vpmuludq	32*5-128($aap), $B1, $TEMP0
+	vpaddq		$TEMP0, $ACC8, $ACC8
+	vpmuludq	32*6-128($aap), $B1, $TEMP1
+	vpaddq		$TEMP1, $ACC0, $ACC0
+	vpmuludq	32*7-128($aap), $B1, $ACC1
+	 vpbroadcastq	32*4-128($tpa), $B1
+	vpaddq		32*10-448($tp1), $ACC1, $ACC1
+
+	vmovdqu		$ACC4, 32*4-192($tp0)
+	vmovdqu		$ACC5, 32*5-192($tp0)
+
+	vpmuludq	32*3-128($ap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC6, $ACC6
+	vpmuludq	32*3-128($aap), $B2, $TEMP1
+	vpaddq		$TEMP1, $ACC7, $ACC7
+	vpmuludq	32*4-128($aap), $B2, $TEMP2
+	vpaddq		$TEMP2, $ACC8, $ACC8
+	vpmuludq	32*5-128($aap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC0, $ACC0
+	vpmuludq	32*6-128($aap), $B2, $TEMP1
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vpmuludq	32*7-128($aap), $B2, $ACC2
+	 vpbroadcastq	32*5-128($tpa), $B2
+	vpaddq		32*11-448($tp1), $ACC2, $ACC2	
+
+	vmovdqu		$ACC6, 32*6-192($tp0)
+	vmovdqu		$ACC7, 32*7-192($tp0)
+
+	vpmuludq	32*4-128($ap), $B1, $TEMP0
+	vpaddq		$TEMP0, $ACC8, $ACC8
+	vpmuludq	32*4-128($aap), $B1, $TEMP1
+	vpaddq		$TEMP1, $ACC0, $ACC0
+	vpmuludq	32*5-128($aap), $B1, $TEMP2
+	vpaddq		$TEMP2, $ACC1, $ACC1
+	vpmuludq	32*6-128($aap), $B1, $TEMP0
+	vpaddq		$TEMP0, $ACC2, $ACC2
+	vpmuludq	32*7-128($aap), $B1, $ACC3
+	 vpbroadcastq	32*6-128($tpa), $B1
+	vpaddq		32*12-448($tp1), $ACC3, $ACC3
+
+	vmovdqu		$ACC8, 32*8-192($tp0)
+	vmovdqu		$ACC0, 32*9-192($tp0)
+	lea		8($tp0), $tp0
+
+	vpmuludq	32*5-128($ap), $B2, $TEMP2
+	vpaddq		$TEMP2, $ACC1, $ACC1
+	vpmuludq	32*5-128($aap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC2, $ACC2
+	vpmuludq	32*6-128($aap), $B2, $TEMP1
+	vpaddq		$TEMP1, $ACC3, $ACC3
+	vpmuludq	32*7-128($aap), $B2, $ACC4
+	 vpbroadcastq	32*7-128($tpa), $B2
+	vpaddq		32*13-448($tp1), $ACC4, $ACC4
+
+	vmovdqu		$ACC1, 32*10-448($tp1)
+	vmovdqu		$ACC2, 32*11-448($tp1)
+
+	vpmuludq	32*6-128($ap), $B1, $TEMP0
+	vpaddq		$TEMP0, $ACC3, $ACC3
+	vpmuludq	32*6-128($aap), $B1, $TEMP1
+	 vpbroadcastq	32*8-128($tpa), $ACC0		# borrow $ACC0 for $B1
+	vpaddq		$TEMP1, $ACC4, $ACC4
+	vpmuludq	32*7-128($aap), $B1, $ACC5
+	 vpbroadcastq	32*0+8-128($tpa), $B1		# for next iteration
+	vpaddq		32*14-448($tp1), $ACC5, $ACC5
+
+	vmovdqu		$ACC3, 32*12-448($tp1)
+	vmovdqu		$ACC4, 32*13-448($tp1)
+	lea		8($tpa), $tpa
+
+	vpmuludq	32*7-128($ap), $B2, $TEMP0
+	vpaddq		$TEMP0, $ACC5, $ACC5
+	vpmuludq	32*7-128($aap), $B2, $ACC6
+	vpaddq		32*15-448($tp1), $ACC6, $ACC6
+
+	vpmuludq	32*8-128($ap), $ACC0, $ACC7
+	vmovdqu		$ACC5, 32*14-448($tp1)
+	vpaddq		32*16-448($tp1), $ACC7, $ACC7
+	vmovdqu		$ACC6, 32*15-448($tp1)
+	vmovdqu		$ACC7, 32*16-448($tp1)
+	lea		8($tp1), $tp1
+
+	dec	$i        
+	jnz	.LOOP_SQR_1024
+___
+$ZERO = $ACC9;
+$TEMP0 = $B1;
+$TEMP2 = $B2;
+$TEMP3 = $Y1;
+$TEMP4 = $Y2;
+$code.=<<___;
+	# we need to fix indexes 32-39 to avoid overflow
+	vmovdqu		32*8(%rsp), $ACC8		# 32*8-192($tp0),
+	vmovdqu		32*9(%rsp), $ACC1		# 32*9-192($tp0)
+	vmovdqu		32*10(%rsp), $ACC2		# 32*10-192($tp0)
+	lea		192(%rsp), $tp0			# 64+128=192
+
+	vpsrlq		\$29, $ACC8, $TEMP1
+	vpand		$AND_MASK, $ACC8, $ACC8
+	vpsrlq		\$29, $ACC1, $TEMP2
+	vpand		$AND_MASK, $ACC1, $ACC1
+
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpxor		$ZERO, $ZERO, $ZERO
+	vpermq		\$0x93, $TEMP2, $TEMP2
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC8, $ACC8
+	vpblendd	\$3, $TEMP2, $ZERO, $TEMP2
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vmovdqu		$ACC1, 32*9-192($tp0)
+	vmovdqu		$ACC2, 32*10-192($tp0)
+
+	mov	(%rsp), %rax
+	mov	8(%rsp), $r1
+	mov	16(%rsp), $r2
+	mov	24(%rsp), $r3
+	vmovdqu	32*1(%rsp), $ACC1
+	vmovdqu	32*2-192($tp0), $ACC2
+	vmovdqu	32*3-192($tp0), $ACC3
+	vmovdqu	32*4-192($tp0), $ACC4
+	vmovdqu	32*5-192($tp0), $ACC5
+	vmovdqu	32*6-192($tp0), $ACC6
+	vmovdqu	32*7-192($tp0), $ACC7
+
+	mov	%rax, $r0
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+	vmovd	%eax, $Y1
+
+	mov	%rax, %rdx
+	imulq	-128($np), %rax
+	 vpbroadcastq	$Y1, $Y1
+	add	%rax, $r0
+	mov	%rdx, %rax
+	imulq	8-128($np), %rax
+	shr	\$29, $r0
+	add	%rax, $r1
+	mov	%rdx, %rax
+	imulq	16-128($np), %rax
+	add	$r0, $r1
+	add	%rax, $r2
+	imulq	24-128($np), %rdx
+	add	%rdx, $r3
+
+	mov	$r1, %rax
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+
+	mov \$9, $i
+	jmp .LOOP_REDUCE_1024
+
+.align	32
+.LOOP_REDUCE_1024:
+	vmovd	%eax, $Y2
+	vpbroadcastq	$Y2, $Y2
+
+	vpmuludq	32*1-128($np), $Y1, $TEMP0
+	 mov	%rax, %rdx
+	 imulq	-128($np), %rax
+	vpaddq		$TEMP0, $ACC1, $ACC1
+	 add	%rax, $r1
+	vpmuludq	32*2-128($np), $Y1, $TEMP1
+	 mov	%rdx, %rax
+	 imulq	8-128($np), %rax
+	vpaddq		$TEMP1, $ACC2, $ACC2
+	vpmuludq	32*3-128($np), $Y1, $TEMP2
+	 .byte	0x67
+	 add	%rax, $r2
+	 .byte	0x67
+	 mov	%rdx, %rax
+	 imulq	16-128($np), %rax
+	 shr	\$29, $r1
+	vpaddq		$TEMP2, $ACC3, $ACC3
+	vpmuludq	32*4-128($np), $Y1, $TEMP0
+	 add	%rax, $r3
+	 add	$r1, $r2
+	vpaddq		$TEMP0, $ACC4, $ACC4
+	vpmuludq	32*5-128($np), $Y1, $TEMP1
+	 mov	$r2, %rax
+	 imull	$n0, %eax
+	vpaddq		$TEMP1, $ACC5, $ACC5
+	vpmuludq	32*6-128($np), $Y1, $TEMP2
+	 and	\$0x1fffffff, %eax
+	vpaddq		$TEMP2, $ACC6, $ACC6
+	vpmuludq	32*7-128($np), $Y1, $TEMP0
+	vpaddq		$TEMP0, $ACC7, $ACC7
+	vpmuludq	32*8-128($np), $Y1, $TEMP1
+	 vmovd	%eax, $Y1
+	 #vmovdqu	32*1-8-128($np), $TEMP2		# moved below
+	vpaddq		$TEMP1, $ACC8, $ACC8
+	 #vmovdqu	32*2-8-128($np), $TEMP0		# moved below
+	 vpbroadcastq	$Y1, $Y1
+
+	vpmuludq	32*1-8-128($np), $Y2, $TEMP2	# see above
+	vmovdqu		32*3-8-128($np), $TEMP1
+	 mov	%rax, %rdx
+	 imulq	-128($np), %rax
+	vpaddq		$TEMP2, $ACC1, $ACC1
+	vpmuludq	32*2-8-128($np), $Y2, $TEMP0	# see above
+	vmovdqu		32*4-8-128($np), $TEMP2
+	 add	%rax, $r2
+	 mov	%rdx, %rax
+	 imulq	8-128($np), %rax
+	vpaddq		$TEMP0, $ACC2, $ACC2
+	 add	$r3, %rax
+	 shr	\$29, $r2
+	vpmuludq	$Y2, $TEMP1, $TEMP1
+	vmovdqu		32*5-8-128($np), $TEMP0
+	 add	$r2, %rax
+	vpaddq		$TEMP1, $ACC3, $ACC3
+	vpmuludq	$Y2, $TEMP2, $TEMP2
+	vmovdqu		32*6-8-128($np), $TEMP1
+	 .byte	0x67
+	 mov	%rax, $r3
+	 imull	$n0, %eax
+	vpaddq		$TEMP2, $ACC4, $ACC4
+	vpmuludq	$Y2, $TEMP0, $TEMP0
+	.byte	0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00	# vmovdqu		32*7-8-128($np), $TEMP2
+	 and	\$0x1fffffff, %eax
+	vpaddq		$TEMP0, $ACC5, $ACC5
+	vpmuludq	$Y2, $TEMP1, $TEMP1
+	vmovdqu		32*8-8-128($np), $TEMP0
+	vpaddq		$TEMP1, $ACC6, $ACC6
+	vpmuludq	$Y2, $TEMP2, $TEMP2
+	vmovdqu		32*9-8-128($np), $ACC9
+	 vmovd	%eax, $ACC0			# borrow ACC0 for Y2
+	 imulq	-128($np), %rax
+	vpaddq		$TEMP2, $ACC7, $ACC7
+	vpmuludq	$Y2, $TEMP0, $TEMP0
+	 vmovdqu	32*1-16-128($np), $TEMP1
+	 vpbroadcastq	$ACC0, $ACC0
+	vpaddq		$TEMP0, $ACC8, $ACC8
+	vpmuludq	$Y2, $ACC9, $ACC9
+	 vmovdqu	32*2-16-128($np), $TEMP2
+	 add	%rax, $r3
+
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+	 vmovdqu	32*1-24-128($np), $ACC0
+	vpmuludq	$Y1, $TEMP1, $TEMP1
+	vmovdqu		32*3-16-128($np), $TEMP0
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	 vpmuludq	$Y2, $ACC0, $ACC0
+	vpmuludq	$Y1, $TEMP2, $TEMP2
+	.byte	0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff	# vmovdqu		32*4-16-128($np), $TEMP1
+	 vpaddq		$ACC1, $ACC0, $ACC0
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vpmuludq	$Y1, $TEMP0, $TEMP0
+	vmovdqu		32*5-16-128($np), $TEMP2
+	 .byte	0x67
+	 vmovq		$ACC0, %rax
+	 vmovdqu	$ACC0, (%rsp)		# transfer $r0-$r3
+	vpaddq		$TEMP0, $ACC3, $ACC3
+	vpmuludq	$Y1, $TEMP1, $TEMP1
+	vmovdqu		32*6-16-128($np), $TEMP0
+	vpaddq		$TEMP1, $ACC4, $ACC4
+	vpmuludq	$Y1, $TEMP2, $TEMP2
+	vmovdqu		32*7-16-128($np), $TEMP1
+	vpaddq		$TEMP2, $ACC5, $ACC5
+	vpmuludq	$Y1, $TEMP0, $TEMP0
+	vmovdqu		32*8-16-128($np), $TEMP2
+	vpaddq		$TEMP0, $ACC6, $ACC6
+	vpmuludq	$Y1, $TEMP1, $TEMP1
+	 shr	\$29, $r3
+	vmovdqu		32*9-16-128($np), $TEMP0
+	 add	$r3, %rax
+	vpaddq		$TEMP1, $ACC7, $ACC7
+	vpmuludq	$Y1, $TEMP2, $TEMP2
+	 #vmovdqu	32*2-24-128($np), $TEMP1	# moved below
+	 mov	%rax, $r0
+	 imull	$n0, %eax
+	vpaddq		$TEMP2, $ACC8, $ACC8
+	vpmuludq	$Y1, $TEMP0, $TEMP0
+	 and	\$0x1fffffff, %eax
+	 vmovd	%eax, $Y1
+	 vmovdqu	32*3-24-128($np), $TEMP2
+	.byte	0x67
+	vpaddq		$TEMP0, $ACC9, $ACC9
+	 vpbroadcastq	$Y1, $Y1
+
+	vpmuludq	32*2-24-128($np), $Y2, $TEMP1	# see above
+	vmovdqu		32*4-24-128($np), $TEMP0
+	 mov	%rax, %rdx
+	 imulq	-128($np), %rax
+	 mov	8(%rsp), $r1
+	vpaddq		$TEMP1, $ACC2, $ACC1
+	vpmuludq	$Y2, $TEMP2, $TEMP2
+	vmovdqu		32*5-24-128($np), $TEMP1
+	 add	%rax, $r0
+	 mov	%rdx, %rax
+	 imulq	8-128($np), %rax
+	 .byte	0x67
+	 shr	\$29, $r0
+	 mov	16(%rsp), $r2
+	vpaddq		$TEMP2, $ACC3, $ACC2
+	vpmuludq	$Y2, $TEMP0, $TEMP0
+	vmovdqu		32*6-24-128($np), $TEMP2
+	 add	%rax, $r1
+	 mov	%rdx, %rax
+	 imulq	16-128($np), %rax
+	vpaddq		$TEMP0, $ACC4, $ACC3
+	vpmuludq	$Y2, $TEMP1, $TEMP1
+	vmovdqu		32*7-24-128($np), $TEMP0
+	 imulq	24-128($np), %rdx		# future $r3
+	 add	%rax, $r2
+	 lea	($r0,$r1), %rax
+	vpaddq		$TEMP1, $ACC5, $ACC4
+	vpmuludq	$Y2, $TEMP2, $TEMP2
+	vmovdqu		32*8-24-128($np), $TEMP1
+	 mov	%rax, $r1
+	 imull	$n0, %eax
+	vpmuludq	$Y2, $TEMP0, $TEMP0
+	vpaddq		$TEMP2, $ACC6, $ACC5
+	vmovdqu		32*9-24-128($np), $TEMP2
+	 and	\$0x1fffffff, %eax
+	vpaddq		$TEMP0, $ACC7, $ACC6
+	vpmuludq	$Y2, $TEMP1, $TEMP1
+	 add	24(%rsp), %rdx
+	vpaddq		$TEMP1, $ACC8, $ACC7
+	vpmuludq	$Y2, $TEMP2, $TEMP2
+	vpaddq		$TEMP2, $ACC9, $ACC8
+	 vmovq	$r3, $ACC9
+	 mov	%rdx, $r3
+
+	dec	$i
+	jnz	.LOOP_REDUCE_1024
+___
+($ACC0,$Y2)=($Y2,$ACC0);
+$code.=<<___;
+	lea	448(%rsp), $tp1			# size optimization
+	vpaddq	$ACC9, $Y2, $ACC0
+	vpxor	$ZERO, $ZERO, $ZERO
+
+	vpaddq		32*9-192($tp0), $ACC0, $ACC0
+	vpaddq		32*10-448($tp1), $ACC1, $ACC1
+	vpaddq		32*11-448($tp1), $ACC2, $ACC2
+	vpaddq		32*12-448($tp1), $ACC3, $ACC3
+	vpaddq		32*13-448($tp1), $ACC4, $ACC4
+	vpaddq		32*14-448($tp1), $ACC5, $ACC5
+	vpaddq		32*15-448($tp1), $ACC6, $ACC6
+	vpaddq		32*16-448($tp1), $ACC7, $ACC7
+	vpaddq		32*17-448($tp1), $ACC8, $ACC8
+
+	vpsrlq		\$29, $ACC0, $TEMP1
+	vpand		$AND_MASK, $ACC0, $ACC0
+	vpsrlq		\$29, $ACC1, $TEMP2
+	vpand		$AND_MASK, $ACC1, $ACC1
+	vpsrlq		\$29, $ACC2, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC2, $ACC2
+	vpsrlq		\$29, $ACC3, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC3, $ACC3
+	vpermq		\$0x93, $TEMP3, $TEMP3
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP4, $TEMP4
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC0, $ACC0
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
+	vpaddq		$TEMP3, $ACC3, $ACC3
+	vpaddq		$TEMP4, $ACC4, $ACC4
+
+	vpsrlq		\$29, $ACC0, $TEMP1
+	vpand		$AND_MASK, $ACC0, $ACC0
+	vpsrlq		\$29, $ACC1, $TEMP2
+	vpand		$AND_MASK, $ACC1, $ACC1
+	vpsrlq		\$29, $ACC2, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC2, $ACC2
+	vpsrlq		\$29, $ACC3, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC3, $ACC3
+	vpermq		\$0x93, $TEMP3, $TEMP3
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP4, $TEMP4
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC0, $ACC0
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vmovdqu		$ACC0, 32*0-128($rp)
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vmovdqu		$ACC1, 32*1-128($rp)
+	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
+	vpaddq		$TEMP3, $ACC3, $ACC3
+	vmovdqu		$ACC2, 32*2-128($rp)
+	vpaddq		$TEMP4, $ACC4, $ACC4
+	vmovdqu		$ACC3, 32*3-128($rp)
+___
+$TEMP5=$ACC0;
+$code.=<<___;
+	vpsrlq		\$29, $ACC4, $TEMP1
+	vpand		$AND_MASK, $ACC4, $ACC4
+	vpsrlq		\$29, $ACC5, $TEMP2
+	vpand		$AND_MASK, $ACC5, $ACC5
+	vpsrlq		\$29, $ACC6, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC6, $ACC6
+	vpsrlq		\$29, $ACC7, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC7, $ACC7
+	vpsrlq		\$29, $ACC8, $TEMP5
+	vpermq		\$0x93, $TEMP3, $TEMP3
+	vpand		$AND_MASK, $ACC8, $ACC8
+	vpermq		\$0x93, $TEMP4, $TEMP4
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP5, $TEMP5
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC4, $ACC4
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC5, $ACC5
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC6, $ACC6
+	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
+	vpaddq		$TEMP3, $ACC7, $ACC7
+	vpaddq		$TEMP4, $ACC8, $ACC8
+     
+	vpsrlq		\$29, $ACC4, $TEMP1
+	vpand		$AND_MASK, $ACC4, $ACC4
+	vpsrlq		\$29, $ACC5, $TEMP2
+	vpand		$AND_MASK, $ACC5, $ACC5
+	vpsrlq		\$29, $ACC6, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC6, $ACC6
+	vpsrlq		\$29, $ACC7, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC7, $ACC7
+	vpsrlq		\$29, $ACC8, $TEMP5
+	vpermq		\$0x93, $TEMP3, $TEMP3
+	vpand		$AND_MASK, $ACC8, $ACC8
+	vpermq		\$0x93, $TEMP4, $TEMP4
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP5, $TEMP5
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC4, $ACC4
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC5, $ACC5
+	vmovdqu		$ACC4, 32*4-128($rp)
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC6, $ACC6
+	vmovdqu		$ACC5, 32*5-128($rp)
+	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
+	vpaddq		$TEMP3, $ACC7, $ACC7
+	vmovdqu		$ACC6, 32*6-128($rp)
+	vpaddq		$TEMP4, $ACC8, $ACC8
+	vmovdqu		$ACC7, 32*7-128($rp)
+	vmovdqu		$ACC8, 32*8-128($rp)
+
+	mov	$rp, $ap
+	dec	$rep
+	jne	.LOOP_GRANDE_SQR_1024
+
+	vzeroall
+	mov	%rbp, %rax
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xc8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lsqr_1024_epilogue:
+	ret
+.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}
+
+{ # void AMM_WW(
+my $rp="%rdi";	# BN_ULONG *rp,
+my $ap="%rsi";	# const BN_ULONG *ap,
+my $bp="%rdx";	# const BN_ULONG *bp,
+my $np="%rcx";	# const BN_ULONG *np,
+my $n0="%r8d";	# unsigned int n0);
+
+# The registers that hold the accumulated redundant result.
+# The AMM works on 1024-bit operands, and the redundant word size is 29 bits.
+# Therefore: ceil(1024/29)/4 = 36/4 = 9 registers.
+my $ACC0="%ymm0";
+my $ACC1="%ymm1";
+my $ACC2="%ymm2";
+my $ACC3="%ymm3";
+my $ACC4="%ymm4";
+my $ACC5="%ymm5";
+my $ACC6="%ymm6";
+my $ACC7="%ymm7";
+my $ACC8="%ymm8";
+my $ACC9="%ymm9";
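+
+# Sanity check of that digit count, as a tiny, hedged C calculation
+# (illustration only, not part of this module):
+#
+#	#include <stdio.h>
+#	int main(void) {
+#		const int operand_bits = 1024, digit_bits = 29, per_ymm = 4;
+#		int digits = (operand_bits + digit_bits - 1) / digit_bits;	/* 36 */
+#		/* prints "36 digits -> 9 ymm accumulators" */
+#		printf("%d digits -> %d ymm accumulators\n",
+#		       digits, (digits + per_ymm - 1) / per_ymm);
+#		return 0;
+#	}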
+
+# Registers that hold the currently used broadcast words of the multiplier
+my $Bi="%ymm10";
+my $Yi="%ymm11";
+
+# Helper registers
+my $TEMP0=$ACC0;
+my $TEMP1="%ymm12";
+my $TEMP2="%ymm13";
+my $ZERO="%ymm14";
+my $AND_MASK="%ymm15";
+
+# ALU registers that hold the first words of the ACC
+my $r0="%r9";
+my $r1="%r10";
+my $r2="%r11";
+my $r3="%r12";
+
+my $i="%r14d";
+my $tmp="%r15";
+
+$bp="%r13";	# reassigned argument
+
+$code.=<<___;
+.globl	rsaz_1024_mul_avx2
+.type	rsaz_1024_mul_avx2,\@function,5
+.align	64
+rsaz_1024_mul_avx2:
+	lea	(%rsp), %rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	vzeroupper
+	lea	-0xa8(%rsp),%rsp
+	vmovaps	%xmm6,-0xd8(%rax)
+	vmovaps	%xmm7,-0xc8(%rax)
+	vmovaps	%xmm8,-0xb8(%rax)
+	vmovaps	%xmm9,-0xa8(%rax)
+	vmovaps	%xmm10,-0x98(%rax)
+	vmovaps	%xmm11,-0x88(%rax)
+	vmovaps	%xmm12,-0x78(%rax)
+	vmovaps	%xmm13,-0x68(%rax)
+	vmovaps	%xmm14,-0x58(%rax)
+	vmovaps	%xmm15,-0x48(%rax)
+.Lmul_1024_body:
+___
+$code.=<<___;
+	mov	%rax,%rbp
+	vzeroall
+	mov	%rdx, $bp	# reassigned argument
+	sub	\$64,%rsp
+
+	# An unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $ap does
+	# cross a page boundary, swap it with $bp [meaning that the
+	# caller is advised to lay $ap and $bp down next to each
+	# other, so that only one of them can cross a page boundary].
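+	# (For reference, a hedged C equivalent of the check below, where
+	# 32*10 bytes is the byte footprint of one operand in redundant form:
+	#	int crosses = ((((uintptr_t)ap & 4095) + 32*10) >> 12) != 0;
+	# the swap itself is done by the two cmovnz below, which reuse the
+	# flags set by shr, since mov does not modify flags.)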
+	.byte	0x67,0x67
+	mov	$ap, $tmp
+	and	\$4095, $tmp
+	add	\$32*10, $tmp
+	shr	\$12, $tmp
+	mov	$ap, $tmp
+	cmovnz	$bp, $ap
+	cmovnz	$tmp, $bp
+
+	mov	$np, $tmp
+	sub	\$-128,$ap	# size optimization
+	sub	\$-128,$np
+	sub	\$-128,$rp
+
+	and	\$4095, $tmp	# see if $np crosses page
+	add	\$32*10, $tmp
+	.byte	0x67,0x67
+	shr	\$12, $tmp
+	jz	.Lmul_1024_no_n_copy
+
+	# An unaligned 256-bit load that crosses a page boundary can
+	# cause severe performance degradation here, so if $np does
+	# cross a page boundary, copy it to the stack and make sure the
+	# stack frame doesn't...
+	sub		\$32*10,%rsp
+	vmovdqu		32*0-128($np), $ACC0
+	and		\$-512, %rsp
+	vmovdqu		32*1-128($np), $ACC1
+	vmovdqu		32*2-128($np), $ACC2
+	vmovdqu		32*3-128($np), $ACC3
+	vmovdqu		32*4-128($np), $ACC4
+	vmovdqu		32*5-128($np), $ACC5
+	vmovdqu		32*6-128($np), $ACC6
+	vmovdqu		32*7-128($np), $ACC7
+	vmovdqu		32*8-128($np), $ACC8
+	lea		64+128(%rsp),$np
+	vmovdqu		$ACC0, 32*0-128($np)
+	vpxor		$ACC0, $ACC0, $ACC0
+	vmovdqu		$ACC1, 32*1-128($np)
+	vpxor		$ACC1, $ACC1, $ACC1
+	vmovdqu		$ACC2, 32*2-128($np)
+	vpxor		$ACC2, $ACC2, $ACC2
+	vmovdqu		$ACC3, 32*3-128($np)
+	vpxor		$ACC3, $ACC3, $ACC3
+	vmovdqu		$ACC4, 32*4-128($np)
+	vpxor		$ACC4, $ACC4, $ACC4
+	vmovdqu		$ACC5, 32*5-128($np)
+	vpxor		$ACC5, $ACC5, $ACC5
+	vmovdqu		$ACC6, 32*6-128($np)
+	vpxor		$ACC6, $ACC6, $ACC6
+	vmovdqu		$ACC7, 32*7-128($np)
+	vpxor		$ACC7, $ACC7, $ACC7
+	vmovdqu		$ACC8, 32*8-128($np)
+	vmovdqa		$ACC0, $ACC8
+	vmovdqu		$ACC9, 32*9-128($np)	# $ACC9 is zero after vzeroall
+.Lmul_1024_no_n_copy:
+	and	\$-64,%rsp
+
+	mov	($bp), %rbx
+	vpbroadcastq ($bp), $Bi
+	vmovdqu	$ACC0, (%rsp)			# clear top of stack
+	xor	$r0, $r0
+	.byte	0x67
+	xor	$r1, $r1
+	xor	$r2, $r2
+	xor	$r3, $r3
+
+	vmovdqu	.Land_mask(%rip), $AND_MASK
+	mov	\$9, $i
+	jmp	.Loop_mul_1024
+
+.align	32
+.Loop_mul_1024:
+	 vpsrlq		\$29, $ACC3, $ACC9		# correct $ACC3(*)
+	mov	%rbx, %rax
+	imulq	-128($ap), %rax
+	add	$r0, %rax
+	mov	%rbx, $r1
+	imulq	8-128($ap), $r1
+	add	8(%rsp), $r1
+
+	mov	%rax, $r0
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+
+	 mov	%rbx, $r2
+	 imulq	16-128($ap), $r2
+	 add	16(%rsp), $r2
+
+	 mov	%rbx, $r3
+	 imulq	24-128($ap), $r3
+	 add	24(%rsp), $r3
+	vpmuludq	32*1-128($ap),$Bi,$TEMP0
+	 vmovd		%eax, $Yi
+	vpaddq		$TEMP0,$ACC1,$ACC1
+	vpmuludq	32*2-128($ap),$Bi,$TEMP1
+	 vpbroadcastq	$Yi, $Yi
+	vpaddq		$TEMP1,$ACC2,$ACC2
+	vpmuludq	32*3-128($ap),$Bi,$TEMP2
+	 vpand		$AND_MASK, $ACC3, $ACC3		# correct $ACC3
+	vpaddq		$TEMP2,$ACC3,$ACC3
+	vpmuludq	32*4-128($ap),$Bi,$TEMP0
+	vpaddq		$TEMP0,$ACC4,$ACC4
+	vpmuludq	32*5-128($ap),$Bi,$TEMP1
+	vpaddq		$TEMP1,$ACC5,$ACC5
+	vpmuludq	32*6-128($ap),$Bi,$TEMP2
+	vpaddq		$TEMP2,$ACC6,$ACC6
+	vpmuludq	32*7-128($ap),$Bi,$TEMP0
+	 vpermq		\$0x93, $ACC9, $ACC9		# correct $ACC3
+	vpaddq		$TEMP0,$ACC7,$ACC7
+	vpmuludq	32*8-128($ap),$Bi,$TEMP1
+	 vpbroadcastq	8($bp), $Bi
+	vpaddq		$TEMP1,$ACC8,$ACC8
+
+	mov	%rax,%rdx
+	imulq	-128($np),%rax
+	add	%rax,$r0
+	mov	%rdx,%rax
+	imulq	8-128($np),%rax
+	add	%rax,$r1
+	mov	%rdx,%rax
+	imulq	16-128($np),%rax
+	add	%rax,$r2
+	shr	\$29, $r0
+	imulq	24-128($np),%rdx
+	add	%rdx,$r3
+	add	$r0, $r1
+
+	vpmuludq	32*1-128($np),$Yi,$TEMP2
+	 vmovq		$Bi, %rbx
+	vpaddq		$TEMP2,$ACC1,$ACC1
+	vpmuludq	32*2-128($np),$Yi,$TEMP0
+	vpaddq		$TEMP0,$ACC2,$ACC2
+	vpmuludq	32*3-128($np),$Yi,$TEMP1
+	vpaddq		$TEMP1,$ACC3,$ACC3
+	vpmuludq	32*4-128($np),$Yi,$TEMP2
+	vpaddq		$TEMP2,$ACC4,$ACC4
+	vpmuludq	32*5-128($np),$Yi,$TEMP0
+	vpaddq		$TEMP0,$ACC5,$ACC5
+	vpmuludq	32*6-128($np),$Yi,$TEMP1
+	vpaddq		$TEMP1,$ACC6,$ACC6
+	vpmuludq	32*7-128($np),$Yi,$TEMP2
+	 vpblendd	\$3, $ZERO, $ACC9, $ACC9	# correct $ACC3
+	vpaddq		$TEMP2,$ACC7,$ACC7
+	vpmuludq	32*8-128($np),$Yi,$TEMP0
+	 vpaddq		$ACC9, $ACC3, $ACC3		# correct $ACC3
+	vpaddq		$TEMP0,$ACC8,$ACC8
+
+	mov	%rbx, %rax
+	imulq	-128($ap),%rax
+	add	%rax,$r1
+	 vmovdqu	-8+32*1-128($ap),$TEMP1
+	mov	%rbx, %rax
+	imulq	8-128($ap),%rax
+	add	%rax,$r2
+	 vmovdqu	-8+32*2-128($ap),$TEMP2
+
+	mov	$r1, %rax
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+
+	 imulq	16-128($ap),%rbx
+	 add	%rbx,$r3
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	 vmovd		%eax, $Yi
+	vmovdqu		-8+32*3-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC1,$ACC1
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	 vpbroadcastq	$Yi, $Yi
+	vmovdqu		-8+32*4-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC2,$ACC2
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-8+32*5-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC3,$ACC3
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vmovdqu		-8+32*6-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC4,$ACC4
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vmovdqu		-8+32*7-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC5,$ACC5
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-8+32*8-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC6,$ACC6
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vmovdqu		-8+32*9-128($ap),$ACC9
+	vpaddq		$TEMP1,$ACC7,$ACC7
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vpaddq		$TEMP2,$ACC8,$ACC8
+	vpmuludq	$Bi,$ACC9,$ACC9
+	 vpbroadcastq	16($bp), $Bi
+
+	mov	%rax,%rdx
+	imulq	-128($np),%rax
+	add	%rax,$r1
+	 vmovdqu	-8+32*1-128($np),$TEMP0
+	mov	%rdx,%rax
+	imulq	8-128($np),%rax
+	add	%rax,$r2
+	 vmovdqu	-8+32*2-128($np),$TEMP1
+	shr	\$29, $r1
+	imulq	16-128($np),%rdx
+	add	%rdx,$r3
+	add	$r1, $r2
+
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	 vmovq		$Bi, %rbx
+	vmovdqu		-8+32*3-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC1,$ACC1
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vmovdqu		-8+32*4-128($np),$TEMP0
+	vpaddq		$TEMP1,$ACC2,$ACC2
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-8+32*5-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC3,$ACC3
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-8+32*6-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC4,$ACC4
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vmovdqu		-8+32*7-128($np),$TEMP0
+	vpaddq		$TEMP1,$ACC5,$ACC5
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-8+32*8-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC6,$ACC6
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-8+32*9-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC7,$ACC7
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vpaddq		$TEMP1,$ACC8,$ACC8
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vpaddq		$TEMP2,$ACC9,$ACC9
+
+	 vmovdqu	-16+32*1-128($ap),$TEMP0
+	mov	%rbx,%rax
+	imulq	-128($ap),%rax
+	add	$r2,%rax
+
+	 vmovdqu	-16+32*2-128($ap),$TEMP1
+	mov	%rax,$r2
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+
+	 imulq	8-128($ap),%rbx
+	 add	%rbx,$r3
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	 vmovd		%eax, $Yi
+	vmovdqu		-16+32*3-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC1,$ACC1
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	 vpbroadcastq	$Yi, $Yi
+	vmovdqu		-16+32*4-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC2,$ACC2
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vmovdqu		-16+32*5-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC3,$ACC3
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-16+32*6-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC4,$ACC4
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vmovdqu		-16+32*7-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC5,$ACC5
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vmovdqu		-16+32*8-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC6,$ACC6
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-16+32*9-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC7,$ACC7
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vpaddq		$TEMP1,$ACC8,$ACC8
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	 vpbroadcastq	24($bp), $Bi
+	vpaddq		$TEMP2,$ACC9,$ACC9
+
+	 vmovdqu	-16+32*1-128($np),$TEMP0
+	mov	%rax,%rdx
+	imulq	-128($np),%rax
+	add	%rax,$r2
+	 vmovdqu	-16+32*2-128($np),$TEMP1
+	imulq	8-128($np),%rdx
+	add	%rdx,$r3
+	shr	\$29, $r2
+
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	 vmovq		$Bi, %rbx
+	vmovdqu		-16+32*3-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC1,$ACC1
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vmovdqu		-16+32*4-128($np),$TEMP0
+	vpaddq		$TEMP1,$ACC2,$ACC2
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-16+32*5-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC3,$ACC3
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-16+32*6-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC4,$ACC4
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vmovdqu		-16+32*7-128($np),$TEMP0
+	vpaddq		$TEMP1,$ACC5,$ACC5
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-16+32*8-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC6,$ACC6
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-16+32*9-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC7,$ACC7
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	 vmovdqu	-24+32*1-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC8,$ACC8
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	 vmovdqu	-24+32*2-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC9,$ACC9
+
+	add	$r2, $r3
+	imulq	-128($ap),%rbx
+	add	%rbx,$r3
+
+	mov	$r3, %rax
+	imull	$n0, %eax
+	and	\$0x1fffffff, %eax
+
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	 vmovd		%eax, $Yi
+	vmovdqu		-24+32*3-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC1,$ACC1
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	 vpbroadcastq	$Yi, $Yi
+	vmovdqu		-24+32*4-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC2,$ACC2
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vmovdqu		-24+32*5-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC3,$ACC3
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-24+32*6-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC4,$ACC4
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vmovdqu		-24+32*7-128($ap),$TEMP0
+	vpaddq		$TEMP1,$ACC5,$ACC5
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	vmovdqu		-24+32*8-128($ap),$TEMP1
+	vpaddq		$TEMP2,$ACC6,$ACC6
+	vpmuludq	$Bi,$TEMP0,$TEMP0
+	vmovdqu		-24+32*9-128($ap),$TEMP2
+	vpaddq		$TEMP0,$ACC7,$ACC7
+	vpmuludq	$Bi,$TEMP1,$TEMP1
+	vpaddq		$TEMP1,$ACC8,$ACC8
+	vpmuludq	$Bi,$TEMP2,$TEMP2
+	 vpbroadcastq	32($bp), $Bi
+	vpaddq		$TEMP2,$ACC9,$ACC9
+	 add		\$32, $bp			# $bp++
+
+	vmovdqu		-24+32*1-128($np),$TEMP0
+	imulq	-128($np),%rax
+	add	%rax,$r3
+	shr	\$29, $r3
+
+	vmovdqu		-24+32*2-128($np),$TEMP1
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	 vmovq		$Bi, %rbx
+	vmovdqu		-24+32*3-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC1,$ACC0		# $ACC0==$TEMP0
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	 vmovdqu	$ACC0, (%rsp)			# transfer $r0-$r3
+	vpaddq		$TEMP1,$ACC2,$ACC1
+	vmovdqu		-24+32*4-128($np),$TEMP0
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-24+32*5-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC3,$ACC2
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-24+32*6-128($np),$TEMP2
+	vpaddq		$TEMP0,$ACC4,$ACC3
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	vmovdqu		-24+32*7-128($np),$TEMP0
+	vpaddq		$TEMP1,$ACC5,$ACC4
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	vmovdqu		-24+32*8-128($np),$TEMP1
+	vpaddq		$TEMP2,$ACC6,$ACC5
+	vpmuludq	$Yi,$TEMP0,$TEMP0
+	vmovdqu		-24+32*9-128($np),$TEMP2
+	 mov	$r3, $r0
+	vpaddq		$TEMP0,$ACC7,$ACC6
+	vpmuludq	$Yi,$TEMP1,$TEMP1
+	 add	(%rsp), $r0
+	vpaddq		$TEMP1,$ACC8,$ACC7
+	vpmuludq	$Yi,$TEMP2,$TEMP2
+	 vmovq	$r3, $TEMP1
+	vpaddq		$TEMP2,$ACC9,$ACC8
+
+	dec	$i
+	jnz	.Loop_mul_1024
+___
+
+# (*)	The original implementation corrected ACC1-ACC3 for overflow
+#	after 7 loop runs, i.e. after 28 iterations, or 56 additions.
+#	But since we underutilize resources, it's possible to correct in
+#	each iteration at a marginal performance cost. And because we
+#	do it in each iteration, we can correct fewer digits, and
+#	avoid the performance penalty completely. Also note that we
+#	correct only three digits out of four; this works because the
+#	most significant digit is subject to fewer additions.
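+#
+# A hedged back-of-the-envelope check of the "56 additions" figure above
+# (plain C, illustration only): every partial product of two 29-bit
+# digits is below 2^58, and a 64-bit lane can absorb
+#
+#	(2^64 - 1) / (2^29 - 1)^2 = 64
+#
+# such products before overflowing, e.g.:
+#
+#	uint64_t d = 0x1fffffff;			/* 2^29 - 1 */
+#	uint64_t p = d * d;				/* < 2^58   */
+#	printf("%llu\n", (unsigned long long)(UINT64_MAX / p));	/* prints 64 */
+#
+# so 56 accumulations per digit indeed leave headroom.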
+
+$TEMP0 = $ACC9;
+$TEMP3 = $Bi;
+$TEMP4 = $Yi;
+$code.=<<___;
+	vpermq		\$0, $AND_MASK, $AND_MASK
+	vpaddq		(%rsp), $TEMP1, $ACC0
+
+	vpsrlq		\$29, $ACC0, $TEMP1
+	vpand		$AND_MASK, $ACC0, $ACC0
+	vpsrlq		\$29, $ACC1, $TEMP2
+	vpand		$AND_MASK, $ACC1, $ACC1
+	vpsrlq		\$29, $ACC2, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC2, $ACC2
+	vpsrlq		\$29, $ACC3, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC3, $ACC3
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP3, $TEMP3
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpermq		\$0x93, $TEMP4, $TEMP4
+	vpaddq		$TEMP0, $ACC0, $ACC0
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
+	vpaddq		$TEMP3, $ACC3, $ACC3
+	vpaddq		$TEMP4, $ACC4, $ACC4
+
+	vpsrlq		\$29, $ACC0, $TEMP1
+	vpand		$AND_MASK, $ACC0, $ACC0
+	vpsrlq		\$29, $ACC1, $TEMP2
+	vpand		$AND_MASK, $ACC1, $ACC1
+	vpsrlq		\$29, $ACC2, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC2, $ACC2
+	vpsrlq		\$29, $ACC3, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC3, $ACC3
+	vpermq		\$0x93, $TEMP3, $TEMP3
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP4, $TEMP4
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC0, $ACC0
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC1, $ACC1
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC2, $ACC2
+	vpblendd	\$3, $TEMP4, $ZERO, $TEMP4
+	vpaddq		$TEMP3, $ACC3, $ACC3
+	vpaddq		$TEMP4, $ACC4, $ACC4
+
+	vmovdqu		$ACC0, 0-128($rp)
+	vmovdqu		$ACC1, 32-128($rp)
+	vmovdqu		$ACC2, 64-128($rp)
+	vmovdqu		$ACC3, 96-128($rp)
+___
+
+$TEMP5=$ACC0;
+$code.=<<___;
+	vpsrlq		\$29, $ACC4, $TEMP1
+	vpand		$AND_MASK, $ACC4, $ACC4
+	vpsrlq		\$29, $ACC5, $TEMP2
+	vpand		$AND_MASK, $ACC5, $ACC5
+	vpsrlq		\$29, $ACC6, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC6, $ACC6
+	vpsrlq		\$29, $ACC7, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC7, $ACC7
+	vpsrlq		\$29, $ACC8, $TEMP5
+	vpermq		\$0x93, $TEMP3, $TEMP3
+	vpand		$AND_MASK, $ACC8, $ACC8
+	vpermq		\$0x93, $TEMP4, $TEMP4
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP5, $TEMP5
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC4, $ACC4
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC5, $ACC5
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC6, $ACC6
+	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
+	vpaddq		$TEMP3, $ACC7, $ACC7
+	vpaddq		$TEMP4, $ACC8, $ACC8
+
+	vpsrlq		\$29, $ACC4, $TEMP1
+	vpand		$AND_MASK, $ACC4, $ACC4
+	vpsrlq		\$29, $ACC5, $TEMP2
+	vpand		$AND_MASK, $ACC5, $ACC5
+	vpsrlq		\$29, $ACC6, $TEMP3
+	vpermq		\$0x93, $TEMP1, $TEMP1
+	vpand		$AND_MASK, $ACC6, $ACC6
+	vpsrlq		\$29, $ACC7, $TEMP4
+	vpermq		\$0x93, $TEMP2, $TEMP2
+	vpand		$AND_MASK, $ACC7, $ACC7
+	vpsrlq		\$29, $ACC8, $TEMP5
+	vpermq		\$0x93, $TEMP3, $TEMP3
+	vpand		$AND_MASK, $ACC8, $ACC8
+	vpermq		\$0x93, $TEMP4, $TEMP4
+
+	vpblendd	\$3, $ZERO, $TEMP1, $TEMP0
+	vpermq		\$0x93, $TEMP5, $TEMP5
+	vpblendd	\$3, $TEMP1, $TEMP2, $TEMP1
+	vpaddq		$TEMP0, $ACC4, $ACC4
+	vpblendd	\$3, $TEMP2, $TEMP3, $TEMP2
+	vpaddq		$TEMP1, $ACC5, $ACC5
+	vpblendd	\$3, $TEMP3, $TEMP4, $TEMP3
+	vpaddq		$TEMP2, $ACC6, $ACC6
+	vpblendd	\$3, $TEMP4, $TEMP5, $TEMP4
+	vpaddq		$TEMP3, $ACC7, $ACC7
+	vpaddq		$TEMP4, $ACC8, $ACC8
+
+	vmovdqu		$ACC4, 128-128($rp)
+	vmovdqu		$ACC5, 160-128($rp)    
+	vmovdqu		$ACC6, 192-128($rp)
+	vmovdqu		$ACC7, 224-128($rp)
+	vmovdqu		$ACC8, 256-128($rp)
+	vzeroupper
+
+	mov	%rbp, %rax
+___
+$code.=<<___ if ($win64);
+	movaps	-0xd8(%rax),%xmm6
+	movaps	-0xc8(%rax),%xmm7
+	movaps	-0xb8(%rax),%xmm8
+	movaps	-0xa8(%rax),%xmm9
+	movaps	-0x98(%rax),%xmm10
+	movaps	-0x88(%rax),%xmm11
+	movaps	-0x78(%rax),%xmm12
+	movaps	-0x68(%rax),%xmm13
+	movaps	-0x58(%rax),%xmm14
+	movaps	-0x48(%rax),%xmm15
+___
+$code.=<<___;
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	lea	(%rax),%rsp		# restore %rsp
+.Lmul_1024_epilogue:
+	ret
+.size	rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
+___
+}
+{
+my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
+my @T = map("%r$_",(8..11));
+
+$code.=<<___;
+.globl	rsaz_1024_red2norm_avx2
+.type	rsaz_1024_red2norm_avx2,\@abi-omnipotent
+.align	32
+rsaz_1024_red2norm_avx2:
+	sub	\$-128,$inp	# size optimization
+	xor	%rax,%rax
+___
+
+for ($j=0,$i=0; $i<16; $i++) {
+    my $k=0;
+    while (29*$j<64*($i+1)) {	# load data till boundary
+	$code.="	mov	`8*$j-128`($inp), @T[0]\n";
+	$j++; $k++; push(@T,shift(@T));
+    }
+    $l=$k;
+    while ($k>1) {		# shift all loaded values but the last
+	$code.="	shl	\$`29*($j-$k)`,@T[-$k]\n";
+	$k--;
+    }
+    $code.=<<___;		# shift last value
+	mov	@T[-1], @T[0]
+	shl	\$`29*($j-1)`, @T[-1]
+	shr	\$`-29*($j-1)`, @T[0]
+___
+    while ($l) {		# accumulate all values
+	$code.="	add	@T[-$l], %rax\n";
+	$l--;
+    }
+	$code.=<<___;
+	adc	\$0, @T[0]	# absorb the carry, if any
+	mov	%rax, 8*$i($out)
+	mov	@T[0], %rax
+___
+    push(@T,shift(@T));
+}
+$code.=<<___;
+	ret
+.size	rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
+
+.globl	rsaz_1024_norm2red_avx2
+.type	rsaz_1024_norm2red_avx2,\@abi-omnipotent
+.align	32
+rsaz_1024_norm2red_avx2:
+	sub	\$-128,$out	# size optimization
+	mov	($inp),@T[0]
+	mov	\$0x1fffffff,%eax
+___
+for ($j=0,$i=0; $i<16; $i++) {
+    $code.="	mov	`8*($i+1)`($inp),@T[1]\n"	if ($i<15);
+    $code.="	xor	@T[1],@T[1]\n"			if ($i==15);
+    my $k=1;
+    while (29*($j+1)<64*($i+1)) {
+    	$code.=<<___;
+	mov	@T[0],@T[-$k]
+	shr	\$`29*$j`,@T[-$k]
+	and	%rax,@T[-$k]				# &0x1fffffff
+	mov	@T[-$k],`8*$j-128`($out)
+___
+	$j++; $k++;
+    }
+    $code.=<<___;
+	shrd	\$`29*$j`,@T[1],@T[0]
+	and	%rax,@T[0]
+	mov	@T[0],`8*$j-128`($out)
+___
+    $j++;
+    push(@T,shift(@T));
+}
+$code.=<<___;
+	mov	@T[0],`8*$j-128`($out)			# zero
+	mov	@T[0],`8*($j+1)-128`($out)
+	mov	@T[0],`8*($j+2)-128`($out)
+	mov	@T[0],`8*($j+3)-128`($out)
+	ret
+.size	rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
+___
+}
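+
+# For reference, a hedged C model of the conversion pair generated above
+# (illustration only: it assumes fully reduced 29-bit digits, while the
+# real red2norm also absorbs carries left in the digits, and the real
+# norm2red zero-pads a few trailing words):
+#
+#	#include <stdint.h>
+#	#include <string.h>
+#	#define N64     16			/* 1024 bits as 64-bit words */
+#	#define NDIGITS 36			/* ceil(1024/29)             */
+#
+#	static void norm2red(uint64_t red[NDIGITS], const uint64_t norm[N64]) {
+#		for (int i = 0; i < NDIGITS; i++) {
+#			int bit = 29*i, w = bit/64, s = bit%64;
+#			uint64_t lo = norm[w] >> s;
+#			uint64_t hi = (s > 35 && w+1 < N64) ? norm[w+1] << (64-s) : 0;
+#			red[i] = (lo | hi) & 0x1fffffff;
+#		}
+#	}
+#
+#	static void red2norm(uint64_t norm[N64], const uint64_t red[NDIGITS]) {
+#		memset(norm, 0, N64*sizeof(norm[0]));
+#		for (int i = 0; i < NDIGITS; i++) {
+#			int bit = 29*i, w = bit/64, s = bit%64;
+#			norm[w] |= red[i] << s;
+#			if (s > 35 && w+1 < N64)
+#				norm[w+1] |= red[i] >> (64-s);
+#		}
+#	}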
+{
+my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+
+$code.=<<___;
+.globl	rsaz_1024_scatter5_avx2
+.type	rsaz_1024_scatter5_avx2,\@abi-omnipotent
+.align	32
+rsaz_1024_scatter5_avx2:
+	vzeroupper
+	vmovdqu	.Lscatter_permd(%rip),%ymm5
+	shl	\$4,$power
+	lea	($out,$power),$out
+	mov	\$9,%eax
+	jmp	.Loop_scatter_1024
+
+.align	32
+.Loop_scatter_1024:
+	vmovdqu		($inp),%ymm0
+	lea		32($inp),$inp
+	vpermd		%ymm0,%ymm5,%ymm0
+	vmovdqu		%xmm0,($out)
+	lea		16*32($out),$out
+	dec	%eax
+	jnz	.Loop_scatter_1024
+
+	vzeroupper
+	ret
+.size	rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
+
+.globl	rsaz_1024_gather5_avx2
+.type	rsaz_1024_gather5_avx2,\@abi-omnipotent
+.align	32
+rsaz_1024_gather5_avx2:
+___
+$code.=<<___ if ($win64);
+	lea	-0x88(%rsp),%rax
+	vzeroupper
+.LSEH_begin_rsaz_1024_gather5:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax),%rsp
+	.byte	0xc5,0xf8,0x29,0x70,0xe0	#vmovaps %xmm6,-0x20(%rax)
+	.byte	0xc5,0xf8,0x29,0x78,0xf0	#vmovaps %xmm7,-0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x40,0x00	#vmovaps %xmm8,0(%rax)
+	.byte	0xc5,0x78,0x29,0x48,0x10	#vmovaps %xmm9,0x10(%rax)
+	.byte	0xc5,0x78,0x29,0x50,0x20	#vmovaps %xmm10,0x20(%rax)
+	.byte	0xc5,0x78,0x29,0x58,0x30	#vmovaps %xmm11,0x30(%rax)
+	.byte	0xc5,0x78,0x29,0x60,0x40	#vmovaps %xmm12,0x40(%rax)
+	.byte	0xc5,0x78,0x29,0x68,0x50	#vmovaps %xmm13,0x50(%rax)
+	.byte	0xc5,0x78,0x29,0x70,0x60	#vmovaps %xmm14,0x60(%rax)
+	.byte	0xc5,0x78,0x29,0x78,0x70	#vmovaps %xmm15,0x70(%rax)
+___
+$code.=<<___;
+	lea	.Lgather_table(%rip),%r11
+	mov	$power,%eax
+	and	\$3,$power
+	shr	\$2,%eax			# cache line number
+	shl	\$4,$power			# offset within cache line
+
+	vmovdqu		-32(%r11),%ymm7		# .Lgather_permd
+	vpbroadcastb	8(%r11,%rax), %xmm8
+	vpbroadcastb	7(%r11,%rax), %xmm9
+	vpbroadcastb	6(%r11,%rax), %xmm10
+	vpbroadcastb	5(%r11,%rax), %xmm11
+	vpbroadcastb	4(%r11,%rax), %xmm12
+	vpbroadcastb	3(%r11,%rax), %xmm13
+	vpbroadcastb	2(%r11,%rax), %xmm14
+	vpbroadcastb	1(%r11,%rax), %xmm15
+
+	lea	64($inp,$power),$inp
+	mov	\$64,%r11			# size optimization
+	mov	\$9,%eax
+	jmp	.Loop_gather_1024
+
+.align	32
+.Loop_gather_1024:
+	vpand		-64($inp),		%xmm8,%xmm0
+	vpand		($inp),			%xmm9,%xmm1
+	vpand		64($inp),		%xmm10,%xmm2
+	vpand		($inp,%r11,2),		%xmm11,%xmm3
+	 vpor					%xmm0,%xmm1,%xmm1
+	vpand		64($inp,%r11,2),	%xmm12,%xmm4
+	 vpor					%xmm2,%xmm3,%xmm3
+	vpand		($inp,%r11,4),		%xmm13,%xmm5
+	 vpor					%xmm1,%xmm3,%xmm3
+	vpand		64($inp,%r11,4),	%xmm14,%xmm6
+	 vpor					%xmm4,%xmm5,%xmm5
+	vpand		-128($inp,%r11,8),	%xmm15,%xmm2
+	lea		($inp,%r11,8),$inp
+	 vpor					%xmm3,%xmm5,%xmm5
+	 vpor					%xmm2,%xmm6,%xmm6
+	 vpor					%xmm5,%xmm6,%xmm6
+	vpermd		%ymm6,%ymm7,%ymm6
+	vmovdqu		%ymm6,($out)
+	lea		32($out),$out
+	dec	%eax
+	jnz	.Loop_gather_1024
+
+	vpxor	%ymm0,%ymm0,%ymm0
+	vmovdqu	%ymm0,($out)
+	vzeroupper
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	movaps	0x20(%rsp),%xmm8
+	movaps	0x30(%rsp),%xmm9
+	movaps	0x40(%rsp),%xmm10
+	movaps	0x50(%rsp),%xmm11
+	movaps	0x60(%rsp),%xmm12
+	movaps	0x70(%rsp),%xmm13
+	movaps	0x80(%rsp),%xmm14
+	movaps	0x90(%rsp),%xmm15
+	lea	0xa8(%rsp),%rsp
+.LSEH_end_rsaz_1024_gather5:
+___
+$code.=<<___;
+	ret
+.size	rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
+___
+}
+
+$code.=<<___;
+.extern	OPENSSL_ia32cap_P
+.globl	rsaz_avx2_eligible
+.type	rsaz_avx2_eligible,\@abi-omnipotent
+.align	32
+rsaz_avx2_eligible:
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
+	and	\$`1<<5`,%eax
+	shr	\$5,%eax
+	ret
+.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.align	64
+.Land_mask:
+	.quad	0x1fffffff,0x1fffffff,0x1fffffff,-1
+.Lscatter_permd:
+	.long	0,2,4,6,7,7,7,7
+.Lgather_permd:
+	.long	0,7,1,7,2,7,3,7
+.Lgather_table:
+	.byte	0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
+.align	64
+___
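+
+# rsaz_avx2_eligible above just reads bit 5 (the AVX2 feature flag from
+# CPUID.7.0:EBX) out of the third 32-bit word of OPENSSL_ia32cap_P.  A
+# hedged C equivalent, assuming the usual "array of 32-bit capability
+# words" layout of that symbol:
+#
+#	extern unsigned int OPENSSL_ia32cap_P[4];
+#
+#	int rsaz_avx2_eligible(void) {
+#		return (OPENSSL_ia32cap_P[2] >> 5) & 1;
+#	}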
+
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___
+.extern	__imp_RtlVirtualUnwind
+.type	rsaz_se_handler,\@abi-omnipotent
+.align	16
+rsaz_se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# prologue label
+	cmp	%r10,%rbx		# context->Rip<prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	160($context),%rax	# pull context->Rbp
+
+	mov	-48(%rax),%r15
+	mov	-40(%rax),%r14
+	mov	-32(%rax),%r13
+	mov	-24(%rax),%r12
+	mov	-16(%rax),%rbp
+	mov	-8(%rax),%rbx
+	mov	%r15,240($context)
+	mov	%r14,232($context)
+	mov	%r13,224($context)
+	mov	%r12,216($context)
+	mov	%rbp,160($context)
+	mov	%rbx,144($context)
+
+	lea	-0xd8(%rax),%rsi	# %xmm save area
+	lea	512($context),%rdi	# & context.Xmm6
+	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
+	.long	0xa548f3fc		# cld; rep movsq
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	rsaz_se_handler,.-rsaz_se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
+	.rva	.LSEH_end_rsaz_1024_sqr_avx2
+	.rva	.LSEH_info_rsaz_1024_sqr_avx2
+
+	.rva	.LSEH_begin_rsaz_1024_mul_avx2
+	.rva	.LSEH_end_rsaz_1024_mul_avx2
+	.rva	.LSEH_info_rsaz_1024_mul_avx2
+
+	.rva	.LSEH_begin_rsaz_1024_gather5
+	.rva	.LSEH_end_rsaz_1024_gather5
+	.rva	.LSEH_info_rsaz_1024_gather5
+.section	.xdata
+.align	8
+.LSEH_info_rsaz_1024_sqr_avx2:
+	.byte	9,0,0,0
+	.rva	rsaz_se_handler
+	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
+.LSEH_info_rsaz_1024_mul_avx2:
+	.byte	9,0,0,0
+	.rva	rsaz_se_handler
+	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
+.LSEH_info_rsaz_1024_gather5:
+	.byte	0x01,0x33,0x16,0x00
+	.byte	0x36,0xf8,0x09,0x00	#vmovaps 0x90(rsp),xmm15
+	.byte	0x31,0xe8,0x08,0x00	#vmovaps 0x80(rsp),xmm14
+	.byte	0x2c,0xd8,0x07,0x00	#vmovaps 0x70(rsp),xmm13
+	.byte	0x27,0xc8,0x06,0x00	#vmovaps 0x60(rsp),xmm12
+	.byte	0x22,0xb8,0x05,0x00	#vmovaps 0x50(rsp),xmm11
+	.byte	0x1d,0xa8,0x04,0x00	#vmovaps 0x40(rsp),xmm10
+	.byte	0x18,0x98,0x03,0x00	#vmovaps 0x30(rsp),xmm9
+	.byte	0x13,0x88,0x02,0x00	#vmovaps 0x20(rsp),xmm8
+	.byte	0x0e,0x78,0x01,0x00	#vmovaps 0x10(rsp),xmm7
+	.byte	0x09,0x68,0x00,0x00	#vmovaps 0x00(rsp),xmm6
+	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
+___
+}
+
+foreach (split("\n",$code)) {
+	s/\`([^\`]*)\`/eval($1)/ge;
+
+	s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge		or
+
+	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
+	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
+	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
+	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
+	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
+	print $_,"\n";
+}
+
+}}} else {{{
+print <<___;	# assembler is too old
+.text
+
+.globl	rsaz_avx2_eligible
+.type	rsaz_avx2_eligible,\@abi-omnipotent
+rsaz_avx2_eligible:
+	xor	%eax,%eax
+	ret
+.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
+
+.globl	rsaz_1024_sqr_avx2
+.globl	rsaz_1024_mul_avx2
+.globl	rsaz_1024_norm2red_avx2
+.globl	rsaz_1024_red2norm_avx2
+.globl	rsaz_1024_scatter5_avx2
+.globl	rsaz_1024_gather5_avx2
+.type	rsaz_1024_sqr_avx2,\@abi-omnipotent
+rsaz_1024_sqr_avx2:
+rsaz_1024_mul_avx2:
+rsaz_1024_norm2red_avx2:
+rsaz_1024_red2norm_avx2:
+rsaz_1024_scatter5_avx2:
+rsaz_1024_gather5_avx2:
+	.byte	0x0f,0x0b	# ud2
+	ret
+.size	rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
+___
+}}}
+
+close STDOUT;
diff --git a/crypto/bn/asm/rsaz-x86_64.pl b/crypto/bn/asm/rsaz-x86_64.pl
new file mode 100644
index 0000000..120b473
--- /dev/null
+++ b/crypto/bn/asm/rsaz-x86_64.pl
@@ -0,0 +1,2139 @@
+#!/usr/bin/env perl
+
+##############################################################################
+#                                                                            #
+#  Copyright (c) 2012, Intel Corporation                                     #
+#                                                                            #
+#  All rights reserved.                                                      #
+#                                                                            #
+#  Redistribution and use in source and binary forms, with or without        #
+#  modification, are permitted provided that the following conditions are    #
+#  met:                                                                      #
+#                                                                            #
+#  *  Redistributions of source code must retain the above copyright         #
+#     notice, this list of conditions and the following disclaimer.          #
+#                                                                            #
+#  *  Redistributions in binary form must reproduce the above copyright      #
+#     notice, this list of conditions and the following disclaimer in the    #
+#     documentation and/or other materials provided with the                 #
+#     distribution.                                                          #
+#                                                                            #
+#  *  Neither the name of the Intel Corporation nor the names of its         #
+#     contributors may be used to endorse or promote products derived from   #
+#     this software without specific prior written permission.               #
+#                                                                            #
+#                                                                            #
+#  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          #
+#  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         #
+#  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        #
+#  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            #
+#  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     #
+#  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       #
+#  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        #
+#  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    #
+#  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      #
+#  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        #
+#  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              #
+#                                                                            #
+##############################################################################
+# Developers and authors:                                                    #
+# Shay Gueron (1, 2), and Vlad Krasnov (1)                                   #
+# (1) Intel Architecture Group, Microprocessor and Chipset Development,      #
+#     Israel Development Center, Haifa, Israel                               #
+# (2) University of Haifa                                                    #
+##############################################################################
+# Reference:                                                                 #
+# [1] S. Gueron, "Efficient Software Implementations of Modular              #
+#     Exponentiation", http://eprint.iacr.org/2011/239                       #
+# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".             #
+#     IEEE Proceedings of 9th International Conference on Information        #
+#     Technology: New Generations (ITNG 2012), 821-823 (2012).               #
+# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
+#     Journal of Cryptographic Engineering 2:31-43 (2012).                   #
+# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis    #
+#     resistant 512-bit and 1024-bit modular exponentiation for optimizing   #
+#     RSA1024 and RSA2048 on x86_64 platforms",                              #
+#     http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
+##############################################################################
+
+# While the original submission covers both 512- and 1024-bit exponentiation,
+# this module is limited to the 512-bit version only (and as such
+# accelerates RSA1024 signing). This is because the improvement for longer
+# keys is not high enough to justify the effort; the highest measured
+# was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
+# at the time of this writing!] Nor does this module implement a
+# "monolithic" complete-exponentiation jumbo-subroutine; it adheres
+# to a more modular mixture of C and assembly. And it's optimized even
+# for processors other than the Intel Core family (see the table below
+# for improvement coefficients).
+# 						<appro@openssl.org>
+#
+# RSA1024 sign/sec	this/original	|this/rsax(*)	this/fips(*)
+#			----------------+---------------------------
+# Opteron		+13%		|+5%		+20%
+# Bulldozer		-0%		|-1%		+10%
+# P4			+11%		|+7%		+8%
+# Westmere		+5%		|+14%		+17%
+# Sandy Bridge		+2%		|+12%		+29%
+# Ivy Bridge		+1%		|+11%		+35%
+# Haswell(**)		-0%		|+12%		+39%
+# Atom			+13%		|+11%		+4%
+# VIA Nano		+70%		|+9%		+25%
+#
+# (*)	rsax engine and fips numbers are presented for reference
+#	purposes;
+# (**)	MULX was attempted, but found to give only marginal improvement;
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| $^X $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$addx = ($1>=11);
+}
+
+($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp");	# common internal API
+{
+my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
+
+$code.=<<___;
+.text
+
+.extern	OPENSSL_ia32cap_P
+
+.globl	rsaz_512_sqr
+.type	rsaz_512_sqr,\@function,5
+.align	32
+rsaz_512_sqr:				# 25-29% faster than rsaz_512_mul
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	subq	\$128+24, %rsp
+.Lsqr_body:
+	movq	$mod, %rbp		# common argument
+	movq	($inp), %rdx
+	movq	8($inp), %rax
+	movq	$n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+	movl	\$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+	je	.Loop_sqrx
+___
+$code.=<<___;
+	jmp	.Loop_sqr
+
+.align	32
+.Loop_sqr:
+	movl	$times,128+8(%rsp)
+#first iteration
+	movq	%rdx, %rbx
+	mulq	%rdx
+	movq	%rax, %r8
+	movq	16($inp), %rax
+	movq	%rdx, %r9
+
+	mulq	%rbx
+	addq	%rax, %r9
+	movq	24($inp), %rax
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	addq	%rax, %r10
+	movq	32($inp), %rax
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	mulq	%rbx
+	addq	%rax, %r11
+	movq	40($inp), %rax
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r12
+	movq	48($inp), %rax
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	addq	%rax, %r13
+	movq	56($inp), %rax
+	movq	%rdx, %r14
+	adcq	\$0, %r14
+
+	mulq	%rbx
+	addq	%rax, %r14
+	movq	%rbx, %rax
+	movq	%rdx, %r15
+	adcq	\$0, %r15
+
+	addq	%r8, %r8		#shlq	\$1, %r8
+	movq	%r9, %rcx
+	adcq	%r9, %r9		#shld	\$1, %r8, %r9
+
+	mulq	%rax
+	movq	%rax, (%rsp)
+	addq	%rdx, %r8
+	adcq	\$0, %r9
+
+	movq	%r8, 8(%rsp)
+	shrq	\$63, %rcx
+
+#second iteration
+	movq	8($inp), %r8
+	movq	16($inp), %rax
+	mulq	%r8
+	addq	%rax, %r10
+	movq	24($inp), %rax
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r8
+	addq	%rax, %r11
+	movq	32($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r11
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r8
+	addq	%rax, %r12
+	movq	40($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r12
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r8
+	addq	%rax, %r13
+	movq	48($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r13
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r8
+	addq	%rax, %r14
+	movq	56($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r14
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r8
+	addq	%rax, %r15
+	movq	%r8, %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r15
+	movq	%rdx, %r8
+	movq	%r10, %rdx
+	adcq	\$0, %r8
+
+	add	%rdx, %rdx
+	lea	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
+	movq	%r11, %rbx
+	adcq	%r11, %r11		#shld	\$1, %r10, %r11
+
+	mulq	%rax
+	addq	%rax, %r9
+	adcq	%rdx, %r10
+	adcq	\$0, %r11
+
+	movq	%r9, 16(%rsp)
+	movq	%r10, 24(%rsp)
+	shrq	\$63, %rbx
+	
+#third iteration
+	movq	16($inp), %r9	
+	movq	24($inp), %rax
+	mulq	%r9
+	addq	%rax, %r12
+	movq	32($inp), %rax
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r9
+	addq	%rax, %r13
+	movq	40($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rcx, %r13
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r9
+	addq	%rax, %r14
+	movq	48($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rcx, %r14
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r9
+	 movq	%r12, %r10
+	 lea	(%rbx,%r12,2), %r12	#shld	\$1, %rbx, %r12
+	addq	%rax, %r15
+	movq	56($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rcx, %r15
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r9
+	 shrq	\$63, %r10
+	addq	%rax, %r8
+	movq	%r9, %rax
+	adcq	\$0, %rdx
+	addq	%rcx, %r8
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	movq	%r13, %rcx
+	leaq	(%r10,%r13,2), %r13	#shld	\$1, %r12, %r13
+
+	mulq	%rax
+	addq	%rax, %r11
+	adcq	%rdx, %r12
+	adcq	\$0, %r13
+
+	movq	%r11, 32(%rsp)
+	movq	%r12, 40(%rsp)
+	shrq	\$63, %rcx
+
+#fourth iteration
+	movq	24($inp), %r10
+	movq	32($inp), %rax
+	mulq	%r10
+	addq	%rax, %r14
+	movq	40($inp), %rax
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r10
+	addq	%rax, %r15
+	movq	48($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r15
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r10
+	 movq	%r14, %r12
+	 leaq	(%rcx,%r14,2), %r14	#shld	\$1, %rcx, %r14
+	addq	%rax, %r8
+	movq	56($inp), %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r8
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r10
+	 shrq	\$63, %r12
+	addq	%rax, %r9
+	movq	%r10, %rax
+	adcq	\$0, %rdx
+	addq	%rbx, %r9
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	movq	%r15, %rbx
+	leaq	(%r12,%r15,2),%r15	#shld	\$1, %r14, %r15
+
+	mulq	%rax
+	addq	%rax, %r13
+	adcq	%rdx, %r14
+	adcq	\$0, %r15
+
+	movq	%r13, 48(%rsp)
+	movq	%r14, 56(%rsp)
+	shrq	\$63, %rbx
+
+#fifth iteration
+	movq	32($inp), %r11
+	movq	40($inp), %rax
+	mulq	%r11
+	addq	%rax, %r8
+	movq	48($inp), %rax
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r11
+	addq	%rax, %r9
+	movq	56($inp), %rax
+	adcq	\$0, %rdx
+	 movq	%r8, %r12
+	 leaq	(%rbx,%r8,2), %r8	#shld	\$1, %rbx, %r8
+	addq	%rcx, %r9
+	movq	%rdx, %rcx
+	adcq	\$0, %rcx
+
+	mulq	%r11
+	 shrq	\$63, %r12
+	addq	%rax, %r10
+	movq	%r11, %rax
+	adcq	\$0, %rdx
+	addq	%rcx, %r10
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	movq	%r9, %rcx
+	leaq	(%r12,%r9,2), %r9	#shld	\$1, %r8, %r9
+
+	mulq	%rax
+	addq	%rax, %r15
+	adcq	%rdx, %r8
+	adcq	\$0, %r9
+
+	movq	%r15, 64(%rsp)
+	movq	%r8, 72(%rsp)
+	shrq	\$63, %rcx
+
+#sixth iteration
+	movq	40($inp), %r12
+	movq	48($inp), %rax
+	mulq	%r12
+	addq	%rax, %r10
+	movq	56($inp), %rax
+	movq	%rdx, %rbx
+	adcq	\$0, %rbx
+
+	mulq	%r12
+	addq	%rax, %r11
+	movq	%r12, %rax
+	 movq	%r10, %r15
+	 leaq	(%rcx,%r10,2), %r10	#shld	\$1, %rcx, %r10
+	adcq	\$0, %rdx
+	 shrq	\$63, %r15
+	addq	%rbx, %r11
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	movq	%r11, %rbx
+	leaq	(%r15,%r11,2), %r11	#shld	\$1, %r10, %r11
+
+	mulq	%rax
+	addq	%rax, %r9
+	adcq	%rdx, %r10
+	adcq	\$0, %r11
+
+	movq	%r9, 80(%rsp)
+	movq	%r10, 88(%rsp)
+
+#seventh iteration
+	movq	48($inp), %r13
+	movq	56($inp), %rax
+	mulq	%r13
+	addq	%rax, %r12
+	movq	%r13, %rax
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	xorq	%r14, %r14
+	shlq	\$1, %rbx
+	adcq	%r12, %r12		#shld	\$1, %rbx, %r12
+	adcq	%r13, %r13		#shld	\$1, %r12, %r13
+	adcq	%r14, %r14		#shld	\$1, %r13, %r14
+
+	mulq	%rax
+	addq	%rax, %r11
+	adcq	%rdx, %r12
+	adcq	\$0, %r13
+
+	movq	%r11, 96(%rsp)
+	movq	%r12, 104(%rsp)
+
+#eighth iteration
+	movq	56($inp), %rax
+	mulq	%rax
+	addq	%rax, %r13
+	adcq	\$0, %rdx
+
+	addq	%rdx, %r14
+
+	movq	%r13, 112(%rsp)
+	movq	%r14, 120(%rsp)
+
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reduce
+
+	addq	64(%rsp), %r8
+	adcq	72(%rsp), %r9
+	adcq	80(%rsp), %r10
+	adcq	88(%rsp), %r11
+	adcq	96(%rsp), %r12
+	adcq	104(%rsp), %r13
+	adcq	112(%rsp), %r14
+	adcq	120(%rsp), %r15
+	sbbq	%rcx, %rcx
+
+	call	__rsaz_512_subtract
+
+	movq	%r8, %rdx
+	movq	%r9, %rax
+	movl	128+8(%rsp), $times
+	movq	$out, $inp
+
+	decl	$times
+	jnz	.Loop_sqr
+___
+if ($addx) {
+$code.=<<___;
+	jmp	.Lsqr_tail
+
+.align	32
+.Loop_sqrx:
+	movl	$times,128+8(%rsp)
+	movq	$out, %xmm0		# off-load
+	movq	%rbp, %xmm1		# off-load
+#first iteration	
+	mulx	%rax, %r8, %r9
+
+	mulx	16($inp), %rcx, %r10
+	xor	%rbp, %rbp		# cf=0, of=0
+
+	mulx	24($inp), %rax, %r11
+	adcx	%rcx, %r9
+
+	mulx	32($inp), %rcx, %r12
+	adcx	%rax, %r10
+
+	mulx	40($inp), %rax, %r13
+	adcx	%rcx, %r11
+
+	.byte	0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($inp), %rcx, %r14
+	adcx	%rax, %r12
+	adcx	%rcx, %r13
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r15
+	adcx	%rax, %r14
+	adcx	%rbp, %r15		# %rbp is 0
+
+	mov	%r9, %rcx
+	shld	\$1, %r8, %r9
+	shl	\$1, %r8
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rdx, %r8
+	 mov	8($inp), %rdx
+	adcx	%rbp, %r9
+
+	mov	%rax, (%rsp)
+	mov	%r8, 8(%rsp)
+
+#second iteration	
+	mulx	16($inp), %rax, %rbx
+	adox	%rax, %r10
+	adcx	%rbx, %r11
+
+	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r8
+	adox	$out, %r11
+	adcx	%r8, %r12
+
+	mulx	32($inp), %rax, %rbx
+	adox	%rax, %r12
+	adcx	%rbx, %r13
+
+	mulx	40($inp), $out, %r8
+	adox	$out, %r13
+	adcx	%r8, %r14
+
+	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
+	adox	%rax, %r14
+	adcx	%rbx, %r15
+
+	.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r8
+	adox	$out, %r15
+	adcx	%rbp, %r8
+	adox	%rbp, %r8
+
+	mov	%r11, %rbx
+	shld	\$1, %r10, %r11
+	shld	\$1, %rcx, %r10
+
+	xor	%ebp,%ebp
+	mulx	%rdx, %rax, %rcx
+	 mov	16($inp), %rdx
+	adcx	%rax, %r9
+	adcx	%rcx, %r10
+	adcx	%rbp, %r11
+
+	mov	%r9, 16(%rsp)
+	.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00		# mov	%r10, 24(%rsp)
+	
+#third iteration	
+	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00	# mulx	24($inp), $out, %r9
+	adox	$out, %r12
+	adcx	%r9, %r13
+
+	mulx	32($inp), %rax, %rcx
+	adox	%rax, %r13
+	adcx	%rcx, %r14
+
+	mulx	40($inp), $out, %r9
+	adox	$out, %r14
+	adcx	%r9, %r15
+
+	.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rcx
+	adox	%rax, %r15
+	adcx	%rcx, %r8
+
+	.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r9
+	adox	$out, %r8
+	adcx	%rbp, %r9
+	adox	%rbp, %r9
+
+	mov	%r13, %rcx
+	shld	\$1, %r12, %r13
+	shld	\$1, %rbx, %r12
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rax, %r11
+	adcx	%rdx, %r12
+	 mov	24($inp), %rdx
+	adcx	%rbp, %r13
+
+	mov	%r11, 32(%rsp)
+	.byte	0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00		# mov	%r12, 40(%rsp)
+	
+#fourth iteration	
+	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00	# mulx	32($inp), %rax, %rbx
+	adox	%rax, %r14
+	adcx	%rbx, %r15
+
+	mulx	40($inp), $out, %r10
+	adox	$out, %r15
+	adcx	%r10, %r8
+
+	mulx	48($inp), %rax, %rbx
+	adox	%rax, %r8
+	adcx	%rbx, %r9
+
+	mulx	56($inp), $out, %r10
+	adox	$out, %r9
+	adcx	%rbp, %r10
+	adox	%rbp, %r10
+
+	.byte	0x66
+	mov	%r15, %rbx
+	shld	\$1, %r14, %r15
+	shld	\$1, %rcx, %r14
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rax, %r13
+	adcx	%rdx, %r14
+	 mov	32($inp), %rdx
+	adcx	%rbp, %r15
+
+	mov	%r13, 48(%rsp)
+	mov	%r14, 56(%rsp)
+	
+#fifth iteration	
+	.byte	0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00	# mulx	40($inp), $out, %r11
+	adox	$out, %r8
+	adcx	%r11, %r9
+
+	mulx	48($inp), %rax, %rcx
+	adox	%rax, %r9
+	adcx	%rcx, %r10
+
+	mulx	56($inp), $out, %r11
+	adox	$out, %r10
+	adcx	%rbp, %r11
+	adox	%rbp, %r11
+
+	mov	%r9, %rcx
+	shld	\$1, %r8, %r9
+	shld	\$1, %rbx, %r8
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rax, %r15
+	adcx	%rdx, %r8
+	 mov	40($inp), %rdx
+	adcx	%rbp, %r9
+
+	mov	%r15, 64(%rsp)
+	mov	%r8, 72(%rsp)
+	
+#sixth iteration	
+	.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	48($inp), %rax, %rbx
+	adox	%rax, %r10
+	adcx	%rbx, %r11
+
+	.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00	# mulx	56($inp), $out, %r12
+	adox	$out, %r11
+	adcx	%rbp, %r12
+	adox	%rbp, %r12
+
+	mov	%r11, %rbx
+	shld	\$1, %r10, %r11
+	shld	\$1, %rcx, %r10
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rax, %r9
+	adcx	%rdx, %r10
+	 mov	48($inp), %rdx
+	adcx	%rbp, %r11
+
+	mov	%r9, 80(%rsp)
+	mov	%r10, 88(%rsp)
+
+#seventh iteration
+	.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	56($inp), %rax, %r13
+	adox	%rax, %r12
+	adox	%rbp, %r13
+
+	xor	%r14, %r14
+	shld	\$1, %r13, %r14
+	shld	\$1, %r12, %r13
+	shld	\$1, %rbx, %r12
+
+	xor	%ebp, %ebp
+	mulx	%rdx, %rax, %rdx
+	adcx	%rax, %r11
+	adcx	%rdx, %r12
+	 mov	56($inp), %rdx
+	adcx	%rbp, %r13
+
+	.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00		# mov	%r11, 96(%rsp)
+	.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00		# mov	%r12, 104(%rsp)
+
+#eighth iteration
+	mulx	%rdx, %rax, %rdx
+	adox	%rax, %r13
+	adox	%rbp, %rdx
+
+	.byte	0x66
+	add	%rdx, %r14
+
+	movq	%r13, 112(%rsp)
+	movq	%r14, 120(%rsp)
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	128(%rsp), %rdx		# pull $n0
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reducex
+
+	addq	64(%rsp), %r8
+	adcq	72(%rsp), %r9
+	adcq	80(%rsp), %r10
+	adcq	88(%rsp), %r11
+	adcq	96(%rsp), %r12
+	adcq	104(%rsp), %r13
+	adcq	112(%rsp), %r14
+	adcq	120(%rsp), %r15
+	sbbq	%rcx, %rcx
+
+	call	__rsaz_512_subtract
+
+	movq	%r8, %rdx
+	movq	%r9, %rax
+	movl	128+8(%rsp), $times
+	movq	$out, $inp
+
+	decl	$times
+	jnz	.Loop_sqrx
+
+.Lsqr_tail:
+___
+}
+$code.=<<___;
+
+	leaq	128+24+48(%rsp), %rax
+	movq	-48(%rax), %r15
+	movq	-40(%rax), %r14
+	movq	-32(%rax), %r13
+	movq	-24(%rax), %r12
+	movq	-16(%rax), %rbp
+	movq	-8(%rax), %rbx
+	leaq	(%rax), %rsp
+.Lsqr_epilogue:
+	ret
+.size	rsaz_512_sqr,.-rsaz_512_sqr
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
+$code.=<<___;
+.globl	rsaz_512_mul
+.type	rsaz_512_mul,\@function,5
+.align	32
+rsaz_512_mul:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	subq	\$128+24, %rsp
+.Lmul_body:
+	movq	$out, %xmm0		# off-load arguments
+	movq	$mod, %xmm1
+	movq	$n0, 128(%rsp)
+___
+$code.=<<___ if ($addx);
+	movl	\$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+	je	.Lmulx
+___
+$code.=<<___;
+	movq	($bp), %rbx		# pass b[0]
+	movq	$bp, %rbp		# pass argument
+	call	__rsaz_512_mul
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+	jmp	.Lmul_tail
+
+.align	32
+.Lmulx:
+	movq	$bp, %rbp		# pass argument
+	movq	($bp), %rdx		# pass b[0]
+	call	__rsaz_512_mulx
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	128(%rsp), %rdx		# pull $n0
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reducex
+.Lmul_tail:
+___
+$code.=<<___;
+	addq	64(%rsp), %r8
+	adcq	72(%rsp), %r9
+	adcq	80(%rsp), %r10
+	adcq	88(%rsp), %r11
+	adcq	96(%rsp), %r12
+	adcq	104(%rsp), %r13
+	adcq	112(%rsp), %r14
+	adcq	120(%rsp), %r15
+	sbbq	%rcx, %rcx
+
+	call	__rsaz_512_subtract
+
+	leaq	128+24+48(%rsp), %rax
+	movq	-48(%rax), %r15
+	movq	-40(%rax), %r14
+	movq	-32(%rax), %r13
+	movq	-24(%rax), %r12
+	movq	-16(%rax), %rbp
+	movq	-8(%rax), %rbx
+	leaq	(%rax), %rsp
+.Lmul_epilogue:
+	ret
+.size	rsaz_512_mul,.-rsaz_512_mul
+___
+}
+{
+my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl	rsaz_512_mul_gather4
+.type	rsaz_512_mul_gather4,\@function,6
+.align	32
+rsaz_512_mul_gather4:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	$pwr, $pwr
+	subq	\$128+24, %rsp
+.Lmul_gather4_body:
+___
+$code.=<<___ if ($addx);
+	movl	\$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+	je	.Lmulx_gather
+___
+$code.=<<___;
+	movl	64($bp,$pwr,4), %eax
+	movq	$out, %xmm0		# off-load arguments
+	movl	($bp,$pwr,4), %ebx
+	movq	$mod, %xmm1
+	movq	$n0, 128(%rsp)
+
+	shlq	\$32, %rax
+	or	%rax, %rbx
+	movq	($ap), %rax
+	 movq	8($ap), %rcx
+	 leaq	128($bp,$pwr,4), %rbp
+	mulq	%rbx			# 0 iteration
+	movq	%rax, (%rsp)
+	movq	%rcx, %rax
+	movq	%rdx, %r8
+
+	mulq	%rbx
+	 movd	(%rbp), %xmm4
+	addq	%rax, %r8
+	movq	16($ap), %rax
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	mulq	%rbx
+	 movd	64(%rbp), %xmm5
+	addq	%rax, %r9
+	movq	24($ap), %rax
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	 pslldq	\$4, %xmm5
+	addq	%rax, %r10
+	movq	32($ap), %rax
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	mulq	%rbx
+	 por	%xmm5, %xmm4
+	addq	%rax, %r11
+	movq	40($ap), %rax
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r12
+	movq	48($ap), %rax
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	 leaq	128(%rbp), %rbp
+	addq	%rax, %r13
+	movq	56($ap), %rax
+	movq	%rdx, %r14
+	adcq	\$0, %r14
+	
+	mulq	%rbx
+	 movq	%xmm4, %rbx
+	addq	%rax, %r14
+	 movq	($ap), %rax
+	movq	%rdx, %r15
+	adcq	\$0, %r15
+
+	leaq	8(%rsp), %rdi
+	movl	\$7, %ecx
+	jmp	.Loop_mul_gather
+
+.align	32
+.Loop_mul_gather:
+	mulq	%rbx
+	addq	%rax, %r8
+	movq	8($ap), %rax
+	movq	%r8, (%rdi)
+	movq	%rdx, %r8
+	adcq	\$0, %r8
+
+	mulq	%rbx
+	 movd	(%rbp), %xmm4
+	addq	%rax, %r9
+	movq	16($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r9, %r8
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	mulq	%rbx
+	 movd	64(%rbp), %xmm5
+	addq	%rax, %r10
+	movq	24($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r10, %r9
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	 pslldq	\$4, %xmm5
+	addq	%rax, %r11
+	movq	32($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r11, %r10
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	mulq	%rbx
+	 por	%xmm5, %xmm4
+	addq	%rax, %r12
+	movq	40($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r12, %r11
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r13
+	movq	48($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r13, %r12
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	addq	%rax, %r14
+	movq	56($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r14, %r13
+	movq	%rdx, %r14
+	adcq	\$0, %r14
+
+	mulq	%rbx
+	 movq	%xmm4, %rbx
+	addq	%rax, %r15
+	 movq	($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r15, %r14
+	movq	%rdx, %r15	
+	adcq	\$0, %r15
+
+	leaq	128(%rbp), %rbp
+	leaq	8(%rdi), %rdi
+
+	decl	%ecx
+	jnz	.Loop_mul_gather
+
+	movq	%r8, (%rdi)
+	movq	%r9, 8(%rdi)
+	movq	%r10, 16(%rdi)
+	movq	%r11, 24(%rdi)
+	movq	%r12, 32(%rdi)
+	movq	%r13, 40(%rdi)
+	movq	%r14, 48(%rdi)
+	movq	%r15, 56(%rdi)
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+	jmp	.Lmul_gather_tail
+
+.align	32
+.Lmulx_gather:
+	mov	64($bp,$pwr,4), %eax
+	movq	$out, %xmm0		# off-load arguments
+	lea	128($bp,$pwr,4), %rbp
+	mov	($bp,$pwr,4), %edx
+	movq	$mod, %xmm1
+	mov	$n0, 128(%rsp)
+
+	shl	\$32, %rax
+	or	%rax, %rdx
+	mulx	($ap), %rbx, %r8	# 0 iteration
+	mov	%rbx, (%rsp)
+	xor	%edi, %edi		# cf=0, of=0
+
+	mulx	8($ap), %rax, %r9
+	 movd	(%rbp), %xmm4
+
+	mulx	16($ap), %rbx, %r10
+	 movd	64(%rbp), %xmm5
+	adcx	%rax, %r8
+
+	mulx	24($ap), %rax, %r11
+	 pslldq	\$4, %xmm5
+	adcx	%rbx, %r9
+
+	mulx	32($ap), %rbx, %r12
+	 por	%xmm5, %xmm4
+	adcx	%rax, %r10
+
+	mulx	40($ap), %rax, %r13
+	adcx	%rbx, %r11
+
+	mulx	48($ap), %rbx, %r14
+	 lea	128(%rbp), %rbp
+	adcx	%rax, %r12
+	
+	mulx	56($ap), %rax, %r15
+	 movq	%xmm4, %rdx
+	adcx	%rbx, %r13
+	adcx	%rax, %r14
+	mov	%r8, %rbx
+	adcx	%rdi, %r15		# %rdi is 0
+
+	mov	\$-7, %rcx
+	jmp	.Loop_mulx_gather
+
+.align	32
+.Loop_mulx_gather:
+	mulx	($ap), %rax, %r8
+	adcx	%rax, %rbx
+	adox	%r9, %r8
+
+	mulx	8($ap), %rax, %r9
+	.byte	0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00		# movd	(%rbp), %xmm4
+	adcx	%rax, %r8
+	adox	%r10, %r9
+
+	mulx	16($ap), %rax, %r10
+	 movd	64(%rbp), %xmm5
+	 lea	128(%rbp), %rbp
+	adcx	%rax, %r9
+	adox	%r11, %r10
+
+	.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00	# mulx	24($ap), %rax, %r11
+	 pslldq	\$4, %xmm5
+	 por	%xmm5, %xmm4
+	adcx	%rax, %r10
+	adox	%r12, %r11
+
+	mulx	32($ap), %rax, %r12
+	adcx	%rax, %r11
+	adox	%r13, %r12
+
+	mulx	40($ap), %rax, %r13
+	adcx	%rax, %r12
+	adox	%r14, %r13
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
+	adcx	%rax, %r13
+	adox	%r15, %r14
+
+	mulx	56($ap), %rax, %r15
+	 movq	%xmm4, %rdx
+	 mov	%rbx, 64(%rsp,%rcx,8)
+	adcx	%rax, %r14
+	adox	%rdi, %r15
+	mov	%r8, %rbx
+	adcx	%rdi, %r15		# cf=0
+
+	inc	%rcx			# of=0
+	jnz	.Loop_mulx_gather
+
+	mov	%r8, 64(%rsp)
+	mov	%r9, 64+8(%rsp)
+	mov	%r10, 64+16(%rsp)
+	mov	%r11, 64+24(%rsp)
+	mov	%r12, 64+32(%rsp)
+	mov	%r13, 64+40(%rsp)
+	mov	%r14, 64+48(%rsp)
+	mov	%r15, 64+56(%rsp)
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	mov	128(%rsp), %rdx		# pull $n0
+	mov	(%rsp), %r8
+	mov	8(%rsp), %r9
+	mov	16(%rsp), %r10
+	mov	24(%rsp), %r11
+	mov	32(%rsp), %r12
+	mov	40(%rsp), %r13
+	mov	48(%rsp), %r14
+	mov	56(%rsp), %r15
+
+	call	__rsaz_512_reducex
+
+.Lmul_gather_tail:
+___
+$code.=<<___;
+	addq	64(%rsp), %r8
+	adcq	72(%rsp), %r9
+	adcq	80(%rsp), %r10
+	adcq	88(%rsp), %r11
+	adcq	96(%rsp), %r12
+	adcq	104(%rsp), %r13
+	adcq	112(%rsp), %r14
+	adcq	120(%rsp), %r15
+	sbbq	%rcx, %rcx
+
+	call	__rsaz_512_subtract
+
+	leaq	128+24+48(%rsp), %rax
+	movq	-48(%rax), %r15
+	movq	-40(%rax), %r14
+	movq	-32(%rax), %r13
+	movq	-24(%rax), %r12
+	movq	-16(%rax), %rbp
+	movq	-8(%rax), %rbx
+	leaq	(%rax), %rsp
+.Lmul_gather4_epilogue:
+	ret
+.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
+___
+}
+{
+my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+$code.=<<___;
+.globl	rsaz_512_mul_scatter4
+.type	rsaz_512_mul_scatter4,\@function,6
+.align	32
+rsaz_512_mul_scatter4:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	$pwr, $pwr
+	subq	\$128+24, %rsp
+.Lmul_scatter4_body:
+	leaq	($tbl,$pwr,4), $tbl
+	movq	$out, %xmm0		# off-load arguments
+	movq	$mod, %xmm1
+	movq	$tbl, %xmm2
+	movq	$n0, 128(%rsp)
+
+	movq	$out, %rbp
+___
+$code.=<<___ if ($addx);
+	movl	\$0x80100,%r11d
+	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
+	cmpl	\$0x80100,%r11d		# check for MULX and ADCX/ADOX
+	je	.Lmulx_scatter
+___
+$code.=<<___;
+	movq	($out),%rbx		# pass b[0]
+	call	__rsaz_512_mul
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+	jmp	.Lmul_scatter_tail
+	
+.align	32
+.Lmulx_scatter:
+	movq	($out), %rdx		# pass b[0]
+	call	__rsaz_512_mulx
+
+	movq	%xmm0, $out
+	movq	%xmm1, %rbp
+
+	movq	128(%rsp), %rdx		# pull $n0
+	movq	(%rsp), %r8
+	movq	8(%rsp), %r9
+	movq	16(%rsp), %r10
+	movq	24(%rsp), %r11
+	movq	32(%rsp), %r12
+	movq	40(%rsp), %r13
+	movq	48(%rsp), %r14
+	movq	56(%rsp), %r15
+
+	call	__rsaz_512_reducex
+
+.Lmul_scatter_tail:
+___
+$code.=<<___;
+	addq	64(%rsp), %r8
+	adcq	72(%rsp), %r9
+	adcq	80(%rsp), %r10
+	adcq	88(%rsp), %r11
+	adcq	96(%rsp), %r12
+	adcq	104(%rsp), %r13
+	adcq	112(%rsp), %r14
+	adcq	120(%rsp), %r15
+	movq	%xmm2, $inp
+	sbbq	%rcx, %rcx
+
+	call	__rsaz_512_subtract
+
+	movl	%r8d, 64*0($inp)	# scatter
+	shrq	\$32, %r8
+	movl	%r9d, 64*2($inp)
+	shrq	\$32, %r9
+	movl	%r10d, 64*4($inp)
+	shrq	\$32, %r10
+	movl	%r11d, 64*6($inp)
+	shrq	\$32, %r11
+	movl	%r12d, 64*8($inp)
+	shrq	\$32, %r12
+	movl	%r13d, 64*10($inp)
+	shrq	\$32, %r13
+	movl	%r14d, 64*12($inp)
+	shrq	\$32, %r14
+	movl	%r15d, 64*14($inp)
+	shrq	\$32, %r15
+	movl	%r8d, 64*1($inp)
+	movl	%r9d, 64*3($inp)
+	movl	%r10d, 64*5($inp)
+	movl	%r11d, 64*7($inp)
+	movl	%r12d, 64*9($inp)
+	movl	%r13d, 64*11($inp)
+	movl	%r14d, 64*13($inp)
+	movl	%r15d, 64*15($inp)
+
+	leaq	128+24+48(%rsp), %rax
+	movq	-48(%rax), %r15
+	movq	-40(%rax), %r14
+	movq	-32(%rax), %r13
+	movq	-24(%rax), %r12
+	movq	-16(%rax), %rbp
+	movq	-8(%rax), %rbx
+	leaq	(%rax), %rsp
+.Lmul_scatter4_epilogue:
+	ret
+.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
+___
+}
+{
+my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
+$code.=<<___;
+.globl	rsaz_512_mul_by_one
+.type	rsaz_512_mul_by_one,\@function,4
+.align	32
+rsaz_512_mul_by_one:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	subq	\$128+24, %rsp
+.Lmul_by_one_body:
+___
+$code.=<<___ if ($addx);
+	movl	OPENSSL_ia32cap_P+8(%rip),%eax
+___
+$code.=<<___;
+	movq	$mod, %rbp	# reassign argument
+	movq	$n0, 128(%rsp)
+
+	movq	($inp), %r8
+	pxor	%xmm0, %xmm0
+	movq	8($inp), %r9
+	movq	16($inp), %r10
+	movq	24($inp), %r11
+	movq	32($inp), %r12
+	movq	40($inp), %r13
+	movq	48($inp), %r14
+	movq	56($inp), %r15
+
+	movdqa	%xmm0, (%rsp)
+	movdqa	%xmm0, 16(%rsp)
+	movdqa	%xmm0, 32(%rsp)
+	movdqa	%xmm0, 48(%rsp)
+	movdqa	%xmm0, 64(%rsp)
+	movdqa	%xmm0, 80(%rsp)
+	movdqa	%xmm0, 96(%rsp)
+___
+$code.=<<___ if ($addx);
+	andl	\$0x80100,%eax
+	cmpl	\$0x80100,%eax		# check for MULX and ADCX/ADOX
+	je	.Lby_one_callx
+___
+$code.=<<___;
+	call	__rsaz_512_reduce
+___
+$code.=<<___ if ($addx);
+	jmp	.Lby_one_tail
+.align	32
+.Lby_one_callx:
+	movq	128(%rsp), %rdx		# pull $n0
+	call	__rsaz_512_reducex
+.Lby_one_tail:
+___
+$code.=<<___;
+	movq	%r8, ($out)
+	movq	%r9, 8($out)
+	movq	%r10, 16($out)
+	movq	%r11, 24($out)
+	movq	%r12, 32($out)
+	movq	%r13, 40($out)
+	movq	%r14, 48($out)
+	movq	%r15, 56($out)
+
+	leaq	128+24+48(%rsp), %rax
+	movq	-48(%rax), %r15
+	movq	-40(%rax), %r14
+	movq	-32(%rax), %r13
+	movq	-24(%rax), %r12
+	movq	-16(%rax), %rbp
+	movq	-8(%rax), %rbx
+	leaq	(%rax), %rsp
+.Lmul_by_one_epilogue:
+	ret
+.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
+___
+}
+{	# __rsaz_512_reduce
+	#
+	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
+	# output:	%r8-%r15
+	# clobbers:	everything except %rbp and %rdi
+$code.=<<___;
+.type	__rsaz_512_reduce,\@abi-omnipotent
+.align	32
+__rsaz_512_reduce:
+	movq	%r8, %rbx
+	imulq	128+8(%rsp), %rbx
+	movq	0(%rbp), %rax
+	movl	\$8, %ecx
+	jmp	.Lreduction_loop
+
+.align	32
+.Lreduction_loop:
+	mulq	%rbx
+	movq	8(%rbp), %rax
+	negq	%r8
+	movq	%rdx, %r8
+	adcq	\$0, %r8
+
+	mulq	%rbx
+	addq	%rax, %r9
+	movq	16(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r9, %r8
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	mulq	%rbx
+	addq	%rax, %r10
+	movq	24(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r10, %r9
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	addq	%rax, %r11
+	movq	32(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r11, %r10
+	 movq	128+8(%rsp), %rsi
+	#movq	%rdx, %r11
+	#adcq	\$0, %r11
+	adcq	\$0, %rdx
+	movq	%rdx, %r11
+
+	mulq	%rbx
+	addq	%rax, %r12
+	movq	40(%rbp), %rax
+	adcq	\$0, %rdx
+	 imulq	%r8, %rsi
+	addq	%r12, %r11
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r13
+	movq	48(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r13, %r12
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	addq	%rax, %r14
+	movq	56(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r14, %r13
+	movq	%rdx, %r14
+	adcq	\$0, %r14
+
+	mulq	%rbx
+	 movq	%rsi, %rbx
+	addq	%rax, %r15
+	 movq	0(%rbp), %rax
+	adcq	\$0, %rdx
+	addq	%r15, %r14
+	movq	%rdx, %r15
+	adcq	\$0, %r15
+
+	decl	%ecx
+	jne	.Lreduction_loop
+
+	ret
+.size	__rsaz_512_reduce,.-__rsaz_512_reduce
+___
+}
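
For readers following the register shuffling above: __rsaz_512_reduce is the classic word-serial Montgomery reduction, retiring one 64-bit limb of the product per pass of .Lreduction_loop. A rough plain-C sketch of the same computation (illustrative only, not from the imported file; it folds the whole 16-limb product in one pass, whereas the assembly reduces the low half in registers and adds the saved high half back afterwards, and it assumes a compiler with unsigned __int128):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: Montgomery-reduce a 16-limb product t[] by the 8-limb modulus
     * n[], given n0 = -n^-1 mod 2^64.  Returns the top carry bit; the caller
     * still owes one conditional subtraction of n (see __rsaz_512_subtract). */
    static uint64_t rsaz_reduce_sketch(uint64_t r[8], uint64_t t[16],
                                       const uint64_t n[8], uint64_t n0) {
      uint64_t top = 0;
      for (int i = 0; i < 8; i++) {
        uint64_t m = t[i] * n0;            /* so t[i] + m*n[0] == 0 mod 2^64 */
        uint64_t carry = 0;
        for (int j = 0; j < 8; j++) {
          u128 acc = (u128)m * n[j] + t[i + j] + carry;
          t[i + j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        for (int j = i + 8; j < 16; j++) { /* ripple the carry upward */
          u128 acc = (u128)t[j] + carry;
          t[j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        top += carry;
      }
      for (int i = 0; i < 8; i++)
        r[i] = t[i + 8];                   /* limbs 0..7 of t are now zero */
      return top;                          /* 0 or 1 */
    }
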
+if ($addx) {
+	# __rsaz_512_reducex
+	#
+	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
+	# output:	%r8-%r15
+	# clobbers:	everything except %rbp and %rdi
+$code.=<<___;
+.type	__rsaz_512_reducex,\@abi-omnipotent
+.align	32
+__rsaz_512_reducex:
+	#movq	128+8(%rsp), %rdx		# pull $n0
+	imulq	%r8, %rdx
+	xorq	%rsi, %rsi			# cf=0,of=0
+	movl	\$8, %ecx
+	jmp	.Lreduction_loopx
+
+.align	32
+.Lreduction_loopx:
+	mov	%r8, %rbx
+	mulx	0(%rbp), %rax, %r8
+	adcx	%rbx, %rax
+	adox	%r9, %r8
+
+	mulx	8(%rbp), %rax, %r9
+	adcx	%rax, %r8
+	adox	%r10, %r9
+
+	mulx	16(%rbp), %rbx, %r10
+	adcx	%rbx, %r9
+	adox	%r11, %r10
+
+	mulx	24(%rbp), %rbx, %r11
+	adcx	%rbx, %r10
+	adox	%r12, %r11
+
+	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	32(%rbp), %rbx, %r12
+	 mov	%rdx, %rax
+	 mov	%r8, %rdx
+	adcx	%rbx, %r11
+	adox	%r13, %r12
+
+	 mulx	128+8(%rsp), %rbx, %rdx
+	 mov	%rax, %rdx
+
+	mulx	40(%rbp), %rax, %r13
+	adcx	%rax, %r12
+	adox	%r14, %r13
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx	48(%rbp), %rax, %r14
+	adcx	%rax, %r13
+	adox	%r15, %r14
+
+	mulx	56(%rbp), %rax, %r15
+	 mov	%rbx, %rdx
+	adcx	%rax, %r14
+	adox	%rsi, %r15			# %rsi is 0
+	adcx	%rsi, %r15			# cf=0
+
+	decl	%ecx				# of=0
+	jne	.Lreduction_loopx
+
+	ret
+.size	__rsaz_512_reducex,.-__rsaz_512_reducex
+___
+}
+{	# __rsaz_512_subtract
+	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
+	# output:
+	# clobbers: everything but %rdi, %rsi and %rbp
+$code.=<<___;
+.type	__rsaz_512_subtract,\@abi-omnipotent
+.align	32
+__rsaz_512_subtract:
+	movq	%r8, ($out)
+	movq	%r9, 8($out)
+	movq	%r10, 16($out)
+	movq	%r11, 24($out)
+	movq	%r12, 32($out)
+	movq	%r13, 40($out)
+	movq	%r14, 48($out)
+	movq	%r15, 56($out)
+
+	movq	0($mod), %r8
+	movq	8($mod), %r9
+	negq	%r8
+	notq	%r9
+	andq	%rcx, %r8
+	movq	16($mod), %r10
+	andq	%rcx, %r9
+	notq	%r10
+	movq	24($mod), %r11
+	andq	%rcx, %r10
+	notq	%r11
+	movq	32($mod), %r12
+	andq	%rcx, %r11
+	notq	%r12
+	movq	40($mod), %r13
+	andq	%rcx, %r12
+	notq	%r13
+	movq	48($mod), %r14
+	andq	%rcx, %r13
+	notq	%r14
+	movq	56($mod), %r15
+	andq	%rcx, %r14
+	notq	%r15
+	andq	%rcx, %r15
+
+	addq	($out), %r8
+	adcq	8($out), %r9
+	adcq	16($out), %r10
+	adcq	24($out), %r11
+	adcq	32($out), %r12
+	adcq	40($out), %r13
+	adcq	48($out), %r14
+	adcq	56($out), %r15
+
+	movq	%r8, ($out)
+	movq	%r9, 8($out)
+	movq	%r10, 16($out)
+	movq	%r11, 24($out)
+	movq	%r12, 32($out)
+	movq	%r13, 40($out)
+	movq	%r14, 48($out)
+	movq	%r15, 56($out)
+
+	ret
+.size	__rsaz_512_subtract,.-__rsaz_512_subtract
+___
+}
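
__rsaz_512_subtract performs the final conditional subtraction without branching: %rcx carries an all-ones or all-zero mask, and the masked two's complement of the modulus is added to the stored result. The same effect in plain C (a sketch, not from the imported sources):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: r -= n when mask is all-ones, r unchanged when mask is zero,
     * with no data-dependent branch.  The assembly adds the masked two's
     * complement of n instead of subtracting, which is equivalent. */
    static void cond_subtract_sketch(uint64_t r[8], const uint64_t n[8],
                                     uint64_t mask) {
      uint64_t borrow = 0;
      for (int i = 0; i < 8; i++) {
        u128 d = (u128)r[i] - (n[i] & mask) - borrow;
        r[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
      }
    }
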
+{	# __rsaz_512_mul
+	#
+	# input: %rsi - ap, %rbp - bp
+	# output:
+	# clobbers: everything
+my ($ap,$bp) = ("%rsi","%rbp");
+$code.=<<___;
+.type	__rsaz_512_mul,\@abi-omnipotent
+.align	32
+__rsaz_512_mul:
+	leaq	8(%rsp), %rdi
+
+	movq	($ap), %rax
+	mulq	%rbx
+	movq	%rax, (%rdi)
+	movq	8($ap), %rax
+	movq	%rdx, %r8
+
+	mulq	%rbx
+	addq	%rax, %r8
+	movq	16($ap), %rax
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	mulq	%rbx
+	addq	%rax, %r9
+	movq	24($ap), %rax
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	addq	%rax, %r10
+	movq	32($ap), %rax
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	mulq	%rbx
+	addq	%rax, %r11
+	movq	40($ap), %rax
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r12
+	movq	48($ap), %rax
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	addq	%rax, %r13
+	movq	56($ap), %rax
+	movq	%rdx, %r14
+	adcq	\$0, %r14
+	
+	mulq	%rbx
+	addq	%rax, %r14
+	 movq	($ap), %rax
+	movq	%rdx, %r15
+	adcq	\$0, %r15
+
+	leaq	8($bp), $bp
+	leaq	8(%rdi), %rdi
+
+	movl	\$7, %ecx
+	jmp	.Loop_mul
+
+.align	32
+.Loop_mul:
+	movq	($bp), %rbx
+	mulq	%rbx
+	addq	%rax, %r8
+	movq	8($ap), %rax
+	movq	%r8, (%rdi)
+	movq	%rdx, %r8
+	adcq	\$0, %r8
+
+	mulq	%rbx
+	addq	%rax, %r9
+	movq	16($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r9, %r8
+	movq	%rdx, %r9
+	adcq	\$0, %r9
+
+	mulq	%rbx
+	addq	%rax, %r10
+	movq	24($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r10, %r9
+	movq	%rdx, %r10
+	adcq	\$0, %r10
+
+	mulq	%rbx
+	addq	%rax, %r11
+	movq	32($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r11, %r10
+	movq	%rdx, %r11
+	adcq	\$0, %r11
+
+	mulq	%rbx
+	addq	%rax, %r12
+	movq	40($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r12, %r11
+	movq	%rdx, %r12
+	adcq	\$0, %r12
+
+	mulq	%rbx
+	addq	%rax, %r13
+	movq	48($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r13, %r12
+	movq	%rdx, %r13
+	adcq	\$0, %r13
+
+	mulq	%rbx
+	addq	%rax, %r14
+	movq	56($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r14, %r13
+	movq	%rdx, %r14
+	 leaq	8($bp), $bp
+	adcq	\$0, %r14
+
+	mulq	%rbx
+	addq	%rax, %r15
+	 movq	($ap), %rax
+	adcq	\$0, %rdx
+	addq	%r15, %r14
+	movq	%rdx, %r15	
+	adcq	\$0, %r15
+
+	leaq	8(%rdi), %rdi
+
+	decl	%ecx
+	jnz	.Loop_mul
+
+	movq	%r8, (%rdi)
+	movq	%r9, 8(%rdi)
+	movq	%r10, 16(%rdi)
+	movq	%r11, 24(%rdi)
+	movq	%r12, 32(%rdi)
+	movq	%r13, 40(%rdi)
+	movq	%r14, 48(%rdi)
+	movq	%r15, 56(%rdi)
+
+	ret
+.size	__rsaz_512_mul,.-__rsaz_512_mul
+___
+}
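
__rsaz_512_mul (and its MULX twin below) is plain row-by-row schoolbook multiplication: the first pass multiplies a[] by b[0], then .Loop_mul adds a[]*b[i] for the remaining seven words, leaving the 128-byte product starting at 8(%rsp). A minimal C sketch of that 8x8-limb product (illustrative only, assuming unsigned __int128):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: t[0..15] = a[0..7] * b[0..7], one row of partial products
     * per outer iteration, the same shape as the assembly loop. */
    static void mul_8x8_sketch(uint64_t t[16], const uint64_t a[8],
                               const uint64_t b[8]) {
      for (int i = 0; i < 16; i++)
        t[i] = 0;
      for (int i = 0; i < 8; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 8; j++) {
          u128 acc = (u128)a[j] * b[i] + t[i + j] + carry;
          t[i + j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        t[i + 8] = carry;
      }
    }
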
+if ($addx) {
+	# __rsaz_512_mulx
+	#
+	# input: %rsi - ap, %rbp - bp
+	# output:
+	# clobbers: everything
+my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
+$code.=<<___;
+.type	__rsaz_512_mulx,\@abi-omnipotent
+.align	32
+__rsaz_512_mulx:
+	mulx	($ap), %rbx, %r8	# initial %rdx preloaded by caller
+	mov	\$-6, %rcx
+
+	mulx	8($ap), %rax, %r9
+	movq	%rbx, 8(%rsp)
+
+	mulx	16($ap), %rbx, %r10
+	adc	%rax, %r8
+
+	mulx	24($ap), %rax, %r11
+	adc	%rbx, %r9
+
+	mulx	32($ap), %rbx, %r12
+	adc	%rax, %r10
+
+	mulx	40($ap), %rax, %r13
+	adc	%rbx, %r11
+
+	mulx	48($ap), %rbx, %r14
+	adc	%rax, %r12
+
+	mulx	56($ap), %rax, %r15
+	 mov	8($bp), %rdx
+	adc	%rbx, %r13
+	adc	%rax, %r14
+	adc	\$0, %r15
+
+	xor	$zero, $zero		# cf=0,of=0
+	jmp	.Loop_mulx
+
+.align	32
+.Loop_mulx:
+	movq	%r8, %rbx
+	mulx	($ap), %rax, %r8
+	adcx	%rax, %rbx
+	adox	%r9, %r8
+
+	mulx	8($ap), %rax, %r9
+	adcx	%rax, %r8
+	adox	%r10, %r9
+
+	mulx	16($ap), %rax, %r10
+	adcx	%rax, %r9
+	adox	%r11, %r10
+
+	mulx	24($ap), %rax, %r11
+	adcx	%rax, %r10
+	adox	%r12, %r11
+
+	.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00	# mulx	32($ap), %rax, %r12
+	adcx	%rax, %r11
+	adox	%r13, %r12
+
+	mulx	40($ap), %rax, %r13
+	adcx	%rax, %r12
+	adox	%r14, %r13
+
+	mulx	48($ap), %rax, %r14
+	adcx	%rax, %r13
+	adox	%r15, %r14
+
+	mulx	56($ap), %rax, %r15
+	 movq	64($bp,%rcx,8), %rdx
+	 movq	%rbx, 8+64-8(%rsp,%rcx,8)
+	adcx	%rax, %r14
+	adox	$zero, %r15
+	adcx	$zero, %r15		# cf=0
+
+	inc	%rcx			# of=0
+	jnz	.Loop_mulx
+
+	movq	%r8, %rbx
+	mulx	($ap), %rax, %r8
+	adcx	%rax, %rbx
+	adox	%r9, %r8
+
+	.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00	# mulx	8($ap), %rax, %r9
+	adcx	%rax, %r8
+	adox	%r10, %r9
+
+	.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00	# mulx	16($ap), %rax, %r10
+	adcx	%rax, %r9
+	adox	%r11, %r10
+
+	mulx	24($ap), %rax, %r11
+	adcx	%rax, %r10
+	adox	%r12, %r11
+
+	mulx	32($ap), %rax, %r12
+	adcx	%rax, %r11
+	adox	%r13, %r12
+
+	mulx	40($ap), %rax, %r13
+	adcx	%rax, %r12
+	adox	%r14, %r13
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00	# mulx	48($ap), %rax, %r14
+	adcx	%rax, %r13
+	adox	%r15, %r14
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00	# mulx	56($ap), %rax, %r15
+	adcx	%rax, %r14
+	adox	$zero, %r15
+	adcx	$zero, %r15
+
+	mov	%rbx, 8+64-8(%rsp)
+	mov	%r8, 8+64(%rsp)
+	mov	%r9, 8+64+8(%rsp)
+	mov	%r10, 8+64+16(%rsp)
+	mov	%r11, 8+64+24(%rsp)
+	mov	%r12, 8+64+32(%rsp)
+	mov	%r13, 8+64+40(%rsp)
+	mov	%r14, 8+64+48(%rsp)
+	mov	%r15, 8+64+56(%rsp)
+
+	ret
+.size	__rsaz_512_mulx,.-__rsaz_512_mulx
+___
+}
+{
+my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
+$code.=<<___;
+.globl	rsaz_512_scatter4
+.type	rsaz_512_scatter4,\@abi-omnipotent
+.align	16
+rsaz_512_scatter4:
+	leaq	($out,$power,4), $out
+	movl	\$8, %r9d
+	jmp	.Loop_scatter
+.align	16
+.Loop_scatter:
+	movq	($inp), %rax
+	leaq	8($inp), $inp
+	movl	%eax, ($out)
+	shrq	\$32, %rax
+	movl	%eax, 64($out)
+	leaq	128($out), $out
+	decl	%r9d
+	jnz	.Loop_scatter
+	ret
+.size	rsaz_512_scatter4,.-rsaz_512_scatter4
+
+.globl	rsaz_512_gather4
+.type	rsaz_512_gather4,\@abi-omnipotent
+.align	16
+rsaz_512_gather4:
+	leaq	($inp,$power,4), $inp
+	movl	\$8, %r9d
+	jmp	.Loop_gather
+.align	16
+.Loop_gather:
+	movl	($inp), %eax
+	movl	64($inp), %r8d
+	leaq	128($inp), $inp
+	shlq	\$32, %r8
+	or	%r8, %rax
+	movq	%rax, ($out)
+	leaq	8($out), $out
+	decl	%r9d
+	jnz	.Loop_gather
+	ret
+.size	rsaz_512_gather4,.-rsaz_512_gather4
+___
+}
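
rsaz_512_scatter4 and rsaz_512_gather4 store each table entry as sixteen 32-bit words spread 64 bytes apart, so the entry index only selects a 4-byte column. A C sketch of the same layout (the helper names are made up for illustration):

    #include <stdint.h>

    /* Sketch of the interleaved layout used above: entry `power` occupies
     * the 4-byte column tbl[power], tbl[power+16], tbl[power+32], ...,
     * i.e. a 64-byte stride between consecutive 32-bit halves of the
     * same 512-bit value. */
    static void scatter4_sketch(uint32_t *tbl, const uint64_t val[8], int power) {
      uint32_t *p = tbl + power;
      for (int i = 0; i < 8; i++) {
        p[32 * i]      = (uint32_t)val[i];          /* low half  */
        p[32 * i + 16] = (uint32_t)(val[i] >> 32);  /* high half */
      }
    }

    static void gather4_sketch(uint64_t val[8], const uint32_t *tbl, int power) {
      const uint32_t *p = tbl + power;
      for (int i = 0; i < 8; i++)
        val[i] = (uint64_t)p[32 * i] | ((uint64_t)p[32 * i + 16] << 32);
    }
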
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	se_handler,\@abi-omnipotent
+.align	16
+se_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	128+24+48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	se_handler,.-se_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_rsaz_512_sqr
+	.rva	.LSEH_end_rsaz_512_sqr
+	.rva	.LSEH_info_rsaz_512_sqr
+
+	.rva	.LSEH_begin_rsaz_512_mul
+	.rva	.LSEH_end_rsaz_512_mul
+	.rva	.LSEH_info_rsaz_512_mul
+
+	.rva	.LSEH_begin_rsaz_512_mul_gather4
+	.rva	.LSEH_end_rsaz_512_mul_gather4
+	.rva	.LSEH_info_rsaz_512_mul_gather4
+
+	.rva	.LSEH_begin_rsaz_512_mul_scatter4
+	.rva	.LSEH_end_rsaz_512_mul_scatter4
+	.rva	.LSEH_info_rsaz_512_mul_scatter4
+
+	.rva	.LSEH_begin_rsaz_512_mul_by_one
+	.rva	.LSEH_end_rsaz_512_mul_by_one
+	.rva	.LSEH_info_rsaz_512_mul_by_one
+
+.section	.xdata
+.align	8
+.LSEH_info_rsaz_512_sqr:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lsqr_body,.Lsqr_epilogue			# HandlerData[]
+.LSEH_info_rsaz_512_mul:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lmul_body,.Lmul_epilogue			# HandlerData[]
+.LSEH_info_rsaz_512_mul_gather4:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lmul_gather4_body,.Lmul_gather4_epilogue	# HandlerData[]
+.LSEH_info_rsaz_512_mul_scatter4:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lmul_scatter4_body,.Lmul_scatter4_epilogue	# HandlerData[]
+.LSEH_info_rsaz_512_mul_by_one:
+	.byte	9,0,0,0
+	.rva	se_handler
+	.rva	.Lmul_by_one_body,.Lmul_by_one_epilogue		# HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval $1/gem;
+print $code;
+close STDOUT;
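
The 0x80100 capability tests against OPENSSL_ia32cap_P+8 throughout this file gate the MULX/ADCX/ADOX paths; that word mirrors CPUID.(EAX=7,ECX=0):EBX, where bit 8 is BMI2 (MULX) and bit 19 is ADX. A stand-alone sketch of the equivalent check (hypothetical helper, using GCC/Clang's <cpuid.h>):

    #include <cpuid.h>

    /* Sketch: return nonzero iff both BMI2 (MULX) and ADX (ADCX/ADOX)
     * are reported by CPUID leaf 7, matching the 0x80100 mask above. */
    static int have_mulx_adx(void) {
      unsigned int eax, ebx, ecx, edx;
      if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        return 0;
      return (ebx & 0x80100u) == 0x80100u;  /* bit 8 | bit 19 */
    }
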
diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl
new file mode 100644
index 0000000..e8f6b05
--- /dev/null
+++ b/crypto/bn/asm/x86-mont.pl
@@ -0,0 +1,593 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4; others fall back to the original code). Then the inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";	$tp="esi";		# overlapping variables!!!
+$rp="edi";	$bp="edi";		# overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");			# stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");	$_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;				# size of above frame rounded up to 16n
+
+	&xor	("eax","eax");
+	&mov	("edi",&wparam(5));	# int num
+	&cmp	("edi",4);
+	&jl	(&label("just_leave"));
+
+	&lea	("esi",&wparam(0));	# put aside pointer to argument block
+	&lea	("edx",&wparam(1));	# load ap
+	&mov	("ebp","esp");		# saved stack pointer!
+	&add	("edi",2);		# extra two words on top of tp
+	&neg	("edi");
+	&lea	("esp",&DWP(-$frame,"esp","edi",4));	# alloca($frame+4*(num+2))
+	&neg	("edi");
+
+	# minimize cache contention by arranging a 2K window between the stack
+	# pointer and the ap argument [np is also a position-sensitive vector,
+	# but it's assumed to be near ap, as it's allocated at about the same
+	# time].
+	&mov	("eax","esp");
+	&sub	("eax","edx");
+	&and	("eax",2047);
+	&sub	("esp","eax");		# this aligns sp and ap modulo 2048
+
+	&xor	("edx","esp");
+	&and	("edx",2048);
+	&xor	("edx",2048);
+	&sub	("esp","edx");		# this splits them apart modulo 4096
+
+	&and	("esp",-64);		# align to cache line
+
+	################################# load argument block...
+	&mov	("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+	&mov	("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+	&mov	("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+	&mov	("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+	&mov	("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+	#&mov	("edi",&DWP(5*4,"esi"));# int num
+
+	&mov	("esi",&DWP(0,"esi"));	# pull n0[0]
+	&mov	($_rp,"eax");		# ... save a copy of argument block
+	&mov	($_ap,"ebx");
+	&mov	($_bp,"ecx");
+	&mov	($_np,"edx");
+	&mov	($_n0,"esi");
+	&lea	($num,&DWP(-3,"edi"));	# num=num-1 to assist modulo-scheduling
+	#&mov	($_num,$num);		# redundant as $num is not reused
+	&mov	($_sp,"ebp");		# saved stack pointer!
+
+if($sse2) {
+$acc0="mm0";	# mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+	&picmeup("eax","OPENSSL_ia32cap_P");
+	&bt	(&DWP(0,"eax"),26);
+	&jnc	(&label("non_sse2"));
+
+	&mov	("eax",-1);
+	&movd	($mask,"eax");		# mask 32 lower bits
+
+	&mov	($ap,$_ap);		# load input pointers
+	&mov	($bp,$_bp);
+	&mov	($np,$_np);
+
+	&xor	($i,$i);		# i=0
+	&xor	($j,$j);		# j=0
+
+	&movd	($mul0,&DWP(0,$bp));		# bp[0]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[0]
+	&movq	($car0,$mul1);
+	&movq	($acc0,$mul1);			# I wish movd worked for
+	&pand	($acc0,$mask);			# inter-register transfers
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);			# "t[0]"*np[0]*n0
+	&paddq	($car1,$acc0);
+
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&inc	($j);				# j++
+&set_label("1st",16);
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[0];
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[j-1]=
+	&psrlq	($car1,32);
+
+	&lea	($j,&DWP(1,$j));
+	&cmp	($j,$num);
+	&jl	(&label("1st"));
+
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[0]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[0];
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&paddq	($car1,$car0);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&inc	($i);				# i++
+&set_label("outer");
+	&xor	($j,$j);			# j=0
+
+	&movd	($mul0,&DWP(0,$bp,$i,4));	# bp[i]
+	&movd	($mul1,&DWP(0,$ap));		# ap[0]
+	&movd	($temp,&DWP($frame,"esp"));	# tp[0]
+	&movd	($car1,&DWP(0,$np));		# np[0]
+	&pmuludq($mul1,$mul0);			# ap[0]*bp[i]
+
+	&paddq	($mul1,$temp);			# +=tp[0]
+	&movq	($acc0,$mul1);
+	&movq	($car0,$mul1);
+	&pand	($acc0,$mask);
+
+	&pmuludq($mul1,$_n0q);			# *=n0
+
+	&pmuludq($car1,$mul1);
+	&paddq	($car1,$acc0);
+
+	&movd	($temp,&DWP($frame+4,"esp"));	# tp[1]
+	&movd	($acc1,&DWP(4,$np));		# np[1]
+	&movd	($acc0,&DWP(4,$ap));		# ap[1]
+
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[1]
+
+	&inc	($j);				# j++
+	&dec	($num);
+&set_label("inner");
+	&pmuludq($acc0,$mul0);			# ap[j]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[j]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&movd	($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+	&pand	($acc0,$mask);
+	&movd	($acc1,&DWP(4,$np,$j,4));	# np[j+1]
+	&paddq	($car1,$acc0);			# +=ap[j]*bp[i]+tp[j]
+	&movd	($acc0,&DWP(4,$ap,$j,4));	# ap[j+1]
+	&psrlq	($car0,32);
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+	&psrlq	($car1,32);
+	&paddq	($car0,$temp);			# +=tp[j+1]
+
+	&dec	($num);
+	&lea	($j,&DWP(1,$j));		# j++
+	&jnz	(&label("inner"));
+
+	&mov	($num,$j);
+	&pmuludq($acc0,$mul0);			# ap[num-1]*bp[i]
+	&pmuludq($acc1,$mul1);			# np[num-1]*m1
+	&paddq	($car0,$acc0);			# +=c0
+	&paddq	($car1,$acc1);			# +=c1
+
+	&movq	($acc0,$car0);
+	&pand	($acc0,$mask);
+	&paddq	($car1,$acc0);			# +=ap[num-1]*bp[i]+tp[num-1]
+	&movd	(&DWP($frame-4,"esp",$j,4),$car1);	# tp[num-2]=
+	&psrlq	($car0,32);
+	&psrlq	($car1,32);
+
+	&movd	($temp,&DWP($frame+4,"esp",$num,4));	# += tp[num]
+	&paddq	($car1,$car0);
+	&paddq	($car1,$temp);
+	&movq	(&QWP($frame,"esp",$num,4),$car1);	# tp[num].tp[num-1]
+
+	&lea	($i,&DWP(1,$i));		# i++
+	&cmp	($i,$num);
+	&jle	(&label("outer"));
+
+	&emms	();				# done with mmx bank
+	&jmp	(&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+
+if (0) {
+	&mov	("esp",$_sp);
+	&xor	("eax","eax");	# signal "not fast enough [yet]"
+	&jmp	(&label("just_leave"));
+	# While the below code provides competitive performance for
+	# all key lengths on modern Intel cores, it's still more
+	# than 10% slower for 4096-bit keys elsewhere:-( "Competitive"
+	# means compared to the original integer-only assembler.
+	# 512-bit RSA sign is better by ~40%, but that's about all
+	# one can say about all CPUs...
+} else {
+$inp="esi";	# integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+	&mov	($inp,$_ap);
+	&lea	($carry,&DWP(1,$num));
+	&mov	($word,$_bp);
+	&xor	($j,$j);				# j=0
+	&mov	("edx",$inp);
+	&and	($carry,1);				# see if num is even
+	&sub	("edx",$word);				# see if ap==bp
+	&lea	("eax",&DWP(4,$word,$num,4));		# &bp[num]
+	&or	($carry,"edx");
+	&mov	($word,&DWP(0,$word));			# bp[0]
+	&jz	(&label("bn_sqr_mont"));
+	&mov	($_bpend,"eax");
+	&mov	("eax",&DWP(0,$inp));
+	&xor	("edx","edx");
+
+&set_label("mull",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[0]
+	&add	($carry,"eax");
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("mull"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[0]
+	 &mov	($word,$_n0);
+	&add	("eax",$carry);
+	 &mov	($inp,$_np);
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&mov	(&DWP($frame,"esp",$num,4),"eax");	# tp[num-1]=
+	&xor	($j,$j);
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&inc	($j);
+
+	&jmp	(&label("2ndmadd"));
+
+&set_label("1stmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*bp[i]
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("1stmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*bp[i]
+	&add	("eax",&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&xor	($j,$j);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&mov	(&DWP($frame,"esp",$num,4),$carry);	# tp[num-1]=
+	&adc	($j,0);
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&mov	(&DWP($frame+4,"esp",$num,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$num,4),$j);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&adc	("edx",0);
+	&mov	($j,1);
+
+&set_label("2ndmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j-1]=
+	&jl	(&label("2ndmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&xor	("eax","eax");
+	 &mov	($j,$_bp);				# &bp[i]
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	 &lea	($j,&DWP(4,$j));
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	 &cmp	($j,$_bpend);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(0,$j));			# bp[i+1]
+	&mov	($inp,$_ap);
+	&mov	($_bp,$j);				# &bp[++i]
+	&xor	($j,$j);
+	&xor	("edx","edx");
+	&mov	("eax",&DWP(0,$inp));
+	&jmp	(&label("1stmadd"));
+
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+	&mov	($_num,$num);
+	&mov	($_bp,$j);				# i=0
+
+	&mov	("eax",$word);				# ap[0]
+	&mul	($word);				# ap[0]*ap[0]
+	&mov	(&DWP($frame,"esp"),"eax");		# tp[0]=
+	&mov	($sbit,"edx");
+	&shr	("edx",1);
+	&and	($sbit,1);
+	&inc	($j);
+&set_label("sqr",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[0]
+	&add	("eax",$carry);
+	&lea	($j,&DWP(1,$j));
+	&adc	("edx",0);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	&shr	("eax",31);
+	&cmp	($j,$_num);
+	&mov	($sbit,"eax");
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("sqr"));
+
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[num-1]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[num-1]*ap[0]
+	&add	("eax",$carry);
+	 &mov	($word,$_n0);
+	&adc	("edx",0);
+	 &mov	($inp,$_np);
+	&lea	($carry,&DWP(0,$sbit,"eax",2));
+	 &imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+	&shr	("eax",31);
+	&mov	(&DWP($frame,"esp",$j,4),$carry);	# tp[num-1]=
+
+	&lea	($carry,&DWP(0,"eax","edx",2));
+	 &mov	("eax",&DWP(0,$inp));			# np[0]
+	&shr	("edx",31);
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num]=
+	&mov	(&DWP($frame+8,"esp",$j,4),"edx");	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&mov	($num,$j);
+	&adc	("edx",0);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+	&mov	($j,1);
+
+&set_label("3rdmadd",16);
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(4,$inp,$j,4));		# np[j+1]
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j-1]=
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j+1]*m
+	&add	($carry,&DWP($frame+4,"esp",$j,4));	# +=tp[j+1]
+	&lea	($j,&DWP(2,$j));
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&mov	("eax",&DWP(0,$inp,$j,4));		# np[j+2]
+	&adc	("edx",0);
+	&cmp	($j,$num);
+	&mov	(&DWP($frame-8,"esp",$j,4),$carry);	# tp[j]=
+	&jl	(&label("3rdmadd"));
+
+	&mov	($carry,"edx");
+	&mul	($word);				# np[j]*m
+	&add	($carry,&DWP($frame,"esp",$num,4));	# +=tp[num-1]
+	&adc	("edx",0);
+	&add	($carry,"eax");
+	&adc	("edx",0);
+	&mov	(&DWP($frame-4,"esp",$num,4),$carry);	# tp[num-2]=
+
+	&mov	($j,$_bp);				# i
+	&xor	("eax","eax");
+	&mov	($inp,$_ap);
+	&add	("edx",&DWP($frame+4,"esp",$num,4));	# carry+=tp[num]
+	&adc	("eax",&DWP($frame+8,"esp",$num,4));	# +=tp[num+1]
+	&mov	(&DWP($frame,"esp",$num,4),"edx");	# tp[num-1]=
+	&cmp	($j,$num);
+	&mov	(&DWP($frame+4,"esp",$num,4),"eax");	# tp[num]=
+	&je	(&label("common_tail"));
+
+	&mov	($word,&DWP(4,$inp,$j,4));		# ap[i]
+	&lea	($j,&DWP(1,$j));
+	&mov	("eax",$word);
+	&mov	($_bp,$j);				# ++i
+	&mul	($word);				# ap[i]*ap[i]
+	&add	("eax",&DWP($frame,"esp",$j,4));	# +=tp[i]
+	&adc	("edx",0);
+	&mov	(&DWP($frame,"esp",$j,4),"eax");	# tp[i]=
+	&xor	($carry,$carry);
+	&cmp	($j,$num);
+	&lea	($j,&DWP(1,$j));
+	&je	(&label("sqrlast"));
+
+	&mov	($sbit,"edx");				# zaps $num
+	&shr	("edx",1);
+	&and	($sbit,1);
+&set_label("sqradd",16);
+	&mov	("eax",&DWP(0,$inp,$j,4));		# ap[j]
+	&mov	($carry,"edx");
+	&mul	($word);				# ap[j]*ap[i]
+	&add	("eax",$carry);
+	&lea	($carry,&DWP(0,"eax","eax"));
+	&adc	("edx",0);
+	&shr	("eax",31);
+	&add	($carry,&DWP($frame,"esp",$j,4));	# +=tp[j]
+	&lea	($j,&DWP(1,$j));
+	&adc	("eax",0);
+	&add	($carry,$sbit);
+	&adc	("eax",0);
+	&cmp	($j,$_num);
+	&mov	(&DWP($frame-4,"esp",$j,4),$carry);	# tp[j]=
+	&mov	($sbit,"eax");
+	&jle	(&label("sqradd"));
+
+	&mov	($carry,"edx");
+	&add	("edx","edx");
+	&shr	($carry,31);
+	&add	("edx",$sbit);
+	&adc	($carry,0);
+&set_label("sqrlast");
+	&mov	($word,$_n0);
+	&mov	($inp,$_np);
+	&imul	($word,&DWP($frame,"esp"));		# n0*tp[0]
+
+	&add	("edx",&DWP($frame,"esp",$j,4));	# +=tp[num]
+	&mov	("eax",&DWP(0,$inp));			# np[0]
+	&adc	($carry,0);
+	&mov	(&DWP($frame,"esp",$j,4),"edx");	# tp[num]=
+	&mov	(&DWP($frame+4,"esp",$j,4),$carry);	# tp[num+1]=
+
+	&mul	($word);				# np[0]*m
+	&add	("eax",&DWP($frame,"esp"));		# +=tp[0]
+	&lea	($num,&DWP(-1,$j));
+	&adc	("edx",0);
+	&mov	($j,1);
+	&mov	("eax",&DWP(4,$inp));			# np[1]
+
+	&jmp	(&label("3rdmadd"));
+}
+
+&set_label("common_tail",16);
+	&mov	($np,$_np);			# load modulus pointer
+	&mov	($rp,$_rp);			# load result pointer
+	&lea	($tp,&DWP($frame,"esp"));	# [$ap and $bp are zapped]
+
+	&mov	("eax",&DWP(0,$tp));		# tp[0]
+	&mov	($j,$num);			# j=num-1
+	&xor	($i,$i);			# i=0 and clear CF!
+
+&set_label("sub",16);
+	&sbb	("eax",&DWP(0,$np,$i,4));
+	&mov	(&DWP(0,$rp,$i,4),"eax");	# rp[i]=tp[i]-np[i]
+	&dec	($j);				# doesn't affect CF!
+	&mov	("eax",&DWP(4,$tp,$i,4));	# tp[i+1]
+	&lea	($i,&DWP(1,$i));		# i++
+	&jge	(&label("sub"));
+
+	&sbb	("eax",0);			# handle upmost overflow bit
+	&and	($tp,"eax");
+	&not	("eax");
+	&mov	($np,$rp);
+	&and	($np,"eax");
+	&or	($tp,$np);			# tp=carry?tp:rp
+
+&set_label("copy",16);				# copy or in-place refresh
+	&mov	("eax",&DWP(0,$tp,$num,4));
+	&mov	(&DWP(0,$rp,$num,4),"eax");	# rp[i]=tp[i]
+	&mov	(&DWP($frame,"esp",$num,4),$j);	# zap temporary vector
+	&dec	($num);
+	&jge	(&label("copy"));
+
+	&mov	("esp",$_sp);		# pull saved stack pointer
+	&mov	("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
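
For orientation, bn_mul_mont above computes rp = ap*bp*R^-1 mod np with R = 2^(32*num), interleaving multiplication and reduction through the tp[] vector on the stack. A compact C sketch of the same word-serial algorithm (illustrative only; shown with 64-bit limbs and n0 passed by value, both simplifications of the actual interface, and assuming unsigned __int128):

    #include <stdint.h>

    typedef unsigned __int128 u128;

    /* Sketch: word-serial Montgomery multiplication with a (num+2)-limb
     * scratch vector, the structure mirrored by the mull/1stmadd/2ndmadd
     * loops above. */
    static void bn_mul_mont_sketch(uint64_t *rp, const uint64_t *ap,
                                   const uint64_t *bp, const uint64_t *np,
                                   uint64_t n0, int num) {
      uint64_t tp[num + 2], sub[num];
      for (int i = 0; i < num + 2; i++)
        tp[i] = 0;

      for (int i = 0; i < num; i++) {
        u128 acc;
        uint64_t carry = 0;
        for (int j = 0; j < num; j++) {       /* tp += ap * bp[i] */
          acc = (u128)ap[j] * bp[i] + tp[j] + carry;
          tp[j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        acc = (u128)tp[num] + carry;
        tp[num] = (uint64_t)acc;
        tp[num + 1] = (uint64_t)(acc >> 64);

        uint64_t m = tp[0] * n0;              /* kills the low limb */
        carry = 0;
        for (int j = 0; j < num; j++) {       /* tp = (tp + m*np) >> 64 */
          acc = (u128)m * np[j] + tp[j] + carry;
          if (j > 0)
            tp[j - 1] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        acc = (u128)tp[num] + carry;
        tp[num - 1] = (uint64_t)acc;
        tp[num] = tp[num + 1] + (uint64_t)(acc >> 64);
      }

      uint64_t borrow = 0;                    /* conditional tp - np */
      for (int i = 0; i < num; i++) {
        u128 d = (u128)tp[i] - np[i] - borrow;
        sub[i] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
      }
      int ge = tp[num] != 0 || borrow == 0;   /* is tp >= np ? */
      for (int i = 0; i < num; i++)
        rp[i] = ge ? sub[i] : tp[i];          /* the asm selects via a mask */
    }
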
diff --git a/crypto/bn/asm/x86.pl b/crypto/bn/asm/x86.pl
new file mode 100644
index 0000000..783f8dd
--- /dev/null
+++ b/crypto/bn/asm/x86.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/env perl
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
+require "x86asm.pl";
+
+require("x86/mul_add.pl");
+require("x86/mul.pl");
+require("x86/sqr.pl");
+require("x86/div.pl");
+require("x86/add.pl");
+require("x86/sub.pl");
+require("x86/comba.pl");
+
+&asm_init($ARGV[0],$0);
+
+&bn_mul_add_words("bn_mul_add_words");
+&bn_mul_words("bn_mul_words");
+&bn_sqr_words("bn_sqr_words");
+&bn_div_words("bn_div_words");
+&bn_add_words("bn_add_words");
+&bn_sub_words("bn_sub_words");
+&bn_mul_comba("bn_mul_comba8",8);
+&bn_mul_comba("bn_mul_comba4",4);
+&bn_sqr_comba("bn_sqr_comba8",8);
+&bn_sqr_comba("bn_sqr_comba4",4);
+
+&asm_finish();
diff --git a/crypto/bn/asm/x86/add.pl b/crypto/bn/asm/x86/add.pl
new file mode 100644
index 0000000..0b5cf58
--- /dev/null
+++ b/crypto/bn/asm/x86/add.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&add($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &add($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+1;
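
bn_add_words simply adds two word arrays with carry propagation, eight words per unrolled pass plus a tail. Its C semantics, as a sketch with explicit 32-bit words (the x86 BN_ULONG size):

    #include <stdint.h>

    /* Sketch: r[] = a[] + b[] over `num` 32-bit words; returns the carry
     * out of the top word, which is what the routine leaves in eax. */
    static uint32_t bn_add_words_sketch(uint32_t *r, const uint32_t *a,
                                        const uint32_t *b, int num) {
      uint64_t carry = 0;
      for (int i = 0; i < num; i++) {
        uint64_t t = (uint64_t)a[i] + b[i] + carry;
        r[i] = (uint32_t)t;
        carry = t >> 32;
      }
      return (uint32_t)carry;
    }
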
diff --git a/crypto/bn/asm/x86/comba.pl b/crypto/bn/asm/x86/comba.pl
new file mode 100644
index 0000000..2291253
--- /dev/null
+++ b/crypto/bn/asm/x86/comba.pl
@@ -0,0 +1,277 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub mul_add_c
+	{
+	local($a,$ai,$b,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to store the finished result word
+
+	&comment("mul a[$ai]*b[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	&mul("edx");
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 &mov("eax",&wparam(0)) if $pos > 0;			# load r[]
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 0;	# load next b
+	 &mov("edx",&DWP(($nb)*4,$b,"",0)) if $pos == 1;	# load next b
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,"eax","",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next a
+	}
+
+sub sqr_add_c
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to store the finished result word
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$b,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add($c0,"eax");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 ###
+	&adc($c1,"edx");
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos == 1) && ($na != $nb);
+	 ###
+	&adc($c2,0);
+	 # if pos > 1, it means it is the last loop
+	 &mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	&mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;		# load next b
+	}
+
+sub sqr_add_c2
+	{
+	local($r,$a,$ai,$bi,$c0,$c1,$c2,$pos,$i,$na,$nb)=@_;
+
+	# pos == -1 if eax and edx are pre-loaded, 0 to load from the next
+	# words, and 1 to store the finished result word
+
+	&comment("sqr a[$ai]*a[$bi]");
+
+	# "eax" and "edx" will always be pre-loaded.
+	# &mov("eax",&DWP($ai*4,$a,"",0)) ;
+	# &mov("edx",&DWP($bi*4,$a,"",0));
+
+	if ($ai == $bi)
+		{ &mul("eax");}
+	else
+		{ &mul("edx");}
+	&add("eax","eax");
+	 ###
+	&adc("edx","edx");
+	 ###
+	&adc($c2,0);
+	 &add($c0,"eax");
+	&adc($c1,"edx");
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 0;	# load next a
+	 &mov("eax",&DWP(($na)*4,$a,"",0)) if $pos == 1;	# load next b
+	&adc($c2,0);
+	&mov(&DWP($i*4,$r,"",0),$c0) if $pos > 0;		# save r[];
+	 &mov("edx",&DWP(($nb)*4,$a,"",0)) if ($pos <= 1) && ($na != $nb);
+	 ###
+	}
+
+sub bn_mul_comba
+	{
+	local($name,$num)=@_;
+	local($a,$b,$c0,$c1,$c2);
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($tot,$end);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$b="edi";
+	
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	&push("esi");
+	 &mov($a,&wparam(1));
+	&push("edi");
+	 &mov($b,&wparam(2));
+	&push("ebp");
+	 &push("ebx");
+
+	&xor($c0,$c0);
+	 &mov("eax",&DWP(0,$a,"",0));	# load the first word 
+	&xor($c1,$c1);
+	 &mov("edx",&DWP(0,$b,"",0));	# load the first word of b
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("################## Calculate word $i"); 
+
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($j+1) == $end)
+				{
+				$v=1;
+				$v=2 if (($i+1) == $tot);
+				}
+			else
+				{ $v=0; }
+			if (($j+1) != $end)
+				{
+				$na=($ai-1);
+				$nb=($bi+1);
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+#printf STDERR "[$ai,$bi] -> [$na,$nb]\n";
+			&mul_add_c($a,$ai,$b,$bi,$c0,$c1,$c2,$v,$i,$na,$nb);
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				# &mov("eax",&wparam(0));
+				# &mov(&DWP($i*4,"eax","",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&comment("save r[$i]");
+	# &mov("eax",&wparam(0));
+	&mov(&DWP($i*4,"eax","",0),$c0);
+
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+sub bn_sqr_comba
+	{
+	local($name,$num)=@_;
+	local($r,$a,$c0,$c1,$c2)=@_;
+	local($i,$as,$ae,$bs,$be,$ai,$bi);
+	local($b,$tot,$end,$half);
+
+	&function_begin_B($name,"");
+
+	$c0="ebx";
+	$c1="ecx";
+	$c2="ebp";
+	$a="esi";
+	$r="edi";
+
+	&push("esi");
+	 &push("edi");
+	&push("ebp");
+	 &push("ebx");
+	&mov($r,&wparam(0));
+	 &mov($a,&wparam(1));
+	&xor($c0,$c0);
+	 &xor($c1,$c1);
+	&mov("eax",&DWP(0,$a,"",0)); # load the first word
+
+	$as=0;
+	$ae=0;
+	$bs=0;
+	$be=0;
+	$tot=$num+$num-1;
+
+	for ($i=0; $i<$tot; $i++)
+		{
+		$ai=$as;
+		$bi=$bs;
+		$end=$be+1;
+
+		&comment("############### Calculate word $i");
+		for ($j=$bs; $j<$end; $j++)
+			{
+			&xor($c2,$c2) if ($j == $bs);
+			if (($ai-1) < ($bi+1))
+				{
+				$v=1;
+				$v=2 if ($i+1) == $tot;
+				}
+			else
+				{ $v=0; }
+			if (!$v)
+				{
+				$na=$ai-1;
+				$nb=$bi+1;
+				}
+			else
+				{
+				$na=$as+($i < ($num-1));
+				$nb=$bs+($i >= ($num-1));
+				}
+			if ($ai == $bi)
+				{
+				&sqr_add_c($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			else
+				{
+				&sqr_add_c2($r,$a,$ai,$bi,
+					$c0,$c1,$c2,$v,$i,$na,$nb);
+				}
+			if ($v)
+				{
+				&comment("saved r[$i]");
+				#&mov(&DWP($i*4,$r,"",0),$c0);
+				($c0,$c1,$c2)=($c1,$c2,$c0);
+				last;
+				}
+			$ai--;
+			$bi++;
+			}
+		$as++ if ($i < ($num-1));
+		$ae++ if ($i >= ($num-1));
+
+		$bs++ if ($i >= ($num-1));
+		$be++ if ($i < ($num-1));
+		}
+	&mov(&DWP($i*4,$r,"",0),$c0);
+	&pop("ebx");
+	&pop("ebp");
+	&pop("edi");
+	&pop("esi");
+	&ret();
+	&function_end_B($name);
+	}
+
+1;
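
The comba generators above unroll column-wise (product-scanning) multiplication: every a[i]*b[j] with i+j == k is accumulated into a rotating three-word accumulator (c0,c1,c2) before column k is written out. A generic C sketch of the technique that bn_mul_comba8/bn_mul_comba4 specialize:

    #include <stdint.h>

    /* Sketch: column-wise multiplication with a 96-bit accumulator held in
     * three 32-bit words, the same (c0,c1,c2) rotation the Perl emits. */
    static void mul_comba_sketch(uint32_t *r, const uint32_t *a,
                                 const uint32_t *b, int num) {
      uint32_t c0 = 0, c1 = 0, c2 = 0;
      for (int k = 0; k < 2 * num - 1; k++) {
        int lo = k < num ? 0 : k - num + 1;
        int hi = k < num ? k : num - 1;
        for (int i = lo; i <= hi; i++) {
          uint64_t p = (uint64_t)a[i] * b[k - i];
          uint64_t t = (uint64_t)c0 + (uint32_t)p;             /* low half  */
          c0 = (uint32_t)t;
          t = (uint64_t)c1 + (uint32_t)(p >> 32) + (t >> 32);  /* high half */
          c1 = (uint32_t)t;
          c2 += (uint32_t)(t >> 32);
        }
        r[k] = c0;                   /* retire the finished column */
        c0 = c1; c1 = c2; c2 = 0;    /* rotate the accumulator     */
      }
      r[2 * num - 1] = c0;
    }
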
diff --git a/crypto/bn/asm/x86/div.pl b/crypto/bn/asm/x86/div.pl
new file mode 100644
index 0000000..0e90152
--- /dev/null
+++ b/crypto/bn/asm/x86/div.pl
@@ -0,0 +1,15 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_div_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+	&mov("edx",&wparam(0));	#
+	&mov("eax",&wparam(1));	#
+	&mov("ebx",&wparam(2));	#
+	&div("ebx");
+	&function_end($name);
+	}
+1;
diff --git a/crypto/bn/asm/x86/mul.pl b/crypto/bn/asm/x86/mul.pl
new file mode 100644
index 0000000..674cb9b
--- /dev/null
+++ b/crypto/bn/asm/x86/mul.pl
@@ -0,0 +1,77 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ecx";
+	$r="edi";
+	$c="esi";
+	$num="ebp";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+	&mov($a,&wparam(1));	#
+	&mov($num,&wparam(2));	#
+	&mov($w,&wparam(3));	#
+
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("mw_finish"));
+
+	&set_label("mw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,32);
+	&sub($num,8);
+	&jz(&label("mw_finish"));
+	&jmp(&label("mw_loop"));
+
+	&set_label("mw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jnz(&label("mw_finish2"));
+	&jmp(&label("mw_end"));
+
+	&set_label("mw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 # XXX
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i*4,$r,"",0),"eax");# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		 &dec($num) if ($i != 7-1);
+		&jz(&label("mw_end")) if ($i != 7-1);
+		}
+	&set_label("mw_end",0);
+	&mov("eax",$c);
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/mul_add.pl b/crypto/bn/asm/x86/mul_add.pl
new file mode 100644
index 0000000..61830d3
--- /dev/null
+++ b/crypto/bn/asm/x86/mul_add.pl
@@ -0,0 +1,87 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_mul_add_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$Low="eax";
+	$High="edx";
+	$a="ebx";
+	$w="ebp";
+	$r="edi";
+	$c="esi";
+
+	&xor($c,$c);		# clear carry
+	&mov($r,&wparam(0));	#
+
+	&mov("ecx",&wparam(2));	#
+	&mov($a,&wparam(1));	#
+
+	&and("ecx",0xfffffff8);	# num / 8
+	&mov($w,&wparam(3));	#
+
+	&push("ecx");		# Up the stack for a tmp variable
+
+	&jz(&label("maw_finish"));
+
+	&set_label("maw_loop",0);
+
+	&mov(&swtmp(0),"ecx");	#
+
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+
+		 &mov("eax",&DWP($i,$a,"",0)); 	# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);		# L(t)+=c
+		 &mov($c,&DWP($i,$r,"",0));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",$c);		# L(t)+=c
+		&adc("edx",0);			# H(t)+=carry
+		 &mov(&DWP($i,$r,"",0),"eax");	# *r= L(t);
+		&mov($c,"edx");			# c=  H(t);
+		}
+
+	&comment("");
+	&mov("ecx",&swtmp(0));	#
+	&add($a,32);
+	&add($r,32);
+	&sub("ecx",8);
+	&jnz(&label("maw_loop"));
+
+	&set_label("maw_finish",0);
+	&mov("ecx",&wparam(2));	# get num
+	&and("ecx",7);
+	&jnz(&label("maw_finish2"));	# helps branch prediction
+	&jmp(&label("maw_end"));
+
+	&set_label("maw_finish2",1);
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		 &mov("eax",&DWP($i*4,$a,"",0));# *a
+		&mul($w);			# *a * w
+		&add("eax",$c);			# L(t)+=c
+		 &mov($c,&DWP($i*4,$r,"",0));	# L(t)+= *r
+		&adc("edx",0);			# H(t)+=carry
+		 &add("eax",$c);
+		&adc("edx",0);			# H(t)+=carry
+		 &dec("ecx") if ($i != 7-1);
+		&mov(&DWP($i*4,$r,"",0),"eax");	# *r= L(t);
+		 &mov($c,"edx");			# c=  H(t);
+		&jz(&label("maw_end")) if ($i != 7-1);
+		}
+	&set_label("maw_end",0);
+	&mov("eax",$c);
+
+	&pop("ecx");	# clear variable from
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/sqr.pl b/crypto/bn/asm/x86/sqr.pl
new file mode 100644
index 0000000..1f90993
--- /dev/null
+++ b/crypto/bn/asm/x86/sqr.pl
@@ -0,0 +1,60 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_sqr_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$r="esi";
+	$a="edi";
+	$num="ebx";
+
+	&mov($r,&wparam(0));	#
+	&mov($a,&wparam(1));	#
+	&mov($num,&wparam(2));	#
+
+	&and($num,0xfffffff8);	# num / 8
+	&jz(&label("sw_finish"));
+
+	&set_label("sw_loop",0);
+	for ($i=0; $i<32; $i+=4)
+		{
+		&comment("Round $i");
+		&mov("eax",&DWP($i,$a,"",0)); 	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*2,$r,"",0),"eax");	#
+		 &mov(&DWP($i*2+4,$r,"",0),"edx");#
+		}
+
+	&comment("");
+	&add($a,32);
+	&add($r,64);
+	&sub($num,8);
+	&jnz(&label("sw_loop"));
+
+	&set_label("sw_finish",0);
+	&mov($num,&wparam(2));	# get num
+	&and($num,7);
+	&jz(&label("sw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov("eax",&DWP($i*4,$a,"",0));	# *a
+		 # XXX
+		&mul("eax");			# *a * *a
+		&mov(&DWP($i*8,$r,"",0),"eax");	#
+		 &dec($num) if ($i != 7-1);
+		&mov(&DWP($i*8+4,$r,"",0),"edx");
+		 &jz(&label("sw_end")) if ($i != 7-1);
+		}
+	&set_label("sw_end",0);
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86/sub.pl b/crypto/bn/asm/x86/sub.pl
new file mode 100644
index 0000000..837b0e1
--- /dev/null
+++ b/crypto/bn/asm/x86/sub.pl
@@ -0,0 +1,76 @@
+#!/usr/local/bin/perl
+# x86 assembler
+
+sub bn_sub_words
+	{
+	local($name)=@_;
+
+	&function_begin($name,"");
+
+	&comment("");
+	$a="esi";
+	$b="edi";
+	$c="eax";
+	$r="ebx";
+	$tmp1="ecx";
+	$tmp2="edx";
+	$num="ebp";
+
+	&mov($r,&wparam(0));	# get r
+	 &mov($a,&wparam(1));	# get a
+	&mov($b,&wparam(2));	# get b
+	 &mov($num,&wparam(3));	# get num
+	&xor($c,$c);		# clear carry
+	 &and($num,0xfffffff8);	# num / 8
+
+	&jz(&label("aw_finish"));
+
+	&set_label("aw_loop",0);
+	for ($i=0; $i<8; $i++)
+		{
+		&comment("Round $i");
+
+		&mov($tmp1,&DWP($i*4,$a,"",0)); 	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0)); 	# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &mov(&DWP($i*4,$r,"",0),$tmp1); 	# *r
+		}
+
+	&comment("");
+	&add($a,32);
+	 &add($b,32);
+	&add($r,32);
+	 &sub($num,8);
+	&jnz(&label("aw_loop"));
+
+	&set_label("aw_finish",0);
+	&mov($num,&wparam(3));	# get num
+	&and($num,7);
+	 &jz(&label("aw_end"));
+
+	for ($i=0; $i<7; $i++)
+		{
+		&comment("Tail Round $i");
+		&mov($tmp1,&DWP($i*4,$a,"",0));	# *a
+		 &mov($tmp2,&DWP($i*4,$b,"",0));# *b
+		&sub($tmp1,$c);
+		 &mov($c,0);
+		&adc($c,$c);
+		 &sub($tmp1,$tmp2);
+		&adc($c,0);
+		 &dec($num) if ($i != 6);
+		&mov(&DWP($i*4,$r,"",0),$tmp1);	# *r
+		 &jz(&label("aw_end")) if ($i != 6);
+		}
+	&set_label("aw_end",0);
+
+#	&mov("eax",$c);		# $c is "eax"
+
+	&function_end($name);
+	}
+
+1;
diff --git a/crypto/bn/asm/x86_64-gcc.c b/crypto/bn/asm/x86_64-gcc.c
new file mode 100644
index 0000000..836f001
--- /dev/null
+++ b/crypto/bn/asm/x86_64-gcc.c
@@ -0,0 +1,613 @@
+#include <openssl/bn.h>
+
+#if defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
+
+#include "../internal.h"
+
+/* x86_64 BIGNUM accelerator version 0.1, December 2002.
+ *
+ * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+ * project.
+ *
+ * Rights for redistribution and usage in source and binary forms are
+ * granted according to the OpenSSL license. Warranty of any kind is
+ * disclaimed.
+ *
+ * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
+ *    versions, like 1.0...
+ * A. Well, that's because this code is basically a quick-n-dirty
+ *    proof-of-concept hack. As you can see it's implemented with
+ *    inline assembler, which means that you're bound to GCC and that
+ *    there might be enough room for further improvement.
+ *
+ * Q. Why inline assembler?
+ * A. x86_64 features its own ABI, which I'm not familiar with. This is
+ *    why I decided to let the compiler take care of subroutine
+ *    prologue/epilogue as well as register allocation. For reference,
+ *    Win64 implements a different ABI for AMD64 than Linux does.
+ *
+ * Q. How much faster does it get?
+ * A. 'apps/openssl speed rsa dsa' output with no-asm:
+ *
+ *	                  sign    verify    sign/s verify/s
+ *	rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
+ *	rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
+ *	rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
+ *	rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
+ *	                  sign    verify    sign/s verify/s
+ *	dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
+ *	dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
+ *	dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
+ *
+ *    'apps/openssl speed rsa dsa' output with this module:
+ *
+ *	                  sign    verify    sign/s verify/s
+ *	rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
+ *	rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
+ *	rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
+ *	rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
+ *	                  sign    verify    sign/s verify/s
+ *	dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
+ *	dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
+ *	dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
+ *
+ *    For reference, the IA-32 assembler implementation performs
+ *    very much like 64-bit code compiled with no-asm on the same
+ *    machine.
+ */
+
+#undef mul
+#undef mul_add
+
+#define asm __asm__
+
+/*
+ * "m"(a), "+m"(r)	is the way to favor DirectPath µ-code;
+ * "g"(0)		lets the compiler decide where it
+ *			wants to keep the value of zero;
+ */
+#define mul_add(r, a, word, carry)                                     \
+  do {                                                                 \
+    register BN_ULONG high, low;                                       \
+    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
+    asm("addq %2,%0; adcq %3,%1"                                       \
+        : "+r"(carry), "+d"(high)                                      \
+        : "a"(low), "g"(0)                                             \
+        : "cc");                                                       \
+    asm("addq %2,%0; adcq %3,%1"                                       \
+        : "+m"(r), "+d"(high)                                          \
+        : "r"(carry), "g"(0)                                           \
+        : "cc");                                                       \
+    carry = high;                                                      \
+  } while (0)
+
+#define mul(r, a, word, carry)                                         \
+  do {                                                                 \
+    register BN_ULONG high, low;                                       \
+    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
+    asm("addq %2,%0; adcq %3,%1"                                       \
+        : "+r"(carry), "+d"(high)                                      \
+        : "a"(low), "g"(0)                                             \
+        : "cc");                                                       \
+    (r) = carry, carry = high;                                         \
+  } while (0)
+#undef sqr
+#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w) {
+  BN_ULONG c1 = 0;
+
+  if (num <= 0)
+    return (c1);
+
+  while (num & ~3) {
+    mul_add(rp[0], ap[0], w, c1);
+    mul_add(rp[1], ap[1], w, c1);
+    mul_add(rp[2], ap[2], w, c1);
+    mul_add(rp[3], ap[3], w, c1);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+  if (num) {
+    mul_add(rp[0], ap[0], w, c1);
+    if (--num == 0)
+      return c1;
+    mul_add(rp[1], ap[1], w, c1);
+    if (--num == 0)
+      return c1;
+    mul_add(rp[2], ap[2], w, c1);
+    return c1;
+  }
+
+  return (c1);
+}
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+  BN_ULONG c1 = 0;
+
+  if (num <= 0)
+    return (c1);
+
+  while (num & ~3) {
+    mul(rp[0], ap[0], w, c1);
+    mul(rp[1], ap[1], w, c1);
+    mul(rp[2], ap[2], w, c1);
+    mul(rp[3], ap[3], w, c1);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+  if (num) {
+    mul(rp[0], ap[0], w, c1);
+    if (--num == 0)
+      return c1;
+    mul(rp[1], ap[1], w, c1);
+    if (--num == 0)
+      return c1;
+    mul(rp[2], ap[2], w, c1);
+  }
+  return (c1);
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+  if (n <= 0)
+    return;
+
+  while (n & ~3) {
+    sqr(r[0], r[1], a[0]);
+    sqr(r[2], r[3], a[1]);
+    sqr(r[4], r[5], a[2]);
+    sqr(r[6], r[7], a[3]);
+    a += 4;
+    r += 8;
+    n -= 4;
+  }
+  if (n) {
+    sqr(r[0], r[1], a[0]);
+    if (--n == 0)
+      return;
+    sqr(r[2], r[3], a[1]);
+    if (--n == 0)
+      return;
+    sqr(r[4], r[5], a[2]);
+  }
+}
+
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+  BN_ULONG ret, waste;
+
+  asm("divq	%4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
+
+  return ret;
+}
+
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int n) {
+  BN_ULONG ret;
+  size_t i = 0;
+
+  if (n <= 0)
+    return 0;
+
+  asm("	subq	%0,%0		\n" /* clear carry */
+      "	jmp	1f		\n"
+      ".p2align 4			\n"
+      "1:	movq	(%4,%2,8),%0	\n"
+      "	adcq	(%5,%2,8),%0	\n"
+      "	movq	%0,(%3,%2,8)	\n"
+      "	lea	1(%2),%2	\n"
+      "	loop	1b		\n"
+      "	sbbq	%0,%0		\n"
+      : "=&r"(ret), "+c"(n), "+r"(i)
+      : "r"(rp), "r"(ap), "r"(bp)
+      : "cc");
+
+  return ret & 1;
+}
+
+#ifndef SIMICS
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                      int n) {
+  BN_ULONG ret;
+  size_t i = 0;
+
+  if (n <= 0)
+    return 0;
+
+  asm("	subq	%0,%0		\n" /* clear borrow */
+      "	jmp	1f		\n"
+      ".p2align 4			\n"
+      "1:	movq	(%4,%2,8),%0	\n"
+      "	sbbq	(%5,%2,8),%0	\n"
+      "	movq	%0,(%3,%2,8)	\n"
+      "	lea	1(%2),%2	\n"
+      "	loop	1b		\n"
+      "	sbbq	%0,%0		\n"
+      : "=&r"(ret), "+c"(n), "+r"(i)
+      : "r"(rp), "r"(ap), "r"(bp)
+      : "cc");
+
+  return ret & 1;
+}
+#else
+/* Simics 1.4<7 has buggy sbbq:-( */
+#define BN_MASK2 0xffffffffffffffffL
+BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
+  BN_ULONG t1, t2;
+  int c = 0;
+
+  if (n <= 0)
+    return ((BN_ULONG)0);
+
+  for (;;) {
+    t1 = a[0];
+    t2 = b[0];
+    r[0] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    if (--n <= 0)
+      break;
+
+    t1 = a[1];
+    t2 = b[1];
+    r[1] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    if (--n <= 0)
+      break;
+
+    t1 = a[2];
+    t2 = b[2];
+    r[2] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    if (--n <= 0)
+      break;
+
+    t1 = a[3];
+    t2 = b[3];
+    r[3] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    if (--n <= 0)
+      break;
+
+    a += 4;
+    b += 4;
+    r += 4;
+  }
+  return (c);
+}
+#endif
+
+/* mul_add_c(a,b,c0,c1,c2)    -- c+=a*b for the three-word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2)   -- c+=2*a*b for the three-word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2)    -- c+=a[i]^2 for the three-word number c=(c2,c1,c0) */
+/* sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for the three-word number
+ * c=(c2,c1,c0) */
+
+#if 0
+/* original macros are kept for reference purposes */
+#define mul_add_c(a, b, c0, c1, c2) \
+  {                                 \
+    BN_ULONG ta = (a), tb = (b);    \
+    t1 = ta * tb;                   \
+    t2 = BN_UMULT_HIGH(ta, tb);     \
+    c0 += t1;                       \
+    t2 += (c0 < t1) ? 1 : 0;        \
+    c1 += t2;                       \
+    c2 += (c1 < t2) ? 1 : 0;        \
+  }
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+  {                                  \
+    BN_ULONG ta = (a), tb = (b), t0; \
+    t1 = BN_UMULT_HIGH(ta, tb);      \
+    t0 = ta * tb;                    \
+    t2 = t1 + t1;                    \
+    c2 += (t2 < t1) ? 1 : 0;         \
+    t1 = t0 + t0;                    \
+    t2 += (t1 < t0) ? 1 : 0;         \
+    c0 += t1;                        \
+    t2 += (c0 < t1) ? 1 : 0;         \
+    c1 += t2;                        \
+    c2 += (c1 < t2) ? 1 : 0;         \
+  }
+#else
+#define mul_add_c(a, b, c0, c1, c2)                              \
+  do {                                                           \
+    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \
+    asm("addq %2,%0; adcq %3,%1"                                 \
+        : "+r"(c0), "+d"(t2)                                     \
+        : "a"(t1), "g"(0)                                        \
+        : "cc");                                                 \
+    asm("addq %2,%0; adcq %3,%1"                                 \
+        : "+r"(c1), "+r"(c2)                                     \
+        : "d"(t2), "g"(0)                                        \
+        : "cc");                                                 \
+  } while (0)
+
+#define sqr_add_c(a, i, c0, c1, c2)                         \
+  do {                                                      \
+    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc"); \
+    asm("addq %2,%0; adcq %3,%1"                            \
+        : "+r"(c0), "+d"(t2)                                \
+        : "a"(t1), "g"(0)                                   \
+        : "cc");                                            \
+    asm("addq %2,%0; adcq %3,%1"                            \
+        : "+r"(c1), "+r"(c2)                                \
+        : "d"(t2), "g"(0)                                   \
+        : "cc");                                            \
+  } while (0)
+
+#define mul_add_c2(a, b, c0, c1, c2)                                    \
+  do {                                                                  \
+    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");        \
+    asm("addq %0,%0; adcq %2,%1" : "+d"(t2), "+r"(c2) : "g"(0) : "cc"); \
+    asm("addq %0,%0; adcq %2,%1" : "+a"(t1), "+d"(t2) : "g"(0) : "cc"); \
+    asm("addq %2,%0; adcq %3,%1"                                        \
+        : "+r"(c0), "+d"(t2)                                            \
+        : "a"(t1), "g"(0)                                               \
+        : "cc");                                                        \
+    asm("addq %2,%0; adcq %3,%1"                                        \
+        : "+r"(c1), "+r"(c2)                                            \
+        : "d"(t2), "g"(0)                                               \
+        : "cc");                                                        \
+  } while (0)
+#endif
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
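The comba routines below compute each result word r[k] as the column sum of
all products a[i]*b[j] with i+j == k, carried through the rotating three-word
accumulator that the macros above maintain. A minimal sketch of the same
pattern for two-word operands (portable C with unsigned __int128, not taken
from this file; names are illustrative):

    #include <stdint.h>

    /* Portable three-word accumulator: (c2,c1,c0) += a*b, mirroring mul_add_c. */
    static void mul_add_c_ref(uint64_t a, uint64_t b,
                              uint64_t *c0, uint64_t *c1, uint64_t *c2) {
      unsigned __int128 t = (unsigned __int128)a * b;
      uint64_t lo = (uint64_t)t, hi = (uint64_t)(t >> 64);
      *c0 += lo;
      hi += (*c0 < lo);   /* carry out of c0; cannot overflow hi */
      *c1 += hi;
      *c2 += (*c1 < hi);  /* carry out of c1 */
    }

    /* 2x2 comba multiplication: one column per result word, with the
     * accumulator rotating (c0,c1,c2) -> (c1,c2,c0) -> (c2,c0,c1), as the
     * larger bn_mul_comba4/bn_mul_comba8 routines do. */
    static void mul_comba2_ref(uint64_t r[4], const uint64_t a[2],
                               const uint64_t b[2]) {
      uint64_t c0 = 0, c1 = 0, c2 = 0;
      mul_add_c_ref(a[0], b[0], &c0, &c1, &c2);  /* column 0 */
      r[0] = c0; c0 = 0;
      mul_add_c_ref(a[0], b[1], &c1, &c2, &c0);  /* column 1 */
      mul_add_c_ref(a[1], b[0], &c1, &c2, &c0);
      r[1] = c1; c1 = 0;
      mul_add_c_ref(a[1], b[1], &c2, &c0, &c1);  /* column 2 */
      r[2] = c2;
      r[3] = c0;                                 /* top word; c1 is always 0 */
    }
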
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  mul_add_c(a[0], b[0], c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[1], c2, c3, c1);
+  mul_add_c(a[1], b[0], c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[0], c3, c1, c2);
+  mul_add_c(a[1], b[1], c3, c1, c2);
+  mul_add_c(a[0], b[2], c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  mul_add_c(a[0], b[3], c1, c2, c3);
+  mul_add_c(a[1], b[2], c1, c2, c3);
+  mul_add_c(a[2], b[1], c1, c2, c3);
+  mul_add_c(a[3], b[0], c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  mul_add_c(a[4], b[0], c2, c3, c1);
+  mul_add_c(a[3], b[1], c2, c3, c1);
+  mul_add_c(a[2], b[2], c2, c3, c1);
+  mul_add_c(a[1], b[3], c2, c3, c1);
+  mul_add_c(a[0], b[4], c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  mul_add_c(a[0], b[5], c3, c1, c2);
+  mul_add_c(a[1], b[4], c3, c1, c2);
+  mul_add_c(a[2], b[3], c3, c1, c2);
+  mul_add_c(a[3], b[2], c3, c1, c2);
+  mul_add_c(a[4], b[1], c3, c1, c2);
+  mul_add_c(a[5], b[0], c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  mul_add_c(a[6], b[0], c1, c2, c3);
+  mul_add_c(a[5], b[1], c1, c2, c3);
+  mul_add_c(a[4], b[2], c1, c2, c3);
+  mul_add_c(a[3], b[3], c1, c2, c3);
+  mul_add_c(a[2], b[4], c1, c2, c3);
+  mul_add_c(a[1], b[5], c1, c2, c3);
+  mul_add_c(a[0], b[6], c1, c2, c3);
+  r[6] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[7], c2, c3, c1);
+  mul_add_c(a[1], b[6], c2, c3, c1);
+  mul_add_c(a[2], b[5], c2, c3, c1);
+  mul_add_c(a[3], b[4], c2, c3, c1);
+  mul_add_c(a[4], b[3], c2, c3, c1);
+  mul_add_c(a[5], b[2], c2, c3, c1);
+  mul_add_c(a[6], b[1], c2, c3, c1);
+  mul_add_c(a[7], b[0], c2, c3, c1);
+  r[7] = c2;
+  c2 = 0;
+  mul_add_c(a[7], b[1], c3, c1, c2);
+  mul_add_c(a[6], b[2], c3, c1, c2);
+  mul_add_c(a[5], b[3], c3, c1, c2);
+  mul_add_c(a[4], b[4], c3, c1, c2);
+  mul_add_c(a[3], b[5], c3, c1, c2);
+  mul_add_c(a[2], b[6], c3, c1, c2);
+  mul_add_c(a[1], b[7], c3, c1, c2);
+  r[8] = c3;
+  c3 = 0;
+  mul_add_c(a[2], b[7], c1, c2, c3);
+  mul_add_c(a[3], b[6], c1, c2, c3);
+  mul_add_c(a[4], b[5], c1, c2, c3);
+  mul_add_c(a[5], b[4], c1, c2, c3);
+  mul_add_c(a[6], b[3], c1, c2, c3);
+  mul_add_c(a[7], b[2], c1, c2, c3);
+  r[9] = c1;
+  c1 = 0;
+  mul_add_c(a[7], b[3], c2, c3, c1);
+  mul_add_c(a[6], b[4], c2, c3, c1);
+  mul_add_c(a[5], b[5], c2, c3, c1);
+  mul_add_c(a[4], b[6], c2, c3, c1);
+  mul_add_c(a[3], b[7], c2, c3, c1);
+  r[10] = c2;
+  c2 = 0;
+  mul_add_c(a[4], b[7], c3, c1, c2);
+  mul_add_c(a[5], b[6], c3, c1, c2);
+  mul_add_c(a[6], b[5], c3, c1, c2);
+  mul_add_c(a[7], b[4], c3, c1, c2);
+  r[11] = c3;
+  c3 = 0;
+  mul_add_c(a[7], b[5], c1, c2, c3);
+  mul_add_c(a[6], b[6], c1, c2, c3);
+  mul_add_c(a[5], b[7], c1, c2, c3);
+  r[12] = c1;
+  c1 = 0;
+  mul_add_c(a[6], b[7], c2, c3, c1);
+  mul_add_c(a[7], b[6], c2, c3, c1);
+  r[13] = c2;
+  c2 = 0;
+  mul_add_c(a[7], b[7], c3, c1, c2);
+  r[14] = c3;
+  r[15] = c1;
+}
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  mul_add_c(a[0], b[0], c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[1], c2, c3, c1);
+  mul_add_c(a[1], b[0], c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[0], c3, c1, c2);
+  mul_add_c(a[1], b[1], c3, c1, c2);
+  mul_add_c(a[0], b[2], c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  mul_add_c(a[0], b[3], c1, c2, c3);
+  mul_add_c(a[1], b[2], c1, c2, c3);
+  mul_add_c(a[2], b[1], c1, c2, c3);
+  mul_add_c(a[3], b[0], c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  mul_add_c(a[3], b[1], c2, c3, c1);
+  mul_add_c(a[2], b[2], c2, c3, c1);
+  mul_add_c(a[1], b[3], c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[3], c3, c1, c2);
+  mul_add_c(a[3], b[2], c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  mul_add_c(a[3], b[3], c1, c2, c3);
+  r[6] = c1;
+  r[7] = c2;
+}
+
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  sqr_add_c(a, 0, c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 1, 0, c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  sqr_add_c(a, 1, c3, c1, c2);
+  sqr_add_c2(a, 2, 0, c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 3, 0, c1, c2, c3);
+  sqr_add_c2(a, 2, 1, c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  sqr_add_c(a, 2, c2, c3, c1);
+  sqr_add_c2(a, 3, 1, c2, c3, c1);
+  sqr_add_c2(a, 4, 0, c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 5, 0, c3, c1, c2);
+  sqr_add_c2(a, 4, 1, c3, c1, c2);
+  sqr_add_c2(a, 3, 2, c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  sqr_add_c(a, 3, c1, c2, c3);
+  sqr_add_c2(a, 4, 2, c1, c2, c3);
+  sqr_add_c2(a, 5, 1, c1, c2, c3);
+  sqr_add_c2(a, 6, 0, c1, c2, c3);
+  r[6] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 7, 0, c2, c3, c1);
+  sqr_add_c2(a, 6, 1, c2, c3, c1);
+  sqr_add_c2(a, 5, 2, c2, c3, c1);
+  sqr_add_c2(a, 4, 3, c2, c3, c1);
+  r[7] = c2;
+  c2 = 0;
+  sqr_add_c(a, 4, c3, c1, c2);
+  sqr_add_c2(a, 5, 3, c3, c1, c2);
+  sqr_add_c2(a, 6, 2, c3, c1, c2);
+  sqr_add_c2(a, 7, 1, c3, c1, c2);
+  r[8] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 7, 2, c1, c2, c3);
+  sqr_add_c2(a, 6, 3, c1, c2, c3);
+  sqr_add_c2(a, 5, 4, c1, c2, c3);
+  r[9] = c1;
+  c1 = 0;
+  sqr_add_c(a, 5, c2, c3, c1);
+  sqr_add_c2(a, 6, 4, c2, c3, c1);
+  sqr_add_c2(a, 7, 3, c2, c3, c1);
+  r[10] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 7, 4, c3, c1, c2);
+  sqr_add_c2(a, 6, 5, c3, c1, c2);
+  r[11] = c3;
+  c3 = 0;
+  sqr_add_c(a, 6, c1, c2, c3);
+  sqr_add_c2(a, 7, 5, c1, c2, c3);
+  r[12] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 7, 6, c2, c3, c1);
+  r[13] = c2;
+  c2 = 0;
+  sqr_add_c(a, 7, c3, c1, c2);
+  r[14] = c3;
+  r[15] = c1;
+}
+
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  sqr_add_c(a, 0, c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 1, 0, c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  sqr_add_c(a, 1, c3, c1, c2);
+  sqr_add_c2(a, 2, 0, c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 3, 0, c1, c2, c3);
+  sqr_add_c2(a, 2, 1, c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  sqr_add_c(a, 2, c2, c3, c1);
+  sqr_add_c2(a, 3, 1, c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 3, 2, c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  sqr_add_c(a, 3, c1, c2, c3);
+  r[6] = c1;
+  r[7] = c2;
+}
+
+#endif  /* defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS) */
diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl
new file mode 100644
index 0000000..3803928
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont.pl
@@ -0,0 +1,1400 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005.
+#
+# Montgomery multiplication routine for x86_64. While it gives a modest
+# 9% improvement for rsa4096 sign on Opteron, rsa512 sign runs more
+# than twice (>2x) as fast. The most common case, rsa1024 sign, is
+# improved by a respectable 50%. It remains to be seen whether loop
+# unrolling and a dedicated squaring routine can provide further improvement...
+
+# July 2011.
+#
+# Add a dedicated squaring procedure. The performance improvement varies
+# from platform to platform, but on average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule the inner loops in such a manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. The average performance improvement in comparison
+# to the *initial* 2005 version of this module is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# June 2013.
+#
+# Optimize reduction in the squaring procedure and improve 1024+-bit RSA
+# sign performance by 10-16% on Intel Sandy Bridge and later
+# (virtually the same on non-Intel processors).
+
+# August 2013.
+#
+# Add MULX/ADOX/ADCX code path.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$addx = ($1>=11);
+}
+
+# int bn_mul_mont(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num);
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
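The routine generated below computes rp = ap * bp * R^-1 mod np, where
R = 2^(64*num) and n0 points at the caller-precomputed value -np[0]^-1 mod 2^64
(dereferenced to n0[0] in the assembly, passed by value in the sketch). The
.L1st/.Louter/.Linner loops interleave one word of multiplication with one word
of Montgomery reduction, and .Lsub/.Lcopy perform the final conditional
subtraction with masks. A word-serial reference sketch of that computation
(portable C, not taken from this file; assumes unsigned __int128 and inputs
already reduced mod np):

    #include <stdint.h>

    /* Reference Montgomery multiplication: rp = ap*bp*R^-1 mod np, with
     * R = 2^(64*num) and n0 = -np[0]^-1 mod 2^64.  tp needs num+2 words.
     * Assumes ap, bp < np so one conditional subtraction suffices. */
    static void bn_mul_mont_ref(uint64_t *rp, const uint64_t *ap,
                                const uint64_t *bp, const uint64_t *np,
                                uint64_t n0, int num, uint64_t *tp) {
      for (int i = 0; i < num + 2; i++) tp[i] = 0;
      for (int i = 0; i < num; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < num; j++) {          /* tp += ap * bp[i] */
          unsigned __int128 t =
              (unsigned __int128)ap[j] * bp[i] + tp[j] + carry;
          tp[j] = (uint64_t)t;
          carry = (uint64_t)(t >> 64);
        }
        unsigned __int128 t = (unsigned __int128)tp[num] + carry;
        tp[num] = (uint64_t)t;
        tp[num + 1] = (uint64_t)(t >> 64);

        uint64_t m1 = tp[0] * n0;                /* makes tp[0] vanish below */
        carry = 0;
        for (int j = 0; j < num; j++) {          /* tp = (tp + np*m1) >> 64 */
          unsigned __int128 u = (unsigned __int128)np[j] * m1 + tp[j] + carry;
          if (j > 0) tp[j - 1] = (uint64_t)u;    /* word 0 is zero, dropped */
          carry = (uint64_t)(u >> 64);
        }
        t = (unsigned __int128)tp[num] + carry;
        tp[num - 1] = (uint64_t)t;
        tp[num] = tp[num + 1] + (uint64_t)(t >> 64);
        tp[num + 1] = 0;
      }
      uint64_t borrow = 0;                       /* rp = tp - np, with borrow */
      for (int j = 0; j < num; j++) {
        unsigned __int128 d = (unsigned __int128)tp[j] - np[j] - borrow;
        rp[j] = (uint64_t)d;
        borrow = (uint64_t)(d >> 64) & 1;
      }
      uint64_t keep_tp = (borrow > tp[num]);     /* tp < np: keep tp instead */
      for (int j = 0; j < num; j++) {
        rp[j] = keep_tp ? tp[j] : rp[j];         /* the asm selects with masks */
      }
    }
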
+$code=<<___;
+.text
+
+.extern	OPENSSL_ia32cap_P
+
+.globl	bn_mul_mont
+.type	bn_mul_mont,\@function,6
+.align	16
+bn_mul_mont:
+	test	\$3,${num}d
+	jnz	.Lmul_enter
+	cmp	\$8,${num}d
+	jb	.Lmul_enter
+___
+$code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
+	cmp	$ap,$bp
+	jne	.Lmul4x_enter
+	test	\$7,${num}d
+	jz	.Lsqr8x_enter
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	2($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	($np),%rax
+
+	imulq	$lo0,$m1		# "tp[0]"*n0
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.L1st
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	mov	$lo0,$hi0
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	jmp	.Louter
+.align	16
+.Louter:
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	imulq	$lo0,$m1		# tp[0]*n0
+	mov	%rdx,$hi0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.Linner
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jb	.Louter
+
+	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	dec	$j			# doesn't affect CF!
+	jnz	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul_epilogue:
+	ret
+.size	bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont,\@function,6
+.align	16
+bn_mul4x_mont:
+.Lmul4x_enter:
+___
+$code.=<<___ if ($addx);
+	and	\$0x80100,%r11d
+	cmp	\$0x80100,%r11d
+	je	.Lmulx4x_enter
+___
+$code.=<<___;
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,${num}d
+	lea	4($num),%r10
+	mov	%rsp,%r11
+	neg	%r10
+	lea	(%rsp,%r10,8),%rsp	# tp=alloca(8*(num+4))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%r11,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul4x_body:
+	mov	$rp,16(%rsp,$num,8)	# tp[num+2]=$rp
+	mov	%rdx,%r12		# reassign $bp
+___
+		$bp="%r12";
+$code.=<<___;
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($bp),$m0		# m0=bp[0]
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4($j),$j		# j++
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+.align	16
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jb	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+.align	4
+.Louter4x:
+	mov	($bp,$i,8),$m0		# m0=bp[i]
+	xor	$j,$j			# j=0
+	mov	(%rsp),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np),%rax
+	adc	\$0,%rdx
+	add	8(%rsp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4($j),$j		# j+=2
+	adc	\$0,%rdx
+	mov	$N[1],(%rsp)		# tp[j-1]
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+.align	16
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	4($j),$j		# j++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	-16($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-32(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+	cmp	$num,$j
+	jb	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-16(%rsp,$j,8),$A[0]	# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-8($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	-8(%rsp,$j,8),$A[1]
+	adc	\$0,%rdx
+	lea	1($i),$i		# i++
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[1],-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$N[0]
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	(%rsp,$num,8),$N[0]	# pull upmost overflow bit
+	adc	\$0,$N[1]
+	mov	$N[0],-8(%rsp,$j,8)
+	mov	$N[1],(%rsp,$j,8)	# store upmost overflow bit
+
+	cmp	$num,$i
+	jb	.Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+	mov	16(%rsp,$num,8),$rp	# restore $rp
+	mov	0(%rsp),@ri[0]		# tp[0]
+	pxor	%xmm0,%xmm0
+	mov	8(%rsp),@ri[1]		# tp[1]
+	shr	\$2,$num		# num/=4
+	lea	(%rsp),$ap		# borrow ap for tp
+	xor	$i,$i			# i=0 and clear CF!
+
+	sub	0($np),@ri[0]
+	mov	16($ap),@ri[2]		# tp[2]
+	mov	24($ap),@ri[3]		# tp[3]
+	sbb	8($np),@ri[1]
+	lea	-1($num),$j		# j=num/4-1
+	jmp	.Lsub4x
+.align	16
+.Lsub4x:
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	16($np,$i,8),@ri[2]
+	mov	32($ap,$i,8),@ri[0]	# tp[i+1]
+	mov	40($ap,$i,8),@ri[1]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	32($np,$i,8),@ri[0]
+	mov	48($ap,$i,8),@ri[2]
+	mov	56($ap,$i,8),@ri[3]
+	sbb	40($np,$i,8),@ri[1]
+	lea	4($i),$i		# i++
+	dec	$j			# doesn't affect CF!
+	jnz	.Lsub4x
+
+	mov	@ri[0],0($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	mov	32($ap,$i,8),@ri[0]	# load overflow bit
+	sbb	16($np,$i,8),@ri[2]
+	mov	@ri[1],8($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	sbb	24($np,$i,8),@ri[3]
+	mov	@ri[2],16($rp,$i,8)	# rp[i]=tp[i]-np[i]
+
+	sbb	\$0,@ri[0]		# handle upmost overflow bit
+	mov	@ri[3],24($rp,$i,8)	# rp[i]=tp[i]-np[i]
+	xor	$i,$i			# i=0
+	and	@ri[0],$ap
+	not	@ri[0]
+	mov	$rp,$np
+	and	@ri[0],$np
+	lea	-1($num),$j
+	or	$np,$ap			# ap=borrow?tp:rp
+
+	movdqu	($ap),%xmm1
+	movdqa	%xmm0,(%rsp)
+	movdqu	%xmm1,($rp)
+	jmp	.Lcopy4x
+.align	16
+.Lcopy4x:					# copy or in-place refresh
+	movdqu	16($ap,$i),%xmm2
+	movdqu	32($ap,$i),%xmm1
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+	movdqa	%xmm0,32(%rsp,$i)
+	movdqu	%xmm1,32($rp,$i)
+	lea	32($i),$i
+	dec	$j
+	jnz	.Lcopy4x
+
+	shl	\$2,$num
+	movdqu	16($ap,$i),%xmm2
+	movdqa	%xmm0,16(%rsp,$i)
+	movdqu	%xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+	mov	(%rsi),%r15
+	mov	8(%rsi),%r14
+	mov	16(%rsi),%r13
+	mov	24(%rsi),%r12
+	mov	32(%rsi),%rbp
+	mov	40(%rsi),%rbx
+	lea	48(%rsi),%rsp
+.Lmul4x_epilogue:
+	ret
+.size	bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+{{{
+######################################################################
+# void bn_sqr8x_mont(
+my $rptr="%rdi";	# const BN_ULONG *rptr,
+my $aptr="%rsi";	# const BN_ULONG *aptr,
+my $bptr="%rdx";	# not used
+my $nptr="%rcx";	# const BN_ULONG *nptr,
+my $n0  ="%r8";		# const BN_ULONG *n0);
+my $num ="%r9";		# int num, has to be divisible by 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.extern	bn_sqr8x_internal		# see x86_64-mont5 module
+.extern	bn_sqrx8x_internal		# see x86_64-mont5 module
+
+.type	bn_sqr8x_mont,\@function,6
+.align	32
+bn_sqr8x_mont:
+.Lsqr8x_enter:
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	mov	${num}d,%r10d
+	shl	\$3,${num}d		# convert $num to bytes
+	shl	\$3+2,%r10		# 4*$num
+	neg	$num
+
+	##############################################################
+	# ensure that the stack frame doesn't alias with $aptr modulo
+	# 4096. This is done to allow the memory disambiguation logic
+	# to do its job.
+	#
+	lea	-64(%rsp,$num,4),%r11
+	mov	($n0),$n0		# *n0
+	sub	$aptr,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lsqr8x_sp_alt
+	sub	%r11,%rsp		# align with $aptr
+	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	jmp	.Lsqr8x_sp_done
+
+.align	32
+.Lsqr8x_sp_alt:
+	lea	4096-64(,$num,4),%r10	# 4096-frame-4*$num
+	lea	-64(%rsp,$num,4),%rsp	# alloca(frame+4*$num)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lsqr8x_sp_done:
+	and	\$-64,%rsp
+	mov	$num,%r10	
+	neg	$num
+
+	lea	64(%rsp,$num,2),%r11	# copy of modulus
+	mov	$n0,  32(%rsp)
+	mov	%rax, 40(%rsp)		# save original %rsp
+.Lsqr8x_body:
+
+	mov	$num,$i
+	movq	%r11, %xmm2		# save pointer to modulus copy
+	shr	\$3+2,$i
+	mov	OPENSSL_ia32cap_P+8(%rip),%eax
+	jmp	.Lsqr8x_copy_n
+
+.align	32
+.Lsqr8x_copy_n:
+	movq	8*0($nptr),%xmm0
+	movq	8*1($nptr),%xmm1
+	movq	8*2($nptr),%xmm3
+	movq	8*3($nptr),%xmm4
+	lea	8*4($nptr),$nptr
+	movdqa	%xmm0,16*0(%r11)
+	movdqa	%xmm1,16*1(%r11)
+	movdqa	%xmm3,16*2(%r11)
+	movdqa	%xmm4,16*3(%r11)
+	lea	16*4(%r11),%r11
+	dec	$i
+	jnz	.Lsqr8x_copy_n
+
+	pxor	%xmm0,%xmm0
+	movq	$rptr,%xmm1		# save $rptr
+	movq	%r10, %xmm3		# -$num
+___
+$code.=<<___ if ($addx);
+	and	\$0x80100,%eax
+	cmp	\$0x80100,%eax
+	jne	.Lsqr8x_nox
+
+	call	bn_sqrx8x_internal	# see x86_64-mont5 module
+
+	pxor	%xmm0,%xmm0
+	lea	48(%rsp),%rax
+	lea	64(%rsp,$num,2),%rdx
+	shr	\$3+2,$num
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lsqr8x_zero
+
+.align	32
+.Lsqr8x_nox:
+___
+$code.=<<___;
+	call	bn_sqr8x_internal	# see x86_64-mont5 module
+
+	pxor	%xmm0,%xmm0
+	lea	48(%rsp),%rax
+	lea	64(%rsp,$num,2),%rdx
+	shr	\$3+2,$num
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lsqr8x_zero
+
+.align	32
+.Lsqr8x_zero:
+	movdqa	%xmm0,16*0(%rax)	# wipe t
+	movdqa	%xmm0,16*1(%rax)
+	movdqa	%xmm0,16*2(%rax)
+	movdqa	%xmm0,16*3(%rax)
+	lea	16*4(%rax),%rax
+	movdqa	%xmm0,16*0(%rdx)	# wipe n
+	movdqa	%xmm0,16*1(%rdx)
+	movdqa	%xmm0,16*2(%rdx)
+	movdqa	%xmm0,16*3(%rdx)
+	lea	16*4(%rdx),%rdx
+	dec	$num
+	jnz	.Lsqr8x_zero
+
+	mov	\$1,%rax
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lsqr8x_epilogue:
+	ret
+.size	bn_sqr8x_mont,.-bn_sqr8x_mont
+___
+}}}
+
+if ($addx) {{{
+my $bp="%rdx";	# original value
+
+$code.=<<___;
+.type	bn_mulx4x_mont,\@function,6
+.align	32
+bn_mulx4x_mont:
+.Lmulx4x_enter:
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+
+	shl	\$3,${num}d		# convert $num to bytes
+	.byte	0x67
+	xor	%r10,%r10
+	sub	$num,%r10		# -$num
+	mov	($n0),$n0		# *n0
+	lea	-72(%rsp,%r10),%rsp	# alloca(frame+$num+8)
+	lea	($bp,$num),%r10
+	and	\$-128,%rsp
+	##############################################################
+	# Stack layout
+	# +0	num
+	# +8	off-loaded &b[i]
+	# +16	end of b[num]
+	# +24	saved n0
+	# +32	saved rp
+	# +40	saved %rsp
+	# +48	inner counter
+	# +56
+	# +64	tmp[num+1]
+	#
+	mov	$num,0(%rsp)		# save $num
+	shr	\$5,$num
+	mov	%r10,16(%rsp)		# end of b[num]
+	sub	\$1,$num
+	mov	$n0, 24(%rsp)		# save *n0
+	mov	$rp, 32(%rsp)		# save $rp
+	mov	%rax,40(%rsp)		# save original %rsp
+	mov	$num,48(%rsp)		# inner counter
+	jmp	.Lmulx4x_body
+
+.align	32
+.Lmulx4x_body:
+___
+my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
+   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
+$code.=<<___;
+	lea	8($bp),$bptr
+	mov	($bp),%rdx		# b[0], $bp==%rdx actually
+	lea	64+32(%rsp),$tptr
+	mov	%rdx,$bi
+
+	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
+	mulx	1*8($aptr),%r11,%r14	# a[1]*b[0]
+	add	%rax,%r11
+	mov	$bptr,8(%rsp)		# off-load &b[i]
+	mulx	2*8($aptr),%r12,%r13	# ...
+	adc	%r14,%r12
+	adc	\$0,%r13
+
+	mov	$mi,$bptr		# borrow $bptr
+	imulq	24(%rsp),$mi		# "t[0]"*n0
+	xor	$zero,$zero		# cf=0, of=0
+
+	mulx	3*8($aptr),%rax,%r14
+	 mov	$mi,%rdx
+	lea	4*8($aptr),$aptr
+	adcx	%rax,%r13
+	adcx	$zero,%r14		# cf=0
+
+	mulx	0*8($nptr),%rax,%r10
+	adcx	%rax,$bptr		# discarded
+	adox	%r11,%r10
+	mulx	1*8($nptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+	.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00	# mulx	2*8($nptr),%rax,%r12
+	mov	48(%rsp),$bptr		# counter value
+	mov	%r10,-4*8($tptr)
+	adcx	%rax,%r11
+	adox	%r13,%r12
+	mulx	3*8($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	mov	%r11,-3*8($tptr)
+	adcx	%rax,%r12
+	adox	$zero,%r15		# of=0
+	lea	4*8($nptr),$nptr
+	mov	%r12,-2*8($tptr)
+
+	jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcx	$zero,%r15		# cf=0, modulo-scheduled
+	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
+	adcx	%r14,%r10
+	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
+	adcx	%rax,%r11
+	mulx	2*8($aptr),%r12,%rax	# ...
+	adcx	%r14,%r12
+	mulx	3*8($aptr),%r13,%r14
+	 .byte	0x67,0x67
+	 mov	$mi,%rdx
+	adcx	%rax,%r13
+	adcx	$zero,%r14		# cf=0
+	lea	4*8($aptr),$aptr
+	lea	4*8($tptr),$tptr
+
+	adox	%r15,%r10
+	mulx	0*8($nptr),%rax,%r15
+	adcx	%rax,%r10
+	adox	%r15,%r11
+	mulx	1*8($nptr),%rax,%r15
+	adcx	%rax,%r11
+	adox	%r15,%r12
+	mulx	2*8($nptr),%rax,%r15
+	mov	%r10,-5*8($tptr)
+	adcx	%rax,%r12
+	mov	%r11,-4*8($tptr)
+	adox	%r15,%r13
+	mulx	3*8($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	mov	%r12,-3*8($tptr)
+	adcx	%rax,%r13
+	adox	$zero,%r15
+	lea	4*8($nptr),$nptr
+	mov	%r13,-2*8($tptr)
+
+	dec	$bptr			# of=0, pass cf
+	jnz	.Lmulx4x_1st
+
+	mov	0(%rsp),$num		# load num
+	mov	8(%rsp),$bptr		# re-load &b[i]
+	adc	$zero,%r15		# modulo-scheduled
+	add	%r15,%r14
+	sbb	%r15,%r15		# top-most carry
+	mov	%r14,-1*8($tptr)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	mov	($bptr),%rdx		# b[i]
+	lea	8($bptr),$bptr		# b++
+	sub	$num,$aptr		# rewind $aptr
+	mov	%r15,($tptr)		# save top-most carry
+	lea	64+4*8(%rsp),$tptr
+	sub	$num,$nptr		# rewind $nptr
+
+	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
+	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
+	mov	%rdx,$bi
+	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
+	adox	-4*8($tptr),$mi
+	adcx	%r14,%r11
+	mulx	2*8($aptr),%r15,%r13	# ...
+	adox	-3*8($tptr),%r11
+	adcx	%r15,%r12
+	adox	$zero,%r12
+	adcx	$zero,%r13
+
+	mov	$bptr,8(%rsp)		# off-load &b[i]
+	.byte	0x67
+	mov	$mi,%r15
+	imulq	24(%rsp),$mi		# "t[0]"*n0
+	xor	%ebp,%ebp		# xor	$zero,$zero	# cf=0, of=0
+
+	mulx	3*8($aptr),%rax,%r14
+	 mov	$mi,%rdx
+	adox	-2*8($tptr),%r12
+	adcx	%rax,%r13
+	adox	-1*8($tptr),%r13
+	adcx	$zero,%r14
+	lea	4*8($aptr),$aptr
+	adox	$zero,%r14
+
+	mulx	0*8($nptr),%rax,%r10
+	adcx	%rax,%r15		# discarded
+	adox	%r11,%r10
+	mulx	1*8($nptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+	mulx	2*8($nptr),%rax,%r12
+	mov	%r10,-4*8($tptr)
+	adcx	%rax,%r11
+	adox	%r13,%r12
+	mulx	3*8($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	mov	%r11,-3*8($tptr)
+	lea	4*8($nptr),$nptr
+	adcx	%rax,%r12
+	adox	$zero,%r15		# of=0
+	mov	48(%rsp),$bptr		# counter value
+	mov	%r12,-2*8($tptr)
+
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
+	adcx	$zero,%r15		# cf=0, modulo-scheduled
+	adox	%r14,%r10
+	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
+	adcx	0*8($tptr),%r10
+	adox	%rax,%r11
+	mulx	2*8($aptr),%r12,%rax	# ...
+	adcx	1*8($tptr),%r11
+	adox	%r14,%r12
+	mulx	3*8($aptr),%r13,%r14
+	 mov	$mi,%rdx
+	adcx	2*8($tptr),%r12
+	adox	%rax,%r13
+	adcx	3*8($tptr),%r13
+	adox	$zero,%r14		# of=0
+	lea	4*8($aptr),$aptr
+	lea	4*8($tptr),$tptr
+	adcx	$zero,%r14		# cf=0
+
+	adox	%r15,%r10
+	mulx	0*8($nptr),%rax,%r15
+	adcx	%rax,%r10
+	adox	%r15,%r11
+	mulx	1*8($nptr),%rax,%r15
+	adcx	%rax,%r11
+	adox	%r15,%r12
+	mulx	2*8($nptr),%rax,%r15
+	mov	%r10,-5*8($tptr)
+	adcx	%rax,%r12
+	adox	%r15,%r13
+	mulx	3*8($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	mov	%r11,-4*8($tptr)
+	mov	%r12,-3*8($tptr)
+	adcx	%rax,%r13
+	adox	$zero,%r15
+	lea	4*8($nptr),$nptr
+	mov	%r13,-2*8($tptr)
+
+	dec	$bptr			# of=0, pass cf
+	jnz	.Lmulx4x_inner
+
+	mov	0(%rsp),$num		# load num
+	mov	8(%rsp),$bptr		# re-load &b[i]
+	adc	$zero,%r15		# modulo-scheduled
+	sub	0*8($tptr),$zero	# pull top-most carry
+	adc	%r15,%r14
+	mov	-8($nptr),$mi
+	sbb	%r15,%r15		# top-most carry
+	mov	%r14,-1*8($tptr)
+
+	cmp	16(%rsp),$bptr
+	jne	.Lmulx4x_outer
+
+	sub	%r14,$mi		# compare top-most words
+	sbb	$mi,$mi
+	or	$mi,%r15
+
+	neg	$num
+	xor	%rdx,%rdx
+	mov	32(%rsp),$rptr		# restore rp
+	lea	64(%rsp),$tptr
+
+	pxor	%xmm0,%xmm0
+	mov	0*8($nptr,$num),%r8
+	mov	1*8($nptr,$num),%r9
+	neg	%r8
+	jmp	.Lmulx4x_sub_entry
+
+.align	32
+.Lmulx4x_sub:
+	mov	0*8($nptr,$num),%r8
+	mov	1*8($nptr,$num),%r9
+	not	%r8
+.Lmulx4x_sub_entry:
+	mov	2*8($nptr,$num),%r10
+	not	%r9
+	and	%r15,%r8
+	mov	3*8($nptr,$num),%r11
+	not	%r10
+	and	%r15,%r9
+	not	%r11
+	and	%r15,%r10
+	and	%r15,%r11
+
+	neg	%rdx			# mov %rdx,%cf
+	adc	0*8($tptr),%r8
+	adc	1*8($tptr),%r9
+	movdqa	%xmm0,($tptr)
+	adc	2*8($tptr),%r10
+	adc	3*8($tptr),%r11
+	movdqa	%xmm0,16($tptr)
+	lea	4*8($tptr),$tptr
+	sbb	%rdx,%rdx		# mov %cf,%rdx
+
+	mov	%r8,0*8($rptr)
+	mov	%r9,1*8($rptr)
+	mov	%r10,2*8($rptr)
+	mov	%r11,3*8($rptr)
+	lea	4*8($rptr),$rptr
+
+	add	\$32,$num
+	jnz	.Lmulx4x_sub
+
+	mov	40(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lmulx4x_epilogue:
+	ret
+.size	bn_mulx4x_mont,.-bn_mulx4x_mont
+___
+}}}
+$code.=<<___;
+.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align	16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mul_handler,\@abi-omnipotent
+.align	16
+mul_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+	lea	48(%rax),%rax
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+	jmp	.Lcommon_seh_tail
+.size	mul_handler,.-mul_handler
+
+.type	sqr_handler,\@abi-omnipotent
+.align	16
+sqr_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
+	jae	.Lcommon_seh_tail
+
+	mov	40(%rax),%rax		# pull saved stack pointer
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	sqr_handler,.-sqr_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont
+	.rva	.LSEH_end_bn_mul_mont
+	.rva	.LSEH_info_bn_mul_mont
+
+	.rva	.LSEH_begin_bn_mul4x_mont
+	.rva	.LSEH_end_bn_mul4x_mont
+	.rva	.LSEH_info_bn_mul4x_mont
+
+	.rva	.LSEH_begin_bn_sqr8x_mont
+	.rva	.LSEH_end_bn_sqr8x_mont
+	.rva	.LSEH_info_bn_sqr8x_mont
+___
+$code.=<<___ if ($addx);
+	.rva	.LSEH_begin_bn_mulx4x_mont
+	.rva	.LSEH_end_bn_mulx4x_mont
+	.rva	.LSEH_info_bn_mulx4x_mont
+___
+$code.=<<___;
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
+.LSEH_info_bn_sqr8x_mont:
+	.byte	9,0,0,0
+	.rva	sqr_handler
+	.rva	.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
+___
+$code.=<<___ if ($addx);
+.LSEH_info_bn_mulx4x_mont:
+	.byte	9,0,0,0
+	.rva	sqr_handler
+	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
+___
+}
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100644
index 0000000..c107df9
--- /dev/null
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -0,0 +1,3514 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes the cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to a powers table computed in BN_mod_exp_mont_consttime.
+# In addition, a subroutine that scatters elements of the powers table
+# is implemented, so that scattering/gathering can be tuned without
+# modifying bn_exp.c.
+
+# August 2013.
+#
+# Add MULX/AD*X code paths and additional interfaces to optimize for
+# the branch prediction unit. For input lengths that are multiples of 8,
+# the np argument is not just the modulus value, but one interleaved
+# with 0. This is to optimize the post-condition...
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
+		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.23);
+}
+
+if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
+	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
+	$addx = ($1>=2.10);
+}
+
+if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
+	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
+	$addx = ($1>=11);
+}
+
+# int bn_mul_mont_gather5(
+$rp="%rdi";	# BN_ULONG *rp,
+$ap="%rsi";	# const BN_ULONG *ap,
+$bp="%rdx";	# const BN_ULONG *bp,
+$np="%rcx";	# const BN_ULONG *np,
+$n0="%r8";	# const BN_ULONG *n0,
+$num="%r9";	# int num,
+		# int idx);	# 0 to 2^5-1, "index" in $bp holding
+				# pre-computed powers of a', interlaced
+				# in such a manner that b[0] is $bp[idx],
+				# b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
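bn_mul_mont_gather5 differs from bn_mul_mont in how it fetches each b[i]:
instead of indexing the powers table directly with a secret exponent window,
it reads across the whole stride and selects the wanted entry with the
%xmm4-%xmm7 masks (the pand/por sequences below), so the cache-line access
pattern does not depend on the index. A minimal sketch of that selection idea
(portable C, not the actual scatter/gather layout this file uses):

    #include <stdint.h>

    /* Constant-time select of table[idx] from a 2^5-entry table: every entry
     * is touched and combined with an all-ones/all-zeros mask, so the memory
     * access pattern is independent of the secret idx. */
    static uint64_t gather_word_const_time(const uint64_t table[32],
                                           uint64_t idx) {
      uint64_t r = 0;
      for (uint64_t i = 0; i < 32; i++) {
        uint64_t mask = 0 - (uint64_t)(i == idx);  /* all ones iff i == idx */
        r |= table[i] & mask;
      }
      return r;
    }
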
+$code=<<___;
+.text
+
+.extern	OPENSSL_ia32cap_P
+
+.globl	bn_mul_mont_gather5
+.type	bn_mul_mont_gather5,\@function,6
+.align	64
+bn_mul_mont_gather5:
+	test	\$7,${num}d
+	jnz	.Lmul_enter
+___
+$code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
+___
+$code.=<<___;
+	jmp	.Lmul4x_enter
+
+.align	16
+.Lmul_enter:
+	mov	${num}d,${num}d
+	mov	%rsp,%rax
+	mov	`($win64?56:8)`(%rsp),%r10d	# load 7th argument
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	lea	2($num),%r11
+	neg	%r11
+	lea	(%rsp,%r11,8),%rsp	# tp=alloca(8*(num+2))
+	and	\$-1024,%rsp		# minimize TLB usage
+
+	mov	%rax,8(%rsp,$num,8)	# tp[num+1]=%rsp
+.Lmul_body:
+	mov	$bp,%r12		# reassign $bp
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,$m0		# m0=bp[0]
+
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+
+	xor	$i,$i			# i=0
+	xor	$j,$j			# j=0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$lo0
+	mov	($np),%rax
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# "tp[0]"*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.L1st_enter
+
+.align	16
+.L1st:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	mov	$lo0,$hi0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.L1st_enter:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	lea	1($j),$j		# j++
+	mov	%rdx,$lo0
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.L1st
+
+	movq	%xmm0,$m0		# bp[1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$hi0,$hi1		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+	mov	$lo0,$hi0
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	jmp	.Louter
+.align	16
+.Louter:
+	xor	$j,$j			# j=0
+	mov	$n0,$m1
+	mov	(%rsp),$lo0
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$lo0		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+
+	imulq	$lo0,$m1		# tp[0]*n0
+	mov	%rdx,$hi0
+
+	por	%xmm2,%xmm0
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$lo0		# discarded
+	mov	8($ap),%rax
+	adc	\$0,%rdx
+	mov	8(%rsp),$lo0		# tp[1]
+	mov	%rdx,$hi1
+
+	lea	1($j),$j		# j++
+	jmp	.Linner_enter
+
+.align	16
+.Linner:
+	add	%rax,$hi1
+	mov	($ap,$j,8),%rax
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+.Linner_enter:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$hi0
+	mov	($np,$j,8),%rax
+	adc	\$0,%rdx
+	add	$hi0,$lo0		# ap[j]*bp[i]+tp[j]
+	mov	%rdx,$hi0
+	adc	\$0,$hi0
+	lea	1($j),$j		# j++
+
+	mulq	$m1			# np[j]*m1
+	cmp	$num,$j
+	jne	.Linner
+
+	movq	%xmm0,$m0		# bp[i+1]
+
+	add	%rax,$hi1
+	mov	($ap),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	mov	(%rsp,$j,8),$lo0
+	adc	\$0,%rdx
+	mov	$hi1,-16(%rsp,$j,8)	# tp[j-1]
+	mov	%rdx,$hi1
+
+	xor	%rdx,%rdx
+	add	$hi0,$hi1
+	adc	\$0,%rdx
+	add	$lo0,$hi1		# pull upmost overflow bit
+	adc	\$0,%rdx
+	mov	$hi1,-8(%rsp,$num,8)
+	mov	%rdx,(%rsp,$num,8)	# store upmost overflow bit
+
+	lea	1($i),$i		# i++
+	cmp	$num,$i
+	jb	.Louter
+
+	xor	$i,$i			# i=0 and clear CF!
+	mov	(%rsp),%rax		# tp[0]
+	lea	(%rsp),$ap		# borrow ap for tp
+	mov	$num,$j			# j=num
+	jmp	.Lsub
+.align	16
+.Lsub:	sbb	($np,$i,8),%rax
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]-np[i]
+	mov	8($ap,$i,8),%rax	# tp[i+1]
+	lea	1($i),$i		# i++
+	dec	$j			# doesn't affect CF!
+	jnz	.Lsub
+
+	sbb	\$0,%rax		# handle upmost overflow bit
+	xor	$i,$i
+	and	%rax,$ap
+	not	%rax
+	mov	$rp,$np
+	and	%rax,$np
+	mov	$num,$j			# j=num
+	or	$np,$ap			# ap=borrow?tp:rp
+.align	16
+.Lcopy:					# copy or in-place refresh
+	mov	($ap,$i,8),%rax
+	mov	$i,(%rsp,$i,8)		# zap temporary vector
+	mov	%rax,($rp,$i,8)		# rp[i]=tp[i]
+	lea	1($i),$i
+	sub	\$1,$j
+	jnz	.Lcopy
+
+	mov	8(%rsp,$num,8),%rsi	# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	-88(%rsi),%xmm6
+	movaps	-72(%rsi),%xmm7
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lmul_epilogue:
+	ret
+.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type	bn_mul4x_mont_gather5,\@function,6
+.align	32
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+___
+$code.=<<___ if ($addx);
+	and	\$0x80100,%r11d
+	cmp	\$0x80100,%r11d
+	je	.Lmulx4x_enter
+___
+$code.=<<___;
+	.byte	0x67
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	.byte	0x67
+	mov	${num}d,%r10d
+	shl	\$3,${num}d
+	shl	\$3+2,%r10d		# 4*$num
+	neg	$num			# -$num
+
+	##############################################################
+	# ensure that the stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic. [An excessive frame is allocated in order
+	# to allow bn_from_mont8x to clear it.]
+	#
+	lea	-64(%rsp,$num,2),%r11
+	sub	$ap,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lmul4xsp_alt
+	sub	%r11,%rsp		# align with $ap
+	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
+	jmp	.Lmul4xsp_done
+
+.align	32
+.Lmul4xsp_alt:
+	lea	4096-64(,$num,2),%r10
+	lea	-64(%rsp,$num,2),%rsp	# alloca(128+num*8)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lmul4xsp_done:
+	and	\$-64,%rsp
+	neg	$num
+
+	mov	%rax,40(%rsp)
+.Lmul4x_body:
+
+	call	mul4x_internal
+
+	mov	40(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	-88(%rsi),%xmm6
+	movaps	-72(%rsi),%xmm7
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lmul4x_epilogue:
+	ret
+.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+
+.type	mul4x_internal,\@abi-omnipotent
+.align	32
+mul4x_internal:
+	shl	\$5,$num
+	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
+	lea	256(%rdx,$num),%r13
+	shr	\$5,$num		# restore $num
+___
+		$bp="%r12";
+		$STRIDE=2**5*8;		# 5 is "window size"
+		$N=$STRIDE/4;		# should match cache line size
+		$tp=$i;
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96(%rdx,%r11,8),$bp	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	add	\$7,%r11
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+	and	\$7,%r11
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	lea	$STRIDE($bp),$tp	# borrow $tp
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+	pand	%xmm6,%xmm2
+	.byte	0x67
+	por	%xmm1,%xmm0
+	movq	`0*$STRIDE/4-96`($tp),%xmm1
+	.byte	0x67
+	pand	%xmm7,%xmm3
+	.byte	0x67
+	por	%xmm2,%xmm0
+	movq	`1*$STRIDE/4-96`($tp),%xmm2
+	.byte	0x67
+	pand	%xmm4,%xmm1
+	.byte	0x67
+	por	%xmm3,%xmm0
+	movq	`2*$STRIDE/4-96`($tp),%xmm3
+
+	movq	%xmm0,$m0		# m0=bp[0]
+	movq	`3*$STRIDE/4-96`($tp),%xmm0
+	mov	%r13,16+8(%rsp)		# save end of b[num]
+	mov	$rp, 56+8(%rsp)		# save $rp
+
+	mov	($n0),$n0		# pull n0[0] value
+	mov	($ap),%rax
+	lea	($ap,$num),$ap		# end of a[num]
+	neg	$num
+
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[0]
+	mov	%rax,$A[0]
+	mov	($np),%rax
+
+	pand	%xmm5,%xmm2
+	pand	%xmm6,%xmm3
+	por	%xmm2,%xmm1
+
+	imulq	$A[0],$m1		# "tp[0]"*n0
+	##############################################################
+	# $tp is chosen so that writing to the top-most element of the
+	# vector occurs just "above" references to the powers table,
+	# "above" modulo cache-line size, which effectively precludes the
+	# possibility of a memory disambiguation logic failure when
+	# accessing the table.
+	#
+	lea	64+8(%rsp,%r11,8),$tp
+	mov	%rdx,$A[1]
+
+	pand	%xmm7,%xmm0
+	por	%xmm3,%xmm1
+	lea	2*$STRIDE($bp),$bp
+	por	%xmm1,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# discarded
+	mov	8($ap,$num),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0
+	add	%rax,$A[1]
+	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1
+	add	%rax,$N[1]
+	mov	16($ap,$num),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	4*8($num),$j		# j=4
+	lea	16*4($np),$np
+	adc	\$0,%rdx
+	mov	$N[1],($tp)
+	mov	%rdx,$N[0]
+	jmp	.L1st4x
+
+.align	32
+.L1st4x:
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16*2($np),%rax
+	lea	32($tp),$tp
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-16*1($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	16*0($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-8($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	16*1($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	lea	16*4($np),$np
+	adc	\$0,%rdx
+	mov	$N[1],($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	add	\$32,$j			# j+=4
+	jnz	.L1st4x
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[0]
+	mov	-16*2($np),%rax
+	lea	32($tp),$tp
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[0],-24($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[0]
+	add	%rax,$A[1]
+	mov	-16*1($np),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$num),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[1]
+	lea	($np,$num,2),$np	# rewind $np
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	mov	$N[0],-8($tp)
+
+	jmp	.Louter4x
+
+.align	32
+.Louter4x:
+	mov	($tp,$num),$A[0]
+	mov	$n0,$m1
+	mulq	$m0			# ap[0]*bp[i]
+	add	%rax,$A[0]		# ap[0]*bp[i]+tp[0]
+	mov	($np),%rax
+	adc	\$0,%rdx
+
+	movq	`0*$STRIDE/4-96`($bp),%xmm0
+	movq	`1*$STRIDE/4-96`($bp),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bp),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bp),%xmm3
+
+	imulq	$A[0],$m1		# tp[0]*n0
+	.byte	0x67
+	mov	%rdx,$A[1]
+	mov	$N[1],($tp)		# store upmost overflow bit
+
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	lea	($tp,$num),$tp		# rewind $tp
+	lea	$STRIDE($bp),$bp
+	por	%xmm3,%xmm0
+
+	mulq	$m1			# np[0]*m1
+	add	%rax,$A[0]		# "$N[0]", discarded
+	mov	8($ap,$num),%rax
+	adc	\$0,%rdx
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	16*1($np),%rax		# interleaved with 0, therefore 16*n
+	adc	\$0,%rdx
+	add	8($tp),$A[1]		# +tp[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap,$num),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]		# np[j]*m1+ap[j]*bp[i]+tp[j]
+	lea	4*8($num),$j		# j=4
+	lea	16*4($np),$np
+	adc	\$0,%rdx
+	mov	%rdx,$N[0]
+	jmp	.Linner4x
+
+.align	32
+.Linner4x:
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16*2($np),%rax
+	adc	\$0,%rdx
+	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
+	lea	32($tp),$tp
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	-16*1($np),%rax
+	adc	\$0,%rdx
+	add	-8($tp),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	16*0($np),%rax
+	adc	\$0,%rdx
+	add	($tp),$A[0]		# ap[j]*bp[i]+tp[j]
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	8($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-16($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	16*1($np),%rax
+	adc	\$0,%rdx
+	add	8($tp),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	16($ap,$j),%rax
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	lea	16*4($np),$np
+	adc	\$0,%rdx
+	mov	$N[0],-8($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	add	\$32,$j			# j+=4
+	jnz	.Linner4x
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[0]
+	mov	-16*2($np),%rax
+	adc	\$0,%rdx
+	add	16($tp),$A[0]		# ap[j]*bp[i]+tp[j]
+	lea	32($tp),$tp
+	adc	\$0,%rdx
+	mov	%rdx,$A[1]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[0]
+	mov	-8($ap),%rax
+	adc	\$0,%rdx
+	add	$A[0],$N[0]
+	adc	\$0,%rdx
+	mov	$N[1],-32($tp)		# tp[j-1]
+	mov	%rdx,$N[1]
+
+	mulq	$m0			# ap[j]*bp[i]
+	add	%rax,$A[1]
+	mov	$m1,%rax
+	mov	-16*1($np),$m1
+	adc	\$0,%rdx
+	add	-8($tp),$A[1]
+	adc	\$0,%rdx
+	mov	%rdx,$A[0]
+
+	mulq	$m1			# np[j]*m1
+	add	%rax,$N[1]
+	mov	($ap,$num),%rax		# ap[0]
+	adc	\$0,%rdx
+	add	$A[1],$N[1]
+	adc	\$0,%rdx
+	mov	$N[0],-24($tp)		# tp[j-1]
+	mov	%rdx,$N[0]
+
+	movq	%xmm0,$m0		# bp[i+1]
+	mov	$N[1],-16($tp)		# tp[j-1]
+	lea	($np,$num,2),$np	# rewind $np
+
+	xor	$N[1],$N[1]
+	add	$A[0],$N[0]
+	adc	\$0,$N[1]
+	add	($tp),$N[0]		# pull upmost overflow bit
+	adc	\$0,$N[1]		# upmost overflow bit
+	mov	$N[0],-8($tp)
+
+	cmp	16+8(%rsp),$bp
+	jb	.Louter4x
+___
+if (1) {
+$code.=<<___;
+	sub	$N[0],$m1		# compare top-most words
+	adc	$j,$j			# $j is zero
+	or	$j,$N[1]
+	xor	\$1,$N[1]
+	lea	($tp,$num),%rbx		# tptr in .sqr4x_sub
+	lea	($np,$N[1],8),%rbp	# nptr in .sqr4x_sub
+	mov	%r9,%rcx
+	sar	\$3+2,%rcx		# cf=0
+	mov	56+8(%rsp),%rdi		# rptr in .sqr4x_sub
+	jmp	.Lsqr4x_sub
+___
+} else {
+my @ri=("%rax",$bp,$m0,$m1);
+my $rp="%rdx";
+$code.=<<___
+	xor	\$1,$N[1]
+	lea	($tp,$num),$tp		# rewind $tp
+	sar	\$5,$num		# cf=0
+	lea	($np,$N[1],8),$np
+	mov	56+8(%rsp),$rp		# restore $rp
+	jmp	.Lsub4x
+
+.align	32
+.Lsub4x:
+	.byte	0x66
+	mov	8*0($tp),@ri[0]
+	mov	8*1($tp),@ri[1]
+	.byte	0x66
+	sbb	16*0($np),@ri[0]
+	mov	8*2($tp),@ri[2]
+	sbb	16*1($np),@ri[1]
+	mov	3*8($tp),@ri[3]
+	lea	4*8($tp),$tp
+	sbb	16*2($np),@ri[2]
+	mov	@ri[0],8*0($rp)
+	sbb	16*3($np),@ri[3]
+	lea	16*4($np),$np
+	mov	@ri[1],8*1($rp)
+	mov	@ri[2],8*2($rp)
+	mov	@ri[3],8*3($rp)
+	lea	8*4($rp),$rp
+
+	inc	$num
+	jnz	.Lsub4x
+
+	ret
+___
+}
+$code.=<<___;
+.size	mul4x_internal,.-mul4x_internal
+___
+}}}
+{{{
+######################################################################
+# void bn_power5(
+my $rptr="%rdi";	# BN_ULONG *rptr,
+my $aptr="%rsi";	# const BN_ULONG *aptr,
+my $bptr="%rdx";	# const void *table,
+my $nptr="%rcx";	# const BN_ULONG *nptr,
+my $n0  ="%r8";		# const BN_ULONG *n0);
+my $num ="%r9";		# int num, has to be divisible by 8
+			# int pwr);
+
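+#
+# In outline (a sketch of what the body does): bn_power5 performs five
+# back-to-back Montgomery squarings of a[] followed by one Montgomery
+# multiplication by the power selected from the scattered table, roughly
+#
+#	tmp = a;
+#	for (i = 0; i < 5; i++) tmp = MontSqr(tmp);	# __bn_sqr8x_internal
+#	r = MontMul(tmp, tbl[pwr]);			# mul4x_internal + gather
+#
+# (pseudocode; MontSqr/MontMul stand for Montgomery squaring and
+# multiplication).  The table access goes through the cache-line-masking
+# gather, so which cache lines are touched does not depend on pwr.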
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.globl	bn_power5
+.type	bn_power5,\@function,6
+.align	32
+bn_power5:
+___
+$code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
+	and	\$0x80100,%r11d
+	cmp	\$0x80100,%r11d
+	je	.Lpowerx5_enter
+___
+$code.=<<___;
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	mov	${num}d,%r10d
+	shl	\$3,${num}d		# convert $num to bytes
+	shl	\$3+2,%r10d		# 4*$num
+	neg	$num
+	mov	($n0),$n0		# *n0
+
+	##############################################################
+	# ensure that stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic.
+	#
+	lea	-64(%rsp,$num,2),%r11
+	sub	$aptr,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lpwr_sp_alt
+	sub	%r11,%rsp		# align with $aptr
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	jmp	.Lpwr_sp_done
+
+.align	32
+.Lpwr_sp_alt:
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lpwr_sp_done:
+	and	\$-64,%rsp
+	mov	$num,%r10	
+	neg	$num
+
+	##############################################################
+	# Stack layout
+	#
+	# +0	saved $num, used in reduction section
+	# +8	&t[2*$num], used in reduction section
+	# +32	saved *n0
+	# +40	saved %rsp
+	# +48	t[2*$num]
+	#
+	mov	$n0,  32(%rsp)
+	mov	%rax, 40(%rsp)		# save original %rsp
+.Lpower5_body:
+	movq	$rptr,%xmm1		# save $rptr
+	movq	$nptr,%xmm2		# save $nptr
+	movq	%r10, %xmm3		# -$num
+	movq	$bptr,%xmm4
+
+	call	__bn_sqr8x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_sqr8x_internal
+	call	__bn_sqr8x_internal
+
+	movq	%xmm2,$nptr
+	movq	%xmm4,$bptr
+	mov	$aptr,$rptr
+	mov	40(%rsp),%rax
+	lea	32(%rsp),$n0
+
+	call	mul4x_internal
+
+	mov	40(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lpower5_epilogue:
+	ret
+.size	bn_power5,.-bn_power5
+
+.globl	bn_sqr8x_internal
+.hidden	bn_sqr8x_internal
+.type	bn_sqr8x_internal,\@abi-omnipotent
+.align	32
+bn_sqr8x_internal:
+__bn_sqr8x_internal:
+	##############################################################
+	# Squaring part:
+	#
+	# a) multiply-n-add everything but a[i]*a[i];
+	# b) shift result of a) by 1 to the left and accumulate
+	#    a[i]*a[i] products;
+	#
+	##############################################################
+	#                                                     a[1]a[0]
+	#                                                 a[2]a[0]
+	#                                             a[3]a[0]
+	#                                             a[2]a[1]
+	#                                         a[4]a[0]
+	#                                         a[3]a[1]
+	#                                     a[5]a[0]
+	#                                     a[4]a[1]
+	#                                     a[3]a[2]
+	#                                 a[6]a[0]
+	#                                 a[5]a[1]
+	#                                 a[4]a[2]
+	#                             a[7]a[0]
+	#                             a[6]a[1]
+	#                             a[5]a[2]
+	#                             a[4]a[3]
+	#                         a[7]a[1]
+	#                         a[6]a[2]
+	#                         a[5]a[3]
+	#                     a[7]a[2]
+	#                     a[6]a[3]
+	#                     a[5]a[4]
+	#                 a[7]a[3]
+	#                 a[6]a[4]
+	#             a[7]a[4]
+	#             a[6]a[5]
+	#         a[7]a[5]
+	#     a[7]a[6]
+	#                                                     a[1]a[0]
+	#                                                 a[2]a[0]
+	#                                             a[3]a[0]
+	#                                         a[4]a[0]
+	#                                     a[5]a[0]
+	#                                 a[6]a[0]
+	#                             a[7]a[0]
+	#                                             a[2]a[1]
+	#                                         a[3]a[1]
+	#                                     a[4]a[1]
+	#                                 a[5]a[1]
+	#                             a[6]a[1]
+	#                         a[7]a[1]
+	#                                     a[3]a[2]
+	#                                 a[4]a[2]
+	#                             a[5]a[2]
+	#                         a[6]a[2]
+	#                     a[7]a[2]
+	#                             a[4]a[3]
+	#                         a[5]a[3]
+	#                     a[6]a[3]
+	#                 a[7]a[3]
+	#                     a[5]a[4]
+	#                 a[6]a[4]
+	#             a[7]a[4]
+	#             a[6]a[5]
+	#         a[7]a[5]
+	#     a[7]a[6]
+	#                                                         a[0]a[0]
+	#                                                 a[1]a[1]
+	#                                         a[2]a[2]
+	#                                 a[3]a[3]
+	#                         a[4]a[4]
+	#                 a[5]a[5]
+	#         a[6]a[6]
+	# a[7]a[7]
+
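+	#
+	# In other words (sketch, with B = 2^64):
+	#
+	#	a^2 = 2 * sum_{i<j} a[i]*a[j] * B^(i+j)  +  sum_i a[i]^2 * B^(2*i)
+	#
+	# Pass a) below accumulates the cross products sum_{i<j} a[i]*a[j]
+	# into t[]; pass b) (.Lsqr4x_shift_n_add) doubles t[] one word at a
+	# time, carrying the shifted-out bit in "shift", and folds in the
+	# a[i]^2 diagonal terms.
+	#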
+	lea	32(%r10),$i		# $i=-($num-32)
+	lea	($aptr,$num),$aptr	# end of a[] buffer, ($aptr,$i)=&ap[2]
+
+	mov	$num,$j			# $j=$num
+
+					# comments apply to $num==8 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mul	$a0			# a[1]*a[0]
+	mov	%rax,$A0[0]		# a[1]*a[0]
+	 mov	$ai,%rax		# a[2]
+	mov	%rdx,$A0[1]
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+	mov	%rdx,$A0[0]
+
+
+	 mov	-8($aptr,$i),$ai	# a[3]
+	mul	$a1			# a[2]*a[1]
+	mov	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[1]
+
+	 lea	($i),$j
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[3]
+	jmp	.Lsqr4x_1st
+
+.align	32
+.Lsqr4x_1st:
+	 mov	($aptr,$j),$ai		# a[4]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[0]
+	adc	\$0,$A1[0]
+
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	 mov	8($aptr,$j),$ai		# a[5]
+	mov	%rdx,$A0[0]
+	adc	\$0,$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+
+
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	 mov	$A0[1],($tptr,$j)	# t[4]
+	mov	%rdx,$A1[1]
+	adc	\$0,$A1[1]
+
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	 mov	16($aptr,$j),$ai	# a[6]
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]		# a[5]*a[3]+t[6]
+	 mov	$ai,%rax
+	 mov	$A0[0],8($tptr,$j)	# t[5]
+	mov	%rdx,$A1[0]
+	adc	\$0,$A1[0]
+
+	mul	$a0			# a[6]*a[2]
+	add	%rax,$A0[1]		# a[6]*a[2]+a[5]*a[3]+t[6]
+	 mov	$ai,%rax		# a[3]
+	 mov	24($aptr,$j),$ai	# a[7]
+	mov	%rdx,$A0[0]
+	adc	\$0,$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+
+
+	mul	$a1			# a[6]*a[5]
+	add	%rax,$A1[0]		# a[6]*a[5]+t[7]
+	 mov	$ai,%rax
+	 mov	$A0[1],16($tptr,$j)	# t[6]
+	mov	%rdx,$A1[1]
+	adc	\$0,$A1[1]
+	 lea	32($j),$j
+
+	mul	$a0			# a[7]*a[4]
+	add	%rax,$A0[0]		# a[7]*a[4]+a[6]*a[5]+t[6]
+	 mov	$ai,%rax
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[7]
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_1st
+
+	mul	$a1			# a[7]*a[5]
+	add	%rax,$A1[1]
+	lea	16($i),$i
+	adc	\$0,%rdx
+	add	$A0[1],$A1[1]
+	adc	\$0,%rdx
+
+	mov	$A1[1],($tptr)		# t[8]
+	mov	%rdx,$A1[0]
+	mov	%rdx,8($tptr)		# t[9]
+	jmp	.Lsqr4x_outer
+
+.align	32
+.Lsqr4x_outer:				# comments apply to $num==6 case
+	mov	-32($aptr,$i),$a0	# a[0]
+	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr,$i),%rax	# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr,$i),$ai	# a[2]
+	mov	%rax,$a1
+
+	mul	$a0			# a[1]*a[0]
+	mov	-24($tptr,$i),$A0[0]	# t[1]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1]
+	 mov	$ai,%rax		# a[2]
+	adc	\$0,%rdx
+	mov	$A0[0],-24($tptr,$i)	# t[1]
+	mov	%rdx,$A0[1]
+
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+	add	-16($tptr,$i),$A0[1]	# a[2]*a[0]+t[2]
+	mov	%rdx,$A0[0]
+	adc	\$0,$A0[0]
+	mov	$A0[1],-16($tptr,$i)	# t[2]
+
+	xor	$A1[0],$A1[0]
+
+	 mov	-8($aptr,$i),$ai	# a[3]
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+	add	-8($tptr,$i),$A1[0]
+	mov	%rdx,$A1[1]
+	adc	\$0,$A1[1]
+
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+	add	$A1[0],$A0[0]
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	mov	$A0[0],-8($tptr,$i)	# t[3]
+
+	lea	($i),$j
+	jmp	.Lsqr4x_inner
+
+.align	32
+.Lsqr4x_inner:
+	 mov	($aptr,$j),$ai		# a[4]
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]		# a[3]*a[1]+t[4]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[0]
+	adc	\$0,$A1[0]
+	add	($tptr,$j),$A1[1]
+	adc	\$0,$A1[0]
+
+	.byte	0x67
+	mul	$a0			# a[4]*a[0]
+	add	%rax,$A0[1]		# a[4]*a[0]+a[3]*a[1]+t[4]
+	 mov	$ai,%rax		# a[3]
+	 mov	8($aptr,$j),$ai		# a[5]
+	mov	%rdx,$A0[0]
+	adc	\$0,$A0[0]
+	add	$A1[1],$A0[1]
+	adc	\$0,$A0[0]
+
+	mul	$a1			# a[4]*a[3]
+	add	%rax,$A1[0]		# a[4]*a[3]+t[5]
+	mov	$A0[1],($tptr,$j)	# t[4]
+	 mov	$ai,%rax
+	mov	%rdx,$A1[1]
+	adc	\$0,$A1[1]
+	add	8($tptr,$j),$A1[0]
+	lea	16($j),$j		# j++
+	adc	\$0,$A1[1]
+
+	mul	$a0			# a[5]*a[2]
+	add	%rax,$A0[0]		# a[5]*a[2]+a[4]*a[3]+t[5]
+	 mov	$ai,%rax
+	adc	\$0,%rdx
+	add	$A1[0],$A0[0]
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	mov	$A0[0],-8($tptr,$j)	# t[5], "preloaded t[1]" below
+
+	cmp	\$0,$j
+	jne	.Lsqr4x_inner
+
+	.byte	0x67
+	mul	$a1			# a[5]*a[3]
+	add	%rax,$A1[1]
+	adc	\$0,%rdx
+	add	$A0[1],$A1[1]
+	adc	\$0,%rdx
+
+	mov	$A1[1],($tptr)		# t[6], "preloaded t[2]" below
+	mov	%rdx,$A1[0]
+	mov	%rdx,8($tptr)		# t[7], "preloaded t[3]" below
+
+	add	\$16,$i
+	jnz	.Lsqr4x_outer
+
+					# comments apply to $num==4 case
+	mov	-32($aptr),$a0		# a[0]
+	lea	48+8(%rsp,$num,2),$tptr	# end of tp[] buffer, &tp[2*$num]
+	mov	-24($aptr),%rax		# a[1]
+	lea	-32($tptr,$i),$tptr	# end of tp[] window, &tp[2*$num-"$i"]
+	mov	-16($aptr),$ai		# a[2]
+	mov	%rax,$a1
+
+	mul	$a0			# a[1]*a[0]
+	add	%rax,$A0[0]		# a[1]*a[0]+t[1], preloaded t[1]
+	 mov	$ai,%rax		# a[2]
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+
+	mul	$a0			# a[2]*a[0]
+	add	%rax,$A0[1]
+	 mov	$ai,%rax
+	 mov	$A0[0],-24($tptr)	# t[1]
+	mov	%rdx,$A0[0]
+	adc	\$0,$A0[0]
+	add	$A1[1],$A0[1]		# a[2]*a[0]+t[2], preloaded t[2]
+	 mov	-8($aptr),$ai		# a[3]
+	adc	\$0,$A0[0]
+
+	mul	$a1			# a[2]*a[1]
+	add	%rax,$A1[0]		# a[2]*a[1]+t[3], preloaded t[3]
+	 mov	$ai,%rax
+	 mov	$A0[1],-16($tptr)	# t[2]
+	mov	%rdx,$A1[1]
+	adc	\$0,$A1[1]
+
+	mul	$a0			# a[3]*a[0]
+	add	%rax,$A0[0]		# a[3]*a[0]+a[2]*a[1]+t[3]
+	 mov	$ai,%rax
+	mov	%rdx,$A0[1]
+	adc	\$0,$A0[1]
+	add	$A1[0],$A0[0]
+	adc	\$0,$A0[1]
+	mov	$A0[0],-8($tptr)	# t[3]
+
+	mul	$a1			# a[3]*a[1]
+	add	%rax,$A1[1]
+	 mov	-16($aptr),%rax		# a[2]
+	adc	\$0,%rdx
+	add	$A0[1],$A1[1]
+	adc	\$0,%rdx
+
+	mov	$A1[1],($tptr)		# t[4]
+	mov	%rdx,$A1[0]
+	mov	%rdx,8($tptr)		# t[5]
+
+	mul	$ai			# a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+	 add	\$16,$i
+	 xor	$shift,$shift
+	 sub	$num,$i			# $i=16-$num
+	 xor	$carry,$carry
+
+	add	$A1[0],%rax		# t[5]
+	adc	\$0,%rdx
+	mov	%rax,8($tptr)		# t[5]
+	mov	%rdx,16($tptr)		# t[6]
+	mov	$carry,24($tptr)	# t[7]
+
+	 mov	-16($aptr,$i),%rax	# a[0]
+	lea	48+8(%rsp),$tptr
+	 xor	$A0[0],$A0[0]		# t[0]
+	 mov	8($tptr),$A0[1]		# t[1]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],8($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],16($tptr)
+	adc	%rdx,$S[3]
+	lea	16($i),$i
+	mov	$S[3],24($tptr)
+	sbb	$carry,$carry		# mov cf,$carry
+	lea	64($tptr),$tptr
+	jmp	.Lsqr4x_shift_n_add
+
+.align	32
+.Lsqr4x_shift_n_add:
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],-24($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	0($tptr),$A0[0]		# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	8($tptr),$A0[1]		# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	0($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],-16($tptr)
+	adc	%rdx,$S[3]
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	 mov	$S[3],-8($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	24($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	8($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[0],0($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1 | shift
+	 mov	$S[1],8($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	 mov	32($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	40($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[2]
+	 mov	16($aptr,$i),%rax	# a[i+1]	# prefetch
+	mov	$S[2],16($tptr)
+	adc	%rdx,$S[3]
+	mov	$S[3],24($tptr)
+	sbb	$carry,$carry		# mov cf,$carry
+	lea	64($tptr),$tptr
+	add	\$32,$i
+	jnz	.Lsqr4x_shift_n_add
+
+	lea	($shift,$A0[0],2),$S[0]	# t[2*i]<<1 | shift
+	.byte	0x67
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[1]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[1]		# | t[2*i]>>63
+	 mov	-16($tptr),$A0[0]	# t[2*i+2]	# prefetch
+	mov	$A0[1],$shift		# shift=t[2*i+1]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	 mov	-8($tptr),$A0[1]	# t[2*i+2+1]	# prefetch
+	adc	%rax,$S[0]
+	 mov	-8($aptr),%rax		# a[i+1]	# prefetch
+	mov	$S[0],-32($tptr)
+	adc	%rdx,$S[1]
+
+	lea	($shift,$A0[0],2),$S[2]	# t[2*i]<<1|shift
+	 mov	$S[1],-24($tptr)
+	 sbb	$carry,$carry		# mov cf,$carry
+	shr	\$63,$A0[0]
+	lea	($j,$A0[1],2),$S[3]	# t[2*i+1]<<1 |
+	shr	\$63,$A0[1]
+	or	$A0[0],$S[3]		# | t[2*i]>>63
+	mul	%rax			# a[i]*a[i]
+	neg	$carry			# mov $carry,cf
+	adc	%rax,$S[2]
+	adc	%rdx,$S[3]
+	mov	$S[2],-16($tptr)
+	mov	$S[3],-8($tptr)
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
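+#
+# In outline (a minimal sketch; B = 2^64, n[] has num words, t[] holds the
+# 2*num-word square, and n0 = -n[0]^-1 mod B from the BN_MONT_CTX):
+#
+#	for (i = 0; i < num; i++) {
+#		m = t[i] * n0 mod B;
+#		t[i..] += m * n[];	# zeroes t[i], carry ripples upward
+#	}
+#	result = t[num..2*num-1];	# final conditional subtraction of n[]
+#					# happens in the post-condition code
+#
+# The code below processes eight words of t[] per outer pass and stashes
+# the eight m values ("n0*a[i]") on the stack so the tail loops can replay
+# them against the remaining words of n[].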
+{
+my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx");
+
+$code.=<<___;
+	movq	%xmm2,$nptr
+sqr8x_reduction:
+	xor	%rax,%rax
+	lea	($nptr,$num,2),%rcx	# end of n[]
+	lea	48+8(%rsp,$num,2),%rdx	# end of t[] buffer
+	mov	%rcx,0+8(%rsp)
+	lea	48+8(%rsp,$num),$tptr	# end of initial t[] window
+	mov	%rdx,8+8(%rsp)
+	neg	$num
+	jmp	.L8x_reduction_loop
+
+.align	32
+.L8x_reduction_loop:
+	lea	($tptr,$num),$tptr	# start of current t[] window
+	.byte	0x66
+	mov	8*0($tptr),$m0
+	mov	8*1($tptr),%r9
+	mov	8*2($tptr),%r10
+	mov	8*3($tptr),%r11
+	mov	8*4($tptr),%r12
+	mov	8*5($tptr),%r13
+	mov	8*6($tptr),%r14
+	mov	8*7($tptr),%r15
+	mov	%rax,(%rdx)		# store top-most carry bit
+	lea	8*8($tptr),$tptr
+
+	.byte	0x67
+	mov	$m0,%r8
+	imulq	32+8(%rsp),$m0		# n0*a[0]
+	mov	16*0($nptr),%rax	# n[0]
+	mov	\$8,%ecx
+	jmp	.L8x_reduce
+
+.align	32
+.L8x_reduce:
+	mulq	$m0
+	 mov	16*1($nptr),%rax	# n[1]
+	neg	%r8
+	mov	%rdx,%r8
+	adc	\$0,%r8
+
+	mulq	$m0
+	add	%rax,%r9
+	 mov	16*2($nptr),%rax
+	adc	\$0,%rdx
+	add	%r9,%r8
+	 mov	$m0,48-8+8(%rsp,%rcx,8)	# put aside n0*a[i]
+	mov	%rdx,%r9
+	adc	\$0,%r9
+
+	mulq	$m0
+	add	%rax,%r10
+	 mov	16*3($nptr),%rax
+	adc	\$0,%rdx
+	add	%r10,%r9
+	 mov	32+8(%rsp),$carry	# pull n0, borrow $carry
+	mov	%rdx,%r10
+	adc	\$0,%r10
+
+	mulq	$m0
+	add	%rax,%r11
+	 mov	16*4($nptr),%rax
+	adc	\$0,%rdx
+	 imulq	%r8,$carry		# modulo-scheduled
+	add	%r11,%r10
+	mov	%rdx,%r11
+	adc	\$0,%r11
+
+	mulq	$m0
+	add	%rax,%r12
+	 mov	16*5($nptr),%rax
+	adc	\$0,%rdx
+	add	%r12,%r11
+	mov	%rdx,%r12
+	adc	\$0,%r12
+
+	mulq	$m0
+	add	%rax,%r13
+	 mov	16*6($nptr),%rax
+	adc	\$0,%rdx
+	add	%r13,%r12
+	mov	%rdx,%r13
+	adc	\$0,%r13
+
+	mulq	$m0
+	add	%rax,%r14
+	 mov	16*7($nptr),%rax
+	adc	\$0,%rdx
+	add	%r14,%r13
+	mov	%rdx,%r14
+	adc	\$0,%r14
+
+	mulq	$m0
+	 mov	$carry,$m0		# n0*a[i]
+	add	%rax,%r15
+	 mov	16*0($nptr),%rax	# n[0]
+	adc	\$0,%rdx
+	add	%r15,%r14
+	mov	%rdx,%r15
+	adc	\$0,%r15
+
+	dec	%ecx
+	jnz	.L8x_reduce
+
+	lea	16*8($nptr),$nptr
+	xor	%rax,%rax
+	mov	8+8(%rsp),%rdx		# pull end of t[]
+	cmp	0+8(%rsp),$nptr		# end of n[]?
+	jae	.L8x_no_tail
+
+	.byte	0x66
+	add	8*0($tptr),%r8
+	adc	8*1($tptr),%r9
+	adc	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	sbb	$carry,$carry		# top carry
+
+	mov	48+56+8(%rsp),$m0	# pull n0*a[0]
+	mov	\$8,%ecx
+	mov	16*0($nptr),%rax
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail:
+	mulq	$m0
+	add	%rax,%r8
+	 mov	16*1($nptr),%rax
+	 mov	%r8,($tptr)		# save result
+	mov	%rdx,%r8
+	adc	\$0,%r8
+
+	mulq	$m0
+	add	%rax,%r9
+	 mov	16*2($nptr),%rax
+	adc	\$0,%rdx
+	add	%r9,%r8
+	 lea	8($tptr),$tptr		# $tptr++
+	mov	%rdx,%r9
+	adc	\$0,%r9
+
+	mulq	$m0
+	add	%rax,%r10
+	 mov	16*3($nptr),%rax
+	adc	\$0,%rdx
+	add	%r10,%r9
+	mov	%rdx,%r10
+	adc	\$0,%r10
+
+	mulq	$m0
+	add	%rax,%r11
+	 mov	16*4($nptr),%rax
+	adc	\$0,%rdx
+	add	%r11,%r10
+	mov	%rdx,%r11
+	adc	\$0,%r11
+
+	mulq	$m0
+	add	%rax,%r12
+	 mov	16*5($nptr),%rax
+	adc	\$0,%rdx
+	add	%r12,%r11
+	mov	%rdx,%r12
+	adc	\$0,%r12
+
+	mulq	$m0
+	add	%rax,%r13
+	 mov	16*6($nptr),%rax
+	adc	\$0,%rdx
+	add	%r13,%r12
+	mov	%rdx,%r13
+	adc	\$0,%r13
+
+	mulq	$m0
+	add	%rax,%r14
+	 mov	16*7($nptr),%rax
+	adc	\$0,%rdx
+	add	%r14,%r13
+	mov	%rdx,%r14
+	adc	\$0,%r14
+
+	mulq	$m0
+	 mov	48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i]
+	add	%rax,%r15
+	adc	\$0,%rdx
+	add	%r15,%r14
+	 mov	16*0($nptr),%rax	# pull n[0]
+	mov	%rdx,%r15
+	adc	\$0,%r15
+
+	dec	%ecx
+	jnz	.L8x_tail
+
+	lea	16*8($nptr),$nptr
+	mov	8+8(%rsp),%rdx		# pull end of t[]
+	cmp	0+8(%rsp),$nptr		# end of n[]?
+	jae	.L8x_tail_done		# break out of loop
+
+	 mov	48+56+8(%rsp),$m0	# pull n0*a[0]
+	neg	$carry
+	 mov	8*0($nptr),%rax		# pull n[0]
+	adc	8*0($tptr),%r8
+	adc	8*1($tptr),%r9
+	adc	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	sbb	$carry,$carry		# top carry
+
+	mov	\$8,%ecx
+	jmp	.L8x_tail
+
+.align	32
+.L8x_tail_done:
+	add	(%rdx),%r8		# can this overflow?
+	xor	%rax,%rax
+
+	neg	$carry
+.L8x_no_tail:
+	adc	8*0($tptr),%r8
+	adc	8*1($tptr),%r9
+	adc	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	adc	\$0,%rax		# top-most carry
+	 mov	-16($nptr),%rcx		# np[num-1]
+	 xor	$carry,$carry
+
+	movq	%xmm2,$nptr		# restore $nptr
+
+	mov	%r8,8*0($tptr)		# store top 512 bits
+	mov	%r9,8*1($tptr)
+	 movq	%xmm3,$num		# $num is %r9, can't be moved upwards
+	mov	%r10,8*2($tptr)
+	mov	%r11,8*3($tptr)
+	mov	%r12,8*4($tptr)
+	mov	%r13,8*5($tptr)
+	mov	%r14,8*6($tptr)
+	mov	%r15,8*7($tptr)
+	lea	8*8($tptr),$tptr
+
+	cmp	%rdx,$tptr		# end of t[]?
+	jb	.L8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
+{
+my ($tptr,$nptr)=("%rbx","%rbp");
+$code.=<<___;
+	#xor	%rsi,%rsi		# %rsi was $carry above
+	sub	%r15,%rcx		# compare top-most words
+	lea	(%rdi,$num),$tptr	# %rdi was $tptr above
+	adc	%rsi,%rsi
+	mov	$num,%rcx
+	or	%rsi,%rax
+	movq	%xmm1,$rptr		# restore $rptr
+	xor	\$1,%rax
+	movq	%xmm1,$aptr		# prepare for back-to-back call
+	lea	($nptr,%rax,8),$nptr
+	sar	\$3+2,%rcx		# cf=0
+	jmp	.Lsqr4x_sub
+
+.align	32
+.Lsqr4x_sub:
+	.byte	0x66
+	mov	8*0($tptr),%r12
+	mov	8*1($tptr),%r13
+	sbb	16*0($nptr),%r12
+	mov	8*2($tptr),%r14
+	sbb	16*1($nptr),%r13
+	mov	8*3($tptr),%r15
+	lea	8*4($tptr),$tptr
+	sbb	16*2($nptr),%r14
+	mov	%r12,8*0($rptr)
+	sbb	16*3($nptr),%r15
+	lea	16*4($nptr),$nptr
+	mov	%r13,8*1($rptr)
+	mov	%r14,8*2($rptr)
+	mov	%r15,8*3($rptr)
+	lea	8*4($rptr),$rptr
+
+	inc	%rcx			# pass %cf
+	jnz	.Lsqr4x_sub
+___
+}
+$code.=<<___;
+	mov	$num,%r10		# prepare for back-to-back call
+	neg	$num			# restore $num	
+	ret
+.size	bn_sqr8x_internal,.-bn_sqr8x_internal
+___
+{
+$code.=<<___;
+.globl	bn_from_montgomery
+.type	bn_from_montgomery,\@abi-omnipotent
+.align	32
+bn_from_montgomery:
+	testl	\$7,`($win64?"48(%rsp)":"%r9d")`
+	jz	bn_from_mont8x
+	xor	%eax,%eax
+	ret
+.size	bn_from_montgomery,.-bn_from_montgomery
+
+.type	bn_from_mont8x,\@function,6
+.align	32
+bn_from_mont8x:
+	.byte	0x67
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	.byte	0x67
+	mov	${num}d,%r10d
+	shl	\$3,${num}d		# convert $num to bytes
+	shl	\$3+2,%r10d		# 4*$num
+	neg	$num
+	mov	($n0),$n0		# *n0
+
+	##############################################################
+	# ensure that stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic.
+	#
+	lea	-64(%rsp,$num,2),%r11
+	sub	$aptr,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lfrom_sp_alt
+	sub	%r11,%rsp		# align with $aptr
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	jmp	.Lfrom_sp_done
+
+.align	32
+.Lfrom_sp_alt:
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lfrom_sp_done:
+	and	\$-64,%rsp
+	mov	$num,%r10	
+	neg	$num
+
+	##############################################################
+	# Stack layout
+	#
+	# +0	saved $num, used in reduction section
+	# +8	&t[2*$num], used in reduction section
+	# +32	saved *n0
+	# +40	saved %rsp
+	# +48	t[2*$num]
+	#
+	mov	$n0,  32(%rsp)
+	mov	%rax, 40(%rsp)		# save original %rsp
+.Lfrom_body:
+	mov	$num,%r11
+	lea	48(%rsp),%rax
+	pxor	%xmm0,%xmm0
+	jmp	.Lmul_by_1
+
+.align	32
+.Lmul_by_1:
+	movdqu	($aptr),%xmm1
+	movdqu	16($aptr),%xmm2
+	movdqu	32($aptr),%xmm3
+	movdqa	%xmm0,(%rax,$num)
+	movdqu	48($aptr),%xmm4
+	movdqa	%xmm0,16(%rax,$num)
+	.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	# lea	64($aptr),$aptr
+	movdqa	%xmm1,(%rax)
+	movdqa	%xmm0,32(%rax,$num)
+	movdqa	%xmm2,16(%rax)
+	movdqa	%xmm0,48(%rax,$num)
+	movdqa	%xmm3,32(%rax)
+	movdqa	%xmm4,48(%rax)
+	lea	64(%rax),%rax
+	sub	\$64,%r11
+	jnz	.Lmul_by_1
+
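+	##############################################################
+	# At this point t[0..num-1] = a[] and t[num..2*num-1] = 0, so a
+	# single Montgomery reduction of t[] (sqr8x_reduction or
+	# sqrx8x_reduction below) yields a*R^-1 mod n, i.e. converts a[]
+	# out of Montgomery form.
+	#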
+	movq	$rptr,%xmm1
+	movq	$nptr,%xmm2
+	.byte	0x67
+	mov	$nptr,%rbp
+	movq	%r10, %xmm3		# -num
+___
+$code.=<<___ if ($addx);
+	mov	OPENSSL_ia32cap_P+8(%rip),%r11d
+	and	\$0x80100,%r11d
+	cmp	\$0x80100,%r11d
+	jne	.Lfrom_mont_nox
+
+	lea	(%rax,$num),$rptr
+	call	sqrx8x_reduction
+
+	pxor	%xmm0,%xmm0
+	lea	48(%rsp),%rax
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_nox:
+___
+$code.=<<___;
+	call	sqr8x_reduction
+
+	pxor	%xmm0,%xmm0
+	lea	48(%rsp),%rax
+	mov	40(%rsp),%rsi		# restore %rsp
+	jmp	.Lfrom_mont_zero
+
+.align	32
+.Lfrom_mont_zero:
+	movdqa	%xmm0,16*0(%rax)
+	movdqa	%xmm0,16*1(%rax)
+	movdqa	%xmm0,16*2(%rax)
+	movdqa	%xmm0,16*3(%rax)
+	lea	16*4(%rax),%rax
+	sub	\$32,$num
+	jnz	.Lfrom_mont_zero
+
+	mov	\$1,%rax
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lfrom_epilogue:
+	ret
+.size	bn_from_mont8x,.-bn_from_mont8x
+___
+}
+}}}
+
+if ($addx) {{{
+my $bp="%rdx";	# restore original value
+
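+#
+# This path is only taken when OPENSSL_ia32cap_P advertises both BMI2 and
+# ADX (the 0x80100 checks above); it reworks the same algorithms around
+# mulx/adcx/adox, whose two independent carry chains (CF and OF) let the
+# multiply-and-accumulate steps proceed without serializing on a single
+# carry flag.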
+$code.=<<___;
+.type	bn_mulx4x_mont_gather5,\@function,6
+.align	32
+bn_mulx4x_mont_gather5:
+.Lmulx4x_enter:
+	.byte	0x67
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	.byte	0x67
+	mov	${num}d,%r10d
+	shl	\$3,${num}d		# convert $num to bytes
+	shl	\$3+2,%r10d		# 4*$num
+	neg	$num			# -$num
+	mov	($n0),$n0		# *n0
+
+	##############################################################
+	# ensure that stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers a[num], ret[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic. [excessive frame is allocated in order
+	# to allow bn_from_mont8x to clear it.]
+	#
+	lea	-64(%rsp,$num,2),%r11
+	sub	$ap,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lmulx4xsp_alt
+	sub	%r11,%rsp		# align with $aptr
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
+	jmp	.Lmulx4xsp_done
+
+.align	32
+.Lmulx4xsp_alt:
+	lea	4096-64(,$num,2),%r10	# 4096-frame-$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+$num)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lmulx4xsp_done:	
+	and	\$-64,%rsp		# ensure alignment
+	##############################################################
+	# Stack layout
+	# +0	-num
+	# +8	off-loaded &b[i]
+	# +16	end of b[num]
+	# +24	inner counter
+	# +32	saved n0
+	# +40	saved %rsp
+	# +48
+	# +56	saved rp
+	# +64	tmp[num+1]
+	#
+	mov	$n0, 32(%rsp)		# save *n0
+	mov	%rax,40(%rsp)		# save original %rsp
+.Lmulx4x_body:
+	call	mulx4x_internal
+
+	mov	40(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	-88(%rsi),%xmm6
+	movaps	-72(%rsi),%xmm7
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lmulx4x_epilogue:
+	ret
+.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
+
+.type	mulx4x_internal,\@abi-omnipotent
+.align	32
+mulx4x_internal:
+	.byte	0x4c,0x89,0x8c,0x24,0x08,0x00,0x00,0x00	# mov	$num,8(%rsp)		# save -$num
+	.byte	0x67
+	neg	$num			# restore $num
+	shl	\$5,$num
+	lea	256($bp,$num),%r13
+	shr	\$5+5,$num
+	mov	`($win64?56:8)`(%rax),%r10d	# load 7th argument
+	sub	\$1,$num
+	mov	%r13,16+8(%rsp)		# end of b[num]
+	mov	$num,24+8(%rsp)		# inner counter
+	mov	$rp, 56+8(%rsp)		# save $rp
+___
+my ($aptr, $bptr, $nptr, $tptr, $mi,  $bi,  $zero, $num)=
+   ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
+my $rptr=$bptr;
+my $STRIDE=2**5*8;		# 5 is "window size"
+my $N=$STRIDE/4;		# should match cache line size
+$code.=<<___;
+	mov	%r10,%r11
+	shr	\$`log($N/8)/log(2)`,%r10
+	and	\$`$N/8-1`,%r11
+	not	%r10
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,%r10	# 5 is "window size"
+	lea	96($bp,%r11,8),$bptr	# pointer within 1st cache line
+	movq	0(%rax,%r10,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,%r10,8),%xmm5	# cache line contains element
+	add	\$7,%r11
+	movq	16(%rax,%r10,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,%r10,8),%xmm7
+	and	\$7,%r11
+
+	movq	`0*$STRIDE/4-96`($bptr),%xmm0
+	lea	$STRIDE($bptr),$tptr	# borrow $tptr
+	movq	`1*$STRIDE/4-96`($bptr),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bptr),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bptr),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	movq	`0*$STRIDE/4-96`($tptr),%xmm1
+	pand	%xmm7,%xmm3
+	por	%xmm2,%xmm0
+	movq	`1*$STRIDE/4-96`($tptr),%xmm2
+	por	%xmm3,%xmm0
+	.byte	0x67,0x67
+	pand	%xmm4,%xmm1
+	movq	`2*$STRIDE/4-96`($tptr),%xmm3
+
+	movq	%xmm0,%rdx		# bp[0]
+	movq	`3*$STRIDE/4-96`($tptr),%xmm0
+	lea	2*$STRIDE($bptr),$bptr	# next &b[i]
+	pand	%xmm5,%xmm2
+	.byte	0x67,0x67
+	pand	%xmm6,%xmm3
+	##############################################################
+	# $tptr is chosen so that writing to the top-most element of the
+	# vector occurs just "above" references to the powers table,
+	# "above" modulo cache-line size, which effectively precludes
+	# the possibility of memory disambiguation logic failure when
+	# accessing the table.
+	# 
+	lea	64+8*4+8(%rsp,%r11,8),$tptr
+
+	mov	%rdx,$bi
+	mulx	0*8($aptr),$mi,%rax	# a[0]*b[0]
+	mulx	1*8($aptr),%r11,%r12	# a[1]*b[0]
+	add	%rax,%r11
+	mulx	2*8($aptr),%rax,%r13	# ...
+	adc	%rax,%r12
+	adc	\$0,%r13
+	mulx	3*8($aptr),%rax,%r14
+
+	mov	$mi,%r15
+	imulq	32+8(%rsp),$mi		# "t[0]"*n0
+	xor	$zero,$zero		# cf=0, of=0
+	mov	$mi,%rdx
+
+	por	%xmm2,%xmm1
+	pand	%xmm7,%xmm0
+	por	%xmm3,%xmm1
+	mov	$bptr,8+8(%rsp)		# off-load &b[i]
+	por	%xmm1,%xmm0
+
+	.byte	0x48,0x8d,0xb6,0x20,0x00,0x00,0x00	# lea	4*8($aptr),$aptr
+	adcx	%rax,%r13
+	adcx	$zero,%r14		# cf=0
+
+	mulx	0*16($nptr),%rax,%r10
+	adcx	%rax,%r15		# discarded
+	adox	%r11,%r10
+	mulx	1*16($nptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+	mulx	2*16($nptr),%rax,%r12
+	mov	24+8(%rsp),$bptr	# counter value
+	.byte	0x66
+	mov	%r10,-8*4($tptr)
+	adcx	%rax,%r11
+	adox	%r13,%r12
+	mulx	3*16($nptr),%rax,%r15
+	 .byte	0x67,0x67
+	 mov	$bi,%rdx
+	mov	%r11,-8*3($tptr)
+	adcx	%rax,%r12
+	adox	$zero,%r15		# of=0
+	.byte	0x48,0x8d,0x89,0x40,0x00,0x00,0x00	# lea	4*16($nptr),$nptr
+	mov	%r12,-8*2($tptr)
+	#jmp	.Lmulx4x_1st
+
+.align	32
+.Lmulx4x_1st:
+	adcx	$zero,%r15		# cf=0, modulo-scheduled
+	mulx	0*8($aptr),%r10,%rax	# a[4]*b[0]
+	adcx	%r14,%r10
+	mulx	1*8($aptr),%r11,%r14	# a[5]*b[0]
+	adcx	%rax,%r11
+	mulx	2*8($aptr),%r12,%rax	# ...
+	adcx	%r14,%r12
+	mulx	3*8($aptr),%r13,%r14
+	 .byte	0x67,0x67
+	 mov	$mi,%rdx
+	adcx	%rax,%r13
+	adcx	$zero,%r14		# cf=0
+	lea	4*8($aptr),$aptr
+	lea	4*8($tptr),$tptr
+
+	adox	%r15,%r10
+	mulx	0*16($nptr),%rax,%r15
+	adcx	%rax,%r10
+	adox	%r15,%r11
+	mulx	1*16($nptr),%rax,%r15
+	adcx	%rax,%r11
+	adox	%r15,%r12
+	mulx	2*16($nptr),%rax,%r15
+	mov	%r10,-5*8($tptr)
+	adcx	%rax,%r12
+	mov	%r11,-4*8($tptr)
+	adox	%r15,%r13
+	mulx	3*16($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	mov	%r12,-3*8($tptr)
+	adcx	%rax,%r13
+	adox	$zero,%r15
+	lea	4*16($nptr),$nptr
+	mov	%r13,-2*8($tptr)
+
+	dec	$bptr			# of=0, pass cf
+	jnz	.Lmulx4x_1st
+
+	mov	8(%rsp),$num		# load -num
+	movq	%xmm0,%rdx		# bp[1]
+	adc	$zero,%r15		# modulo-scheduled
+	lea	($aptr,$num),$aptr	# rewind $aptr
+	add	%r15,%r14
+	mov	8+8(%rsp),$bptr		# re-load &b[i]
+	adc	$zero,$zero		# top-most carry
+	mov	%r14,-1*8($tptr)
+	jmp	.Lmulx4x_outer
+
+.align	32
+.Lmulx4x_outer:
+	mov	$zero,($tptr)		# save top-most carry
+	lea	4*8($tptr,$num),$tptr	# rewind $tptr
+	mulx	0*8($aptr),$mi,%r11	# a[0]*b[i]
+	xor	$zero,$zero		# cf=0, of=0
+	mov	%rdx,$bi
+	mulx	1*8($aptr),%r14,%r12	# a[1]*b[i]
+	adox	-4*8($tptr),$mi		# +t[0]
+	adcx	%r14,%r11
+	mulx	2*8($aptr),%r15,%r13	# ...
+	adox	-3*8($tptr),%r11
+	adcx	%r15,%r12
+	mulx	3*8($aptr),%rdx,%r14
+	adox	-2*8($tptr),%r12
+	adcx	%rdx,%r13
+	lea	($nptr,$num,2),$nptr	# rewind $nptr
+	lea	4*8($aptr),$aptr
+	adox	-1*8($tptr),%r13
+	adcx	$zero,%r14
+	adox	$zero,%r14
+
+	.byte	0x67
+	mov	$mi,%r15
+	imulq	32+8(%rsp),$mi		# "t[0]"*n0
+
+	movq	`0*$STRIDE/4-96`($bptr),%xmm0
+	.byte	0x67,0x67
+	mov	$mi,%rdx
+	movq	`1*$STRIDE/4-96`($bptr),%xmm1
+	.byte	0x67
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-96`($bptr),%xmm2
+	.byte	0x67
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-96`($bptr),%xmm3
+	add	\$$STRIDE,$bptr		# next &b[i]
+	.byte	0x67
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	xor	$zero,$zero		# cf=0, of=0
+	mov	$bptr,8+8(%rsp)		# off-load &b[i]
+
+	mulx	0*16($nptr),%rax,%r10
+	adcx	%rax,%r15		# discarded
+	adox	%r11,%r10
+	mulx	1*16($nptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+	mulx	2*16($nptr),%rax,%r12
+	adcx	%rax,%r11
+	adox	%r13,%r12
+	mulx	3*16($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	 por	%xmm2,%xmm0
+	mov	24+8(%rsp),$bptr	# counter value
+	mov	%r10,-8*4($tptr)
+	 por	%xmm3,%xmm0
+	adcx	%rax,%r12
+	mov	%r11,-8*3($tptr)
+	adox	$zero,%r15		# of=0
+	mov	%r12,-8*2($tptr)
+	lea	4*16($nptr),$nptr
+	jmp	.Lmulx4x_inner
+
+.align	32
+.Lmulx4x_inner:
+	mulx	0*8($aptr),%r10,%rax	# a[4]*b[i]
+	adcx	$zero,%r15		# cf=0, modulo-scheduled
+	adox	%r14,%r10
+	mulx	1*8($aptr),%r11,%r14	# a[5]*b[i]
+	adcx	0*8($tptr),%r10
+	adox	%rax,%r11
+	mulx	2*8($aptr),%r12,%rax	# ...
+	adcx	1*8($tptr),%r11
+	adox	%r14,%r12
+	mulx	3*8($aptr),%r13,%r14
+	 mov	$mi,%rdx
+	adcx	2*8($tptr),%r12
+	adox	%rax,%r13
+	adcx	3*8($tptr),%r13
+	adox	$zero,%r14		# of=0
+	lea	4*8($aptr),$aptr
+	lea	4*8($tptr),$tptr
+	adcx	$zero,%r14		# cf=0
+
+	adox	%r15,%r10
+	mulx	0*16($nptr),%rax,%r15
+	adcx	%rax,%r10
+	adox	%r15,%r11
+	mulx	1*16($nptr),%rax,%r15
+	adcx	%rax,%r11
+	adox	%r15,%r12
+	mulx	2*16($nptr),%rax,%r15
+	mov	%r10,-5*8($tptr)
+	adcx	%rax,%r12
+	adox	%r15,%r13
+	mov	%r11,-4*8($tptr)
+	mulx	3*16($nptr),%rax,%r15
+	 mov	$bi,%rdx
+	lea	4*16($nptr),$nptr
+	mov	%r12,-3*8($tptr)
+	adcx	%rax,%r13
+	adox	$zero,%r15
+	mov	%r13,-2*8($tptr)
+
+	dec	$bptr			# of=0, pass cf
+	jnz	.Lmulx4x_inner
+
+	mov	0+8(%rsp),$num		# load -num
+	movq	%xmm0,%rdx		# bp[i+1]
+	adc	$zero,%r15		# modulo-scheduled
+	sub	0*8($tptr),$bptr	# pull top-most carry to %cf
+	mov	8+8(%rsp),$bptr		# re-load &b[i]
+	mov	16+8(%rsp),%r10
+	adc	%r15,%r14
+	lea	($aptr,$num),$aptr	# rewind $aptr
+	adc	$zero,$zero		# top-most carry
+	mov	%r14,-1*8($tptr)
+
+	cmp	%r10,$bptr
+	jb	.Lmulx4x_outer
+
+	mov	-16($nptr),%r10
+	xor	%r15,%r15
+	sub	%r14,%r10		# compare top-most words
+	adc	%r15,%r15
+	or	%r15,$zero
+	xor	\$1,$zero
+	lea	($tptr,$num),%rdi	# rewind $tptr
+	lea	($nptr,$num,2),$nptr	# rewind $nptr
+	.byte	0x67,0x67
+	sar	\$3+2,$num		# cf=0
+	lea	($nptr,$zero,8),%rbp
+	mov	56+8(%rsp),%rdx		# restore rp
+	mov	$num,%rcx
+	jmp	.Lsqrx4x_sub		# common post-condition
+.size	mulx4x_internal,.-mulx4x_internal
+___
+}{
+######################################################################
+# void bn_powerx5(
+my $rptr="%rdi";	# BN_ULONG *rptr,
+my $aptr="%rsi";	# const BN_ULONG *aptr,
+my $bptr="%rdx";	# const void *table,
+my $nptr="%rcx";	# const BN_ULONG *nptr,
+my $n0  ="%r8";		# const BN_ULONG *n0);
+my $num ="%r9";		# int num, has to be divisible by 8
+			# int pwr);
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type	bn_powerx5,\@function,6
+.align	32
+bn_powerx5:
+.Lpowerx5_enter:
+	.byte	0x67
+	mov	%rsp,%rax
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+___
+$code.=<<___ if ($win64);
+	lea	-0x28(%rsp),%rsp
+	movaps	%xmm6,(%rsp)
+	movaps	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	.byte	0x67
+	mov	${num}d,%r10d
+	shl	\$3,${num}d		# convert $num to bytes
+	shl	\$3+2,%r10d		# 4*$num
+	neg	$num
+	mov	($n0),$n0		# *n0
+
+	##############################################################
+	# ensure that stack frame doesn't alias with $aptr+4*$num
+	# modulo 4096, which covers ret[num], am[num] and n[2*num]
+	# (see bn_exp.c). This is done to allow the memory disambiguation
+	# logic to do its magic.
+	#
+	lea	-64(%rsp,$num,2),%r11
+	sub	$aptr,%r11
+	and	\$4095,%r11
+	cmp	%r11,%r10
+	jb	.Lpwrx_sp_alt
+	sub	%r11,%rsp		# align with $aptr
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	jmp	.Lpwrx_sp_done
+
+.align	32
+.Lpwrx_sp_alt:
+	lea	4096-64(,$num,2),%r10	# 4096-frame-2*$num
+	lea	-64(%rsp,$num,2),%rsp	# alloca(frame+2*$num)
+	sub	%r10,%r11
+	mov	\$0,%r10
+	cmovc	%r10,%r11
+	sub	%r11,%rsp
+.Lpwrx_sp_done:
+	and	\$-64,%rsp
+	mov	$num,%r10	
+	neg	$num
+
+	##############################################################
+	# Stack layout
+	#
+	# +0	saved $num, used in reduction section
+	# +8	&t[2*$num], used in reduction section
+	# +16	intermediate carry bit
+	# +24	top-most carry bit, used in reduction section
+	# +32	saved *n0
+	# +40	saved %rsp
+	# +48	t[2*$num]
+	#
+	pxor	%xmm0,%xmm0
+	movq	$rptr,%xmm1		# save $rptr
+	movq	$nptr,%xmm2		# save $nptr
+	movq	%r10, %xmm3		# -$num
+	movq	$bptr,%xmm4
+	mov	$n0,  32(%rsp)
+	mov	%rax, 40(%rsp)		# save original %rsp
+.Lpowerx5_body:
+
+	call	__bn_sqrx8x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_sqrx8x_internal
+	call	__bn_sqrx8x_internal
+
+	mov	%r10,$num		# -num
+	mov	$aptr,$rptr
+	movq	%xmm2,$nptr
+	movq	%xmm4,$bptr
+	mov	40(%rsp),%rax
+
+	call	mulx4x_internal
+
+	mov	40(%rsp),%rsi		# restore %rsp
+	mov	\$1,%rax
+___
+$code.=<<___ if ($win64);
+	movaps	-88(%rsi),%xmm6
+	movaps	-72(%rsi),%xmm7
+___
+$code.=<<___;
+	mov	-48(%rsi),%r15
+	mov	-40(%rsi),%r14
+	mov	-32(%rsi),%r13
+	mov	-24(%rsi),%r12
+	mov	-16(%rsi),%rbp
+	mov	-8(%rsi),%rbx
+	lea	(%rsi),%rsp
+.Lpowerx5_epilogue:
+	ret
+.size	bn_powerx5,.-bn_powerx5
+
+.globl	bn_sqrx8x_internal
+.hidden	bn_sqrx8x_internal
+.type	bn_sqrx8x_internal,\@abi-omnipotent
+.align	32
+bn_sqrx8x_internal:
+__bn_sqrx8x_internal:
+	##################################################################
+	# Squaring part:
+	#
+	# a) multiply-n-add everything but a[i]*a[i];
+	# b) shift result of a) by 1 to the left and accumulate
+	#    a[i]*a[i] products;
+	#
+	##################################################################
+	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+	#                                                     a[1]a[0]
+	#                                                 a[2]a[0]
+	#                                             a[3]a[0]
+	#                                             a[2]a[1]
+	#                                         a[3]a[1]
+	#                                     a[3]a[2]
+	#
+	#                                         a[4]a[0]
+	#                                     a[5]a[0]
+	#                                 a[6]a[0]
+	#                             a[7]a[0]
+	#                                     a[4]a[1]
+	#                                 a[5]a[1]
+	#                             a[6]a[1]
+	#                         a[7]a[1]
+	#                                 a[4]a[2]
+	#                             a[5]a[2]
+	#                         a[6]a[2]
+	#                     a[7]a[2]
+	#                             a[4]a[3]
+	#                         a[5]a[3]
+	#                     a[6]a[3]
+	#                 a[7]a[3]
+	#
+	#                     a[5]a[4]
+	#                 a[6]a[4]
+	#             a[7]a[4]
+	#             a[6]a[5]
+	#         a[7]a[5]
+	#     a[7]a[6]
+	# a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0]
+___
+{
+my ($zero,$carry)=("%rbp","%rcx");
+my $aaptr=$zero;
+$code.=<<___;
+	lea	48+8(%rsp),$tptr
+	lea	($aptr,$num),$aaptr
+	mov	$num,0+8(%rsp)			# save $num
+	mov	$aaptr,8+8(%rsp)		# save end of $aptr
+	jmp	.Lsqr8x_zero_start
+
+.align	32
+.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
+.Lsqrx8x_zero:
+	.byte	0x3e
+	movdqa	%xmm0,0*8($tptr)
+	movdqa	%xmm0,2*8($tptr)
+	movdqa	%xmm0,4*8($tptr)
+	movdqa	%xmm0,6*8($tptr)
+.Lsqr8x_zero_start:			# aligned at 32
+	movdqa	%xmm0,8*8($tptr)
+	movdqa	%xmm0,10*8($tptr)
+	movdqa	%xmm0,12*8($tptr)
+	movdqa	%xmm0,14*8($tptr)
+	lea	16*8($tptr),$tptr
+	sub	\$64,$num
+	jnz	.Lsqrx8x_zero
+
+	mov	0*8($aptr),%rdx		# a[0], modulo-scheduled
+	#xor	%r9,%r9			# t[1], ex-$num, zero already
+	xor	%r10,%r10
+	xor	%r11,%r11
+	xor	%r12,%r12
+	xor	%r13,%r13
+	xor	%r14,%r14
+	xor	%r15,%r15
+	lea	48+8(%rsp),$tptr
+	xor	$zero,$zero		# cf=0, of=0
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_loop:
+	mulx	1*8($aptr),%r8,%rax	# a[1]*a[0]
+	adcx	%r9,%r8			# a[1]*a[0]+=t[1]
+	adox	%rax,%r10
+	mulx	2*8($aptr),%r9,%rax	# a[2]*a[0]
+	adcx	%r10,%r9
+	adox	%rax,%r11
+	.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	# mulx	3*8($aptr),%r10,%rax	# ...
+	adcx	%r11,%r10
+	adox	%rax,%r12
+	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	# mulx	4*8($aptr),%r11,%rax
+	adcx	%r12,%r11
+	adox	%rax,%r13
+	mulx	5*8($aptr),%r12,%rax
+	adcx	%r13,%r12
+	adox	%rax,%r14
+	mulx	6*8($aptr),%r13,%rax
+	adcx	%r14,%r13
+	adox	%r15,%rax
+	mulx	7*8($aptr),%r14,%r15
+	 mov	1*8($aptr),%rdx		# a[1]
+	adcx	%rax,%r14
+	adox	$zero,%r15
+	adc	8*8($tptr),%r15
+	mov	%r8,1*8($tptr)		# t[1]
+	mov	%r9,2*8($tptr)		# t[2]
+	sbb	$carry,$carry		# mov %cf,$carry
+	xor	$zero,$zero		# cf=0, of=0
+
+
+	mulx	2*8($aptr),%r8,%rbx	# a[2]*a[1]
+	mulx	3*8($aptr),%r9,%rax	# a[3]*a[1]
+	adcx	%r10,%r8
+	adox	%rbx,%r9
+	mulx	4*8($aptr),%r10,%rbx	# ...
+	adcx	%r11,%r9
+	adox	%rax,%r10
+	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	# mulx	5*8($aptr),%r11,%rax
+	adcx	%r12,%r10
+	adox	%rbx,%r11
+	.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r12,%rbx
+	adcx	%r13,%r11
+	adox	%r14,%r12
+	.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r13,%r14
+	 mov	2*8($aptr),%rdx		# a[2]
+	adcx	%rax,%r12
+	adox	%rbx,%r13
+	adcx	%r15,%r13
+	adox	$zero,%r14		# of=0
+	adcx	$zero,%r14		# cf=0
+
+	mov	%r8,3*8($tptr)		# t[3]
+	mov	%r9,4*8($tptr)		# t[4]
+
+	mulx	3*8($aptr),%r8,%rbx	# a[3]*a[2]
+	mulx	4*8($aptr),%r9,%rax	# a[4]*a[2]
+	adcx	%r10,%r8
+	adox	%rbx,%r9
+	mulx	5*8($aptr),%r10,%rbx	# ...
+	adcx	%r11,%r9
+	adox	%rax,%r10
+	.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	# mulx	6*8($aptr),%r11,%rax
+	adcx	%r12,%r10
+	adox	%r13,%r11
+	.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	# mulx	7*8($aptr),%r12,%r13
+	.byte	0x3e
+	 mov	3*8($aptr),%rdx		# a[3]
+	adcx	%rbx,%r11
+	adox	%rax,%r12
+	adcx	%r14,%r12
+	mov	%r8,5*8($tptr)		# t[5]
+	mov	%r9,6*8($tptr)		# t[6]
+	 mulx	4*8($aptr),%r8,%rax	# a[4]*a[3]
+	adox	$zero,%r13		# of=0
+	adcx	$zero,%r13		# cf=0
+
+	mulx	5*8($aptr),%r9,%rbx	# a[5]*a[3]
+	adcx	%r10,%r8
+	adox	%rax,%r9
+	mulx	6*8($aptr),%r10,%rax	# ...
+	adcx	%r11,%r9
+	adox	%r12,%r10
+	mulx	7*8($aptr),%r11,%r12
+	 mov	4*8($aptr),%rdx		# a[4]
+	 mov	5*8($aptr),%r14		# a[5]
+	adcx	%rbx,%r10
+	adox	%rax,%r11
+	 mov	6*8($aptr),%r15		# a[6]
+	adcx	%r13,%r11
+	adox	$zero,%r12		# of=0
+	adcx	$zero,%r12		# cf=0
+
+	mov	%r8,7*8($tptr)		# t[7]
+	mov	%r9,8*8($tptr)		# t[8]
+
+	mulx	%r14,%r9,%rax		# a[5]*a[4]
+	 mov	7*8($aptr),%r8		# a[7]
+	adcx	%r10,%r9
+	mulx	%r15,%r10,%rbx		# a[6]*a[4]
+	adox	%rax,%r10
+	adcx	%r11,%r10
+	mulx	%r8,%r11,%rax		# a[7]*a[4]
+	 mov	%r14,%rdx		# a[5]
+	adox	%rbx,%r11
+	adcx	%r12,%r11
+	#adox	$zero,%rax		# of=0
+	adcx	$zero,%rax		# cf=0
+
+	mulx	%r15,%r14,%rbx		# a[6]*a[5]
+	mulx	%r8,%r12,%r13		# a[7]*a[5]
+	 mov	%r15,%rdx		# a[6]
+	 lea	8*8($aptr),$aptr
+	adcx	%r14,%r11
+	adox	%rbx,%r12
+	adcx	%rax,%r12
+	adox	$zero,%r13
+
+	.byte	0x67,0x67
+	mulx	%r8,%r8,%r14		# a[7]*a[6]
+	adcx	%r8,%r13
+	adcx	$zero,%r14
+
+	cmp	8+8(%rsp),$aptr
+	je	.Lsqrx8x_outer_break
+
+	neg	$carry			# mov $carry,%cf
+	mov	\$-8,%rcx
+	mov	$zero,%r15
+	mov	8*8($tptr),%r8
+	adcx	9*8($tptr),%r9		# +=t[9]
+	adcx	10*8($tptr),%r10	# ...
+	adcx	11*8($tptr),%r11
+	adc	12*8($tptr),%r12
+	adc	13*8($tptr),%r13
+	adc	14*8($tptr),%r14
+	adc	15*8($tptr),%r15
+	lea	($aptr),$aaptr
+	lea	2*64($tptr),$tptr
+	sbb	%rax,%rax		# mov %cf,$carry
+
+	mov	-64($aptr),%rdx		# a[0]
+	mov	%rax,16+8(%rsp)		# offload $carry
+	mov	$tptr,24+8(%rsp)
+
+	#lea	8*8($tptr),$tptr	# see 2*8*8($tptr) above
+	xor	%eax,%eax		# cf=0, of=0
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_loop:
+	mov	%r8,%rbx
+	mulx	0*8($aaptr),%rax,%r8	# a[8]*a[i]
+	adcx	%rax,%rbx		# +=t[8]
+	adox	%r9,%r8
+
+	mulx	1*8($aaptr),%rax,%r9	# ...
+	adcx	%rax,%r8
+	adox	%r10,%r9
+
+	mulx	2*8($aaptr),%rax,%r10
+	adcx	%rax,%r9
+	adox	%r11,%r10
+
+	mulx	3*8($aaptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	# mulx	4*8($aaptr),%rax,%r12
+	adcx	%rax,%r11
+	adox	%r13,%r12
+
+	mulx	5*8($aaptr),%rax,%r13
+	adcx	%rax,%r12
+	adox	%r14,%r13
+
+	mulx	6*8($aaptr),%rax,%r14
+	 mov	%rbx,($tptr,%rcx,8)	# store t[8+i]
+	 mov	\$0,%ebx
+	adcx	%rax,%r13
+	adox	%r15,%r14
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	# mulx	7*8($aaptr),%rax,%r15
+	 mov	8($aptr,%rcx,8),%rdx	# a[i]
+	adcx	%rax,%r14
+	adox	%rbx,%r15		# %rbx is 0, of=0
+	adcx	%rbx,%r15		# cf=0
+
+	.byte	0x67
+	inc	%rcx			# of=0
+	jnz	.Lsqrx8x_loop
+
+	lea	8*8($aaptr),$aaptr
+	mov	\$-8,%rcx
+	cmp	8+8(%rsp),$aaptr	# done?
+	je	.Lsqrx8x_break
+
+	sub	16+8(%rsp),%rbx		# mov 16(%rsp),%cf
+	.byte	0x66
+	mov	-64($aptr),%rdx
+	adcx	0*8($tptr),%r8
+	adcx	1*8($tptr),%r9
+	adc	2*8($tptr),%r10
+	adc	3*8($tptr),%r11
+	adc	4*8($tptr),%r12
+	adc	5*8($tptr),%r13
+	adc	6*8($tptr),%r14
+	adc	7*8($tptr),%r15
+	lea	8*8($tptr),$tptr
+	.byte	0x67
+	sbb	%rax,%rax		# mov %cf,%rax
+	xor	%ebx,%ebx		# cf=0, of=0
+	mov	%rax,16+8(%rsp)		# offload carry
+	jmp	.Lsqrx8x_loop
+
+.align	32
+.Lsqrx8x_break:
+	sub	16+8(%rsp),%r8		# consume last carry
+	mov	24+8(%rsp),$carry	# initial $tptr, borrow $carry
+	mov	0*8($aptr),%rdx		# a[8], modulo-scheduled
+	xor	%ebp,%ebp		# xor	$zero,$zero
+	mov	%r8,0*8($tptr)
+	cmp	$carry,$tptr		# cf=0, of=0
+	je	.Lsqrx8x_outer_loop
+
+	mov	%r9,1*8($tptr)
+	 mov	1*8($carry),%r9
+	mov	%r10,2*8($tptr)
+	 mov	2*8($carry),%r10
+	mov	%r11,3*8($tptr)
+	 mov	3*8($carry),%r11
+	mov	%r12,4*8($tptr)
+	 mov	4*8($carry),%r12
+	mov	%r13,5*8($tptr)
+	 mov	5*8($carry),%r13
+	mov	%r14,6*8($tptr)
+	 mov	6*8($carry),%r14
+	mov	%r15,7*8($tptr)
+	 mov	7*8($carry),%r15
+	mov	$carry,$tptr
+	jmp	.Lsqrx8x_outer_loop
+
+.align	32
+.Lsqrx8x_outer_break:
+	mov	%r9,9*8($tptr)		# t[9]
+	 movq	%xmm3,%rcx		# -$num
+	mov	%r10,10*8($tptr)	# ...
+	mov	%r11,11*8($tptr)
+	mov	%r12,12*8($tptr)
+	mov	%r13,13*8($tptr)
+	mov	%r14,14*8($tptr)
+___
+}{
+my $i="%rcx";
+$code.=<<___;
+	lea	48+8(%rsp),$tptr
+	mov	($aptr,$i),%rdx		# a[0]
+
+	mov	8($tptr),$A0[1]		# t[1]
+	xor	$A0[0],$A0[0]		# t[0], of=0, cf=0
+	mov	0+8(%rsp),$num		# restore $num
+	adox	$A0[1],$A0[1]
+	 mov	16($tptr),$A1[0]	# t[2]	# prefetch
+	 mov	24($tptr),$A1[1]	# t[3]	# prefetch
+	#jmp	.Lsqrx4x_shift_n_add	# happens to be aligned
+
+.align	32
+.Lsqrx4x_shift_n_add:
+	mulx	%rdx,%rax,%rbx
+	 adox	$A1[0],$A1[0]
+	adcx	$A0[0],%rax
+	 .byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	# mov	8($aptr,$i),%rdx	# a[i+1]	# prefetch
+	 .byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	# mov	32($tptr),$A0[0]	# t[2*i+4]	# prefetch
+	 adox	$A1[1],$A1[1]
+	adcx	$A0[1],%rbx
+	 mov	40($tptr),$A0[1]		# t[2*i+4+1]	# prefetch
+	mov	%rax,0($tptr)
+	mov	%rbx,8($tptr)
+
+	mulx	%rdx,%rax,%rbx
+	 adox	$A0[0],$A0[0]
+	adcx	$A1[0],%rax
+	 mov	16($aptr,$i),%rdx	# a[i+2]	# prefetch
+	 mov	48($tptr),$A1[0]	# t[2*i+6]	# prefetch
+	 adox	$A0[1],$A0[1]
+	adcx	$A1[1],%rbx
+	 mov	56($tptr),$A1[1]	# t[2*i+6+1]	# prefetch
+	mov	%rax,16($tptr)
+	mov	%rbx,24($tptr)
+
+	mulx	%rdx,%rax,%rbx
+	 adox	$A1[0],$A1[0]
+	adcx	$A0[0],%rax
+	 mov	24($aptr,$i),%rdx	# a[i+3]	# prefetch
+	 lea	32($i),$i
+	 mov	64($tptr),$A0[0]	# t[2*i+8]	# prefetch
+	 adox	$A1[1],$A1[1]
+	adcx	$A0[1],%rbx
+	 mov	72($tptr),$A0[1]	# t[2*i+8+1]	# prefetch
+	mov	%rax,32($tptr)
+	mov	%rbx,40($tptr)
+
+	mulx	%rdx,%rax,%rbx
+	 adox	$A0[0],$A0[0]
+	adcx	$A1[0],%rax
+	jrcxz	.Lsqrx4x_shift_n_add_break
+	 .byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	# mov	0($aptr,$i),%rdx	# a[i+4]	# prefetch
+	 adox	$A0[1],$A0[1]
+	adcx	$A1[1],%rbx
+	 mov	80($tptr),$A1[0]	# t[2*i+10]	# prefetch
+	 mov	88($tptr),$A1[1]	# t[2*i+10+1]	# prefetch
+	mov	%rax,48($tptr)
+	mov	%rbx,56($tptr)
+	lea	64($tptr),$tptr
+	nop
+	jmp	.Lsqrx4x_shift_n_add
+
+.align	32
+.Lsqrx4x_shift_n_add_break:
+	adcx	$A1[1],%rbx
+	mov	%rax,48($tptr)
+	mov	%rbx,56($tptr)
+	lea	64($tptr),$tptr		# end of t[] buffer
+___
+}
+######################################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+# This new path is inspired by multiple submissions from Intel, by
+# Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford,
+# Vinodh Gopal...
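+#
+# Same word-by-word reduction as sqr8x_reduction above (m = t[i]*n0 mod
+# 2^64, then t[] += m*n[]), reworked around mulx and the twin carry
+# chains; the eight m values are parked at 64+48+8(%rsp,...) for the
+# tail loops.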
+{
+my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx");
+
+$code.=<<___;
+	movq	%xmm2,$nptr
+sqrx8x_reduction:
+	xor	%eax,%eax		# initial top-most carry bit
+	mov	32+8(%rsp),%rbx		# n0
+	mov	48+8(%rsp),%rdx		# "%r8", 8*0($tptr)
+	lea	-128($nptr,$num,2),%rcx	# end of n[]
+	#lea	48+8(%rsp,$num,2),$tptr	# end of t[] buffer
+	mov	%rcx, 0+8(%rsp)		# save end of n[]
+	mov	$tptr,8+8(%rsp)		# save end of t[]
+
+	lea	48+8(%rsp),$tptr		# initial t[] window
+	jmp	.Lsqrx8x_reduction_loop
+
+.align	32
+.Lsqrx8x_reduction_loop:
+	mov	8*1($tptr),%r9
+	mov	8*2($tptr),%r10
+	mov	8*3($tptr),%r11
+	mov	8*4($tptr),%r12
+	mov	%rdx,%r8
+	imulq	%rbx,%rdx		# n0*a[i]
+	mov	8*5($tptr),%r13
+	mov	8*6($tptr),%r14
+	mov	8*7($tptr),%r15
+	mov	%rax,24+8(%rsp)		# store top-most carry bit
+
+	lea	8*8($tptr),$tptr
+	xor	$carry,$carry		# cf=0,of=0
+	mov	\$-8,%rcx
+	jmp	.Lsqrx8x_reduce
+
+.align	32
+.Lsqrx8x_reduce:
+	mov	%r8, %rbx
+	mulx	16*0($nptr),%rax,%r8	# n[0]
+	adcx	%rbx,%rax		# discarded
+	adox	%r9,%r8
+
+	mulx	16*1($nptr),%rbx,%r9	# n[1]
+	adcx	%rbx,%r8
+	adox	%r10,%r9
+
+	mulx	16*2($nptr),%rbx,%r10
+	adcx	%rbx,%r9
+	adox	%r11,%r10
+
+	mulx	16*3($nptr),%rbx,%r11
+	adcx	%rbx,%r10
+	adox	%r12,%r11
+
+	.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rbx,%r12
+	 mov	%rdx,%rax
+	 mov	%r8,%rdx
+	adcx	%rbx,%r11
+	adox	%r13,%r12
+
+	 mulx	32+8(%rsp),%rbx,%rdx	# %rdx discarded
+	 mov	%rax,%rdx
+	 mov	%rax,64+48+8(%rsp,%rcx,8)	# put aside n0*a[i]
+
+	mulx	16*5($nptr),%rax,%r13
+	adcx	%rax,%r12
+	adox	%r14,%r13
+
+	mulx	16*6($nptr),%rax,%r14
+	adcx	%rax,%r13
+	adox	%r15,%r14
+
+	mulx	16*7($nptr),%rax,%r15
+	 mov	%rbx,%rdx
+	adcx	%rax,%r14
+	adox	$carry,%r15		# $carry is 0
+	adcx	$carry,%r15		# cf=0
+
+	.byte	0x67,0x67,0x67
+	inc	%rcx			# of=0
+	jnz	.Lsqrx8x_reduce
+
+	mov	$carry,%rax		# xor	%rax,%rax
+	cmp	0+8(%rsp),$nptr		# end of n[]?
+	jae	.Lsqrx8x_no_tail
+
+	mov	48+8(%rsp),%rdx		# pull n0*a[0]
+	add	8*0($tptr),%r8
+	lea	16*8($nptr),$nptr
+	mov	\$-8,%rcx
+	adcx	8*1($tptr),%r9
+	adcx	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	lea	8*8($tptr),$tptr
+	sbb	%rax,%rax		# top carry
+
+	xor	$carry,$carry		# of=0, cf=0
+	mov	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail:
+	mov	%r8,%rbx
+	mulx	16*0($nptr),%rax,%r8
+	adcx	%rax,%rbx
+	adox	%r9,%r8
+
+	mulx	16*1($nptr),%rax,%r9
+	adcx	%rax,%r8
+	adox	%r10,%r9
+
+	mulx	16*2($nptr),%rax,%r10
+	adcx	%rax,%r9
+	adox	%r11,%r10
+
+	mulx	16*3($nptr),%rax,%r11
+	adcx	%rax,%r10
+	adox	%r12,%r11
+
+	.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x40,0x00,0x00,0x00	# mulx	16*4($nptr),%rax,%r12
+	adcx	%rax,%r11
+	adox	%r13,%r12
+
+	mulx	16*5($nptr),%rax,%r13
+	adcx	%rax,%r12
+	adox	%r14,%r13
+
+	mulx	16*6($nptr),%rax,%r14
+	adcx	%rax,%r13
+	adox	%r15,%r14
+
+	mulx	16*7($nptr),%rax,%r15
+	 mov	72+48+8(%rsp,%rcx,8),%rdx	# pull n0*a[i]
+	adcx	%rax,%r14
+	adox	$carry,%r15
+	 mov	%rbx,($tptr,%rcx,8)	# save result
+	 mov	%r8,%rbx
+	adcx	$carry,%r15		# cf=0
+
+	inc	%rcx			# of=0
+	jnz	.Lsqrx8x_tail
+
+	cmp	0+8(%rsp),$nptr		# end of n[]?
+	jae	.Lsqrx8x_tail_done	# break out of loop
+
+	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
+	 mov	48+8(%rsp),%rdx		# pull n0*a[0]
+	 lea	16*8($nptr),$nptr
+	adc	8*0($tptr),%r8
+	adc	8*1($tptr),%r9
+	adc	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	lea	8*8($tptr),$tptr
+	sbb	%rax,%rax
+	sub	\$8,%rcx		# mov	\$-8,%rcx
+
+	xor	$carry,$carry		# of=0, cf=0
+	mov	%rax,16+8(%rsp)
+	jmp	.Lsqrx8x_tail
+
+.align	32
+.Lsqrx8x_tail_done:
+	add	24+8(%rsp),%r8		# can this overflow?
+	mov	$carry,%rax		# xor	%rax,%rax
+
+	sub	16+8(%rsp),$carry	# mov 16(%rsp),%cf
+.Lsqrx8x_no_tail:			# %cf is 0 if jumped here
+	adc	8*0($tptr),%r8
+	 movq	%xmm3,%rcx
+	adc	8*1($tptr),%r9
+	 mov	16*7($nptr),$carry
+	 movq	%xmm2,$nptr		# restore $nptr
+	adc	8*2($tptr),%r10
+	adc	8*3($tptr),%r11
+	adc	8*4($tptr),%r12
+	adc	8*5($tptr),%r13
+	adc	8*6($tptr),%r14
+	adc	8*7($tptr),%r15
+	adc	%rax,%rax		# top-most carry
+
+	mov	32+8(%rsp),%rbx		# n0
+	mov	8*8($tptr,%rcx),%rdx	# modulo-scheduled "%r8"
+
+	mov	%r8,8*0($tptr)		# store top 512 bits
+	 lea	8*8($tptr),%r8		# borrow %r8
+	mov	%r9,8*1($tptr)
+	mov	%r10,8*2($tptr)
+	mov	%r11,8*3($tptr)
+	mov	%r12,8*4($tptr)
+	mov	%r13,8*5($tptr)
+	mov	%r14,8*6($tptr)
+	mov	%r15,8*7($tptr)
+
+	lea	8*8($tptr,%rcx),$tptr	# start of current t[] window
+	cmp	8+8(%rsp),%r8		# end of t[]?
+	jb	.Lsqrx8x_reduction_loop
+___
+}
+##############################################################
+# Post-condition, 4x unrolled
+#
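# [Editor's note, not part of the patch: the block below implements the
#  standard Montgomery post-condition.  After the reduction loop the result
#  is at most one modulus too large, so the 4x-unrolled loop subtracts n[]
#  once; the comparison of the top-most words, together with the top carry,
#  decides whether the modulus actually gets subtracted from the value
#  written to rp, without a secret-dependent branch.]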
+{
+my ($rptr,$nptr)=("%rdx","%rbp");
+my @ri=map("%r$_",(10..13));
+my @ni=map("%r$_",(14..15));
+$code.=<<___;
+	xor	%rbx,%rbx
+	sub	%r15,%rsi		# compare top-most words
+	adc	%rbx,%rbx
+	mov	%rcx,%r10		# -$num
+	.byte	0x67
+	or	%rbx,%rax
+	.byte	0x67
+	mov	%rcx,%r9		# -$num
+	xor	\$1,%rax
+	sar	\$3+2,%rcx		# cf=0
+	#lea	48+8(%rsp,%r9),$tptr
+	lea	($nptr,%rax,8),$nptr
+	movq	%xmm1,$rptr		# restore $rptr
+	movq	%xmm1,$aptr		# prepare for back-to-back call
+	jmp	.Lsqrx4x_sub
+
+.align	32
+.Lsqrx4x_sub:
+	.byte	0x66
+	mov	8*0($tptr),%r12
+	mov	8*1($tptr),%r13
+	sbb	16*0($nptr),%r12
+	mov	8*2($tptr),%r14
+	sbb	16*1($nptr),%r13
+	mov	8*3($tptr),%r15
+	lea	8*4($tptr),$tptr
+	sbb	16*2($nptr),%r14
+	mov	%r12,8*0($rptr)
+	sbb	16*3($nptr),%r15
+	lea	16*4($nptr),$nptr
+	mov	%r13,8*1($rptr)
+	mov	%r14,8*2($rptr)
+	mov	%r15,8*3($rptr)
+	lea	8*4($rptr),$rptr
+
+	inc	%rcx
+	jnz	.Lsqrx4x_sub
+___
+}
+$code.=<<___;
+	neg	%r9			# restore $num
+
+	ret
+.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
+___
+}}}
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order
+				("%rdi","%esi","%rdx","%ecx");  # Unix order
+my $out=$inp;
+my $STRIDE=2**5*8;
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl	bn_get_bits5
+.type	bn_get_bits5,\@abi-omnipotent
+.align	16
+bn_get_bits5:
+	mov	$inp,%r10
+	mov	$num,%ecx
+	shr	\$3,$num
+	movzw	(%r10,$num),%eax
+	and	\$7,%ecx
+	shrl	%cl,%eax
+	and	\$31,%eax
+	ret
+.size	bn_get_bits5,.-bn_get_bits5
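# [Editor's note, illustration only: bn_get_bits5 returns the 5-bit window of
#  the little-endian number at |inp| that starts at bit |num|.  A rough C
#  equivalent of the code above (the 16-bit load means it may read one byte
#  beyond the byte holding bit |num|):
#
#      static unsigned get_bits5(const uint8_t *inp, unsigned num) {
#        unsigned w = (unsigned)inp[num >> 3] |
#                     ((unsigned)inp[(num >> 3) + 1] << 8);
#        return (w >> (num & 7)) & 31;  /* shift to bit 0, keep 5 bits */
#      }
# ]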
+
+.globl	bn_scatter5
+.type	bn_scatter5,\@abi-omnipotent
+.align	16
+bn_scatter5:
+	cmp	\$0, $num
+	jz	.Lscatter_epilogue
+	lea	($tbl,$idx,8),$tbl
+.Lscatter:
+	mov	($inp),%rax
+	lea	8($inp),$inp
+	mov	%rax,($tbl)
+	lea	32*8($tbl),$tbl
+	sub	\$1,$num
+	jnz	.Lscatter
+.Lscatter_epilogue:
+	ret
+.size	bn_scatter5,.-bn_scatter5
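# [Editor's note, illustration only: bn_scatter5 stores |num| words of |inp|
#  into column |idx| of the table, one word per 32-word (256-byte) row, which
#  is the layout bn_gather5 below reads back.  Roughly, in C:
#
#      static void scatter5(const BN_ULONG *inp, size_t num,
#                           BN_ULONG *tbl, size_t idx) {
#        size_t i;
#        for (i = 0; i < num; i++) {
#          tbl[idx + 32 * i] = inp[i];
#        }
#      }
# ]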
+
+.globl	bn_gather5
+.type	bn_gather5,\@abi-omnipotent
+.align	16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+	# I can't trust assembler to use specific encoding:-(
+	.byte	0x48,0x83,0xec,0x28		#sub	\$0x28,%rsp
+	.byte	0x0f,0x29,0x34,0x24		#movaps	%xmm6,(%rsp)
+	.byte	0x0f,0x29,0x7c,0x24,0x10	#movdqa	%xmm7,0x10(%rsp)
+___
+$code.=<<___;
+	mov	$idx,%r11d
+	shr	\$`log($N/8)/log(2)`,$idx
+	and	\$`$N/8-1`,%r11
+	not	$idx
+	lea	.Lmagic_masks(%rip),%rax
+	and	\$`2**5/($N/8)-1`,$idx	# 5 is "window size"
+	lea	128($tbl,%r11,8),$tbl	# pointer within 1st cache line
+	movq	0(%rax,$idx,8),%xmm4	# set of masks denoting which
+	movq	8(%rax,$idx,8),%xmm5	# cache line contains element
+	movq	16(%rax,$idx,8),%xmm6	# denoted by 7th argument
+	movq	24(%rax,$idx,8),%xmm7
+	jmp	.Lgather
+.align	16
+.Lgather:
+	movq	`0*$STRIDE/4-128`($tbl),%xmm0
+	movq	`1*$STRIDE/4-128`($tbl),%xmm1
+	pand	%xmm4,%xmm0
+	movq	`2*$STRIDE/4-128`($tbl),%xmm2
+	pand	%xmm5,%xmm1
+	movq	`3*$STRIDE/4-128`($tbl),%xmm3
+	pand	%xmm6,%xmm2
+	por	%xmm1,%xmm0
+	pand	%xmm7,%xmm3
+	.byte	0x67,0x67
+	por	%xmm2,%xmm0
+	lea	$STRIDE($tbl),$tbl
+	por	%xmm3,%xmm0
+
+	movq	%xmm0,($out)		# m0=bp[0]
+	lea	8($out),$out
+	sub	\$1,$num
+	jnz	.Lgather
+___
+$code.=<<___ if ($win64);
+	movaps	(%rsp),%xmm6
+	movaps	0x10(%rsp),%xmm7
+	lea	0x28(%rsp),%rsp
+___
+$code.=<<___;
+	ret
+.LSEH_end_bn_gather5:
+.size	bn_gather5,.-bn_gather5
+___
+}
+$code.=<<___;
+.align	64
+.Lmagic_masks:
+	.long	0,0, 0,0, 0,0, -1,-1
+	.long	0,0, 0,0, 0,0,  0,0
+.asciz	"Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
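# [Editor's note, illustration only: bn_gather5 reads table entry |idx|
#  (0..31) back out of the bn_scatter5 layout with a fixed cache-line access
#  pattern: for each output word it loads one candidate from every 64-byte
#  quarter of the 256-byte row and keeps only the one selected by the
#  .Lmagic_masks entry, so the set of cache lines touched does not depend on
#  the secret index.  Approximately, in C (the mask is a ternary here for
#  clarity; the real code builds it branchlessly with SSE):
#
#      static void gather5(BN_ULONG *out, size_t num,
#                          const BN_ULONG *tbl, size_t idx) {
#        size_t i, j;
#        for (i = 0; i < num; i++) {
#          BN_ULONG word = 0;
#          for (j = 0; j < 4; j++) {
#            BN_ULONG mask = (idx >> 3) == j ? (BN_ULONG)-1 : 0;
#            word |= tbl[32 * i + 8 * j + (idx & 7)] & mask;
#          }
#          out[i] = word;
#        }
#      }
# ]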
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern	__imp_RtlVirtualUnwind
+.type	mul_handler,\@abi-omnipotent
+.align	16
+mul_handler:
+	push	%rsi
+	push	%rdi
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	pushfq
+	sub	\$64,%rsp
+
+	mov	120($context),%rax	# pull context->Rax
+	mov	248($context),%rbx	# pull context->Rip
+
+	mov	8($disp),%rsi		# disp->ImageBase
+	mov	56($disp),%r11		# disp->HandlerData
+
+	mov	0(%r11),%r10d		# HandlerData[0]
+	lea	(%rsi,%r10),%r10	# end of prologue label
+	cmp	%r10,%rbx		# context->Rip<end of prologue label
+	jb	.Lcommon_seh_tail
+
+	mov	152($context),%rax	# pull context->Rsp
+
+	mov	4(%r11),%r10d		# HandlerData[1]
+	lea	(%rsi,%r10),%r10	# epilogue label
+	cmp	%r10,%rbx		# context->Rip>=epilogue label
+	jae	.Lcommon_seh_tail
+
+	lea	.Lmul_epilogue(%rip),%r10
+	cmp	%r10,%rbx
+	jb	.Lbody_40
+
+	mov	192($context),%r10	# pull $num
+	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
+	jmp	.Lbody_proceed
+
+.Lbody_40:
+	mov	40(%rax),%rax		# pull saved stack pointer
+.Lbody_proceed:
+
+	movaps	-88(%rax),%xmm0
+	movaps	-72(%rax),%xmm1
+
+	mov	-8(%rax),%rbx
+	mov	-16(%rax),%rbp
+	mov	-24(%rax),%r12
+	mov	-32(%rax),%r13
+	mov	-40(%rax),%r14
+	mov	-48(%rax),%r15
+	mov	%rbx,144($context)	# restore context->Rbx
+	mov	%rbp,160($context)	# restore context->Rbp
+	mov	%r12,216($context)	# restore context->R12
+	mov	%r13,224($context)	# restore context->R13
+	mov	%r14,232($context)	# restore context->R14
+	mov	%r15,240($context)	# restore context->R15
+	movups	%xmm0,512($context)	# restore context->Xmm6
+	movups	%xmm1,528($context)	# restore context->Xmm7
+
+.Lcommon_seh_tail:
+	mov	8(%rax),%rdi
+	mov	16(%rax),%rsi
+	mov	%rax,152($context)	# restore context->Rsp
+	mov	%rsi,168($context)	# restore context->Rsi
+	mov	%rdi,176($context)	# restore context->Rdi
+
+	mov	40($disp),%rdi		# disp->ContextRecord
+	mov	$context,%rsi		# context
+	mov	\$154,%ecx		# sizeof(CONTEXT)
+	.long	0xa548f3fc		# cld; rep movsq
+
+	mov	$disp,%rsi
+	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
+	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
+	mov	0(%rsi),%r8		# arg3, disp->ControlPc
+	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
+	mov	40(%rsi),%r10		# disp->ContextRecord
+	lea	56(%rsi),%r11		# &disp->HandlerData
+	lea	24(%rsi),%r12		# &disp->EstablisherFrame
+	mov	%r10,32(%rsp)		# arg5
+	mov	%r11,40(%rsp)		# arg6
+	mov	%r12,48(%rsp)		# arg7
+	mov	%rcx,56(%rsp)		# arg8, (NULL)
+	call	*__imp_RtlVirtualUnwind(%rip)
+
+	mov	\$1,%eax		# ExceptionContinueSearch
+	add	\$64,%rsp
+	popfq
+	pop	%r15
+	pop	%r14
+	pop	%r13
+	pop	%r12
+	pop	%rbp
+	pop	%rbx
+	pop	%rdi
+	pop	%rsi
+	ret
+.size	mul_handler,.-mul_handler
+
+.section	.pdata
+.align	4
+	.rva	.LSEH_begin_bn_mul_mont_gather5
+	.rva	.LSEH_end_bn_mul_mont_gather5
+	.rva	.LSEH_info_bn_mul_mont_gather5
+
+	.rva	.LSEH_begin_bn_mul4x_mont_gather5
+	.rva	.LSEH_end_bn_mul4x_mont_gather5
+	.rva	.LSEH_info_bn_mul4x_mont_gather5
+
+	.rva	.LSEH_begin_bn_power5
+	.rva	.LSEH_end_bn_power5
+	.rva	.LSEH_info_bn_power5
+
+	.rva	.LSEH_begin_bn_from_mont8x
+	.rva	.LSEH_end_bn_from_mont8x
+	.rva	.LSEH_info_bn_from_mont8x
+___
+$code.=<<___ if ($addx);
+	.rva	.LSEH_begin_bn_mulx4x_mont_gather5
+	.rva	.LSEH_end_bn_mulx4x_mont_gather5
+	.rva	.LSEH_info_bn_mulx4x_mont_gather5
+
+	.rva	.LSEH_begin_bn_powerx5
+	.rva	.LSEH_end_bn_powerx5
+	.rva	.LSEH_info_bn_powerx5
+___
+$code.=<<___;
+	.rva	.LSEH_begin_bn_gather5
+	.rva	.LSEH_end_bn_gather5
+	.rva	.LSEH_info_bn_gather5
+
+.section	.xdata
+.align	8
+.LSEH_info_bn_mul_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul_body,.Lmul_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_mul4x_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmul4x_body,.Lmul4x_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_power5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lpower5_body,.Lpower5_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_from_mont8x:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lfrom_body,.Lfrom_epilogue		# HandlerData[]
+___
+$code.=<<___ if ($addx);
+.align	8
+.LSEH_info_bn_mulx4x_mont_gather5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lmulx4x_body,.Lmulx4x_epilogue		# HandlerData[]
+.align	8
+.LSEH_info_bn_powerx5:
+	.byte	9,0,0,0
+	.rva	mul_handler
+	.rva	.Lpowerx5_body,.Lpowerx5_epilogue	# HandlerData[]
+___
+$code.=<<___;
+.align	8
+.LSEH_info_bn_gather5:
+	.byte	0x01,0x0d,0x05,0x00
+	.byte	0x0d,0x78,0x01,0x00	#movaps	0x10(rsp),xmm7
+	.byte	0x08,0x68,0x00,0x00	#movaps	(rsp),xmm6
+	.byte	0x04,0x42,0x00,0x00	#sub	rsp,0x28
+.align	8
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
diff --git a/crypto/bn/bn.c b/crypto/bn/bn.c
new file mode 100644
index 0000000..a996a47
--- /dev/null
+++ b/crypto/bn/bn.c
@@ -0,0 +1,336 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <limits.h>
+#include <string.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+BIGNUM *BN_new(void) {
+  BIGNUM *bn = OPENSSL_malloc(sizeof(BIGNUM));
+
+  if (bn == NULL) {
+    OPENSSL_PUT_ERROR(BN, BN_new, ERR_R_MALLOC_FAILURE);
+    return NULL;
+  }
+
+  memset(bn, 0, sizeof(BIGNUM));
+  bn->flags = BN_FLG_MALLOCED;
+
+  return bn;
+}
+
+void BN_init(BIGNUM *bn) {
+  memset(bn, 0, sizeof(BIGNUM));
+}
+
+void BN_free(BIGNUM *bn) {
+  if (bn == NULL) {
+    return;
+  }
+
+  if (bn->d != NULL && (bn->flags & BN_FLG_STATIC_DATA) == 0) {
+    OPENSSL_free(bn->d);
+  }
+
+  if (bn->flags & BN_FLG_MALLOCED) {
+    OPENSSL_free(bn);
+  } else {
+    bn->d = NULL;
+  }
+}
+
+void BN_clear_free(BIGNUM *bn) {
+  char should_free;
+
+  if (bn == NULL) {
+    return;
+  }
+
+  if (bn->d != NULL) {
+    OPENSSL_cleanse(bn->d, bn->dmax * sizeof(bn->d[0]));
+    if ((bn->flags & BN_FLG_STATIC_DATA) == 0) {
+      OPENSSL_free(bn->d);
+    }
+  }
+
+  should_free = (bn->flags & BN_FLG_MALLOCED) != 0;
+  OPENSSL_cleanse(bn, sizeof(BIGNUM));
+  if (should_free) {
+    OPENSSL_free(bn);
+  }
+}
+
+BIGNUM *BN_dup(const BIGNUM *src) {
+  BIGNUM *copy;
+
+  if (src == NULL) {
+    return NULL;
+  }
+
+  copy = BN_new();
+  if (copy == NULL) {
+    return NULL;
+  }
+
+  if (!BN_copy(copy, src)) {
+    BN_free(copy);
+    return NULL;
+  }
+
+  return copy;
+}
+
+BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) {
+  if (src == dest) {
+    return dest;
+  }
+
+  if (bn_wexpand(dest, src->top) == NULL) {
+    return NULL;
+  }
+
+  memcpy(dest->d, src->d, sizeof(src->d[0]) * src->top);
+
+  dest->top = src->top;
+  dest->neg = src->neg;
+  return dest;
+}
+
+void BN_clear(BIGNUM *bn) {
+  if (bn->d != NULL) {
+    memset(bn->d, 0, bn->dmax * sizeof(bn->d[0]));
+  }
+
+  bn->top = 0;
+  bn->neg = 0;
+}
+
+const BIGNUM *BN_value_one(void) {
+  static const BN_ULONG data_one = 1;
+  static const BIGNUM const_one = {(BN_ULONG *)&data_one, 1, 1, 0,
+                                   BN_FLG_STATIC_DATA};
+
+  return &const_one;
+}
+
+void BN_with_flags(BIGNUM *out, const BIGNUM *in, int flags) {
+  memcpy(out, in, sizeof(BIGNUM));
+  out->flags &= ~BN_FLG_MALLOCED;
+  out->flags |= BN_FLG_STATIC_DATA | flags;
+}
+
+/* BN_num_bits_word returns the minimum number of bits needed to represent the
+ * value in |l|. */
+unsigned BN_num_bits_word(BN_ULONG l) {
+  static const unsigned char bits[256] = {
+      0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5,
+      5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+      6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+      7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+      7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8};
+
+#if defined(OPENSSL_64_BIT)
+  if (l & 0xffffffff00000000L) {
+    if (l & 0xffff000000000000L) {
+      if (l & 0xff00000000000000L) {
+        return (bits[(int)(l >> 56)] + 56);
+      } else
+        return (bits[(int)(l >> 48)] + 48);
+    } else {
+      if (l & 0x0000ff0000000000L) {
+        return (bits[(int)(l >> 40)] + 40);
+      } else
+        return (bits[(int)(l >> 32)] + 32);
+    }
+  } else
+#endif
+  {
+    if (l & 0xffff0000L) {
+      if (l & 0xff000000L) {
+        return (bits[(int)(l >> 24L)] + 24);
+      } else {
+        return (bits[(int)(l >> 16L)] + 16);
+      }
+    } else {
+      if (l & 0xff00L) {
+        return (bits[(int)(l >> 8)] + 8);
+      } else {
+        return (bits[(int)(l)]);
+      }
+    }
+  }
+}
+
+unsigned BN_num_bits(const BIGNUM *bn) {
+  const int max = bn->top - 1;
+
+  if (BN_is_zero(bn)) {
+    return 0;
+  }
+
+  return max*BN_BITS2 + BN_num_bits_word(bn->d[max]);
+}
+
+unsigned BN_num_bytes(const BIGNUM *bn) {
+  return (BN_num_bits(bn) + 7) / 8;
+}
+
+void BN_zero(BIGNUM *bn) {
+  bn->top = bn->neg = 0;
+}
+
+int BN_one(BIGNUM *bn) {
+  return BN_set_word(bn, 1);
+}
+
+int BN_set_word(BIGNUM *bn, BN_ULONG value) {
+  if (value == 0) {
+    BN_zero(bn);
+    return 1;
+  }
+
+  if (bn_wexpand(bn, 1) == NULL) {
+    return 0;
+  }
+
+  bn->neg = 0;
+  bn->d[0] = value;
+  bn->top = 1;
+  return 1;
+}
+
+int BN_is_negative(const BIGNUM *bn) {
+  return bn->neg != 0;
+}
+
+void BN_set_negative(BIGNUM *bn, int sign) {
+  if (sign && !BN_is_zero(bn)) {
+    bn->neg = 1;
+  } else {
+    bn->neg = 0;
+  }
+}
+
+BIGNUM *bn_wexpand(BIGNUM *bn, unsigned words) {
+  BN_ULONG *a;
+
+  if (words <= (unsigned) bn->dmax) {
+    return bn;
+  }
+
+  if (words > (INT_MAX / (4 * BN_BITS2))) {
+    OPENSSL_PUT_ERROR(BN, bn_wexpand, BN_R_BIGNUM_TOO_LONG);
+    return NULL;
+  }
+
+  if (bn->flags & BN_FLG_STATIC_DATA) {
+    OPENSSL_PUT_ERROR(BN, bn_wexpand, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA);
+    return NULL;
+  }
+
+  a = (BN_ULONG *)OPENSSL_malloc(sizeof(BN_ULONG) * words);
+  if (a == NULL) {
+    OPENSSL_PUT_ERROR(BN, bn_wexpand, ERR_R_MALLOC_FAILURE);
+    return NULL;
+  }
+
+  memcpy(a, bn->d, sizeof(BN_ULONG) * bn->top);
+
+  if (bn->d) {
+    OPENSSL_free(bn->d);
+  }
+  bn->d = a;
+  bn->dmax = words;
+
+  return bn;
+}
+
+BIGNUM *bn_expand(BIGNUM *bn, unsigned bits) {
+  return bn_wexpand(bn, (bits+BN_BITS2-1)/BN_BITS2);
+}
+
+void bn_correct_top(BIGNUM *bn) {
+  BN_ULONG *ftl;
+  int tmp_top = bn->top;
+
+  if (tmp_top > 0) {
+    for (ftl = &(bn->d[tmp_top - 1]); tmp_top > 0; tmp_top--) {
+      if (*(ftl--)) {
+        break;
+      }
+    }
+    bn->top = tmp_top;
+  }
+}
+
+int BN_get_flags(const BIGNUM *bn, int flags) {
+  return bn->flags & flags;
+}
+
+void BN_set_flags(BIGNUM *bn, int flags) {
+  bn->flags |= flags;
+}
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h
new file mode 100644
index 0000000..16ce65f
--- /dev/null
+++ b/crypto/bn/bn.h
@@ -0,0 +1,790 @@
+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#ifndef OPENSSL_HEADER_BN_H
+#define OPENSSL_HEADER_BN_H
+
+#include <openssl/base.h>
+
+#include <stdio.h>  /* for FILE* */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+
+/* BN provides support for working with arbitrary sized integers. For example,
+ * although the largest integer supported by the compiler might be 64 bits, BN
+ * will allow you to work with numbers until you run out of memory. */
+
+
+/* BN_ULONG is the native word size when working with big integers. */
+#if defined(OPENSSL_64_BIT)
+#define BN_ULONG uint64_t
+#define BN_BITS2 64
+#elif defined(OPENSSL_32_BIT)
+#define BN_ULONG uint32_t
+#define BN_BITS2 32
+#else
+#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
+#endif
+
+
+/* Allocation and freeing. */
+
+/* BN_new creates a new, allocated BIGNUM and initialises it. */
+BIGNUM *BN_new(void);
+
+/* BN_init initialises a stack allocated |BIGNUM|. */
+void BN_init(BIGNUM *bn);
+
+/* BN_free frees the data referenced by |bn| and, if |bn| was originally
+ * allocated on the heap, frees |bn| also. */
+void BN_free(BIGNUM *bn);
+
+/* BN_clear_free erases and frees the data referenced by |bn| and, if |bn| was
+ * originally allocated on the heap, frees |bn| also. */
+void BN_clear_free(BIGNUM *bn);
+
+/* BN_dup allocates a new BIGNUM and sets it equal to |src|. It returns the
+ * allocated BIGNUM on success or NULL otherwise. */
+BIGNUM *BN_dup(const BIGNUM *src);
+
+/* BN_copy sets |dest| equal to |src| and returns |dest|. */
+BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src);
+
+/* BN_clear sets |bn| to zero and erases the old data. */
+void BN_clear(BIGNUM *bn);
+
+/* BN_value_one returns a static BIGNUM with value 1. */
+const BIGNUM *BN_value_one(void);
+
+/* BN_with_flags initialises a stack allocated |BIGNUM| with pointers to the
+ * contents of |in| but with |flags| ORed into the flags field.
+ *
+ * Note: the two BIGNUMs share state and so |out| should /not/ be passed to
+ * |BN_free|. */
+void BN_with_flags(BIGNUM *out, const BIGNUM *in, int flags);
+
+
+/* Basic functions. */
+
+/* BN_num_bits returns the minimum number of bits needed to represent the
+ * absolute value of |bn|. */
+unsigned BN_num_bits(const BIGNUM *bn);
+
+/* BN_num_bytes returns the minimum number of bytes needed to represent the
+ * absolute value of |bn|. */
+unsigned BN_num_bytes(const BIGNUM *bn);
+
+/* BN_zero sets |bn| to zero. */
+void BN_zero(BIGNUM *bn);
+
+/* BN_one sets |bn| to one. It returns one on success or zero on allocation
+ * failure. */
+int BN_one(BIGNUM *bn);
+
+/* BN_set_word sets |bn| to |value|. It returns one on success or zero on
+ * allocation failure. */
+int BN_set_word(BIGNUM *bn, BN_ULONG value);
+
+/* BN_set_negative sets the sign of |bn|. */
+void BN_set_negative(BIGNUM *bn, int sign);
+
+/* BN_is_negative returns one if |bn| is negative and zero otherwise. */
+int BN_is_negative(const BIGNUM *bn);
+
+/* BN_get_flags returns |bn->flags| & |flags|. */
+int BN_get_flags(const BIGNUM *bn, int flags);
+
+/* BN_set_flags sets |flags| on |bn|. */
+void BN_set_flags(BIGNUM *bn, int flags);
+
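/* [Editorial sketch, illustration only, not part of this header: a typical
 * lifecycle using the functions declared above.] */
static int bn_basic_example(void) {
  unsigned bits, bytes;
  BIGNUM *a = BN_new();        /* heap-allocated; release with BN_free */
  if (a == NULL) {
    return 0;
  }
  if (!BN_set_word(a, 42)) {   /* a := 42 */
    BN_free(a);
    return 0;
  }
  bits = BN_num_bits(a);       /* 42 = 0b101010, so 6 bits ... */
  bytes = BN_num_bytes(a);     /* ... and (6 + 7) / 8 = 1 byte */
  BN_free(a);
  return bits == 6 && bytes == 1;
}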
+
+/* Conversion functions. */
+
+/* BN_bin2bn sets |*ret| to the value of |len| bytes from |in|, interpreted as
+ * a big-endian number, and returns |ret|. If |ret| is NULL then a fresh
+ * |BIGNUM| is allocated and returned. It returns NULL on allocation
+ * failure. */
+BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret);
+
+/* BN_bn2bin serialises the absolute value of |in| to |out| as a big-endian
+ * integer, which must have |BN_num_bytes| of space available. It returns the
+ * number of bytes written. */
+size_t BN_bn2bin(const BIGNUM *in, uint8_t *out);
+
+/* BN_bn2hex returns an allocated string that contains a NUL-terminated, hex
+ * representation of |bn|. If |bn| is negative, the first char in the resulting
+ * string will be '-'. Returns NULL on allocation failure. */
+char *BN_bn2hex(const BIGNUM *bn);
+
+/* BN_hex2bn parses the leading hex number from |in|, which may be preceded by
+ * a '-' to indicate a negative number and may contain trailing, non-hex data.
+ * If |outp| is not NULL, it constructs a BIGNUM equal to the hex number and
+ * stores it in |*outp|. If |*outp| is NULL then it allocates a new BIGNUM and
+ * updates |*outp|. It returns the number of bytes of |in| processed or zero on
+ * error. */
+int BN_hex2bn(BIGNUM **outp, const char *in);
+
+/* BN_bn2dec returns an allocated string that contains a NUL-terminated,
+ * decimal representation of |bn|. If |bn| is negative, the first char in the
+ * resulting string will be '-'. Returns NULL on allocation failure. */
+char *BN_bn2dec(const BIGNUM *a);
+
+/* BN_dec2bn parses the leading decimal number from |in|, which may be
+ * preceded by a '-' to indicate a negative number and may contain trailing,
+ * non-decimal data. If |outp| is not NULL, it constructs a BIGNUM equal to the
+ * decimal number and stores it in |*outp|. If |*outp| is NULL then it
+ * allocates a new BIGNUM and updates |*outp|. It returns the number of bytes
+ * of |in| processed or zero on error. */
+int BN_dec2bn(BIGNUM **outp, const char *in);
+
+/* BN_asc2bn acts like |BN_dec2bn| or |BN_hex2bn| depending on whether |in|
+ * begins with "0X" or "0x" (indicating hex) or not (indicating decimal). A
+ * leading '-' is still permitted and comes before the optional 0X/0x. It
+ * returns one on success or zero on error. */
+int BN_asc2bn(BIGNUM **outp, const char *in);
+
+/* BN_print writes a hex encoding of |a| to |bio|. It returns one on success
+ * and zero on error. */
+int BN_print(BIO *bio, const BIGNUM *a);
+
+/* BN_print_fp acts like |BN_print|, but wraps |fp| in a |BIO| first. */
+int BN_print_fp(FILE *fp, const BIGNUM *a);
+
+/* BN_get_word returns the absolute value of |bn| as a single word. If |bn| is
+ * too large to be represented as a single word, the maximum possible value
+ * will be returned. */
+BN_ULONG BN_get_word(const BIGNUM *bn);
+
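/* [Editorial sketch, illustration only: round-tripping a non-negative value
 * through the big-endian encoding with the conversion functions above.
 * OPENSSL_malloc/OPENSSL_free come from <openssl/mem.h>; BN_cmp is declared
 * further down in this header.] */
static int bn_serialise_example(const BIGNUM *in) {
  int ok = 0;
  unsigned len = BN_num_bytes(in);
  BIGNUM *copy = NULL;
  uint8_t *buf = OPENSSL_malloc(len ? len : 1);
  if (buf == NULL) {
    return 0;
  }
  BN_bn2bin(in, buf);                /* writes exactly |len| bytes */
  copy = BN_bin2bn(buf, len, NULL);  /* NULL: allocate a fresh BIGNUM */
  if (copy != NULL) {
    ok = BN_cmp(in, copy) == 0;      /* holds because |in| >= 0 */
  }
  BN_free(copy);
  OPENSSL_free(buf);
  return ok;
}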
+
+/* BIGNUM pools.
+ *
+ * Certain BIGNUM operations need to use many temporary variables and
+ * allocating and freeing them can be quite slow. Thus such operations typically
+ * take a |BN_CTX| parameter, which contains a pool of |BIGNUMs|. The |ctx|
+ * argument to a public function may be NULL, in which case a local |BN_CTX|
+ * will be created just for the lifetime of that call.
+ *
+ * A function must call |BN_CTX_start| first. Then, |BN_CTX_get| may be called
+ * repeatedly to obtain temporary |BIGNUM|s. All |BN_CTX_get| calls must be made
+ * before calling any other functions that use the |ctx| as an argument.
+ *
+ * Finally, |BN_CTX_end| must be called before returning from the function.
+ * When |BN_CTX_end| is called, the |BIGNUM| pointers obtained from
+ * |BN_CTX_get| become invalid. */
+
+/* BN_CTX_new returns a new, empty BN_CTX or NULL on allocation failure. */
+BN_CTX *BN_CTX_new(void);
+
+/* BN_CTX_free frees all BIGNUMs contained in |ctx| and then frees |ctx|
+ * itself. */
+void BN_CTX_free(BN_CTX *ctx);
+
+/* BN_CTX_start "pushes" a new entry onto the |ctx| stack and allows future
+ * calls to |BN_CTX_get|. */
+void BN_CTX_start(BN_CTX *ctx);
+
+/* BN_CTX_get returns a new |BIGNUM|, or NULL on allocation failure. Once
+ * |BN_CTX_get| has returned NULL, all future calls will also return NULL until
+ * |BN_CTX_end| is called. */
+BIGNUM *BN_CTX_get(BN_CTX *ctx);
+
+/* BN_CTX_end invalidates all |BIGNUM|s returned from |BN_CTX_get| since the
+ * matching |BN_CTX_start| call. */
+void BN_CTX_end(BN_CTX *ctx);
+
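/* [Editorial sketch, illustration only: the start/get/end discipline
 * described above.  All |BN_CTX_get| calls are made before |ctx| is used for
 * anything else, and |BN_CTX_end| releases the temporaries.  BN_mul and
 * BN_nnmod are declared further down in this header.] */
static int bn_ctx_example(BIGNUM *out, const BIGNUM *a, const BIGNUM *b,
                          const BIGNUM *m, BN_CTX *ctx) {
  int ok = 0;
  BIGNUM *tmp;
  BN_CTX_start(ctx);
  tmp = BN_CTX_get(ctx);             /* temporary owned by |ctx| */
  if (tmp == NULL) {
    goto err;
  }
  if (!BN_mul(tmp, a, b, ctx) ||     /* tmp := a*b */
      !BN_nnmod(out, tmp, m, ctx)) { /* out := tmp mod m, 0 <= out < m */
    goto err;
  }
  ok = 1;

err:
  BN_CTX_end(ctx);                   /* invalidates |tmp| */
  return ok;
}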
+
+/* Simple arithmetic */
+
+/* BN_add sets |r| = |a| + |b|, where |r| may be the same pointer as either |a|
+ * or |b|. It returns one on success and zero on allocation failure. */
+int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+
+/* BN_uadd sets |r| = |a| + |b|, where |a| and |b| are non-negative and |r| may
+ * be the same pointer as either |a| or |b|. It returns one on success and zero
+ * on allocation failure. */
+int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+
+/* BN_add_word adds |w| to |a|. It returns one on success and zero otherwise. */
+int BN_add_word(BIGNUM *a, BN_ULONG w);
+
+/* BN_sub sets |r| = |a| - |b|, where |r| must be a distinct pointer from |a|
+ * and |b|. It returns one on success and zero on allocation failure. */
+int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+
+/* BN_usub sets |r| = |a| - |b|, where |a| and |b| are non-negative integers,
+ * |b| < |a| and |r| must be a distinct pointer from |a| and |b|. It returns
+ * one on success and zero on allocation failure. */
+int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b);
+
+/* BN_sub_word subtracts |w| from |a|. It returns one on success and zero on
+ * allocation failure. */
+int BN_sub_word(BIGNUM *a, BN_ULONG w);
+
+/* BN_mul sets |r| = |a| * |b|, where |r| may be the same pointer as |a| or
+ * |b|. Returns one on success and zero otherwise. */
+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+
+/* BN_mul_word sets |bn| = |bn| * |w|. It returns one on success or zero on
+ * allocation failure. */
+int BN_mul_word(BIGNUM *bn, BN_ULONG w);
+
+/* BN_sqr sets |r| = |a|^2 (i.e. squares), where |r| may be the same pointer as
+ * |a|. Returns one on success and zero otherwise. This is more efficient than
+ * BN_mul(r, a, a, ctx). */
+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx);
+
+/* BN_div divides |numerator| by |divisor| and places the result in |quotient|
+ * and the remainder in |rem|. Either of |quotient| or |rem| may be NULL, in
+ * which case the respective value is not returned. The result is rounded
+ * towards zero; thus if |numerator| is negative, the remainder will be zero or
+ * negative. It returns one on success or zero on error. */
+int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator,
+           const BIGNUM *divisor, BN_CTX *ctx);
+
+/* BN_div_word sets |numerator| = |numerator|/|divisor| and returns the
+ * remainder or (BN_ULONG)-1 on error. */
+BN_ULONG BN_div_word(BIGNUM *numerator, BN_ULONG divisor);
+
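/* [Editorial sketch, illustration only: the division identity implied by the
 * comment above, numerator == quotient*divisor + rem, with the quotient
 * rounded towards zero.  BN_cmp is declared just below.] */
static int bn_div_example(const BIGNUM *numerator, const BIGNUM *divisor,
                          BN_CTX *ctx) {
  int ok = 0;
  BIGNUM *q = BN_new(), *rem = BN_new(), *t = BN_new();
  if (q == NULL || rem == NULL || t == NULL) {
    goto err;
  }
  if (!BN_div(q, rem, numerator, divisor, ctx) ||
      !BN_mul(t, q, divisor, ctx) ||  /* t := q*divisor */
      !BN_add(t, t, rem)) {           /* t := q*divisor + rem */
    goto err;
  }
  ok = BN_cmp(t, numerator) == 0;     /* always one for valid inputs */

err:
  BN_free(q);
  BN_free(rem);
  BN_free(t);
  return ok;
}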
+
+/* Comparison functions */
+
+/* BN_cmp returns a value less than, equal to or greater than zero if |a| is
+ * less than, equal to or greater than |b|, respectively. */
+int BN_cmp(const BIGNUM *a, const BIGNUM *b);
+
+/* BN_ucmp returns a value less than, equal to or greater than zero if the
+ * absolute value of |a| is less than, equal to or greater than the absolute
+ * value of |b|, respectively. */
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b);
+
+/* BN_abs_is_word returns one if the absolute value of |bn| equals |w| and zero
+ * otherwise. */
+int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w);
+
+/* BN_is_zero returns one if |bn| is zero and zero otherwise. */
+int BN_is_zero(const BIGNUM *bn);
+
+/* BN_is_one returns one if |bn| equals one and zero otherwise. */
+int BN_is_one(const BIGNUM *bn);
+
+/* BN_is_word returns one if |bn| is exactly |w| and zero otherwise. */
+int BN_is_word(const BIGNUM *bn, BN_ULONG w);
+
+/* BN_is_odd returns one if |bn| is odd and zero otherwise. */
+int BN_is_odd(const BIGNUM *bn);
+
+
+/* Bitwise operations. */
+
+/* BN_lshift sets |r| equal to |a| << n. The |a| and |r| arguments may be the
+ * same |BIGNUM|. It returns one on success and zero on allocation failure. */
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n);
+
+/* BN_lshift1 sets |r| equal to |a| << 1, where |r| and |a| may be the same
+ * pointer. It returns one on success and zero on allocation failure. */
+int BN_lshift1(BIGNUM *r, const BIGNUM *a);
+
+/* BN_rshift sets |r| equal to |a| >> n, where |r| and |a| may be the same
+ * pointer. It returns one on success and zero on allocation failure. */
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n);
+
+/* BN_rshift1 sets |r| equal to |a| >> 1, where |r| and |a| may be the same
+ * pointer. It returns one on success and zero on allocation failure. */
+int BN_rshift1(BIGNUM *r, const BIGNUM *a);
+
+/* BN_set_bit sets the |n|th, least-significant bit in |a|. For example, if |a|
+ * is 2 then setting bit zero will make it 3. It returns one on success or zero
+ * on allocation failure. */
+int BN_set_bit(BIGNUM *a, int n);
+
+/* BN_clear_bit clears the |n|th, least-significant bit in |a|. For example, if
+ * |a| is 3, clearing bit zero will make it two. It returns one on success or
+ * zero on allocation failure. */
+int BN_clear_bit(BIGNUM *a, int n);
+
+/* BN_is_bit_set returns the value of the |n|th, least-significant bit in |a|,
+ * or zero if the bit doesn't exist. */
+int BN_is_bit_set(const BIGNUM *a, int n);
+
+/* BN_mask_bits truncates |a| so that it is only |n| bits long. It returns one
+ * on success or zero if |n| is greater than the length of |a| already. */
+int BN_mask_bits(BIGNUM *a, int n);
+
+
+/* Modulo arithmetic. */
+
+/* BN_mod_word returns |a| mod |w|. */
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w);
+
+/* BN_mod is a helper macro that calls |BN_div| and discards the quotient. */
+#define BN_mod(rem, numerator, divisor, ctx) \
+  BN_div(NULL, (rem), (numerator), (divisor), (ctx))
+
+/* BN_nnmod is a non-negative modulo function. It acts like |BN_mod|, but 0 <=
+ * |rem| < |divisor| is always true. */
+int BN_nnmod(BIGNUM *rem, const BIGNUM *numerator, const BIGNUM *divisor,
+             BN_CTX *ctx);
+
+/* BN_mod_add sets |r| = |a| + |b| mod |m|. It returns one on success and zero
+ * on error. */
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+
+/* BN_mod_add_quick acts like |BN_mod_add| but requires that |a| and |b| be
+ * non-negative and less than |m|. */
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m);
+
+/* BN_mod_sub sets |r| = |a| - |b| mod |m|. It returns one on success and zero
+ * on error. */
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+
+/* BN_mod_sub_quick acts like |BN_mod_sub| but requires that |a| and |b| be
+ * non-negative and less than |m|. */
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m);
+
+/* BN_mod_mul sets |r| = |a|*|b| mod |m|. It returns one on success and zero
+ * on error. */
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx);
+
+/* BN_mod_sqr sets |r| = |a|^2 mod |m|. It returns one on success and zero
+ * on error. */
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+
+/* BN_mod_lshift sets |r| = (|a| << n) mod |m|, where |r| and |a| may be the
+ * same pointer. It returns one on success and zero on error. */
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                  BN_CTX *ctx);
+
+/* BN_mod_lshift_quick acts like |BN_mod_lshift| but requires that |a| be
+ * non-negative and less than |m|. */
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m);
+
+/* BN_mod_lshift1 sets |r| = (|a| << 1) mod |m|, where |r| and |a| may be the
+ * same pointer. It returns one on success and zero on error. */
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx);
+
+/* BN_mod_lshift1_quick acts like |BN_mod_lshift1| but requires that |a| be
+ * non-negative and less than |m|. */
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m);
+
+/* BN_mod_sqrt returns a |BIGNUM|, r, such that r^2 == a (mod p). */
+BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+
+
+/* Random and prime number generation. */
+
+/* BN_rand sets |rnd| to a random number of length |bits|. If |top| is zero,
+ * the most-significant bit will be set. If |top| is one, the two most
+ * significant bits will be set.
+ *
+ * If |top| is -1 then no extra action will be taken and |BN_num_bits(rnd)| may
+ * not equal |bits| if the most significant bits randomly ended up as zeros.
+ *
+ * If |bottom| is non-zero, the least-significant bit will be set. The function
+ * returns one on success or zero otherwise. */
+int BN_rand(BIGNUM *rnd, int bits, int top, int bottom);
+
+/* BN_pseudo_rand is an alias for |BN_rand|. */
+int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom);
+
+/* BN_rand_range sets |rnd| to a random value [0..range). It returns one on
+ * success and zero otherwise. */
+int BN_rand_range(BIGNUM *rnd, const BIGNUM *range);
+
+/* BN_pseudo_rand_range is an alias for BN_rand_range. */
+int BN_pseudo_rand_range(BIGNUM *rnd, const BIGNUM *range);
+
+/* BN_GENCB holds a callback function that is used by generation functions that
+ * can take a very long time to complete. Use |BN_GENCB_set| to initialise a
+ * |BN_GENCB| structure.
+ *
+ * The callback receives the address of that |BN_GENCB| structure as its last
+ * argument and the user is free to put an arbitrary pointer in |arg|. The other
+ * arguments are set as follows:
+ *   event=BN_GENCB_GENERATED, n=i:   after generating the i'th possible prime
+ *                                    number.
+ *   event=BN_GENCB_PRIME_TEST, n=-1: when finished trial division primality
+ *                                    checks.
+ *   event=BN_GENCB_PRIME_TEST, n=i:  when the i'th primality test has finished.
+ *
+ * The callback can return zero to abort the generation process or one to
+ * allow it to continue.
+ *
+ * Functions that support these progress callbacks take a |BN_GENCB| argument,
+ * which may be NULL, and invoke it as described above. */
+#define BN_GENCB_GENERATED 0
+#define BN_GENCB_PRIME_TEST 1
+
+struct bn_gencb_st {
+  void *arg;        /* callback-specific data */
+  int (*callback)(int event, int n, struct bn_gencb_st *);
+};
+
+/* BN_GENCB_set configures |callback| to call |f| and sets |callback->arg| to
+ * |arg|. */
+void BN_GENCB_set(BN_GENCB *callback,
+                  int (*f)(int event, int n, struct bn_gencb_st *),
+                  void *arg);
+
+/* BN_GENCB_call calls |callback|, if not NULL, and returns the return value of
+ * the callback, or 1 if |callback| is NULL. */
+int BN_GENCB_call(BN_GENCB *callback, int event, int n);
+
+/* BN_generate_prime_ex sets |ret| to a prime number of |bits| length. If safe
+ * is non-zero then the prime will be such that (ret-1)/2 is also a prime.
+ * (This is needed for Diffie-Hellman groups to ensure that the only subgroups
+ * are of size 2 and (p-1)/2.).
+ *
+ * If |add| is not NULL, the prime will fulfill the condition |ret| % |add| ==
+ * |rem| in order to suit a given generator. (If |rem| is NULL then |ret| %
+ * |add| == 1.)
+ *
+ * If |cb| is not NULL, it will be called during processing to give an
+ * indication of progress. See the comments for |BN_GENCB|. It returns one on
+ * success and zero otherwise. */
+int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
+                         const BIGNUM *rem, BN_GENCB *cb);
+
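/* [Editorial sketch, illustration only: wiring the |BN_GENCB| protocol
 * described above into |BN_generate_prime_ex|.  The callback returns one to
 * continue; returning zero would abort generation.  printf comes from
 * <stdio.h>, which this header already includes.] */
static int print_progress(int event, int n, struct bn_gencb_st *cb) {
  /* |cb->arg| would carry user data; it is unused in this sketch. */
  if (event == BN_GENCB_GENERATED) {
    printf("candidate %d generated\n", n);
  } else if (event == BN_GENCB_PRIME_TEST && n >= 0) {
    printf("primality test %d finished\n", n);
  }
  return 1;
}

static BIGNUM *generate_prime_example(int bits) {
  BN_GENCB cb;
  BIGNUM *prime = BN_new();
  if (prime == NULL) {
    return NULL;
  }
  BN_GENCB_set(&cb, print_progress, NULL);  /* no callback-specific data */
  /* not a "safe" prime, and no congruence constraint (add/rem are NULL) */
  if (!BN_generate_prime_ex(prime, bits, 0, NULL, NULL, &cb)) {
    BN_free(prime);
    return NULL;
  }
  return prime;
}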
+/* BN_prime_checks is a magic value that can be used as the |checks| argument to
+ * the primality testing functions in order to automatically select a number of
+ * Miller-Rabin checks that gives a false positive rate of ~2^{-80}. */
+#define BN_prime_checks 0
+
+/* BN_primality_test sets |*is_probably_prime| to one if |candidate| is
+ * probably a prime number by the Miller-Rabin test or zero if it's certainly
+ * not.
+ *
+ * If |do_trial_division| is non-zero then |candidate| will be tested against a
+ * list of small primes before Miller-Rabin tests. The probability of this
+ * function returning a false positive is at most 2^{-2*checks}. If |checks| is
+ * |BN_prime_checks| then a value that results in approximately 2^{-80} false
+ * positive probability is used. If |cb| is not NULL then it is called during
+ * the checking process. See the comment above |BN_GENCB|.
+ *
+ * The function returns one on success and zero on error.
+ *
+ * (If you are unsure whether you want |do_trial_division|, don't set it.) */
+int BN_primality_test(int *is_probably_prime, const BIGNUM *candidate,
+                      int checks, BN_CTX *ctx, int do_trial_division,
+                      BN_GENCB *cb);
+
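/* [Editorial sketch, illustration only: checking a candidate with
 * |BN_primality_test| and the default iteration count.] */
static int is_probably_prime_example(const BIGNUM *candidate, BN_CTX *ctx) {
  int is_probably_prime;
  /* BN_prime_checks selects an iteration count for a ~2^-80 error rate; no
   * trial division and no progress callback are requested here. */
  if (!BN_primality_test(&is_probably_prime, candidate, BN_prime_checks, ctx,
                         0 /* do_trial_division */, NULL /* cb */)) {
    return -1;  /* error */
  }
  return is_probably_prime;
}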
+/* BN_is_prime_fasttest_ex returns one if |candidate| is probably a prime
+ * number by the Miller-Rabin test, zero if it's certainly not and -1 on error.
+ *
+ * If |do_trial_division| is non-zero then |candidate| will be tested against a
+ * list of small primes before Miller-Rabin tests. The probability of this
+ * function returning one when |candidate| is composite is at most 2^{-2*checks}. If
+ * |checks| is |BN_prime_checks| then a value that results in approximately
+ * 2^{-80} false positive probability is used. If |cb| is not NULL then it is
+ * called during the checking process. See the comment above |BN_GENCB|.
+ *
+ * WARNING: deprecated. Use |BN_primality_test|. */
+int BN_is_prime_fasttest_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx,
+                            int do_trial_division, BN_GENCB *cb);
+
+/* BN_is_prime_ex acts the same as |BN_is_prime_fasttest_ex| with
+ * |do_trial_division| set to zero.
+ *
+ * WARNING: deprecated. Use |BN_primality_test|. */
+int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx,
+                   BN_GENCB *cb);
+
+
+/* Number theory functions */
+
+/* BN_gcd sets |r| = gcd(|a|, |b|). It returns one on success and zero
+ * otherwise. */
+int BN_gcd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+
+/* BN_mod_inverse sets |out| equal to |a|^-1, mod |n|. If either of |a| or |n|
+ * have |BN_FLG_CONSTTIME| set then the operation is performed in constant
+ * time. If |out| is NULL, a fresh BIGNUM is allocated. It returns the result
+ * or NULL on error. */
+BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
+                       BN_CTX *ctx);
+
+/* BN_kronecker returns the Kronecker symbol of |a| and |b| (which is -1, 0 or
+ * 1), or -2 on error. */
+int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx);
+
+
+/* Montgomery arithmetic. */
+
+/* BN_MONT_CTX contains the precomputed values needed to work in a specific
+ * Montgomery domain. */
+
+/* BN_MONT_CTX_new returns a fresh BN_MONT_CTX or NULL on allocation failure. */
+BN_MONT_CTX *BN_MONT_CTX_new(void);
+
+/* BN_MONT_CTX_init initialises a stack allocated |BN_MONT_CTX|. */
+void BN_MONT_CTX_init(BN_MONT_CTX *mont);
+
+/* BN_MONT_CTX_free frees the contexts of |mont| and, if it was originally
+ * allocated with |BN_MONT_CTX_new|, |mont| itself. */
+void BN_MONT_CTX_free(BN_MONT_CTX *mont);
+
+/* BN_MONT_CTX_copy sets |to| equal to |from|. It returns |to| on success or
+ * NULL on error. */
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from);
+
+/* BN_MONT_CTX_set sets up a Montgomery context given the modulus, |mod|. It
+ * returns one on success and zero on error. */
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx);
+
+/* BN_MONT_CTX_set_locked takes the lock indicated by |lock| and checks whether
+ * |*pmont| is NULL. If so, it creates a new |BN_MONT_CTX| and sets the modulus
+ * for it to |mod|. It then stores it as |*pmont| and returns it, or NULL on
+ * error.
+ *
+ * If |*pmont| is already non-NULL then the existing value is returned. */
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+                                    const BIGNUM *mod, BN_CTX *ctx);
+
+/* BN_to_montgomery sets |ret| equal to |a| in the Montgomery domain. It
+ * returns one on success and zero on error. */
+int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+                     BN_CTX *ctx);
+
+/* BN_from_montgomery sets |ret| equal to |a| * R^-1, i.e. translates values
+ * out of the Montgomery domain. It returns one on success or zero on error. */
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+                       BN_CTX *ctx);
+
+/* BN_mod_mul_montgomery sets |r| equal to |a| * |b|, in the Montgomery domain.
+ * Both |a| and |b| must already be in the Montgomery domain (by
+ * |BN_to_montgomery|). It returns one on success or zero on error. */
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          const BN_MONT_CTX *mont, BN_CTX *ctx);
+
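/* [Editorial sketch, illustration only: one modular multiplication via the
 * Montgomery functions above.  The modulus must be odd, and converting in
 * and out of the Montgomery domain is only worthwhile when many
 * multiplications share one |BN_MONT_CTX|.] */
static int mont_mul_example(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                            const BIGNUM *m, BN_CTX *ctx) {
  int ok = 0;
  BN_MONT_CTX *mont = BN_MONT_CTX_new();
  BIGNUM *am = BN_new(), *bm = BN_new();
  if (mont == NULL || am == NULL || bm == NULL ||
      !BN_MONT_CTX_set(mont, m, ctx) ||        /* precompute for modulus m */
      !BN_to_montgomery(am, a, mont, ctx) ||   /* am := a*R mod m */
      !BN_to_montgomery(bm, b, mont, ctx) ||   /* bm := b*R mod m */
      !BN_mod_mul_montgomery(am, am, bm, mont, ctx) || /* am := a*b*R mod m */
      !BN_from_montgomery(r, am, mont, ctx)) {         /* r := a*b mod m */
    goto err;
  }
  ok = 1;

err:
  BN_free(am);
  BN_free(bm);
  BN_MONT_CTX_free(mont);
  return ok;
}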
+
+/* Exponentiation. */
+
+/* BN_exp sets |r| equal to |a|^{|p|}. It does so with a square-and-multiply
+ * algorithm that leaks side-channel information. It returns one on success or
+ * zero otherwise. */
+int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx);
+
+/* BN_mod_exp sets |r| equal to |a|^{|p|} mod |m|. It does so with the best
+ * algorithm for the values provided and can run in constant time if
+ * |BN_FLG_CONSTTIME| is set for |p|. It returns one on success or zero
+ * otherwise. */
+int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
+               BN_CTX *ctx);
+
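/* [Editorial sketch, illustration only: requesting the constant-time path
 * for a secret exponent, per the note above.  BN_set_flags is declared
 * earlier and BN_FLG_CONSTTIME near the end of this header.] */
static int mod_exp_consttime_example(BIGNUM *r, const BIGNUM *a,
                                     BIGNUM *secret_exp, const BIGNUM *m,
                                     BN_CTX *ctx) {
  /* With BN_FLG_CONSTTIME on the exponent, BN_mod_exp takes the
   * constant-time Montgomery code path. */
  BN_set_flags(secret_exp, BN_FLG_CONSTTIME);
  return BN_mod_exp(r, a, secret_exp, m, ctx);
}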
+int BN_mod_exp_mont(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+                              const BIGNUM *m, BN_CTX *ctx,
+                              BN_MONT_CTX *in_mont);
+
+int BN_mod_exp_mont_word(BIGNUM *r, BN_ULONG a, const BIGNUM *p,
+                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp2_mont(BIGNUM *r, const BIGNUM *a1, const BIGNUM *p1,
+                     const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+                     BN_CTX *ctx, BN_MONT_CTX *m_ctx);
+int BN_mod_exp_simple(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                      const BIGNUM *m, BN_CTX *ctx);
+
+
+
+/* Private functions */
+
+struct bignum_st {
+  BN_ULONG *d; /* Pointer to an array of 'BN_BITS2' bit chunks in little-endian
+                  order. */
+  int top;   /* Index of last used element in |d|, plus one. */
+  int dmax;  /* Size of |d|, in words. */
+  int neg;   /* one if the number is negative */
+  int flags; /* bitmask of BN_FLG_* values */
+};
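/* [Editorial note: as an example of the representation above, on a 64-bit
 * build the value 2^64 + 5 is stored as d = {5, 1}, top = 2, neg = 0 and
 * dmax >= 2; zero is represented with top == 0.] */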
+
+struct bn_mont_ctx_st {
+  BIGNUM RR; /* used to convert to Montgomery form */
+  BIGNUM N;  /* The modulus */
+  BIGNUM Ni; /* R*(1/R mod N) - N*Ni = 1
+              * (Ni is only stored for bignum algorithm) */
+  BN_ULONG n0[2]; /* least significant word(s) of Ni;
+                     (type changed with 0.9.9, was "BN_ULONG n0;" before) */
+  int flags;
+  int ri;    /* number of bits in R */
+};
+
+unsigned BN_num_bits_word(BN_ULONG l);
+
+#define BN_FLG_MALLOCED 0x01
+#define BN_FLG_STATIC_DATA 0x02
+/* avoid leaking exponent information through timing, BN_mod_exp_mont() will
+ * call BN_mod_exp_mont_consttime, BN_div() will call BN_div_no_branch,
+ * BN_mod_inverse() will call BN_mod_inverse_no_branch. */
+#define BN_FLG_CONSTTIME 0x04
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#define BN_F_BN_bn2hex 100
+#define BN_F_BN_new 101
+#define BN_F_BN_exp 102
+#define BN_F_mod_exp_recp 103
+#define BN_F_BN_mod_sqrt 104
+#define BN_F_BN_rand 105
+#define BN_F_BN_rand_range 106
+#define BN_F_bn_wexpand 107
+#define BN_F_BN_mod_exp_mont 108
+#define BN_F_BN_mod_exp2_mont 109
+#define BN_F_BN_CTX_get 110
+#define BN_F_BN_mod_inverse 111
+#define BN_F_BN_bn2dec 112
+#define BN_F_BN_div 113
+#define BN_F_BN_div_recp 114
+#define BN_F_BN_mod_exp_mont_consttime 115
+#define BN_F_BN_mod_exp_mont_word 116
+#define BN_F_BN_CTX_start 117
+#define BN_F_BN_usub 118
+#define BN_F_BN_mod_lshift_quick 119
+#define BN_F_BN_CTX_new 120
+#define BN_F_BN_mod_inverse_no_branch 121
+#define BN_R_NOT_A_SQUARE 100
+#define BN_R_TOO_MANY_ITERATIONS 101
+#define BN_R_INPUT_NOT_REDUCED 102
+#define BN_R_TOO_MANY_TEMPORARY_VARIABLES 103
+#define BN_R_NO_INVERSE 104
+#define BN_R_NOT_INITIALIZED 105
+#define BN_R_DIV_BY_ZERO 106
+#define BN_R_CALLED_WITH_EVEN_MODULUS 107
+#define BN_R_EXPAND_ON_STATIC_BIGNUM_DATA 108
+#define BN_R_BAD_RECIPROCAL 109
+#define BN_R_P_IS_NOT_PRIME 110
+#define BN_R_INVALID_RANGE 111
+#define BN_R_ARG2_LT_ARG3 112
+#define BN_R_BIGNUM_TOO_LONG 113
+
+#endif  /* OPENSSL_HEADER_BN_H */
diff --git a/crypto/bn/bn_error.c b/crypto/bn/bn_error.c
new file mode 100644
index 0000000..a73571c
--- /dev/null
+++ b/crypto/bn/bn_error.c
@@ -0,0 +1,57 @@
+/* Copyright (c) 2014, Google Inc.
+ *
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
+ * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
+ * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+ * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
+
+#include <openssl/err.h>
+
+#include "bn.h"
+
+const ERR_STRING_DATA BN_error_string_data[] = {
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_get, 0), "BN_CTX_get"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_new, 0), "BN_CTX_new"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_CTX_start, 0), "BN_CTX_start"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_bn2dec, 0), "BN_bn2dec"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_bn2hex, 0), "BN_bn2hex"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_div, 0), "BN_div"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_div_recp, 0), "BN_div_recp"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_exp, 0), "BN_exp"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp2_mont, 0), "BN_mod_exp2_mont"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont, 0), "BN_mod_exp_mont"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont_consttime, 0), "BN_mod_exp_mont_consttime"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_exp_mont_word, 0), "BN_mod_exp_mont_word"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_inverse, 0), "BN_mod_inverse"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_inverse_no_branch, 0), "BN_mod_inverse_no_branch"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_lshift_quick, 0), "BN_mod_lshift_quick"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_mod_sqrt, 0), "BN_mod_sqrt"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_new, 0), "BN_new"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_rand, 0), "BN_rand"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_rand_range, 0), "BN_rand_range"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_BN_usub, 0), "BN_usub"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_bn_wexpand, 0), "bn_wexpand"},
+  {ERR_PACK(ERR_LIB_BN, BN_F_mod_exp_recp, 0), "mod_exp_recp"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_ARG2_LT_ARG3), "ARG2_LT_ARG3"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_BAD_RECIPROCAL), "BAD_RECIPROCAL"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_BIGNUM_TOO_LONG), "BIGNUM_TOO_LONG"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_CALLED_WITH_EVEN_MODULUS), "CALLED_WITH_EVEN_MODULUS"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_DIV_BY_ZERO), "DIV_BY_ZERO"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA), "EXPAND_ON_STATIC_BIGNUM_DATA"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_INPUT_NOT_REDUCED), "INPUT_NOT_REDUCED"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_INVALID_RANGE), "INVALID_RANGE"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_A_SQUARE), "NOT_A_SQUARE"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_NOT_INITIALIZED), "NOT_INITIALIZED"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_NO_INVERSE), "NO_INVERSE"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_P_IS_NOT_PRIME), "P_IS_NOT_PRIME"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_ITERATIONS), "TOO_MANY_ITERATIONS"},
+  {ERR_PACK(ERR_LIB_BN, 0, BN_R_TOO_MANY_TEMPORARY_VARIABLES), "TOO_MANY_TEMPORARY_VARIABLES"},
+  {0, NULL},
+};
diff --git a/crypto/bn/bn_test.c b/crypto/bn/bn_test.c
new file mode 100644
index 0000000..1a89802
--- /dev/null
+++ b/crypto/bn/bn_test.c
@@ -0,0 +1,1152 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#include <stdio.h>
+
+#include <openssl/bn.h>
+#include <openssl/err.h>
+#include <openssl/bio.h>
+
+#include "internal.h"
+
+static const int num0 = 100; /* number of tests */
+static const int num1 = 50;  /* additional tests for some functions */
+static const int num2 = 5;   /* number of tests for slow functions */
+
+int test_add(BIO *bp);
+int test_sub(BIO *bp);
+int test_lshift1(BIO *bp);
+int test_lshift(BIO *bp, BN_CTX *ctx, BIGNUM *a_);
+int test_rshift1(BIO *bp);
+int test_rshift(BIO *bp, BN_CTX *ctx);
+int test_sqr(BIO *bp, BN_CTX *ctx);
+int test_mul(BIO *bp);
+int test_div(BIO *bp, BN_CTX *ctx);
+int rand_neg(void);
+
+int test_div_word(BIO *bp);
+int test_mont(BIO *bp, BN_CTX *ctx);
+int test_mod(BIO *bp, BN_CTX *ctx);
+int test_mod_mul(BIO *bp, BN_CTX *ctx);
+int test_mod_exp(BIO *bp, BN_CTX *ctx);
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx);
+int test_exp(BIO *bp, BN_CTX *ctx);
+int test_mod_sqrt(BIO *bp, BN_CTX *ctx);
+#if 0
+int test_gf2m_add(BIO *bp);
+int test_gf2m_mod(BIO *bp);
+int test_gf2m_mod_mul(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_sqr(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_inv(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_div(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_exp(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_sqrt(BIO *bp, BN_CTX *ctx);
+int test_gf2m_mod_solve_quad(BIO *bp, BN_CTX *ctx);
+#endif
+static int results = 0;
+
+static unsigned char lst[] =
+    "\xC6\x4F\x43\x04\x2A\xEA\xCA\x6E\x58\x36\x80\x5B\xE8\xC9"
+    "\x9B\x04\x5D\x48\x36\xC2\xFD\x16\xC9\x64\xF0";
+
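+/* Stub: this standalone test does not print the error queue, so the helper
+ * below is a deliberate no-op. */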
+static void ERR_print_errors_fp(FILE *out) {
+}
+
+static void message(BIO *out, char *m) {
+  BIO_puts(out, "print \"test ");
+  BIO_puts(out, m);
+  BIO_puts(out, "\\n\"\n");
+}
+
+int main(int argc, char *argv[]) {
+  BN_CTX *ctx;
+  BIO *out = NULL;
+  char *outfile = NULL;
+
+  results = 0;
+
+  argc--;
+  argv++;
+  while (argc >= 1) {
+    if (strcmp(*argv, "-results") == 0)
+      results = 1;
+    else if (strcmp(*argv, "-out") == 0) {
+      if (--argc < 1)
+        break;
+      outfile = *(++argv);
+    }
+    argc--;
+    argv++;
+  }
+
+
+  ctx = BN_CTX_new();
+  if (ctx == NULL)
+    return 1;
+
+  out = BIO_new(BIO_s_file());
+  if (out == NULL) {
+    return 1;
+  }
+
+  if (outfile == NULL) {
+    BIO_set_fp(out, stdout, BIO_NOCLOSE);
+  } else {
+    if (!BIO_write_filename(out, outfile)) {
+      perror(outfile);
+      return 1;
+    }
+  }
+
+  if (!results)
+    BIO_puts(out, "obase=16\nibase=16\n");
+
+  message(out, "BN_add");
+  if (!test_add(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_sub");
+  if (!test_sub(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_lshift1");
+  if (!test_lshift1(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_lshift (fixed)");
+  if (!test_lshift(out, ctx, BN_bin2bn(lst, sizeof(lst) - 1, NULL)))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_lshift");
+  if (!test_lshift(out, ctx, NULL))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_rshift1");
+  if (!test_rshift1(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_rshift");
+  if (!test_rshift(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_sqr");
+  if (!test_sqr(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mul");
+  if (!test_mul(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_div");
+  if (!test_div(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_div_word");
+  if (!test_div_word(out))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mod");
+  if (!test_mod(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mod_mul");
+  if (!test_mod_mul(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mont");
+  if (!test_mont(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mod_exp");
+  if (!test_mod_exp(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mod_exp_mont_consttime");
+  if (!test_mod_exp_mont_consttime(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_exp");
+  if (!test_exp(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  message(out, "BN_mod_sqrt");
+  if (!test_mod_sqrt(out, ctx))
+    goto err;
+  (void)BIO_flush(out);
+
+  BN_CTX_free(ctx);
+  BIO_free(out);
+
+  printf("PASS\n");
+  return 0;
+
+err:
+  BIO_puts(out, "1\n"); /* make sure the script that checks the bc output
+                         * notices the failure; see test_bn in
+                         * test/Makefile.ssl */
+  (void)BIO_flush(out);
+
+  return 1;
+}
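+
+/* Sketch of the default output (without -results), which is intended to be
+ * piped through bc(1): each test emits expressions that evaluate to 0 on
+ * success, e.g. for BN_add something like
+ *
+ *   obase=16
+ *   ibase=16
+ *   print "test BN_add\n"
+ *   1A2B + 3C4D - 5678
+ *
+ * (values illustrative).  The "1" printed on the error path above guarantees
+ * a non-zero line so a failure remains visible downstream. */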
+
+int test_add(BIO *bp) {
+  BIGNUM a, b, c;
+  int i;
+
+  BN_init(&a);
+  BN_init(&b);
+  BN_init(&c);
+
+  BN_rand(&a, 512, 0, 0);
+  for (i = 0; i < num0; i++) {
+    BN_rand(&b, 450 + i, 0, 0);
+    a.neg = rand_neg();
+    b.neg = rand_neg();
+    BN_add(&c, &a, &b);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " + ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &c);
+      BIO_puts(bp, "\n");
+    }
+    a.neg = !a.neg;
+    b.neg = !b.neg;
+    BN_add(&c, &c, &b);
+    BN_add(&c, &c, &a);
+    if (!BN_is_zero(&c)) {
+      fprintf(stderr, "Add test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&b);
+  BN_free(&c);
+  return (1);
+}
+
+int test_sub(BIO *bp) {
+  BIGNUM a, b, c;
+  int i;
+
+  BN_init(&a);
+  BN_init(&b);
+  BN_init(&c);
+
+  for (i = 0; i < num0 + num1; i++) {
+    if (i < num1) {
+      BN_rand(&a, 512, 0, 0);
+      BN_copy(&b, &a);
+      if (BN_set_bit(&a, i) == 0)
+        return (0);
+      BN_add_word(&b, i);
+    } else {
+      BN_rand(&b, 400 + i - num1, 0, 0);
+      a.neg = rand_neg();
+      b.neg = rand_neg();
+    }
+    BN_sub(&c, &a, &b);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " - ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &c);
+      BIO_puts(bp, "\n");
+    }
+    BN_add(&c, &c, &b);
+    BN_sub(&c, &c, &a);
+    if (!BN_is_zero(&c)) {
+      fprintf(stderr, "Subtract test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&b);
+  BN_free(&c);
+  return (1);
+}
+
+int test_div(BIO *bp, BN_CTX *ctx) {
+  BIGNUM a, b, c, d, e;
+  int i;
+
+  BN_init(&a);
+  BN_init(&b);
+  BN_init(&c);
+  BN_init(&d);
+  BN_init(&e);
+
+  for (i = 0; i < num0 + num1; i++) {
+    if (i < num1) {
+      BN_rand(&a, 400, 0, 0);
+      BN_copy(&b, &a);
+      BN_lshift(&a, &a, i);
+      BN_add_word(&a, i);
+    } else
+      BN_rand(&b, 50 + 3 * (i - num1), 0, 0);
+    a.neg = rand_neg();
+    b.neg = rand_neg();
+    BN_div(&d, &c, &a, &b, ctx);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " / ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &d);
+      BIO_puts(bp, "\n");
+
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " % ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &c);
+      BIO_puts(bp, "\n");
+    }
+    BN_mul(&e, &d, &b, ctx);
+    BN_add(&d, &e, &c);
+    BN_sub(&d, &d, &a);
+    if (!BN_is_zero(&d)) {
+      fprintf(stderr, "Division test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&b);
+  BN_free(&c);
+  BN_free(&d);
+  BN_free(&e);
+  return (1);
+}
+
+int test_lshift1(BIO *bp) {
+  BIGNUM *a, *b, *c;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+
+  BN_rand(a, 200, 0, 0); /**/
+  a->neg = rand_neg();
+  for (i = 0; i < num0; i++) {
+    BN_lshift1(b, a);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " * 2");
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, b);
+      BIO_puts(bp, "\n");
+    }
+    BN_add(c, a, a);
+    BN_sub(a, b, c);
+    if (!BN_is_zero(a)) {
+      fprintf(stderr, "Left shift one test failed!\n");
+      return 0;
+    }
+
+    BN_copy(a, b);
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  return (1);
+}
+
+int test_rshift(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *c, *d, *e;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  e = BN_new();
+  BN_one(c);
+
+  BN_rand(a, 200, 0, 0); /**/
+  a->neg = rand_neg();
+  for (i = 0; i < num0; i++) {
+    BN_rshift(b, a, i + 1);
+    BN_add(c, c, c);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " / ");
+        BN_print(bp, c);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, b);
+      BIO_puts(bp, "\n");
+    }
+    BN_div(d, e, a, c, ctx);
+    BN_sub(d, d, b);
+    if (!BN_is_zero(d)) {
+      fprintf(stderr, "Right shift test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  BN_free(e);
+  return (1);
+}
+
+int test_rshift1(BIO *bp) {
+  BIGNUM *a, *b, *c;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+
+  BN_rand(a, 200, 0, 0); /**/
+  a->neg = rand_neg();
+  for (i = 0; i < num0; i++) {
+    BN_rshift1(b, a);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " / 2");
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, b);
+      BIO_puts(bp, "\n");
+    }
+    BN_sub(c, a, b);
+    BN_sub(c, c, b);
+    if (!BN_is_zero(c) && !BN_abs_is_word(c, 1)) {
+      fprintf(stderr, "Right shift one test failed!\n");
+      return 0;
+    }
+    BN_copy(a, b);
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  return (1);
+}
+
+int test_lshift(BIO *bp, BN_CTX *ctx, BIGNUM *a_) {
+  BIGNUM *a, *b, *c, *d;
+  int i;
+
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  BN_one(c);
+
+  if (a_)
+    a = a_;
+  else {
+    a = BN_new();
+    BN_rand(a, 200, 0, 0); /**/
+    a->neg = rand_neg();
+  }
+  for (i = 0; i < num0; i++) {
+    BN_lshift(b, a, i + 1);
+    BN_add(c, c, c);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " * ");
+        BN_print(bp, c);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, b);
+      BIO_puts(bp, "\n");
+    }
+    BN_mul(d, a, c, ctx);
+    BN_sub(d, d, b);
+    if (!BN_is_zero(d)) {
+      fprintf(stderr, "Left shift test failed!\n");
+      fprintf(stderr, "a=");
+      BN_print_fp(stderr, a);
+      fprintf(stderr, "\nb=");
+      BN_print_fp(stderr, b);
+      fprintf(stderr, "\nc=");
+      BN_print_fp(stderr, c);
+      fprintf(stderr, "\nd=");
+      BN_print_fp(stderr, d);
+      fprintf(stderr, "\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  return (1);
+}
+
+int test_mul(BIO *bp) {
+  BIGNUM a, b, c, d, e;
+  int i;
+  BN_CTX *ctx;
+
+  ctx = BN_CTX_new();
+  if (ctx == NULL)
+    abort();
+
+  BN_init(&a);
+  BN_init(&b);
+  BN_init(&c);
+  BN_init(&d);
+  BN_init(&e);
+
+  for (i = 0; i < num0 + num1; i++) {
+    if (i <= num1) {
+      BN_rand(&a, 100, 0, 0);
+      BN_rand(&b, 100, 0, 0);
+    } else
+      BN_rand(&b, i - num1, 0, 0);
+    a.neg = rand_neg();
+    b.neg = rand_neg();
+    BN_mul(&c, &a, &b, ctx);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " * ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &c);
+      BIO_puts(bp, "\n");
+    }
+    BN_div(&d, &e, &c, &a, ctx);
+    BN_sub(&d, &d, &b);
+    if (!BN_is_zero(&d) || !BN_is_zero(&e)) {
+      fprintf(stderr, "Multiplication test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&b);
+  BN_free(&c);
+  BN_free(&d);
+  BN_free(&e);
+  BN_CTX_free(ctx);
+  return (1);
+}
+
+int test_sqr(BIO *bp, BN_CTX *ctx) {
+  BIGNUM a, c, d, e;
+  int i;
+
+  BN_init(&a);
+  BN_init(&c);
+  BN_init(&d);
+  BN_init(&e);
+
+  for (i = 0; i < num0; i++) {
+    BN_rand(&a, 40 + i * 10, 0, 0);
+    a.neg = rand_neg();
+    BN_sqr(&c, &a, ctx);
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " * ");
+        BN_print(bp, &a);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &c);
+      BIO_puts(bp, "\n");
+    }
+    BN_div(&d, &e, &c, &a, ctx);
+    BN_sub(&d, &d, &a);
+    if (!BN_is_zero(&d) || !BN_is_zero(&e)) {
+      fprintf(stderr, "Square test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&c);
+  BN_free(&d);
+  BN_free(&e);
+  return (1);
+}
+
+
+int rand_neg(void) {
+  static unsigned int neg = 0;
+  static int sign[8] = {0, 0, 0, 1, 1, 0, 1, 1};
+
+  return (sign[(neg++) % 8]);
+}
+
+static void print_word(BIO *bp, BN_ULONG w) {
+#ifdef OPENSSL_64_BIT
+  if (sizeof(w) > sizeof(unsigned long)) {
+    unsigned long h = (unsigned long)(w >> 32), l = (unsigned long)(w);
+
+    if (h)
+      BIO_printf(bp, "%lX%08lX", h, l);
+    else
+      BIO_printf(bp, "%lX", l);
+    return;
+  }
+#endif
+  BIO_printf(bp, BN_HEX_FMT1, w);
+}
+
+int test_div_word(BIO *bp) {
+  BIGNUM a, b;
+  BN_ULONG r, s;
+  int i;
+
+  BN_init(&a);
+  BN_init(&b);
+
+  for (i = 0; i < num0; i++) {
+    do {
+      BN_rand(&a, 512, -1, 0);
+      BN_rand(&b, BN_BITS2, -1, 0);
+      s = b.d[0];
+    } while (!s);
+
+    BN_copy(&b, &a);
+    r = BN_div_word(&b, s);
+
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " / ");
+        print_word(bp, s);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &b);
+      BIO_puts(bp, "\n");
+
+      if (!results) {
+        BN_print(bp, &a);
+        BIO_puts(bp, " % ");
+        print_word(bp, s);
+        BIO_puts(bp, " - ");
+      }
+      print_word(bp, r);
+      BIO_puts(bp, "\n");
+    }
+    BN_mul_word(&b, s);
+    BN_add_word(&b, r);
+    BN_sub(&b, &a, &b);
+    if (!BN_is_zero(&b)) {
+      fprintf(stderr, "Division (word) test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(&a);
+  BN_free(&b);
+  return (1);
+}
+
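+/* Note on the check below: BN_to_montgomery maps x to x*R mod n for the
+ * Montgomery radix R derived from n by BN_MONT_CTX_set, BN_mod_mul_montgomery
+ * maps (a*R, b*R) to a*b*R mod n, and BN_from_montgomery strips the final R.
+ * The round trip should therefore equal BN_mod_mul(a, b, n), which is what
+ * the loop verifies. */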
+int test_mont(BIO *bp, BN_CTX *ctx) {
+  BIGNUM a, b, c, d, A, B;
+  BIGNUM n;
+  int i;
+  BN_MONT_CTX *mont;
+
+  BN_init(&a);
+  BN_init(&b);
+  BN_init(&c);
+  BN_init(&d);
+  BN_init(&A);
+  BN_init(&B);
+  BN_init(&n);
+
+  mont = BN_MONT_CTX_new();
+  if (mont == NULL)
+    return 0;
+
+  BN_rand(&a, 100, 0, 0); /**/
+  BN_rand(&b, 100, 0, 0); /**/
+  for (i = 0; i < num2; i++) {
+    int bits = (200 * (i + 1)) / num2;
+
+    if (bits == 0)
+      continue;
+    BN_rand(&n, bits, 0, 1);
+    BN_MONT_CTX_set(mont, &n, ctx);
+
+    BN_nnmod(&a, &a, &n, ctx);
+    BN_nnmod(&b, &b, &n, ctx);
+
+    BN_to_montgomery(&A, &a, mont, ctx);
+    BN_to_montgomery(&B, &b, mont, ctx);
+
+    BN_mod_mul_montgomery(&c, &A, &B, mont, ctx); /**/
+    BN_from_montgomery(&A, &c, mont, ctx);        /**/
+    if (bp != NULL) {
+      if (!results) {
+#ifdef undef
+        fprintf(stderr, "%d * %d %% %d\n", BN_num_bits(&a), BN_num_bits(&b),
+                BN_num_bits(mont->N));
+#endif
+        BN_print(bp, &a);
+        BIO_puts(bp, " * ");
+        BN_print(bp, &b);
+        BIO_puts(bp, " % ");
+        BN_print(bp, &(mont->N));
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, &A);
+      BIO_puts(bp, "\n");
+    }
+    BN_mod_mul(&d, &a, &b, &n, ctx);
+    BN_sub(&d, &d, &A);
+    if (!BN_is_zero(&d)) {
+      fprintf(stderr, "Montgomery multiplication test failed!\n");
+      return 0;
+    }
+  }
+  BN_MONT_CTX_free(mont);
+  BN_free(&a);
+  BN_free(&b);
+  BN_free(&c);
+  BN_free(&d);
+  BN_free(&A);
+  BN_free(&B);
+  BN_free(&n);
+  return (1);
+}
+
+int test_mod(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *c, *d, *e;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  e = BN_new();
+
+  BN_rand(a, 1024, 0, 0); /**/
+  for (i = 0; i < num0; i++) {
+    BN_rand(b, 450 + i * 10, 0, 0); /**/
+    a->neg = rand_neg();
+    b->neg = rand_neg();
+    BN_mod(c, a, b, ctx); /**/
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " % ");
+        BN_print(bp, b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, c);
+      BIO_puts(bp, "\n");
+    }
+    BN_div(d, e, a, b, ctx);
+    BN_sub(e, e, c);
+    if (!BN_is_zero(e)) {
+      fprintf(stderr, "Modulo test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  BN_free(e);
+  return (1);
+}
+
+int test_mod_mul(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *c, *d, *e;
+  int i, j;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  e = BN_new();
+
+  for (j = 0; j < 3; j++) {
+    BN_rand(c, 1024, 0, 0); /**/
+    for (i = 0; i < num0; i++) {
+      BN_rand(a, 475 + i * 10, 0, 0); /**/
+      BN_rand(b, 425 + i * 11, 0, 0); /**/
+      a->neg = rand_neg();
+      b->neg = rand_neg();
+      if (!BN_mod_mul(e, a, b, c, ctx)) {
+        unsigned long l;
+
+        while ((l = ERR_get_error()))
+          fprintf(stderr, "ERROR:%s\n", ERR_error_string(l, NULL));
+        abort();
+      }
+      if (bp != NULL) {
+        if (!results) {
+          BN_print(bp, a);
+          BIO_puts(bp, " * ");
+          BN_print(bp, b);
+          BIO_puts(bp, " % ");
+          BN_print(bp, c);
+          if ((a->neg ^ b->neg) && !BN_is_zero(e)) {
+            /* If  (a*b) % c  is negative,  c  must be added
+             * in order to obtain the normalized remainder
+             * (new with OpenSSL 0.9.7, previous versions of
+             * BN_mod_mul could generate negative results)
+             */
+            BIO_puts(bp, " + ");
+            BN_print(bp, c);
+          }
+          BIO_puts(bp, " - ");
+        }
+        BN_print(bp, e);
+        BIO_puts(bp, "\n");
+      }
+      BN_mul(d, a, b, ctx);
+      BN_sub(d, d, e);
+      BN_div(a, b, d, c, ctx);
+      if (!BN_is_zero(b)) {
+        fprintf(stderr, "Modulo multiply test failed!\n");
+        ERR_print_errors_fp(stderr);
+        return 0;
+      }
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  BN_free(e);
+  return (1);
+}
+
+int test_mod_exp(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *c, *d, *e;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  e = BN_new();
+
+  BN_rand(c, 30, 0, 1); /* must be odd for montgomery */
+  for (i = 0; i < num2; i++) {
+    BN_rand(a, 20 + i * 5, 0, 0); /**/
+    BN_rand(b, 2 + i, 0, 0);      /**/
+
+    if (!BN_mod_exp(d, a, b, c, ctx))
+      return (0);
+
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " ^ ");
+        BN_print(bp, b);
+        BIO_puts(bp, " % ");
+        BN_print(bp, c);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, d);
+      BIO_puts(bp, "\n");
+    }
+    BN_exp(e, a, b, ctx);
+    BN_sub(e, e, d);
+    BN_div(a, b, e, c, ctx);
+    if (!BN_is_zero(b)) {
+      fprintf(stderr, "Modulo exponentiation test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  BN_free(e);
+  return (1);
+}
+
+int test_mod_exp_mont_consttime(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *c, *d, *e;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  c = BN_new();
+  d = BN_new();
+  e = BN_new();
+
+  BN_rand(c, 30, 0, 1); /* must be odd for montgomery */
+  for (i = 0; i < num2; i++) {
+    BN_rand(a, 20 + i * 5, 0, 0); /**/
+    BN_rand(b, 2 + i, 0, 0);      /**/
+
+    if (!BN_mod_exp_mont_consttime(d, a, b, c, ctx, NULL))
+      return (0);
+
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " ^ ");
+        BN_print(bp, b);
+        BIO_puts(bp, " % ");
+        BN_print(bp, c);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, d);
+      BIO_puts(bp, "\n");
+    }
+    BN_exp(e, a, b, ctx);
+    BN_sub(e, e, d);
+    BN_div(a, b, e, c, ctx);
+    if (!BN_is_zero(b)) {
+      fprintf(stderr, "Modulo exponentiation test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(c);
+  BN_free(d);
+  BN_free(e);
+  return (1);
+}
+
+int test_exp(BIO *bp, BN_CTX *ctx) {
+  BIGNUM *a, *b, *d, *e, *one;
+  int i;
+
+  a = BN_new();
+  b = BN_new();
+  d = BN_new();
+  e = BN_new();
+  one = BN_new();
+  BN_one(one);
+
+  for (i = 0; i < num2; i++) {
+    BN_rand(a, 20 + i * 5, 0, 0); /**/
+    BN_rand(b, 2 + i, 0, 0);      /**/
+
+    if (BN_exp(d, a, b, ctx) <= 0)
+      return (0);
+
+    if (bp != NULL) {
+      if (!results) {
+        BN_print(bp, a);
+        BIO_puts(bp, " ^ ");
+        BN_print(bp, b);
+        BIO_puts(bp, " - ");
+      }
+      BN_print(bp, d);
+      BIO_puts(bp, "\n");
+    }
+    BN_one(e);
+    for (; !BN_is_zero(b); BN_sub(b, b, one))
+      BN_mul(e, e, a, ctx);
+    BN_sub(e, e, d);
+    if (!BN_is_zero(e)) {
+      fprintf(stderr, "Exponentiation test failed!\n");
+      return 0;
+    }
+  }
+  BN_free(a);
+  BN_free(b);
+  BN_free(d);
+  BN_free(e);
+  BN_free(one);
+  return (1);
+}
+
+static int genprime_cb(int p, int n, BN_GENCB *arg) {
+  char c = '*';
+
+  if (p == 0)
+    c = '.';
+  if (p == 1)
+    c = '+';
+  if (p == 2)
+    c = '*';
+  if (p == 3)
+    c = '\n';
+  putc(c, stdout);
+  fflush(stdout);
+  return 1;
+}
+
+int test_mod_sqrt(BIO *bp, BN_CTX *ctx) {
+  BN_GENCB cb;
+  BIGNUM *a, *p, *r;
+  int i, j;
+  int ret = 0;
+
+  a = BN_new();
+  p = BN_new();
+  r = BN_new();
+  if (a == NULL || p == NULL || r == NULL)
+    goto err;
+
+  BN_GENCB_set(&cb, genprime_cb, NULL);
+
+  for (i = 0; i < 16; i++) {
+    if (i < 8) {
+      unsigned primes[8] = {2, 3, 5, 7, 11, 13, 17, 19};
+
+      if (!BN_set_word(p, primes[i]))
+        goto err;
+    } else {
+      if (!BN_set_word(a, 32))
+        goto err;
+      if (!BN_set_word(r, 2 * i + 1))
+        goto err;
+
+      if (!BN_generate_prime_ex(p, 256, 0, a, r, &cb))
+        goto err;
+      putc('\n', stdout);
+    }
+    p->neg = rand_neg();
+
+    for (j = 0; j < num2; j++) {
+      /* construct 'a' such that it is a square modulo p,
+       * but in general not a proper square and not reduced modulo p */
+      if (!BN_rand(r, 256, 0, 3))
+        goto err;
+      if (!BN_nnmod(r, r, p, ctx))
+        goto err;
+      if (!BN_mod_sqr(r, r, p, ctx))
+        goto err;
+      if (!BN_rand(a, 256, 0, 3))
+        goto err;
+      if (!BN_nnmod(a, a, p, ctx))
+        goto err;
+      if (!BN_mod_sqr(a, a, p, ctx))
+        goto err;
+      if (!BN_mul(a, a, r, ctx))
+        goto err;
+      if (rand_neg())
+        if (!BN_sub(a, a, p))
+          goto err;
+
+      if (!BN_mod_sqrt(r, a, p, ctx))
+        goto err;
+      if (!BN_mod_sqr(r, r, p, ctx))
+        goto err;
+
+      if (!BN_nnmod(a, a, p, ctx))
+        goto err;
+
+      if (BN_cmp(a, r) != 0) {
+        fprintf(stderr, "BN_mod_sqrt failed: a = ");
+        BN_print_fp(stderr, a);
+        fprintf(stderr, ", r = ");
+        BN_print_fp(stderr, r);
+        fprintf(stderr, ", p = ");
+        BN_print_fp(stderr, p);
+        fprintf(stderr, "\n");
+        goto err;
+      }
+
+      putc('.', stdout);
+      fflush(stdout);
+    }
+
+    putc('\n', stdout);
+    fflush(stdout);
+  }
+  ret = 1;
+err:
+  if (a != NULL)
+    BN_free(a);
+  if (p != NULL)
+    BN_free(p);
+  if (r != NULL)
+    BN_free(r);
+  return ret;
+}
diff --git a/crypto/bn/cmp.c b/crypto/bn/cmp.c
new file mode 100644
index 0000000..fce7233
--- /dev/null
+++ b/crypto/bn/cmp.c
@@ -0,0 +1,200 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include "internal.h"
+
+
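+/* BN_ucmp compares the absolute values of |a| and |b|; only the sign of the
+ * return value (negative, zero or positive) is meaningful. */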
+int BN_ucmp(const BIGNUM *a, const BIGNUM *b) {
+  int i;
+  BN_ULONG t1, t2, *ap, *bp;
+
+  i = a->top - b->top;
+  if (i != 0) {
+    return i;
+  }
+
+  ap = a->d;
+  bp = b->d;
+  for (i = a->top - 1; i >= 0; i--) {
+    t1 = ap[i];
+    t2 = bp[i];
+    if (t1 != t2) {
+      return (t1 > t2) ? 1 : -1;
+    }
+  }
+
+  return 0;
+}
+
+int BN_cmp(const BIGNUM *a, const BIGNUM *b) {
+  int i;
+  int gt, lt;
+  BN_ULONG t1, t2;
+
+  if ((a == NULL) || (b == NULL)) {
+    if (a != NULL) {
+      return -1;
+    } else if (b != NULL) {
+      return 1;
+    } else {
+      return 0;
+    }
+  }
+
+  if (a->neg != b->neg) {
+    if (a->neg) {
+      return -1;
+    }
+    return 1;
+  }
+  if (a->neg == 0) {
+    gt = 1;
+    lt = -1;
+  } else {
+    gt = -1;
+    lt = 1;
+  }
+
+  if (a->top > b->top) {
+    return gt;
+  }
+  if (a->top < b->top) {
+    return lt;
+  }
+
+  for (i = a->top - 1; i >= 0; i--) {
+    t1 = a->d[i];
+    t2 = b->d[i];
+    if (t1 > t2) {
+      return gt;
+    } else if (t1 < t2) {
+      return lt;
+    }
+  }
+
+  return 0;
+}
+
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n) {
+  int i;
+  BN_ULONG aa, bb;
+
+  aa = a[n - 1];
+  bb = b[n - 1];
+  if (aa != bb) {
+    return (aa > bb) ? 1 : -1;
+  }
+
+  for (i = n - 2; i >= 0; i--) {
+    aa = a[i];
+    bb = b[i];
+    if (aa != bb) {
+      return (aa > bb) ? 1 : -1;
+    }
+  }
+  return 0;
+}
+
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl) {
+  int n, i;
+  n = cl - 1;
+
+  if (dl < 0) {
+    for (i = dl; i < 0; i++) {
+      if (b[n - i] != 0) {
+        return -1; /* a < b */
+      }
+    }
+  }
+  if (dl > 0) {
+    for (i = dl; i > 0; i--) {
+      if (a[n + i] != 0) {
+        return 1; /* a > b */
+      }
+    }
+  }
+
+  return bn_cmp_words(a, b, cl);
+}
+
+int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) {
+  switch (bn->top) {
+    case 1:
+      return bn->d[0] == w;
+    case 0:
+      return w == 0;
+    default:
+      return 0;
+  }
+}
+
+int BN_is_zero(const BIGNUM *bn) {
+  return bn->top == 0;
+}
+
+int BN_is_one(const BIGNUM *bn) {
+  return bn->neg == 0 && BN_abs_is_word(bn, 1);
+}
+
+int BN_is_word(const BIGNUM *bn, BN_ULONG w) {
+  return BN_abs_is_word(bn, w) && (w == 0 || bn->neg == 0);
+}
+
+int BN_is_odd(const BIGNUM *bn) {
+  return bn->top > 0 && (bn->d[0] & 1) == 1;
+}
diff --git a/crypto/bn/convert.c b/crypto/bn/convert.c
new file mode 100644
index 0000000..c6b391a
--- /dev/null
+++ b/crypto/bn/convert.c
@@ -0,0 +1,439 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <ctype.h>
+#include <stdio.h>
+
+#include <openssl/bio.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) {
+  unsigned num_words, m;
+  BN_ULONG word = 0;
+  BIGNUM *bn = NULL;
+
+  if (ret == NULL) {
+    ret = bn = BN_new();
+  }
+
+  if (ret == NULL) {
+    return NULL;
+  }
+
+  if (len == 0) {
+    ret->top = 0;
+    return ret;
+  }
+
+  num_words = ((len - 1) / BN_BYTES) + 1;
+  m = (len - 1) % BN_BYTES;
+  if (bn_wexpand(ret, num_words) == NULL) {
+    if (bn) {
+      BN_free(bn);
+    }
+    return NULL;
+  }
+
+  ret->top = num_words;
+  ret->neg = 0;
+
+  while (len--) {
+    word = (word << 8) | *(in++);
+    if (m-- == 0) {
+      ret->d[--num_words] = word;
+      word = 0;
+      m = BN_BYTES - 1;
+    }
+  }
+
+  /* bn_correct_top is needed because the most significant words may be zero
+   * if |in| has leading zero bytes. */
+  bn_correct_top(ret);
+  return ret;
+}
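+
+/* Illustrative round trip: BN_bin2bn parses big-endian bytes and BN_bn2bin
+ * writes them back without leading zero bytes:
+ *
+ *   uint8_t in[3] = {0x01, 0x02, 0x03}, out[3];
+ *   BIGNUM *n = BN_bin2bn(in, 3, NULL);   // n == 0x10203
+ *   size_t len = BN_bn2bin(n, out);       // len == 3, out == {0x01, 0x02, 0x03}
+ *   BN_free(n);
+ */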
+
+size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) {
+  size_t n, i;
+  BN_ULONG l;
+
+  n = i = BN_num_bytes(in);
+  while (i--) {
+    l = in->d[i / BN_BYTES];
+    *(out++) = (unsigned char)(l >> (8 * (i % BN_BYTES))) & 0xff;
+  }
+  return n;
+}
+
+static const char hextable[] = "0123456789abcdef";
+
+char *BN_bn2hex(const BIGNUM *bn) {
+  int i, j, v, z = 0;
+  char *buf;
+  char *p;
+
+  buf = (char *)OPENSSL_malloc(bn->top * BN_BYTES * 2 + 2);
+  if (buf == NULL) {
+    OPENSSL_PUT_ERROR(BN, BN_bn2hex, ERR_R_MALLOC_FAILURE);
+    return NULL;
+  }
+
+  p = buf;
+  if (bn->neg) {
+    *(p++) = '-';
+  }
+
+  if (BN_is_zero(bn)) {
+    *(p++) = '0';
+  }
+
+  for (i = bn->top - 1; i >= 0; i--) {
+    for (j = BN_BITS2 - 8; j >= 0; j -= 8) {
+      /* strip leading zeros */
+      v = ((int)(bn->d[i] >> (long)j)) & 0xff;
+      if (z || v != 0) {
+        *(p++) = hextable[v >> 4];
+        *(p++) = hextable[v & 0x0f];
+        z = 1;
+      }
+    }
+  }
+  *p = '\0';
+
+  return buf;
+}
+
+/* decode_hex decodes |i| hex characters from |in| and updates |bn|. */
+static void decode_hex(BIGNUM *bn, const char *in, int i) {
+  int h, m, j, k, c;
+  BN_ULONG l=0;
+
+  j = i; /* least significant 'hex' */
+  m = 0;
+  h = 0;
+  while (j > 0) {
+    m = ((BN_BYTES * 2) <= j) ? (BN_BYTES * 2) : j;
+    l = 0;
+    for (;;) {
+      c = in[j - m];
+      if ((c >= '0') && (c <= '9')) {
+        k = c - '0';
+      } else if ((c >= 'a') && (c <= 'f')) {
+        k = c - 'a' + 10;
+      } else if ((c >= 'A') && (c <= 'F')) {
+        k = c - 'A' + 10;
+      } else {
+        k = 0; /* paranoia */
+      }
+
+      l = (l << 4) | k;
+
+      if (--m <= 0) {
+        bn->d[h++] = l;
+        break;
+      }
+    }
+
+    j -= (BN_BYTES * 2);
+  }
+
+  bn->top = h;
+}
+
+/* decode_dec decodes |i| decimal characters from |in| and updates |bn|. */
+static void decode_dec(BIGNUM *bn, const char *in, int i) {
+  int j;
+  BN_ULONG l = 0;
+
+  j = BN_DEC_NUM - (i % BN_DEC_NUM);
+  if (j == BN_DEC_NUM) {
+    j = 0;
+  }
+  l = 0;
+  while (*in) {
+    l *= 10;
+    l += *in - '0';
+    in++;
+    if (++j == BN_DEC_NUM) {
+      BN_mul_word(bn, BN_DEC_CONV);
+      BN_add_word(bn, l);
+      l = 0;
+      j = 0;
+    }
+  }
+}
+
+typedef void (*decode_func) (BIGNUM *bn, const char *in, int i);
+typedef int (*char_test_func) (int c);
+
+static int bn_x2bn(BIGNUM **outp, const char *in, decode_func decode, char_test_func want_char) {
+  BIGNUM *ret = NULL;
+  int neg = 0, i;
+  int num;
+
+  if (in == NULL || *in == 0) {
+    return 0;
+  }
+
+  if (*in == '-') {
+    neg = 1;
+    in++;
+  }
+
+  for (i = 0; want_char((unsigned char)in[i]); i++) {}
+
+  num = i + neg;
+  if (outp == NULL) {
+    return num;
+  }
+
+  /* in is the start of the digits, and it is 'i' characters long */
+  if (*outp == NULL) {
+    ret = BN_new();
+    if (ret == NULL) {
+      return 0;
+    }
+  } else {
+    ret = *outp;
+    BN_zero(ret);
+  }
+  ret->neg = neg;
+
+  /* 'i' is the number of digits; i*4 bits is exact for hex and over-allocates
+   * for decimal. */
+  if (bn_expand(ret, i * 4) == NULL) {
+    goto err;
+  }
+
+  decode(ret, in, i);
+
+  bn_correct_top(ret);
+
+  *outp = ret;
+  return num;
+
+err:
+  if (*outp == NULL) {
+    BN_free(ret);
+  }
+
+  return 0;
+}
+
+int BN_hex2bn(BIGNUM **outp, const char *in) {
+  return bn_x2bn(outp, in, decode_hex, isxdigit);
+}
+
+char *BN_bn2dec(const BIGNUM *a) {
+  int i = 0, num, ok = 0;
+  char *buf = NULL;
+  char *p;
+  BIGNUM *t = NULL;
+  BN_ULONG *bn_data = NULL, *lp;
+
+  /* get an upper bound for the length of the decimal integer
+   * num <= (BN_num_bits(a) + 1) * log(2)
+   *     <= 3 * BN_num_bits(a) * 0.1001 + log(2) + 1     (rounding error)
+   *     <= BN_num_bits(a)/10 + BN_num_bits/1000 + 1 + 1
+   */
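+  /* Roughly: log10(2) is about 0.30103 while 3*(1/10 + 1/1000) = 0.303, so
+   * with i = 3*BN_num_bits(a) the sum i/10 + i/1000 slightly over-estimates
+   * the digit count; the two extra characters and the +3 in the allocation
+   * below cover integer-division rounding, the sign and the terminator. */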
+  i = BN_num_bits(a) * 3;
+  num = i / 10 + i / 1000 + 1 + 1;
+  bn_data =
+      (BN_ULONG *)OPENSSL_malloc((num / BN_DEC_NUM + 1) * sizeof(BN_ULONG));
+  buf = (char *)OPENSSL_malloc(num + 3);
+  if ((buf == NULL) || (bn_data == NULL)) {
+    OPENSSL_PUT_ERROR(BN, BN_bn2dec, ERR_R_MALLOC_FAILURE);
+    goto err;
+  }
+  t = BN_dup(a);
+  if (t == NULL) {
+    goto err;
+  }
+
+#define BUF_REMAIN (num + 3 - (size_t)(p - buf))
+  p = buf;
+  lp = bn_data;
+  if (BN_is_zero(t)) {
+    *(p++) = '0';
+    *(p++) = '\0';
+  } else {
+    if (BN_is_negative(t)) {
+      *p++ = '-';
+    }
+
+    i = 0;
+    while (!BN_is_zero(t)) {
+      *lp = BN_div_word(t, BN_DEC_CONV);
+      lp++;
+    }
+    lp--;
+    /* We now have a series of blocks, BN_DEC_NUM chars
+     * in length, where the last one needs truncation.
+     * The blocks need to be reversed in order. */
+    BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT1, *lp);
+    while (*p) {
+      p++;
+    }
+    while (lp != bn_data) {
+      lp--;
+      BIO_snprintf(p, BUF_REMAIN, BN_DEC_FMT2, *lp);
+      while (*p) {
+        p++;
+      }
+    }
+  }
+  ok = 1;
+
+err:
+  if (bn_data != NULL) {
+    OPENSSL_free(bn_data);
+  }
+  if (t != NULL) {
+    BN_free(t);
+  }
+  if (!ok && buf) {
+    OPENSSL_free(buf);
+    buf = NULL;
+  }
+
+  return buf;
+}
+
+int BN_dec2bn(BIGNUM **outp, const char *in) {
+  return bn_x2bn(outp, in, decode_dec, isdigit);
+}
+
+int BN_asc2bn(BIGNUM **outp, const char *in) {
+  const char *const orig_in = in;
+  if (*in == '-') {
+    in++;
+  }
+
+  if (in[0] == '0' && (in[1] == 'X' || in[1] == 'x')) {
+    if (!BN_hex2bn(outp, in+2)) {
+      return 0;
+    }
+  } else {
+    if (!BN_dec2bn(outp, in)) {
+      return 0;
+    }
+  }
+
+  if (*orig_in == '-') {
+    (*outp)->neg = 1;
+  }
+
+  return 1;
+}
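+
+/* Usage sketch: BN_asc2bn accepts an optional leading '-' and an optional
+ * "0x"/"0X" prefix, dispatching to BN_hex2bn or BN_dec2bn accordingly:
+ *
+ *   BIGNUM *n = NULL;
+ *   BN_asc2bn(&n, "-0x1f");   // n is now -31
+ *   BN_asc2bn(&n, "100");     // n is reused and is now 100 (decimal)
+ *   BN_free(n);
+ */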
+
+int BN_print(BIO *bp, const BIGNUM *a) {
+  int i, j, v, z = 0;
+  int ret = 0;
+
+  if (a->neg && BIO_write(bp, "-", 1) != 1) {
+    goto end;
+  }
+
+  if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1) {
+    goto end;
+  }
+
+  for (i = a->top - 1; i >= 0; i--) {
+    for (j = BN_BITS2 - 4; j >= 0; j -= 4) {
+      /* strip leading zeros */
+      v = ((int)(a->d[i] >> (long)j)) & 0x0f;
+      if (z || v != 0) {
+        if (BIO_write(bp, &hextable[v], 1) != 1) {
+          goto end;
+        }
+        z = 1;
+      }
+    }
+  }
+  ret = 1;
+
+end:
+  return ret;
+}
+
+int BN_print_fp(FILE *fp, const BIGNUM *a) {
+  BIO *b;
+  int ret;
+
+  b = BIO_new(BIO_s_file());
+  if (b == NULL) {
+    return 0;
+  }
+  BIO_set_fp(b, fp, BIO_NOCLOSE);
+  ret = BN_print(b, a);
+  BIO_free(b);
+
+  return ret;
+}
+
+BN_ULONG BN_get_word(const BIGNUM *bn) {
+  switch (bn->top) {
+    case 0:
+      return 0;
+    case 1:
+      return bn->d[0];
+    default:
+      return BN_MASK2;
+  }
+}
diff --git a/crypto/bn/ctx.c b/crypto/bn/ctx.c
new file mode 100644
index 0000000..a89f87d
--- /dev/null
+++ b/crypto/bn/ctx.c
@@ -0,0 +1,313 @@
+/* Written by Ulf Moeller for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2004 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+
+/* How many bignums are in each "pool item". */
+#define BN_CTX_POOL_SIZE 16
+/* The stack-frame info is resized as needed; this is the initial allocation
+ * size. */
+#define BN_CTX_START_FRAMES 32
+
+/* A bundle of bignums that can be linked with other bundles */
+typedef struct bignum_pool_item {
+  /* The bignum values */
+  BIGNUM vals[BN_CTX_POOL_SIZE];
+  /* Linked-list admin */
+  struct bignum_pool_item *prev, *next;
+} BN_POOL_ITEM;
+
+
+typedef struct bignum_pool {
+  /* Linked-list admin */
+  BN_POOL_ITEM *head, *current, *tail;
+  /* Stack depth and allocation size */
+  unsigned used, size;
+} BN_POOL;
+
+static void BN_POOL_init(BN_POOL *);
+static void BN_POOL_finish(BN_POOL *);
+static BIGNUM *BN_POOL_get(BN_POOL *);
+static void BN_POOL_release(BN_POOL *, unsigned int);
+
+/************/
+/* BN_STACK */
+/************/
+
+/* A wrapper to manage the "stack frames" */
+typedef struct bignum_ctx_stack {
+  /* Array of indexes into the bignum stack */
+  unsigned int *indexes;
+  /* Number of stack frames, and the size of the allocated array */
+  unsigned int depth, size;
+} BN_STACK;
+
+static void BN_STACK_init(BN_STACK *);
+static void BN_STACK_finish(BN_STACK *);
+static int BN_STACK_push(BN_STACK *, unsigned int);
+static unsigned int BN_STACK_pop(BN_STACK *);
+
+/**********/
+/* BN_CTX */
+/**********/
+
+/* The opaque BN_CTX type */
+struct bignum_ctx {
+  /* The bignum bundles */
+  BN_POOL pool;
+  /* The "stack frames", if you will */
+  BN_STACK stack;
+  /* The number of bignums currently assigned */
+  unsigned int used;
+  /* Depth of stack overflow */
+  int err_stack;
+  /* Block "gets" until an "end" (compatibility behaviour) */
+  int too_many;
+};
+
+BN_CTX *BN_CTX_new(void) {
+  BN_CTX *ret = OPENSSL_malloc(sizeof(BN_CTX));
+  if (!ret) {
+    OPENSSL_PUT_ERROR(BN, BN_CTX_new, ERR_R_MALLOC_FAILURE);
+    return NULL;
+  }
+
+  /* Initialise the structure */
+  BN_POOL_init(&ret->pool);
+  BN_STACK_init(&ret->stack);
+  ret->used = 0;
+  ret->err_stack = 0;
+  ret->too_many = 0;
+  return ret;
+}
+
+void BN_CTX_free(BN_CTX *ctx) {
+  if (ctx == NULL) {
+    return;
+  }
+
+  BN_STACK_finish(&ctx->stack);
+  BN_POOL_finish(&ctx->pool);
+  OPENSSL_free(ctx);
+}
+
+void BN_CTX_start(BN_CTX *ctx) {
+  /* If we're already overflowing ... */
+  if (ctx->err_stack || ctx->too_many) {
+    ctx->err_stack++;
+  } else if (!BN_STACK_push(&ctx->stack, ctx->used)) {
+    /* (Try to) get a new frame pointer */
+    OPENSSL_PUT_ERROR(BN, BN_CTX_start, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+    ctx->err_stack++;
+  }
+}
+
+BIGNUM *BN_CTX_get(BN_CTX *ctx) {
+  BIGNUM *ret;
+  if (ctx->err_stack || ctx->too_many) {
+    return NULL;
+  }
+
+  ret = BN_POOL_get(&ctx->pool);
+  if (ret == NULL) {
+    /* Setting too_many prevents repeated "get" attempts from
+     * cluttering the error stack. */
+    ctx->too_many = 1;
+    OPENSSL_PUT_ERROR(BN, BN_CTX_get, BN_R_TOO_MANY_TEMPORARY_VARIABLES);
+    return NULL;
+  }
+
+  /* OK, make sure the returned bignum is "zero" */
+  BN_zero(ret);
+  ctx->used++;
+  return ret;
+}
+
+void BN_CTX_end(BN_CTX *ctx) {
+  if (ctx->err_stack) {
+    ctx->err_stack--;
+  } else {
+    unsigned int fp = BN_STACK_pop(&ctx->stack);
+    /* Does this stack frame have anything to release? */
+    if (fp < ctx->used) {
+      BN_POOL_release(&ctx->pool, ctx->used - fp);
+    }
+
+    ctx->used = fp;
+    /* Unjam "too_many" in case "get" had failed */
+    ctx->too_many = 0;
+  }
+}
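+
+/* Typical calling pattern for the functions above (illustrative sketch):
+ *
+ *   BN_CTX_start(ctx);
+ *   BIGNUM *t1 = BN_CTX_get(ctx);
+ *   BIGNUM *t2 = BN_CTX_get(ctx);
+ *   if (t2 == NULL) {
+ *     goto err;   // a failed get is sticky, so checking the last one suffices
+ *   }
+ *   // use t1 and t2 here
+ * err:
+ *   BN_CTX_end(ctx);   // releases every BIGNUM handed out since _start
+ */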
+
+/************/
+/* BN_STACK */
+/************/
+
+static void BN_STACK_init(BN_STACK *st) {
+  st->indexes = NULL;
+  st->depth = st->size = 0;
+}
+
+static void BN_STACK_finish(BN_STACK *st) {
+  if (st->size)
+    OPENSSL_free(st->indexes);
+}
+
+static int BN_STACK_push(BN_STACK *st, unsigned int idx) {
+  if (st->depth == st->size) {
+    /* Need to expand. */
+    unsigned int newsize =
+        (st->size ? (st->size * 3 / 2) : BN_CTX_START_FRAMES);
+    unsigned int *newitems = OPENSSL_malloc(newsize * sizeof(unsigned int));
+    if (!newitems) {
+      return 0;
+    }
+    if (st->depth) {
+      memcpy(newitems, st->indexes, st->depth * sizeof(unsigned int));
+    }
+    if (st->size) {
+      OPENSSL_free(st->indexes);
+    }
+    st->indexes = newitems;
+    st->size = newsize;
+  }
+
+  st->indexes[(st->depth)++] = idx;
+  return 1;
+}
+
+static unsigned int BN_STACK_pop(BN_STACK *st) {
+  return st->indexes[--(st->depth)];
+}
+
+static void BN_POOL_init(BN_POOL *p) {
+  p->head = p->current = p->tail = NULL;
+  p->used = p->size = 0;
+}
+
+static void BN_POOL_finish(BN_POOL *p) {
+  while (p->head) {
+    unsigned int loop = 0;
+    BIGNUM *bn = p->head->vals;
+    while (loop++ < BN_CTX_POOL_SIZE) {
+      if (bn->d) {
+        BN_clear_free(bn);
+      }
+      bn++;
+    }
+
+    p->current = p->head->next;
+    OPENSSL_free(p->head);
+    p->head = p->current;
+  }
+}
+
+static BIGNUM *BN_POOL_get(BN_POOL *p) {
+  if (p->used == p->size) {
+    BIGNUM *bn;
+    unsigned int loop = 0;
+    BN_POOL_ITEM *item = OPENSSL_malloc(sizeof(BN_POOL_ITEM));
+    if (!item) {
+      return NULL;
+    }
+
+    /* Initialise the structure */
+    bn = item->vals;
+    while (loop++ < BN_CTX_POOL_SIZE) {
+      BN_init(bn++);
+    }
+
+    item->prev = p->tail;
+    item->next = NULL;
+    /* Link it in */
+    if (!p->head) {
+      p->head = p->current = p->tail = item;
+    } else {
+      p->tail->next = item;
+      p->tail = item;
+      p->current = item;
+    }
+
+    p->size += BN_CTX_POOL_SIZE;
+    p->used++;
+    /* Return the first bignum from the new pool */
+    return item->vals;
+  }
+
+  if (!p->used) {
+    p->current = p->head;
+  } else if ((p->used % BN_CTX_POOL_SIZE) == 0) {
+    p->current = p->current->next;
+  }
+
+  return p->current->vals + ((p->used++) % BN_CTX_POOL_SIZE);
+}
+
+static void BN_POOL_release(BN_POOL *p, unsigned int num) {
+  unsigned int offset = (p->used - 1) % BN_CTX_POOL_SIZE;
+  p->used -= num;
+
+  while (num--) {
+    if (!offset) {
+      offset = BN_CTX_POOL_SIZE - 1;
+      p->current = p->current->prev;
+    } else {
+      offset--;
+    }
+  }
+}
diff --git a/crypto/bn/div.c b/crypto/bn/div.c
new file mode 100644
index 0000000..5f92e9e
--- /dev/null
+++ b/crypto/bn/div.c
@@ -0,0 +1,619 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <limits.h>
+#include <openssl/err.h>
+
+#include "internal.h"
+
+
+#define asm __asm__
+
+#if !defined(OPENSSL_NO_ASM)
+# if defined(__GNUC__) && __GNUC__>=2
+#  if defined(OPENSSL_X86)
+   /*
+    * There were two reasons for implementing this template:
+    * - GNU C generates a call to a function (__udivdi3 to be exact)
+    *   in reply to ((((BN_ULLONG)n0)<<BN_BITS2)|n1)/d0 (I fail to
+    *   understand why...);
+    * - divl doesn't only calculate quotient, but also leaves
+    *   remainder in %edx which we can definitely use here:-)
+    *
+    *					<appro@fy.chalmers.se>
+    */
+#undef div_asm
+#  define div_asm(n0,n1,d0)		\
+	({  asm volatile (			\
+		"divl	%4"			\
+		: "=a"(q), "=d"(rem)		\
+		: "a"(n1), "d"(n0), "g"(d0)	\
+		: "cc");			\
+	    q;					\
+	})
+#  define REMAINDER_IS_ALREADY_CALCULATED
+#  elif defined(OPENSSL_X86_64)
+   /*
+    * Same story here, but it's 128-bit by 64-bit division. Wow!
+    *					<appro@fy.chalmers.se>
+    */
+#  undef div_asm
+#  define div_asm(n0,n1,d0)		\
+	({  asm volatile (			\
+		"divq	%4"			\
+		: "=a"(q), "=d"(rem)		\
+		: "a"(n1), "d"(n0), "g"(d0)	\
+		: "cc");			\
+	    q;					\
+	})
+#  define REMAINDER_IS_ALREADY_CALCULATED
+#  endif /* __<cpu> */
+# endif /* __GNUC__ */
+#endif /* OPENSSL_NO_ASM */
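+
+/* For example, on x86-64 (BN_BITS2 == 64) div_asm(n0, n1, d0) divides the
+ * 128-bit value (n0 << 64) | n1 by d0: the quotient is the value of the
+ * macro and the remainder is left in the local variable 'rem', so both 'q'
+ * and 'rem' must be in scope wherever the macro is expanded (see the main
+ * loop of BN_div below). */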
+
+/* BN_div computes  dv := num / divisor,  rounding towards
+ * zero, and sets up rm  such that  dv*divisor + rm = num  holds.
+ * Thus:
+ *     dv->neg == num->neg ^ divisor->neg  (unless the result is zero)
+ *     rm->neg == num->neg                 (unless the remainder is zero)
+ * If 'dv' or 'rm' is NULL, the respective value is not returned. */
+int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
+           BN_CTX *ctx) {
+  int norm_shift, i, loop;
+  BIGNUM *tmp, wnum, *snum, *sdiv, *res;
+  BN_ULONG *resp, *wnump;
+  BN_ULONG d0, d1;
+  int num_n, div_n;
+  int no_branch = 0;
+
+  /* Invalid zero-padding would have particularly bad consequences
+   * in the case of 'num'. */
+  if (num->top > 0 && num->d[num->top - 1] == 0) {
+    OPENSSL_PUT_ERROR(BN, BN_div, BN_R_NOT_INITIALIZED);
+    return 0;
+  }
+
+  if ((num->flags & BN_FLG_CONSTTIME) != 0 ||
+      (divisor->flags & BN_FLG_CONSTTIME) != 0) {
+    no_branch = 1;
+  }
+
+  if (BN_is_zero(divisor)) {
+    OPENSSL_PUT_ERROR(BN, BN_div, BN_R_DIV_BY_ZERO);
+    return 0;
+  }
+
+  if (!no_branch && BN_ucmp(num, divisor) < 0) {
+    if (rm != NULL) {
+      if (BN_copy(rm, num) == NULL) {
+        return 0;
+      }
+    }
+    if (dv != NULL) {
+      BN_zero(dv);
+    }
+    return 1;
+  }
+
+  BN_CTX_start(ctx);
+  tmp = BN_CTX_get(ctx);
+  snum = BN_CTX_get(ctx);
+  sdiv = BN_CTX_get(ctx);
+  if (dv == NULL) {
+    res = BN_CTX_get(ctx);
+  } else {
+    res = dv;
+  }
+  if (sdiv == NULL || res == NULL || tmp == NULL || snum == NULL) {
+    goto err;
+  }
+
+  /* First we normalise the numbers */
+  norm_shift = BN_BITS2 - ((BN_num_bits(divisor)) % BN_BITS2);
+  if (!(BN_lshift(sdiv, divisor, norm_shift))) {
+    goto err;
+  }
+  sdiv->neg = 0;
+  norm_shift += BN_BITS2;
+  if (!(BN_lshift(snum, num, norm_shift))) {
+    goto err;
+  }
+  snum->neg = 0;
+
+  if (no_branch) {
+    /* Since we don't know whether snum is larger than sdiv,
+     * we pad snum with enough zeroes without changing its
+     * value.
+     */
+    if (snum->top <= sdiv->top + 1) {
+      if (bn_wexpand(snum, sdiv->top + 2) == NULL) {
+        goto err;
+      }
+      for (i = snum->top; i < sdiv->top + 2; i++) {
+        snum->d[i] = 0;
+      }
+      snum->top = sdiv->top + 2;
+    } else {
+      if (bn_wexpand(snum, snum->top + 1) == NULL) {
+        goto err;
+      }
+      snum->d[snum->top] = 0;
+      snum->top++;
+    }
+  }
+
+  div_n = sdiv->top;
+  num_n = snum->top;
+  loop = num_n - div_n;
+  /* Let's set up a 'window' into snum.
+   * This is the part that corresponds to the current
+   * 'area' being divided */
+  wnum.neg = 0;
+  wnum.d = &(snum->d[loop]);
+  wnum.top = div_n;
+  /* only needed when BN_ucmp messes up the values between top and max */
+  wnum.dmax = snum->dmax - loop; /* so we don't step out of bounds */
+
+  /* Get the top 2 words of sdiv */
+  /* div_n=sdiv->top; */
+  d0 = sdiv->d[div_n - 1];
+  d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2];
+
+  /* pointer to the 'top' of snum */
+  wnump = &(snum->d[num_n - 1]);
+
+  /* Set up 'res'. */
+  res->neg = (num->neg ^ divisor->neg);
+  if (!bn_wexpand(res, (loop + 1))) {
+    goto err;
+  }
+  res->top = loop - no_branch;
+  resp = &(res->d[loop - 1]);
+
+  /* space for temp */
+  if (!bn_wexpand(tmp, (div_n + 1))) {
+    goto err;
+  }
+
+  if (!no_branch) {
+    if (BN_ucmp(&wnum, sdiv) >= 0) {
+      bn_sub_words(wnum.d, wnum.d, sdiv->d, div_n);
+      *resp = 1;
+    } else {
+      res->top--;
+    }
+  }
+
+  /* If res->top == 0 then clear the neg value; otherwise decrease
+   * the resp pointer. */
+  if (res->top == 0) {
+    res->neg = 0;
+  } else {
+    resp--;
+  }
+
+  for (i = 0; i < loop - 1; i++, wnump--, resp--) {
+    BN_ULONG q, l0;
+    /* the first part of the loop uses the top two words of snum and sdiv to
+     * calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv */
+    BN_ULONG n0, n1, rem = 0;
+
+    n0 = wnump[0];
+    n1 = wnump[-1];
+    if (n0 == d0) {
+      q = BN_MASK2;
+    } else {
+      /* n0 < d0 */
+#ifdef BN_LLONG
+      BN_ULLONG t2;
+
+#if defined(BN_LLONG) && defined(BN_DIV2W) && !defined(div_asm)
+      q = (BN_ULONG)(((((BN_ULLONG)n0) << BN_BITS2) | n1) / d0);
+#else
+      q = div_asm(n0, n1, d0);
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+      /* rem doesn't have to be BN_ULLONG; at the very least we know it's less
+       * than d0. */
+      rem = (n1 - q * d0) & BN_MASK2;
+#endif
+
+      t2 = (BN_ULLONG)d1 * q;
+
+      for (;;) {
+        if (t2 <= ((((BN_ULLONG)rem) << BN_BITS2) | wnump[-2]))
+          break;
+        q--;
+        rem += d0;
+        if (rem < d0)
+          break; /* don't let rem overflow */
+        t2 -= d1;
+      }
+#else /* !BN_LLONG */
+      BN_ULONG t2l, t2h;
+
+#if defined(div_asm)
+      q = div_asm(n0, n1, d0);
+#else
+      q = bn_div_words(n0, n1, d0);
+#endif
+
+#ifndef REMAINDER_IS_ALREADY_CALCULATED
+      rem = (n1 - q * d0) & BN_MASK2;
+#endif
+
+#if defined(BN_UMULT_LOHI)
+      BN_UMULT_LOHI(t2l, t2h, d1, q);
+#elif defined(BN_UMULT_HIGH)
+      t2l = d1 * q;
+      t2h = BN_UMULT_HIGH(d1, q);
+#else
+      {
+        BN_ULONG ql, qh;
+        t2l = LBITS(d1);
+        t2h = HBITS(d1);
+        ql = LBITS(q);
+        qh = HBITS(q);
+        mul64(t2l, t2h, ql, qh); /* t2=(BN_ULLONG)d1*q; */
+      }
+#endif
+
+      for (;;) {
+        if ((t2h < rem) || ((t2h == rem) && (t2l <= wnump[-2])))
+          break;
+        q--;
+        rem += d0;
+        if (rem < d0)
+          break; /* don't let rem overflow */
+        if (t2l < d1)
+          t2h--;
+        t2l -= d1;
+      }
+#endif /* !BN_LLONG */
+    }
+
+    l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q);
+    tmp->d[div_n] = l0;
+    wnum.d--;
+    /* ignore the top values of the bignums; just subtract the two
+     * BN_ULONG arrays with bn_sub_words */
+    if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) {
+      /* Note: As we have considered only the leading
+       * two BN_ULONGs in the calculation of q, sdiv * q
+       * might be greater than wnum (but then (q-1) * sdiv
+       * is less than or equal to wnum)
+       */
+      q--;
+      if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) {
+        /* we can't have an overflow here (assuming
+         * that q != 0, but if q == 0 then tmp is
+         * zero anyway) */
+        (*wnump)++;
+      }
+    }
+    /* store part of the result */
+    *resp = q;
+  }
+  bn_correct_top(snum);
+  if (rm != NULL) {
+    /* Keep a copy of the neg flag in num because if rm==num
+     * BN_rshift() will overwrite it.
+     */
+    int neg = num->neg;
+    BN_rshift(rm, snum, norm_shift);
+    if (!BN_is_zero(rm)) {
+      rm->neg = neg;
+    }
+  }
+  if (no_branch) {
+    bn_correct_top(res);
+  }
+  BN_CTX_end(ctx);
+  return 1;
+
+err:
+  BN_CTX_end(ctx);
+  return 0;
+}
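+
+/* A minimal sketch of the contract documented above BN_div: for any valid
+ * inputs, dv * divisor + rm must reconstruct num. The helper below is
+ * illustrative only (its name is not part of the public API); it assumes a
+ * caller-provided BN_CTX and treats allocation failures as a mismatch. */
+static int bn_div_contract_holds(const BIGNUM *num, const BIGNUM *divisor,
+                                 BN_CTX *ctx) {
+  int ok = 0;
+  BIGNUM *dv = BN_new(), *rm = BN_new(), *check = BN_new();
+  if (dv != NULL && rm != NULL && check != NULL &&
+      BN_div(dv, rm, num, divisor, ctx) &&
+      BN_mul(check, dv, divisor, ctx) &&
+      BN_add(check, check, rm)) {
+    ok = (BN_cmp(check, num) == 0); /* dv*divisor + rm == num */
+  }
+  BN_free(dv);
+  BN_free(rm);
+  BN_free(check);
+  return ok;
+}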
+
+int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) {
+  if (!(BN_mod(r, m, d, ctx))) {
+    return 0;
+  }
+  if (!r->neg) {
+    return 1;
+  }
+
+  /* now -|d| < r < 0, so we have to set r := r + |d|. */
+  return (d->neg ? BN_sub : BN_add)(r, r, d);
+}
+
+int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx) {
+  if (!BN_add(r, a, b)) {
+    return 0;
+  }
+  return BN_nnmod(r, r, m, ctx);
+}
+
+int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m) {
+  if (!BN_uadd(r, a, b)) {
+    return 0;
+  }
+  if (BN_ucmp(r, m) >= 0) {
+    return BN_usub(r, r, m);
+  }
+  return 1;
+}
+
+int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx) {
+  if (!BN_sub(r, a, b)) {
+    return 0;
+  }
+  return BN_nnmod(r, r, m, ctx);
+}
+
+/* BN_mod_sub variant that may be used if both  a  and  b  are non-negative
+ * and less than  m */
+int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                     const BIGNUM *m) {
+  if (!BN_sub(r, a, b)) {
+    return 0;
+  }
+  if (r->neg) {
+    return BN_add(r, r, m);
+  }
+  return 1;
+}
+
+int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m,
+               BN_CTX *ctx) {
+  BIGNUM *t;
+  int ret = 0;
+
+  BN_CTX_start(ctx);
+  t = BN_CTX_get(ctx);
+  if (t == NULL) {
+    goto err;
+  }
+
+  if (a == b) {
+    if (!BN_sqr(t, a, ctx)) {
+      goto err;
+    }
+  } else {
+    if (!BN_mul(t, a, b, ctx)) {
+      goto err;
+    }
+  }
+
+  if (!BN_nnmod(r, t, m, ctx)) {
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
+  if (!BN_sqr(r, a, ctx)) {
+    return 0;
+  }
+
+  /* r->neg == 0,  thus we don't need BN_nnmod */
+  return BN_mod(r, r, m, ctx);
+}
+
+int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m,
+                  BN_CTX *ctx) {
+  BIGNUM *abs_m = NULL;
+  int ret;
+
+  if (!BN_nnmod(r, a, m, ctx)) {
+    return 0;
+  }
+
+  if (m->neg) {
+    abs_m = BN_dup(m);
+    if (abs_m == NULL) {
+      return 0;
+    }
+    abs_m->neg = 0;
+  }
+
+  ret = BN_mod_lshift_quick(r, r, n, (abs_m ? abs_m : m));
+
+  if (abs_m) {
+    BN_free(abs_m);
+  }
+  return ret;
+}
+
+int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) {
+  if (r != a) {
+    if (BN_copy(r, a) == NULL) {
+      return 0;
+    }
+  }
+
+  while (n > 0) {
+    int max_shift;
+
+    /* 0 < r < m */
+    max_shift = BN_num_bits(m) - BN_num_bits(r);
+    /* max_shift >= 0 */
+
+    if (max_shift < 0) {
+      OPENSSL_PUT_ERROR(BN, BN_mod_lshift_quick, BN_R_INPUT_NOT_REDUCED);
+      return 0;
+    }
+
+    if (max_shift > n) {
+      max_shift = n;
+    }
+
+    if (max_shift) {
+      if (!BN_lshift(r, r, max_shift)) {
+        return 0;
+      }
+      n -= max_shift;
+    } else {
+      if (!BN_lshift1(r, r)) {
+        return 0;
+      }
+      --n;
+    }
+
+    /* BN_num_bits(r) <= BN_num_bits(m) */
+    if (BN_cmp(r, m) >= 0) {
+      if (!BN_sub(r, r, m)) {
+        return 0;
+      }
+    }
+  }
+
+  return 1;
+}
+
+int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) {
+  if (!BN_lshift1(r, a)) {
+    return 0;
+  }
+
+  return BN_nnmod(r, r, m, ctx);
+}
+
+int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) {
+  if (!BN_lshift1(r, a)) {
+    return 0;
+  }
+  if (BN_cmp(r, m) >= 0) {
+    return BN_sub(r, r, m);
+  }
+
+  return 1;
+}
+
+BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) {
+  BN_ULONG ret = 0;
+  int i, j;
+
+  w &= BN_MASK2;
+
+  if (!w) {
+    /* actually this is an error (division by zero) */
+    return (BN_ULONG) - 1;
+  }
+
+  if (a->top == 0) {
+    return 0;
+  }
+
+  /* normalize input (so bn_div_words doesn't complain) */
+  j = BN_BITS2 - BN_num_bits_word(w);
+  w <<= j;
+  if (!BN_lshift(a, a, j)) {
+    return (BN_ULONG) - 1;
+  }
+
+  for (i = a->top - 1; i >= 0; i--) {
+    BN_ULONG l, d;
+
+    l = a->d[i];
+    d = bn_div_words(ret, l, w);
+    ret = (l - ((d * w) & BN_MASK2)) & BN_MASK2;
+    a->d[i] = d;
+  }
+
+  if ((a->top > 0) && (a->d[a->top - 1] == 0)) {
+    a->top--;
+  }
+
+  ret >>= j;
+  return ret;
+}
+
+BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) {
+#ifndef BN_LLONG
+  BN_ULONG ret = 0;
+#else
+  BN_ULLONG ret = 0;
+#endif
+  int i;
+
+  if (w == 0) {
+    return (BN_ULONG) -1;
+  }
+
+  w &= BN_MASK2;
+  for (i = a->top - 1; i >= 0; i--) {
+#ifndef BN_LLONG
+    ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w;
+    ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w;
+#else
+    ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w);
+#endif
+  }
+  return (BN_ULONG)ret;
+}
diff --git a/crypto/bn/exponentiation.c b/crypto/bn/exponentiation.c
new file mode 100644
index 0000000..b51f15f
--- /dev/null
+++ b/crypto/bn/exponentiation.c
@@ -0,0 +1,1500 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2005 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/cpu.h>
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+#define OPENSSL_BN_ASM_MONT5
+#define RSAZ_ENABLED
+
+#include "rsaz_exp.h"
+#endif
+
+int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
+  int i, bits, ret = 0;
+  BIGNUM *v, *rr;
+
+  if ((p->flags & BN_FLG_CONSTTIME) != 0) {
+    /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+    OPENSSL_PUT_ERROR(BN, BN_exp, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+
+  BN_CTX_start(ctx);
+  if (r == a || r == p) {
+    rr = BN_CTX_get(ctx);
+  } else {
+    rr = r;
+  }
+
+  v = BN_CTX_get(ctx);
+  if (rr == NULL || v == NULL) {
+    goto err;
+  }
+
+  if (BN_copy(v, a) == NULL) {
+    goto err;
+  }
+  bits = BN_num_bits(p);
+
+  if (BN_is_odd(p)) {
+    if (BN_copy(rr, a) == NULL) {
+      goto err;
+    }
+  } else {
+    if (!BN_one(rr)) {
+      goto err;
+    }
+  }
+
+  for (i = 1; i < bits; i++) {
+    if (!BN_sqr(v, v, ctx)) {
+      goto err;
+    }
+    if (BN_is_bit_set(p, i)) {
+      if (!BN_mul(rr, rr, v, ctx)) {
+        goto err;
+      }
+    }
+  }
+  ret = 1;
+
+err:
+  if (r != rr) {
+    BN_copy(r, rr);
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+/* maximum precomputation table size for *variable* sliding windows */
+#define TABLE_SIZE 32
+
+typedef struct bn_recp_ctx_st {
+  BIGNUM N;  /* the divisor */
+  BIGNUM Nr; /* the reciprocal */
+  int num_bits;
+  int shift;
+  int flags;
+} BN_RECP_CTX;
+
+static void BN_RECP_CTX_init(BN_RECP_CTX *recp) {
+  BN_init(&recp->N);
+  BN_init(&recp->Nr);
+  recp->num_bits = 0;
+  recp->flags = 0;
+}
+
+static void BN_RECP_CTX_free(BN_RECP_CTX *recp) {
+  if (recp == NULL) {
+    return;
+  }
+
+  BN_free(&recp->N);
+  BN_free(&recp->Nr);
+}
+
+static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) {
+  if (!BN_copy(&(recp->N), d)) {
+    return 0;
+  }
+  BN_zero(&recp->Nr);
+  recp->num_bits = BN_num_bits(d);
+  recp->shift = 0;
+
+  return 1;
+}
+
+/* len is the expected size of the result. We actually calculate with an extra
+ * word of precision, so we can do faster division if the remainder is not
+ * required.
+ * r := 2^len / m */
+static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) {
+  int ret = -1;
+  BIGNUM *t;
+
+  BN_CTX_start(ctx);
+  t = BN_CTX_get(ctx);
+  if (t == NULL) {
+    goto err;
+  }
+
+  if (!BN_set_bit(t, len)) {
+    goto err;
+  }
+
+  if (!BN_div(r, NULL, t, m, ctx)) {
+    goto err;
+  }
+
+  ret = len;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
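+
+/* Worked example for the comment above: with m = 1000 and len = 20 (roughly
+ * 2*BN_num_bits(m)), BN_reciprocal computes r = floor(2^20 / 1000) = 1048 and
+ * returns len = 20. */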
+
+static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m,
+                       BN_RECP_CTX *recp, BN_CTX *ctx) {
+  int i, j, ret = 0;
+  BIGNUM *a, *b, *d, *r;
+
+  BN_CTX_start(ctx);
+  a = BN_CTX_get(ctx);
+  b = BN_CTX_get(ctx);
+  if (dv != NULL) {
+    d = dv;
+  } else {
+    d = BN_CTX_get(ctx);
+  }
+
+  if (rem != NULL) {
+    r = rem;
+  } else {
+    r = BN_CTX_get(ctx);
+  }
+
+  if (a == NULL || b == NULL || d == NULL || r == NULL) {
+    goto err;
+  }
+
+  if (BN_ucmp(m, &(recp->N)) < 0) {
+    BN_zero(d);
+    if (!BN_copy(r, m)) {
+      return 0;
+    }
+    BN_CTX_end(ctx);
+    return 1;
+  }
+
+  /* We want the remainder.
+   * Given input of ABCDEF / ab,
+   * we need to multiply ABCDEF by 3 digits of the reciprocal of ab. */
+
+  /* i := max(BN_num_bits(m), 2*BN_num_bits(N)) */
+  i = BN_num_bits(m);
+  j = recp->num_bits << 1;
+  if (j > i) {
+    i = j;
+  }
+
+  /* Nr := round(2^i / N) */
+  if (i != recp->shift) {
+    recp->shift =
+        BN_reciprocal(&(recp->Nr), &(recp->N), i,
+                      ctx); /* BN_reciprocal returns i, or -1 for an error */
+  }
+
+  if (recp->shift == -1) {
+    goto err;
+  }
+
+  /* d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - BN_num_bits(N)))|
+   *    = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - BN_num_bits(N)))|
+   *   <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)|
+   *    = |m/N| */
+  if (!BN_rshift(a, m, recp->num_bits)) {
+    goto err;
+  }
+  if (!BN_mul(b, a, &(recp->Nr), ctx)) {
+    goto err;
+  }
+  if (!BN_rshift(d, b, i - recp->num_bits)) {
+    goto err;
+  }
+  d->neg = 0;
+
+  if (!BN_mul(b, &(recp->N), d, ctx)) {
+    goto err;
+  }
+  if (!BN_usub(r, m, b)) {
+    goto err;
+  }
+  r->neg = 0;
+
+  j = 0;
+  while (BN_ucmp(r, &(recp->N)) >= 0) {
+    if (j++ > 2) {
+      OPENSSL_PUT_ERROR(BN, BN_div_recp, BN_R_BAD_RECIPROCAL);
+      goto err;
+    }
+    if (!BN_usub(r, r, &(recp->N))) {
+      goto err;
+    }
+    if (!BN_add_word(d, 1)) {
+      goto err;
+    }
+  }
+
+  r->neg = BN_is_zero(r) ? 0 : m->neg;
+  d->neg = m->neg ^ recp->N.neg;
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y,
+                                 BN_RECP_CTX *recp, BN_CTX *ctx) {
+  int ret = 0;
+  BIGNUM *a;
+  const BIGNUM *ca;
+
+  BN_CTX_start(ctx);
+  a = BN_CTX_get(ctx);
+  if (a == NULL) {
+    goto err;
+  }
+
+  if (y != NULL) {
+    if (x == y) {
+      if (!BN_sqr(a, x, ctx)) {
+        goto err;
+      }
+    } else {
+      if (!BN_mul(a, x, y, ctx)) {
+        goto err;
+      }
+    }
+    ca = a;
+  } else {
+    ca = x; /* Just do the mod */
+  }
+
+  ret = BN_div_recp(NULL, r, ca, recp, ctx);
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+/* BN_window_bits_for_exponent_size -- macro for sliding window mod_exp
+ * functions
+ *
+ * For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of
+ * multiplications is a constant plus on average
+ *
+ *    2^(w-1) + (b-w)/(w+1);
+ *
+ * here 2^(w-1)  is for precomputing the table (we actually need entries only
+ * for windows that have the lowest bit set), and (b-w)/(w+1)  is an
+ * approximation for the expected number of w-bit windows, not counting the
+ * first one.
+ *
+ * Thus we should use
+ *
+ *    w >= 6  if        b > 671
+ *     w = 5  if  671 > b > 239
+ *     w = 4  if  239 > b >  79
+ *     w = 3  if   79 > b >  23
+ *    w <= 2  if   23 > b
+ *
+ * (with draws in between).  Very small exponents are often selected
+ * with low Hamming weight, so we use  w = 1  for b <= 23. */
+#define BN_window_bits_for_exponent_size(b) \
+		((b) > 671 ? 6 : \
+		 (b) > 239 ? 5 : \
+		 (b) >  79 ? 4 : \
+		 (b) >  23 ? 3 : 1)
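+
+/* Worked example: for a b = 1024 bit exponent the estimate above gives
+ * roughly 8 + 1020/5 = 212 multiplications for w = 4, 16 + 1019/6 = 186 for
+ * w = 5 and 32 + 1018/7 = 177 for w = 6, so the macro selects w = 6
+ * (1024 > 671). */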
+
+static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p,
+                        const BIGNUM *m, BN_CTX *ctx) {
+  int i, j, bits, ret = 0, wstart, wend, window, wvalue;
+  int start = 1;
+  BIGNUM *aa;
+  /* Table of variables obtained from 'ctx' */
+  BIGNUM *val[TABLE_SIZE];
+  BN_RECP_CTX recp;
+
+  if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+    /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+    OPENSSL_PUT_ERROR(BN, mod_exp_recp, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+
+  bits = BN_num_bits(p);
+
+  if (bits == 0) {
+    ret = BN_one(r);
+    return ret;
+  }
+
+  BN_CTX_start(ctx);
+  aa = BN_CTX_get(ctx);
+  val[0] = BN_CTX_get(ctx);
+  if (!aa || !val[0]) {
+    goto err;
+  }
+
+  BN_RECP_CTX_init(&recp);
+  if (m->neg) {
+    /* ignore sign of 'm' */
+    if (!BN_copy(aa, m)) {
+      goto err;
+    }
+    aa->neg = 0;
+    if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) {
+      goto err;
+    }
+  } else {
+    if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) {
+      goto err;
+    }
+  }
+
+  if (!BN_nnmod(val[0], a, m, ctx)) {
+    goto err; /* 1 */
+  }
+  if (BN_is_zero(val[0])) {
+    BN_zero(r);
+    ret = 1;
+    goto err;
+  }
+
+  window = BN_window_bits_for_exponent_size(bits);
+  if (window > 1) {
+    if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) {
+      goto err; /* 2 */
+    }
+    j = 1 << (window - 1);
+    for (i = 1; i < j; i++) {
+      if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
+          !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  start = 1; /* This is used to avoid multiplication etc
+              * when there is only the value '1' in the
+              * buffer. */
+  wvalue = 0;        /* The 'value' of the window */
+  wstart = bits - 1; /* The top bit of the window */
+  wend = 0;          /* The bottom bit of the window */
+
+  if (!BN_one(r)) {
+    goto err;
+  }
+
+  for (;;) {
+    if (BN_is_bit_set(p, wstart) == 0) {
+      if (!start) {
+        if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
+          goto err;
+        }
+      }
+      if (wstart == 0) {
+        break;
+      }
+      wstart--;
+      continue;
+    }
+
+    /* We now have wstart on a 'set' bit; we now need to work out
+     * how big a window to do.  To do this we need to scan
+     * forward until the last set bit before the end of the
+     * window. */
+    j = wstart;
+    wvalue = 1;
+    wend = 0;
+    for (i = 1; i < window; i++) {
+      if (wstart - i < 0) {
+        break;
+      }
+      if (BN_is_bit_set(p, wstart - i)) {
+        wvalue <<= (i - wend);
+        wvalue |= 1;
+        wend = i;
+      }
+    }
+
+    /* wend is the size of the current window */
+    j = wend + 1;
+    /* add the 'bytes above' */
+    if (!start) {
+      for (i = 0; i < j; i++) {
+        if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) {
+          goto err;
+        }
+      }
+    }
+
+    /* wvalue will be an odd number < 2^window */
+    if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) {
+      goto err;
+    }
+
+    /* move the 'window' down further */
+    wstart -= wend + 1;
+    wvalue = 0;
+    start = 0;
+    if (wstart < 0) {
+      break;
+    }
+  }
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  BN_RECP_CTX_free(&recp);
+  return ret;
+}
+
+int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m,
+               BN_CTX *ctx) {
+  /* For even modulus  m = 2^k*m_odd,  it might make sense to compute
+   * a^p mod m_odd  and  a^p mod 2^k  separately (with Montgomery
+   * exponentiation for the odd part), using appropriate exponent
+   * reductions, and combine the results using the CRT.
+   *
+   * For now, we use Montgomery only if the modulus is odd; otherwise,
+   * exponentiation using the reciprocal-based quick remaindering
+   * algorithm is used.
+   *
+   * (Timing obtained with expspeed.c [computations  a^p mod m
+   * where  a, p, m  are of the same length: 256, 512, 1024, 2048,
+   * 4096, 8192 bits], compared to the running time of the
+   * standard algorithm:
+   *
+   *   BN_mod_exp_mont   33 .. 40 %  [AMD K6-2, Linux, debug configuration]
+   *                     55 .. 77 %  [UltraSparc processor, but
+   *                                  debug-solaris-sparcv8-gcc conf.]
+   *
+   *   BN_mod_exp_recp   50 .. 70 %  [AMD K6-2, Linux, debug configuration]
+   *                     62 .. 118 % [UltraSparc, debug-solaris-sparcv8-gcc]
+   *
+   * On the Sparc, BN_mod_exp_recp was faster than BN_mod_exp_mont
+   * at 2048 and more bits, but at 512 and 1024 bits, it was
+   * slower even than the standard algorithm!
+   *
+   * "Real" timings [linux-elf, solaris-sparcv9-gcc configurations]
+   * should be obtained when the new Montgomery reduction code
+   * has been integrated into OpenSSL.) */
+
+  if (BN_is_odd(m)) {
+    if (a->top == 1 && !a->neg && BN_get_flags(p, BN_FLG_CONSTTIME) == 0) {
+      BN_ULONG A = a->d[0];
+      return BN_mod_exp_mont_word(r, A, p, m, ctx, NULL);
+    }
+
+    return BN_mod_exp_mont(r, a, p, m, ctx, NULL);
+  }
+
+  return mod_exp_recp(r, a, p, m, ctx);
+}
+
+int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+                    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+  int i, j, bits, ret = 0, wstart, wend, window, wvalue;
+  int start = 1;
+  BIGNUM *d, *r;
+  const BIGNUM *aa;
+  /* Table of variables obtained from 'ctx' */
+  BIGNUM *val[TABLE_SIZE];
+  BN_MONT_CTX *mont = NULL;
+
+  if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+    return BN_mod_exp_mont_consttime(rr, a, p, m, ctx, in_mont);
+  }
+
+  if (!BN_is_odd(m)) {
+    OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont, BN_R_CALLED_WITH_EVEN_MODULUS);
+    return 0;
+  }
+  bits = BN_num_bits(p);
+  if (bits == 0) {
+    ret = BN_one(rr);
+    return ret;
+  }
+
+  BN_CTX_start(ctx);
+  d = BN_CTX_get(ctx);
+  r = BN_CTX_get(ctx);
+  val[0] = BN_CTX_get(ctx);
+  if (!d || !r || !val[0]) {
+    goto err;
+  }
+
+  /* Allocate a Montgomery context if it was not supplied by the caller.
+   * If this is not done, things will break in the Montgomery part. */
+
+  if (in_mont != NULL) {
+    mont = in_mont;
+  } else {
+    mont = BN_MONT_CTX_new();
+    if (mont == NULL) {
+      goto err;
+    }
+    if (!BN_MONT_CTX_set(mont, m, ctx)) {
+      goto err;
+    }
+  }
+
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    if (!BN_nnmod(val[0], a, m, ctx)) {
+      goto err;
+    }
+    aa = val[0];
+  } else {
+    aa = a;
+  }
+
+  if (BN_is_zero(aa)) {
+    BN_zero(rr);
+    ret = 1;
+    goto err;
+  }
+  if (!BN_to_montgomery(val[0], aa, mont, ctx)) {
+    goto err; /* 1 */
+  }
+
+  window = BN_window_bits_for_exponent_size(bits);
+  if (window > 1) {
+    if (!BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) {
+      goto err; /* 2 */
+    }
+    j = 1 << (window - 1);
+    for (i = 1; i < j; i++) {
+      if (((val[i] = BN_CTX_get(ctx)) == NULL) ||
+          !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  start = 1; /* This is used to avoid multiplication etc
+              * when there is only the value '1' in the
+              * buffer. */
+  wvalue = 0;        /* The 'value' of the window */
+  wstart = bits - 1; /* The top bit of the window */
+  wend = 0;          /* The bottom bit of the window */
+
+  j = m->top; /* borrow j */
+  if (m->d[j - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+    if (bn_wexpand(r, j) == NULL)
+      goto err;
+    /* 2^(top*BN_BITS2) - m */
+    r->d[0] = (0 - m->d[0]) & BN_MASK2;
+    for (i = 1; i < j; i++)
+      r->d[i] = (~m->d[i]) & BN_MASK2;
+    r->top = j;
+  } else if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+    goto err;
+  }
+
+  for (;;) {
+    if (BN_is_bit_set(p, wstart) == 0) {
+      if (!start) {
+        if (!BN_mod_mul_montgomery(r, r, r, mont, ctx))
+          goto err;
+      }
+      if (wstart == 0) {
+        break;
+      }
+      wstart--;
+      continue;
+    }
+
+    /* We now have wstart on a 'set' bit; we now need to work out how big a
+     * window to do.  To do this we need to scan forward until the last set bit
+     * before the end of the window. */
+    j = wstart;
+    wvalue = 1;
+    wend = 0;
+    for (i = 1; i < window; i++) {
+      if (wstart - i < 0) {
+        break;
+      }
+      if (BN_is_bit_set(p, wstart - i)) {
+        wvalue <<= (i - wend);
+        wvalue |= 1;
+        wend = i;
+      }
+    }
+
+    /* wend is the size of the current window */
+    j = wend + 1;
+    /* add the 'bytes above' */
+    if (!start) {
+      for (i = 0; i < j; i++) {
+        if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+          goto err;
+        }
+      }
+    }
+
+    /* wvalue will be an odd number < 2^window */
+    if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) {
+      goto err;
+    }
+
+    /* move the 'window' down further */
+    wstart -= wend + 1;
+    wvalue = 0;
+    start = 0;
+    if (wstart < 0) {
+      break;
+    }
+  }
+
+  if (!BN_from_montgomery(rr, r, mont, ctx)) {
+    goto err;
+  }
+  ret = 1;
+
+err:
+  if (in_mont == NULL && mont != NULL) {
+    BN_MONT_CTX_free(mont);
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+/* BN_mod_exp_mont_consttime() stores the precomputed powers in a specific
+ * layout so that accessing any of these table values shows the same access
+ * pattern as far as cache lines are concerned. The following functions are
+ * used to transfer a BIGNUM from/to that table. */
+static int copy_to_prebuf(const BIGNUM *b, int top, unsigned char *buf, int idx,
+                          int width) {
+  size_t i, j;
+
+  if (top > b->top) {
+    top = b->top; /* this works because 'buf' is explicitly zeroed */
+  }
+  for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
+    buf[j] = ((unsigned char *)b->d)[i];
+  }
+
+  return 1;
+}
+
+static int copy_from_prebuf(BIGNUM *b, int top, unsigned char *buf, int idx,
+                            int width) {
+  size_t i, j;
+
+  if (bn_wexpand(b, top) == NULL) {
+    return 0;
+  }
+
+  for (i = 0, j = idx; i < top * sizeof b->d[0]; i++, j += width) {
+    ((unsigned char *)b->d)[i] = buf[j];
+  }
+
+  b->top = top;
+  bn_correct_top(b);
+  return 1;
+}
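+
+/* Layout used by the two helpers above: byte i of table entry idx is stored
+ * at buf[idx + i * width], where width is the number of table entries. For a
+ * fixed i, the bytes of all entries therefore occupy one contiguous run of
+ * width bytes, so fetching any single entry touches the same set of cache
+ * lines regardless of idx. */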
+
+/* BN_mod_exp_mont_consttime is based on the assumption that the L1 data cache
+ * line width of the target processor is at least the following value. */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH (64)
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \
+  (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for fixed window size modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mod_exp_mont_consttime, the maximum
+ * size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
+ *
+ * Window size thresholds are defined for cache line sizes of 32 and 64 bytes,
+ * for which log_2(32)=5 and log_2(64)=6 respectively. A window size of 7
+ * should only be used on processors with a cache line size of 128 bytes or
+ * greater. */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+#define BN_window_bits_for_ctime_exponent_size(b) \
+  ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
+#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+#define BN_window_bits_for_ctime_exponent_size(b) \
+  ((b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1)
+#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
+
+#endif
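+
+/* Worked example: with a 64-byte cache line the cap is log_2(64) = 6, and a
+ * 1024-bit exponent (bits > 937) selects window = 6, i.e. a 64-entry table.
+ * In the copy_to_prebuf layout each per-byte run of entries is then exactly
+ * 64 bytes and, since powerbuf is cache-line aligned, fills one line, so the
+ * lines touched are independent of which entry is read. */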
+
+/* Given a pointer value, compute the next address that is a cache line
+ * multiple. */
+#define MOD_EXP_CTIME_ALIGN(x_)          \
+  ((unsigned char *)(x_) +               \
+   (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - \
+    (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
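+
+/* For example, with a 64-byte line: if ((size_t)x_ & 63) == 40 the macro adds
+ * 24 bytes, landing on the next 64-byte boundary; if x_ is already aligned it
+ * still adds a full 64 bytes, which is why the allocations below reserve
+ * MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH extra bytes. */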
+
+/* This variant of BN_mod_exp_mont() uses fixed windows and the special
+ * precomputation memory layout to limit data-dependency to a minimum
+ * to protect secret exponents (cf. the hyper-threading timing attacks
+ * pointed out by Colin Percival,
+ * http://www.daemonology.net/hyperthreading-considered-harmful/)
+ */
+int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
+                              const BIGNUM *m, BN_CTX *ctx,
+                              BN_MONT_CTX *in_mont) {
+  int i, bits, ret = 0, window, wvalue;
+  int top;
+  BN_MONT_CTX *mont = NULL;
+
+  int numPowers;
+  unsigned char *powerbufFree = NULL;
+  int powerbufLen = 0;
+  unsigned char *powerbuf = NULL;
+  BIGNUM tmp, am;
+
+  top = m->top;
+
+  if (!(m->d[0] & 1)) {
+    OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_consttime,
+                      BN_R_CALLED_WITH_EVEN_MODULUS);
+    return 0;
+  }
+  bits = BN_num_bits(p);
+  if (bits == 0) {
+    ret = BN_one(rr);
+    return ret;
+  }
+
+  BN_CTX_start(ctx);
+
+  /* Allocate a montgomery context if it was not supplied by the caller.
+   * If this is not done, things will break in the montgomery part.
+   */
+  if (in_mont != NULL)
+    mont = in_mont;
+  else {
+    if ((mont = BN_MONT_CTX_new()) == NULL)
+      goto err;
+    if (!BN_MONT_CTX_set(mont, m, ctx))
+      goto err;
+  }
+
+#ifdef RSAZ_ENABLED
+  /* If the size of the operands allows it, perform the optimized
+   * RSAZ exponentiation. For further information see
+   * crypto/bn/rsaz_exp.c and accompanying assembly modules. */
+  if (((OPENSSL_ia32cap_P[2] & 0x80100) != 0x80100) /* check for MULX/AD*X */
+      && (16 == a->top) && (16 == p->top) && (BN_num_bits(m) == 1024) &&
+      rsaz_avx2_eligible()) {
+    if (NULL == bn_wexpand(rr, 16))
+      goto err;
+    RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0]);
+    rr->top = 16;
+    rr->neg = 0;
+    bn_correct_top(rr);
+    ret = 1;
+    goto err;
+  } else if ((8 == a->top) && (8 == p->top) && (BN_num_bits(m) == 512)) {
+    if (NULL == bn_wexpand(rr, 8))
+      goto err;
+    RSAZ_512_mod_exp(rr->d, a->d, p->d, m->d, mont->n0[0], mont->RR.d);
+    rr->top = 8;
+    rr->neg = 0;
+    bn_correct_top(rr);
+    ret = 1;
+    goto err;
+  }
+#endif
+
+  /* Get the window size to use with size of p. */
+  window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+  if (window >= 5) {
+    window = 5; /* ~5% improvement for RSA2048 sign, and even for RSA4096 */
+    if ((top & 7) == 0)
+      powerbufLen += 2 * top * sizeof(m->d[0]);
+  }
+#endif
+  (void)0;
+
+  /* Allocate a buffer large enough to hold all of the pre-computed
+   * powers of am, am itself and tmp.
+   */
+  numPowers = 1 << window;
+  powerbufLen +=
+      sizeof(m->d[0]) *
+      (top * numPowers + ((2 * top) > numPowers ? (2 * top) : numPowers));
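+  /* Worked example: for a 1024-bit modulus with 64-bit words (top = 16) and
+   * window = 5, numPowers = 32, so this statement adds 8 * (16*32 + 32) =
+   * 4352 bytes (plus the 2*top words reserved above on the MONT5 path);
+   * being over 3072, the buffer below is heap-allocated rather than
+   * stack-allocated. */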
+#ifdef alloca
+  if (powerbufLen < 3072)
+    powerbufFree = alloca(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+  else
+#endif
+      if ((powerbufFree = (unsigned char *)OPENSSL_malloc(
+               powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
+    goto err;
+
+  powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
+  memset(powerbuf, 0, powerbufLen);
+
+#ifdef alloca
+  if (powerbufLen < 3072)
+    powerbufFree = NULL;
+#endif
+
+  /* lay down tmp and am right after powers table */
+  tmp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0]) * top * numPowers);
+  am.d = tmp.d + top;
+  tmp.top = am.top = 0;
+  tmp.dmax = am.dmax = top;
+  tmp.neg = am.neg = 0;
+  tmp.flags = am.flags = BN_FLG_STATIC_DATA;
+
+/* prepare a^0 in Montgomery domain */
+/* by Shay Gueron's suggestion */
+  if (m->d[top - 1] & (((BN_ULONG)1) << (BN_BITS2 - 1))) {
+    /* 2^(top*BN_BITS2) - m */
+    tmp.d[0] = (0 - m->d[0]) & BN_MASK2;
+    for (i = 1; i < top; i++)
+      tmp.d[i] = (~m->d[i]) & BN_MASK2;
+    tmp.top = top;
+  } else if (!BN_to_montgomery(&tmp, BN_value_one(), mont, ctx))
+    goto err;
+
+  /* prepare a^1 in Montgomery domain */
+  if (a->neg || BN_ucmp(a, m) >= 0) {
+    if (!BN_mod(&am, a, m, ctx))
+      goto err;
+    if (!BN_to_montgomery(&am, &am, mont, ctx))
+      goto err;
+  } else if (!BN_to_montgomery(&am, a, mont, ctx))
+    goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+  /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+   * specifically optimization of cache-timing attack countermeasures
+   * and pre-computation optimization. */
+
+  /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+   * 512-bit RSA is hardly relevant, we omit it to spare size... */
+  if (window == 5) {
+    void bn_mul_mont_gather5(BN_ULONG * rp, const BN_ULONG * ap,
+                             const void * table, const BN_ULONG * np,
+                             const BN_ULONG * n0, int num, int power);
+    void bn_scatter5(const BN_ULONG * inp, size_t num, void * table,
+                     size_t power);
+    void bn_gather5(BN_ULONG * out, size_t num, void * table, size_t power);
+    void bn_power5(BN_ULONG * rp, const BN_ULONG * ap, const void * table,
+                   const BN_ULONG * np, const BN_ULONG * n0, int num,
+                   int power);
+    int bn_get_bits5(const BN_ULONG * ap, int off);
+    int bn_from_montgomery(BN_ULONG * rp, const BN_ULONG * ap,
+                           const BN_ULONG * not_used, const BN_ULONG * np,
+                           const BN_ULONG * n0, int num);
+
+    BN_ULONG *np = mont->N.d, *n0 = mont->n0, *np2;
+
+    /* BN_to_montgomery can contaminate words above .top
+     * [in BN_DEBUG[_DEBUG] build]... */
+    for (i = am.top; i < top; i++)
+      am.d[i] = 0;
+    for (i = tmp.top; i < top; i++)
+      tmp.d[i] = 0;
+
+    if (top & 7)
+      np2 = np;
+    else
+      for (np2 = am.d + top, i = 0; i < top; i++)
+        np2[2 * i] = np[i];
+
+    bn_scatter5(tmp.d, top, powerbuf, 0);
+    bn_scatter5(am.d, am.top, powerbuf, 1);
+    bn_mul_mont(tmp.d, am.d, am.d, np, n0, top);
+    bn_scatter5(tmp.d, top, powerbuf, 2);
+
+    /* same as above, but uses squaring for 1/2 of operations */
+    for (i = 4; i < 32; i *= 2) {
+      bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+      bn_scatter5(tmp.d, top, powerbuf, i);
+    }
+    for (i = 3; i < 8; i += 2) {
+      int j;
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_scatter5(tmp.d, top, powerbuf, i);
+      for (j = 2 * i; j < 32; j *= 2) {
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_scatter5(tmp.d, top, powerbuf, j);
+      }
+    }
+    for (; i < 16; i += 2) {
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_scatter5(tmp.d, top, powerbuf, i);
+      bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+      bn_scatter5(tmp.d, top, powerbuf, 2 * i);
+    }
+    for (; i < 32; i += 2) {
+      bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np2, n0, top, i - 1);
+      bn_scatter5(tmp.d, top, powerbuf, i);
+    }
+
+    bits--;
+    for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--)
+      wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+    bn_gather5(tmp.d, top, powerbuf, wvalue);
+
+    /* Scan the exponent one window at a time starting from the most
+     * significant bits.
+     */
+    if (top & 7)
+      while (bits >= 0) {
+        for (wvalue = 0, i = 0; i < 5; i++, bits--)
+          wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top);
+        bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue);
+      }
+    else {
+      while (bits >= 0) {
+        wvalue = bn_get_bits5(p->d, bits - 4);
+        bits -= 5;
+        bn_power5(tmp.d, tmp.d, powerbuf, np2, n0, top, wvalue);
+      }
+    }
+
+    ret = bn_from_montgomery(tmp.d, tmp.d, NULL, np2, n0, top);
+    tmp.top = top;
+    bn_correct_top(&tmp);
+    if (ret) {
+      if (!BN_copy(rr, &tmp))
+        ret = 0;
+      goto err; /* non-zero ret means it's not an error */
+    }
+  } else
+#endif
+  {
+    if (!copy_to_prebuf(&tmp, top, powerbuf, 0, numPowers))
+      goto err;
+    if (!copy_to_prebuf(&am, top, powerbuf, 1, numPowers))
+      goto err;
+
+    /* If the window size is greater than 1, then calculate
+     * val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1)
+     * (even powers could instead be computed as (a^(i/2))^2
+     * to use the slight performance advantage of sqr over mul).
+     */
+    if (window > 1) {
+      if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx))
+        goto err;
+      if (!copy_to_prebuf(&tmp, top, powerbuf, 2, numPowers))
+        goto err;
+      for (i = 3; i < numPowers; i++) {
+        /* Calculate a^i = a^(i-1) * a */
+        if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx))
+          goto err;
+        if (!copy_to_prebuf(&tmp, top, powerbuf, i, numPowers))
+          goto err;
+      }
+    }
+
+    bits--;
+    for (wvalue = 0, i = bits % window; i >= 0; i--, bits--)
+      wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+    if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, numPowers))
+      goto err;
+
+    /* Scan the exponent one window at a time starting from the most
+     * significant bits.
+     */
+    while (bits >= 0) {
+      wvalue = 0; /* The 'value' of the window */
+
+      /* Scan the window, squaring the result as we go */
+      for (i = 0; i < window; i++, bits--) {
+        if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx))
+          goto err;
+        wvalue = (wvalue << 1) + BN_is_bit_set(p, bits);
+      }
+
+      /* Fetch the appropriate pre-computed value from the pre-buf */
+      if (!copy_from_prebuf(&am, top, powerbuf, wvalue, numPowers))
+        goto err;
+
+      /* Multiply the result into the intermediate result */
+      if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx))
+        goto err;
+    }
+  }
+
+  /* Convert the final result from montgomery to standard format */
+  if (!BN_from_montgomery(rr, &tmp, mont, ctx))
+    goto err;
+  ret = 1;
+err:
+  if ((in_mont == NULL) && (mont != NULL))
+    BN_MONT_CTX_free(mont);
+  if (powerbuf != NULL) {
+    OPENSSL_cleanse(powerbuf, powerbufLen);
+    if (powerbufFree)
+      OPENSSL_free(powerbufFree);
+  }
+  BN_CTX_end(ctx);
+  return (ret);
+}
+
+int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
+                         const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+  BN_MONT_CTX *mont = NULL;
+  int b, bits, ret = 0;
+  int r_is_one;
+  BN_ULONG w, next_w;
+  BIGNUM *d, *r, *t;
+  BIGNUM *swap_tmp;
+#define BN_MOD_MUL_WORD(r, w, m)   \
+  (BN_mul_word(r, (w)) &&          \
+   (/* BN_ucmp(r, (m)) < 0 ? 1 :*/ \
+    (BN_mod(t, r, m, ctx) && (swap_tmp = r, r = t, t = swap_tmp, 1))))
+  /* BN_MOD_MUL_WORD is only used with 'w' large, so the BN_ucmp test is
+   * probably more overhead than always using BN_mod (which uses BN_copy if a
+   * similar test returns true). We can use BN_mod and do not need BN_nnmod
+   * because our accumulator is never negative (the result of BN_mod does not
+   * depend on the sign of the modulus). */
+#define BN_TO_MONTGOMERY_WORD(r, w, mont) \
+  (BN_set_word(r, (w)) && BN_to_montgomery(r, r, (mont), ctx))
+
+  if (BN_get_flags(p, BN_FLG_CONSTTIME) != 0) {
+    /* BN_FLG_CONSTTIME only supported by BN_mod_exp_mont() */
+    OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_word,
+        ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED);
+    return 0;
+  }
+
+  if (!BN_is_odd(m)) {
+    OPENSSL_PUT_ERROR(BN, BN_mod_exp_mont_word, BN_R_CALLED_WITH_EVEN_MODULUS);
+    return 0;
+  }
+
+  if (m->top == 1) {
+    a %= m->d[0]; /* make sure that 'a' is reduced */
+  }
+
+  bits = BN_num_bits(p);
+  if (bits == 0) {
+    ret = BN_one(rr);
+    return ret;
+  }
+  if (a == 0) {
+    BN_zero(rr);
+    ret = 1;
+    return ret;
+  }
+
+  BN_CTX_start(ctx);
+  d = BN_CTX_get(ctx);
+  r = BN_CTX_get(ctx);
+  t = BN_CTX_get(ctx);
+  if (d == NULL || r == NULL || t == NULL) {
+    goto err;
+  }
+
+  if (in_mont != NULL)
+    mont = in_mont;
+  else {
+    if ((mont = BN_MONT_CTX_new()) == NULL) {
+      goto err;
+    }
+    if (!BN_MONT_CTX_set(mont, m, ctx)) {
+      goto err;
+    }
+  }
+
+  r_is_one = 1; /* except for Montgomery factor */
+
+  /* bits-1 >= 0 */
+
+  /* The result is accumulated in the product r*w. */
+  w = a; /* bit 'bits-1' of 'p' is always set */
+  for (b = bits - 2; b >= 0; b--) {
+    /* First, square r*w. */
+    next_w = w * w;
+    if ((next_w / w) != w) {
+      /* overflow */
+      if (r_is_one) {
+        if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+          goto err;
+        }
+        r_is_one = 0;
+      } else {
+        if (!BN_MOD_MUL_WORD(r, w, m)) {
+          goto err;
+        }
+      }
+      next_w = 1;
+    }
+
+    w = next_w;
+    if (!r_is_one) {
+      if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+        goto err;
+      }
+    }
+
+    /* Second, multiply r*w by 'a' if exponent bit is set. */
+    if (BN_is_bit_set(p, b)) {
+      next_w = w * a;
+      if ((next_w / a) != w) {
+        /* overflow */
+        if (r_is_one) {
+          if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+            goto err;
+          }
+          r_is_one = 0;
+        } else {
+          if (!BN_MOD_MUL_WORD(r, w, m)) {
+            goto err;
+          }
+        }
+        next_w = a;
+      }
+      w = next_w;
+    }
+  }
+
+  /* Finally, set r:=r*w. */
+  if (w != 1) {
+    if (r_is_one) {
+      if (!BN_TO_MONTGOMERY_WORD(r, w, mont)) {
+        goto err;
+      }
+      r_is_one = 0;
+    } else {
+      if (!BN_MOD_MUL_WORD(r, w, m)) {
+        goto err;
+      }
+    }
+  }
+
+  if (r_is_one) {
+    /* can happen only if a == 1 */
+    if (!BN_one(rr)) {
+      goto err;
+    }
+  } else {
+    if (!BN_from_montgomery(rr, r, mont, ctx)) {
+      goto err;
+    }
+  }
+  ret = 1;
+
+err:
+  if (in_mont == NULL && mont != NULL) {
+    BN_MONT_CTX_free(mont);
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+#define TABLE_SIZE 32
+
+int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1,
+                     const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m,
+                     BN_CTX *ctx, BN_MONT_CTX *in_mont) {
+  int i, j, bits, b, bits1, bits2, ret = 0, wpos1, wpos2, window1, window2,
+                                   wvalue1, wvalue2;
+  int r_is_one = 1;
+  BIGNUM *d, *r;
+  const BIGNUM *a_mod_m;
+  /* Tables of variables obtained from 'ctx' */
+  BIGNUM *val1[TABLE_SIZE], *val2[TABLE_SIZE];
+  BN_MONT_CTX *mont = NULL;
+
+  if (!(m->d[0] & 1)) {
+    OPENSSL_PUT_ERROR(BN, BN_mod_exp2_mont, BN_R_CALLED_WITH_EVEN_MODULUS);
+    return 0;
+  }
+  bits1 = BN_num_bits(p1);
+  bits2 = BN_num_bits(p2);
+  if (bits1 == 0 && bits2 == 0) {
+    ret = BN_one(rr);
+    return ret;
+  }
+
+  bits = (bits1 > bits2) ? bits1 : bits2;
+
+  BN_CTX_start(ctx);
+  d = BN_CTX_get(ctx);
+  r = BN_CTX_get(ctx);
+  val1[0] = BN_CTX_get(ctx);
+  val2[0] = BN_CTX_get(ctx);
+  if (!d || !r || !val1[0] || !val2[0]) {
+    goto err;
+  }
+
+  if (in_mont != NULL) {
+    mont = in_mont;
+  } else {
+    mont = BN_MONT_CTX_new();
+    if (mont == NULL) {
+      goto err;
+    }
+    if (!BN_MONT_CTX_set(mont, m, ctx)) {
+      goto err;
+    }
+  }
+
+  window1 = BN_window_bits_for_exponent_size(bits1);
+  window2 = BN_window_bits_for_exponent_size(bits2);
+
+  /* Build table for a1:   val1[i] := a1^(2*i + 1) mod m  for i = 0 ..
+   * 2^(window1-1) - 1 */
+  if (a1->neg || BN_ucmp(a1, m) >= 0) {
+    if (!BN_mod(val1[0], a1, m, ctx)) {
+      goto err;
+    }
+    a_mod_m = val1[0];
+  } else {
+    a_mod_m = a1;
+  }
+
+  if (BN_is_zero(a_mod_m)) {
+    BN_zero(rr);
+    ret = 1;
+    goto err;
+  }
+
+  if (!BN_to_montgomery(val1[0], a_mod_m, mont, ctx)) {
+    goto err;
+  }
+
+  if (window1 > 1) {
+    if (!BN_mod_mul_montgomery(d, val1[0], val1[0], mont, ctx)) {
+      goto err;
+    }
+
+    j = 1 << (window1 - 1);
+    for (i = 1; i < j; i++) {
+      if (((val1[i] = BN_CTX_get(ctx)) == NULL) ||
+          !BN_mod_mul_montgomery(val1[i], val1[i - 1], d, mont, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  /* Build table for a2:   val2[i] := a2^(2*i + 1) mod m  for i = 0 ..
+   * 2^(window2-1) - 1 */
+  if (a2->neg || BN_ucmp(a2, m) >= 0) {
+    if (!BN_mod(val2[0], a2, m, ctx)) {
+      goto err;
+    }
+    a_mod_m = val2[0];
+  } else {
+    a_mod_m = a2;
+  }
+
+  if (BN_is_zero(a_mod_m)) {
+    BN_zero(rr);
+    ret = 1;
+    goto err;
+  }
+
+  if (!BN_to_montgomery(val2[0], a_mod_m, mont, ctx)) {
+    goto err;
+  }
+
+  if (window2 > 1) {
+    if (!BN_mod_mul_montgomery(d, val2[0], val2[0], mont, ctx)) {
+      goto err;
+    }
+
+    j = 1 << (window2 - 1);
+    for (i = 1; i < j; i++) {
+      if (((val2[i] = BN_CTX_get(ctx)) == NULL) ||
+          !BN_mod_mul_montgomery(val2[i], val2[i - 1], d, mont, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  /* Now compute the power product, using independent windows. */
+  r_is_one = 1;
+  wvalue1 = 0; /* The 'value' of the first window */
+  wvalue2 = 0; /* The 'value' of the second window */
+  wpos1 = 0;   /* If wvalue1 > 0, the bottom bit of the first window */
+  wpos2 = 0;   /* If wvalue2 > 0, the bottom bit of the second window */
+
+  if (!BN_to_montgomery(r, BN_value_one(), mont, ctx)) {
+    goto err;
+  }
+
+  for (b = bits - 1; b >= 0; b--) {
+    if (!r_is_one) {
+      if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) {
+        goto err;
+      }
+    }
+
+    if (!wvalue1 && BN_is_bit_set(p1, b)) {
+      /* consider bits b-window1+1 .. b for this window */
+      i = b - window1 + 1;
+      while (!BN_is_bit_set(p1, i)) /* works for i<0 */
+        i++;
+      wpos1 = i;
+      wvalue1 = 1;
+      for (i = b - 1; i >= wpos1; i--) {
+        wvalue1 <<= 1;
+        if (BN_is_bit_set(p1, i))
+          wvalue1++;
+      }
+    }
+
+    if (!wvalue2 && BN_is_bit_set(p2, b)) {
+      /* consider bits b-window2+1 .. b for this window */
+      i = b - window2 + 1;
+      while (!BN_is_bit_set(p2, i))
+        i++;
+      wpos2 = i;
+      wvalue2 = 1;
+      for (i = b - 1; i >= wpos2; i--) {
+        wvalue2 <<= 1;
+        if (BN_is_bit_set(p2, i))
+          wvalue2++;
+      }
+    }
+
+    if (wvalue1 && b == wpos1) {
+      /* wvalue1 is odd and < 2^window1 */
+      if (!BN_mod_mul_montgomery(r, r, val1[wvalue1 >> 1], mont, ctx)) {
+        goto err;
+      }
+      wvalue1 = 0;
+      r_is_one = 0;
+    }
+
+    if (wvalue2 && b == wpos2) {
+      /* wvalue2 is odd and < 2^window2 */
+      if (!BN_mod_mul_montgomery(r, r, val2[wvalue2 >> 1], mont, ctx)) {
+        goto err;
+      }
+      wvalue2 = 0;
+      r_is_one = 0;
+    }
+  }
+
+  if (!BN_from_montgomery(rr, r, mont, ctx)) {
+    goto err;
+  }
+  ret = 1;
+
+err:
+  if (in_mont == NULL && mont != NULL) {
+    BN_MONT_CTX_free(mont);
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/bn/gcd.c b/crypto/bn/gcd.c
new file mode 100644
index 0000000..2dce296
--- /dev/null
+++ b/crypto/bn/gcd.c
@@ -0,0 +1,704 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+
+#include "internal.h"
+
+static BIGNUM *euclid(BIGNUM *a, BIGNUM *b) {
+  BIGNUM *t;
+  int shifts = 0;
+
+  /* 0 <= b <= a */
+  while (!BN_is_zero(b)) {
+    /* 0 < b <= a */
+
+    if (BN_is_odd(a)) {
+      if (BN_is_odd(b)) {
+        if (!BN_sub(a, a, b)) {
+          goto err;
+        }
+        if (!BN_rshift1(a, a)) {
+          goto err;
+        }
+        if (BN_cmp(a, b) < 0) {
+          t = a;
+          a = b;
+          b = t;
+        }
+      } else {
+        /* a odd - b even */
+        if (!BN_rshift1(b, b)) {
+          goto err;
+        }
+        if (BN_cmp(a, b) < 0) {
+          t = a;
+          a = b;
+          b = t;
+        }
+      }
+    } else {
+      /* a is even */
+      if (BN_is_odd(b)) {
+        if (!BN_rshift1(a, a)) {
+          goto err;
+        }
+        if (BN_cmp(a, b) < 0) {
+          t = a;
+          a = b;
+          b = t;
+        }
+      } else {
+        /* a even - b even */
+        if (!BN_rshift1(a, a)) {
+          goto err;
+        }
+        if (!BN_rshift1(b, b)) {
+          goto err;
+        }
+        shifts++;
+      }
+    }
+    /* 0 <= b <= a */
+  }
+
+  if (shifts) {
+    if (!BN_lshift(a, a, shifts)) {
+      goto err;
+    }
+  }
+
+  return a;
+
+err:
+  return NULL;
+}
+
+int BN_gcd(BIGNUM *r, const BIGNUM *in_a, const BIGNUM *in_b, BN_CTX *ctx) {
+  BIGNUM *a, *b, *t;
+  int ret = 0;
+
+  BN_CTX_start(ctx);
+  a = BN_CTX_get(ctx);
+  b = BN_CTX_get(ctx);
+
+  if (a == NULL || b == NULL) {
+    goto err;
+  }
+  if (BN_copy(a, in_a) == NULL) {
+    goto err;
+  }
+  if (BN_copy(b, in_b) == NULL) {
+    goto err;
+  }
+
+  a->neg = 0;
+  b->neg = 0;
+
+  if (BN_cmp(a, b) < 0) {
+    t = a;
+    a = b;
+    b = t;
+  }
+  t = euclid(a, b);
+  if (t == NULL) {
+    goto err;
+  }
+
+  if (BN_copy(r, t) == NULL) {
+    goto err;
+  }
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
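+
+/* Illustrative sketch (not used by the library): computing a gcd with the
+ * function above. The word values are arbitrary example inputs. */
+#if 0
+static int gcd_example(void) {
+  int ok = 0;
+  BN_CTX *ctx = BN_CTX_new();
+  BIGNUM *a = BN_new();
+  BIGNUM *b = BN_new();
+  BIGNUM *g = BN_new();
+  if (ctx == NULL || a == NULL || b == NULL || g == NULL ||
+      !BN_set_word(a, 54) ||
+      !BN_set_word(b, 24) ||
+      !BN_gcd(g, a, b, ctx)) {
+    goto done;
+  }
+  ok = BN_is_word(g, 6); /* gcd(54, 24) == 6 */
+done:
+  BN_free(g);
+  BN_free(b);
+  BN_free(a);
+  BN_CTX_free(ctx);
+  return ok;
+}
+#endif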
+
+/* solves ax == 1 (mod n) */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *out, const BIGNUM *a,
+                                        const BIGNUM *n, BN_CTX *ctx);
+
+BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n,
+                       BN_CTX *ctx) {
+  BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+  BIGNUM *ret = NULL;
+  int sign;
+
+  if ((a->flags & BN_FLG_CONSTTIME) != 0 ||
+      (n->flags & BN_FLG_CONSTTIME) != 0) {
+    return BN_mod_inverse_no_branch(out, a, n, ctx);
+  }
+
+  BN_CTX_start(ctx);
+  A = BN_CTX_get(ctx);
+  B = BN_CTX_get(ctx);
+  X = BN_CTX_get(ctx);
+  D = BN_CTX_get(ctx);
+  M = BN_CTX_get(ctx);
+  Y = BN_CTX_get(ctx);
+  T = BN_CTX_get(ctx);
+  if (T == NULL) {
+    goto err;
+  }
+
+  if (out == NULL) {
+    R = BN_new();
+  } else {
+    R = out;
+  }
+  if (R == NULL) {
+    goto err;
+  }
+
+  BN_one(X);
+  BN_zero(Y);
+  if (BN_copy(B, a) == NULL) {
+    goto err;
+  }
+  if (BN_copy(A, n) == NULL) {
+    goto err;
+  }
+  A->neg = 0;
+  if (B->neg || (BN_ucmp(B, A) >= 0)) {
+    if (!BN_nnmod(B, B, A, ctx)) {
+      goto err;
+    }
+  }
+  sign = -1;
+  /* From  B = a mod |n|,  A = |n|  it follows that
+   *
+   *      0 <= B < A,
+   *     -sign*X*a  ==  B   (mod |n|),
+   *      sign*Y*a  ==  A   (mod |n|).
+   */
+
+  if (BN_is_odd(n) && (BN_num_bits(n) <= (BN_BITS <= 32 ? 450 : 2048))) {
+    /* Binary inversion algorithm; requires odd modulus.
+     * This is faster than the general algorithm if the modulus
+     * is sufficiently small (about 400 .. 500 bits on 32-bit
+     * systems, but much more on 64-bit systems) */
+    int shift;
+
+    while (!BN_is_zero(B)) {
+      /*      0 < B < |n|,
+       *      0 < A <= |n|,
+       * (1) -sign*X*a  ==  B   (mod |n|),
+       * (2)  sign*Y*a  ==  A   (mod |n|) */
+
+      /* Now divide  B  by the maximum possible power of two in the integers,
+       * and divide  X  by the same value mod |n|.
+       * When we're done, (1) still holds. */
+      shift = 0;
+      while (!BN_is_bit_set(B, shift)) {
+        /* note that 0 < B */
+        shift++;
+
+        if (BN_is_odd(X)) {
+          if (!BN_uadd(X, X, n)) {
+            goto err;
+          }
+        }
+        /* now X is even, so we can easily divide it by two */
+        if (!BN_rshift1(X, X)) {
+          goto err;
+        }
+      }
+      if (shift > 0) {
+        if (!BN_rshift(B, B, shift)) {
+          goto err;
+        }
+      }
+
+      /* Same for A and Y. Afterwards, (2) still holds. */
+      shift = 0;
+      while (!BN_is_bit_set(A, shift)) {
+        /* note that 0 < A */
+        shift++;
+
+        if (BN_is_odd(Y)) {
+          if (!BN_uadd(Y, Y, n)) {
+            goto err;
+          }
+        }
+        /* now Y is even */
+        if (!BN_rshift1(Y, Y)) {
+          goto err;
+        }
+      }
+      if (shift > 0) {
+        if (!BN_rshift(A, A, shift)) {
+          goto err;
+        }
+      }
+
+      /* We still have (1) and (2).
+       * Both  A  and  B  are odd.
+       * The following computations ensure that
+       *
+       *     0 <= B < |n|,
+       *      0 < A < |n|,
+       * (1) -sign*X*a  ==  B   (mod |n|),
+       * (2)  sign*Y*a  ==  A   (mod |n|),
+       *
+       * and that either  A  or  B  is even in the next iteration. */
+      if (BN_ucmp(B, A) >= 0) {
+        /* -sign*(X + Y)*a == B - A  (mod |n|) */
+        if (!BN_uadd(X, X, Y)) {
+          goto err;
+        }
+        /* NB: we could use BN_mod_add_quick(X, X, Y, n), but that
+         * actually makes the algorithm slower */
+        if (!BN_usub(B, B, A)) {
+          goto err;
+        }
+      } else {
+        /*  sign*(X + Y)*a == A - B  (mod |n|) */
+        if (!BN_uadd(Y, Y, X)) {
+          goto err;
+        }
+        /* as above, BN_mod_add_quick(Y, Y, X, n) would slow things down */
+        if (!BN_usub(A, A, B)) {
+          goto err;
+        }
+      }
+    }
+  } else {
+    /* general inversion algorithm */
+
+    while (!BN_is_zero(B)) {
+      BIGNUM *tmp;
+
+      /*
+       *      0 < B < A,
+       * (*) -sign*X*a  ==  B   (mod |n|),
+       *      sign*Y*a  ==  A   (mod |n|) */
+
+      /* (D, M) := (A/B, A%B) ... */
+      if (BN_num_bits(A) == BN_num_bits(B)) {
+        if (!BN_one(D)) {
+          goto err;
+        }
+        if (!BN_sub(M, A, B)) {
+          goto err;
+        }
+      } else if (BN_num_bits(A) == BN_num_bits(B) + 1) {
+        /* A/B is 1, 2, or 3 */
+        if (!BN_lshift1(T, B)) {
+          goto err;
+        }
+        if (BN_ucmp(A, T) < 0) {
+          /* A < 2*B, so D=1 */
+          if (!BN_one(D)) {
+            goto err;
+          }
+          if (!BN_sub(M, A, B)) {
+            goto err;
+          }
+        } else {
+          /* A >= 2*B, so D=2 or D=3 */
+          if (!BN_sub(M, A, T)) {
+            goto err;
+          }
+          if (!BN_add(D, T, B)) {
+            goto err; /* use D (:= 3*B) as temp */
+          }
+          if (BN_ucmp(A, D) < 0) {
+            /* A < 3*B, so D=2 */
+            if (!BN_set_word(D, 2)) {
+              goto err;
+            }
+            /* M (= A - 2*B) already has the correct value */
+          } else {
+            /* only D=3 remains */
+            if (!BN_set_word(D, 3)) {
+              goto err;
+            }
+            /* currently  M = A - 2*B,  but we need  M = A - 3*B */
+            if (!BN_sub(M, M, B)) {
+              goto err;
+            }
+          }
+        }
+      } else {
+        if (!BN_div(D, M, A, B, ctx)) {
+          goto err;
+        }
+      }
+
+      /* Now
+       *      A = D*B + M;
+       * thus we have
+       * (**)  sign*Y*a  ==  D*B + M   (mod |n|). */
+
+      tmp = A; /* keep the BIGNUM object, the value does not matter */
+
+      /* (A, B) := (B, A mod B) ... */
+      A = B;
+      B = M;
+      /* ... so we have  0 <= B < A  again */
+
+      /* Since the former  M  is now  B  and the former  B  is now  A,
+       * (**) translates into
+       *       sign*Y*a  ==  D*A + B    (mod |n|),
+       * i.e.
+       *       sign*Y*a - D*A  ==  B    (mod |n|).
+       * Similarly, (*) translates into
+       *      -sign*X*a  ==  A          (mod |n|).
+       *
+       * Thus,
+       *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+       * i.e.
+       *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+       *
+       * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+       *      -sign*X*a  ==  B   (mod |n|),
+       *       sign*Y*a  ==  A   (mod |n|).
+       * Note that  X  and  Y  stay non-negative all the time. */
+
+      /* most of the time D is very small, so we can optimize tmp := D*X+Y */
+      if (BN_is_one(D)) {
+        if (!BN_add(tmp, X, Y)) {
+          goto err;
+        }
+      } else {
+        if (BN_is_word(D, 2)) {
+          if (!BN_lshift1(tmp, X)) {
+            goto err;
+          }
+        } else if (BN_is_word(D, 4)) {
+          if (!BN_lshift(tmp, X, 2)) {
+            goto err;
+          }
+        } else if (D->top == 1) {
+          if (!BN_copy(tmp, X)) {
+            goto err;
+          }
+          if (!BN_mul_word(tmp, D->d[0])) {
+            goto err;
+          }
+        } else {
+          if (!BN_mul(tmp, D, X, ctx)) {
+            goto err;
+          }
+        }
+        if (!BN_add(tmp, tmp, Y)) {
+          goto err;
+        }
+      }
+
+      M = Y; /* keep the BIGNUM object, the value does not matter */
+      Y = X;
+      X = tmp;
+      sign = -sign;
+    }
+  }
+
+  /* The while loop (Euclid's algorithm) ends when
+   *      A == gcd(a,n);
+   * we have
+   *       sign*Y*a  ==  A  (mod |n|),
+   * where  Y  is non-negative. */
+
+  if (sign < 0) {
+    if (!BN_sub(Y, n, Y)) {
+      goto err;
+    }
+  }
+  /* Now  Y*a  ==  A  (mod |n|).  */
+
+  if (BN_is_one(A)) {
+    /* Y*a == 1  (mod |n|) */
+    if (!Y->neg && BN_ucmp(Y, n) < 0) {
+      if (!BN_copy(R, Y)) {
+        goto err;
+      }
+    } else {
+      if (!BN_nnmod(R, Y, n, ctx)) {
+        goto err;
+      }
+    }
+  } else {
+    OPENSSL_PUT_ERROR(BN, BN_mod_inverse, BN_R_NO_INVERSE);
+    goto err;
+  }
+  ret = R;
+
+err:
+  if (ret == NULL && out == NULL) {
+    BN_free(R);
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}
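+
+/* Illustrative sketch (not used by the library): computing a modular inverse
+ * with the function above. Since 3 * 4 == 12 == 1 (mod 11), the inverse of 3
+ * modulo 11 is 4. */
+#if 0
+static int mod_inverse_example(void) {
+  int ok = 0;
+  BN_CTX *ctx = BN_CTX_new();
+  BIGNUM *a = BN_new();
+  BIGNUM *n = BN_new();
+  BIGNUM *inv = NULL;
+  if (ctx == NULL || a == NULL || n == NULL ||
+      !BN_set_word(a, 3) ||
+      !BN_set_word(n, 11)) {
+    goto done;
+  }
+  inv = BN_mod_inverse(NULL, a, n, ctx); /* out == NULL allocates the result */
+  ok = inv != NULL && BN_is_word(inv, 4);
+done:
+  BN_free(inv);
+  BN_free(n);
+  BN_free(a);
+  BN_CTX_free(ctx);
+  return ok;
+}
+#endif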
+
+/* BN_mod_inverse_no_branch is a special version of BN_mod_inverse.
+ * It does not contain branches that may leak sensitive information. */
+static BIGNUM *BN_mod_inverse_no_branch(BIGNUM *out, const BIGNUM *a,
+                                        const BIGNUM *n, BN_CTX *ctx) {
+  BIGNUM *A, *B, *X, *Y, *M, *D, *T, *R = NULL;
+  BIGNUM local_A, local_B;
+  BIGNUM *pA, *pB;
+  BIGNUM *ret = NULL;
+  int sign;
+
+  BN_CTX_start(ctx);
+  A = BN_CTX_get(ctx);
+  B = BN_CTX_get(ctx);
+  X = BN_CTX_get(ctx);
+  D = BN_CTX_get(ctx);
+  M = BN_CTX_get(ctx);
+  Y = BN_CTX_get(ctx);
+  T = BN_CTX_get(ctx);
+  if (T == NULL) {
+    goto err;
+  }
+
+  if (out == NULL) {
+    R = BN_new();
+  } else {
+    R = out;
+  }
+  if (R == NULL) {
+    goto err;
+  }
+
+  BN_one(X);
+  BN_zero(Y);
+  if (BN_copy(B, a) == NULL) {
+    goto err;
+  }
+  if (BN_copy(A, n) == NULL) {
+    goto err;
+  }
+  A->neg = 0;
+
+  if (B->neg || (BN_ucmp(B, A) >= 0)) {
+    /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+     * BN_div_no_branch will be called eventually.
+     */
+    pB = &local_B;
+    BN_with_flags(pB, B, BN_FLG_CONSTTIME);
+    if (!BN_nnmod(B, pB, A, ctx))
+      goto err;
+  }
+  sign = -1;
+  /* From  B = a mod |n|,  A = |n|  it follows that
+   *
+   *      0 <= B < A,
+   *     -sign*X*a  ==  B   (mod |n|),
+   *      sign*Y*a  ==  A   (mod |n|).
+   */
+
+  while (!BN_is_zero(B)) {
+    BIGNUM *tmp;
+
+    /*
+     *      0 < B < A,
+     * (*) -sign*X*a  ==  B   (mod |n|),
+     *      sign*Y*a  ==  A   (mod |n|)
+     */
+
+    /* Turn BN_FLG_CONSTTIME flag on, so that when BN_div is invoked,
+     * BN_div_no_branch will be called eventually.
+     */
+    pA = &local_A;
+    BN_with_flags(pA, A, BN_FLG_CONSTTIME);
+
+    /* (D, M) := (A/B, A%B) ... */
+    if (!BN_div(D, M, pA, B, ctx)) {
+      goto err;
+    }
+
+    /* Now
+     *      A = D*B + M;
+     * thus we have
+     * (**)  sign*Y*a  ==  D*B + M   (mod |n|).
+     */
+
+    tmp = A; /* keep the BIGNUM object, the value does not matter */
+
+    /* (A, B) := (B, A mod B) ... */
+    A = B;
+    B = M;
+    /* ... so we have  0 <= B < A  again */
+
+    /* Since the former  M  is now  B  and the former  B  is now  A,
+     * (**) translates into
+     *       sign*Y*a  ==  D*A + B    (mod |n|),
+     * i.e.
+     *       sign*Y*a - D*A  ==  B    (mod |n|).
+     * Similarly, (*) translates into
+     *      -sign*X*a  ==  A          (mod |n|).
+     *
+     * Thus,
+     *   sign*Y*a + D*sign*X*a  ==  B  (mod |n|),
+     * i.e.
+     *        sign*(Y + D*X)*a  ==  B  (mod |n|).
+     *
+     * So if we set  (X, Y, sign) := (Y + D*X, X, -sign),  we arrive back at
+     *      -sign*X*a  ==  B   (mod |n|),
+     *       sign*Y*a  ==  A   (mod |n|).
+     * Note that  X  and  Y  stay non-negative all the time.
+     */
+
+    if (!BN_mul(tmp, D, X, ctx)) {
+      goto err;
+    }
+    if (!BN_add(tmp, tmp, Y)) {
+      goto err;
+    }
+
+    M = Y; /* keep the BIGNUM object, the value does not matter */
+    Y = X;
+    X = tmp;
+    sign = -sign;
+  }
+
+  /*
+   * The while loop (Euclid's algorithm) ends when
+   *      A == gcd(a,n);
+   * we have
+   *       sign*Y*a  ==  A  (mod |n|),
+   * where  Y  is non-negative.
+   */
+
+  if (sign < 0) {
+    if (!BN_sub(Y, n, Y)) {
+      goto err;
+    }
+  }
+  /* Now  Y*a  ==  A  (mod |n|).  */
+
+  if (BN_is_one(A)) {
+    /* Y*a == 1  (mod |n|) */
+    if (!Y->neg && BN_ucmp(Y, n) < 0) {
+      if (!BN_copy(R, Y)) {
+        goto err;
+      }
+    } else {
+      if (!BN_nnmod(R, Y, n, ctx)) {
+        goto err;
+      }
+    }
+  } else {
+    OPENSSL_PUT_ERROR(BN, BN_mod_inverse_no_branch, BN_R_NO_INVERSE);
+    goto err;
+  }
+  ret = R;
+
+err:
+  if (ret == NULL && out == NULL) {
+    BN_free(R);
+  }
+
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/bn/generic.c b/crypto/bn/generic.c
new file mode 100644
index 0000000..b745750
--- /dev/null
+++ b/crypto/bn/generic.c
@@ -0,0 +1,1189 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+
+#include "internal.h"
+
+
+#if defined(OPENSSL_WINDOWS) || defined(OPENSSL_NO_ASM) || \
+    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86))
+
+#if defined(OPENSSL_WINDOWS)
+#define alloca _alloca
+#else
+#include <alloca.h>
+#endif
+
+#ifdef BN_LLONG
+#define mul_add(r, a, w, c)             \
+  {                                     \
+    BN_ULLONG t;                        \
+    t = (BN_ULLONG)w * (a) + (r) + (c); \
+    (r) = Lw(t);                        \
+    (c) = Hw(t);                        \
+  }
+
+#define mul(r, a, w, c)           \
+  {                               \
+    BN_ULLONG t;                  \
+    t = (BN_ULLONG)w * (a) + (c); \
+    (r) = Lw(t);                  \
+    (c) = Hw(t);                  \
+  }
+
+#define sqr(r0, r1, a)        \
+  {                           \
+    BN_ULLONG t;              \
+    t = (BN_ULLONG)(a) * (a); \
+    (r0) = Lw(t);             \
+    (r1) = Hw(t);             \
+  }
+
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r, a, w, c)             \
+  {                                     \
+    BN_ULONG high, low, ret, tmp = (a); \
+    ret = (r);                          \
+    BN_UMULT_LOHI(low, high, w, tmp);   \
+    ret += (c);                         \
+    (c) = (ret < (c)) ? 1 : 0;          \
+    (c) += high;                        \
+    ret += low;                         \
+    (c) += (ret < low) ? 1 : 0;         \
+    (r) = ret;                          \
+  }
+
+#define mul(r, a, w, c)                \
+  {                                    \
+    BN_ULONG high, low, ret, ta = (a); \
+    BN_UMULT_LOHI(low, high, w, ta);   \
+    ret = low + (c);                   \
+    (c) = high;                        \
+    (c) += (ret < low) ? 1 : 0;        \
+    (r) = ret;                         \
+  }
+
+#define sqr(r0, r1, a)               \
+  {                                  \
+    BN_ULONG tmp = (a);              \
+    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
+  }
+
+#elif defined(BN_UMULT_HIGH)
+#define mul_add(r, a, w, c)             \
+  {                                     \
+    BN_ULONG high, low, ret, tmp = (a); \
+    ret = (r);                          \
+    high = BN_UMULT_HIGH(w, tmp);       \
+    ret += (c);                         \
+    low = (w) * tmp;                    \
+    (c) = (ret < (c)) ? 1 : 0;          \
+    (c) += high;                        \
+    ret += low;                         \
+    (c) += (ret < low) ? 1 : 0;         \
+    (r) = ret;                          \
+  }
+
+#define mul(r, a, w, c)                \
+  {                                    \
+    BN_ULONG high, low, ret, ta = (a); \
+    low = (w) * ta;                    \
+    high = BN_UMULT_HIGH(w, ta);       \
+    ret = low + (c);                   \
+    (c) = high;                        \
+    (c) += (ret < low) ? 1 : 0;        \
+    (r) = ret;                         \
+  }
+
+#define sqr(r0, r1, a)              \
+  {                                 \
+    BN_ULONG tmp = (a);             \
+    (r0) = tmp * tmp;               \
+    (r1) = BN_UMULT_HIGH(tmp, tmp); \
+  }
+
+#else
+/*************************************************************
+ * No long long type
+ */
+
+#define LBITS(a) ((a) & BN_MASK2l)
+#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
+#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
+
+#define LLBITS(a) ((a) & BN_MASKl)
+#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
+#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
+
+#define mul64(l, h, bl, bh)       \
+  {                               \
+    BN_ULONG m, m1, lt, ht;       \
+                                  \
+    lt = l;                       \
+    ht = h;                       \
+    m = (bh) * (lt);              \
+    lt = (bl) * (lt);             \
+    m1 = (bl) * (ht);             \
+    ht = (bh) * (ht);             \
+    m = (m + m1) & BN_MASK2;      \
+    if (m < m1)                   \
+      ht += L2HBITS((BN_ULONG)1); \
+    ht += HBITS(m);               \
+    m1 = L2HBITS(m);              \
+    lt = (lt + m1) & BN_MASK2;    \
+    if (lt < m1)                  \
+      ht++;                       \
+    (l) = lt;                     \
+    (h) = ht;                     \
+  }
+
+#define sqr64(lo, ho, in)                    \
+  {                                          \
+    BN_ULONG l, h, m;                        \
+                                             \
+    h = (in);                                \
+    l = LBITS(h);                            \
+    h = HBITS(h);                            \
+    m = (l) * (h);                           \
+    l *= l;                                  \
+    h *= h;                                  \
+    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
+    m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
+    l = (l + m) & BN_MASK2;                  \
+    if (l < m)                               \
+      h++;                                   \
+    (lo) = l;                                \
+    (ho) = h;                                \
+  }
+
+#define mul_add(r, a, bl, bh, c) \
+  {                              \
+    BN_ULONG l, h;               \
+                                 \
+    h = (a);                     \
+    l = LBITS(h);                \
+    h = HBITS(h);                \
+    mul64(l, h, (bl), (bh));     \
+                                 \
+    /* non-multiply part */      \
+    l = (l + (c)) & BN_MASK2;    \
+    if (l < (c))                 \
+      h++;                       \
+    (c) = (r);                   \
+    l = (l + (c)) & BN_MASK2;    \
+    if (l < (c))                 \
+      h++;                       \
+    (c) = h & BN_MASK2;          \
+    (r) = l;                     \
+  }
+
+#define mul(r, a, bl, bh, c)  \
+  {                           \
+    BN_ULONG l, h;            \
+                              \
+    h = (a);                  \
+    l = LBITS(h);             \
+    h = HBITS(h);             \
+    mul64(l, h, (bl), (bh));  \
+                              \
+    /* non-multiply part */   \
+    l += (c);                 \
+    if ((l & BN_MASK2) < (c)) \
+      h++;                    \
+    (c) = h & BN_MASK2;       \
+    (r) = l & BN_MASK2;       \
+  }
+#endif /* !BN_LLONG */
+
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w) {
+  BN_ULONG c1 = 0;
+
+  assert(num >= 0);
+  if (num <= 0) {
+    return c1;
+  }
+
+  while (num & ~3) {
+    mul_add(rp[0], ap[0], w, c1);
+    mul_add(rp[1], ap[1], w, c1);
+    mul_add(rp[2], ap[2], w, c1);
+    mul_add(rp[3], ap[3], w, c1);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+
+  while (num) {
+    mul_add(rp[0], ap[0], w, c1);
+    ap++;
+    rp++;
+    num--;
+  }
+
+  return c1;
+}
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+  BN_ULONG c1 = 0;
+
+  assert(num >= 0);
+  if (num <= 0) {
+    return c1;
+  }
+
+  while (num & ~3) {
+    mul(rp[0], ap[0], w, c1);
+    mul(rp[1], ap[1], w, c1);
+    mul(rp[2], ap[2], w, c1);
+    mul(rp[3], ap[3], w, c1);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+  while (num) {
+    mul(rp[0], ap[0], w, c1);
+    ap++;
+    rp++;
+    num--;
+  }
+  return c1;
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+  assert(n >= 0);
+  if (n <= 0) {
+    return;
+  }
+
+  while (n & ~3) {
+    sqr(r[0], r[1], a[0]);
+    sqr(r[2], r[3], a[1]);
+    sqr(r[4], r[5], a[2]);
+    sqr(r[6], r[7], a[3]);
+    a += 4;
+    r += 8;
+    n -= 4;
+  }
+  while (n) {
+    sqr(r[0], r[1], a[0]);
+    a++;
+    r += 2;
+    n--;
+  }
+}
+
+#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
+                          BN_ULONG w) {
+  BN_ULONG c = 0;
+  BN_ULONG bl, bh;
+
+  assert(num >= 0);
+  if (num <= 0) {
+    return (BN_ULONG)0;
+  }
+
+  bl = LBITS(w);
+  bh = HBITS(w);
+
+  while (num & ~3) {
+    mul_add(rp[0], ap[0], bl, bh, c);
+    mul_add(rp[1], ap[1], bl, bh, c);
+    mul_add(rp[2], ap[2], bl, bh, c);
+    mul_add(rp[3], ap[3], bl, bh, c);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+  while (num) {
+    mul_add(rp[0], ap[0], bl, bh, c);
+    ap++;
+    rp++;
+    num--;
+  }
+  return c;
+}
+
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
+  BN_ULONG carry = 0;
+  BN_ULONG bl, bh;
+
+  assert(num >= 0);
+  if (num <= 0) {
+    return (BN_ULONG)0;
+  }
+
+  bl = LBITS(w);
+  bh = HBITS(w);
+
+  while (num & ~3) {
+    mul(rp[0], ap[0], bl, bh, carry);
+    mul(rp[1], ap[1], bl, bh, carry);
+    mul(rp[2], ap[2], bl, bh, carry);
+    mul(rp[3], ap[3], bl, bh, carry);
+    ap += 4;
+    rp += 4;
+    num -= 4;
+  }
+  while (num) {
+    mul(rp[0], ap[0], bl, bh, carry);
+    ap++;
+    rp++;
+    num--;
+  }
+  return carry;
+}
+
+void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
+  assert(n >= 0);
+  if (n <= 0) {
+    return;
+  }
+
+  while (n & ~3) {
+    sqr64(r[0], r[1], a[0]);
+    sqr64(r[2], r[3], a[1]);
+    sqr64(r[4], r[5], a[2]);
+    sqr64(r[6], r[7], a[3]);
+    a += 4;
+    r += 8;
+    n -= 4;
+  }
+  while (n) {
+    sqr64(r[0], r[1], a[0]);
+    a++;
+    r += 2;
+    n--;
+  }
+}
+
+#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
+
+#if defined(BN_LLONG) && defined(BN_DIV2W)
+
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
+}
+
+#else
+
+/* Divide h,l by d and return the result. */
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
+  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
+  int i, count = 2;
+
+  if (d == 0) {
+    return BN_MASK2;
+  }
+
+  i = BN_num_bits_word(d);
+  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
+
+  i = BN_BITS2 - i;
+  if (h >= d) {
+    h -= d;
+  }
+
+  if (i) {
+    d <<= i;
+    h = (h << i) | (l >> (BN_BITS2 - i));
+    l <<= i;
+  }
+  dh = (d & BN_MASK2h) >> BN_BITS4;
+  dl = (d & BN_MASK2l);
+  for (;;) {
+    if ((h >> BN_BITS4) == dh) {
+      q = BN_MASK2l;
+    } else {
+      q = h / dh;
+    }
+
+    th = q * dh;
+    tl = dl * q;
+    for (;;) {
+      t = h - th;
+      if ((t & BN_MASK2h) ||
+          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
+        break;
+      }
+      q--;
+      th -= dh;
+      tl -= dl;
+    }
+    t = (tl >> BN_BITS4);
+    tl = (tl << BN_BITS4) & BN_MASK2h;
+    th += t;
+
+    if (l < tl) {
+      th++;
+    }
+    l -= tl;
+    if (h < th) {
+      h += d;
+      q--;
+    }
+    h -= th;
+
+    if (--count == 0) {
+      break;
+    }
+
+    ret = q << BN_BITS4;
+    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
+    l = (l & BN_MASK2l) << BN_BITS4;
+  }
+
+  ret |= q;
+  return ret;
+}
+
+#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
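+
+/* Illustrative example: bn_div_words divides the double-word value
+ * (h * 2^BN_BITS2 + l) by d, assuming the quotient fits in a single word
+ * (i.e. h < d). For instance, with 32-bit words,
+ * bn_div_words(1, 0, 2) == 0x80000000, since 2^32 / 2 == 2^31. */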
+
+#ifdef BN_LLONG
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      int n) {
+  BN_ULLONG ll = 0;
+
+  assert(n >= 0);
+  if (n <= 0) {
+    return (BN_ULONG)0;
+  }
+
+  while (n & ~3) {
+    ll += (BN_ULLONG)a[0] + b[0];
+    r[0] = (BN_ULONG)ll & BN_MASK2;
+    ll >>= BN_BITS2;
+    ll += (BN_ULLONG)a[1] + b[1];
+    r[1] = (BN_ULONG)ll & BN_MASK2;
+    ll >>= BN_BITS2;
+    ll += (BN_ULLONG)a[2] + b[2];
+    r[2] = (BN_ULONG)ll & BN_MASK2;
+    ll >>= BN_BITS2;
+    ll += (BN_ULLONG)a[3] + b[3];
+    r[3] = (BN_ULONG)ll & BN_MASK2;
+    ll >>= BN_BITS2;
+    a += 4;
+    b += 4;
+    r += 4;
+    n -= 4;
+  }
+  while (n) {
+    ll += (BN_ULLONG)a[0] + b[0];
+    r[0] = (BN_ULONG)ll & BN_MASK2;
+    ll >>= BN_BITS2;
+    a++;
+    b++;
+    r++;
+    n--;
+  }
+  return (BN_ULONG)ll;
+}
+
+#else /* !BN_LLONG */
+
+BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      int n) {
+  BN_ULONG c, l, t;
+
+  assert(n >= 0);
+  if (n <= 0) {
+    return (BN_ULONG)0;
+  }
+
+  c = 0;
+  while (n & ~3) {
+    t = a[0];
+    t = (t + c) & BN_MASK2;
+    c = (t < c);
+    l = (t + b[0]) & BN_MASK2;
+    c += (l < t);
+    r[0] = l;
+    t = a[1];
+    t = (t + c) & BN_MASK2;
+    c = (t < c);
+    l = (t + b[1]) & BN_MASK2;
+    c += (l < t);
+    r[1] = l;
+    t = a[2];
+    t = (t + c) & BN_MASK2;
+    c = (t < c);
+    l = (t + b[2]) & BN_MASK2;
+    c += (l < t);
+    r[2] = l;
+    t = a[3];
+    t = (t + c) & BN_MASK2;
+    c = (t < c);
+    l = (t + b[3]) & BN_MASK2;
+    c += (l < t);
+    r[3] = l;
+    a += 4;
+    b += 4;
+    r += 4;
+    n -= 4;
+  }
+  while (n) {
+    t = a[0];
+    t = (t + c) & BN_MASK2;
+    c = (t < c);
+    l = (t + b[0]) & BN_MASK2;
+    c += (l < t);
+    r[0] = l;
+    a++;
+    b++;
+    r++;
+    n--;
+  }
+  return (BN_ULONG)c;
+}
+
+#endif /* !BN_LLONG */
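+
+/* Illustrative sketch (not used by the library): word-array addition with
+ * bn_add_words. Here a = {BN_MASK2, 1}, i.e. 1 * 2^BN_BITS2 + (2^BN_BITS2 - 1),
+ * and b = {1, 0}, i.e. 1. */
+#if 0
+static void add_words_example(void) {
+  BN_ULONG a[2] = {BN_MASK2, 1};
+  BN_ULONG b[2] = {1, 0};
+  BN_ULONG r[2];
+  BN_ULONG carry = bn_add_words(r, a, b, 2);
+  /* Now r == {0, 2} and carry == 0: the low-word overflow propagated into
+   * the high word. */
+  (void)r;
+  (void)carry;
+}
+#endif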
+
+BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                      int n) {
+  BN_ULONG t1, t2;
+  int c = 0;
+
+  assert(n >= 0);
+  if (n <= 0) {
+    return (BN_ULONG)0;
+  }
+
+  while (n & ~3) {
+    t1 = a[0];
+    t2 = b[0];
+    r[0] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    t1 = a[1];
+    t2 = b[1];
+    r[1] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    t1 = a[2];
+    t2 = b[2];
+    r[2] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    t1 = a[3];
+    t2 = b[3];
+    r[3] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    a += 4;
+    b += 4;
+    r += 4;
+    n -= 4;
+  }
+  while (n) {
+    t1 = a[0];
+    t2 = b[0];
+    r[0] = (t1 - t2 - c) & BN_MASK2;
+    if (t1 != t2)
+      c = (t1 < t2);
+    a++;
+    b++;
+    r++;
+    n--;
+  }
+  return c;
+}
+
+/* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
+/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
+/* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
+/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
+
+#ifdef BN_LLONG
+#define mul_add_c(a, b, c0, c1, c2) \
+  t = (BN_ULLONG)a * b;             \
+  t1 = (BN_ULONG)Lw(t);             \
+  t2 = (BN_ULONG)Hw(t);             \
+  c0 = (c0 + t1) & BN_MASK2;        \
+  if ((c0) < t1)                    \
+    t2++;                           \
+  c1 = (c1 + t2) & BN_MASK2;        \
+  if ((c1) < t2)                    \
+    c2++;
+
+#define mul_add_c2(a, b, c0, c1, c2)           \
+  t = (BN_ULLONG)a * b;                        \
+  tt = (t + t) & BN_MASK;                      \
+  if (tt < t)                                  \
+    c2++;                                      \
+  t1 = (BN_ULONG)Lw(tt);                       \
+  t2 = (BN_ULONG)Hw(tt);                       \
+  c0 = (c0 + t1) & BN_MASK2;                   \
+  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
+    c2++;                                      \
+  c1 = (c1 + t2) & BN_MASK2;                   \
+  if ((c1) < t2)                               \
+    c2++;
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+  t = (BN_ULLONG)a[i] * a[i];       \
+  t1 = (BN_ULONG)Lw(t);             \
+  t2 = (BN_ULONG)Hw(t);             \
+  c0 = (c0 + t1) & BN_MASK2;        \
+  if ((c0) < t1)                    \
+    t2++;                           \
+  c1 = (c1 + t2) & BN_MASK2;        \
+  if ((c1) < t2)                    \
+    c2++;
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
+#elif defined(BN_UMULT_LOHI)
+
+#define mul_add_c(a, b, c0, c1, c2) \
+  {                                 \
+    BN_ULONG ta = (a), tb = (b);    \
+    BN_UMULT_LOHI(t1, t2, ta, tb);  \
+    c0 += t1;                       \
+    t2 += (c0 < t1) ? 1 : 0;        \
+    c1 += t2;                       \
+    c2 += (c1 < t2) ? 1 : 0;        \
+  }
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+  {                                  \
+    BN_ULONG ta = (a), tb = (b), t0; \
+    BN_UMULT_LOHI(t0, t1, ta, tb);   \
+    t2 = t1 + t1;                    \
+    c2 += (t2 < t1) ? 1 : 0;         \
+    t1 = t0 + t0;                    \
+    t2 += (t1 < t0) ? 1 : 0;         \
+    c0 += t1;                        \
+    t2 += (c0 < t1) ? 1 : 0;         \
+    c1 += t2;                        \
+    c2 += (c1 < t2) ? 1 : 0;         \
+  }
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+  {                                 \
+    BN_ULONG ta = (a)[i];           \
+    BN_UMULT_LOHI(t1, t2, ta, ta);  \
+    c0 += t1;                       \
+    t2 += (c0 < t1) ? 1 : 0;        \
+    c1 += t2;                       \
+    c2 += (c1 < t2) ? 1 : 0;        \
+  }
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
+#elif defined(BN_UMULT_HIGH)
+
+#define mul_add_c(a, b, c0, c1, c2) \
+  {                                 \
+    BN_ULONG ta = (a), tb = (b);    \
+    t1 = ta * tb;                   \
+    t2 = BN_UMULT_HIGH(ta, tb);     \
+    c0 += t1;                       \
+    t2 += (c0 < t1) ? 1 : 0;        \
+    c1 += t2;                       \
+    c2 += (c1 < t2) ? 1 : 0;        \
+  }
+
+#define mul_add_c2(a, b, c0, c1, c2) \
+  {                                  \
+    BN_ULONG ta = (a), tb = (b), t0; \
+    t1 = BN_UMULT_HIGH(ta, tb);      \
+    t0 = ta * tb;                    \
+    t2 = t1 + t1;                    \
+    c2 += (t2 < t1) ? 1 : 0;         \
+    t1 = t0 + t0;                    \
+    t2 += (t1 < t0) ? 1 : 0;         \
+    c0 += t1;                        \
+    t2 += (c0 < t1) ? 1 : 0;         \
+    c1 += t2;                        \
+    c2 += (c1 < t2) ? 1 : 0;         \
+  }
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+  {                                 \
+    BN_ULONG ta = (a)[i];           \
+    t1 = ta * ta;                   \
+    t2 = BN_UMULT_HIGH(ta, ta);     \
+    c0 += t1;                       \
+    t2 += (c0 < t1) ? 1 : 0;        \
+    c1 += t2;                       \
+    c2 += (c1 < t2) ? 1 : 0;        \
+  }
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+
+#else /* !BN_LLONG */
+#define mul_add_c(a, b, c0, c1, c2) \
+  t1 = LBITS(a);                    \
+  t2 = HBITS(a);                    \
+  bl = LBITS(b);                    \
+  bh = HBITS(b);                    \
+  mul64(t1, t2, bl, bh);            \
+  c0 = (c0 + t1) & BN_MASK2;        \
+  if ((c0) < t1)                    \
+    t2++;                           \
+  c1 = (c1 + t2) & BN_MASK2;        \
+  if ((c1) < t2)                    \
+    c2++;
+
+#define mul_add_c2(a, b, c0, c1, c2)           \
+  t1 = LBITS(a);                               \
+  t2 = HBITS(a);                               \
+  bl = LBITS(b);                               \
+  bh = HBITS(b);                               \
+  mul64(t1, t2, bl, bh);                       \
+  if (t2 & BN_TBIT)                            \
+    c2++;                                      \
+  t2 = (t2 + t2) & BN_MASK2;                   \
+  if (t1 & BN_TBIT)                            \
+    t2++;                                      \
+  t1 = (t1 + t1) & BN_MASK2;                   \
+  c0 = (c0 + t1) & BN_MASK2;                   \
+  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
+    c2++;                                      \
+  c1 = (c1 + t2) & BN_MASK2;                   \
+  if ((c1) < t2)                               \
+    c2++;
+
+#define sqr_add_c(a, i, c0, c1, c2) \
+  sqr64(t1, t2, (a)[i]);            \
+  c0 = (c0 + t1) & BN_MASK2;        \
+  if ((c0) < t1)                    \
+    t2++;                           \
+  c1 = (c1 + t2) & BN_MASK2;        \
+  if ((c1) < t2)                    \
+    c2++;
+
+#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
+#endif /* !BN_LLONG */
+
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+#ifdef BN_LLONG
+  BN_ULLONG t;
+#else
+  BN_ULONG bl, bh;
+#endif
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  mul_add_c(a[0], b[0], c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[1], c2, c3, c1);
+  mul_add_c(a[1], b[0], c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[0], c3, c1, c2);
+  mul_add_c(a[1], b[1], c3, c1, c2);
+  mul_add_c(a[0], b[2], c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  mul_add_c(a[0], b[3], c1, c2, c3);
+  mul_add_c(a[1], b[2], c1, c2, c3);
+  mul_add_c(a[2], b[1], c1, c2, c3);
+  mul_add_c(a[3], b[0], c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  mul_add_c(a[4], b[0], c2, c3, c1);
+  mul_add_c(a[3], b[1], c2, c3, c1);
+  mul_add_c(a[2], b[2], c2, c3, c1);
+  mul_add_c(a[1], b[3], c2, c3, c1);
+  mul_add_c(a[0], b[4], c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  mul_add_c(a[0], b[5], c3, c1, c2);
+  mul_add_c(a[1], b[4], c3, c1, c2);
+  mul_add_c(a[2], b[3], c3, c1, c2);
+  mul_add_c(a[3], b[2], c3, c1, c2);
+  mul_add_c(a[4], b[1], c3, c1, c2);
+  mul_add_c(a[5], b[0], c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  mul_add_c(a[6], b[0], c1, c2, c3);
+  mul_add_c(a[5], b[1], c1, c2, c3);
+  mul_add_c(a[4], b[2], c1, c2, c3);
+  mul_add_c(a[3], b[3], c1, c2, c3);
+  mul_add_c(a[2], b[4], c1, c2, c3);
+  mul_add_c(a[1], b[5], c1, c2, c3);
+  mul_add_c(a[0], b[6], c1, c2, c3);
+  r[6] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[7], c2, c3, c1);
+  mul_add_c(a[1], b[6], c2, c3, c1);
+  mul_add_c(a[2], b[5], c2, c3, c1);
+  mul_add_c(a[3], b[4], c2, c3, c1);
+  mul_add_c(a[4], b[3], c2, c3, c1);
+  mul_add_c(a[5], b[2], c2, c3, c1);
+  mul_add_c(a[6], b[1], c2, c3, c1);
+  mul_add_c(a[7], b[0], c2, c3, c1);
+  r[7] = c2;
+  c2 = 0;
+  mul_add_c(a[7], b[1], c3, c1, c2);
+  mul_add_c(a[6], b[2], c3, c1, c2);
+  mul_add_c(a[5], b[3], c3, c1, c2);
+  mul_add_c(a[4], b[4], c3, c1, c2);
+  mul_add_c(a[3], b[5], c3, c1, c2);
+  mul_add_c(a[2], b[6], c3, c1, c2);
+  mul_add_c(a[1], b[7], c3, c1, c2);
+  r[8] = c3;
+  c3 = 0;
+  mul_add_c(a[2], b[7], c1, c2, c3);
+  mul_add_c(a[3], b[6], c1, c2, c3);
+  mul_add_c(a[4], b[5], c1, c2, c3);
+  mul_add_c(a[5], b[4], c1, c2, c3);
+  mul_add_c(a[6], b[3], c1, c2, c3);
+  mul_add_c(a[7], b[2], c1, c2, c3);
+  r[9] = c1;
+  c1 = 0;
+  mul_add_c(a[7], b[3], c2, c3, c1);
+  mul_add_c(a[6], b[4], c2, c3, c1);
+  mul_add_c(a[5], b[5], c2, c3, c1);
+  mul_add_c(a[4], b[6], c2, c3, c1);
+  mul_add_c(a[3], b[7], c2, c3, c1);
+  r[10] = c2;
+  c2 = 0;
+  mul_add_c(a[4], b[7], c3, c1, c2);
+  mul_add_c(a[5], b[6], c3, c1, c2);
+  mul_add_c(a[6], b[5], c3, c1, c2);
+  mul_add_c(a[7], b[4], c3, c1, c2);
+  r[11] = c3;
+  c3 = 0;
+  mul_add_c(a[7], b[5], c1, c2, c3);
+  mul_add_c(a[6], b[6], c1, c2, c3);
+  mul_add_c(a[5], b[7], c1, c2, c3);
+  r[12] = c1;
+  c1 = 0;
+  mul_add_c(a[6], b[7], c2, c3, c1);
+  mul_add_c(a[7], b[6], c2, c3, c1);
+  r[13] = c2;
+  c2 = 0;
+  mul_add_c(a[7], b[7], c3, c1, c2);
+  r[14] = c3;
+  r[15] = c1;
+}
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
+#ifdef BN_LLONG
+  BN_ULLONG t;
+#else
+  BN_ULONG bl, bh;
+#endif
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  mul_add_c(a[0], b[0], c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  mul_add_c(a[0], b[1], c2, c3, c1);
+  mul_add_c(a[1], b[0], c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[0], c3, c1, c2);
+  mul_add_c(a[1], b[1], c3, c1, c2);
+  mul_add_c(a[0], b[2], c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  mul_add_c(a[0], b[3], c1, c2, c3);
+  mul_add_c(a[1], b[2], c1, c2, c3);
+  mul_add_c(a[2], b[1], c1, c2, c3);
+  mul_add_c(a[3], b[0], c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  mul_add_c(a[3], b[1], c2, c3, c1);
+  mul_add_c(a[2], b[2], c2, c3, c1);
+  mul_add_c(a[1], b[3], c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  mul_add_c(a[2], b[3], c3, c1, c2);
+  mul_add_c(a[3], b[2], c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  mul_add_c(a[3], b[3], c1, c2, c3);
+  r[6] = c1;
+  r[7] = c2;
+}
+
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
+#ifdef BN_LLONG
+  BN_ULLONG t, tt;
+#else
+  BN_ULONG bl, bh;
+#endif
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  sqr_add_c(a, 0, c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 1, 0, c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  sqr_add_c(a, 1, c3, c1, c2);
+  sqr_add_c2(a, 2, 0, c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 3, 0, c1, c2, c3);
+  sqr_add_c2(a, 2, 1, c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  sqr_add_c(a, 2, c2, c3, c1);
+  sqr_add_c2(a, 3, 1, c2, c3, c1);
+  sqr_add_c2(a, 4, 0, c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 5, 0, c3, c1, c2);
+  sqr_add_c2(a, 4, 1, c3, c1, c2);
+  sqr_add_c2(a, 3, 2, c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  sqr_add_c(a, 3, c1, c2, c3);
+  sqr_add_c2(a, 4, 2, c1, c2, c3);
+  sqr_add_c2(a, 5, 1, c1, c2, c3);
+  sqr_add_c2(a, 6, 0, c1, c2, c3);
+  r[6] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 7, 0, c2, c3, c1);
+  sqr_add_c2(a, 6, 1, c2, c3, c1);
+  sqr_add_c2(a, 5, 2, c2, c3, c1);
+  sqr_add_c2(a, 4, 3, c2, c3, c1);
+  r[7] = c2;
+  c2 = 0;
+  sqr_add_c(a, 4, c3, c1, c2);
+  sqr_add_c2(a, 5, 3, c3, c1, c2);
+  sqr_add_c2(a, 6, 2, c3, c1, c2);
+  sqr_add_c2(a, 7, 1, c3, c1, c2);
+  r[8] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 7, 2, c1, c2, c3);
+  sqr_add_c2(a, 6, 3, c1, c2, c3);
+  sqr_add_c2(a, 5, 4, c1, c2, c3);
+  r[9] = c1;
+  c1 = 0;
+  sqr_add_c(a, 5, c2, c3, c1);
+  sqr_add_c2(a, 6, 4, c2, c3, c1);
+  sqr_add_c2(a, 7, 3, c2, c3, c1);
+  r[10] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 7, 4, c3, c1, c2);
+  sqr_add_c2(a, 6, 5, c3, c1, c2);
+  r[11] = c3;
+  c3 = 0;
+  sqr_add_c(a, 6, c1, c2, c3);
+  sqr_add_c2(a, 7, 5, c1, c2, c3);
+  r[12] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 7, 6, c2, c3, c1);
+  r[13] = c2;
+  c2 = 0;
+  sqr_add_c(a, 7, c3, c1, c2);
+  r[14] = c3;
+  r[15] = c1;
+}
+
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
+#ifdef BN_LLONG
+  BN_ULLONG t, tt;
+#else
+  BN_ULONG bl, bh;
+#endif
+  BN_ULONG t1, t2;
+  BN_ULONG c1, c2, c3;
+
+  c1 = 0;
+  c2 = 0;
+  c3 = 0;
+  sqr_add_c(a, 0, c1, c2, c3);
+  r[0] = c1;
+  c1 = 0;
+  sqr_add_c2(a, 1, 0, c2, c3, c1);
+  r[1] = c2;
+  c2 = 0;
+  sqr_add_c(a, 1, c3, c1, c2);
+  sqr_add_c2(a, 2, 0, c3, c1, c2);
+  r[2] = c3;
+  c3 = 0;
+  sqr_add_c2(a, 3, 0, c1, c2, c3);
+  sqr_add_c2(a, 2, 1, c1, c2, c3);
+  r[3] = c1;
+  c1 = 0;
+  sqr_add_c(a, 2, c2, c3, c1);
+  sqr_add_c2(a, 3, 1, c2, c3, c1);
+  r[4] = c2;
+  c2 = 0;
+  sqr_add_c2(a, 3, 2, c3, c1, c2);
+  r[5] = c3;
+  c3 = 0;
+  sqr_add_c(a, 3, c1, c2, c3);
+  r[6] = c1;
+  r[7] = c2;
+}
+
+#if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
+/* This is essentially a reference implementation, which may or may not
+ * result in a performance improvement. E.g. on IA-32 this routine was
+ * observed to give 40% faster rsa1024 private key operations and 10%
+ * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign by only
+ * 10% and *worsens* rsa4096 sign by 15%. Once again, it is a reference
+ * implementation, meant as a starting point for platform-specific
+ * assembler. The numbers above compare compiler-generated code built
+ * with and without -DOPENSSL_BN_ASM_MONT and can vary not only from
+ * platform to platform but even between compiler versions.
+ * Assembler-vs-assembler improvement coefficients can [and are known
+ * to] differ and are to be documented elsewhere. */
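+/* The intended contract is the usual Montgomery product: rp = ap * bp * R^-1
+ * mod np, where R = 2^(num * BN_BITS2) and *n0p = -np^-1 mod 2^BN_BITS2, with
+ * ap, bp and rp each num words long. For num = 1 this reduces to
+ * rp[0] = ap[0] * bp[0] * R^-1 mod np[0]. */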
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
+  BN_ULONG c0, c1, ml, *tp, n0;
+#ifdef mul64
+  BN_ULONG mh;
+#endif
+  volatile BN_ULONG *vp;
+  int i = 0, j;
+
+#if 0 /* template for platform-specific implementation */
+	if (ap==bp)	return bn_sqr_mont(rp,ap,np,n0p,num);
+#endif
+  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
+
+  n0 = *n0p;
+
+  c0 = 0;
+  ml = bp[0];
+#ifdef mul64
+  mh = HBITS(ml);
+  ml = LBITS(ml);
+  for (j = 0; j < num; ++j)
+    mul(tp[j], ap[j], ml, mh, c0);
+#else
+  for (j = 0; j < num; ++j)
+    mul(tp[j], ap[j], ml, c0);
+#endif
+
+  tp[num] = c0;
+  tp[num + 1] = 0;
+  goto enter;
+
+  for (i = 0; i < num; i++) {
+    c0 = 0;
+    ml = bp[i];
+#ifdef mul64
+    mh = HBITS(ml);
+    ml = LBITS(ml);
+    for (j = 0; j < num; ++j)
+      mul_add(tp[j], ap[j], ml, mh, c0);
+#else
+    for (j = 0; j < num; ++j)
+      mul_add(tp[j], ap[j], ml, c0);
+#endif
+    c1 = (tp[num] + c0) & BN_MASK2;
+    tp[num] = c1;
+    tp[num + 1] = (c1 < c0 ? 1 : 0);
+  enter:
+    c1 = tp[0];
+    ml = (c1 * n0) & BN_MASK2;
+    c0 = 0;
+#ifdef mul64
+    mh = HBITS(ml);
+    ml = LBITS(ml);
+    mul_add(c1, np[0], ml, mh, c0);
+#else
+    mul_add(c1, ml, np[0], c0);
+#endif
+    for (j = 1; j < num; j++) {
+      c1 = tp[j];
+#ifdef mul64
+      mul_add(c1, np[j], ml, mh, c0);
+#else
+      mul_add(c1, ml, np[j], c0);
+#endif
+      tp[j - 1] = c1 & BN_MASK2;
+    }
+    c1 = (tp[num] + c0) & BN_MASK2;
+    tp[num - 1] = c1;
+    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
+  }
+
+  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
+    c0 = bn_sub_words(rp, tp, np, num);
+    if (tp[num] != 0 || c0 == 0) {
+      for (i = 0; i < num + 2; i++)
+        vp[i] = 0;
+      return 1;
+    }
+  }
+  for (i = 0; i < num; i++)
+    rp[i] = tp[i], vp[i] = 0;
+  vp[num] = 0;
+  vp[num + 1] = 0;
+  return 1;
+}
+#endif
+
+#endif
diff --git a/crypto/bn/internal.h b/crypto/bn/internal.h
new file mode 100644
index 0000000..3d6d75d
--- /dev/null
+++ b/crypto/bn/internal.h
@@ -0,0 +1,291 @@
+/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com).
+ *
+ */
+/* ====================================================================
+ * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED.
+ *
+ * Portions of the attached software ("Contribution") are developed by
+ * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project.
+ *
+ * The Contribution is licensed pursuant to the Eric Young open source
+ * license provided above.
+ *
+ * The binary polynomial arithmetic software is originally written by
+ * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems
+ * Laboratories. */
+
+#ifndef OPENSSL_HEADER_BN_INTERNAL_H
+#define OPENSSL_HEADER_BN_INTERNAL_H
+
+#include <openssl/base.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* bn_wexpand ensures that |bn| has at least |words| words of space without
+ * altering its value. It returns |bn| on success or NULL on allocation
+ * failure. */
+BIGNUM *bn_wexpand(BIGNUM *bn, unsigned words);
+
+/* bn_expand acts the same as |bn_wexpand|, but takes a number of bits rather
+ * than a number of words. */
+BIGNUM *bn_expand(BIGNUM *bn, unsigned bits);
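+/* e.g. on a 64-bit build, bn_expand(bn, 521) ensures room for a 521-bit
+ * value, i.e. at least nine 64-bit words. */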
+
+/* bn_correct_top decrements |bn->top| until |bn->d[top-1]| is non-zero or
+ * until |top| is zero. */
+void bn_correct_top(BIGNUM *bn);
+
+#if defined(OPENSSL_64_BIT)
+
+#define BN_ULLONG	unsigned long long
+#define BN_LONG		long
+#define BN_BITS		128
+#define BN_BYTES	8
+#define BN_BITS4	32
+#define BN_MASK		(0xffffffffffffffffffffffffffffffffLL)
+#define BN_MASK2	(0xffffffffffffffffL)
+#define BN_MASK2l	(0xffffffffL)
+#define BN_MASK2h	(0xffffffff00000000L)
+#define BN_MASK2h1	(0xffffffff80000000L)
+#define BN_TBIT		(0x8000000000000000L)
+#define BN_DEC_CONV	(10000000000000000000UL)
+#define BN_DEC_FMT1	"%lu"
+#define BN_DEC_FMT2	"%019lu"
+#define BN_DEC_NUM	19
+#define BN_HEX_FMT1	"%lX"
+#define BN_HEX_FMT2	"%016lX"
+
+#elif defined(OPENSSL_32_BIT)
+
+#define BN_ULLONG	unsigned long long
+#define BN_MASK	(0xffffffffffffffffLL)
+#define BN_LONG		int32_t
+#define BN_BITS		64
+#define BN_BYTES	4
+#define BN_BITS4	16
+#define BN_MASK2	(0xffffffffL)
+#define BN_MASK2l	(0xffff)
+#define BN_MASK2h1	(0xffff8000L)
+#define BN_MASK2h	(0xffff0000L)
+#define BN_TBIT		(0x80000000L)
+#define BN_DEC_CONV	(1000000000L)
+#define BN_DEC_FMT1	"%u"
+#define BN_DEC_FMT2	"%09u"
+#define BN_DEC_NUM	9
+#define BN_HEX_FMT1	"%X"
+#define BN_HEX_FMT2	"%08X"
+
+#else
+#error "Must define either OPENSSL_32_BIT or OPENSSL_64_BIT"
+#endif
+
+/* Pentium pro 16,16,16,32,64 */
+/* Alpha       16,16,16,16,64 */
+#define BN_MULL_SIZE_NORMAL (16)              /* 32 */
+#define BN_MUL_RECURSIVE_SIZE_NORMAL (16)     /* 32 less than */
+#define BN_SQR_RECURSIVE_SIZE_NORMAL (16)     /* 32 */
+#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
+#define BN_MONT_CTX_SET_SIZE_WORD (64)        /* 32 */
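+
+/* The *_RECURSIVE_* thresholds are the operand sizes, in words, below which
+ * the simple quadratic multiply and square routines are used instead of the
+ * Karatsuba-style recursion in mul.c. */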
+
+BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
+void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
+BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d);
+BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,int num);
+
+void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b);
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a);
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a);
+
+/* bn_cmp_words returns a value less than, equal to or greater than zero if
+ * the array |a|, of length |n|, is less than, equal to or greater than |b|. */
+int bn_cmp_words(const BN_ULONG *a, const BN_ULONG *b, int n);
+
+/* bn_cmp_part_words returns a value less than, equal to or greater than zero
+ * if the array |a| is less than, equal to or greater than |b|. The arrays can
+ * be of different lengths: |cl| gives the minimum of the two lengths and |dl|
+ * gives the length of |a| minus the length of |b|. */
+int bn_cmp_part_words(const BN_ULONG *a, const BN_ULONG *b, int cl, int dl);
+
+int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
+                const BN_ULONG *np, const BN_ULONG *n0, int num);
+
+#define LBITS(a) ((a) & BN_MASK2l)
+#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
+#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
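+
+/* e.g. with 64-bit words, LBITS(0x0123456789abcdef) == 0x89abcdef,
+ * HBITS(0x0123456789abcdef) == 0x01234567 and
+ * L2HBITS(0x89abcdef) == 0x89abcdef00000000. */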
+
+#define LLBITS(a) ((a) & BN_MASKl)
+#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
+#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
+
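+/* mul64 computes the double-word product of the one-word values assembled
+ * from the half-word pieces (h:l) and (bh:bl), without needing a wider
+ * integer type; on exit |h| and |l| hold the high and low words of the
+ * product. */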
+#define mul64(l, h, bl, bh)       \
+  {                               \
+    BN_ULONG m, m1, lt, ht;       \
+                                  \
+    lt = l;                       \
+    ht = h;                       \
+    m = (bh) * (lt);              \
+    lt = (bl) * (lt);             \
+    m1 = (bl) * (ht);             \
+    ht = (bh) * (ht);             \
+    m = (m + m1) & BN_MASK2;      \
+    if (m < m1)                   \
+      ht += L2HBITS((BN_ULONG)1); \
+    ht += HBITS(m);               \
+    m1 = L2HBITS(m);              \
+    lt = (lt + m1) & BN_MASK2;    \
+    if (lt < m1)                  \
+      ht++;                       \
+    (l) = lt;                     \
+    (h) = ht;                     \
+  }
+
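+/* Where the toolchain provides a wide multiply, BN_UMULT_HIGH(a, b) evaluates
+ * to the high BN_BITS2 bits of the product a*b and BN_UMULT_LOHI(low, high,
+ * a, b) produces both halves, avoiding mul64-style half-word arithmetic. */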
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64)
+# if defined(__GNUC__) && __GNUC__>=2
+#  define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret,discard;	\
+	__asm__ ("mulq	%3"		\
+	     : "=a"(discard),"=d"(ret)	\
+	     : "a"(a), "g"(b)		\
+	     : "cc");			\
+	ret;			})
+#  define BN_UMULT_LOHI(low,high,a,b)	\
+	__asm__ ("mulq	%3"		\
+		: "=a"(low),"=d"(high)	\
+		: "a"(a),"g"(b)		\
+		: "cc");
+# endif
+# if defined(_MSC_VER) && _MSC_VER>=1400
+   unsigned __int64 __umulh	(unsigned __int64 a,unsigned __int64 b);
+   unsigned __int64 _umul128	(unsigned __int64 a,unsigned __int64 b,
+				 unsigned __int64 *h);
+#  pragma intrinsic(__umulh,_umul128)
+#  define BN_UMULT_HIGH(a,b)		__umulh((a),(b))
+#  define BN_UMULT_LOHI(low,high,a,b)	((low)=_umul128((a),(b),&(high)))
+# endif
+#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
+# if defined(__GNUC__) && __GNUC__>=2
+#  define BN_UMULT_HIGH(a,b)	({	\
+	register BN_ULONG ret;		\
+	__asm__ ("umulh	%0,%1,%2"	\
+	     : "=r"(ret)		\
+	     : "r"(a), "r"(b));		\
+	ret;			})
+# endif
+#endif
+
+
+#if defined(__cplusplus)
+}  /* extern C */
+#endif
+
+#endif  /* OPENSSL_HEADER_BN_INTERNAL_H */
diff --git a/crypto/bn/kronecker.c b/crypto/bn/kronecker.c
new file mode 100644
index 0000000..23ef79a
--- /dev/null
+++ b/crypto/bn/kronecker.c
@@ -0,0 +1,175 @@
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include "internal.h"
+
+
+/* least significant word */
+#define BN_lsw(n) (((n)->top == 0) ? (BN_ULONG) 0 : (n)->d[0])
+
+/* Returns -2 for errors because both -1 and 0 are valid results. */
+int BN_kronecker(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+  int i;
+  int ret = -2;
+  BIGNUM *A, *B, *tmp;
+  /* In 'tab', only odd-indexed entries are relevant:
+   * For any odd BIGNUM n,
+   *     tab[BN_lsw(n) & 7]
+   * is $(-1)^{(n^2-1)/8}$ (using TeX notation).
+   * Note that the sign of n does not matter. */
+  static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1};
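+  /* For example, tab[3] == -1 and tab[7] == 1, matching
+   * (-1)^((3*3-1)/8) == -1 and (-1)^((7*7-1)/8) == 1. */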
+
+  BN_CTX_start(ctx);
+  A = BN_CTX_get(ctx);
+  B = BN_CTX_get(ctx);
+  if (B == NULL) {
+    goto end;
+  }
+
+  if (!BN_copy(A, a) ||
+      !BN_copy(B, b)) {
+    goto end;
+  }
+
+  /* Kronecker symbol, implemented according to Henri Cohen,
+   * "A Course in Computational Algebraic Number Theory"
+   * (algorithm 1.4.10). */
+
+  /* Cohen's step 1: */
+
+  if (BN_is_zero(B)) {
+    ret = BN_abs_is_word(A, 1);
+    goto end;
+  }
+
+  /* Cohen's step 2: */
+
+  if (!BN_is_odd(A) && !BN_is_odd(B)) {
+    ret = 0;
+    goto end;
+  }
+
+  /* now B is non-zero */
+  i = 0;
+  while (!BN_is_bit_set(B, i)) {
+    i++;
+  }
+  if (!BN_rshift(B, B, i)) {
+    goto end;
+  }
+  if (i & 1) {
+    /* i is odd */
+    /* (thus B was even, thus A must be odd!)  */
+
+    /* set 'ret' to $(-1)^{(A^2-1)/8}$ */
+    ret = tab[BN_lsw(A) & 7];
+  } else {
+    /* i is even */
+    ret = 1;
+  }
+
+  if (B->neg) {
+    B->neg = 0;
+    if (A->neg) {
+      ret = -ret;
+    }
+  }
+
+  /* now B is positive and odd, so what remains to be done is to compute the
+   * Jacobi symbol (A/B) and multiply it by 'ret' */
+
+  while (1) {
+    /* Cohen's step 3: */
+
+    /* B is positive and odd */
+    if (BN_is_zero(A)) {
+      ret = BN_is_one(B) ? ret : 0;
+      goto end;
+    }
+
+    /* now A is non-zero */
+    i = 0;
+    while (!BN_is_bit_set(A, i)) {
+      i++;
+    }
+    if (!BN_rshift(A, A, i)) {
+      goto end;
+    }
+    if (i & 1) {
+      /* i is odd */
+      /* multiply 'ret' by  $(-1)^{(B^2-1)/8}$ */
+      ret = ret * tab[BN_lsw(B) & 7];
+    }
+
+    /* Cohen's step 4: */
+    /* multiply 'ret' by  $(-1)^{(A-1)(B-1)/4}$ */
+    if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) {
+      ret = -ret;
+    }
+
+    /* (A, B) := (B mod |A|, |A|) */
+    if (!BN_nnmod(B, B, A, ctx)) {
+      ret = -2;
+      goto end;
+    }
+    tmp = A;
+    A = B;
+    B = tmp;
+    tmp->neg = 0;
+  }
+
+end:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/bn/montgomery.c b/crypto/bn/montgomery.c
new file mode 100644
index 0000000..4c803d2
--- /dev/null
+++ b/crypto/bn/montgomery.c
@@ -0,0 +1,561 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2006 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/mem.h>
+#include <openssl/thread.h>
+
+#include "internal.h"
+
+
+#if !defined(OPENSSL_NO_ASM) && \
+    (defined(OPENSSL_X86) || defined(OPENSSL_X86_64))
+#define OPENSSL_BN_ASM_MONT
+#endif
+
+BN_MONT_CTX *BN_MONT_CTX_new(void) {
+  BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX));
+
+  if (ret == NULL) {
+    return NULL;
+  }
+
+  BN_MONT_CTX_init(ret);
+  ret->flags = BN_FLG_MALLOCED;
+  return ret;
+}
+
+void BN_MONT_CTX_init(BN_MONT_CTX *mont) {
+  memset(mont, 0, sizeof(BN_MONT_CTX));
+  BN_init(&mont->RR);
+  BN_init(&mont->N);
+  BN_init(&mont->Ni);
+}
+
+void BN_MONT_CTX_free(BN_MONT_CTX *mont) {
+  if (mont == NULL) {
+    return;
+  }
+
+  BN_free(&mont->RR);
+  BN_free(&mont->N);
+  BN_free(&mont->Ni);
+  if (mont->flags & BN_FLG_MALLOCED) {
+    OPENSSL_free(mont);
+  }
+}
+
+BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from) {
+  if (to == from) {
+    return to;
+  }
+
+  if (!BN_copy(&to->RR, &from->RR) ||
+      !BN_copy(&to->N, &from->N) ||
+      !BN_copy(&to->Ni, &from->Ni)) {
+    return NULL;
+  }
+  to->ri = from->ri;
+  to->n0[0] = from->n0[0];
+  to->n0[1] = from->n0[1];
+  return to;
+}
+
+int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) {
+  int ret = 0;
+  BIGNUM *Ri, *R;
+  BIGNUM tmod;
+  BN_ULONG buf[2];
+
+  BN_CTX_start(ctx);
+  Ri = BN_CTX_get(ctx);
+  if (Ri == NULL) {
+    goto err;
+  }
+  R = &mont->RR; /* grab RR as a temp */
+  if (!BN_copy(&mont->N, mod)) {
+    goto err; /* Set N */
+  }
+  mont->N.neg = 0;
+
+  BN_init(&tmod);
+  tmod.d = buf;
+  tmod.dmax = 2;
+  tmod.neg = 0;
+
+  mont->ri = (BN_num_bits(mod) + (BN_BITS2 - 1)) / BN_BITS2 * BN_BITS2;
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2 <= 32)
+  /* Only certain BN_BITS2<=32 platforms actually make use of
+   * n0[1], and we could use the #else case (with a shorter R
+   * value) for the others.  However, currently only the assembler
+   * files do know which is which. */
+
+  BN_zero(R);
+  if (!BN_set_bit(R, 2 * BN_BITS2)) {
+    goto err;
+  }
+
+  tmod.top = 0;
+  if ((buf[0] = mod->d[0])) {
+    tmod.top = 1;
+  }
+  if ((buf[1] = mod->top > 1 ? mod->d[1] : 0)) {
+    tmod.top = 2;
+  }
+
+  if (BN_mod_inverse(Ri, R, &tmod, ctx) == NULL) {
+    goto err;
+  }
+  if (!BN_lshift(Ri, Ri, 2 * BN_BITS2)) {
+    goto err; /* R*Ri */
+  }
+  if (!BN_is_zero(Ri)) {
+    if (!BN_sub_word(Ri, 1)) {
+      goto err;
+    }
+  } else {
+    /* if N mod word size == 1 */
+    if (bn_expand(Ri, (int)sizeof(BN_ULONG) * 2) == NULL) {
+      goto err;
+    }
+    /* Ri-- (mod double word size) */
+    Ri->neg = 0;
+    Ri->d[0] = BN_MASK2;
+    Ri->d[1] = BN_MASK2;
+    Ri->top = 2;
+  }
+
+  if (!BN_div(Ri, NULL, Ri, &tmod, ctx)) {
+    goto err;
+  }
+  /* Ni = (R*Ri-1)/N,
+   * keep only couple of least significant words: */
+  mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+  mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
+  BN_zero(R);
+  if (!BN_set_bit(R, BN_BITS2)) {
+    goto err; /* R */
+  }
+
+  buf[0] = mod->d[0]; /* tmod = N mod word size */
+  buf[1] = 0;
+  tmod.top = buf[0] != 0 ? 1 : 0;
+  /* Ri = R^-1 mod N*/
+  if (BN_mod_inverse(Ri, R, &tmod, ctx) == NULL) {
+    goto err;
+  }
+  if (!BN_lshift(Ri, Ri, BN_BITS2)) {
+    goto err; /* R*Ri */
+  }
+  if (!BN_is_zero(Ri)) {
+    if (!BN_sub_word(Ri, 1)) {
+      goto err;
+    }
+  } else {
+    /* if N mod word size == 1 */
+    if (!BN_set_word(Ri, BN_MASK2)) {
+      goto err; /* Ri-- (mod word size) */
+    }
+  }
+  if (!BN_div(Ri, NULL, Ri, &tmod, ctx)) {
+    goto err;
+  }
+  /* Ni = (R*Ri-1)/N,
+   * keep only least significant word: */
+  mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+  mont->n0[1] = 0;
+#endif
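+
+  /* As a small illustration (with an artificially tiny 8-bit word, for
+   * illustration only): for mod == 7, R == 256 and Ri == R^-1 mod 7 == 2, so
+   * (R*Ri - 1)/7 == 511/7 == 73, which equals -7^-1 mod 256, the per-word
+   * Montgomery constant stored in n0. */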
+
+  /* setup RR for conversions */
+  BN_zero(&(mont->RR));
+  if (!BN_set_bit(&(mont->RR), mont->ri * 2)) {
+    goto err;
+  }
+  if (!BN_mod(&(mont->RR), &(mont->RR), &(mont->N), ctx)) {
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
+                                    const BIGNUM *mod, BN_CTX *ctx) {
+  int got_write_lock = 0;
+  BN_MONT_CTX *ret;
+
+  CRYPTO_r_lock(lock);
+  if (!*pmont) {
+    CRYPTO_r_unlock(lock);
+    CRYPTO_w_lock(lock);
+    got_write_lock = 1;
+
+    if (!*pmont) {
+      ret = BN_MONT_CTX_new();
+      if (ret && !BN_MONT_CTX_set(ret, mod, ctx)) {
+        BN_MONT_CTX_free(ret);
+      } else {
+        *pmont = ret;
+      }
+    }
+  }
+
+  ret = *pmont;
+
+  if (got_write_lock) {
+    CRYPTO_w_unlock(lock);
+  } else {
+    CRYPTO_r_unlock(lock);
+  }
+
+  return ret;
+}
+
+int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+                     BN_CTX *ctx) {
+  return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx);
+}
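+
+/* A typical sequence for computing a*b mod n with these routines is, as a
+ * sketch (error handling omitted; ar, br, rr and r are caller-allocated
+ * BIGNUMs):
+ *
+ *   BN_MONT_CTX *mont = BN_MONT_CTX_new();
+ *   BN_MONT_CTX_set(mont, n, ctx);
+ *   BN_to_montgomery(ar, a, mont, ctx);            // ar = a*R mod n
+ *   BN_to_montgomery(br, b, mont, ctx);            // br = b*R mod n
+ *   BN_mod_mul_montgomery(rr, ar, br, mont, ctx);  // rr = a*b*R mod n
+ *   BN_from_montgomery(r, rr, mont, ctx);          // r  = a*b mod n
+ *   BN_MONT_CTX_free(mont);
+ */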
+
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r,
+                                   const BN_MONT_CTX *mont) {
+  const BIGNUM *n;
+  BN_ULONG *ap, *np, *rp, n0, v, carry;
+  int nl, max, i;
+
+  n = &mont->N;
+  nl = n->top;
+  if (nl == 0) {
+    ret->top = 0;
+    return 1;
+  }
+
+  max = (2 * nl); /* carry is stored separately */
+  if (bn_wexpand(r, max) == NULL) {
+    return 0;
+  }
+
+  r->neg ^= n->neg;
+  np = n->d;
+  rp = r->d;
+
+  /* clear the top words of T */
+  if (max > r->top) {
+    memset(&rp[r->top], 0, (max - r->top) * sizeof(BN_ULONG));
+  }
+
+  r->top = max;
+  n0 = mont->n0[0];
+
+  for (carry = 0, i = 0; i < nl; i++, rp++) {
+    v = bn_mul_add_words(rp, np, nl, (rp[0] * n0) & BN_MASK2);
+    v = (v + carry + rp[nl]) & BN_MASK2;
+    carry |= (v != rp[nl]);
+    carry &= (v <= rp[nl]);
+    rp[nl] = v;
+  }
+
+  if (bn_wexpand(ret, nl) == NULL) {
+    return 0;
+  }
+  ret->top = nl;
+  ret->neg = r->neg;
+
+  rp = ret->d;
+  ap = &(r->d[nl]);
+
+  {
+    BN_ULONG *nrp;
+    size_t m;
+
+    v = bn_sub_words(rp, ap, np, nl) - carry;
+    /* if subtraction result is real, then trick unconditional memcpy below to
+     * perform in-place "refresh" instead of actual copy. */
+    m = (0 - (size_t)v);
+    nrp = (BN_ULONG *)(((intptr_t)rp & ~m) | ((intptr_t)ap & m));
+
+    for (i = 0, nl -= 4; i < nl; i += 4) {
+      BN_ULONG t1, t2, t3, t4;
+
+      t1 = nrp[i + 0];
+      t2 = nrp[i + 1];
+      t3 = nrp[i + 2];
+      ap[i + 0] = 0;
+      t4 = nrp[i + 3];
+      ap[i + 1] = 0;
+      rp[i + 0] = t1;
+      ap[i + 2] = 0;
+      rp[i + 1] = t2;
+      ap[i + 3] = 0;
+      rp[i + 2] = t3;
+      rp[i + 3] = t4;
+    }
+
+    for (nl += 4; i < nl; i++) {
+      rp[i] = nrp[i], ap[i] = 0;
+    }
+  }
+
+  bn_correct_top(r);
+  bn_correct_top(ret);
+
+  return 1;
+}
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont,
+                       BN_CTX *ctx) {
+  int retn = 0;
+  BIGNUM *t;
+
+  BN_CTX_start(ctx);
+  t = BN_CTX_get(ctx);
+  if (t == NULL) {
+    goto err;
+  }
+
+  if (BN_copy(t, a)) {
+    retn = BN_from_montgomery_word(ret, t, mont);
+  }
+
+err:
+  BN_CTX_end(ctx);
+
+  return retn;
+}
+
+int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
+                          const BN_MONT_CTX *mont, BN_CTX *ctx) {
+  BIGNUM *tmp;
+  int ret = 0;
+
+#if defined(OPENSSL_BN_ASM_MONT)
+  int num = mont->N.top;
+
+  if (num > 1 && a->top == num && b->top == num) {
+    if (bn_wexpand(r, num) == NULL) {
+      return 0;
+    }
+    if (bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) {
+      r->neg = a->neg ^ b->neg;
+      r->top = num;
+      bn_correct_top(r);
+      return 1;
+    }
+  }
+#endif
+
+  BN_CTX_start(ctx);
+  tmp = BN_CTX_get(ctx);
+  if (tmp == NULL) {
+    goto err;
+  }
+
+  if (a == b) {
+    if (!BN_sqr(tmp, a, ctx)) {
+      goto err;
+    }
+  } else {
+    if (!BN_mul(tmp, a, b, ctx)) {
+      goto err;
+    }
+  }
+
+  /* reduce from aRR to aR */
+  if (!BN_from_montgomery_word(r, tmp, mont)) {
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/bn/mul.c b/crypto/bn/mul.c
new file mode 100644
index 0000000..af26eba
--- /dev/null
+++ b/crypto/bn/mul.c
@@ -0,0 +1,884 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include <assert.h>
+
+#include "internal.h"
+
+
+void bn_mul_normal(BN_ULONG *r, BN_ULONG *a, int na, BN_ULONG *b, int nb) {
+  BN_ULONG *rr;
+
+  if (na < nb) {
+    int itmp;
+    BN_ULONG *ltmp;
+
+    itmp = na;
+    na = nb;
+    nb = itmp;
+    ltmp = a;
+    a = b;
+    b = ltmp;
+  }
+  rr = &(r[na]);
+  if (nb <= 0) {
+    (void)bn_mul_words(r, a, na, 0);
+    return;
+  } else {
+    rr[0] = bn_mul_words(r, a, na, b[0]);
+  }
+
+  for (;;) {
+    if (--nb <= 0) {
+      return;
+    }
+    rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]);
+    if (--nb <= 0) {
+      return;
+    }
+    rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]);
+    if (--nb <= 0) {
+      return;
+    }
+    rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]);
+    if (--nb <= 0) {
+      return;
+    }
+    rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]);
+    rr += 4;
+    r += 4;
+    b += 4;
+  }
+}
+
+void bn_mul_low_normal(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
+  bn_mul_words(r, a, n, b[0]);
+
+  for (;;) {
+    if (--n <= 0) {
+      return;
+    }
+    bn_mul_add_words(&(r[1]), a, n, b[1]);
+    if (--n <= 0) {
+      return;
+    }
+    bn_mul_add_words(&(r[2]), a, n, b[2]);
+    if (--n <= 0) {
+      return;
+    }
+    bn_mul_add_words(&(r[3]), a, n, b[3]);
+    if (--n <= 0) {
+      return;
+    }
+    bn_mul_add_words(&(r[4]), a, n, b[4]);
+    r += 4;
+    b += 4;
+  }
+}
+
+#if !defined(OPENSSL_X86)
+/* Here follow specialised variants of bn_add_words() and bn_sub_words(). They
+ * have the property of performing operations on arrays of different sizes. The
+ * sizes of those arrays are expressed through cl, which is the common length
+ * (basically, min(len(a),len(b))), and dl, which is the delta between the two
+ * lengths, calculated as len(a)-len(b). All lengths are the number of
+ * BN_ULONGs...  For the operations that require a result array as parameter,
+ * it must have the length cl+abs(dl). These functions should probably end up
+ * in bn_asm.c as soon as there are assembler counterparts for the systems that
+ * use assembler files.  */
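+/* For example, with len(a) == 7 and len(b) == 4, cl == 4, dl == 3 and the
+ * result array must hold 7 words. */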
+
+static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a,
+                                  const BN_ULONG *b, int cl, int dl) {
+  BN_ULONG c, t;
+
+  assert(cl >= 0);
+  c = bn_sub_words(r, a, b, cl);
+
+  if (dl == 0)
+    return c;
+
+  r += cl;
+  a += cl;
+  b += cl;
+
+  if (dl < 0) {
+    for (;;) {
+      t = b[0];
+      r[0] = (0 - t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 1;
+      }
+      if (++dl >= 0) {
+        break;
+      }
+
+      t = b[1];
+      r[1] = (0 - t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 1;
+      }
+      if (++dl >= 0) {
+        break;
+      }
+
+      t = b[2];
+      r[2] = (0 - t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 1;
+      }
+      if (++dl >= 0) {
+        break;
+      }
+
+      t = b[3];
+      r[3] = (0 - t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 1;
+      }
+      if (++dl >= 0) {
+        break;
+      }
+
+      b += 4;
+      r += 4;
+    }
+  } else {
+    int save_dl = dl;
+    while (c) {
+      t = a[0];
+      r[0] = (t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 0;
+      }
+      if (--dl <= 0) {
+        break;
+      }
+
+      t = a[1];
+      r[1] = (t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 0;
+      }
+      if (--dl <= 0) {
+        break;
+      }
+
+      t = a[2];
+      r[2] = (t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 0;
+      }
+      if (--dl <= 0) {
+        break;
+      }
+
+      t = a[3];
+      r[3] = (t - c) & BN_MASK2;
+      if (t != 0) {
+        c = 0;
+      }
+      if (--dl <= 0) {
+        break;
+      }
+
+      save_dl = dl;
+      a += 4;
+      r += 4;
+    }
+    if (dl > 0) {
+      if (save_dl > dl) {
+        switch (save_dl - dl) {
+          case 1:
+            r[1] = a[1];
+            if (--dl <= 0) {
+              break;
+            }
+          case 2:
+            r[2] = a[2];
+            if (--dl <= 0) {
+              break;
+            }
+          case 3:
+            r[3] = a[3];
+            if (--dl <= 0) {
+              break;
+            }
+        }
+        a += 4;
+        r += 4;
+      }
+    }
+
+    if (dl > 0) {
+      for (;;) {
+        r[0] = a[0];
+        if (--dl <= 0) {
+          break;
+        }
+        r[1] = a[1];
+        if (--dl <= 0) {
+          break;
+        }
+        r[2] = a[2];
+        if (--dl <= 0) {
+          break;
+        }
+        r[3] = a[3];
+        if (--dl <= 0) {
+          break;
+        }
+
+        a += 4;
+        r += 4;
+      }
+    }
+  }
+
+  return c;
+}
+#else
+/* On x86, bn_sub_part_words is implemented in assembly. */
+BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
+                           int cl, int dl);
+#endif
+
+/* Karatsuba recursive multiplication algorithm
+ * (cf. Knuth, The Art of Computer Programming, Vol. 2) */
+
+/* r is 2*n2 words in size,
+ * a and b are both n2 words in size.
+ * n2 must be a power of 2.
+ * We multiply and return the result.
+ * t must be 2*n2 words in size
+ * We calculate
+ * a[0]*b[0]
+ * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
+ * a[1]*b[1]
+ */
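+/* Writing a = a[1]*B + a[0] and b = b[1]*B + b[0], where B = 2^(BN_BITS2*n2/2),
+ * the three products above recombine as
+ *   a*b = a[1]*b[1]*B^2
+ *         + (a[0]*b[0] + a[1]*b[1] + (a[0]-a[1])*(b[1]-b[0]))*B
+ *         + a[0]*b[0]
+ * so only three half-size multiplications are needed instead of four. */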
+/* dnX may not be positive, but n2/2+dnX has to be */
+static void bn_mul_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n2,
+                             int dna, int dnb, BN_ULONG *t) {
+  int n = n2 / 2, c1, c2;
+  int tna = n + dna, tnb = n + dnb;
+  unsigned int neg, zero;
+  BN_ULONG ln, lo, *p;
+
+  /* Only call bn_mul_comba8 if n2 == 8 and the
+   * two arrays are complete [steve]
+   */
+  if (n2 == 8 && dna == 0 && dnb == 0) {
+    bn_mul_comba8(r, a, b);
+    return;
+  }
+
+  /* Else do normal multiply */
+  if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) {
+    bn_mul_normal(r, a, n2 + dna, b, n2 + dnb);
+    if ((dna + dnb) < 0)
+      memset(&r[2 * n2 + dna + dnb], 0, sizeof(BN_ULONG) * -(dna + dnb));
+    return;
+  }
+
+  /* r=(a[0]-a[1])*(b[1]-b[0]) */
+  c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
+  c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
+  zero = neg = 0;
+  switch (c1 * 3 + c2) {
+    case -4:
+      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);       /* - */
+      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+      break;
+    case -3:
+      zero = 1;
+      break;
+    case -2:
+      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);       /* - */
+      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
+      neg = 1;
+      break;
+    case -1:
+    case 0:
+    case 1:
+      zero = 1;
+      break;
+    case 2:
+      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);       /* + */
+      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+      neg = 1;
+      break;
+    case 3:
+      zero = 1;
+      break;
+    case 4:
+      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
+      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
+      break;
+  }
+
+  if (n == 4 && dna == 0 && dnb == 0) {
+    /* XXX: bn_mul_comba4 could take extra args to do this well */
+    if (!zero) {
+      bn_mul_comba4(&(t[n2]), t, &(t[n]));
+    } else {
+      memset(&(t[n2]), 0, 8 * sizeof(BN_ULONG));
+    }
+
+    bn_mul_comba4(r, a, b);
+    bn_mul_comba4(&(r[n2]), &(a[n]), &(b[n]));
+  } else if (n == 8 && dna == 0 && dnb == 0) {
+    /* XXX: bn_mul_comba8 could take extra args to do this well */
+    if (!zero) {
+      bn_mul_comba8(&(t[n2]), t, &(t[n]));
+    } else {
+      memset(&(t[n2]), 0, 16 * sizeof(BN_ULONG));
+    }
+
+    bn_mul_comba8(r, a, b);
+    bn_mul_comba8(&(r[n2]), &(a[n]), &(b[n]));
+  } else {
+    p = &(t[n2 * 2]);
+    if (!zero) {
+      bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
+    } else {
+      memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
+    }
+    bn_mul_recursive(r, a, b, n, 0, 0, p);
+    bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), n, dna, dnb, p);
+  }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1]) */
+
+  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+  if (neg) {
+    /* if t[32] is negative */
+    c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+  } else {
+    /* Might have a carry */
+    c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+  }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+   * c1 holds the carry bits */
+  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+  if (c1) {
+    p = &(r[n + n2]);
+    lo = *p;
+    ln = (lo + c1) & BN_MASK2;
+    *p = ln;
+
+    /* The overflow will stop before we over write
+     * words we should not overwrite */
+    if (ln < (BN_ULONG)c1) {
+      do {
+        p++;
+        lo = *p;
+        ln = (lo + 1) & BN_MASK2;
+        *p = ln;
+      } while (ln == 0);
+    }
+  }
+}
+
+/* n+tn is the word length
+ * t needs to be n*4 in size, as does r */
+/* tnX may not be negative but must be less than n */
+static void bn_mul_part_recursive(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n,
+                                  int tna, int tnb, BN_ULONG *t) {
+  int i, j, n2 = n * 2;
+  int c1, c2, neg;
+  BN_ULONG ln, lo, *p;
+
+  if (n < 8) {
+    bn_mul_normal(r, a, n + tna, b, n + tnb);
+    return;
+  }
+
+  /* r=(a[0]-a[1])*(b[1]-b[0]) */
+  c1 = bn_cmp_part_words(a, &(a[n]), tna, n - tna);
+  c2 = bn_cmp_part_words(&(b[n]), b, tnb, tnb - n);
+  neg = 0;
+  switch (c1 * 3 + c2) {
+    case -4:
+      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);       /* - */
+      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+      break;
+    case -3:
+    /* break; */
+    case -2:
+      bn_sub_part_words(t, &(a[n]), a, tna, tna - n);       /* - */
+      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n); /* + */
+      neg = 1;
+      break;
+    case -1:
+    case 0:
+    case 1:
+    /* break; */
+    case 2:
+      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);       /* + */
+      bn_sub_part_words(&(t[n]), b, &(b[n]), tnb, n - tnb); /* - */
+      neg = 1;
+      break;
+    case 3:
+    /* break; */
+    case 4:
+      bn_sub_part_words(t, a, &(a[n]), tna, n - tna);
+      bn_sub_part_words(&(t[n]), &(b[n]), b, tnb, tnb - n);
+      break;
+  }
+
+  if (n == 8) {
+    bn_mul_comba8(&(t[n2]), t, &(t[n]));
+    bn_mul_comba8(r, a, b);
+    bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
+    memset(&(r[n2 + tna + tnb]), 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
+  } else {
+    p = &(t[n2 * 2]);
+    bn_mul_recursive(&(t[n2]), t, &(t[n]), n, 0, 0, p);
+    bn_mul_recursive(r, a, b, n, 0, 0, p);
+    i = n / 2;
+    /* If there is only a bottom half to the number,
+     * just do it */
+    if (tna > tnb) {
+      j = tna - i;
+    } else {
+      j = tnb - i;
+    }
+
+    if (j == 0) {
+      bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
+      memset(&(r[n2 + i * 2]), 0, sizeof(BN_ULONG) * (n2 - i * 2));
+    } else if (j > 0) {
+      /* eg, n == 16, i == 8 and tn == 11 */
+      bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i, p);
+      memset(&(r[n2 + tna + tnb]), 0, sizeof(BN_ULONG) * (n2 - tna - tnb));
+    } else {
+      /* (j < 0) eg, n == 16, i == 8 and tn == 5 */
+      memset(&(r[n2]), 0, sizeof(BN_ULONG) * n2);
+      if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL &&
+          tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) {
+        bn_mul_normal(&(r[n2]), &(a[n]), tna, &(b[n]), tnb);
+      } else {
+        for (;;) {
+          i /= 2;
+          /* these simplified conditions work
+           * only because the difference
+           * between tna and tnb is 1 or 0 */
+          if (i < tna || i < tnb) {
+            bn_mul_part_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i,
+                                  tnb - i, p);
+            break;
+          } else if (i == tna || i == tnb) {
+            bn_mul_recursive(&(r[n2]), &(a[n]), &(b[n]), i, tna - i, tnb - i,
+                             p);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0]), c1 is the sign
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+   */
+
+  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+  if (neg) {
+    /* if t[32] is negative */
+    c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+  } else {
+    /* Might have a carry */
+    c1 += (int)(bn_add_words(&(t[n2]), &(t[n2]), t, n2));
+  }
+
+  /* t[32] holds (a[0]-a[1])*(b[1]-b[0])+(a[0]*b[0])+(a[1]*b[1])
+   * r[10] holds (a[0]*b[0])
+   * r[32] holds (a[1]*b[1])
+   * c1 holds the carry bits */
+  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+  if (c1) {
+    p = &(r[n + n2]);
+    lo = *p;
+    ln = (lo + c1) & BN_MASK2;
+    *p = ln;
+
+    /* The overflow will stop before we over write
+     * words we should not overwrite */
+    if (ln < (BN_ULONG)c1) {
+      do {
+        p++;
+        lo = *p;
+        ln = (lo + 1) & BN_MASK2;
+        *p = ln;
+      } while (ln == 0);
+    }
+  }
+}
+
+int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) {
+  int ret = 0;
+  int top, al, bl;
+  BIGNUM *rr;
+  int i;
+  BIGNUM *t = NULL;
+  int j = 0, k;
+
+  al = a->top;
+  bl = b->top;
+
+  if ((al == 0) || (bl == 0)) {
+    BN_zero(r);
+    return 1;
+  }
+  top = al + bl;
+
+  BN_CTX_start(ctx);
+  if ((r == a) || (r == b)) {
+    if ((rr = BN_CTX_get(ctx)) == NULL) {
+      goto err;
+    }
+  } else {
+    rr = r;
+  }
+  rr->neg = a->neg ^ b->neg;
+
+  i = al - bl;
+  if (i == 0) {
+    if (al == 8) {
+      if (bn_wexpand(rr, 16) == NULL) {
+        goto err;
+      }
+      rr->top = 16;
+      bn_mul_comba8(rr->d, a->d, b->d);
+      goto end;
+    }
+  }
+
+  if ((al >= BN_MULL_SIZE_NORMAL) && (bl >= BN_MULL_SIZE_NORMAL)) {
+    if (i >= -1 && i <= 1) {
+      /* Find the largest power of two less than or equal
+         to the length (in words) of the longer of the two numbers */
+      if (i >= 0) {
+        j = BN_num_bits_word((BN_ULONG)al);
+      }
+      if (i == -1) {
+        j = BN_num_bits_word((BN_ULONG)bl);
+      }
+      j = 1 << (j - 1);
+      assert(j <= al || j <= bl);
+      k = j + j;
+      t = BN_CTX_get(ctx);
+      if (t == NULL) {
+        goto err;
+      }
+      if (al > j || bl > j) {
+        if (bn_wexpand(t, k * 4) == NULL) {
+          goto err;
+        }
+        if (bn_wexpand(rr, k * 4) == NULL) {
+          goto err;
+        }
+        bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
+      } else {
+        /* al <= j && bl <= j */
+        if (bn_wexpand(t, k * 2) == NULL) {
+          goto err;
+        }
+        if (bn_wexpand(rr, k * 2) == NULL) {
+          goto err;
+        }
+        bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d);
+      }
+      rr->top = top;
+      goto end;
+    }
+  }
+
+  if (bn_wexpand(rr, top) == NULL) {
+    goto err;
+  }
+  rr->top = top;
+  bn_mul_normal(rr->d, a->d, al, b->d, bl);
+
+end:
+  bn_correct_top(rr);
+  if (r != rr) {
+    BN_copy(r, rr);
+  }
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+/* tmp must have 2*n words */
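+/* bn_sqr_normal first accumulates the cross products a[i]*a[j] (i < j) into
+ * r, doubles them with the bn_add_words(r, r, r, max) call, and then adds the
+ * word squares produced by bn_sqr_words via tmp. */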
+static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, int n, BN_ULONG *tmp) {
+  int i, j, max;
+  const BN_ULONG *ap;
+  BN_ULONG *rp;
+
+  max = n * 2;
+  ap = a;
+  rp = r;
+  rp[0] = rp[max - 1] = 0;
+  rp++;
+  j = n;
+
+  if (--j > 0) {
+    ap++;
+    rp[j] = bn_mul_words(rp, ap, j, ap[-1]);
+    rp += 2;
+  }
+
+  for (i = n - 2; i > 0; i--) {
+    j--;
+    ap++;
+    rp[j] = bn_mul_add_words(rp, ap, j, ap[-1]);
+    rp += 2;
+  }
+
+  bn_add_words(r, r, r, max);
+
+  /* There will not be a carry */
+
+  bn_sqr_words(tmp, a, n);
+
+  bn_add_words(r, r, tmp, max);
+}
+
+/* r is 2*n words in size,
+ * a and b are both n words in size.    (There's not actually a 'b' here ...)
+ * n must be a power of 2.
+ * We multiply and return the result.
+ * t must be 2*n words in size
+ * We calculate
+ * a[0]*b[0]
+ * a[0]*b[0]+a[1]*b[1]+(a[0]-a[1])*(b[1]-b[0])
+ * a[1]*b[1]
+ */
+static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, int n2, BN_ULONG *t) {
+  int n = n2 / 2;
+  int zero, c1;
+  BN_ULONG ln, lo, *p;
+
+  if (n2 == 4) {
+    bn_sqr_comba4(r, a);
+    return;
+  } else if (n2 == 8) {
+    bn_sqr_comba8(r, a);
+    return;
+  }
+  if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) {
+    bn_sqr_normal(r, a, n2, t);
+    return;
+  }
+  /* r=(a[0]-a[1])*(a[1]-a[0]) */
+  c1 = bn_cmp_words(a, &(a[n]), n);
+  zero = 0;
+  if (c1 > 0) {
+    bn_sub_words(t, a, &(a[n]), n);
+  } else if (c1 < 0) {
+    bn_sub_words(t, &(a[n]), a, n);
+  } else {
+    zero = 1;
+  }
+
+  /* The result will always be negative unless it is zero */
+  p = &(t[n2 * 2]);
+
+  if (!zero) {
+    bn_sqr_recursive(&(t[n2]), t, n, p);
+  } else {
+    memset(&(t[n2]), 0, n2 * sizeof(BN_ULONG));
+  }
+  bn_sqr_recursive(r, a, n, p);
+  bn_sqr_recursive(&(r[n2]), &(a[n]), n, p);
+
+  /* t[32] holds (a[0]-a[1])*(a[1]-a[0]), it is negative or zero
+   * r[10] holds (a[0]*a[0])
+   * r[32] holds (a[1]*a[1]) */
+
+  c1 = (int)(bn_add_words(t, r, &(r[n2]), n2));
+
+  /* t[32] is negative */
+  c1 -= (int)(bn_sub_words(&(t[n2]), t, &(t[n2]), n2));
+
+  /* t[32] holds (a[0]-a[1])*(a[1]-a[0])+(a[0]*a[0])+(a[1]*a[1])
+   * r[10] holds (a[0]*a[0])
+   * r[32] holds (a[1]*a[1])
+   * c1 holds the carry bits */
+  c1 += (int)(bn_add_words(&(r[n]), &(r[n]), &(t[n2]), n2));
+  if (c1) {
+    p = &(r[n + n2]);
+    lo = *p;
+    ln = (lo + c1) & BN_MASK2;
+    *p = ln;
+
+    /* The overflow will stop before we over write
+     * words we should not overwrite */
+    if (ln < (BN_ULONG)c1) {
+      do {
+        p++;
+        lo = *p;
+        ln = (lo + 1) & BN_MASK2;
+        *p = ln;
+      } while (ln == 0);
+    }
+  }
+}
+
+int BN_mul_word(BIGNUM *bn, BN_ULONG w) {
+  BN_ULONG ll;
+
+  w &= BN_MASK2;
+  if (!bn->top) {
+    return 1;
+  }
+
+  if (w == 0) {
+    BN_zero(bn);
+    return 1;
+  }
+
+  ll = bn_mul_words(bn->d, bn->d, bn->top, w);
+  if (ll) {
+    if (bn_wexpand(bn, bn->top + 1) == NULL) {
+      return 0;
+    }
+    bn->d[bn->top++] = ll;
+  }
+
+  return 1;
+}
+
+int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) {
+  int max, al;
+  int ret = 0;
+  BIGNUM *tmp, *rr;
+
+  al = a->top;
+  if (al <= 0) {
+    r->top = 0;
+    return 1;
+  }
+
+  BN_CTX_start(ctx);
+  rr = (a != r) ? r : BN_CTX_get(ctx);
+  tmp = BN_CTX_get(ctx);
+  if (!rr || !tmp) {
+    goto err;
+  }
+
+  max = 2 * al; /* Non-zero (from above) */
+  if (bn_wexpand(rr, max) == NULL) {
+    goto err;
+  }
+
+  if (al == 4) {
+    bn_sqr_comba4(rr->d, a->d);
+  } else if (al == 8) {
+    bn_sqr_comba8(rr->d, a->d);
+  } else {
+    if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) {
+      BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2];
+      bn_sqr_normal(rr->d, a->d, al, t);
+    } else {
+      int j, k;
+
+      j = BN_num_bits_word((BN_ULONG)al);
+      j = 1 << (j - 1);
+      k = j + j;
+      if (al == j) {
+        if (bn_wexpand(tmp, k * 2) == NULL) {
+          goto err;
+        }
+        bn_sqr_recursive(rr->d, a->d, al, tmp->d);
+      } else {
+        if (bn_wexpand(tmp, max) == NULL) {
+          goto err;
+        }
+        bn_sqr_normal(rr->d, a->d, al, tmp->d);
+      }
+    }
+  }
+
+  rr->neg = 0;
+  /* If the most-significant half of the top word of 'a' is zero, then
+   * the square of 'a' will fit in max - 1 words. */
+  if (a->d[al - 1] == (a->d[al - 1] & BN_MASK2l)) {
+    rr->top = max - 1;
+  } else {
+    rr->top = max;
+  }
+
+  if (rr != r) {
+    BN_copy(r, rr);
+  }
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
diff --git a/crypto/bn/prime.c b/crypto/bn/prime.c
new file mode 100644
index 0000000..1b8221b
--- /dev/null
+++ b/crypto/bn/prime.c
@@ -0,0 +1,790 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+
+#include "internal.h"
+
+/* number of Miller-Rabin iterations for an error rate of less than 2^-80
+ * for random 'b'-bit input, b >= 100 (taken from table 4.4 in the Handbook
+ * of Applied Cryptography [Menezes, van Oorschot, Vanstone; CRC Press 1996];
+ * original paper: Damgaard, Landrock, Pomerance: Average case error estimates
+ * for the strong probable prime test. -- Math. Comp. 61 (1993) 177-194) */
+#define BN_prime_checks_for_size(b) ((b) >= 1300 ?  2 : \
+                                (b) >=  850 ?  3 : \
+                                (b) >=  650 ?  4 : \
+                                (b) >=  550 ?  5 : \
+                                (b) >=  450 ?  6 : \
+                                (b) >=  400 ?  7 : \
+                                (b) >=  350 ?  8 : \
+                                (b) >=  300 ?  9 : \
+                                (b) >=  250 ? 12 : \
+                                (b) >=  200 ? 15 : \
+                                (b) >=  150 ? 18 : \
+                                /* b >= 100 */ 27)
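+
+/* For example, BN_prime_checks_for_size(2048) == 2, BN_prime_checks_for_size(1024)
+ * == 3 and BN_prime_checks_for_size(512) == 6: larger random candidates need
+ * fewer Miller-Rabin rounds to reach the same 2^-80 error bound. */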
+
+/* The quick sieve algorithm approach to weeding out primes is Philip
+ * Zimmermann's, as implemented in PGP.  I have had a read of his comments and
+ * implemented my own version. */
+
+/* NUMPRIMES is the number of entries in the small-prime table below. */
+#define NUMPRIMES 2048
+
+/* primes is defined at the bottom of the file and contains the first
+ * NUMPRIMES primes, each of which fits into a uint16_t. */
+static const uint16_t primes[NUMPRIMES];
+
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+                   const BIGNUM *a1_odd, int k, BN_CTX *ctx, BN_MONT_CTX *mont);
+static int probable_prime(BIGNUM *rnd, int bits);
+static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
+                             const BIGNUM *rem, BN_CTX *ctx);
+static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add,
+                                  const BIGNUM *rem, BN_CTX *ctx);
+
+void BN_GENCB_set(BN_GENCB *callback,
+                  int (*f)(int event, int n, struct bn_gencb_st *),
+                  void *arg) {
+  callback->callback = f;
+  callback->arg = arg;
+}
+
+int BN_GENCB_call(BN_GENCB *callback, int event, int n) {
+  if (!callback) {
+    return 1;
+  }
+
+  return callback->callback(event, n, callback);
+}
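+
+/* Illustrative sketch (not part of the library): a caller that wants progress
+ * callbacks while generating a prime might wire things up roughly as follows.
+ * |my_progress_cb| and |prime| are hypothetical names used only for this
+ * example; |prime| is a BIGNUM the caller has already allocated.
+ *
+ *   static int my_progress_cb(int event, int n, struct bn_gencb_st *gencb) {
+ *     (void)event; (void)n; (void)gencb;  // e.g. print or update a spinner
+ *     return 1;                           // returning 0 aborts generation
+ *   }
+ *
+ *   BN_GENCB cb;
+ *   BN_GENCB_set(&cb, my_progress_cb, NULL);
+ *   if (!BN_generate_prime_ex(prime, 2048, 0, NULL, NULL, &cb)) {
+ *     // handle error
+ *   }
+ */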
+
+int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add,
+                         const BIGNUM *rem, BN_GENCB *cb) {
+  BIGNUM *t;
+  int found = 0;
+  int i, j, c1 = 0;
+  BN_CTX *ctx;
+  int checks = BN_prime_checks_for_size(bits);
+
+  ctx = BN_CTX_new();
+  if (ctx == NULL) {
+    goto err;
+  }
+  BN_CTX_start(ctx);
+  t = BN_CTX_get(ctx);
+  if (!t) {
+    goto err;
+  }
+
+loop:
+  /* make a random number and set the top and bottom bits */
+  if (add == NULL) {
+    if (!probable_prime(ret, bits)) {
+      goto err;
+    }
+  } else {
+    if (safe) {
+      if (!probable_prime_dh_safe(ret, bits, add, rem, ctx)) {
+        goto err;
+      }
+    } else {
+      if (!probable_prime_dh(ret, bits, add, rem, ctx)) {
+        goto err;
+      }
+    }
+  }
+
+  if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) {
+    /* aborted */
+    goto err;
+  }
+
+  if (!safe) {
+    i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb);
+    if (i == -1) {
+      goto err;
+    } else if (i == 0) {
+      goto loop;
+    }
+  } else {
+    /* for "safe prime" generation, check that (p-1)/2 is prime. Since a prime
+     * is odd, We just need to divide by 2 */
+    if (!BN_rshift1(t, ret)) {
+      goto err;
+    }
+
+    for (i = 0; i < checks; i++) {
+      j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, NULL);
+      if (j == -1) {
+        goto err;
+      } else if (j == 0) {
+        goto loop;
+      }
+
+      j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, NULL);
+      if (j == -1) {
+        goto err;
+      } else if (j == 0) {
+        goto loop;
+      }
+
+      if (!BN_GENCB_call(cb, i, c1 - 1)) {
+        goto err;
+      }
+      /* We have a safe prime test pass */
+    }
+  }
+
+  /* we have a prime :-) */
+  found = 1;
+
+err:
+  if (ctx != NULL) {
+    BN_CTX_end(ctx);
+    BN_CTX_free(ctx);
+  }
+
+  return found;
+}
+
+int BN_primality_test(int *is_probably_prime, const BIGNUM *candidate,
+                      int checks, BN_CTX *ctx, int do_trial_division,
+                      BN_GENCB *cb) {
+  switch (BN_is_prime_fasttest_ex(candidate, checks, ctx, do_trial_division, cb)) {
+    case 1:
+      *is_probably_prime = 1;
+      return 1;
+    case 0:
+      *is_probably_prime = 0;
+      return 1;
+    default:
+      *is_probably_prime = 0;
+      return 0;
+  }
+}
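+
+/* Illustrative example: to test an existing BIGNUM |n| with the default
+ * iteration count and trial division enabled, a caller might write (assuming
+ * the caller already has a BIGNUM |n| and a BN_CTX |ctx|):
+ *
+ *   int is_probably_prime;
+ *   if (!BN_primality_test(&is_probably_prime, n, BN_prime_checks, ctx,
+ *                          1, NULL)) {    // 1 enables trial division
+ *     // handle error
+ *   }
+ *
+ * On success, |is_probably_prime| is 1 for a probable prime and 0 otherwise. */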
+
+int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx, BN_GENCB *cb) {
+  return BN_is_prime_fasttest_ex(candidate, checks, ctx, 0, cb);
+}
+
+int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx_passed,
+                            int do_trial_division, BN_GENCB *cb) {
+  int i, j, ret = -1;
+  int k;
+  BN_CTX *ctx = NULL;
+  BIGNUM *A1, *A1_odd, *check; /* taken from ctx */
+  BN_MONT_CTX *mont = NULL;
+  const BIGNUM *A = NULL;
+
+  if (BN_cmp(a, BN_value_one()) <= 0) {
+    return 0;
+  }
+
+  if (checks == BN_prime_checks) {
+    checks = BN_prime_checks_for_size(BN_num_bits(a));
+  }
+
+  /* first look for small factors */
+  if (!BN_is_odd(a)) {
+    /* a is even => a is prime if and only if a == 2 */
+    return BN_is_word(a, 2);
+  }
+
+  if (do_trial_division) {
+    for (i = 1; i < NUMPRIMES; i++) {
+      if (BN_mod_word(a, primes[i]) == 0) {
+        return 0;
+      }
+    }
+
+    if (!BN_GENCB_call(cb, 1, -1)) {
+      goto err;
+    }
+  }
+
+  if (ctx_passed != NULL) {
+    ctx = ctx_passed;
+  } else if ((ctx = BN_CTX_new()) == NULL) {
+    goto err;
+  }
+  BN_CTX_start(ctx);
+
+  /* A := abs(a) */
+  if (a->neg) {
+    BIGNUM *t;
+    if ((t = BN_CTX_get(ctx)) == NULL) {
+      goto err;
+    }
+    BN_copy(t, a);
+    t->neg = 0;
+    A = t;
+  } else {
+    A = a;
+  }
+
+  A1 = BN_CTX_get(ctx);
+  A1_odd = BN_CTX_get(ctx);
+  check = BN_CTX_get(ctx);
+  if (check == NULL) {
+    goto err;
+  }
+
+  /* compute A1 := A - 1 */
+  if (!BN_copy(A1, A)) {
+    goto err;
+  }
+  if (!BN_sub_word(A1, 1)) {
+    goto err;
+  }
+  if (BN_is_zero(A1)) {
+    ret = 0;
+    goto err;
+  }
+
+  /* write  A1  as  A1_odd * 2^k */
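+  /* e.g. if A = 29, then A1 = 28 = 7 * 2^2, so A1_odd = 7 and k = 2. */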
+  k = 1;
+  while (!BN_is_bit_set(A1, k)) {
+    k++;
+  }
+  if (!BN_rshift(A1_odd, A1, k)) {
+    goto err;
+  }
+
+  /* Montgomery setup for computations mod A */
+  mont = BN_MONT_CTX_new();
+  if (mont == NULL) {
+    goto err;
+  }
+  if (!BN_MONT_CTX_set(mont, A, ctx)) {
+    goto err;
+  }
+
+  for (i = 0; i < checks; i++) {
+    if (!BN_pseudo_rand_range(check, A1)) {
+      goto err;
+    }
+    if (!BN_add_word(check, 1)) {
+      goto err;
+    }
+    /* now 1 <= check < A */
+
+    j = witness(check, A, A1, A1_odd, k, ctx, mont);
+    if (j == -1) {
+      goto err;
+    }
+    if (j) {
+      ret = 0;
+      goto err;
+    }
+    if (!BN_GENCB_call(cb, 1, i)) {
+      goto err;
+    }
+  }
+  ret = 1;
+
+err:
+  if (ctx != NULL) {
+    BN_CTX_end(ctx);
+    if (ctx_passed == NULL) {
+      BN_CTX_free(ctx);
+    }
+  }
+  if (mont != NULL) {
+    BN_MONT_CTX_free(mont);
+  }
+
+  return ret;
+}
+
+static int witness(BIGNUM *w, const BIGNUM *a, const BIGNUM *a1,
+                   const BIGNUM *a1_odd, int k, BN_CTX *ctx,
+                   BN_MONT_CTX *mont) {
+  if (!BN_mod_exp_mont(w, w, a1_odd, a, ctx, mont)) { /* w := w^a1_odd mod a */
+    return -1;
+  }
+  if (BN_is_one(w)) {
+    return 0; /* probably prime */
+  }
+  if (BN_cmp(w, a1) == 0) {
+    return 0; /* w == -1 (mod a),  'a' is probably prime */
+  }
+
+  while (--k) {
+    if (!BN_mod_mul(w, w, w, a, ctx)) { /* w := w^2 mod a */
+      return -1;
+    }
+
+    if (BN_is_one(w)) {
+      return 1; /* 'a' is composite, otherwise a previous 'w' would
+                 * have been == -1 (mod 'a') */
+    }
+
+    if (BN_cmp(w, a1) == 0) {
+      return 0; /* w == -1 (mod a), 'a' is probably prime */
+    }
+  }
+
+  /* If we get here, 'w' is the (a-1)/2-th power of the original 'w',
+   * and it is neither -1 nor +1 -- so 'a' cannot be prime */
+  return 1;
+}
+
+static int probable_prime(BIGNUM *rnd, int bits) {
+  int i;
+  uint16_t mods[NUMPRIMES];
+  BN_ULONG delta, maxdelta;
+
+again:
+  if (!BN_rand(rnd, bits, 1, 1)) {
+    return 0;
+  }
+
+  /* we now have a random number 'rnd' to test. */
+  for (i = 1; i < NUMPRIMES; i++) {
+    mods[i] = (uint16_t)BN_mod_word(rnd, (BN_ULONG)primes[i]);
+  }
+  maxdelta = BN_MASK2 - primes[NUMPRIMES - 1];
+  delta = 0;
+
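+  /* Instead of recomputing BN_mod_word after each increment of the candidate,
+   * the sieve keeps rnd fixed and advances an even offset |delta|: the residue
+   * of (rnd + delta) mod primes[i] is simply (mods[i] + delta) % primes[i]. */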
+loop:
+  for (i = 1; i < NUMPRIMES; i++) {
+    /* check that the candidate rnd+delta is not divisible by any small prime
+     * and that gcd(rnd+delta-1, primes) == 1 (2 is skipped since the
+     * candidate is odd) */
+    if (((mods[i] + delta) % primes[i]) <= 1) {
+      delta += 2;
+      if (delta > maxdelta) {
+        goto again;
+      }
+      goto loop;
+    }
+  }
+
+  if (!BN_add_word(rnd, delta)) {
+    return 0;
+  }
+
+  return 1;
+}
+
+static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add,
+                             const BIGNUM *rem, BN_CTX *ctx) {
+  int i, ret = 0;
+  BIGNUM *t1;
+
+  BN_CTX_start(ctx);
+  if ((t1 = BN_CTX_get(ctx)) == NULL) {
+    goto err;
+  }
+
+  if (!BN_rand(rnd, bits, 0, 1)) {
+    goto err;
+  }
+
+  /* we need ((rnd-rem) % add) == 0 */
+
+  if (!BN_mod(t1, rnd, add, ctx)) {
+    goto err;
+  }
+  if (!BN_sub(rnd, rnd, t1)) {
+    goto err;
+  }
+  if (rem == NULL) {
+    if (!BN_add_word(rnd, 1)) {
+      goto err;
+    }
+  } else {
+    if (!BN_add(rnd, rnd, rem)) {
+      goto err;
+    }
+  }
+  /* we now have a random number 'rnd' to test. */
+
+loop:
+  for (i = 1; i < NUMPRIMES; i++) {
+    /* reject rnd if it, or rnd-1, is divisible by a small prime */
+    if (BN_mod_word(rnd, (BN_ULONG)primes[i]) <= 1) {
+      if (!BN_add(rnd, rnd, add)) {
+        goto err;
+      }
+      goto loop;
+    }
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd,
+                                  const BIGNUM *rem, BN_CTX *ctx) {
+  int i, ret = 0;
+  BIGNUM *t1, *qadd, *q;
+
+  bits--;
+  BN_CTX_start(ctx);
+  t1 = BN_CTX_get(ctx);
+  q = BN_CTX_get(ctx);
+  qadd = BN_CTX_get(ctx);
+  if (qadd == NULL) {
+    goto err;
+  }
+
+  if (!BN_rshift1(qadd, padd)) {
+    goto err;
+  }
+
+  if (!BN_rand(q, bits, 0, 1)) {
+    goto err;
+  }
+
+  /* we need ((q - rem/2) % qadd) == 0 so that p = 2*q + 1 satisfies
+   * ((p - rem) % padd) == 0 */
+  if (!BN_mod(t1, q, qadd, ctx)) {
+    goto err;
+  }
+
+  if (!BN_sub(q, q, t1)) {
+    goto err;
+  }
+
+  if (rem == NULL) {
+    if (!BN_add_word(q, 1)) {
+      goto err;
+    }
+  } else {
+    if (!BN_rshift1(t1, rem)) {
+      goto err;
+    }
+    if (!BN_add(q, q, t1)) {
+      goto err;
+    }
+  }
+
+  /* we now have a random candidate 'q'; form p = 2*q + 1 to test. */
+  if (!BN_lshift1(p, q)) {
+    goto err;
+  }
+  if (!BN_add_word(p, 1)) {
+    goto err;
+  }
+
+loop:
+  for (i = 1; i < NUMPRIMES; i++) {
+    /* check that neither p nor q is divisible by any small prime; since
+     * p = 2*q + 1, this also ensures gcd(p-1, primes) == 1 (except for 2) */
+    if ((BN_mod_word(p, (BN_ULONG)primes[i]) == 0) ||
+        (BN_mod_word(q, (BN_ULONG)primes[i]) == 0)) {
+      if (!BN_add(p, p, padd)) {
+        goto err;
+      }
+      if (!BN_add(q, q, qadd)) {
+        goto err;
+      }
+      goto loop;
+    }
+  }
+
+  ret = 1;
+
+err:
+  BN_CTX_end(ctx);
+  return ret;
+}
+
+static const uint16_t primes[NUMPRIMES] = {
+    2,     3,     5,     7,     11,    13,    17,    19,    23,    29,    31,
+    37,    41,    43,    47,    53,    59,    61,    67,    71,    73,    79,
+    83,    89,    97,    101,   103,   107,   109,   113,   127,   131,   137,
+    139,   149,   151,   157,   163,   167,   173,   179,   181,   191,   193,
+    197,   199,   211,   223,   227,   229,   233,   239,   241,   251,   257,
+    263,   269,   271,   277,   281,   283,   293,   307,   311,   313,   317,
+    331,   337,   347,   349,   353,   359,   367,   373,   379,   383,   389,
+    397,   401,   409,   419,   421,   431,   433,   439,   443,   449,   457,
+    461,   463,   467,   479,   487,   491,   499,   503,   509,   521,   523,
+    541,   547,   557,   563,   569,   571,   577,   587,   593,   599,   601,
+    607,   613,   617,   619,   631,   641,   643,   647,   653,   659,   661,
+    673,   677,   683,   691,   701,   709,   719,   727,   733,   739,   743,
+    751,   757,   761,   769,   773,   787,   797,   809,   811,   821,   823,
+    827,   829,   839,   853,   857,   859,   863,   877,   881,   883,   887,
+    907,   911,   919,   929,   937,   941,   947,   953,   967,   971,   977,
+    983,   991,   997,   1009,  1013,  1019,  1021,  1031,  1033,  1039,  1049,
+    1051,  1061,  1063,  1069,  1087,  1091,  1093,  1097,  1103,  1109,  1117,
+    1123,  1129,  1151,  1153,  1163,  1171,  1181,  1187,  1193,  1201,  1213,
+    1217,  1223,  1229,  1231,  1237,  1249,  1259,  1277,  1279,  1283,  1289,
+    1291,  1297,  1301,  1303,  1307,  1319,  1321,  1327,  1361,  1367,  1373,
+    1381,  1399,  1409,  1423,  1427,  1429,  1433,  1439,  1447,  1451,  1453,
+    1459,  1471,  1481,  1483,  1487,  1489,  1493,  1499,  1511,  1523,  1531,
+    1543,  1549,  1553,  1559,  1567,  1571,  1579,  1583,  1597,  1601,  1607,
+    1609,  1613,  1619,  1621,  1627,  1637,  1657,  1663,  1667,  1669,  1693,
+    1697,  1699,  1709,  1721,  1723,  1733,  1741,  1747,  1753,  1759,  1777,
+    1783,  1787,  1789,  1801,  1811,  1823,  1831,  1847,  1861,  1867,  1871,
+    1873,  1877,  1879,  1889,  1901,  1907,  1913,  1931,  1933,  1949,  1951,
+    1973,  1979,  1987,  1993,  1997,  1999,  2003,  2011,  2017,  2027,  2029,
+    2039,  2053,  2063,  2069,  2081,  2083,  2087,  2089,  2099,  2111,  2113,
+    2129,  2131,  2137,  2141,  2143,  2153,  2161,  2179,  2203,  2207,  2213,
+    2221,  2237,  2239,  2243,  2251,  2267,  2269,  2273,  2281,  2287,  2293,
+    2297,  2309,  2311,  2333,  2339,  2341,  2347,  2351,  2357,  2371,  2377,
+    2381,  2383,  2389,  2393,  2399,  2411,  2417,  2423,  2437,  2441,  2447,
+    2459,  2467,  2473,  2477,  2503,  2521,  2531,  2539,  2543,  2549,  2551,
+    2557,  2579,  2591,  2593,  2609,  2617,  2621,  2633,  2647,  2657,  2659,
+    2663,  2671,  2677,  2683,  2687,  2689,  2693,  2699,  2707,  2711,  2713,
+    2719,  2729,  2731,  2741,  2749,  2753,  2767,  2777,  2789,  2791,  2797,
+    2801,  2803,  2819,  2833,  2837,  2843,  2851,  2857,  2861,  2879,  2887,
+    2897,  2903,  2909,  2917,  2927,  2939,  2953,  2957,  2963,  2969,  2971,
+    2999,  3001,  3011,  3019,  3023,  3037,  3041,  3049,  3061,  3067,  3079,
+    3083,  3089,  3109,  3119,  3121,  3137,  3163,  3167,  3169,  3181,  3187,
+    3191,  3203,  3209,  3217,  3221,  3229,  3251,  3253,  3257,  3259,  3271,
+    3299,  3301,  3307,  3313,  3319,  3323,  3329,  3331,  3343,  3347,  3359,
+    3361,  3371,  3373,  3389,  3391,  3407,  3413,  3433,  3449,  3457,  3461,
+    3463,  3467,  3469,  3491,  3499,  3511,  3517,  3527,  3529,  3533,  3539,
+    3541,  3547,  3557,  3559,  3571,  3581,  3583,  3593,  3607,  3613,  3617,
+    3623,  3631,  3637,  3643,  3659,  3671,  3673,  3677,  3691,  3697,  3701,
+    3709,  3719,  3727,  3733,  3739,  3761,  3767,  3769,  3779,  3793,  3797,
+    3803,  3821,  3823,  3833,  3847,  3851,  3853,  3863,  3877,  3881,  3889,
+    3907,  3911,  3917,  3919,  3923,  3929,  3931,  3943,  3947,  3967,  3989,
+    4001,  4003,  4007,  4013,  4019,  4021,  4027,  4049,  4051,  4057,  4073,
+    4079,  4091,  4093,  4099,  4111,  4127,  4129,  4133,  4139,  4153,  4157,
+    4159,  4177,  4201,  4211,  4217,  4219,  4229,  4231,  4241,  4243,  4253,
+    4259,  4261,  4271,  4273,  4283,  4289,  4297,  4327,  4337,  4339,  4349,
+    4357,  4363,  4373,  4391,  4397,  4409,  4421,  4423,  4441,  4447,  4451,
+    4457,  4463,  4481,  4483,  4493,  4507,  4513,  4517,  4519,  4523,  4547,
+    4549,  4561,  4567,  4583,  4591,  4597,  4603,  4621,  4637,  4639,  4643,
+    4649,  4651,  4657,  4663,  4673,  4679,  4691,  4703,  4721,  4723,  4729,
+    4733,  4751,  4759,  4783,  4787,  4789,  4793,  4799,  4801,  4813,  4817,
+    4831,  4861,  4871,  4877,  4889,  4903,  4909,  4919,  4931,  4933,  4937,
+    4943,  4951,  4957,  4967,  4969,  4973,  4987,  4993,  4999,  5003,  5009,
+    5011,  5021,  5023,  5039,  5051,  5059,  5077,  5081,  5087,  5099,  5101,
+    5107,  5113,  5119,  5147,  5153,  5167,  5171,  5179,  5189,  5197,  5209,
+    5227,  5231,  5233,  5237,  5261,  5273,  5279,  5281,  5297,  5303,  5309,
+    5323,  5333,  5347,  5351,  5381,  5387,  5393,  5399,  5407,  5413,  5417,
+    5419,  5431,  5437,  5441,  5443,  5449,  5471,  5477,  5479,  5483,  5501,
+    5503,  5507,  5519,  5521,  5527,  5531,  5557,  5563,  5569,  5573,  5581,
+    5591,  5623,  5639,  5641,  5647,  5651,  5653,  5657,  5659,  5669,  5683,
+    5689,  5693,  5701,  5711,  5717,  5737,  5741,  5743,  5749,  5779,  5783,
+    5791,  5801,  5807,  5813,  5821,  5827,  5839,  5843,  5849,  5851,  5857,
+    5861,  5867,  5869,  5879,  5881,  5897,  5903,  5923,  5927,  5939,  5953,
+    5981,  5987,  6007,  6011,  6029,  6037,  6043,  6047,  6053,  6067,  6073,
+    6079,  6089,  6091,  6101,  6113,  6121,  6131,  6133,  6143,  6151,  6163,
+    6173,  6197,  6199,  6203,  6211,  6217,  6221,  6229,  6247,  6257,  6263,
+    6269,  6271,  6277,  6287,  6299,  6301,  6311,  6317,  6323,  6329,  6337,
+    6343,  6353,  6359,  6361,  6367,  6373,  6379,  6389,  6397,  6421,  6427,
+    6449,  6451,  6469,  6473,  6481,  6491,  6521,  6529,  6547,  6551,  6553,
+    6563,  6569,  6571,  6577,  6581,  6599,  6607,  6619,  6637,  6653,  6659,
+    6661,  6673,  6679,  6689,  6691,  6701,  6703,  6709,  6719,  6733,  6737,
+    6761,  6763,  6779,  6781,  6791,  6793,  6803,  6823,  6827,  6829,  6833,
+    6841,  6857,  6863,  6869,  6871,  6883,  6899,  6907,  6911,  6917,  6947,
+    6949,  6959,  6961,  6967,  6971,  6977,  6983,  6991,  6997,  7001,  7013,
+    7019,  7027,  7039,  7043,  7057,  7069,  7079,  7103,  7109,  7121,  7127,
+    7129,  7151,  7159,  7177,  7187,  7193,  7207,  7211,  7213,  7219,  7229,
+    7237,  7243,  7247,  7253,  7283,  7297,  7307,  7309,  7321,  7331,  7333,
+    7349,  7351,  7369,  7393,  7411,  7417,  7433,  7451,  7457,  7459,  7477,
+    7481,  7487,  7489,  7499,  7507,  7517,  7523,  7529,  7537,  7541,  7547,
+    7549,  7559,  7561,  7573,  7577,  7583,  7589,  7591,  7603,  7607,  7621,
+    7639,  7643,  7649,  7669,  7673,  7681,  7687,  7691,  7699,  7703,  7717,
+    7723,  7727,  7741,  7753,  7757,  7759,  7789,  7793,  7817,  7823,  7829,
+    7841,  7853,  7867,  7873,  7877,  7879,  7883,  7901,  7907,  7919,  7927,
+    7933,  7937,  7949,  7951,  7963,  7993,  8009,  8011,  8017,  8039,  8053,
+    8059,  8069,  8081,  8087,  8089,  8093,  8101,  8111,  8117,  8123,  8147,
+    8161,  8167,  8171,  8179,  8191,  8209,  8219,  8221,  8231,  8233,  8237,
+    8243,  8263,  8269,  8273,  8287,  8291,  8293,  8297,  8311,  8317,  8329,
+    8353,  8363,  8369,  8377,  8387,  8389,  8419,  8423,  8429,  8431,  8443,
+    8447,  8461,  8467,  8501,  8513,  8521,  8527,  8537,  8539,  8543,  8563,
+    8573,  8581,  8597,  8599,  8609,  8623,  8627,  8629,  8641,  8647,  8663,
+    8669,  8677,  8681,  8689,  8693,  8699,  8707,  8713,  8719,  8731,  8737,
+    8741,  8747,  8753,  8761,  8779,  8783,  8803,  8807,  8819,  8821,  8831,
+    8837,  8839,  8849,  8861,  8863,  8867,  8887,  8893,  8923,  8929,  8933,
+    8941,  8951,  8963,  8969,  8971,  8999,  9001,  9007,  9011,  9013,  9029,
+    9041,  9043,  9049,  9059,  9067,  9091,  9103,  9109,  9127,  9133,  9137,
+    9151,  9157,  9161,  9173,  9181,  9187,  9199,  9203,  9209,  9221,  9227,
+    9239,  9241,  9257,  9277,  9281,  9283,  9293,  9311,  9319,  9323,  9337,
+    9341,  9343,  9349,  9371,  9377,  9391,  9397,  9403,  9413,  9419,  9421,
+    9431,  9433,  9437,  9439,  9461,  9463,  9467,  9473,  9479,  9491,  9497,
+    9511,  9521,  9533,  9539,  9547,  9551,  9587,  9601,  9613,  9619,  9623,
+    9629,  9631,  9643,  9649,  9661,  9677,  9679,  9689,  9697,  9719,  9721,
+    9733,  9739,  9743,  9749,  9767,  9769,  9781,  9787,  9791,  9803,  9811,
+    9817,  9829,  9833,  9839,  9851,  9857,  9859,  9871,  9883,  9887,  9901,
+    9907,  9923,  9929,  9931,  9941,  9949,  9967,  9973,  10007, 10009, 10037,
+    10039, 10061, 10067, 10069, 10079, 10091, 10093, 10099, 10103, 10111, 10133,
+    10139, 10141, 10151, 10159, 10163, 10169, 10177, 10181, 10193, 10211, 10223,
+    10243, 10247, 10253, 10259, 10267, 10271, 10273, 10289, 10301, 10303, 10313,
+    10321, 10331, 10333, 10337, 10343, 10357, 10369, 10391, 10399, 10427, 10429,
+    10433, 10453, 10457, 10459, 10463, 10477, 10487, 10499, 10501, 10513, 10529,
+    10531, 10559, 10567, 10589, 10597, 10601, 10607, 10613, 10627, 10631, 10639,
+    10651, 10657, 10663, 10667, 10687, 10691, 10709, 10711, 10723, 10729, 10733,
+    10739, 10753, 10771, 10781, 10789, 10799, 10831, 10837, 10847, 10853, 10859,
+    10861, 10867, 10883, 10889, 10891, 10903, 10909, 10937, 10939, 10949, 10957,
+    10973, 10979, 10987, 10993, 11003, 11027, 11047, 11057, 11059, 11069, 11071,
+    11083, 11087, 11093, 11113, 11117, 11119, 11131, 11149, 11159, 11161, 11171,
+    11173, 11177, 11197, 11213, 11239, 11243, 11251, 11257, 11261, 11273, 11279,
+    11287, 11299, 11311, 11317, 11321, 11329, 11351, 11353, 11369, 11383, 11393,
+    11399, 11411, 11423, 11437, 11443, 11447, 11467, 11471, 11483, 11489, 11491,
+    11497, 11503, 11519, 11527, 11549, 11551, 11579, 11587, 11593, 11597, 11617,
+    11621, 11633, 11657, 11677, 11681, 11689, 11699, 11701, 11717, 11719, 11731,
+    11743, 11777, 11779, 11783, 11789, 11801, 11807, 11813, 11821, 11827, 11831,
+    11833, 11839, 11863, 11867, 11887, 11897, 11903, 11909, 11923, 11927, 11933,
+    11939, 11941, 11953, 11959, 11969, 11971, 11981, 11987, 12007, 12011, 12037,
+    12041, 12043, 12049, 12071, 12073, 12097, 12101, 12107, 12109, 12113, 12119,
+    12143, 12149, 12157, 12161, 12163, 12197, 12203, 12211, 12227, 12239, 12241,
+    12251, 12253, 12263, 12269, 12277, 12281, 12289, 12301, 12323, 12329, 12343,
+    12347, 12373, 12377, 12379, 12391, 12401, 12409, 12413, 12421, 12433, 12437,
+    12451, 12457, 12473, 12479, 12487, 12491, 12497, 12503, 12511, 12517, 12527,
+    12539, 12541, 12547, 12553, 12569, 12577, 12583, 12589, 12601, 12611, 12613,
+    12619, 12637, 12641, 12647, 12653, 12659, 12671, 12689, 12697, 12703, 12713,
+    12721, 12739, 12743, 12757, 12763, 12781, 12791, 12799, 12809, 12821, 12823,
+    12829, 12841, 12853, 12889, 12893, 12899, 12907, 12911, 12917, 12919, 12923,
+    12941, 12953, 12959, 12967, 12973, 12979, 12983, 13001, 13003, 13007, 13009,
+    13033, 13037, 13043, 13049, 13063, 13093, 13099, 13103, 13109, 13121, 13127,
+    13147, 13151, 13159, 13163, 13171, 13177, 13183, 13187, 13217, 13219, 13229,
+    13241, 13249, 13259, 13267, 13291, 13297, 13309, 13313, 13327, 13331, 13337,
+    13339, 13367, 13381, 13397, 13399, 13411, 13417, 13421, 13441, 13451, 13457,
+    13463, 13469, 13477, 13487, 13499, 13513, 13523, 13537, 13553, 13567, 13577,
+    13591, 13597, 13613, 13619, 13627, 13633, 13649, 13669, 13679, 13681, 13687,
+    13691, 13693, 13697, 13709, 13711, 13721, 13723, 13729, 13751, 13757, 13759,
+    13763, 13781, 13789, 13799, 13807, 13829, 13831, 13841, 13859, 13873, 13877,
+    13879, 13883, 13901, 13903, 13907, 13913, 13921, 13931, 13933, 13963, 13967,
+    13997, 13999, 14009, 14011, 14029, 14033, 14051, 14057, 14071, 14081, 14083,
+    14087, 14107, 14143, 14149, 14153, 14159, 14173, 14177, 14197, 14207, 14221,
+    14243, 14249, 14251, 14281, 14293, 14303, 14321, 14323, 14327, 14341, 14347,
+    14369, 14387, 14389, 14401, 14407, 14411, 14419, 14423, 14431, 14437, 14447,
+    14449, 14461, 14479, 14489, 14503, 14519, 14533, 14537, 14543, 14549, 14551,
+    14557, 14561, 14563, 14591, 14593, 14621, 14627, 14629, 14633, 14639, 14653,
+    14657, 14669, 14683, 14699, 14713, 14717, 14723, 14731, 14737, 14741, 14747,
+    14753, 14759, 14767, 14771, 14779, 14783, 14797, 14813, 14821, 14827, 14831,
+    14843, 14851, 14867, 14869, 14879, 14887, 14891, 14897, 14923, 14929, 14939,
+    14947, 14951, 14957, 14969, 14983, 15013, 15017, 15031, 15053, 15061, 15073,
+    15077, 15083, 15091, 15101, 15107, 15121, 15131, 15137, 15139, 15149, 15161,
+    15173, 15187, 15193, 15199, 15217, 15227, 15233, 15241, 15259, 15263, 15269,
+    15271, 15277, 15287, 15289, 15299, 15307, 15313, 15319, 15329, 15331, 15349,
+    15359, 15361, 15373, 15377, 15383, 15391, 15401, 15413, 15427, 15439, 15443,
+    15451, 15461, 15467, 15473, 15493, 15497, 15511, 15527, 15541, 15551, 15559,
+    15569, 15581, 15583, 15601, 15607, 15619, 15629, 15641, 15643, 15647, 15649,
+    15661, 15667, 15671, 15679, 15683, 15727, 15731, 15733, 15737, 15739, 15749,
+    15761, 15767, 15773, 15787, 15791, 15797, 15803, 15809, 15817, 15823, 15859,
+    15877, 15881, 15887, 15889, 15901, 15907, 15913, 15919, 15923, 15937, 15959,
+    15971, 15973, 15991, 16001, 16007, 16033, 16057, 16061, 16063, 16067, 16069,
+    16073, 16087, 16091, 16097, 16103, 16111, 16127, 16139, 16141, 16183, 16187,
+    16189, 16193, 16217, 16223, 16229, 16231, 16249, 16253, 16267, 16273, 16301,
+    16319, 16333, 16339, 16349, 16361, 16363, 16369, 16381, 16411, 16417, 16421,
+    16427, 16433, 16447, 16451, 16453, 16477, 16481, 16487, 16493, 16519, 16529,
+    16547, 16553, 16561, 16567, 16573, 16603, 16607, 16619, 16631, 16633, 16649,
+    16651, 16657, 16661, 16673, 16691, 16693, 16699, 16703, 16729, 16741, 16747,
+    16759, 16763, 16787, 16811, 16823, 16829, 16831, 16843, 16871, 16879, 16883,
+    16889, 16901, 16903, 16921, 16927, 16931, 16937, 16943, 16963, 16979, 16981,
+    16987, 16993, 17011, 17021, 17027, 17029, 17033, 17041, 17047, 17053, 17077,
+    17093, 17099, 17107, 17117, 17123, 17137, 17159, 17167, 17183, 17189, 17191,
+    17203, 17207, 17209, 17231, 17239, 17257, 17291, 17293, 17299, 17317, 17321,
+    17327, 17333, 17341, 17351, 17359, 17377, 17383, 17387, 17389, 17393, 17401,
+    17417, 17419, 17431, 17443, 17449, 17467, 17471, 17477, 17483, 17489, 17491,
+    17497, 17509, 17519, 17539, 17551, 17569, 17573, 17579, 17581, 17597, 17599,
+    17609, 17623, 17627, 17657, 17659, 17669, 17681, 17683, 17707, 17713, 17729,
+    17737, 17747, 17749, 17761, 17783, 17789, 17791, 17807, 17827, 17837, 17839,
+    17851, 17863, };
diff --git a/crypto/bn/random.c b/crypto/bn/random.c
new file mode 100644
index 0000000..1f1d732
--- /dev/null
+++ b/crypto/bn/random.c
@@ -0,0 +1,237 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.]
+ */
+/* ====================================================================
+ * Copyright (c) 1998-2001 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+#include <openssl/mem.h>
+#include <openssl/rand.h>
+
+int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) {
+  uint8_t *buf = NULL;
+  int ret = 0, bit, bytes, mask;
+
+  if (bits == 0) {
+    BN_zero(rnd);
+    return 1;
+  }
+
+  bytes = (bits + 7) / 8;
+  bit = (bits - 1) % 8;
+  mask = 0xff << (bit + 1);
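+  /* e.g. for bits = 10: bytes = 2 and bit = 1, so after buf[0] &= ~mask only
+   * the two low bits of buf[0] (the most significant byte, since BN_bin2bn is
+   * big-endian) can remain set, giving a 10-bit value. */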
+
+  buf = OPENSSL_malloc(bytes);
+  if (buf == NULL) {
+    OPENSSL_PUT_ERROR(BN, BN_rand, ERR_R_MALLOC_FAILURE);
+    goto err;
+  }
+
+  /* make a random number and set the top and bottom bits */
+  if (RAND_pseudo_bytes(buf, bytes) <= 0) {
+    goto err;
+  }
+
+  if (top != -1) {
+    if (top) {
+      if (bit == 0) {
+        buf[0] = 1;
+        buf[1] |= 0x80;
+      } else {
+        buf[0] |= (3 << (bit - 1));
+      }
+    } else {
+      buf[0] |= (1 << bit);
+    }
+  }
+
+  buf[0] &= ~mask;
+
+  /* set bottom bit if requested */
+  if (bottom) {
+    buf[bytes - 1] |= 1;
+  }
+
+  if (!BN_bin2bn(buf, bytes, rnd)) {
+    goto err;
+  }
+
+  ret = 1;
+
+err:
+  if (buf != NULL) {
+    OPENSSL_cleanse(buf, bytes);
+    OPENSSL_free(buf);
+  }
+  return ret;
+}
+
+int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) {
+  return BN_rand(rnd, bits, top, bottom);
+}
+
+int BN_rand_range(BIGNUM *r, const BIGNUM *range) {
+  unsigned n;
+  unsigned count = 100;
+
+  if (range->neg || BN_is_zero(range)) {
+    OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_INVALID_RANGE);
+    return 0;
+  }
+
+  n = BN_num_bits(range); /* n > 0 */
+
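+  /* The sampling below is a form of rejection sampling: draw a candidate of n
+   * (or n+1) bits, reduce or reject it so that the result is uniform in
+   * [0, range), and retry on rejection. |count| caps the number of retries
+   * (BN_R_TOO_MANY_ITERATIONS) so a broken PRNG cannot loop forever. */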
+  /* BN_is_bit_set(range, n - 1) always holds */
+  if (n == 1) {
+    BN_zero(r);
+  } else if (!BN_is_bit_set(range, n - 2) && !BN_is_bit_set(range, n - 3)) {
+    /* range = 100..._2,
+     * so  3*range (= 11..._2)  is exactly one bit longer than  range */
+    do {
+      if (!BN_rand(r, n + 1, -1 /* don't set most significant bits */,
+                   0 /* don't set least significant bits */)) {
+        return 0;
+      }
+
+      /* If r < 3*range, use r := r MOD range (which is either r, r - range, or
+       * r - 2*range). Otherwise, iterate again. Since 3*range = 11..._2, each
+       * iteration succeeds with probability >= .75. */
+      if (BN_cmp(r, range) >= 0) {
+        if (!BN_sub(r, r, range)) {
+          return 0;
+        }
+        if (BN_cmp(r, range) >= 0) {
+          if (!BN_sub(r, r, range)) {
+            return 0;
+          }
+        }
+      }
+
+      if (!--count) {
+        OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_TOO_MANY_ITERATIONS);
+        return 0;
+      }
+    } while (BN_cmp(r, range) >= 0);
+  } else {
+    do {
+      /* range = 11..._2  or  range = 101..._2 */
+      if (!BN_rand(r, n, -1, 0)) {
+        return 0;
+      }
+
+      if (!--count) {
+        OPENSSL_PUT_ERROR(BN, BN_rand_range, BN_R_TOO_MANY_ITERATIONS);
+        return 0;
+      }
+    } while (BN_cmp(r, range) >= 0);
+  }
+
+  return 1;
+}
+
+int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) {
+  return BN_rand_range(r, range);
+}
diff --git a/crypto/bn/rsaz_exp.c b/crypto/bn/rsaz_exp.c
new file mode 100644
index 0000000..43bb351
--- /dev/null
+++ b/crypto/bn/rsaz_exp.c
@@ -0,0 +1,320 @@
+/*****************************************************************************
+*                                                                            *
+*  Copyright (c) 2012, Intel Corporation                                     *
+*                                                                            *
+*  All rights reserved.                                                      *
+*                                                                            *
+*  Redistribution and use in source and binary forms, with or without        *
+*  modification, are permitted provided that the following conditions are    *
+*  met:                                                                      *
+*                                                                            *
+*  *  Redistributions of source code must retain the above copyright         *
+*     notice, this list of conditions and the following disclaimer.          *
+*                                                                            *
+*  *  Redistributions in binary form must reproduce the above copyright      *
+*     notice, this list of conditions and the following disclaimer in the    *
+*     documentation and/or other materials provided with the                 *
+*     distribution.                                                          *
+*                                                                            *
+*  *  Neither the name of the Intel Corporation nor the names of its         *
+*     contributors may be used to endorse or promote products derived from   *
+*     this software without specific prior written permission.               *
+*                                                                            *
+*                                                                            *
+*  THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY          *
+*  EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE         *
+*  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR        *
+*  PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR            *
+*  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,     *
+*  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,       *
+*  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR        *
+*  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF    *
+*  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING      *
+*  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS        *
+*  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.              *
+*                                                                            *
+******************************************************************************
+* Developers and authors:                                                    *
+* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   *
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel            *
+* (2) University of Haifa, Israel                                            *
+*****************************************************************************/
+
+#include "rsaz_exp.h"
+
+#include <openssl/mem.h>
+
+/*
+ * See crypto/bn/asm/rsaz-avx2.pl for further details.
+ */
+void rsaz_1024_norm2red_avx2(void *red,const void *norm);
+void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);
+void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);
+void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
+void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
+void rsaz_1024_red2norm_avx2(void *norm,const void *red);
+
+#if defined(__GNUC__)
+# define ALIGN64	__attribute__((aligned(64)))
+#elif defined(_MSC_VER)
+# define ALIGN64	__declspec(align(64))
+#elif defined(__SUNPRO_C)
+# define ALIGN64
+# pragma align 64(one,two80)
+#else
+# define ALIGN64	/* not fatal, might hurt performance a little */
+#endif
+
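+/* |one| and |two80| below hold the constants 1 and 2^80 in the redundant limb
+ * representation expected by the AVX2 routines: the 1<<22 in limb 2 of two80
+ * corresponds to bit 2*29 + 22 = 80, consistent with the 29-bit limbs used by
+ * rsaz-avx2.pl. */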
+ALIGN64 static const BN_ULONG one[40] =
+	{1,0,0,    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+ALIGN64 static const BN_ULONG two80[40] =
+	{0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
+	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
+	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
+{
+	unsigned char	 storage[320*3+32*9*16+64];	/* 5.5KB */
+	unsigned char	*p_str = storage + (64-((size_t)storage%64));
+	unsigned char	*a_inv, *m, *result,
+			*table_s = p_str+320*3,
+			*R2      = table_s;	/* borrow */
+	int index;
+	int wvalue;
+
+	if ((((size_t)p_str&4095)+320)>>12) {
+		result = p_str;
+		a_inv = p_str + 320;
+		m = p_str + 320*2;	/* should not cross page */
+	} else {
+		m = p_str;		/* should not cross page */
+		result = p_str + 320;
+		a_inv = p_str + 320*2;
+	}
+
+	rsaz_1024_norm2red_avx2(m, m_norm);
+	rsaz_1024_norm2red_avx2(a_inv, base_norm);
+	rsaz_1024_norm2red_avx2(R2, RR);
+
+	rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
+	rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
+
+	/* table[0] = 1 */
+	rsaz_1024_mul_avx2(result, R2, one, m, k0);
+	/* table[1] = a_inv^1 */
+	rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
+
+	rsaz_1024_scatter5_avx2(table_s,result,0);
+	rsaz_1024_scatter5_avx2(table_s,a_inv,1);
+
+	/* table[2] = a_inv^2 */
+	rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,2);
+#if 0
+	/* this is almost 2x smaller and less than 1% slower */
+	for (index=3; index<32; index++) {
+		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+		rsaz_1024_scatter5_avx2(table_s,result,index);
+	}
+#else
+	/* table[4] = a_inv^4 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,4);
+	/* table[8] = a_inv^8 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,8);
+	/* table[16] = a_inv^16 */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,16);
+	/* table[17] = a_inv^17 */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,17);
+
+	/* table[3] */
+	rsaz_1024_gather5_avx2(result,table_s,2);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,3);
+	/* table[6] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,6);
+	/* table[12] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,12);
+	/* table[24] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,24);
+	/* table[25] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,25);
+
+	/* table[5] */
+	rsaz_1024_gather5_avx2(result,table_s,4);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,5);
+	/* table[10] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,10);
+	/* table[20] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,20);
+	/* table[21] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,21);
+
+	/* table[7] */
+	rsaz_1024_gather5_avx2(result,table_s,6);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,7);
+	/* table[14] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,14);
+	/* table[28] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,28);
+	/* table[29] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,29);
+
+	/* table[9] */
+	rsaz_1024_gather5_avx2(result,table_s,8);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,9);
+	/* table[18] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,18);
+	/* table[19] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,19);
+
+	/* table[11] */
+	rsaz_1024_gather5_avx2(result,table_s,10);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,11);
+	/* table[22] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,22);
+	/* table[23] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,23);
+
+	/* table[13] */
+	rsaz_1024_gather5_avx2(result,table_s,12);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,13);
+	/* table[26] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,26);
+	/* table[27] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,27);
+
+	/* table[15] */
+	rsaz_1024_gather5_avx2(result,table_s,14);
+	rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
+	rsaz_1024_scatter5_avx2(table_s,result,15);
+	/* table[30] */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 1);
+	rsaz_1024_scatter5_avx2(table_s,result,30);
+	/* table[31] */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	rsaz_1024_scatter5_avx2(table_s,result,31);
+#endif
+
+	/* load first window */
+	p_str = (unsigned char*)exponent;
+	wvalue = p_str[127] >> 3;
+	rsaz_1024_gather5_avx2(result,table_s,wvalue);
+
+	index = 1014;
+
+	while (index > -1) {	/* loop over the remaining 203 5-bit windows */
+
+		rsaz_1024_sqr_avx2(result, result, m, k0, 5);
+
+		wvalue = *((unsigned short*)&p_str[index/8]);
+		wvalue = (wvalue>> (index%8)) & 31;
+		index-=5;
+
+		rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
+		rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+	}
+
+	/* square four times */
+	rsaz_1024_sqr_avx2(result, result, m, k0, 4);
+
+	wvalue = p_str[0] & 15;
+
+	rsaz_1024_gather5_avx2(a_inv,table_s,wvalue);	/* borrow a_inv */
+	rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
+
+	/* from Montgomery */
+	rsaz_1024_mul_avx2(result, result, one, m, k0);
+
+	rsaz_1024_red2norm_avx2(result_norm, result);
+
+	OPENSSL_cleanse(storage,sizeof(storage));
+}
+
+/*
+ * See crypto/bn/rsaz-x86_64.pl for further details.
+ */
+void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);
+void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,BN_ULONG k,const void *tbl,unsigned int power);
+void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,BN_ULONG k,unsigned int power);
+void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,BN_ULONG k);
+void rsaz_512_sqr(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);
+void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
+void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+	const BN_ULONG base[8], const BN_ULONG exponent[8],
+	const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
+{
+	unsigned char	 storage[16*8*8+64*2+64];	/* 1.2KB */
+	unsigned char	*table = storage + (64-((size_t)storage%64));
+	BN_ULONG	*a_inv = (BN_ULONG *)(table+16*8*8),
+			*temp  = (BN_ULONG *)(table+16*8*8+8*8);
+	unsigned char	*p_str = (unsigned char*)exponent;
+	int index;
+	unsigned int wvalue;
+
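+	/* The precomputed table holds a_inv^0 .. a_inv^15 in Montgomery form; the
+	 * exponent is then consumed one byte at a time as two 4-bit windows (high
+	 * nibble first), with four squarings between window multiplications. */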
+	/* table[0] = 1_inv */
+	temp[0] = 0-m[0];	temp[1] = ~m[1];
+	temp[2] = ~m[2];	temp[3] = ~m[3];
+	temp[4] = ~m[4];	temp[5] = ~m[5];
+	temp[6] = ~m[6];	temp[7] = ~m[7];
+	rsaz_512_scatter4(table, temp, 0);
+
+	/* table [1] = a_inv^1 */
+	rsaz_512_mul(a_inv, base, RR, m, k0);
+	rsaz_512_scatter4(table, a_inv, 1);
+
+	/* table [2] = a_inv^2 */
+	rsaz_512_sqr(temp, a_inv, m, k0, 1);
+	rsaz_512_scatter4(table, temp, 2);
+
+	for (index=3; index<16; index++)
+		rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
+
+	/* load first window */
+	wvalue = p_str[63];
+
+	rsaz_512_gather4(temp, table, wvalue>>4);
+	rsaz_512_sqr(temp, temp, m, k0, 4);
+	rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);
+
+	for (index=62; index>=0; index--) {
+		wvalue = p_str[index];
+
+		rsaz_512_sqr(temp, temp, m, k0, 4);
+		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);
+
+		rsaz_512_sqr(temp, temp, m, k0, 4);
+		rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
+	}
+
+	/* from Montgomery */
+	rsaz_512_mul_by_one(result, temp, m, k0);
+
+	OPENSSL_cleanse(storage,sizeof(storage));
+}
diff --git a/crypto/bn/rsaz_exp.h b/crypto/bn/rsaz_exp.h
new file mode 100644
index 0000000..4241a1f
--- /dev/null
+++ b/crypto/bn/rsaz_exp.h
@@ -0,0 +1,44 @@
+/******************************************************************************
+* Copyright(c) 2012, Intel Corp.                                             
+* Developers and authors:                                                    
+* Shay Gueron (1, 2), and Vlad Krasnov (1)                                   
+* (1) Intel Corporation, Israel Development Center, Haifa, Israel                               
+* (2) University of Haifa, Israel                                              
+******************************************************************************
+* LICENSE:                                                                
+* This submission to OpenSSL is to be made available under the OpenSSL  
+* license, and only to the OpenSSL project, in order to allow integration    
+* into the publicly distributed code. 
+* The use of this code, or portions of this code, or concepts embedded in
+* this code, or modification of this code and/or algorithm(s) in it, or the
+* use of this code for any other purpose than stated above, requires special
+* licensing.                                                                  
+******************************************************************************
+* DISCLAIMER:                                                                
+* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS AND THE COPYRIGHT OWNERS     
+* ``AS IS''. ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 
+* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 
+* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE CONTRIBUTORS OR THE COPYRIGHT
+* OWNERS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, 
+* OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF    
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS   
+* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN    
+* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)    
+* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
+* POSSIBILITY OF SUCH DAMAGE.                                                
+******************************************************************************/
+
+#ifndef RSAZ_EXP_H
+#define RSAZ_EXP_H
+
+#include <openssl/bn.h>
+
+void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16],
+	const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
+	const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0);
+int rsaz_avx2_eligible(void);
+
+void RSAZ_512_mod_exp(BN_ULONG result[8],
+	const BN_ULONG base_norm[8], const BN_ULONG exponent[8],
+	const BN_ULONG m_norm[8], BN_ULONG k0, const BN_ULONG RR[8]);
+#endif
diff --git a/crypto/bn/shift.c b/crypto/bn/shift.c
new file mode 100644
index 0000000..384a697
--- /dev/null
+++ b/crypto/bn/shift.c
@@ -0,0 +1,284 @@
+/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
+ * All rights reserved.
+ *
+ * This package is an SSL implementation written
+ * by Eric Young (eay@cryptsoft.com).
+ * The implementation was written so as to conform with Netscapes SSL.
+ *
+ * This library is free for commercial and non-commercial use as long as
+ * the following conditions are aheared to.  The following conditions
+ * apply to all code found in this distribution, be it the RC4, RSA,
+ * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
+ * included with this distribution is covered by the same copyright terms
+ * except that the holder is Tim Hudson (tjh@cryptsoft.com).
+ *
+ * Copyright remains Eric Young's, and as such any Copyright notices in
+ * the code are not to be removed.
+ * If this package is used in a product, Eric Young should be given attribution
+ * as the author of the parts of the library used.
+ * This can be in the form of a textual message at program startup or
+ * in documentation (online or textual) provided with the package.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *    "This product includes cryptographic software written by
+ *     Eric Young (eay@cryptsoft.com)"
+ *    The word 'cryptographic' can be left out if the rouines from the library
+ *    being used are not cryptographic related :-).
+ * 4. If you include any Windows specific code (or a derivative thereof) from
+ *    the apps directory (application code) you must include an acknowledgement:
+ *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * The licence and distribution terms for any publically available version or
+ * derivative of this code cannot be changed.  i.e. this code cannot simply be
+ * copied and put under another distribution licence
+ * [including the GNU Public Licence.] */
+
+#include <openssl/bn.h>
+
+#include "internal.h"
+
+int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) {
+  int i, nw, lb, rb;
+  BN_ULONG *t, *f;
+  BN_ULONG l;
+
+  r->neg = a->neg;
+  nw = n / BN_BITS2;
+  if (bn_wexpand(r, a->top + nw + 1) == NULL) {
+    return 0;
+  }
+  lb = n % BN_BITS2;
+  rb = BN_BITS2 - lb;
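+  /* e.g. on a 64-bit build (BN_BITS2 == 64), n = 70 gives nw = 1 whole word of
+   * shift, lb = 6 bits shifted within each word and rb = 58 bits carried into
+   * the next word. */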
+  f = a->d;
+  t = r->d;
+  t[a->top + nw] = 0;
+  if (lb == 0) {
+    for (i = a->top - 1; i >= 0; i--) {
+      t[nw + i] = f[i];
+    }
+  } else {
+    for (i = a->top - 1; i >= 0; i--) {
+      l = f[i];
+      t[nw + i + 1] |= (l >> rb) & BN_MASK2;
+      t[nw + i] = (l << lb) & BN_MASK2;
+    }
+  }
+  memset(t, 0, nw * sizeof(t[0]));
+  r->top = a->top + nw + 1;
+  bn_correct_top(r);
+
+  return 1;
+}
+
+int BN_lshift1(BIGNUM *r, const BIGNUM *a) {
+  BN_ULONG *ap, *rp, t, c;
+  int i;
+
+  if (r != a) {
+    r->neg = a->neg;
+    if (bn_wexpand(r, a->top + 1) == NULL) {
+      return 0;
+    }
+    r->top = a->top;
+  } else {
+    if (bn_wexpand(r, a->top + 1) == NULL) {
+      return 0;
+    }
+  }
+  ap = a->d;
+  rp = r->d;
+  c = 0;
+  for (i = 0; i < a->top; i++) {
+    t = *(ap++);
+    *(rp++) = ((t << 1) | c) & BN_MASK2;
+    c = (t & BN_TBIT) ? 1 : 0;
+  }
+  if (c) {
+    *rp = 1;
+    r->top++;
+  }
+
+  return 1;
+}
+
+int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) {
+  int i, j, nw, lb, rb;
+  BN_ULONG *t, *f;
+  BN_ULONG l, tmp;
+
+  nw = n / BN_BITS2;
+  rb = n % BN_BITS2;
+  lb = BN_BITS2 - rb;
+  if (nw >= a->top || a->top == 0) {
+    BN_zero(r);
+    return 1;
+  }
+  i = (BN_num_bits(a) - n + (BN_BITS2 - 1)) / BN_BITS2;
+  if (r != a) {
+    r->neg = a->neg;
+    if (bn_wexpand(r, i) == NULL) {
+      return 0;
+    }
+  } else {
+    if (n == 0) {
+      return 1; /* or the copying loop will go berserk */
+    }
+  }
+
+  f = &(a->d[nw]);
+  t = r->d;
+  j = a->top - nw;
+  r->top = i;
+
+  if (rb == 0) {
+    for (i = j; i != 0; i--) {
+      *(t++) = *(f++);
+    }
+  } else {
+    l = *(f++);
+    for (i = j - 1; i != 0; i--) {
+      tmp = (l >> rb) & BN_MASK2;
+      l = *(f++);
+      *(t++) = (tmp | (l << lb)) & BN_MASK2;
+    }
+    if ((l = (l >> rb) & BN_MASK2)) {
+      *(t) = l;
+    }
+  }
+
+  return 1;
+}
+
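+/* BN_rshift1 sets |r| to |a| / 2, discarding the lowest bit. |r| and |a| may
+ * be the same BIGNUM. It returns one on success and zero on allocation
+ * failure. */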
+int BN_rshift1(BIGNUM *r, const BIGNUM *a) {
+  BN_ULONG *ap, *rp, t, c;
+  int i, j;
+
+  if (BN_is_zero(a)) {
+    BN_zero(r);
+    return 1;
+  }
+  i = a->top;
+  ap = a->d;
+  j = i - (ap[i - 1] == 1);
+  if (a != r) {
+    if (bn_wexpand(r, j) == NULL) {
+      return 0;
+    }
+    r->neg = a->neg;
+  }
+  rp = r->d;
+  t = ap[--i];
+  c = (t & 1) ? BN_TBIT : 0;
+  if (t >>= 1) {
+    rp[i] = t;
+  }
+  while (i > 0) {
+    t = ap[--i];
+    rp[i] = ((t >> 1) & BN_MASK2) | c;
+    c = (t & 1) ? BN_TBIT : 0;
+  }
+  r->top = j;
+
+  return 1;
+}
+
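+/* BN_set_bit sets bit |n| of |a| to one, zero-extending |a| if necessary. It
+ * returns one on success and zero if |n| is negative or on allocation
+ * failure. */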
+int BN_set_bit(BIGNUM *a, int n) {
+  int i, j, k;
+
+  if (n < 0) {
+    return 0;
+  }
+
+  i = n / BN_BITS2;
+  j = n % BN_BITS2;
+  if (a->top <= i) {
+    if (bn_wexpand(a, i + 1) == NULL) {
+      return 0;
+    }
+    for (k = a->top; k < i + 1; k++) {
+      a->d[k] = 0;
+    }
+    a->top = i + 1;
+  }
+
+  a->d[i] |= (((BN_ULONG)1) << j);
+
+  return 1;
+}
+
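+/* BN_clear_bit sets bit |n| of |a| to zero. It returns one on success and
+ * zero if |n| is negative or lies beyond the top word of |a|. */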
+int BN_clear_bit(BIGNUM *a, int n) {
+  int i, j;
+
+  if (n < 0) {
+    return 0;
+  }
+
+  i = n / BN_BITS2;
+  j = n % BN_BITS2;
+  if (a->top <= i) {
+    return 0;
+  }
+
+  a->d[i] &= (~(((BN_ULONG)1) << j));
+  bn_correct_top(a);
+  return 1;
+}
+
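+/* BN_is_bit_set returns one if bit |n| of |a| is set and zero otherwise;
+ * bits beyond the end of |a| (and negative |n|) read as zero. */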
+int BN_is_bit_set(const BIGNUM *a, int n) {
+  int i, j;
+
+  if (n < 0) {
+    return 0;
+  }
+  i = n / BN_BITS2;
+  j = n % BN_BITS2;
+  if (a->top <= i) {
+    return 0;
+  }
+
+  return (a->d[i] >> j) & 1;
+}
+
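+/* BN_mask_bits truncates |a| to its lowest |n| bits (a &= 2^n - 1). It
+ * returns one on success and zero if |n| is negative or bit |n| lies beyond
+ * the top word of |a|. */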
+int BN_mask_bits(BIGNUM *a, int n) {
+  int b, w;
+
+  if (n < 0) {
+    return 0;
+  }
+
+  w = n / BN_BITS2;
+  b = n % BN_BITS2;
+  if (w >= a->top) {
+    return 0;
+  }
+  if (b == 0) {
+    a->top = w;
+  } else {
+    a->top = w + 1;
+    a->d[w] &= ~(BN_MASK2 << b);
+  }
+
+  bn_correct_top(a);
+  return 1;
+}
diff --git a/crypto/bn/sqrt.c b/crypto/bn/sqrt.c
new file mode 100644
index 0000000..3ec763b
--- /dev/null
+++ b/crypto/bn/sqrt.c
@@ -0,0 +1,430 @@
+/* Written by Lenka Fibikova <fibikova@exp-math.uni-essen.de>
+ * and Bodo Moeller for the OpenSSL project. */
+/* ====================================================================
+ * Copyright (c) 1998-2000 The OpenSSL Project.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer. 
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ *
+ * 3. All advertising materials mentioning features or use of this
+ *    software must display the following acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
+ *
+ * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
+ *    endorse or promote products derived from this software without
+ *    prior written permission. For written permission, please contact
+ *    openssl-core@openssl.org.
+ *
+ * 5. Products derived from this software may not be called "OpenSSL"
+ *    nor may "OpenSSL" appear in their names without prior written
+ *    permission of the OpenSSL Project.
+ *
+ * 6. Redistributions of any form whatsoever must retain the following
+ *    acknowledgment:
+ *    "This product includes software developed by the OpenSSL Project
+ *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
+ * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
+ * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * ====================================================================
+ *
+ * This product includes cryptographic software written by Eric Young
+ * (eay@cryptsoft.com).  This product includes software written by Tim
+ * Hudson (tjh@cryptsoft.com). */
+
+#include <openssl/bn.h>
+
+#include <openssl/err.h>
+
+
+/* Returns 'ret' such that
+ *      ret^2 == a (mod p),
+ * using the Tonelli/Shanks algorithm (cf. Henri Cohen, "A Course
+ * in Computational Algebraic Number Theory", algorithm 1.5.1).
+ * 'p' must be prime! */
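+/* A minimal usage sketch (illustrative only; error handling elided): with
+ * |a| and |p| already initialised and |ctx| a BN_CTX,
+ *
+ *     BIGNUM *root = BN_mod_sqrt(NULL, a, p, ctx);
+ *
+ * returns a freshly allocated |root| with root^2 == a (mod p), or NULL if no
+ * square root exists (or on any other error).  Passing a non-NULL first
+ * argument stores the result in that BIGNUM instead of allocating one. */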
+BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) {
+  BIGNUM *ret = in;
+  int err = 1;
+  int r;
+  BIGNUM *A, *b, *q, *t, *x, *y;
+  int e, i, j;
+
+  if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) {
+    if (BN_abs_is_word(p, 2)) {
+      if (ret == NULL) {
+        ret = BN_new();
+      }
+      if (ret == NULL) {
+        goto end;
+      }
+      if (!BN_set_word(ret, BN_is_bit_set(a, 0))) {
+        if (ret != in) {
+          BN_free(ret);
+        }
+        return NULL;
+      }
+      return ret;
+    }
+
+    OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+    return NULL;
+  }
+
+  if (BN_is_zero(a) || BN_is_one(a)) {
+    if (ret == NULL) {
+      ret = BN_new();
+    }
+    if (ret == NULL) {
+      goto end;
+    }
+    if (!BN_set_word(ret, BN_is_one(a))) {
+      if (ret != in) {
+        BN_free(ret);
+      }
+      return NULL;
+    }
+    return ret;
+  }
+
+  BN_CTX_start(ctx);
+  A = BN_CTX_get(ctx);
+  b = BN_CTX_get(ctx);
+  q = BN_CTX_get(ctx);
+  t = BN_CTX_get(ctx);
+  x = BN_CTX_get(ctx);
+  y = BN_CTX_get(ctx);
+  if (y == NULL) {
+    goto end;
+  }
+
+  if (ret == NULL) {
+    ret = BN_new();
+  }
+  if (ret == NULL) {
+    goto end;
+  }
+
+  /* A = a mod p */
+  if (!BN_nnmod(A, a, p, ctx)) {
+    goto end;
+  }
+
+  /* now write  |p| - 1  as  2^e*q  where  q  is odd */
+  e = 1;
+  while (!BN_is_bit_set(p, e)) {
+    e++;
+  }
+  /* we'll set  q  later (if needed) */
+
+  if (e == 1) {
+    /* The easy case:  (|p|-1)/2  is odd, so 2 has an inverse
+     * modulo  (|p|-1)/2,  and square roots can be computed
+     * directly by modular exponentiation.
+     * We have
+     *     2 * (|p|+1)/4 == 1   (mod (|p|-1)/2),
+     * so we can use exponent  (|p|+1)/4,  i.e.  (|p|-3)/4 + 1.
+     */
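+    /* For instance, with p = 11 and a = 5: (|p|+1)/4 = 3 and
+     * 5^3 = 125 == 4 (mod 11), and indeed 4^2 = 16 == 5 (mod 11). */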
+    if (!BN_rshift(q, p, 2)) {
+      goto end;
+    }
+    q->neg = 0;
+    if (!BN_add_word(q, 1) ||
+        !BN_mod_exp(ret, A, q, p, ctx)) {
+      goto end;
+    }
+    err = 0;
+    goto vrfy;
+  }
+
+  if (e == 2) {
+    /* |p| == 5  (mod 8)
+     *
+     * In this case  2  is always a non-square since
+     * Legendre(2,p) = (-1)^((p^2-1)/8)  for any odd prime.
+     * So if  a  really is a square, then  2*a  is a non-square.
+     * Thus for
+     *      b := (2*a)^((|p|-5)/8),
+     *      i := (2*a)*b^2
+     * we have
+     *     i^2 = (2*a)^((1 + (|p|-5)/4)*2)
+     *         = (2*a)^((p-1)/2)
+     *         = -1;
+     * so if we set
+     *      x := a*b*(i-1),
+     * then
+     *     x^2 = a^2 * b^2 * (i^2 - 2*i + 1)
+     *         = a^2 * b^2 * (-2*i)
+     *         = a*(-i)*(2*a*b^2)
+     *         = a*(-i)*i
+     *         = a.
+     *
+     * (This is due to A.O.L. Atkin,
+     * <URL:
+     * http://listserv.nodak.edu/scripts/wa.exe?A2=ind9211&L=nmbrthry&O=T&P=562>,
+     * November 1992.)
+     */
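+    /* A small worked example: p = 13 (== 5 mod 8), a = 3.  Then
+     *      b = 6^((13-5)/8) = 6,   i = 6 * 6^2 = 216 == 8 (mod 13),
+     * and 8^2 = 64 == -1 (mod 13) as claimed, so
+     *      x = 3 * 6 * (8 - 1) = 126 == 9 (mod 13),
+     * and indeed 9^2 = 81 == 3 (mod 13). */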
+
+    /* t := 2*a */
+    if (!BN_mod_lshift1_quick(t, A, p)) {
+      goto end;
+    }
+
+    /* b := (2*a)^((|p|-5)/8) */
+    if (!BN_rshift(q, p, 3)) {
+      goto end;
+    }
+    q->neg = 0;
+    if (!BN_mod_exp(b, t, q, p, ctx)) {
+      goto end;
+    }
+
+    /* y := b^2 */
+    if (!BN_mod_sqr(y, b, p, ctx)) {
+      goto end;
+    }
+
+    /* t := (2*a)*b^2 - 1 */
+    if (!BN_mod_mul(t, t, y, p, ctx) ||
+        !BN_sub_word(t, 1)) {
+      goto end;
+    }
+
+    /* x = a*b*t */
+    if (!BN_mod_mul(x, A, b, p, ctx) ||
+        !BN_mod_mul(x, x, t, p, ctx)) {
+      goto end;
+    }
+
+    if (!BN_copy(ret, x)) {
+      goto end;
+    }
+    err = 0;
+    goto vrfy;
+  }
+
+  /* e > 2, so we really have to use the Tonelli/Shanks algorithm.
+   * First, find some  y  that is not a square. */
+  if (!BN_copy(q, p)) {
+    goto end; /* use 'q' as temp */
+  }
+  q->neg = 0;
+  i = 2;
+  do {
+    /* For efficiency, try small numbers first;
+     * if this fails, try random numbers.
+     */
+    if (i < 22) {
+      if (!BN_set_word(y, i)) {
+        goto end;
+      }
+    } else {
+      if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) {
+        goto end;
+      }
+      if (BN_ucmp(y, p) >= 0) {
+        if (!(p->neg ? BN_add : BN_sub)(y, y, p)) {
+          goto end;
+        }
+      }
+      /* now 0 <= y < |p| */
+      if (BN_is_zero(y)) {
+        if (!BN_set_word(y, i)) {
+          goto end;
+        }
+      }
+    }
+
+    r = BN_kronecker(y, q, ctx); /* here 'q' is |p| */
+    if (r < -1) {
+      goto end;
+    }
+    if (r == 0) {
+      /* |y| and |p| share a common factor, so |p| cannot be prime */
+      OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+      goto end;
+    }
+  } while (r == 1 && ++i < 82);
+
+  if (r != -1) {
+    /* Many rounds and still no non-square -- this is more likely
+     * a bug than just bad luck.
+     * Even if  p  is not prime, we should have found some  y
+     * such that r == -1.
+     */
+    OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_TOO_MANY_ITERATIONS);
+    goto end;
+  }
+
+  /* Here's our actual 'q': */
+  if (!BN_rshift(q, q, e)) {
+    goto end;
+  }
+
+  /* Now that we have some non-square, we can find an element
+   * of order  2^e  by computing its q'th power. */
+  if (!BN_mod_exp(y, y, q, p, ctx)) {
+    goto end;
+  }
+  if (BN_is_one(y)) {
+    OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_P_IS_NOT_PRIME);
+    goto end;
+  }
+
+  /* Now we know that (if  p  is indeed prime) there is an integer
+   * k,  0 <= k < 2^e,  such that
+   *
+   *      a^q * y^k == 1   (mod p).
+   *
+   * As  a^q  is a square and  y  is not,  k  must be even.
+   * q+1  is even, too, so there is an element
+   *
+   *     X := a^((q+1)/2) * y^(k/2),
+   *
+   * and it satisfies
+   *
+   *     X^2 = a^q * a     * y^k
+   *         = a,
+   *
+   * so it is the square root that we are looking for.
+   */
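+  /* The code below starts with k = 0 (b = a^q, x = a^((q+1)/2)) and then,
+   * without tracking k explicitly, multiplies suitable powers of y into
+   * b and x until b == 1, preserving a*b == x^2 throughout. */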
+
+  /* t := (q-1)/2  (note that  q  is odd) */
+  if (!BN_rshift1(t, q)) {
+    goto end;
+  }
+
+  /* x := a^((q-1)/2) */
+  if (BN_is_zero(t)) { /* special case: p = 2^e + 1 */
+    if (!BN_nnmod(t, A, p, ctx)) {
+      goto end;
+    }
+    if (BN_is_zero(t)) {
+      /* special case: a == 0  (mod p) */
+      BN_zero(ret);
+      err = 0;
+      goto end;
+    } else if (!BN_one(x)) {
+      goto end;
+    }
+  } else {
+    if (!BN_mod_exp(x, A, t, p, ctx)) {
+      goto end;
+    }
+    if (BN_is_zero(x)) {
+      /* special case: a == 0  (mod p) */
+      BN_zero(ret);
+      err = 0;
+      goto end;
+    }
+  }
+
+  /* b := a*x^2  (= a^q) */
+  if (!BN_mod_sqr(b, x, p, ctx) ||
+      !BN_mod_mul(b, b, A, p, ctx)) {
+    goto end;
+  }
+
+  /* x := a*x    (= a^((q+1)/2)) */
+  if (!BN_mod_mul(x, x, A, p, ctx)) {
+    goto end;
+  }
+
+  while (1) {
+    /* Now  b  is  a^q * y^k  for some even  k  (0 <= k < 2^E
+     * where  E  refers to the original value of  e,  which we
+     * don't keep in a variable),  and  x  is  a^((q+1)/2) * y^(k/2).
+     *
+     * We have  a*b = x^2,
+     *    y^2^(e-1) = -1,
+     *    b^2^(e-1) = 1.
+     */
+
+    if (BN_is_one(b)) {
+      if (!BN_copy(ret, x)) {
+        goto end;
+      }
+      err = 0;
+      goto vrfy;
+    }
+
+
+    /* find smallest  i  such that  b^(2^i) = 1 */
+    i = 1;
+    if (!BN_mod_sqr(t, b, p, ctx)) {
+      goto end;
+    }
+    while (!BN_is_one(t)) {
+      i++;
+      if (i == e) {
+        OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_NOT_A_SQUARE);
+        goto end;
+      }
+      if (!BN_mod_mul(t, t, t, p, ctx)) {
+        goto end;
+      }
+    }
+
+
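+    /* At this point b has order exactly 2^i (b^(2^(i-1)) is a square root
+     * of one other than one, hence -1), and y has order 2^e.  Setting
+     * t := y^(2^(e-i-1)) gives t^(2^i) = -1, so multiplying b by t^2
+     * strictly reduces its order, while x := x*t keeps the invariant
+     * a*b == x^2.  After the update we may take e := i. */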
+    /* t := y^2^(e - i - 1) */
+    if (!BN_copy(t, y)) {
+      goto end;
+    }
+    for (j = e - i - 1; j > 0; j--) {
+      if (!BN_mod_sqr(t, t, p, ctx)) {
+        goto end;
+      }
+    }
+    if (!BN_mod_mul(y, t, t, p, ctx) ||
+        !BN_mod_mul(x, x, t, p, ctx) ||
+        !BN_mod_mul(b, b, y, p, ctx)) {
+      goto end;
+    }
+    e = i;
+  }
+
+vrfy:
+  if (!err) {
+    /* verify the result -- the input might have been not a square
+     * (test added in 0.9.8) */
+
+    if (!BN_mod_sqr(x, ret, p, ctx)) {
+      err = 1;
+    }
+
+    if (!err && 0 != BN_cmp(x, A)) {
+      OPENSSL_PUT_ERROR(BN, BN_mod_sqrt, BN_R_NOT_A_SQUARE);
+      err = 1;
+    }
+  }
+
+end:
+  if (err) {
+    if (ret != NULL && ret != in) {
+      BN_clear_free(ret);
+    }
+    ret = NULL;
+  }
+  BN_CTX_end(ctx);
+  return ret;
+}