|  | // This file is generated from a similarly-named Perl script in the BoringSSL | 
|  | // source tree. Do not edit by hand. | 
|  |  | 
|  | #include <openssl/asm_base.h> | 
|  |  | 
|  | #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) | 
|  | .text | 
|  |  | 
// _bn_mul_mont_words -- multiply two word arrays and reduce the product
// Montgomery-style (one modulus multiple m1*np[] is added per bp word and the
// running sum is shifted down by one word).
//
// Registers on entry, as the code below reads them:
//   x0  rp   destination; receives num words
//   x1  ap   first operand, num words
//   x2  bp   second operand, num words
//   x3  np   modulus, num words
//   x4  pointer to n0; only the single word at [x4] is loaded
//   x5  num  word count; turned into a byte count (lsl #3) in each path
// Nothing is returned in x0 (x0 is advanced past rp[] by the copy loop).
//
// Dispatch: num % 8 == 0 branches to __bn_sqr8x_mont, otherwise num % 4 == 0
// branches to __bn_mul4x_mont; every other num falls through to Lmul_mont.
// Lmul_mont loads ap[0..1] and np[0..1] unconditionally, so it presumably
// expects num >= 2 -- TODO confirm against the callers.
//
// Lmul_mont keeps a num-word scratch vector tp[] on the stack below its
// 64-byte frame and stores zero over every tp[] word in Lcond_copy.
|  | .globl	_bn_mul_mont_words | 
|  | .private_extern	_bn_mul_mont_words | 
|  |  | 
|  | .align	5 | 
|  | _bn_mul_mont_words: | 
|  | AARCH64_SIGN_LINK_REGISTER | 
// Pick the specialised path from the low bits of num.
|  | tst	x5,#7 | 
|  | b.eq	__bn_sqr8x_mont | 
|  | tst	x5,#3 | 
|  | b.eq	__bn_mul4x_mont | 
|  | Lmul_mont: | 
// Frame: x29/x30 at +0, callee-saved x19-x24 at +16..+63.
|  | stp	x29,x30,[sp,#-64]! | 
|  | add	x29,sp,#0 | 
|  | stp	x19,x20,[sp,#16] | 
|  | stp	x21,x22,[sp,#32] | 
|  | stp	x23,x24,[sp,#48] | 
|  |  | 
// Register roles in the generic path:
//   x9      current bp word (bp[0] here, bp[i] in Louter)
//   x4      n0 value; x15 = m1 = (low word of the running sum) * n0
//   x6/x7   low word / carry of the ap[]*bp[i] chain; x10/x11 = lo/hi of ap[j]*bp[i]
//   x12/x13 low word / carry of the np[]*m1 chain;    x16/x17 = lo/hi of np[j]*m1
//   x22     tp cursor; x23 = tp[j] (Louter only)
//   x21, x20  j and i, both kept as byte counts (steps of 8)
//   x19     top-most overflow bit carried from one outer pass to the next
|  | ldr	x9,[x2],#8		// bp[0] | 
|  | sub	x22,sp,x5,lsl#3 | 
|  | ldp	x7,x8,[x1],#16	// ap[0..1] | 
|  | lsl	x5,x5,#3 | 
|  | ldr	x4,[x4]		// *n0 | 
|  | and	x22,x22,#-16		// round tp down to 16 bytes so sp stays aligned (ABI says so) | 
|  | ldp	x13,x14,[x3],#16	// np[0..1] | 
|  |  | 
|  | mul	x6,x7,x9		// ap[0]*bp[0] | 
|  | sub	x21,x5,#16		// j=num-2 | 
|  | umulh	x7,x7,x9 | 
|  | mul	x10,x8,x9		// ap[1]*bp[0] | 
|  | umulh	x11,x8,x9 | 
|  |  | 
|  | mul	x15,x6,x4		// "tp[0]"*n0 | 
|  | mov	sp,x22			// alloca | 
|  |  | 
|  | // (*)	mul	x12,x13,x15	// np[0]*m1 | 
|  | umulh	x13,x13,x15 | 
|  | mul	x16,x14,x15		// np[1]*m1 | 
|  | // (*)	adds	x12,x12,x6	// discarded | 
|  | // (*)	As for removal of first multiplication and addition | 
|  | //	instructions. The outcome of first addition is | 
|  | //	guaranteed to be zero, which leaves two computationally | 
|  | //	significant outcomes: it either carries or not. Then | 
|  | //	question is when does it carry? Is there alternative | 
|  | //	way to deduce it? If you follow operations, you can | 
|  | //	observe that condition for carry is quite simple: | 
|  | //	x6 being non-zero. So that carry can be calculated | 
|  | //	by adding -1 to x6. That's what next instruction does. | 
|  | subs	xzr,x6,#1		// (*) | 
|  | umulh	x17,x14,x15 | 
|  | adc	x13,x13,xzr | 
|  | cbz	x21,L1st_skip | 
|  |  | 
// First pass (i = 0): tp[] is only written, never read. Each iteration
// finishes column j-1 and starts the two multiplications for column j.
|  | L1st: | 
|  | ldr	x8,[x1],#8 | 
|  | adds	x6,x10,x7 | 
|  | sub	x21,x21,#8		// j-- | 
|  | adc	x7,x11,xzr | 
|  |  | 
|  | ldr	x14,[x3],#8 | 
|  | adds	x12,x16,x13 | 
|  | mul	x10,x8,x9		// ap[j]*bp[0] | 
|  | adc	x13,x17,xzr | 
|  | umulh	x11,x8,x9 | 
|  |  | 
|  | adds	x12,x12,x6 | 
|  | mul	x16,x14,x15		// np[j]*m1 | 
|  | adc	x13,x13,xzr | 
|  | umulh	x17,x14,x15 | 
|  | str	x12,[x22],#8		// tp[j-1] | 
|  | cbnz	x21,L1st | 
|  |  | 
// Last column of the first pass: fold both carry words, store the top two
// tp[] words and record the carry out of them in x19.
|  | L1st_skip: | 
|  | adds	x6,x10,x7 | 
|  | sub	x1,x1,x5		// rewind x1 | 
|  | adc	x7,x11,xzr | 
|  |  | 
|  | adds	x12,x16,x13 | 
|  | sub	x3,x3,x5		// rewind x3 | 
|  | adc	x13,x17,xzr | 
|  |  | 
|  | adds	x12,x12,x6 | 
|  | sub	x20,x5,#8		// i=num-1 | 
|  | adcs	x13,x13,x7 | 
|  |  | 
|  | adc	x19,xzr,xzr		// upmost overflow bit | 
|  | stp	x12,x13,[x22] | 
|  |  | 
// Passes i = 1..num-1: as above, but the previous tp[j] is added into the
// ap[]*bp[i] chain and the overflow bit in x19 is folded into the top word.
|  | Louter: | 
|  | ldr	x9,[x2],#8		// bp[i] | 
|  | ldp	x7,x8,[x1],#16 | 
|  | ldr	x23,[sp]		// tp[0] | 
|  | add	x22,sp,#8 | 
|  |  | 
|  | mul	x6,x7,x9		// ap[0]*bp[i] | 
|  | sub	x21,x5,#16		// j=num-2 | 
|  | umulh	x7,x7,x9 | 
|  | ldp	x13,x14,[x3],#16 | 
|  | mul	x10,x8,x9		// ap[1]*bp[i] | 
|  | adds	x6,x6,x23 | 
|  | umulh	x11,x8,x9 | 
|  | adc	x7,x7,xzr | 
|  |  | 
|  | mul	x15,x6,x4 | 
|  | sub	x20,x20,#8		// i-- | 
|  |  | 
// The carry produced by the subs below is consumed by the first adc in
// Linner (or Linner_skip); on later Linner iterations that adc consumes the
// carry of the previous iteration's final adds instead.
|  | // (*)	mul	x12,x13,x15	// np[0]*m1 | 
|  | umulh	x13,x13,x15 | 
|  | mul	x16,x14,x15		// np[1]*m1 | 
|  | // (*)	adds	x12,x12,x6 | 
|  | subs	xzr,x6,#1		// (*) | 
|  | umulh	x17,x14,x15 | 
|  | cbz	x21,Linner_skip | 
|  |  | 
|  | Linner: | 
|  | ldr	x8,[x1],#8 | 
|  | adc	x13,x13,xzr | 
|  | ldr	x23,[x22],#8		// tp[j] | 
|  | adds	x6,x10,x7 | 
|  | sub	x21,x21,#8		// j-- | 
|  | adc	x7,x11,xzr | 
|  |  | 
|  | adds	x12,x16,x13 | 
|  | ldr	x14,[x3],#8 | 
|  | adc	x13,x17,xzr | 
|  |  | 
|  | mul	x10,x8,x9		// ap[j]*bp[i] | 
|  | adds	x6,x6,x23 | 
|  | umulh	x11,x8,x9 | 
|  | adc	x7,x7,xzr | 
|  |  | 
|  | mul	x16,x14,x15		// np[j]*m1 | 
|  | adds	x12,x12,x6 | 
|  | umulh	x17,x14,x15 | 
|  | str	x12,[x22,#-16]		// tp[j-1] | 
|  | cbnz	x21,Linner | 
|  |  | 
|  | Linner_skip: | 
|  | ldr	x23,[x22],#8		// tp[j] | 
|  | adc	x13,x13,xzr | 
|  | adds	x6,x10,x7 | 
|  | sub	x1,x1,x5		// rewind x1 | 
|  | adc	x7,x11,xzr | 
|  |  | 
|  | adds	x12,x16,x13 | 
|  | sub	x3,x3,x5		// rewind x3 | 
|  | adcs	x13,x17,x19 | 
|  | adc	x19,xzr,xzr | 
|  |  | 
|  | adds	x6,x6,x23 | 
|  | adc	x7,x7,xzr | 
|  |  | 
|  | adds	x12,x12,x6 | 
|  | adcs	x13,x13,x7 | 
|  | adc	x19,x19,xzr		// upmost overflow bit | 
|  | stp	x12,x13,[x22,#-16] | 
|  |  | 
|  | cbnz	x20,Louter | 
|  |  | 
|  | // Final step. We see if result is larger than modulus, and | 
|  | // if it is, subtract the modulus. But comparison implies | 
|  | // subtraction. So we subtract modulus, see if it borrowed, | 
|  | // and conditionally copy original value. | 
|  | ldr	x23,[sp]		// tp[0] | 
|  | add	x22,sp,#8 | 
|  | ldr	x14,[x3],#8		// np[0] | 
|  | subs	x21,x5,#8		// j=num-1 and clear borrow | 
|  | mov	x1,x0 | 
|  | Lsub: | 
|  | sbcs	x8,x23,x14		// tp[j]-np[j] | 
|  | ldr	x23,[x22],#8 | 
|  | sub	x21,x21,#8		// j-- | 
|  | ldr	x14,[x3],#8 | 
|  | str	x8,[x1],#8		// rp[j]=tp[j]-np[j] | 
|  | cbnz	x21,Lsub | 
|  |  | 
|  | sbcs	x8,x23,x14 | 
|  | sbcs	x19,x19,xzr		// did it borrow? | 
|  | str	x8,[x1],#8		// rp[num-1] | 
|  |  | 
// Second walk over rp[] and tp[]: rp[] currently holds tp-np. If the
// subtraction borrowed (condition lo) the tp[] word is written back to rp[],
// otherwise the rp[] word is kept. The choice is made with csel, not a
// branch, and each tp[] word is overwritten with zero after it is read.
|  | ldr	x23,[sp]		// tp[0] | 
|  | add	x22,sp,#8 | 
|  | ldr	x8,[x0],#8		// rp[0] | 
|  | sub	x5,x5,#8		// num-- | 
|  | nop | 
|  | Lcond_copy: | 
|  | sub	x5,x5,#8		// num-- | 
|  | csel	x14,x23,x8,lo		// did it borrow? | 
|  | ldr	x23,[x22],#8 | 
|  | ldr	x8,[x0],#8 | 
|  | str	xzr,[x22,#-16]		// wipe tp | 
|  | str	x14,[x0,#-16] | 
|  | cbnz	x5,Lcond_copy | 
|  |  | 
|  | csel	x14,x23,x8,lo | 
|  | str	xzr,[x22,#-8]		// wipe tp | 
|  | str	x14,[x0,#-8] | 
|  |  | 
// Epilogue: sp <- x29 drops tp[]; x19-x24 and x29 are reloaded. x30 is not
// reloaded here; no instruction on the Lmul_mont path writes it.
|  | ldp	x19,x20,[x29,#16] | 
|  | mov	sp,x29 | 
|  | ldp	x21,x22,[x29,#32] | 
|  | // No return value | 
|  | ldp	x23,x24,[x29,#48] | 
|  | ldr	x29,[sp],#64 | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  |  | 
|  | .align	5 | 
// __bn_sqr8x_mont -- squaring path, entered by branch from _bn_mul_mont_words
// when num % 8 == 0, with the same register arguments (x0 rp, x1 ap, x2 bp,
// x3 np, x4 -> n0, x5 num). If ap != bp the work is handed to __bn_mul4x_mont.
//
// Frame (128 bytes, addressed via x29):
//   +0 x29/x30, +16..+95 x19-x28, +96 rp, +104 np, +112 n0
// Scratch t[]: num*16 bytes (2*num words) directly below the frame.
//
// Phases:
//   1. Lsqr8x_outer_loop / Lsqr8x_mul: sum of the cross products a[i]*a[j], i < j.
//   2. Lsqr4x_shift_n_add: t = 2*t + a[i]*a[i].
//   3. Lsqr8x_reduction / Lsqr8x_tail: reduce 8 words (512 bits) per pass.
//   4. Lsqr8x_sub / Lsqr4x_cond_copy: subtract the modulus into rp[], put the
//      unsubtracted words back if that borrowed, and store zeros over t[].
//      Lsqr8x8_post_condition does the same from registers when num == 8.
// x30 doubles as the "top-most carry" during phase 3 and is reloaded from
// [x29,#8] before returning.
|  | __bn_sqr8x_mont: | 
|  | // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to | 
|  | // only from bn_mul_mont_words which has already signed the return address. | 
|  | cmp	x1,x2 | 
|  | b.ne	__bn_mul4x_mont | 
|  | Lsqr8x_mont: | 
|  | stp	x29,x30,[sp,#-128]! | 
|  | add	x29,sp,#0 | 
|  | stp	x19,x20,[sp,#16] | 
|  | stp	x21,x22,[sp,#32] | 
|  | stp	x23,x24,[sp,#48] | 
|  | stp	x25,x26,[sp,#64] | 
|  | stp	x27,x28,[sp,#80] | 
|  | stp	x0,x3,[sp,#96]	// offload rp and np | 
|  |  | 
// Preload a[0..7] into x6-x13.
|  | ldp	x6,x7,[x1,#8*0] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  |  | 
// x2 = sp - num*16 becomes the new sp; x5 becomes num*8, the byte length of
// a[] and n[].
|  | sub	x2,sp,x5,lsl#4 | 
|  | lsl	x5,x5,#3 | 
|  | ldr	x4,[x4]		// *n0 | 
|  | mov	sp,x2			// alloca | 
|  | sub	x27,x5,#8*8 | 
|  | b	Lsqr8x_zero_start | 
|  |  | 
// Zero t[] in 128-byte steps. The first step enters at Lsqr8x_zero_start, so
// the lowest 8 words are not zeroed here; the first pass of
// Lsqr8x_outer_loop stores t[0..7] itself.
|  | Lsqr8x_zero: | 
|  | sub	x27,x27,#8*8 | 
|  | stp	xzr,xzr,[x2,#8*0] | 
|  | stp	xzr,xzr,[x2,#8*2] | 
|  | stp	xzr,xzr,[x2,#8*4] | 
|  | stp	xzr,xzr,[x2,#8*6] | 
|  | Lsqr8x_zero_start: | 
|  | stp	xzr,xzr,[x2,#8*8] | 
|  | stp	xzr,xzr,[x2,#8*10] | 
|  | stp	xzr,xzr,[x2,#8*12] | 
|  | stp	xzr,xzr,[x2,#8*14] | 
|  | add	x2,x2,#8*16 | 
|  | cbnz	x27,Lsqr8x_zero | 
|  |  | 
// x3 = &a[num], x1 = &a[8], x19-x26 = 8-word accumulator window (zero),
// x2 = t[] cursor.
|  | add	x3,x1,x5 | 
|  | add	x1,x1,#8*8 | 
|  | mov	x19,xzr | 
|  | mov	x20,xzr | 
|  | mov	x21,xzr | 
|  | mov	x22,xzr | 
|  | mov	x23,xzr | 
|  | mov	x24,xzr | 
|  | mov	x25,xzr | 
|  | mov	x26,xzr | 
|  | mov	x2,sp | 
|  | str	x4,[x29,#112]		// offload n0 | 
|  |  | 
|  | // Multiply everything but a[i]*a[i] | 
|  | .align	4 | 
|  | Lsqr8x_outer_loop: | 
|  | //                                                 a[1]a[0]	(i) | 
|  | //                                             a[2]a[0] | 
|  | //                                         a[3]a[0] | 
|  | //                                     a[4]a[0] | 
|  | //                                 a[5]a[0] | 
|  | //                             a[6]a[0] | 
|  | //                         a[7]a[0] | 
|  | //                                         a[2]a[1]		(ii) | 
|  | //                                     a[3]a[1] | 
|  | //                                 a[4]a[1] | 
|  | //                             a[5]a[1] | 
|  | //                         a[6]a[1] | 
|  | //                     a[7]a[1] | 
|  | //                                 a[3]a[2]			(iii) | 
|  | //                             a[4]a[2] | 
|  | //                         a[5]a[2] | 
|  | //                     a[6]a[2] | 
|  | //                 a[7]a[2] | 
|  | //                         a[4]a[3]				(iv) | 
|  | //                     a[5]a[3] | 
|  | //                 a[6]a[3] | 
|  | //             a[7]a[3] | 
|  | //                 a[5]a[4]					(v) | 
|  | //             a[6]a[4] | 
|  | //         a[7]a[4] | 
|  | //         a[6]a[5]						(vi) | 
|  | //     a[7]a[5] | 
|  | // a[7]a[6]							(vii) | 
|  |  | 
|  | mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i) | 
|  | mul	x15,x8,x6 | 
|  | mul	x16,x9,x6 | 
|  | mul	x17,x10,x6 | 
|  | adds	x20,x20,x14		// t[1]+lo(a[1]*a[0]) | 
|  | mul	x14,x11,x6 | 
|  | adcs	x21,x21,x15 | 
|  | mul	x15,x12,x6 | 
|  | adcs	x22,x22,x16 | 
|  | mul	x16,x13,x6 | 
|  | adcs	x23,x23,x17 | 
|  | umulh	x17,x7,x6		// hi(a[1..7]*a[0]) | 
|  | adcs	x24,x24,x14 | 
|  | umulh	x14,x8,x6 | 
|  | adcs	x25,x25,x15 | 
|  | umulh	x15,x9,x6 | 
|  | adcs	x26,x26,x16 | 
|  | umulh	x16,x10,x6 | 
|  | stp	x19,x20,[x2],#8*2	// t[0..1] | 
|  | adc	x19,xzr,xzr		// t[8] | 
|  | adds	x21,x21,x17		// t[2]+hi(a[1]*a[0]) | 
|  | umulh	x17,x11,x6 | 
|  | adcs	x22,x22,x14 | 
|  | umulh	x14,x12,x6 | 
|  | adcs	x23,x23,x15 | 
|  | umulh	x15,x13,x6 | 
|  | adcs	x24,x24,x16 | 
|  | mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii) | 
|  | adcs	x25,x25,x17 | 
|  | mul	x17,x9,x7 | 
|  | adcs	x26,x26,x14 | 
|  | mul	x14,x10,x7 | 
|  | adc	x19,x19,x15 | 
|  |  | 
|  | mul	x15,x11,x7 | 
|  | adds	x22,x22,x16 | 
|  | mul	x16,x12,x7 | 
|  | adcs	x23,x23,x17 | 
|  | mul	x17,x13,x7 | 
|  | adcs	x24,x24,x14 | 
|  | umulh	x14,x8,x7		// hi(a[2..7]*a[1]) | 
|  | adcs	x25,x25,x15 | 
|  | umulh	x15,x9,x7 | 
|  | adcs	x26,x26,x16 | 
|  | umulh	x16,x10,x7 | 
|  | adcs	x19,x19,x17 | 
|  | umulh	x17,x11,x7 | 
|  | stp	x21,x22,[x2],#8*2	// t[2..3] | 
|  | adc	x20,xzr,xzr		// t[9] | 
|  | adds	x23,x23,x14 | 
|  | umulh	x14,x12,x7 | 
|  | adcs	x24,x24,x15 | 
|  | umulh	x15,x13,x7 | 
|  | adcs	x25,x25,x16 | 
|  | mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii) | 
|  | adcs	x26,x26,x17 | 
|  | mul	x17,x10,x8 | 
|  | adcs	x19,x19,x14 | 
|  | mul	x14,x11,x8 | 
|  | adc	x20,x20,x15 | 
|  |  | 
|  | mul	x15,x12,x8 | 
|  | adds	x24,x24,x16 | 
|  | mul	x16,x13,x8 | 
|  | adcs	x25,x25,x17 | 
|  | umulh	x17,x9,x8		// hi(a[3..7]*a[2]) | 
|  | adcs	x26,x26,x14 | 
|  | umulh	x14,x10,x8 | 
|  | adcs	x19,x19,x15 | 
|  | umulh	x15,x11,x8 | 
|  | adcs	x20,x20,x16 | 
|  | umulh	x16,x12,x8 | 
|  | stp	x23,x24,[x2],#8*2	// t[4..5] | 
|  | adc	x21,xzr,xzr		// t[10] | 
|  | adds	x25,x25,x17 | 
|  | umulh	x17,x13,x8 | 
|  | adcs	x26,x26,x14 | 
|  | mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv) | 
|  | adcs	x19,x19,x15 | 
|  | mul	x15,x11,x9 | 
|  | adcs	x20,x20,x16 | 
|  | mul	x16,x12,x9 | 
|  | adc	x21,x21,x17 | 
|  |  | 
|  | mul	x17,x13,x9 | 
|  | adds	x26,x26,x14 | 
|  | umulh	x14,x10,x9		// hi(a[4..7]*a[3]) | 
|  | adcs	x19,x19,x15 | 
|  | umulh	x15,x11,x9 | 
|  | adcs	x20,x20,x16 | 
|  | umulh	x16,x12,x9 | 
|  | adcs	x21,x21,x17 | 
|  | umulh	x17,x13,x9 | 
|  | stp	x25,x26,[x2],#8*2	// t[6..7] | 
|  | adc	x22,xzr,xzr		// t[11] | 
|  | adds	x19,x19,x14 | 
|  | mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v) | 
|  | adcs	x20,x20,x15 | 
|  | mul	x15,x12,x10 | 
|  | adcs	x21,x21,x16 | 
|  | mul	x16,x13,x10 | 
|  | adc	x22,x22,x17 | 
|  |  | 
|  | umulh	x17,x11,x10		// hi(a[5..7]*a[4]) | 
|  | adds	x20,x20,x14 | 
|  | umulh	x14,x12,x10 | 
|  | adcs	x21,x21,x15 | 
|  | umulh	x15,x13,x10 | 
|  | adcs	x22,x22,x16 | 
|  | mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi) | 
|  | adc	x23,xzr,xzr		// t[12] | 
|  | adds	x21,x21,x17 | 
|  | mul	x17,x13,x11 | 
|  | adcs	x22,x22,x14 | 
|  | umulh	x14,x12,x11		// hi(a[6..7]*a[5]) | 
|  | adc	x23,x23,x15 | 
|  |  | 
|  | umulh	x15,x13,x11 | 
|  | adds	x22,x22,x16 | 
|  | mul	x16,x13,x12		// lo(a[7]*a[6])		(vii) | 
|  | adcs	x23,x23,x17 | 
|  | umulh	x17,x13,x12		// hi(a[7]*a[6]) | 
|  | adc	x24,xzr,xzr		// t[13] | 
|  | adds	x23,x23,x14 | 
|  | sub	x27,x3,x1	// done yet? | 
|  | adc	x24,x24,x15 | 
|  |  | 
|  | adds	x24,x24,x16 | 
|  | sub	x14,x3,x5	// rewound ap (x3 - num*8 = &a[0]) | 
|  | adc	x25,xzr,xzr		// t[14] | 
|  | add	x25,x25,x17 | 
|  |  | 
|  | cbz	x27,Lsqr8x_outer_break | 
|  |  | 
// More of a[] remains. x4 = first word of the group just processed (the
// multiplier), x0 = start of the next group, so the multiplier group sits at
// [x0,#-64] .. [x0,#-8]. The next 8 words of t[] are added into the window
// and the next 8 words of a[] are loaded into x6-x13.
|  | mov	x4,x6 | 
|  | ldp	x6,x7,[x2,#8*0] | 
|  | ldp	x8,x9,[x2,#8*2] | 
|  | ldp	x10,x11,[x2,#8*4] | 
|  | ldp	x12,x13,[x2,#8*6] | 
|  | adds	x19,x19,x6 | 
|  | adcs	x20,x20,x7 | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | adcs	x21,x21,x8 | 
|  | adcs	x22,x22,x9 | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | adcs	x23,x23,x10 | 
|  | adcs	x24,x24,x11 | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | adcs	x25,x25,x12 | 
|  | mov	x0,x1 | 
|  | adcs	x26,xzr,x13 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  | //adc	x28,xzr,xzr		// moved below | 
|  | mov	x27,#-8*8 | 
|  |  | 
|  | //                                                         a[8]a[0] | 
|  | //                                                     a[9]a[0] | 
|  | //                                                 a[a]a[0] | 
|  | //                                             a[b]a[0] | 
|  | //                                         a[c]a[0] | 
|  | //                                     a[d]a[0] | 
|  | //                                 a[e]a[0] | 
|  | //                             a[f]a[0] | 
|  | //                                                     a[8]a[1] | 
|  | //                         a[f]a[1]........................ | 
|  | //                                                 a[8]a[2] | 
|  | //                     a[f]a[2]........................ | 
|  | //                                             a[8]a[3] | 
|  | //                 a[f]a[3]........................ | 
|  | //                                         a[8]a[4] | 
|  | //             a[f]a[4]........................ | 
|  | //                                     a[8]a[5] | 
|  | //         a[f]a[5]........................ | 
|  | //                                 a[8]a[6] | 
|  | //     a[f]a[6]........................ | 
|  | //                             a[8]a[7] | 
|  | // a[f]a[7]........................ | 
// One pass per multiplier word x4: window += (x6..x13)*x4, the lowest window
// word is stored to t[] and the window moves down one word. x27 runs from
// -64 to 0 in bytes and indexes the next multiplier relative to x0.
|  | Lsqr8x_mul: | 
|  | mul	x14,x6,x4 | 
|  | adc	x28,xzr,xzr		// carry bit, modulo-scheduled | 
|  | mul	x15,x7,x4 | 
|  | add	x27,x27,#8 | 
|  | mul	x16,x8,x4 | 
|  | mul	x17,x9,x4 | 
|  | adds	x19,x19,x14 | 
|  | mul	x14,x10,x4 | 
|  | adcs	x20,x20,x15 | 
|  | mul	x15,x11,x4 | 
|  | adcs	x21,x21,x16 | 
|  | mul	x16,x12,x4 | 
|  | adcs	x22,x22,x17 | 
|  | mul	x17,x13,x4 | 
|  | adcs	x23,x23,x14 | 
|  | umulh	x14,x6,x4 | 
|  | adcs	x24,x24,x15 | 
|  | umulh	x15,x7,x4 | 
|  | adcs	x25,x25,x16 | 
|  | umulh	x16,x8,x4 | 
|  | adcs	x26,x26,x17 | 
|  | umulh	x17,x9,x4 | 
|  | adc	x28,x28,xzr | 
|  | str	x19,[x2],#8 | 
|  | adds	x19,x20,x14 | 
|  | umulh	x14,x10,x4 | 
|  | adcs	x20,x21,x15 | 
|  | umulh	x15,x11,x4 | 
|  | adcs	x21,x22,x16 | 
|  | umulh	x16,x12,x4 | 
|  | adcs	x22,x23,x17 | 
|  | umulh	x17,x13,x4 | 
|  | ldr	x4,[x0,x27] | 
|  | adcs	x23,x24,x14 | 
|  | adcs	x24,x25,x15 | 
|  | adcs	x25,x26,x16 | 
|  | adcs	x26,x28,x17 | 
|  | //adc	x28,xzr,xzr		// moved above | 
|  | cbnz	x27,Lsqr8x_mul | 
|  | // note that carry flag is guaranteed | 
|  | // to be zero at this point | 
|  | cmp	x1,x3		// done yet? | 
|  | b.eq	Lsqr8x_break | 
|  |  | 
// Same multiplier group, next 8 words of a[]: add the next 8 words of t[]
// into the window and restart from the first multiplier at [x0,#-64].
|  | ldp	x6,x7,[x2,#8*0] | 
|  | ldp	x8,x9,[x2,#8*2] | 
|  | ldp	x10,x11,[x2,#8*4] | 
|  | ldp	x12,x13,[x2,#8*6] | 
|  | adds	x19,x19,x6 | 
|  | ldr	x4,[x0,#-8*8] | 
|  | adcs	x20,x20,x7 | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | adcs	x21,x21,x8 | 
|  | adcs	x22,x22,x9 | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | adcs	x23,x23,x10 | 
|  | adcs	x24,x24,x11 | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | adcs	x25,x25,x12 | 
|  | mov	x27,#-8*8 | 
|  | adcs	x26,x26,x13 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  | //adc	x28,xzr,xzr		// moved above | 
|  | b	Lsqr8x_mul | 
|  |  | 
// a[] is exhausted for this multiplier group. Load the next diagonal group
// from x0 into x6-x13; x14 = bytes of a[] that follow it. Unless x14 is zero,
// spill the window to t[], move the t[] cursor back by x14 bytes and reload
// the window from there before re-entering the outer loop.
|  | .align	4 | 
|  | Lsqr8x_break: | 
|  | ldp	x6,x7,[x0,#8*0] | 
|  | add	x1,x0,#8*8 | 
|  | ldp	x8,x9,[x0,#8*2] | 
|  | sub	x14,x3,x1		// is it last iteration? | 
|  | ldp	x10,x11,[x0,#8*4] | 
|  | sub	x15,x2,x14 | 
|  | ldp	x12,x13,[x0,#8*6] | 
|  | cbz	x14,Lsqr8x_outer_loop | 
|  |  | 
|  | stp	x19,x20,[x2,#8*0] | 
|  | ldp	x19,x20,[x15,#8*0] | 
|  | stp	x21,x22,[x2,#8*2] | 
|  | ldp	x21,x22,[x15,#8*2] | 
|  | stp	x23,x24,[x2,#8*4] | 
|  | ldp	x23,x24,[x15,#8*4] | 
|  | stp	x25,x26,[x2,#8*6] | 
|  | mov	x2,x15 | 
|  | ldp	x25,x26,[x15,#8*6] | 
|  | b	Lsqr8x_outer_loop | 
|  |  | 
|  | .align	4 | 
|  | Lsqr8x_outer_break: | 
|  | // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] | 
|  | ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0] | 
|  | ldp	x15,x16,[sp,#8*1] | 
|  | ldp	x11,x13,[x14,#8*2] | 
|  | add	x1,x14,#8*4 | 
|  | ldp	x17,x14,[sp,#8*3] | 
|  |  | 
|  | stp	x19,x20,[x2,#8*0] | 
|  | mul	x19,x7,x7 | 
|  | stp	x21,x22,[x2,#8*2] | 
|  | umulh	x7,x7,x7 | 
|  | stp	x23,x24,[x2,#8*4] | 
|  | mul	x8,x9,x9 | 
|  | stp	x25,x26,[x2,#8*6] | 
|  | mov	x2,sp | 
|  | umulh	x9,x9,x9 | 
|  | adds	x20,x7,x15,lsl#1 | 
|  | extr	x15,x16,x15,#63 | 
|  | sub	x27,x5,#8*4 | 
|  |  | 
// Each "extr d,hi,lo,#63" yields (hi << 1) | (lo >> 63), i.e. one word of
// t << 1 carried across the word boundary. The squares a[i]*a[i] (mul/umulh
// of a register with itself) are added with one running carry chain.
// x27 counts the remaining bytes of a[]; four words are consumed per pass.
|  | Lsqr4x_shift_n_add: | 
|  | adcs	x21,x8,x15 | 
|  | extr	x16,x17,x16,#63 | 
|  | sub	x27,x27,#8*4 | 
|  | adcs	x22,x9,x16 | 
|  | ldp	x15,x16,[x2,#8*5] | 
|  | mul	x10,x11,x11 | 
|  | ldp	x7,x9,[x1],#8*2 | 
|  | umulh	x11,x11,x11 | 
|  | mul	x12,x13,x13 | 
|  | umulh	x13,x13,x13 | 
|  | extr	x17,x14,x17,#63 | 
|  | stp	x19,x20,[x2,#8*0] | 
|  | adcs	x23,x10,x17 | 
|  | extr	x14,x15,x14,#63 | 
|  | stp	x21,x22,[x2,#8*2] | 
|  | adcs	x24,x11,x14 | 
|  | ldp	x17,x14,[x2,#8*7] | 
|  | extr	x15,x16,x15,#63 | 
|  | adcs	x25,x12,x15 | 
|  | extr	x16,x17,x16,#63 | 
|  | adcs	x26,x13,x16 | 
|  | ldp	x15,x16,[x2,#8*9] | 
|  | mul	x6,x7,x7 | 
|  | ldp	x11,x13,[x1],#8*2 | 
|  | umulh	x7,x7,x7 | 
|  | mul	x8,x9,x9 | 
|  | umulh	x9,x9,x9 | 
|  | stp	x23,x24,[x2,#8*4] | 
|  | extr	x17,x14,x17,#63 | 
|  | stp	x25,x26,[x2,#8*6] | 
|  | add	x2,x2,#8*8 | 
|  | adcs	x19,x6,x17 | 
|  | extr	x14,x15,x14,#63 | 
|  | adcs	x20,x7,x14 | 
|  | ldp	x17,x14,[x2,#8*3] | 
|  | extr	x15,x16,x15,#63 | 
|  | cbnz	x27,Lsqr4x_shift_n_add | 
|  | ldp	x1,x4,[x29,#104]	// pull np and n0 | 
|  |  | 
// Last 8 words of the doubling pass, interleaved with loading t[0..1] and
// n[0..5] for the reduction.
|  | adcs	x21,x8,x15 | 
|  | extr	x16,x17,x16,#63 | 
|  | adcs	x22,x9,x16 | 
|  | ldp	x15,x16,[x2,#8*5] | 
|  | mul	x10,x11,x11 | 
|  | umulh	x11,x11,x11 | 
|  | stp	x19,x20,[x2,#8*0] | 
|  | mul	x12,x13,x13 | 
|  | umulh	x13,x13,x13 | 
|  | stp	x21,x22,[x2,#8*2] | 
|  | extr	x17,x14,x17,#63 | 
|  | adcs	x23,x10,x17 | 
|  | extr	x14,x15,x14,#63 | 
|  | ldp	x19,x20,[sp,#8*0] | 
|  | adcs	x24,x11,x14 | 
|  | extr	x15,x16,x15,#63 | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | adcs	x25,x12,x15 | 
|  | extr	x16,xzr,x16,#63 | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | adc	x26,x13,x16 | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  |  | 
|  | // Reduce by 512 bits per iteration | 
|  | mul	x28,x4,x19		// t[0]*n0 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x3,x1,x5 | 
|  | ldp	x21,x22,[sp,#8*2] | 
|  | stp	x23,x24,[x2,#8*4] | 
|  | ldp	x23,x24,[sp,#8*4] | 
|  | stp	x25,x26,[x2,#8*6] | 
|  | ldp	x25,x26,[sp,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  | mov	x30,xzr		// initial top-most carry | 
|  | mov	x2,sp | 
|  | mov	x27,#8 | 
|  |  | 
// x28 = multiplier m = low word of t[0]*n0, x6-x13 = n[0..7], x4 = n0.
// Each of the 8 passes (x27 counts 8..1) adds n[0..7]*m to the window, moves
// the window down one word and saves m at [x2] for Lsqr8x_tail. The low word
// of t[0] + n[0]*m is not computed; only its carry is needed and that is
// derived with "subs xzr,x19,#1" (carry set exactly when x19 != 0).
|  | Lsqr8x_reduction: | 
|  | // (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0) | 
|  | mul	x15,x7,x28 | 
|  | sub	x27,x27,#1 | 
|  | mul	x16,x8,x28 | 
|  | str	x28,[x2],#8		// put aside t[0]*n0 for tail processing | 
|  | mul	x17,x9,x28 | 
|  | // (*)	adds	xzr,x19,x14 | 
|  | subs	xzr,x19,#1		// (*) | 
|  | mul	x14,x10,x28 | 
|  | adcs	x19,x20,x15 | 
|  | mul	x15,x11,x28 | 
|  | adcs	x20,x21,x16 | 
|  | mul	x16,x12,x28 | 
|  | adcs	x21,x22,x17 | 
|  | mul	x17,x13,x28 | 
|  | adcs	x22,x23,x14 | 
|  | umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0) | 
|  | adcs	x23,x24,x15 | 
|  | umulh	x15,x7,x28 | 
|  | adcs	x24,x25,x16 | 
|  | umulh	x16,x8,x28 | 
|  | adcs	x25,x26,x17 | 
|  | umulh	x17,x9,x28 | 
|  | adc	x26,xzr,xzr | 
|  | adds	x19,x19,x14 | 
|  | umulh	x14,x10,x28 | 
|  | adcs	x20,x20,x15 | 
|  | umulh	x15,x11,x28 | 
|  | adcs	x21,x21,x16 | 
|  | umulh	x16,x12,x28 | 
|  | adcs	x22,x22,x17 | 
|  | umulh	x17,x13,x28 | 
|  | mul	x28,x4,x19		// next t[0]*n0 | 
|  | adcs	x23,x23,x14 | 
|  | adcs	x24,x24,x15 | 
|  | adcs	x25,x25,x16 | 
|  | adc	x26,x26,x17 | 
|  | cbnz	x27,Lsqr8x_reduction | 
|  |  | 
// Add the next 8 words of t[] into the window. x0 = x2 marks the end of the
// 8 saved multipliers. x27 == 0 here means the modulus has only 8 words
// (num == 8), which is finished in Lsqr8x8_post_condition.
|  | ldp	x14,x15,[x2,#8*0] | 
|  | ldp	x16,x17,[x2,#8*2] | 
|  | mov	x0,x2 | 
|  | sub	x27,x3,x1	// done yet? | 
|  | adds	x19,x19,x14 | 
|  | adcs	x20,x20,x15 | 
|  | ldp	x14,x15,[x2,#8*4] | 
|  | adcs	x21,x21,x16 | 
|  | adcs	x22,x22,x17 | 
|  | ldp	x16,x17,[x2,#8*6] | 
|  | adcs	x23,x23,x14 | 
|  | adcs	x24,x24,x15 | 
|  | adcs	x25,x25,x16 | 
|  | adcs	x26,x26,x17 | 
|  | //adc	x28,xzr,xzr		// moved below | 
|  | cbz	x27,Lsqr8x8_post_condition | 
|  |  | 
|  | ldr	x4,[x2,#-8*8] | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | mov	x27,#-8*8 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  |  | 
// Apply the 8 saved multipliers (x4, read from [x0,x27]) to the next 8
// modulus words held in x6-x13; same structure as Lsqr8x_mul.
|  | Lsqr8x_tail: | 
|  | mul	x14,x6,x4 | 
|  | adc	x28,xzr,xzr		// carry bit, modulo-scheduled | 
|  | mul	x15,x7,x4 | 
|  | add	x27,x27,#8 | 
|  | mul	x16,x8,x4 | 
|  | mul	x17,x9,x4 | 
|  | adds	x19,x19,x14 | 
|  | mul	x14,x10,x4 | 
|  | adcs	x20,x20,x15 | 
|  | mul	x15,x11,x4 | 
|  | adcs	x21,x21,x16 | 
|  | mul	x16,x12,x4 | 
|  | adcs	x22,x22,x17 | 
|  | mul	x17,x13,x4 | 
|  | adcs	x23,x23,x14 | 
|  | umulh	x14,x6,x4 | 
|  | adcs	x24,x24,x15 | 
|  | umulh	x15,x7,x4 | 
|  | adcs	x25,x25,x16 | 
|  | umulh	x16,x8,x4 | 
|  | adcs	x26,x26,x17 | 
|  | umulh	x17,x9,x4 | 
|  | adc	x28,x28,xzr | 
|  | str	x19,[x2],#8 | 
|  | adds	x19,x20,x14 | 
|  | umulh	x14,x10,x4 | 
|  | adcs	x20,x21,x15 | 
|  | umulh	x15,x11,x4 | 
|  | adcs	x21,x22,x16 | 
|  | umulh	x16,x12,x4 | 
|  | adcs	x22,x23,x17 | 
|  | umulh	x17,x13,x4 | 
|  | ldr	x4,[x0,x27] | 
|  | adcs	x23,x24,x14 | 
|  | adcs	x24,x25,x15 | 
|  | adcs	x25,x26,x16 | 
|  | adcs	x26,x28,x17 | 
|  | //adc	x28,xzr,xzr		// moved above | 
|  | cbnz	x27,Lsqr8x_tail | 
|  | // note that carry flag is guaranteed | 
|  | // to be zero at this point | 
|  | ldp	x6,x7,[x2,#8*0] | 
|  | sub	x27,x3,x1	// done yet? | 
|  | sub	x16,x3,x5	// rewound np (x3 - num*8 = &n[0]) | 
|  | ldp	x8,x9,[x2,#8*2] | 
|  | ldp	x10,x11,[x2,#8*4] | 
|  | ldp	x12,x13,[x2,#8*6] | 
|  | cbz	x27,Lsqr8x_tail_break | 
|  |  | 
|  | ldr	x4,[x0,#-8*8] | 
|  | adds	x19,x19,x6 | 
|  | adcs	x20,x20,x7 | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | adcs	x21,x21,x8 | 
|  | adcs	x22,x22,x9 | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | adcs	x23,x23,x10 | 
|  | adcs	x24,x24,x11 | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | adcs	x25,x25,x12 | 
|  | mov	x27,#-8*8 | 
|  | adcs	x26,x26,x13 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  | //adc	x28,xzr,xzr		// moved above | 
|  | b	Lsqr8x_tail | 
|  |  | 
// End of the modulus for this window: add the top-most carry kept in x30,
// store the 8 finished words, reload n[0..7] and slide the window up by
// 8 words (x2 <- x0). The reduction loop ends when the window end computed
// into x27 equals x29.
|  | .align	4 | 
|  | Lsqr8x_tail_break: | 
|  | ldr	x4,[x29,#112]		// pull n0 | 
|  | add	x27,x2,#8*8		// end of current t[num] window | 
|  |  | 
|  | subs	xzr,x30,#1		// "move" top-most carry to carry bit | 
|  | adcs	x14,x19,x6 | 
|  | adcs	x15,x20,x7 | 
|  | ldp	x19,x20,[x0,#8*0] | 
|  | adcs	x21,x21,x8 | 
|  | ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0] | 
|  | adcs	x22,x22,x9 | 
|  | ldp	x8,x9,[x16,#8*2] | 
|  | adcs	x23,x23,x10 | 
|  | adcs	x24,x24,x11 | 
|  | ldp	x10,x11,[x16,#8*4] | 
|  | adcs	x25,x25,x12 | 
|  | adcs	x26,x26,x13 | 
|  | ldp	x12,x13,[x16,#8*6] | 
|  | add	x1,x16,#8*8 | 
|  | adc	x30,xzr,xzr	// top-most carry | 
|  | mul	x28,x4,x19 | 
|  | stp	x14,x15,[x2,#8*0] | 
|  | stp	x21,x22,[x2,#8*2] | 
|  | ldp	x21,x22,[x0,#8*2] | 
|  | stp	x23,x24,[x2,#8*4] | 
|  | ldp	x23,x24,[x0,#8*4] | 
|  | cmp	x27,x29		// did we hit the bottom? | 
|  | stp	x25,x26,[x2,#8*6] | 
|  | mov	x2,x0			// slide the window | 
|  | ldp	x25,x26,[x0,#8*6] | 
|  | mov	x27,#8 | 
|  | b.ne	Lsqr8x_reduction | 
|  |  | 
|  | // Final step. We see if result is larger than modulus, and | 
|  | // if it is, subtract the modulus. But comparison implies | 
|  | // subtraction. So we subtract modulus, see if it borrowed, | 
|  | // and conditionally copy original value. | 
|  | ldr	x0,[x29,#96]		// pull rp | 
|  | add	x2,x2,#8*8 | 
|  | subs	x14,x19,x6 | 
|  | sbcs	x15,x20,x7 | 
|  | sub	x27,x5,#8*8 | 
|  | mov	x3,x0		// x0 copy | 
|  |  | 
// rp[] = t - n, 8 words per pass, with one sbcs borrow chain running through
// the whole loop (none of the other instructions in it writes the flags).
|  | Lsqr8x_sub: | 
|  | sbcs	x16,x21,x8 | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | sbcs	x17,x22,x9 | 
|  | stp	x14,x15,[x0,#8*0] | 
|  | sbcs	x14,x23,x10 | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | sbcs	x15,x24,x11 | 
|  | stp	x16,x17,[x0,#8*2] | 
|  | sbcs	x16,x25,x12 | 
|  | ldp	x10,x11,[x1,#8*4] | 
|  | sbcs	x17,x26,x13 | 
|  | ldp	x12,x13,[x1,#8*6] | 
|  | add	x1,x1,#8*8 | 
|  | ldp	x19,x20,[x2,#8*0] | 
|  | sub	x27,x27,#8*8 | 
|  | ldp	x21,x22,[x2,#8*2] | 
|  | ldp	x23,x24,[x2,#8*4] | 
|  | ldp	x25,x26,[x2,#8*6] | 
|  | add	x2,x2,#8*8 | 
|  | stp	x14,x15,[x0,#8*4] | 
|  | sbcs	x14,x19,x6 | 
|  | stp	x16,x17,[x0,#8*6] | 
|  | add	x0,x0,#8*8 | 
|  | sbcs	x15,x20,x7 | 
|  | cbnz	x27,Lsqr8x_sub | 
|  |  | 
// Last 8 words of the subtraction, interleaved with set-up for the copy
// pass: x2 = sp (low half of t[], only overwritten with zero),
// x1 = sp + num*8 (high half of t[], source of the unsubtracted words),
// x3 = rp.
|  | sbcs	x16,x21,x8 | 
|  | mov	x2,sp | 
|  | add	x1,sp,x5 | 
|  | ldp	x6,x7,[x3,#8*0] | 
|  | sbcs	x17,x22,x9 | 
|  | stp	x14,x15,[x0,#8*0] | 
|  | sbcs	x14,x23,x10 | 
|  | ldp	x8,x9,[x3,#8*2] | 
|  | sbcs	x15,x24,x11 | 
|  | stp	x16,x17,[x0,#8*2] | 
|  | sbcs	x16,x25,x12 | 
|  | ldp	x19,x20,[x1,#8*0] | 
|  | sbcs	x17,x26,x13 | 
|  | ldp	x21,x22,[x1,#8*2] | 
|  | sbcs	xzr,x30,xzr	// did it borrow? | 
|  | ldr	x30,[x29,#8]		// pull return address | 
|  | stp	x14,x15,[x0,#8*4] | 
|  | stp	x16,x17,[x0,#8*6] | 
|  |  | 
// Four words per pass: rp word = (borrowed, condition lo) ? high-half t word
// : rp word, chosen with csel. Zero is stored over the low half via x2 and
// over high-half words via x1 after x1 has been advanced.
// NOTE(review): the x1 stores start 4 words above sp + num*8 and the x2
// stores stop at sp + num*8, so the four words loaded from [x1,#0..#31]
// before the loop do not appear to be overwritten here -- confirm whether
// that is intended.
|  | sub	x27,x5,#8*4 | 
|  | Lsqr4x_cond_copy: | 
|  | sub	x27,x27,#8*4 | 
|  | csel	x14,x19,x6,lo | 
|  | stp	xzr,xzr,[x2,#8*0] | 
|  | csel	x15,x20,x7,lo | 
|  | ldp	x6,x7,[x3,#8*4] | 
|  | ldp	x19,x20,[x1,#8*4] | 
|  | csel	x16,x21,x8,lo | 
|  | stp	xzr,xzr,[x2,#8*2] | 
|  | add	x2,x2,#8*4 | 
|  | csel	x17,x22,x9,lo | 
|  | ldp	x8,x9,[x3,#8*6] | 
|  | ldp	x21,x22,[x1,#8*6] | 
|  | add	x1,x1,#8*4 | 
|  | stp	x14,x15,[x3,#8*0] | 
|  | stp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  | stp	xzr,xzr,[x1,#8*0] | 
|  | stp	xzr,xzr,[x1,#8*2] | 
|  | cbnz	x27,Lsqr4x_cond_copy | 
|  |  | 
|  | csel	x14,x19,x6,lo | 
|  | stp	xzr,xzr,[x2,#8*0] | 
|  | csel	x15,x20,x7,lo | 
|  | stp	xzr,xzr,[x2,#8*2] | 
|  | csel	x16,x21,x8,lo | 
|  | csel	x17,x22,x9,lo | 
|  | stp	x14,x15,[x3,#8*0] | 
|  | stp	x16,x17,[x3,#8*2] | 
|  |  | 
|  | b	Lsqr8x_done | 
|  |  | 
// num == 8: the whole result is still in registers. Subtract the modulus in
// registers, store zero over all 16 words of t[], and write either the
// difference or (if the subtraction borrowed) the original value to rp.
|  | .align	4 | 
|  | Lsqr8x8_post_condition: | 
|  | adc	x28,xzr,xzr | 
|  | ldr	x30,[x29,#8]		// pull return address | 
|  | // x19-x26,x28 hold result, x6-x13 hold modulus | 
|  | subs	x6,x19,x6 | 
|  | ldr	x1,[x29,#96]		// pull rp | 
|  | sbcs	x7,x20,x7 | 
|  | stp	xzr,xzr,[sp,#8*0] | 
|  | sbcs	x8,x21,x8 | 
|  | stp	xzr,xzr,[sp,#8*2] | 
|  | sbcs	x9,x22,x9 | 
|  | stp	xzr,xzr,[sp,#8*4] | 
|  | sbcs	x10,x23,x10 | 
|  | stp	xzr,xzr,[sp,#8*6] | 
|  | sbcs	x11,x24,x11 | 
|  | stp	xzr,xzr,[sp,#8*8] | 
|  | sbcs	x12,x25,x12 | 
|  | stp	xzr,xzr,[sp,#8*10] | 
|  | sbcs	x13,x26,x13 | 
|  | stp	xzr,xzr,[sp,#8*12] | 
|  | sbcs	x28,x28,xzr	// did it borrow? | 
|  | stp	xzr,xzr,[sp,#8*14] | 
|  |  | 
|  | // x6-x13 hold result-modulus | 
|  | csel	x6,x19,x6,lo | 
|  | csel	x7,x20,x7,lo | 
|  | csel	x8,x21,x8,lo | 
|  | csel	x9,x22,x9,lo | 
|  | stp	x6,x7,[x1,#8*0] | 
|  | csel	x10,x23,x10,lo | 
|  | csel	x11,x24,x11,lo | 
|  | stp	x8,x9,[x1,#8*2] | 
|  | csel	x12,x25,x12,lo | 
|  | csel	x13,x26,x13,lo | 
|  | stp	x10,x11,[x1,#8*4] | 
|  | stp	x12,x13,[x1,#8*6] | 
|  |  | 
// Common epilogue: sp <- x29 releases t[]; x19-x28 and x29 are reloaded and
// the 128-byte frame is popped.
|  | Lsqr8x_done: | 
|  | ldp	x19,x20,[x29,#16] | 
|  | mov	sp,x29 | 
|  | ldp	x21,x22,[x29,#32] | 
|  | // No return value | 
|  | ldp	x23,x24,[x29,#48] | 
|  | ldp	x25,x26,[x29,#64] | 
|  | ldp	x27,x28,[x29,#80] | 
|  | ldr	x29,[sp],#128 | 
|  | // x30 is popped earlier | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  |  | 
|  | .align	5 | 
|  | __bn_mul4x_mont: | 
|  | // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to | 
|  | // only from bn_mul_mont_words or __bn_sqr8x_mont which have already signed the | 
|  | // return address. | 
|  | stp	x29,x30,[sp,#-128]! | 
|  | add	x29,sp,#0 | 
|  | stp	x19,x20,[sp,#16] | 
|  | stp	x21,x22,[sp,#32] | 
|  | stp	x23,x24,[sp,#48] | 
|  | stp	x25,x26,[sp,#64] | 
|  | stp	x27,x28,[sp,#80] | 
|  |  | 
|  | sub	x26,sp,x5,lsl#3 | 
|  | lsl	x5,x5,#3 | 
|  | ldr	x4,[x4]		// *n0 | 
|  | sub	sp,x26,#8*4		// alloca | 
|  |  | 
|  | add	x10,x2,x5 | 
|  | add	x27,x1,x5 | 
|  | stp	x0,x10,[x29,#96]	// offload rp and &b[num] | 
|  |  | 
|  | ldr	x24,[x2,#8*0]		// b[0] | 
|  | ldp	x6,x7,[x1,#8*0]	// a[0..3] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | add	x1,x1,#8*4 | 
|  | mov	x19,xzr | 
|  | mov	x20,xzr | 
|  | mov	x21,xzr | 
|  | mov	x22,xzr | 
|  | ldp	x14,x15,[x3,#8*0]	// n[0..3] | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | adds	x3,x3,#8*4		// clear carry bit | 
|  | mov	x0,xzr | 
|  | mov	x28,#0 | 
|  | mov	x26,sp | 
|  |  | 
|  | Loop_mul4x_1st_reduction: | 
|  | mul	x10,x6,x24		// lo(a[0..3]*b[0]) | 
|  | adc	x0,x0,xzr	// modulo-scheduled | 
|  | mul	x11,x7,x24 | 
|  | add	x28,x28,#8 | 
|  | mul	x12,x8,x24 | 
|  | and	x28,x28,#31 | 
|  | mul	x13,x9,x24 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x6,x24		// hi(a[0..3]*b[0]) | 
|  | adcs	x20,x20,x11 | 
|  | mul	x25,x19,x4		// t[0]*n0 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x11,x7,x24 | 
|  | adcs	x22,x22,x13 | 
|  | umulh	x12,x8,x24 | 
|  | adc	x23,xzr,xzr | 
|  | umulh	x13,x9,x24 | 
|  | ldr	x24,[x2,x28]		// next b[i] (or b[0]) | 
|  | adds	x20,x20,x10 | 
|  | // (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0) | 
|  | str	x25,[x26],#8		// put aside t[0]*n0 for tail processing | 
|  | adcs	x21,x21,x11 | 
|  | mul	x11,x15,x25 | 
|  | adcs	x22,x22,x12 | 
|  | mul	x12,x16,x25 | 
|  | adc	x23,x23,x13		// can't overflow | 
|  | mul	x13,x17,x25 | 
|  | // (*)	adds	xzr,x19,x10 | 
|  | subs	xzr,x19,#1		// (*) | 
|  | umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0) | 
|  | adcs	x19,x20,x11 | 
|  | umulh	x11,x15,x25 | 
|  | adcs	x20,x21,x12 | 
|  | umulh	x12,x16,x25 | 
|  | adcs	x21,x22,x13 | 
|  | umulh	x13,x17,x25 | 
|  | adcs	x22,x23,x0 | 
|  | adc	x0,xzr,xzr | 
|  | adds	x19,x19,x10 | 
|  | sub	x10,x27,x1 | 
|  | adcs	x20,x20,x11 | 
|  | adcs	x21,x21,x12 | 
|  | adcs	x22,x22,x13 | 
|  | //adc	x0,x0,xzr | 
|  | cbnz	x28,Loop_mul4x_1st_reduction | 
|  |  | 
|  | cbz	x10,Lmul4x4_post_condition | 
|  |  | 
|  | ldp	x6,x7,[x1,#8*0]	// a[4..7] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | add	x1,x1,#8*4 | 
|  | ldr	x25,[sp]		// a[0]*n0 | 
|  | ldp	x14,x15,[x3,#8*0]	// n[4..7] | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  |  | 
|  | Loop_mul4x_1st_tail: | 
|  | mul	x10,x6,x24		// lo(a[4..7]*b[i]); x6..x9 hold four a words, x24 holds b[i] | 
|  | adc	x0,x0,xzr	// modulo-scheduled: absorbs the carry left pending by the previous adcs chain | 
|  | mul	x11,x7,x24 | 
|  | add	x28,x28,#8 | 
|  | mul	x12,x8,x24 | 
|  | and	x28,x28,#31 | 
|  | mul	x13,x9,x24 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x6,x24		// hi(a[4..7]*b[i]), added one word higher than the lo() parts | 
|  | adcs	x20,x20,x11 | 
|  | umulh	x11,x7,x24 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x12,x8,x24 | 
|  | adcs	x22,x22,x13 | 
|  | umulh	x13,x9,x24 | 
|  | adc	x23,xzr,xzr | 
|  | ldr	x24,[x2,x28]		// next b[i]; x28 cycles 8,16,24,0, so the load wraps back to the word at x2 | 
|  | adds	x20,x20,x10 | 
|  | mul	x10,x14,x25		// lo(n[4..7]*m), where x25 = m is a saved t[0]*n0 value | 
|  | adcs	x21,x21,x11 | 
|  | mul	x11,x15,x25 | 
|  | adcs	x22,x22,x12 | 
|  | mul	x12,x16,x25 | 
|  | adc	x23,x23,x13		// can't overflow | 
|  | mul	x13,x17,x25 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x14,x25		// hi(n[4..7]*m), m = saved t[0]*n0 in x25 | 
|  | adcs	x20,x20,x11 | 
|  | umulh	x11,x15,x25 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x12,x16,x25 | 
|  | adcs	x22,x22,x13 | 
|  | adcs	x23,x23,x0 | 
|  | umulh	x13,x17,x25 | 
|  | adc	x0,xzr,xzr | 
|  | ldr	x25,[sp,x28]		// next saved t[0]*n0, one of the four words at sp (offset x28) | 
|  | str	x19,[x26],#8		// result: store the finished low word at x26, then x26 += 8 | 
|  | adds	x19,x20,x10 | 
|  | sub	x10,x27,x1		// done yet? x10 == 0 once x1 has reached x27 (tested after the loop) | 
|  | adcs	x20,x21,x11 | 
|  | adcs	x21,x22,x12 | 
|  | adcs	x22,x23,x13 | 
|  | // (adc x0,x0,xzr is deferred: the next adc on x0 absorbs this carry, at the loop top or in Lmul4x_proceed) | 
|  | cbnz	x28,Loop_mul4x_1st_tail | 
|  |  | 
|  | sub	x11,x27,x5	// x11 = x27 - x5: rewound a pointer, consumed by Lmul4x_proceed | 
|  | cbz	x10,Lmul4x_proceed | 
|  |  | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | add	x1,x1,#8*4 | 
|  | ldp	x14,x15,[x3,#8*0] | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  | b	Loop_mul4x_1st_tail | 
|  |  | 
|  | .align	5 | 
|  | Lmul4x_proceed: | 
|  | ldr	x24,[x2,#8*4]!		// *++b: pre-indexed, x2 += 32 and x24 = the b word at the new x2 | 
|  | adc	x30,x0,xzr | 
|  | ldp	x6,x7,[x11,#8*0]	// a[0..3], reloaded through the rewound pointer in x11 | 
|  | sub	x3,x3,x5		// rewind np (x3 -= x5) | 
|  | ldp	x8,x9,[x11,#8*2] | 
|  | add	x1,x11,#8*4 | 
|  |  | 
|  | stp	x19,x20,[x26,#8*0]	// result: flush the four accumulator words of the finished pass at x26 | 
|  | ldp	x19,x20,[sp,#8*4]	// t[0..3], reloaded from sp+32 as the new accumulators | 
|  | stp	x21,x22,[x26,#8*2]	// result (upper two of the four words) | 
|  | ldp	x21,x22,[sp,#8*6] | 
|  |  | 
|  | ldp	x14,x15,[x3,#8*0]	// n[0..3] | 
|  | mov	x26,sp | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | adds	x3,x3,#8*4		// step np past n[0..3]; the flag-setting add also clears the carry bit | 
|  | mov	x0,xzr | 
|  |  | 
|  | .align	4 | 
|  | Loop_mul4x_reduction: | 
|  | mul	x10,x6,x24		// lo(a[0..3]*b[i]); x6..x9 hold a[0..3], x24 holds the current b word | 
|  | adc	x0,x0,xzr	// modulo-scheduled: absorbs the carry left pending by the previous adcs chain | 
|  | mul	x11,x7,x24 | 
|  | add	x28,x28,#8 | 
|  | mul	x12,x8,x24 | 
|  | and	x28,x28,#31 | 
|  | mul	x13,x9,x24 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x6,x24		// hi(a[0..3]*b[i]), added one word higher than the lo() parts | 
|  | adcs	x20,x20,x11 | 
|  | mul	x25,x19,x4		// x25 = t[0]*n0 (low 64 bits), with n0 in x4 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x11,x7,x24 | 
|  | adcs	x22,x22,x13 | 
|  | umulh	x12,x8,x24 | 
|  | adc	x23,xzr,xzr | 
|  | umulh	x13,x9,x24 | 
|  | ldr	x24,[x2,x28]		// next b[i]; x28 cycles 8,16,24,0 | 
|  | adds	x20,x20,x10 | 
|  | // (*)	mul	x10,x14,x25 is omitted: the low word of n[0]*x25 is never needed, see (*) below | 
|  | str	x25,[x26],#8		// put aside t[0]*n0 at x26 (then x26 += 8) for tail processing | 
|  | adcs	x21,x21,x11 | 
|  | mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0) | 
|  | adcs	x22,x22,x12 | 
|  | mul	x12,x16,x25 | 
|  | adc	x23,x23,x13		// can't overflow | 
|  | mul	x13,x17,x25 | 
|  | // (*)	adds	xzr,x19,x10 is omitted: only its carry-out matters, and subs below recreates it | 
|  | subs	xzr,x19,#1		// (*) sets carry exactly when x19 is non-zero | 
|  | umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0) | 
|  | adcs	x19,x20,x11 | 
|  | umulh	x11,x15,x25 | 
|  | adcs	x20,x21,x12 | 
|  | umulh	x12,x16,x25 | 
|  | adcs	x21,x22,x13 | 
|  | umulh	x13,x17,x25 | 
|  | adcs	x22,x23,x0 | 
|  | adc	x0,xzr,xzr | 
|  | adds	x19,x19,x10 | 
|  | adcs	x20,x20,x11 | 
|  | adcs	x21,x21,x12 | 
|  | adcs	x22,x22,x13 | 
|  | // (adc x0,x0,xzr is deferred: the next adc on x0 absorbs this carry, at the loop top or right after the loop) | 
|  | cbnz	x28,Loop_mul4x_reduction | 
|  |  | 
|  | adc	x0,x0,xzr | 
|  | ldp	x10,x11,[x26,#8*4]	// t[4..7], read 32 bytes above the current x26 | 
|  | ldp	x12,x13,[x26,#8*6] | 
|  | ldp	x6,x7,[x1,#8*0]	// a[4..7] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | add	x1,x1,#8*4 | 
|  | adds	x19,x19,x10 | 
|  | adcs	x20,x20,x11 | 
|  | adcs	x21,x21,x12 | 
|  | adcs	x22,x22,x13 | 
|  | // (adc x0,x0,xzr is deferred: done by the adc at the top of Loop_mul4x_tail) | 
|  |  | 
|  | ldr	x25,[sp]		// first saved t[0]*n0 (stored by the loop above, starting at sp) | 
|  | ldp	x14,x15,[x3,#8*0]	// n[4..7] | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  |  | 
|  | .align	4 | 
|  | Loop_mul4x_tail: | 
|  | mul	x10,x6,x24		// lo(a[4..7]*b[i]); x6..x9 hold four a words, x24 holds the current b word | 
|  | adc	x0,x0,xzr	// modulo-scheduled: absorbs the carry left pending by the previous adcs chain | 
|  | mul	x11,x7,x24 | 
|  | add	x28,x28,#8 | 
|  | mul	x12,x8,x24 | 
|  | and	x28,x28,#31 | 
|  | mul	x13,x9,x24 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x6,x24		// hi(a[4..7]*b[i]), added one word higher than the lo() parts | 
|  | adcs	x20,x20,x11 | 
|  | umulh	x11,x7,x24 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x12,x8,x24 | 
|  | adcs	x22,x22,x13 | 
|  | umulh	x13,x9,x24 | 
|  | adc	x23,xzr,xzr | 
|  | ldr	x24,[x2,x28]		// next b[i]; x28 cycles 8,16,24,0 | 
|  | adds	x20,x20,x10 | 
|  | mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0); x25 = saved t[0]*n0 | 
|  | adcs	x21,x21,x11 | 
|  | mul	x11,x15,x25 | 
|  | adcs	x22,x22,x12 | 
|  | mul	x12,x16,x25 | 
|  | adc	x23,x23,x13		// can't overflow | 
|  | mul	x13,x17,x25 | 
|  | adds	x19,x19,x10 | 
|  | umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0) | 
|  | adcs	x20,x20,x11 | 
|  | umulh	x11,x15,x25 | 
|  | adcs	x21,x21,x12 | 
|  | umulh	x12,x16,x25 | 
|  | adcs	x22,x22,x13 | 
|  | umulh	x13,x17,x25 | 
|  | adcs	x23,x23,x0 | 
|  | ldr	x25,[sp,x28]		// next saved t[0]*n0, one of the four words at sp (offset x28) | 
|  | adc	x0,xzr,xzr | 
|  | str	x19,[x26],#8		// result: store the finished low word at x26, then x26 += 8 | 
|  | adds	x19,x20,x10 | 
|  | sub	x10,x27,x1		// done yet? x10 == 0 once x1 has reached x27 (tested after the loop) | 
|  | adcs	x20,x21,x11 | 
|  | adcs	x21,x22,x12 | 
|  | adcs	x22,x23,x13 | 
|  | // (adc x0,x0,xzr is deferred: the next adc on x0 absorbs this carry, at the loop top or right after the loop) | 
|  | cbnz	x28,Loop_mul4x_tail | 
|  |  | 
|  | sub	x11,x3,x5		// x11 = x3 - x5: rewound np, consumed by Loop_mul4x_break | 
|  | adc	x0,x0,xzr | 
|  | cbz	x10,Loop_mul4x_break | 
|  |  | 
|  | ldp	x10,x11,[x26,#8*4] | 
|  | ldp	x12,x13,[x26,#8*6] | 
|  | ldp	x6,x7,[x1,#8*0] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | add	x1,x1,#8*4 | 
|  | adds	x19,x19,x10 | 
|  | adcs	x20,x20,x11 | 
|  | adcs	x21,x21,x12 | 
|  | adcs	x22,x22,x13 | 
|  | // (adc x0,x0,xzr is deferred: done by the adc at the top of Loop_mul4x_tail) | 
|  | ldp	x14,x15,[x3,#8*0] | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  | b	Loop_mul4x_tail | 
|  |  | 
|  | .align	4 | 
|  | Loop_mul4x_break: | 
|  | ldp	x12,x13,[x29,#96]	// pull rp (x12) and &b[num] (x13) from the frame at x29+96 | 
|  | adds	x19,x19,x30 | 
|  | add	x2,x2,#8*4		// bp++: step to the next group of four b words | 
|  | adcs	x20,x20,xzr | 
|  | sub	x1,x1,x5		// rewind ap (x1 -= x5) | 
|  | adcs	x21,x21,xzr | 
|  | stp	x19,x20,[x26,#8*0]	// result: flush the four accumulator words of the finished pass at x26 | 
|  | adcs	x22,x22,xzr | 
|  | ldp	x19,x20,[sp,#8*4]	// t[0..3], reloaded from sp+32 as the new accumulators | 
|  | adc	x30,x0,xzr | 
|  | stp	x21,x22,[x26,#8*2]	// result (upper two of the four words) | 
|  | cmp	x2,x13			// done yet? equal once bp has reached &b[num]; tested by b.eq below | 
|  | ldp	x21,x22,[sp,#8*6] | 
|  | ldp	x14,x15,[x11,#8*0]	// n[0..3], reloaded through the rewound np in x11 | 
|  | ldp	x16,x17,[x11,#8*2] | 
|  | add	x3,x11,#8*4 | 
|  | b.eq	Lmul4x_post | 
|  |  | 
|  | ldr	x24,[x2] | 
|  | ldp	x6,x7,[x1,#8*0]	// a[0..3] | 
|  | ldp	x8,x9,[x1,#8*2] | 
|  | adds	x1,x1,#8*4		// step ap past a[0..3]; the flag-setting add also clears the carry bit | 
|  | mov	x0,xzr | 
|  | mov	x26,sp | 
|  | b	Loop_mul4x_reduction | 
|  |  | 
|  | .align	4 | 
|  | Lmul4x_post: | 
|  | // Final step: conditionally subtract the modulus. Comparison implies | 
|  | // subtraction, so t - n is first written to rp word by word; the final | 
|  | // borrow (taken together with the top carry in x30) then selects, per | 
|  | // word, between that difference and the original t kept on the stack. | 
|  | mov	x0,x12 | 
|  | mov	x27,x12		// second copy of rp: x0 is advanced by Lmul4x_sub, x27 by Lmul4x_cond_copy | 
|  | subs	x10,x19,x14 | 
|  | add	x26,sp,#8*8 | 
|  | sbcs	x11,x20,x15 | 
|  | sub	x28,x5,#8*4 | 
|  |  | 
|  | Lmul4x_sub: | 
|  | sbcs	x12,x21,x16 | 
|  | ldp	x14,x15,[x3,#8*0] | 
|  | sub	x28,x28,#8*4 | 
|  | ldp	x19,x20,[x26,#8*0] | 
|  | sbcs	x13,x22,x17 | 
|  | ldp	x16,x17,[x3,#8*2] | 
|  | add	x3,x3,#8*4 | 
|  | ldp	x21,x22,[x26,#8*2] | 
|  | add	x26,x26,#8*4 | 
|  | stp	x10,x11,[x0,#8*0] | 
|  | sbcs	x10,x19,x14 | 
|  | stp	x12,x13,[x0,#8*2] | 
|  | add	x0,x0,#8*4 | 
|  | sbcs	x11,x20,x15 | 
|  | cbnz	x28,Lmul4x_sub | 
|  |  | 
|  | sbcs	x12,x21,x16 | 
|  | mov	x26,sp | 
|  | add	x1,sp,#8*4 | 
|  | ldp	x6,x7,[x27,#8*0] | 
|  | sbcs	x13,x22,x17 | 
|  | stp	x10,x11,[x0,#8*0] | 
|  | ldp	x8,x9,[x27,#8*2] | 
|  | stp	x12,x13,[x0,#8*2] | 
|  | ldp	x19,x20,[x1,#8*0] | 
|  | ldp	x21,x22,[x1,#8*2] | 
|  | sbcs	xzr,x30,xzr	// did it borrow? carry clear ("lo") means t < n, so the csel below keeps t | 
|  | ldr	x30,[x29,#8]		// pull return address (x30 was in use as the top carry word until here) | 
|  |  | 
|  | sub	x28,x5,#8*4 | 
|  | Lmul4x_cond_copy: | 
|  | sub	x28,x28,#8*4 | 
|  | csel	x10,x19,x6,lo | 
|  | stp	xzr,xzr,[x26,#8*0] | 
|  | csel	x11,x20,x7,lo | 
|  | ldp	x6,x7,[x27,#8*4] | 
|  | ldp	x19,x20,[x1,#8*4] | 
|  | csel	x12,x21,x8,lo | 
|  | stp	xzr,xzr,[x26,#8*2] | 
|  | add	x26,x26,#8*4 | 
|  | csel	x13,x22,x9,lo | 
|  | ldp	x8,x9,[x27,#8*6] | 
|  | ldp	x21,x22,[x1,#8*6] | 
|  | add	x1,x1,#8*4 | 
|  | stp	x10,x11,[x27,#8*0] | 
|  | stp	x12,x13,[x27,#8*2] | 
|  | add	x27,x27,#8*4 | 
|  | cbnz	x28,Lmul4x_cond_copy | 
|  |  | 
|  | csel	x10,x19,x6,lo | 
|  | stp	xzr,xzr,[x26,#8*0] | 
|  | csel	x11,x20,x7,lo | 
|  | stp	xzr,xzr,[x26,#8*2] | 
|  | csel	x12,x21,x8,lo | 
|  | stp	xzr,xzr,[x26,#8*3] | 
|  | csel	x13,x22,x9,lo | 
|  | stp	xzr,xzr,[x26,#8*4] | 
|  | stp	x10,x11,[x27,#8*0] | 
|  | stp	x12,x13,[x27,#8*2] | 
|  |  | 
|  | b	Lmul4x_done | 
|  |  | 
|  | .align	4 | 
|  | Lmul4x4_post_condition: | 
|  | adc	x0,x0,xzr | 
|  | ldr	x1,[x29,#96]		// pull rp from the frame at x29+96 | 
|  | // x19-x22,x0 hold result, x14-x17 hold modulus | 
|  | subs	x6,x19,x14 | 
|  | ldr	x30,[x29,#8]		// pull return address | 
|  | sbcs	x7,x20,x15 | 
|  | stp	xzr,xzr,[sp,#8*0] | 
|  | sbcs	x8,x21,x16 | 
|  | stp	xzr,xzr,[sp,#8*2] | 
|  | sbcs	x9,x22,x17 | 
|  | stp	xzr,xzr,[sp,#8*4] | 
|  | sbcs	xzr,x0,xzr		// did it borrow? carry clear ("lo") means result < modulus | 
|  | stp	xzr,xzr,[sp,#8*6] | 
|  |  | 
|  | // x6-x9 hold result-modulus; on borrow ("lo") the unreduced result in x19-x22 is kept instead | 
|  | csel	x6,x19,x6,lo | 
|  | csel	x7,x20,x7,lo | 
|  | csel	x8,x21,x8,lo | 
|  | csel	x9,x22,x9,lo | 
|  | stp	x6,x7,[x1,#8*0] | 
|  | stp	x8,x9,[x1,#8*2] | 
|  |  | 
|  | Lmul4x_done: | 
|  | ldp	x19,x20,[x29,#16] | 
|  | mov	sp,x29 | 
|  | ldp	x21,x22,[x29,#32] | 
|  | // No return value: x0 is not set to a result here | 
|  | ldp	x23,x24,[x29,#48] | 
|  | ldp	x25,x26,[x29,#64] | 
|  | ldp	x27,x28,[x29,#80] | 
|  | ldr	x29,[sp],#128 | 
|  | // x30 is popped earlier: both paths into Lmul4x_done already reloaded it from [x29,#8] | 
|  | AARCH64_VALIDATE_LINK_REGISTER | 
|  | ret | 
|  |  | 
|  | .byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 | 
|  | .align	2 | 
|  | .align	4 | 
|  | #endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) |