SSE2 inner loop for bn_mul_add_words (fwd)

dean gaudet dean-gcrypt@arctic.org
Mon, 24 Mar 2003 18:01:27 -0800 (PST)


i actually hacked the code below for openssl... but i thought you folks
might be interested in it as well.  i'm sure it could be easily adapted to
gcrypt.  results in 30%+ more rsa 1024-bit signs/s.

laters
-dean

---------- Forwarded message ----------
for kicks i decided to see if it really was possible to get RSA speedups
using the SSE2 PMULUDQ and PADDQ instructions ... and i'm seeing a 30%+
1024-bit sign/s improvement on the p4 systems i've measured on.

but i'm too lazy to try to understand the perl asm generation crud, and
don't want to figure out how/if you folks would prefer to switch between
routines optimized for specific platforms.  so what i present here is a
total hack patch against a .s file.  do with it as you please :)

note that i use %mm registers rather than %xmm registers because this
code is completely dominated by the carry-propagation, which is a series
of 64-bit adds followed by shift-right 32 ... and if you attempt to
do this with 128-bit registers you waste a lot of slots mucking about
with packing and shuffling and such.
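
for reference, here's a rough C model of that scheme -- what
bn_mul_add_words computes, with the carry riding in the high half of a
64-bit accumulator just like it rides in the high half of an %mm
register.  the names follow the comments in the patch (A[], C[], scale);
this is a sketch, not the openssl source:

#include <stdint.h>

/* sketch: C[0..n-1] += scale * A[0..n-1], returns the final carry word */
uint32_t mul_add_words_model(uint32_t *C, const uint32_t *A,
                             int n, uint32_t scale)
{
    uint64_t acc = 0;                   /* carry lives in the high 32 bits */
    int i;

    for (i = 0; i < n; i++) {
        acc += (uint64_t)A[i] * scale;  /* 32x32->64, like pmuludq */
        acc += C[i];                    /* like the paddq of C[i] */
        C[i] = (uint32_t)acc;           /* store the low 32 bits */
        acc >>= 32;                     /* psrlq $32 -- carry for next word */
    }
    return (uint32_t)acc;               /* carry_out, what ends up in %esi */
}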

even still, this is SSE2-only code because the PMULUDQ and PADDQ
instructions don't exist in MMX/SSE.
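
so if you do wire this in, you'd want a runtime check before picking this
routine over the plain x86 loop.  something like the following -- gcc's
<cpuid.h> wrapper is my assumption here, any cpuid helper that reads leaf 1
and tests edx bit 26 (the SSE2 bit) will do:

#include <cpuid.h>

/* returns nonzero if the cpu advertises SSE2 (cpuid leaf 1, edx bit 26) */
static int have_sse2(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 0;
    return (edx & bit_SSE2) != 0;       /* bit_SSE2 == 1 << 26 */
}

pick the mmx/sse2 routine when that returns nonzero, and fall back to the
existing integer loop otherwise.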

if you look closely i'm doing only 32-bit loads and stores ... the
implicit zero-extension on the 32-bit load beats out any sort of creative
attempt to do 64-bit loads and shuffle the halves around.

it's unlikely that this technique can speed up the simple add/sub
routines -- unless there are situations where multiple add/sub could be
done in parallel... in the MMX hardware you can effectively parallelize
non-dependent carry propagation -- something you can't do in the ALUs
due to the conflict on EFLAGS.CF.
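
to make that concrete: with add/adc two chains would fight over the one
carry flag, but if each chain keeps its carry in the top half of its own
64-bit accumulator they never interact.  in C terms (just an illustration
of the idea, not proposed code):

#include <stdint.h>

/* two unrelated word additions side by side; each keeps its own carry in
   the high 32 bits of its 64-bit sum, so neither serializes on a shared
   carry flag the way back-to-back add/adc chains do */
static void add_two_chains(uint32_t x0, uint32_t y0, uint32_t *sum0, uint32_t *carry0,
                           uint32_t x1, uint32_t y1, uint32_t *sum1, uint32_t *carry1)
{
    uint64_t s0 = (uint64_t)x0 + y0;    /* like a paddq in one mm register */
    uint64_t s1 = (uint64_t)x1 + y1;    /* like a paddq in another */

    *sum0 = (uint32_t)s0;  *carry0 = (uint32_t)(s0 >> 32);
    *sum1 = (uint32_t)s1;  *carry1 = (uint32_t)(s1 >> 32);
}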

this code probably still has some slack that could be squeezed out... such
as moving the emms somewhere much higher in the call stack (it's required
before any fp code is run), and rearranging the loop so that it overlaps
the multiplies better with the carry chain propagation.

-dean

p.s. i'm not on the mailing list, so please CC me in any reply.

--- openssl-0.9.7a/crypto/bn/asm/bn86-elf.s	2003-03-23 21:29:16.000000000 -0800
+++ openssl-0.9.7a/crypto/bn/asm/bn86-elf.s.dg2	2003-03-23 21:18:05.000000000 -0800
@@ -26,94 +26,76 @@
 	movl	32(%esp),	%ebp
 	pushl	%ecx
 	jz	.L000maw_finish
-.L001maw_loop:
-	movl	%ecx,		(%esp)

-	movl	(%ebx),		%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	(%edi),		%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		(%edi)
-	movl	%edx,		%esi
+	movd %ebp,%mm0
+	pxor %mm1,%mm1		/* mm1 = carry in */

-	movl	4(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	4(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		4(%edi)
-	movl	%edx,		%esi
-
-	movl	8(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	8(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		8(%edi)
-	movl	%edx,		%esi
-
-	movl	12(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	12(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		12(%edi)
-	movl	%edx,		%esi
-
-	movl	16(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	16(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		16(%edi)
-	movl	%edx,		%esi
-
-	movl	20(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	20(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		20(%edi)
-	movl	%edx,		%esi
-
-	movl	24(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	24(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		24(%edi)
-	movl	%edx,		%esi
-
-	movl	28(%ebx),	%eax
-	mull	%ebp
-	addl	%esi,		%eax
-	movl	28(%edi),	%esi
-	adcl	$0,		%edx
-	addl	%esi,		%eax
-	adcl	$0,		%edx
-	movl	%eax,		28(%edi)
-	movl	%edx,		%esi
+.L001maw_loop:
+	movd (%edi),%mm3	/* mm3 = C[0]				*/
+	paddq %mm3,%mm1		/* mm1 = carry_in + C[0]		*/
+	movd (%ebx),%mm2	/* mm2 = A[0]				*/
+	pmuludq %mm0,%mm2	/* mm2 = scale*A[0]			*/
+	movd 4(%ebx),%mm4	/* mm4 = A[1]				*/
+	pmuludq %mm0,%mm4	/* mm4 = scale*A[1]			*/
+	movd 8(%ebx),%mm6	/* mm6 = A[2]				*/
+	pmuludq %mm0,%mm6	/* mm6 = scale*A[2]			*/
+	movd 12(%ebx),%mm7	/* mm7 = A[3]				*/
+	pmuludq %mm0,%mm7	/* mm7 = scale*A[3]			*/
+	paddq %mm2,%mm1		/* mm1 = carry_in + C[0] + scale*A[0]	*/
+	movd 4(%edi),%mm3	/* mm3 = C[1]				*/
+	paddq %mm4,%mm3		/* mm3 = C[1] + scale*A[1]		*/
+	movd 8(%edi),%mm5	/* mm5 = C[2]				*/
+	paddq %mm6,%mm5		/* mm5 = C[2] + scale*A[2]		*/
+	movd 12(%edi),%mm4	/* mm4 = C[3]				*/
+	paddq %mm4,%mm7		/* mm7 = C[3] + scale*A[3]		*/
+	movd %mm1,(%edi)
+	movd 16(%ebx),%mm2	/* mm2 = A[4]				*/
+	pmuludq %mm0,%mm2	/* mm2 = scale*A[4]			*/
+	psrlq $32,%mm1		/* mm1 = carry0			*/
+	movd 20(%ebx),%mm4	/* mm4 = A[5]				*/
+	pmuludq %mm0,%mm4	/* mm4 = scale*A[5]			*/
+	paddq %mm3,%mm1		/* mm1 = carry0 + C[1] + scale*A[1]	*/
+	movd 24(%ebx),%mm6	/* mm6 = A[6]				*/
+	pmuludq %mm0,%mm6	/* mm6 = scale*A[6]			*/
+	movd %mm1,4(%edi)
+	psrlq $32,%mm1		/* mm1 = carry1			*/
+	movd 28(%ebx),%mm3	/* mm3 = A[7]				*/
+	add $32,%ebx
+	pmuludq %mm0,%mm3	/* mm3 = scale*A[7]			*/
+	paddq %mm5,%mm1		/* mm1 = carry1 + C[2] + scale*A[2]	*/
+	movd 16(%edi),%mm5	/* mm5 = C[4]				*/
+	paddq %mm5,%mm2		/* mm2 = C[4] + scale*A[4]		*/
+	movd %mm1,8(%edi)
+	psrlq $32,%mm1		/* mm1 = carry2			*/
+	paddq %mm7,%mm1		/* mm1 = carry2 + C[3] + scale*A[3]	*/
+	movd 20(%edi),%mm5	/* mm5 = C[5]				*/
+	paddq %mm5,%mm4		/* mm4 = C[5] + scale*A[5]		*/
+	movd %mm1,12(%edi)
+	psrlq $32,%mm1		/* mm1 = carry3			*/
+	paddq %mm2,%mm1		/* mm1 = carry3 + C[4] + scale*A[4]	*/
+	movd 24(%edi),%mm5	/* mm5 = C[6]				*/
+	paddq %mm5,%mm6		/* mm6 = C[6] + scale*A[6]		*/
+	movd %mm1,16(%edi)
+	psrlq $32,%mm1		/* mm1 = carry4			*/
+	paddq %mm4,%mm1		/* mm1 = carry4 + C[5] + scale*A[5]	*/
+	movd 28(%edi),%mm5	/* mm5 = C[7]				*/
+	paddq %mm5,%mm3		/* mm3 = C[7] + scale*A[7]		*/
+	movd %mm1,20(%edi)
+	psrlq $32,%mm1		/* mm1 = carry5			*/
+	paddq %mm6,%mm1		/* mm1 = carry5 + C[6] + scale*A[6]	*/
+	movd %mm1,24(%edi)
+	psrlq $32,%mm1		/* mm1 = carry6			*/
+	paddq %mm3,%mm1		/* mm1 = carry6 + C[7] + scale*A[7]	*/
+	movd %mm1,28(%edi)
+	add $32,%edi
+	psrlq $32,%mm1		/* mm1 = carry_out			*/
+
+	sub $8,%ecx
+	jne .L001maw_loop
+	movd %mm1,%esi		/* esi = carry_out */
+	emms
+	mov %ecx,(%esp)

-	movl	(%esp),		%ecx
-	addl	$32,		%ebx
-	addl	$32,		%edi
-	subl	$8,		%ecx
-	jnz	.L001maw_loop
 .L000maw_finish:
 	movl	32(%esp),	%ecx
 	andl	$7,		%ecx