[PATCH 4/5] mpi/amd64: optimize add_n and sub_n

Jussi Kivilinna jussi.kivilinna at iki.fi
Sat Apr 22 09:35:38 CEST 2023


* mpi/amd64/mpih-add1.S (_gcry_mpih_add_n): New implementation
with 4x unrolled fast-path loop.
* mpi/amd64/mpih-sub1.S (_gcry_mpih_sub_n): Likewise.
--
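
For reference, the operation these routines implement is a plain limb-wise
add with carry propagation (sub_n is the same with borrow, sbbq instead of
adcq). A minimal C sketch of the semantics, assuming 64-bit limbs as on
amd64; this is an illustration, not the generic C implementation verbatim:

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;	/* 64-bit limbs, as on amd64 */

/* r = a + b over n limbs; returns the final carry (0 or 1).  */
limb_t
ref_add_n (limb_t *rp, const limb_t *ap, const limb_t *bp, size_t n)
{
  limb_t cy = 0;
  size_t i;

  for (i = 0; i < n; i++)
    {
      limb_t s = ap[i] + bp[i] + cy;
      /* Carry out: the sum wrapped, or a carry-in rippled through.  */
      cy = (s < ap[i]) | (cy & (s == ap[i]));
      rp[i] = s;
    }
  return cy;
}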

Benchmark on AMD Ryzen 9 7900X:

 Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add                |     0.035 ns/B     27559 MiB/s     0.163 c/B      4700
 sub                |     0.034 ns/B     28332 MiB/s     0.158 c/B      4700

 After (~26% faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 add                |     0.027 ns/B     35271 MiB/s     0.127 c/B      4700
 sub                |     0.027 ns/B     35206 MiB/s     0.127 c/B      4700
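
The speedup comes from doing four limbs per loop iteration. The size % 4
leftover limbs are handled first by the .Lprehandle1/2/3 entry points;
their first addition is a plain addq, so no incoming carry has to be
cleared on those paths, while the size % 4 == 0 entry just clears CF (clc)
and falls straight into the loop. The main loop then keeps the carry live
in CF the whole time: leaq and decq do all the pointer and counter
bookkeeping precisely because neither instruction modifies the carry flag.
A C sketch of that control flow (illustrative; in the real code the carry
lives in the flags register, not a variable):

#include <stddef.h>
#include <stdint.h>

typedef uint64_t limb_t;

/* One adcq step: returns a + b + *cy, carry in/out through *cy.  */
static inline limb_t
adc_limb (limb_t a, limb_t b, limb_t *cy)
{
  limb_t s = a + b + *cy;
  *cy = (s < a) | (*cy & (s == a));
  return s;
}

limb_t
unrolled_add_n (limb_t *rp, const limb_t *ap, const limb_t *bp, size_t n)
{
  limb_t cy = 0;
  size_t i = 0;

  /* Pre-handle n % 4 limbs (.Lprehandle1/2/3); entering with cy = 0
     matches FIRST_ADD's plain addq.  */
  for (; i < (n & 3); i++)
    rp[i] = adc_limb (ap[i], bp[i], &cy);

  /* 4x unrolled fast path (.Loop): one counter update per four limbs,
     like the single decq per iteration in the assembly.  */
  for (; i < n; i += 4)
    {
      rp[i + 0] = adc_limb (ap[i + 0], bp[i + 0], &cy);
      rp[i + 1] = adc_limb (ap[i + 1], bp[i + 1], &cy);
      rp[i + 2] = adc_limb (ap[i + 2], bp[i + 2], &cy);
      rp[i + 3] = adc_limb (ap[i + 3], bp[i + 3], &cy);
    }
  return cy;
}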

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 mpi/amd64/mpih-add1.S | 81 ++++++++++++++++++++++++++++++++++++-------
 mpi/amd64/mpih-sub1.S | 80 +++++++++++++++++++++++++++++++++++-------
 2 files changed, 136 insertions(+), 25 deletions(-)

diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S
index 833a43cb..f2e86237 100644
--- a/mpi/amd64/mpih-add1.S
+++ b/mpi/amd64/mpih-add1.S
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998, 
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -39,26 +40,80 @@
  *		   mpi_ptr_t s2_ptr,		rdx
  *		   mpi_size_t size)		rcx
  */
-
 	TEXT
 	ALIGN(4)
 	.globl C_SYMBOL_NAME(_gcry_mpih_add_n)
 C_SYMBOL_NAME(_gcry_mpih_add_n:)
 	FUNC_ENTRY()
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		/* clear cy */
+	movl	%ecx, %r9d
+	andl	$3, %r9d
+	je	.Lprehandle0
+	cmpl	$2, %r9d
+	jb	.Lprehandle1
+	je	.Lprehandle2
+
+#define FIRST_ADD() \
+	movq	(%rsi), %rax; \
+	addq	(%rdx), %rax; \
+	movq	%rax, (%rdi)
+
+#define NEXT_ADD(offset) \
+	movq	offset(%rsi), %rax; \
+	adcq	offset(%rdx), %rax; \
+	movq	%rax, offset(%rdi)
+
+.Lprehandle3:
+	leaq	-2(%rcx), %rcx
+	FIRST_ADD();
+	NEXT_ADD(8);
+	NEXT_ADD(16);
+	decq	%rcx
+	je	.Lend
+	leaq	24(%rsi), %rsi
+	leaq	24(%rdx), %rdx
+	leaq	24(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle2:
+	leaq	-1(%rcx), %rcx
+	FIRST_ADD();
+	NEXT_ADD(8);
+	decq	%rcx
+	je	.Lend
+	leaq	16(%rsi), %rsi
+	leaq	16(%rdx), %rdx
+	leaq	16(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle1:
+	FIRST_ADD();
+	decq	%rcx
+	je	.Lend
+	leaq	8(%rsi), %rsi
+	leaq	8(%rdx), %rdx
+	leaq	8(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle0:
+	clc				/* clear cy */
 
 	ALIGN(4)			/* minimal alignment for claimed speed */
-.Loop:	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	adcq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
+.Loop:	leaq	-3(%rcx), %rcx
+	NEXT_ADD(0);
+	NEXT_ADD(8);
+	NEXT_ADD(16);
+	NEXT_ADD(24);
+	leaq	32(%rsi), %rsi
+	leaq	32(%rdx), %rdx
+	leaq	32(%rdi), %rdi
+	decq	%rcx
 	jne	.Loop
 
-	movq	%rcx, %rax		/* zero %rax */
-	adcq	%rax, %rax
+	ALIGN(2)
+.Lend:
+	movl	$0, %eax		/* zero %rax */
+	adcl	%eax, %eax
 	FUNC_EXIT()
diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S
index 8c61cb20..32799c86 100644
--- a/mpi/amd64/mpih-sub1.S
+++ b/mpi/amd64/mpih-sub1.S
@@ -3,6 +3,7 @@
  *
  *      Copyright (C) 1992, 1994, 1995, 1998, 
  *                    2001, 2002, 2006 Free Software Foundation, Inc.
+ *      Copyright (C) 2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -44,20 +45,75 @@
 	.globl C_SYMBOL_NAME(_gcry_mpih_sub_n)
 C_SYMBOL_NAME(_gcry_mpih_sub_n:)
 	FUNC_ENTRY()
-	leaq	(%rsi,%rcx,8), %rsi
-	leaq	(%rdi,%rcx,8), %rdi
-	leaq	(%rdx,%rcx,8), %rdx
-	negq	%rcx
-	xorl	%eax, %eax		/* clear cy */
+	movl	%ecx, %r9d
+	andl	$3, %r9d
+	je	.Lprehandle0
+	cmpl	$2, %r9d
+	jb	.Lprehandle1
+	je	.Lprehandle2
+
+#define FIRST_SUB() \
+	movq	(%rsi), %rax; \
+	subq	(%rdx), %rax; \
+	movq	%rax, (%rdi)
+
+#define NEXT_SUB(offset) \
+	movq	offset(%rsi), %rax; \
+	sbbq	offset(%rdx), %rax; \
+	movq	%rax, offset(%rdi)
+
+.Lprehandle3:
+	leaq	-2(%rcx), %rcx
+	FIRST_SUB();
+	NEXT_SUB(8);
+	NEXT_SUB(16);
+	decq	%rcx
+	je	.Lend
+	leaq	24(%rsi), %rsi
+	leaq	24(%rdx), %rdx
+	leaq	24(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle2:
+	leaq	-1(%rcx), %rcx
+	FIRST_SUB();
+	NEXT_SUB(8);
+	decq	%rcx
+	je	.Lend
+	leaq	16(%rsi), %rsi
+	leaq	16(%rdx), %rdx
+	leaq	16(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle1:
+	FIRST_SUB();
+	decq	%rcx
+	je	.Lend
+	leaq	8(%rsi), %rsi
+	leaq	8(%rdx), %rdx
+	leaq	8(%rdi), %rdi
+	jmp	.Loop
+
+	ALIGN(3)
+.Lprehandle0:
+	clc				/* clear cy */
 
 	ALIGN(4)			/* minimal alignment for claimed speed */
-.Loop:	movq	(%rsi,%rcx,8), %rax
-	movq	(%rdx,%rcx,8), %r10
-	sbbq	%r10, %rax
-	movq	%rax, (%rdi,%rcx,8)
-	incq	%rcx
+.Loop:	leaq	-3(%rcx), %rcx
+	NEXT_SUB(0);
+	NEXT_SUB(8);
+	NEXT_SUB(16);
+	NEXT_SUB(24);
+	leaq	32(%rsi), %rsi
+	leaq	32(%rdx), %rdx
+	leaq	32(%rdi), %rdi
+	decq	%rcx
 	jne	.Loop
 
-	movq	%rcx, %rax		/* zero %rax */
-	adcq	%rax, %rax
+	ALIGN(2)
+.Lend:
+	movl	$0, %eax		/* zero %rax */
+	adcl	%eax, %eax
 	FUNC_EXIT()
-- 
2.39.2
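
Not part of the patch, but a hypothetical harness for the sketch above,
useful for convincing yourself the leftover-limb logic is right: it runs
every n % 4 case (n = 1..8) against a straightforward __int128 reference,
including a full-ripple input (b = -a). Compile it together with
unrolled_add_n from the previous sketch; __int128 is a GCC/Clang
extension, fine on amd64.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t limb_t;

limb_t unrolled_add_n (limb_t *rp, const limb_t *ap, const limb_t *bp,
		       size_t n);

/* Per-limb reference using 128-bit intermediates.  */
static limb_t
ref_add (limb_t *rp, const limb_t *ap, const limb_t *bp, size_t n)
{
  unsigned __int128 acc = 0;
  size_t i;

  for (i = 0; i < n; i++)
    {
      acc += (unsigned __int128)ap[i] + bp[i];
      rp[i] = (limb_t)acc;
      acc >>= 64;
    }
  return (limb_t)acc;
}

int
main (void)
{
  limb_t a[8], b[8], r1[8], r2[8];
  uint64_t x = 0x9e3779b97f4a7c15ULL;	/* deterministic filler */
  size_t i, n;
  int variant;

  for (i = 0; i < 8; i++)
    a[i] = (x += 0xbf58476d1ce4e5b9ULL);

  for (variant = 0; variant < 2; variant++)
    {
      /* Variant 0: pseudorandom b.  Variant 1: b = -a over the limbs,
	 so limb 0 generates a carry that ripples through every limb.  */
      for (i = 0; i < 8; i++)
	b[i] = variant ? ~a[i] + (i == 0) : (x += 0x94d049bb133111ebULL);

      for (n = 1; n <= 8; n++)
	{
	  limb_t c1 = unrolled_add_n (r1, a, b, n);
	  limb_t c2 = ref_add (r2, a, b, n);
	  assert (c1 == c2 && memcmp (r1, r2, n * sizeof *r1) == 0);
	}
    }
  puts ("ok");
  return 0;
}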