[PATCH 5/5] mpi: add ARMv6 assembly

Mon Aug 19 11:16:16 CEST 2013

* mpi/armv6/mpi-asm-defs.h: New.
* mpi/armv6/mpih-add1.S: New.
* mpi/armv6/mpih-mul1.S: New.
* mpi/armv6/mpih-mul2.S: New.
* mpi/armv6/mpih-mul3.S: New.
* mpi/armv6/mpih-sub1.S: New.
* mpi/config.links [arm]: Enable ARMv6 assembly.
--

Add mpi assembly for ARMv6 (or later). These are partly based on ARM assembly
found in GMP 4.2.1.

Old vs new (Cortex-A8, 1 GHz):

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit        1.14x     1.10x       1.13x
ECDSA 224 bit        1.11x     1.12x       1.12x
ECDSA 256 bit        1.20x     1.13x       1.14x
ECDSA 384 bit        1.13x     1.21x       1.21x
ECDSA 521 bit        1.17x     1.20x       1.22x
Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit             -     1.31x       1.60x
RSA 2048 bit             -     1.41x       1.47x
RSA 3072 bit             -     1.50x       1.63x
RSA 4096 bit             -     1.50x       1.57x
Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160             -     1.39x       1.38x
DSA 2048/224             -     1.50x       1.51x
DSA 3072/256             -     1.59x       1.64x

NEW:

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit         70ms    1750ms      3170ms
ECDSA 224 bit         90ms    2210ms      4250ms
ECDSA 256 bit        100ms    2710ms      5170ms
ECDSA 384 bit        230ms    5670ms     11040ms
ECDSA 521 bit        540ms   13370ms     25870ms
Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit         360ms    2200ms        50ms
RSA 2048 bit        2770ms   11900ms       150ms
RSA 3072 bit        6680ms   32530ms       270ms
RSA 4096 bit       10320ms   69440ms       460ms
Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160             -     990ms       910ms
DSA 2048/224             -    3830ms      3410ms
DSA 3072/256             -    8270ms      7030ms

OLD:

Algorithm         generate  100*sign  100*verify
------------------------------------------------
ECDSA 192 bit         80ms    1920ms      3580ms
ECDSA 224 bit        100ms    2470ms      4760ms
ECDSA 256 bit        120ms    3050ms      5870ms
ECDSA 384 bit        260ms    6840ms     13330ms
ECDSA 521 bit        630ms   16080ms     31500ms
Algorithm         generate  100*sign  100*verify
------------------------------------------------
RSA 1024 bit         450ms    2890ms        80ms
RSA 2048 bit        2320ms   16760ms       220ms
RSA 3072 bit       26300ms   48650ms       440ms
RSA 4096 bit       15700ms   103910ms      720ms
Algorithm         generate  100*sign  100*verify
------------------------------------------------
DSA 1024/160             -    1380ms      1260ms
DSA 2048/224             -    5740ms      5140ms
DSA 3072/256             -   13130ms     11510ms

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 mpi/armv6/mpi-asm-defs.h |   10 +++++
 mpi/armv6/mpih-add1.S    |   76 ++++++++++++++++++++++++++++++++++++
 mpi/armv6/mpih-mul1.S    |   80 ++++++++++++++++++++++++++++++++++++++
 mpi/armv6/mpih-mul2.S    |   94 +++++++++++++++++++++++++++++++++++++++++++++
 mpi/armv6/mpih-mul3.S    |   97 ++++++++++++++++++++++++++++++++++++++++++++++
 mpi/armv6/mpih-sub1.S    |   77 +++++++++++++++++++++++++++++++++++++
 mpi/config.links         |   15 +++++++
 7 files changed, 449 insertions(+)
 create mode 100644 mpi/armv6/mpi-asm-defs.h
 create mode 100644 mpi/armv6/mpih-add1.S
 create mode 100644 mpi/armv6/mpih-mul1.S
 create mode 100644 mpi/armv6/mpih-mul2.S
 create mode 100644 mpi/armv6/mpih-mul3.S
 create mode 100644 mpi/armv6/mpih-sub1.S

diff --git a/mpi/armv6/mpi-asm-defs.h b/mpi/armv6/mpi-asm-defs.h
new file mode 100644
index 0000000..13424e2
--- /dev/null
+++ b/mpi/armv6/mpi-asm-defs.h
@@ -0,0 +1,10 @@
+/* Basic constants for the MPI machinery.  The limb type is chosen on a
+ * per-CPU basis, so every architecture directory carries its own copy.
+ * BYTES_PER_MPI_LIMB follows SIZEOF_UNSIGNED_LONG (defined elsewhere).  */
+#define BYTES_PER_MPI_LIMB  (SIZEOF_UNSIGNED_LONG)
+
+
+
+
+
+
diff --git a/mpi/armv6/mpih-add1.S b/mpi/armv6/mpih-add1.S
new file mode 100644
index 0000000..60ea4c3
--- /dev/null
+++ b/mpi/armv6/mpih-add1.S
@@ -0,0 +1,76 @@
+/* ARMv6 add_n -- Add two limb vectors of the same length > 0 and store
+ *                sum in a third limb vector.
+ *
+ *      Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Note: This code is heavily based on the GNU MP Library (version 4.2.1).
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.syntax unified
+.arm
+
+/*******************
+ *  mpi_limb_t
+ *  _gcry_mpih_add_n( mpi_ptr_t res_ptr,	%r0
+ *		   mpi_ptr_t s1_ptr,		%r1
+ *		   mpi_ptr_t s2_ptr,		%r2
+ *		   mpi_size_t size)		%r3
+ */
+
+.text
+
+.globl _gcry_mpih_add_n
+.type  _gcry_mpih_add_n,%function
+_gcry_mpih_add_n: /* res[] = s1[] + s2[] over size limbs; returns the carry-out (0 or 1) in %r0 */
+	push	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; /* %lr is reused as a scratch register below */
+	cmn	%r0, #0; /* clear carry flag */
+
+	tst	%r3, #3; /* is size already a multiple of 4? */
+	beq	.Large_loop; /* NOTE(review): size == 0 would also land here; the header requires length > 0 */
+
+.Loop: /* one limb per pass until the remaining count is a multiple of 4 */
+	ldr	%r4, [%r1], #4; /* next s1 limb, s1_ptr advanced */
+	sub	%r3, #1; /* no 's' suffix: flags (incl. carry) are not written */
+	ldr	%lr, [%r2], #4; /* next s2 limb, s2_ptr advanced */
+	adcs	%r4, %lr; /* sum limb; carry-out feeds the next adcs */
+	tst	%r3, #3; /* provides Z for the bne; the adcs chain relies on C surviving this */
+	str	%r4, [%r0], #4; /* store sum limb, res_ptr advanced */
+	bne	.Loop;
+
+	teq	%r3, #0; /* nothing left after the single-limb passes? */
+	beq	.Lend;
+
+.Large_loop: /* four limbs per pass; exits only when %r3 reaches exactly 0 */
+	ldm	%r1!, {%r4, %r6, %r8, %r10}; /* four s1 limbs, s1_ptr advanced */
+	ldm	%r2!, {%r5, %r7, %r9, %lr}; /* four s2 limbs, s2_ptr advanced */
+	sub	%r3, #4; /* no 's' suffix: carry flag is left alone */
+	adcs	%r4, %r5; /* carry ripples through the four additions */
+	adcs	%r6, %r7;
+	adcs	%r8, %r9;
+	adcs	%r10, %lr;
+	teq	%r3, #0; /* provides Z for the bne; the adcs chain relies on C surviving this */
+	stm	%r0!, {%r4, %r6, %r8, %r10}; /* store four sum limbs, res_ptr advanced */
+	bne	.Large_loop;
+
+.Lend: /* every path arrives here with %r3 == 0 */
+	adc	%r0, %r3, #0; /* 0 + 0 + C: return the final carry */
+	pop	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; /* saved %lr is popped into %pc: return */
+.size _gcry_mpih_add_n,.-_gcry_mpih_add_n;
diff --git a/mpi/armv6/mpih-mul1.S b/mpi/armv6/mpih-mul1.S
new file mode 100644
index 0000000..ae19a15
--- /dev/null
+++ b/mpi/armv6/mpih-mul1.S
@@ -0,0 +1,80 @@
+/* ARMv6 mul_1 -- Multiply a limb vector with a limb and store the result in
+ *                a second limb vector.
+ *
+ *      Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Note: This code is heavily based on the GNU MP Library (version 4.2.1).
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.syntax unified
+.arm
+
+/*******************
+ * mpi_limb_t
+ * _gcry_mpih_mul_1( mpi_ptr_t res_ptr,		%r0
+ *		  mpi_ptr_t s1_ptr,		%r1
+ *		  mpi_size_t s1_size,		%r2
+ *		  mpi_limb_t s2_limb)		%r3
+ */
+
+.text
+
+.globl _gcry_mpih_mul_1
+.type  _gcry_mpih_mul_1,%function
+_gcry_mpih_mul_1: /* res[] = s1[] * s2_limb over s1_size limbs; returns the final carry limb in %r0 */
+	push	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %lr}; /* %lr is reused as a scratch register below */
+	mov	%r4, #0; /* %r4 = carry limb going into the next product */
+
+	tst	%r2, #3; /* is s1_size already a multiple of 4? */
+	beq	.Large_loop; /* NOTE(review): s1_size == 0 would also land here and .Large_loop only exits at exactly 0 - confirm callers never pass 0 */
+
+.Loop: /* one limb per pass until the remaining count is a multiple of 4 */
+	ldr	%r5, [%r1], #4; /* next s1 limb, s1_ptr advanced */
+	mov	%lr, #0; /* high word of the 64-bit accumulator starts at 0 */
+	umlal	%r4, %lr, %r5, %r3; /* %lr:%r4 = %r5 * s2_limb + carry limb */
+	sub	%r2, #1;
+	str	%r4, [%r0], #4; /* low word is the result limb, res_ptr advanced */
+	tst	%r2, #3; /* provides Z for the bne below */
+	mov	%r4, %lr; /* high word becomes the next carry limb (no 's': flags kept) */
+	bne	.Loop;
+
+	teq	%r2, #0; /* nothing left after the single-limb passes? */
+	beq	.Lend;
+
+.Large_loop: /* four limbs per pass; each umlal's high word is the next umlal's low word */
+	mov	%r9, #0;
+	ldm	%r1!, {%r5, %r6, %r7, %r8}; /* four s1 limbs, s1_ptr advanced */
+	mov	%r10, #0;
+	umlal	%r4, %r9, %r5, %r3; /* %r9:%r4 = limb0 * s2_limb + carry limb */
+	mov	%r11, #0;
+	umlal	%r9, %r10, %r6, %r3; /* %r10:%r9 = limb1 * s2_limb + %r9 */
+	mov	%lr, #0;
+	umlal	%r10, %r11, %r7, %r3; /* %r11:%r10 = limb2 * s2_limb + %r10 */
+	subs	%r2, #4; /* sets Z for the bne at the bottom; nothing in between writes flags */
+	umlal	%r11, %lr, %r8, %r3; /* %lr:%r11 = limb3 * s2_limb + %r11 */
+	stm	%r0!, {%r4, %r9, %r10, %r11}; /* store four result limbs, res_ptr advanced */
+	mov	%r4, %lr; /* carry limb for the next pass, or the return value */
+	bne	.Large_loop;
+
+.Lend:
+	mov	%r0, %r4; /* return the last carry limb */
+	pop	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %r11, %pc}; /* saved %lr is popped into %pc: return */
+.size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1;
diff --git a/mpi/armv6/mpih-mul2.S b/mpi/armv6/mpih-mul2.S
new file mode 100644
index 0000000..02f7c07
--- /dev/null
+++ b/mpi/armv6/mpih-mul2.S
@@ -0,0 +1,94 @@
+/* ARMv6 addmul_1 -- Multiply a limb vector with a limb and add the result to
+ *                   a second limb vector.
+ *
+ *      Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Note: This code is heavily based on the GNU MP Library (version 4.2.1).
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.syntax unified
+.arm
+
+/*******************
+ * mpi_limb_t
+ * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr,	%r0
+ *		     mpi_ptr_t s1_ptr,		%r1
+ *		     mpi_size_t s1_size,	%r2
+ *		     mpi_limb_t s2_limb)	%r3
+ */
+
+.text
+
+.globl _gcry_mpih_addmul_1
+.type  _gcry_mpih_addmul_1,%function
+_gcry_mpih_addmul_1: /* res[] += s1[] * s2_limb over s1_size limbs; returns the final carry limb in %r0 */
+	push	{%r4, %r5, %r6, %r8, %r10, %lr}; /* %lr is reused as the carry limb below */
+	mov	%lr, #0; /* %lr = carry limb (high word of the previous product) */
+	cmn	%r0, #0; /* clear carry flag */
+
+	tst	%r2, #3; /* is s1_size already a multiple of 4? */
+	beq	.Large_loop; /* NOTE(review): s1_size == 0 would also land here and .Large_loop only exits at exactly 0 - confirm callers never pass 0 */
+.Loop: /* one limb per pass until the remaining count is a multiple of 4 */
+	ldr	%r5, [%r1], #4; /* next s1 limb, s1_ptr advanced */
+	ldr	%r4, [%r0]; /* current res limb; res_ptr is advanced by the str below */
+	sub	%r2, #1; /* no 's' suffix: carry flag is left alone */
+	adcs	%r4, %lr; /* res limb + carry limb + C */
+	mov	%lr, #0; /* high word of the 64-bit accumulator starts at 0 */
+	umlal	%r4, %lr, %r5, %r3; /* %lr:%r4 = %r5 * s2_limb + %r4; the adcs carry stays pending in C */
+	tst	%r2, #3; /* provides Z for the bne; the adcs chain relies on C surviving this */
+	str	%r4, [%r0], #4; /* store updated res limb, res_ptr advanced */
+	bne	.Loop;
+
+	teq	%r2, #0; /* nothing left after the single-limb passes? */
+	beq	.Lend;
+
+.Large_loop: /* four limbs per pass; exits only when %r2 reaches exactly 0 */
+	ldm	%r0, {%r4, %r6, %r8, %r10}; /* four res limbs; no writeback, the stm below advances res_ptr */
+	ldr	%r5, [%r1], #4; /* s1 limb 0 */
+
+	sub	%r2, #4; /* no 's' suffix: carry flag is left alone */
+	adcs	%r4, %lr; /* res limb 0 + carry limb + C */
+	mov	%lr, #0;
+	umlal	%r4, %lr, %r5, %r3; /* %lr:%r4 = limb0 * s2_limb + %r4 */
+
+	ldr	%r5, [%r1], #4; /* s1 limb 1 */
+	adcs	%r6, %lr;
+	mov	%lr, #0;
+	umlal	%r6, %lr, %r5, %r3;
+
+	ldr	%r5, [%r1], #4; /* s1 limb 2 */
+	adcs	%r8, %lr;
+	mov	%lr, #0;
+	umlal	%r8, %lr, %r5, %r3;
+
+	ldr	%r5, [%r1], #4; /* s1 limb 3 */
+	adcs	%r10, %lr;
+	mov	%lr, #0;
+	umlal	%r10, %lr, %r5, %r3;
+
+	teq	%r2, #0; /* provides Z for the bne; the adcs chain relies on C surviving this */
+	stm	%r0!, {%r4, %r6, %r8, %r10}; /* store four updated res limbs, res_ptr advanced */
+	bne	.Large_loop;
+
+.Lend:
+	adc	%r0, %lr, #0; /* return the carry limb plus the still-pending carry flag */
+	pop	{%r4, %r5, %r6, %r8, %r10, %pc}; /* saved %lr is popped into %pc: return */
+.size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1;
diff --git a/mpi/armv6/mpih-mul3.S b/mpi/armv6/mpih-mul3.S
new file mode 100644
index 0000000..e42fc30
--- /dev/null
+++ b/mpi/armv6/mpih-mul3.S
@@ -0,0 +1,97 @@
+/* ARMv6 submul_1 -- Multiply a limb vector with a limb and subtract the
+ *                   result from a second limb vector.
+ *
+ *      Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Note: This code is heavily based on the GNU MP Library (version 4.2.1).
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.syntax unified
+.arm
+
+/*******************
+ * mpi_limb_t
+ * _gcry_mpih_submul_1( mpi_ptr_t res_ptr,	%r0
+ *		     mpi_ptr_t s1_ptr,		%r1
+ *		     mpi_size_t s1_size,	%r2
+ *		     mpi_limb_t s2_limb)	%r3
+ */
+
+.text
+
+.globl _gcry_mpih_submul_1
+.type  _gcry_mpih_submul_1,%function
+_gcry_mpih_submul_1: /* res[] -= s1[] * s2_limb over s1_size limbs; returns the final borrow limb in %r0 */
+	push	{%r4, %r5, %r6, %r8, %r9, %r10, %lr}; /* %lr is reused as the carry limb below */
+	mov	%lr, #0; /* %lr = carry limb (high word of the previous product) */
+	cmp	%r0, #0; /* prepare carry flag for sbc */
+
+	tst	%r2, #3; /* is s1_size already a multiple of 4? */
+	beq	.Large_loop; /* NOTE(review): s1_size == 0 would also land here and .Large_loop only exits at exactly 0 - confirm callers never pass 0 */
+.Loop: /* one limb per pass until the remaining count is a multiple of 4 */
+	ldr	%r5, [%r1], #4; /* next s1 limb, s1_ptr advanced */
+	mov	%r4, %lr; /* low accumulator word = incoming carry limb */
+	mov	%lr, #0; /* high accumulator word starts at 0 */
+	ldr	%r6, [%r0]; /* current res limb; res_ptr is advanced by the str below */
+	umlal	%r4, %lr, %r5, %r3; /* %lr:%r4 = %r5 * s2_limb + carry limb */
+	sub	%r2, #1; /* no 's' suffix: carry flag is left alone */
+	sbcs	%r4, %r6, %r4; /* res limb - low product word - pending borrow */
+	tst	%r2, #3; /* provides Z for the bne; the sbcs chain relies on C surviving this */
+	str	%r4, [%r0], #4; /* store updated res limb, res_ptr advanced */
+	bne	.Loop;
+
+	teq	%r2, #0; /* nothing left after the single-limb passes? */
+	beq	.Lend;
+
+.Large_loop: /* four limbs per pass; %lr and %r9 alternate as low/high accumulator words */
+	ldr	%r5, [%r1], #4; /* s1 limb 0 */
+	ldm	%r0, {%r4, %r6, %r8, %r10}; /* four res limbs; no writeback, the stm below advances res_ptr */
+
+	mov	%r9, #0;
+	umlal	%lr, %r9, %r5, %r3; /* %r9:%lr = limb0 * s2_limb + carry limb */
+	ldr	%r5, [%r1], #4; /* s1 limb 1 */
+	sbcs	%r4, %r4, %lr; /* res limb 0 - low product word - pending borrow */
+
+	mov	%lr, #0;
+	umlal	%r9, %lr, %r5, %r3; /* %lr:%r9 = limb1 * s2_limb + %r9 */
+	ldr	%r5, [%r1], #4; /* s1 limb 2 */
+	sbcs	%r6, %r6, %r9;
+
+	mov	%r9, #0;
+	umlal	%lr, %r9, %r5, %r3; /* %r9:%lr = limb2 * s2_limb + %lr */
+	ldr	%r5, [%r1], #4; /* s1 limb 3 */
+	sbcs	%r8, %r8, %lr;
+
+	mov	%lr, #0;
+	umlal	%r9, %lr, %r5, %r3; /* %lr:%r9 = limb3 * s2_limb + %r9; %lr is the carry limb for the next pass */
+	sub	%r2, #4; /* no 's' suffix: carry flag is left alone */
+	sbcs	%r10, %r10, %r9;
+
+	teq	%r2, #0; /* provides Z for the bne; the sbcs chain relies on C surviving this */
+	stm	%r0!, {%r4, %r6, %r8, %r10}; /* store four updated res limbs, res_ptr advanced */
+	bne	.Large_loop;
+
+.Lend: /* every path arrives here with %r2 == 0 */
+	it	cc /* NOTE(review): presumably only matters when assembled as Thumb-2; this file selects .arm - confirm */
+	movcc	%r2, #1; /* C clear after the last sbcs (a borrow is pending): %r2 = 1 */
+	add	%r0, %lr, %r2; /* return carry limb + pending borrow */
+	pop	{%r4, %r5, %r6, %r8, %r9, %r10, %pc}; /* saved %lr is popped into %pc: return */
+.size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1;
diff --git a/mpi/armv6/mpih-sub1.S b/mpi/armv6/mpih-sub1.S
new file mode 100644
index 0000000..77d05eb
--- /dev/null
+++ b/mpi/armv6/mpih-sub1.S
@@ -0,0 +1,77 @@
+/* ARMv6 sub_n -- Subtract two limb vectors of the same length > 0 and store
+ *		  difference in a third limb vector.
+ *
+ *      Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ *
+ * Note: This code is heavily based on the GNU MP Library (version 4.2.1).
+ */
+
+#include "sysdep.h"
+#include "asm-syntax.h"
+
+.syntax unified
+.arm
+
+/*******************
+ *  mpi_limb_t
+ *  _gcry_mpih_sub_n( mpi_ptr_t res_ptr,	%r0
+ *		   mpi_ptr_t s1_ptr,		%r1
+ *		   mpi_ptr_t s2_ptr,		%r2
+ *		   mpi_size_t size)		%r3
+ */
+
+.text
+
+.globl _gcry_mpih_sub_n
+.type  _gcry_mpih_sub_n,%function
+_gcry_mpih_sub_n: /* res[] = s1[] - s2[] over size limbs; returns the borrow-out (0 or 1) in %r0 */
+	push	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %lr}; /* %lr is reused as a scratch register below */
+	cmp	%r0, #0; /* prepare carry flag for sub */
+
+	tst	%r3, #3; /* is size already a multiple of 4? */
+	beq	.Large_loop; /* NOTE(review): size == 0 would also land here; the header requires length > 0 */
+
+.Loop: /* one limb per pass until the remaining count is a multiple of 4 */
+	ldr	%r4, [%r1], #4; /* next s1 limb, s1_ptr advanced */
+	sub	%r3, #1; /* no 's' suffix: flags (incl. carry) are not written */
+	ldr	%lr, [%r2], #4; /* next s2 limb, s2_ptr advanced */
+	sbcs	%r4, %lr; /* difference limb; borrow state feeds the next sbcs */
+	tst	%r3, #3; /* provides Z for the bne; the sbcs chain relies on C surviving this */
+	str	%r4, [%r0], #4; /* store difference limb, res_ptr advanced */
+	bne	.Loop;
+
+	teq	%r3, #0; /* nothing left after the single-limb passes? */
+	beq	.Lend;
+
+.Large_loop: /* four limbs per pass; exits only when %r3 reaches exactly 0 */
+	ldm	%r1!, {%r4, %r6, %r8, %r10}; /* four s1 limbs, s1_ptr advanced */
+	sub	%r3, #4; /* no 's' suffix: carry flag is left alone */
+	ldm	%r2!, {%r5, %r7, %r9, %lr}; /* four s2 limbs, s2_ptr advanced */
+	sbcs	%r4, %r5; /* borrow ripples through the four subtractions */
+	sbcs	%r6, %r7;
+	sbcs	%r8, %r9;
+	sbcs	%r10, %lr;
+	teq	%r3, #0; /* provides Z for the bne; the sbcs chain relies on C surviving this */
+	stm	%r0!, {%r4, %r6, %r8, %r10}; /* store four difference limbs, res_ptr advanced */
+	bne	.Large_loop;
+
+.Lend: /* every path arrives here with %r3 == 0 */
+	sbc	%r0, %r3, #0; /* 0 - 0 - pending borrow: yields 0 or -1 */
+	neg	%r0, %r0; /* return the borrow as 0 or 1 */
+	pop	{%r4, %r5, %r6, %r7, %r8, %r9, %r10, %pc}; /* saved %lr is popped into %pc: return */
+.size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n;
diff --git a/mpi/config.links b/mpi/config.links
index bcc6e3e..f300255 100644
--- a/mpi/config.links
+++ b/mpi/config.links
@@ -136,6 +136,21 @@ case "${host}" in
 	mpi_extra_modules="udiv-qrnnd"
         mpi_cpu_arch="alpha"
 	;;
+    arm*-*-*)  # any host triplet whose CPU field starts with "arm"
+	if test "$gcry_cv_gcc_arm_platform_as_ok" = "yes" ; then  # variable is set outside this file; presumably by configure
+	  if test "$gcry_cv_cc_arm_arch_is_v6" = "yes" ; then  # variable is set outside this file; presumably by configure
+	    echo '/* configured for armv6 */' >>./mpi/asm-syntax.h
+	    path="armv6"  # NOTE(review): presumably the mpi/ subdirectory searched for assembler modules - confirm
+	    mpi_cpu_arch="armv6"
+	  else  # assembler usable but the target is not reported as ARMv6
+	    echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h
+	    path=""
+	  fi
+	else  # ARM assembler check did not report "yes"
+	  echo '/* No assembler modules configured */' >>./mpi/asm-syntax.h
+	  path=""
+	fi
+	;;
     hppa7000*-*-*)
 	echo '/* configured for HPPA (pa7000) */' >>./mpi/asm-syntax.h
 	path="hppa1.1 hppa"