[PATCH 1/3] Use 'vmov' and 'movi' for vector register clearing in ARM assembly

Jussi Kivilinna jussi.kivilinna at iki.fi
Sat Jan 8 12:06:10 CET 2022


* cipher/chacha20-aarch64.S (clear): Use 'movi'.
* cipher/chacha20-armv7-neon.S (clear): Use 'vmov'.
* cipher/cipher-gcm-armv7-neon.S (clear): Use 'vmov'.
* cipher/cipher-gcm-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'.
* cipher/cipher-gcm-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'.
* cipher/rijndael-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'.
* cipher/sha1-armv7-neon.S (clear): Use 'vmov'.
* cipher/sha1-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'.
* cipher/sha1-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'.
* cipher/sha256-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'.
* cipher/sha256-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'.
* cipher/sha512-armv7-neon.S (CLEAR_REG): New macro using 'vmov'.
(_gcry_sha512_transform_armv7_neon): Use CLEAR_REG for clearing
registers.
--

Use 'vmov reg, #0' on 32-bit (ARMv7/NEON) and 'movi reg.16b, #0' on
64-bit (AArch64) instead of self-XORing registers, to break the false
register dependency that a self-XOR creates on the destination.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/chacha20-aarch64.S            |  2 +-
 cipher/chacha20-armv7-neon.S         |  2 +-
 cipher/cipher-gcm-armv7-neon.S       |  2 +-
 cipher/cipher-gcm-armv8-aarch32-ce.S |  2 +-
 cipher/cipher-gcm-armv8-aarch64-ce.S |  2 +-
 cipher/rijndael-armv8-aarch32-ce.S   |  2 +-
 cipher/sha1-armv7-neon.S             |  2 +-
 cipher/sha1-armv8-aarch32-ce.S       |  2 +-
 cipher/sha1-armv8-aarch64-ce.S       |  2 +-
 cipher/sha256-armv8-aarch32-ce.S     |  2 +-
 cipher/sha256-armv8-aarch64-ce.S     |  2 +-
 cipher/sha512-armv7-neon.S           | 26 ++++++++++++++------------
 12 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
index b8f9724a..4f76834b 100644
--- a/cipher/chacha20-aarch64.S
+++ b/cipher/chacha20-aarch64.S
@@ -110,7 +110,7 @@
 	vpunpcklqdq(x2, t2, x2);
 
 #define clear(x) \
-	eor x.16b, x.16b, x.16b;
+	movi x.16b, #0;
 
 /**********************************************************************
   4-way chacha20
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
index 33a43df1..a862be4e 100644
--- a/cipher/chacha20-armv7-neon.S
+++ b/cipher/chacha20-armv7-neon.S
@@ -132,7 +132,7 @@
 	vswp _q0##h, _q2##l;			\
 	vswp _q1##h, _q3##l;
 
-#define clear(x) veor x,x,x;
+#define clear(x) vmov.i8 x, #0;
 
 /**********************************************************************
   4-way chacha20
diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S
index a801a5e5..16502b4a 100644
--- a/cipher/cipher-gcm-armv7-neon.S
+++ b/cipher/cipher-gcm-armv7-neon.S
@@ -210,7 +210,7 @@ gcry_gcm_reduction_constant:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S
index 1de66a16..fb51b339 100644
--- a/cipher/cipher-gcm-armv8-aarch32-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -180,7 +180,7 @@ gcry_gcm_reduction_constant:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S
index 877207d3..13ee83ed 100644
--- a/cipher/cipher-gcm-armv8-aarch64-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch64-ce.S
@@ -149,7 +149,7 @@ gcry_gcm_reduction_constant:
 #define _(...) __VA_ARGS__
 #define __ _()
 
-#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+#define CLEAR_REG(reg) movi reg.16b, #0;
 
 #define VPUSH_ABI \
         stp d8, d9, [sp, #-16]!; \
diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 6d78af0a..1eafa93e 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -249,7 +249,7 @@
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/sha1-armv7-neon.S b/cipher/sha1-armv7-neon.S
index 61cc541c..2de678b8 100644
--- a/cipher/sha1-armv7-neon.S
+++ b/cipher/sha1-armv7-neon.S
@@ -303,7 +303,7 @@ gcry_sha1_armv7_neon_K_VEC:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/sha1-armv8-aarch32-ce.S b/cipher/sha1-armv8-aarch32-ce.S
index bf2b233b..059b9a85 100644
--- a/cipher/sha1-armv8-aarch32-ce.S
+++ b/cipher/sha1-armv8-aarch32-ce.S
@@ -100,7 +100,7 @@ gcry_sha1_aarch32_ce_K_VEC:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S
index 223268ca..8ea1486b 100644
--- a/cipher/sha1-armv8-aarch64-ce.S
+++ b/cipher/sha1-armv8-aarch64-ce.S
@@ -88,7 +88,7 @@ gcry_sha1_aarch64_ce_K_VEC:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+#define CLEAR_REG(reg) movi reg.16b, #0;
 
 
 /*
diff --git a/cipher/sha256-armv8-aarch32-ce.S b/cipher/sha256-armv8-aarch32-ce.S
index 2b17ab1b..95778b40 100644
--- a/cipher/sha256-armv8-aarch32-ce.S
+++ b/cipher/sha256-armv8-aarch32-ce.S
@@ -111,7 +111,7 @@ gcry_sha256_aarch32_ce_K:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) veor reg, reg;
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
 
 
 /*
diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S
index f57cae29..5c39e83e 100644
--- a/cipher/sha256-armv8-aarch64-ce.S
+++ b/cipher/sha256-armv8-aarch64-ce.S
@@ -98,7 +98,7 @@ gcry_sha256_aarch64_ce_K:
 
 /* Other functional macros */
 
-#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
+#define CLEAR_REG(reg) movi reg.16b, #0;
 
 
 /*
diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S
index 6596f2cd..2b186b47 100644
--- a/cipher/sha512-armv7-neon.S
+++ b/cipher/sha512-armv7-neon.S
@@ -91,6 +91,8 @@
 #define RW1213q q14
 #define RW1415q q15
 
+#define CLEAR_REG(reg) vmov.i8 reg, #0;
+
 /***********************************************************************
  * ARM assembly implementation of sha512 transform
  ***********************************************************************/
@@ -426,22 +428,22 @@ _gcry_sha512_transform_armv7_neon:
 
 	/* Clear used registers */
 	/* d16-d31 */
-	veor.u64 RW01q, RW01q;
-	veor.u64 RW23q, RW23q;
-	veor.u64 RW45q, RW45q;
-	veor.u64 RW67q, RW67q;
+	CLEAR_REG(RW01q);
+	CLEAR_REG(RW23q);
+	CLEAR_REG(RW45q);
+	CLEAR_REG(RW67q);
 	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
-	veor.u64 RW89q, RW89q;
-	veor.u64 RW1011q, RW1011q;
-	veor.u64 RW1213q, RW1213q;
-	veor.u64 RW1415q, RW1415q;
+	CLEAR_REG(RW89q);
+	CLEAR_REG(RW1011q);
+	CLEAR_REG(RW1213q);
+	CLEAR_REG(RW1415q);
 	/* d8-d15 */
 	vpop {RT0-RT7};
 	/* d0-d7 (q0-q3) */
-	veor.u64 %q0, %q0;
-	veor.u64 %q1, %q1;
-	veor.u64 %q2, %q2;
-	veor.u64 %q3, %q3;
+	CLEAR_REG(%q0);
+	CLEAR_REG(%q1);
+	CLEAR_REG(%q2);
+	CLEAR_REG(%q3);
 
 	eor %r0, %r0;
 	pop {%pc};
-- 
2.32.0




More information about the Gcrypt-devel mailing list