[PATCH 8/8] camellia-aesni-avx: speed up round key broadcasting
Jussi Kivilinna
jussi.kivilinna at iki.fi
Wed Feb 22 20:29:24 CET 2023
* cipher/camellia-aesni-avx-amd64.S (roundsm16, fls16): Broadcast
round key bytes directly with 'vpshufb'.
--
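The previous code broadcast each round key byte with a serial
'vpsrldq' + 'vpshufb' (zero-mask) pair: first shift the wanted byte
down to index 0, then splat index 0 to all lanes. Replacing each pair
with a single 'vpshufb' that uses a constant per-byte-index mask
(.Lbyte_ones ... .Lbyte_sevens) selects the wanted byte directly,
shortening the dependency chain through the byte shifts in 'roundsm16'
and 'fls16'.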
Benchmark on AMD Ryzen 9 7900X (turbo-freq off):
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.837 ns/B 1139 MiB/s 3.94 c/B 4700
ECB dec | 0.839 ns/B 1137 MiB/s 3.94 c/B 4700
After (~3% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
ECB enc | 0.808 ns/B 1180 MiB/s 3.80 c/B 4700
ECB dec | 0.810 ns/B 1177 MiB/s 3.81 c/B 4700
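For reference, here is a minimal standalone sketch of the two broadcast
strategies in C intrinsics. It is not part of the patch; the function
names are made up for illustration, and the intrinsics correspond to
the vpsrldq/vpshufb instructions in the diff (byte index 3 is used as
the example, matching .Lbyte_threes):

  #include <stdio.h>
  #include <string.h>
  #include <immintrin.h>

  /* Old approach: shift the desired byte down to index 0 with a byte
   * shift, then broadcast index 0 with an all-zero shuffle mask.
   * Two dependent instructions per broadcast key byte. */
  static __m128i broadcast_byte3_old(__m128i key)
  {
    __m128i t = _mm_srli_si128(key, 3);               /* vpsrldq $3 */
    return _mm_shuffle_epi8(t, _mm_setzero_si128());  /* vpshufb zero-mask */
  }

  /* New approach: one shuffle with a constant mask whose every byte
   * is 3 picks key byte 3 directly (cf. .Lbyte_threes in the patch). */
  static __m128i broadcast_byte3_new(__m128i key)
  {
    return _mm_shuffle_epi8(key, _mm_set1_epi8(3));   /* vpshufb mask */
  }

  int main(void)
  {
    __m128i key = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                               7, 6, 5, 4, 3, 2, 1, 0);
    unsigned char a[16], b[16];
    _mm_storeu_si128((__m128i *)a, broadcast_byte3_old(key));
    _mm_storeu_si128((__m128i *)b, broadcast_byte3_new(key));
    printf("equal: %d\n", memcmp(a, b, 16) == 0);  /* prints "equal: 1" */
    return 0;
  }

Building with e.g. 'gcc -mssse3' and running should print "equal: 1",
showing both forms yield the same broadcast result while the new form
needs only one instruction and no dependent shift.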
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-aesni-avx-amd64.S | 89 ++++++++++++++++---------------
1 file changed, 47 insertions(+), 42 deletions(-)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 5ec33b9b..76e62ea8 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
*
- * Copyright (C) 2013-2015,2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015,2020,2023 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -121,25 +121,14 @@
filter_8bit(x2, t2, t3, t7, t6); \
filter_8bit(x5, t2, t3, t7, t6); \
\
- vpxor t6, t6, t6; \
vmovq key, t0; \
\
/* postfilter sbox 2 */ \
filter_8bit(x1, t4, t5, t7, t2); \
filter_8bit(x4, t4, t5, t7, t2); \
\
- vpsrldq $5, t0, t5; \
- vpsrldq $1, t0, t1; \
- vpsrldq $2, t0, t2; \
- vpsrldq $3, t0, t3; \
- vpsrldq $4, t0, t4; \
- vpshufb t6, t0, t0; \
- vpshufb t6, t1, t1; \
- vpshufb t6, t2, t2; \
- vpshufb t6, t3, t3; \
- vpshufb t6, t4, t4; \
- vpsrldq $2, t5, t7; \
- vpshufb t6, t7, t7; \
+ vpshufb .Lbyte_threes rRIP, t0, t3; \
+ vpshufb .Lbyte_twos rRIP, t0, t2; \
\
/* P-function */ \
vpxor x5, x0, x0; \
@@ -147,16 +136,23 @@
vpxor x7, x2, x2; \
vpxor x4, x3, x3; \
\
+ vpshufb .Lbyte_ones rRIP, t0, t1; \
+ vpshufb .Lbyte_sevens rRIP, t0, t7; \
+ \
vpxor x2, x4, x4; \
vpxor x3, x5, x5; \
vpxor x0, x6, x6; \
vpxor x1, x7, x7; \
\
+ vpshufb .Lbyte_sixs rRIP, t0, t6; \
+ vpshufb .Lbyte_fives rRIP, t0, t5; \
vpxor x7, x0, x0; \
vpxor x4, x1, x1; \
vpxor x5, x2, x2; \
vpxor x6, x3, x3; \
\
+ vpshufb .Lbyte_fours rRIP, t0, t4; \
+ \
vpxor x3, x4, x4; \
vpxor x0, x5, x5; \
vpxor x1, x6, x6; \
@@ -165,15 +161,14 @@
/* Add key material and result to CD (x becomes new CD) */ \
\
vpxor t3, x4, x4; \
+ vpxor t3, t3, t3; \
vpxor 0 * 16(mem_cd), x4, x4; \
\
+ vpshufb t3, t0, t0; \
+ \
vpxor t2, x5, x5; \
vpxor 1 * 16(mem_cd), x5, x5; \
\
- vpsrldq $1, t5, t3; \
- vpshufb t6, t5, t5; \
- vpshufb t6, t3, t6; \
- \
vpxor t1, x6, x6; \
vpxor 2 * 16(mem_cd), x6, x6; \
\
@@ -294,12 +289,9 @@
vpxor tt0, tt0, tt0; \
vmovd kll, t0; \
vpshufb tt0, t0, t3; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t2; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t1; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t0; \
+ vpshufb .Lbyte_ones rRIP, t0, t2; \
+ vpshufb .Lbyte_twos rRIP, t0, t1; \
+ vpshufb .Lbyte_threes rRIP, t0, t0; \
\
vpand l0, t0, t0; \
vpand l1, t1, t1; \
@@ -325,12 +317,9 @@
\
vmovd krr, t0; \
vpshufb tt0, t0, t3; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t2; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t1; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t0; \
+ vpshufb .Lbyte_ones rRIP, t0, t2; \
+ vpshufb .Lbyte_twos rRIP, t0, t1; \
+ vpshufb .Lbyte_threes rRIP, t0, t0; \
\
vpor 4 * 16(r), t0, t0; \
vpor 5 * 16(r), t1, t1; \
@@ -353,12 +342,9 @@
*/ \
vmovd krl, t0; \
vpshufb tt0, t0, t3; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t2; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t1; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t0; \
+ vpshufb .Lbyte_ones rRIP, t0, t2; \
+ vpshufb .Lbyte_twos rRIP, t0, t1; \
+ vpshufb .Lbyte_threes rRIP, t0, t0; \
\
vpand 0 * 16(r), t0, t0; \
vpand 1 * 16(r), t1, t1; \
@@ -384,12 +370,9 @@
\
vmovd klr, t0; \
vpshufb tt0, t0, t3; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t2; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t1; \
- vpsrldq $1, t0, t0; \
- vpshufb tt0, t0, t0; \
+ vpshufb .Lbyte_ones rRIP, t0, t2; \
+ vpshufb .Lbyte_twos rRIP, t0, t1; \
+ vpshufb .Lbyte_threes rRIP, t0, t0; \
\
vpor l4, t0, t0; \
vpor l5, t1, t1; \
@@ -637,6 +620,28 @@ _camellia_aesni_avx_data:
.long 0x80808080
.long 0x80808080
+.Lbyte_ones:
+ .quad 1 * 0x0101010101010101
+ .quad 1 * 0x0101010101010101
+.Lbyte_twos:
+ .quad 2 * 0x0101010101010101
+ .quad 2 * 0x0101010101010101
+.Lbyte_threes:
+ .quad 3 * 0x0101010101010101
+ .quad 3 * 0x0101010101010101
+.Lbyte_fours:
+ .quad 4 * 0x0101010101010101
+ .quad 4 * 0x0101010101010101
+.Lbyte_fives:
+ .quad 5 * 0x0101010101010101
+ .quad 5 * 0x0101010101010101
+.Lbyte_sixs:
+ .quad 6 * 0x0101010101010101
+ .quad 6 * 0x0101010101010101
+.Lbyte_sevens:
+ .quad 7 * 0x0101010101010101
+ .quad 7 * 0x0101010101010101
+
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
--
2.37.2