[PATCH 2/3] camellia-simd128: faster sbox filtering with uint8 right shift

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Mar 2 12:58:59 CET 2023


* cipher/camellia-simd128.h (if_vpsrlb128)
(if_not_vpsrlb128): New.
(filter_8bit): Use 'vpsrlb128' when available on target
architecture (PowerPC and AArch64); the per-byte shift makes the
separate 'vpandn128' masking step unnecessary.
--

Benchmark on POWER9:

 Before:
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.26 ns/B     292.8 MiB/s      7.49 c/B
        ECB dec |      3.29 ns/B     290.0 MiB/s      7.56 c/B

 After (~3% faster):
 CAMELLIA128    |  nanosecs/byte   mebibytes/sec   cycles/byte
        ECB enc |      3.16 ns/B     301.4 MiB/s      7.28 c/B
        ECB dec |      3.19 ns/B     298.7 MiB/s      7.34 c/B

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/camellia-simd128.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
index 9cb7b987..6b44961f 100644
--- a/cipher/camellia-simd128.h
+++ b/cipher/camellia-simd128.h
@@ -91,6 +91,8 @@ asm_sbox_be(uint8x16_t b)
 				  o = (__m128i)vec_sld((uint8x16_t)a, \
 						       (uint8x16_t)__tmp, (s) & 15);})
 
+#define if_vpsrlb128(...)       __VA_ARGS__
+#define if_not_vpsrlb128(...)   /*_*/
 #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
 
@@ -182,6 +184,8 @@ static const uint8x16_t shift_row =
 				o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
 						      (uint8x16_t)a, (16 - (s)) & 15);})
 
+#define if_vpsrlb128(...)       __VA_ARGS__
+#define if_not_vpsrlb128(...)   /*_*/
 #define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
 
@@ -253,6 +257,8 @@ static const uint8x16_t shift_row =
 #define vpsrldq128(s, a, o)     (o = _mm_srli_si128(a, s))
 #define vpslldq128(s, a, o)     (o = _mm_slli_si128(a, s))
 
+#define if_vpsrlb128(...)       /*_*/
+#define if_not_vpsrlb128(...)   __VA_ARGS__
 #define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
 #define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
 
@@ -309,8 +315,9 @@ static const uint8x16_t shift_row =
  **********************************************************************/
 #define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
 	vpand128(x, mask4bit, tmp0); \
-	vpandn128(x, mask4bit, x); \
-	vpsrl_byte_128(4, x, x); \
+	if_vpsrlb128(vpsrlb128(4, x, x)); \
+	if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \
+	if_not_vpsrlb128(vpsrld128(4, x, x)); \
 	\
 	vpshufb128(tmp0, lo_t, tmp0); \
 	vpshufb128(x, hi_t, x); \
-- 
2.37.2




More information about the Gcrypt-devel mailing list