[PATCH 2/3] camellia-simd128: faster sbox filtering with uint8 right shift
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu Mar 2 12:58:59 CET 2023
* cipher/camellia-simd128.h (if_vpsrlb128)
(if_not_vpsrlb128): New.
(filter_8bit): Use 'vpsrlb128' when available on target
architecture (PowerPC and AArch64).
--
Benchmark on POWER9:
Before:
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 3.26 ns/B 292.8 MiB/s 7.49 c/B
ECB dec | 3.29 ns/B 290.0 MiB/s 7.56 c/B
After (~3% faster):
CAMELLIA128 | nanosecs/byte mebibytes/sec cycles/byte
ECB enc | 3.16 ns/B 301.4 MiB/s 7.28 c/B
ECB dec | 3.19 ns/B 298.7 MiB/s 7.34 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/camellia-simd128.h | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/cipher/camellia-simd128.h b/cipher/camellia-simd128.h
index 9cb7b987..6b44961f 100644
--- a/cipher/camellia-simd128.h
+++ b/cipher/camellia-simd128.h
@@ -91,6 +91,8 @@ asm_sbox_be(uint8x16_t b)
o = (__m128i)vec_sld((uint8x16_t)a, \
(uint8x16_t)__tmp, (s) & 15);})
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
@@ -182,6 +184,8 @@ static const uint8x16_t shift_row =
o = (__m128i)vextq_u8((uint8x16_t)__tmp, \
(uint8x16_t)a, (16 - (s)) & 15);})
+#define if_vpsrlb128(...) __VA_ARGS__
+#define if_not_vpsrlb128(...) /*_*/
#define vpsrl_byte_128(s, a, o) vpsrlb128(s, a, o)
#define vpsll_byte_128(s, a, o) vpsllb128(s, a, o)
@@ -253,6 +257,8 @@ static const uint8x16_t shift_row =
#define vpsrldq128(s, a, o) (o = _mm_srli_si128(a, s))
#define vpslldq128(s, a, o) (o = _mm_slli_si128(a, s))
+#define if_vpsrlb128(...) /*_*/
+#define if_not_vpsrlb128(...) __VA_ARGS__
#define vpsrl_byte_128(s, a, o) vpsrld128(s, a, o)
#define vpsll_byte_128(s, a, o) vpslld128(s, a, o)
@@ -309,8 +315,9 @@ static const uint8x16_t shift_row =
**********************************************************************/
#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
vpand128(x, mask4bit, tmp0); \
- vpandn128(x, mask4bit, x); \
- vpsrl_byte_128(4, x, x); \
+ if_vpsrlb128(vpsrlb128(4, x, x)); \
+ if_not_vpsrlb128(vpandn128(x, mask4bit, x)); \
+ if_not_vpsrlb128(vpsrld128(4, x, x)); \
\
vpshufb128(tmp0, lo_t, tmp0); \
vpshufb128(x, hi_t, x); \
--
2.37.2
More information about the Gcrypt-devel mailing list