[PATCH 3/5] aria-avx: small optimization for aria_ark_8way

Taehee Yoo ap420073 at gmail.com
Mon Feb 20 11:49:19 CET 2023


On 2/19/23 17:49, Jussi Kivilinna wrote:

Hi Jussi,
Thank you so much for this optimization!

 > * cipher/aria-aesni-avx-amd64.S (aria_ark_8way): Use 'vmovd' for
 > loading key material and 'vpshufb' for broadcasting from byte
 > locations 3, 2, 1 and 0.

I tested this optimization in the kernel, and it works well :)
It will be helpful for the in-kernel aria-avx implementation too!

 > --
 >
 > Benchmark on AMD Ryzen 9 7900X (zen4, turbo-freq off):
 >
 >   Before (GFNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.516 ns/B      1847 MiB/s      2.43 c/B      4700
 >          ECB dec |     0.519 ns/B      1839 MiB/s      2.44 c/B      4700
 >          CTR enc |     0.517 ns/B      1846 MiB/s      2.43 c/B      4700
 >          CTR dec |     0.518 ns/B      1843 MiB/s      2.43 c/B      4700
 >
 >   After (GFNI/AVX, ~5% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.490 ns/B      1947 MiB/s      2.30 c/B      4700
 >          ECB dec |     0.490 ns/B      1946 MiB/s      2.30 c/B      4700
 >          CTR enc |     0.493 ns/B      1935 MiB/s      2.32 c/B      4700
 >          CTR dec |     0.493 ns/B      1934 MiB/s      2.32 c/B      4700
 >
 > ===
 >
 > Benchmark on Intel Core i3-1115G4 (tiger-lake, turbo-freq off):
 >
 >   Before (GFNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.967 ns/B     986.6 MiB/s      2.89 c/B      2992
 >          ECB dec |     0.966 ns/B     987.1 MiB/s      2.89 c/B      2992
 >          CTR enc |     0.972 ns/B     980.8 MiB/s      2.91 c/B      2993
 >          CTR dec |     0.971 ns/B     982.5 MiB/s      2.90 c/B      2993
 >
 >   After (GFNI/AVX, ~6% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.908 ns/B      1050 MiB/s      2.72 c/B      2992
 >          ECB dec |     0.903 ns/B      1056 MiB/s      2.70 c/B      2992
 >          CTR enc |     0.913 ns/B      1045 MiB/s      2.73 c/B      2992
 >          CTR dec |     0.910 ns/B      1048 MiB/s      2.72 c/B      2992
 >
 > ===
 >
 > Benchmark on AMD Ryzen 7 5800X (zen3, turbo-freq off):
 >
 >   Before (AESNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.921 ns/B      1035 MiB/s      3.50 c/B      3800
 >          ECB dec |     0.922 ns/B      1034 MiB/s      3.50 c/B      3800
 >          CTR enc |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
 >          CTR dec |     0.923 ns/B      1033 MiB/s      3.51 c/B      3800
 >
 >   After (AESNI/AVX, ~6% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |     0.862 ns/B      1106 MiB/s      3.28 c/B      3800
 >          ECB dec |     0.862 ns/B      1106 MiB/s      3.28 c/B      3800
 >          CTR enc |     0.865 ns/B      1102 MiB/s      3.29 c/B      3800
 >          CTR dec |     0.865 ns/B      1103 MiB/s      3.29 c/B      3800
 >
 > ===
 >
 > Benchmark on AMD EPYC 7642 (zen2):
 >
 >   Before (AESNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      1.22 ns/B     784.5 MiB/s      4.01 c/B      3298
 >          ECB dec |      1.22 ns/B     784.8 MiB/s      4.00 c/B      3292
 >          CTR enc |      1.22 ns/B     780.1 MiB/s      4.03 c/B      3299
 >          CTR dec |      1.22 ns/B     779.1 MiB/s      4.04 c/B      3299
 >
 >   After (AESNI/AVX, ~13% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      1.07 ns/B     888.3 MiB/s      3.54 c/B      3299
 >          ECB dec |      1.08 ns/B     885.3 MiB/s      3.55 c/B      3299
 >          CTR enc |      1.07 ns/B     888.7 MiB/s      3.54 c/B      3298
 >          CTR dec |      1.07 ns/B     887.4 MiB/s      3.55 c/B      3299
 >
 > ===
 >
 > Benchmark on Intel Core i5-6500 (skylake):
 >
 >   Before (AESNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      1.24 ns/B     766.6 MiB/s      4.48 c/B      3598
 >          ECB dec |      1.25 ns/B     764.9 MiB/s      4.49 c/B      3598
 >          CTR enc |      1.25 ns/B     761.7 MiB/s      4.50 c/B      3598
 >          CTR dec |      1.25 ns/B     761.6 MiB/s      4.51 c/B      3598
 >
 >   After (AESNI/AVX, ~1% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      1.22 ns/B     780.0 MiB/s      4.40 c/B      3598
 >          ECB dec |      1.22 ns/B     779.6 MiB/s      4.40 c/B      3598
 >          CTR enc |      1.23 ns/B     776.6 MiB/s      4.42 c/B      3598
 >          CTR dec |      1.23 ns/B     776.6 MiB/s      4.42 c/B      3598
 >
 > ===
 >
 > Benchmark on Intel Core i5-2450M (sandy-bridge, turbo-freq off):
 >
 >   Before (AESNI/AVX):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      2.11 ns/B     452.7 MiB/s      5.25 c/B      2494
 >          ECB dec |      2.10 ns/B     454.5 MiB/s      5.23 c/B      2494
 >          CTR enc |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
 >          CTR dec |      2.10 ns/B     453.2 MiB/s      5.25 c/B      2494
 >
 >   After (AESNI/AVX, ~4% faster):
 >   ARIA128        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 >          ECB enc |      2.00 ns/B     475.8 MiB/s      5.00 c/B      2494
 >          ECB dec |      2.00 ns/B     476.4 MiB/s      4.99 c/B      2494
 >          CTR enc |      2.01 ns/B     474.7 MiB/s      5.01 c/B      2494
 >          CTR dec |      2.01 ns/B     473.9 MiB/s      5.02 c/B      2494
 >
 > Cc: Taehee Yoo <ap420073 at gmail.com>
 > Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
 > ---
 >   cipher/aria-aesni-avx-amd64.S | 29 +++++++++++++++--------------
 >   1 file changed, 15 insertions(+), 14 deletions(-)
 >
 > diff --git a/cipher/aria-aesni-avx-amd64.S b/cipher/aria-aesni-avx-amd64.S
 > index 7274b80e..f0c72225 100644
 > --- a/cipher/aria-aesni-avx-amd64.S
 > +++ b/cipher/aria-aesni-avx-amd64.S
 > @@ -357,27 +357,21 @@
 >   		      t0, t1, t2, rk,			\
 >   		      idx, round)			\
 >   	/* AddRoundKey */                               \
 > -	vbroadcastss ((round * 16) + idx + 0)(rk), t0;	\
 > -	vpsrld $24, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vmovd ((round * 16) + idx + 0)(rk), t0;		\
 > +	vpshufb .Lthree_x16 rRIP, t0, t2;		\
 >   	vpxor t2, x0, x0;				\
 > -	vpsrld $16, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vpshufb .Ltwo_x16 rRIP, t0, t2;			\
 >   	vpxor t2, x1, x1;				\
 > -	vpsrld $8, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vpshufb .Lone_x16 rRIP, t0, t2;			\
 >   	vpxor t2, x2, x2;				\
 >   	vpshufb t1, t0, t2;				\
 >   	vpxor t2, x3, x3;				\
 > -	vbroadcastss ((round * 16) + idx + 4)(rk), t0;	\
 > -	vpsrld $24, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vmovd ((round * 16) + idx + 4)(rk), t0;		\
 > +	vpshufb .Lthree_x16 rRIP, t0, t2;		\
 >   	vpxor t2, x4, x4;				\
 > -	vpsrld $16, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vpshufb .Ltwo_x16 rRIP, t0, t2;			\
 >   	vpxor t2, x5, x5;				\
 > -	vpsrld $8, t0, t2;				\
 > -	vpshufb t1, t2, t2;				\
 > +	vpshufb .Lone_x16 rRIP, t0, t2;			\
 >   	vpxor t2, x6, x6;				\
 >   	vpshufb t1, t0, t2;				\
 >   	vpxor t2, x7, x7;
 > @@ -858,6 +852,13 @@ SECTION_RODATA
 >   .Ltf_hi__x2__and__fwd_aff:
 >   	.octa 0x3F893781E95FE1576CDA64D2BA0CB204
 >
 > +.Lthree_x16:
 > +	.byte 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3
 > +.Ltwo_x16:
 > +	.byte 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
 > +.Lone_x16:
 > +	.byte 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 > +
 >   .Lbige_addb_1:
 >   	.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1
 >   .Lbige_addb_2:

Thanks a lot!
Taehee Yoo



More information about the Gcrypt-devel mailing list