[PATCH] twofish-avx2-amd64: replace VPGATHER with manual gather
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Aug 13 14:40:25 CEST 2023
* cipher/twofish-avx2-amd64.S (do_gather): New.
(g16): Switch to use 'do_gather' instead of VPGATHER instruction.
(__twofish_enc_blk16, __twofish_dec_blk16): Prepare stack
for 'do_gather'.
* cipher/twofish.c (twofish_setkey): Do not require
HWF_INTEL_FAST_VPGATHER for enabling the AVX2 implementation.
--
As VPGATHER is now slow on the majority of CPUs (because of the
"Downfall" mitigation microcode), switch the twofish-avx2
implementation to use manual memory gathering instead.
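
For readers who want the idea without parsing the assembly first, here is a
minimal C-intrinsics sketch contrasting the two approaches. It is illustrative
only, not the patch code: the 'table' pointer and helper names are assumptions,
and the real 'do_gather' macro additionally extracts one index byte per 32-bit
lane from a vector spilled to the stack.

/* Illustrative sketch only (not the patch code): hardware VPGATHER
 * versus a manual gather built from scalar loads and inserts. */
#include <immintrin.h>
#include <stdint.h>

/* Hardware gather: one vpgatherdd, slow on "Downfall"-mitigated CPUs. */
static __m256i gather_hw(const uint32_t *table, __m256i idx)
{
  return _mm256_i32gather_epi32((const int *)table, idx, 4);
}

/* Manual gather: spill the indices, do eight scalar table loads, and
 * rebuild the vector with vpinsrd/vinserti128, as 'do_gather' does. */
static __m256i gather_manual(const uint32_t *table, __m256i idx)
{
  uint32_t i[8] __attribute__((aligned(32)));
  __m128i lo, hi;

  _mm256_store_si256((__m256i *)i, idx);

  lo = _mm_cvtsi32_si128((int)table[i[0]]);
  lo = _mm_insert_epi32(lo, (int)table[i[1]], 1);
  lo = _mm_insert_epi32(lo, (int)table[i[2]], 2);
  lo = _mm_insert_epi32(lo, (int)table[i[3]], 3);

  hi = _mm_cvtsi32_si128((int)table[i[4]]);
  hi = _mm_insert_epi32(hi, (int)table[i[5]], 1);
  hi = _mm_insert_epi32(hi, (int)table[i[6]], 2);
  hi = _mm_insert_epi32(hi, (int)table[i[7]], 3);

  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
}

Both helpers return the same eight 32-bit table entries; only the access
pattern differs, and that is exactly what the mitigation microcode penalizes.
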
Benchmark on Intel Core i3-1115G4 (tigerlake, with "Downfall"-mitigated microcode):

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      7.00 ns/B     136.3 MiB/s     28.62 c/B      4089
        ECB dec |      7.00 ns/B     136.2 MiB/s     28.64 c/B      4090

After (~3.1x faster):
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      2.20 ns/B     433.7 MiB/s      8.99 c/B      4090
        ECB dec |      2.20 ns/B     433.7 MiB/s      8.99 c/B      4089

Benchmark on AMD Ryzen 9 7900X (zen4, did not suffer from "Downfall"):

Before:
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.91 ns/B     499.0 MiB/s      8.98 c/B      4700
        ECB dec |      1.90 ns/B     500.7 MiB/s      8.95 c/B      4700

After (~6% faster):
 TWOFISH        |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      1.78 ns/B     534.7 MiB/s      8.38 c/B      4700
        ECB dec |      1.79 ns/B     533.7 MiB/s      8.40 c/B      4700
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/twofish-avx2-amd64.S | 168 ++++++++++++++++++++++++------------
cipher/twofish.c | 6 +-
2 files changed, 113 insertions(+), 61 deletions(-)
diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S
index d05ec1f9..2207ac57 100644
--- a/cipher/twofish-avx2-amd64.S
+++ b/cipher/twofish-avx2-amd64.S
@@ -39,14 +39,20 @@
/* register macros */
#define CTX %rdi
-#define RROUND %r12
-#define RROUNDd %r12d
+#define RROUND %r13
+#define RROUNDd %r13d
#define RS0 CTX
#define RS1 %r8
#define RS2 %r9
#define RS3 %r10
#define RK %r11
-#define RW %rax
+#define RW %r12
+#define RIDX0 %rax
+#define RIDX0d %eax
+#define RIDX1 %rbx
+#define RIDX1d %ebx
+#define RIDX2 %r14
+#define RIDX3 %r15
#define RA0 %ymm8
#define RB0 %ymm9
@@ -63,14 +69,14 @@
#define RX1 %ymm2
#define RY1 %ymm3
#define RT0 %ymm4
-#define RIDX %ymm5
+#define RT1 %ymm5
#define RX0x %xmm0
#define RY0x %xmm1
#define RX1x %xmm2
#define RY1x %xmm3
#define RT0x %xmm4
-#define RIDXx %xmm5
+#define RT1x %xmm5
#define RTMP0 RX0
#define RTMP0x RX0x
@@ -80,8 +86,8 @@
#define RTMP2x RY0x
#define RTMP3 RY1
#define RTMP3x RY1x
-#define RTMP4 RIDX
-#define RTMP4x RIDXx
+#define RTMP4 RT1
+#define RTMP4x RT1x
/* vpgatherdd mask and '-1' */
#define RNOT %ymm6
@@ -102,48 +108,42 @@
leaq s2(CTX), RS2; \
leaq s3(CTX), RS3; \
+#define do_gather(stoffs, byteoffs, rs, out) \
+ movzbl (stoffs + 0*4 + byteoffs)(%rsp), RIDX0d; \
+ movzbl (stoffs + 1*4 + byteoffs)(%rsp), RIDX1d; \
+ movzbq (stoffs + 2*4 + byteoffs)(%rsp), RIDX2; \
+ movzbq (stoffs + 3*4 + byteoffs)(%rsp), RIDX3; \
+ vmovd (rs, RIDX0, 4), RT1x; \
+ vpinsrd $1, (rs, RIDX1, 4), RT1x, RT1x; \
+ vpinsrd $2, (rs, RIDX2, 4), RT1x, RT1x; \
+ vpinsrd $3, (rs, RIDX3, 4), RT1x, RT1x; \
+ movzbl (stoffs + 4*4 + byteoffs)(%rsp), RIDX0d; \
+ movzbl (stoffs + 5*4 + byteoffs)(%rsp), RIDX1d; \
+ movzbq (stoffs + 6*4 + byteoffs)(%rsp), RIDX2; \
+ movzbq (stoffs + 7*4 + byteoffs)(%rsp), RIDX3; \
+ vmovd (rs, RIDX0, 4), RT0x; \
+ vpinsrd $1, (rs, RIDX1, 4), RT0x, RT0x; \
+ vpinsrd $2, (rs, RIDX2, 4), RT0x, RT0x; \
+ vpinsrd $3, (rs, RIDX3, 4), RT0x, RT0x; \
+ vinserti128 $1, RT0x, RT1, out;
+
#define g16(ab, rs0, rs1, rs2, rs3, xy) \
- vpand RBYTE, ab ## 0, RIDX; \
- vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- \
- vpand RBYTE, ab ## 1, RIDX; \
- vpgatherdd RNOT, (rs0, RIDX, 4), xy ## 1; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- \
- vpsrld $8, ab ## 0, RIDX; \
- vpand RBYTE, RIDX, RIDX; \
- vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 0, xy ## 0; \
- \
- vpsrld $8, ab ## 1, RIDX; \
- vpand RBYTE, RIDX, RIDX; \
- vpgatherdd RNOT, (rs1, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 1, xy ## 1; \
- \
- vpsrld $16, ab ## 0, RIDX; \
- vpand RBYTE, RIDX, RIDX; \
- vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 0, xy ## 0; \
- \
- vpsrld $16, ab ## 1, RIDX; \
- vpand RBYTE, RIDX, RIDX; \
- vpgatherdd RNOT, (rs2, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 1, xy ## 1; \
- \
- vpsrld $24, ab ## 0, RIDX; \
- vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 0, xy ## 0; \
- \
- vpsrld $24, ab ## 1, RIDX; \
- vpgatherdd RNOT, (rs3, RIDX, 4), RT0; \
- vpcmpeqd RNOT, RNOT, RNOT; \
- vpxor RT0, xy ## 1, xy ## 1;
+ vmovdqa ab ## 0, 0(%rsp); \
+ vmovdqa ab ## 1, 32(%rsp); \
+ do_gather(0*32, 0, rs0, xy ## 0); \
+ do_gather(1*32, 0, rs0, xy ## 1); \
+ do_gather(0*32, 1, rs1, RT1); \
+ vpxor RT1, xy ## 0, xy ## 0; \
+ do_gather(1*32, 1, rs1, RT1); \
+ vpxor RT1, xy ## 1, xy ## 1; \
+ do_gather(0*32, 2, rs2, RT1); \
+ vpxor RT1, xy ## 0, xy ## 0; \
+ do_gather(1*32, 2, rs2, RT1); \
+ vpxor RT1, xy ## 1, xy ## 1; \
+ do_gather(0*32, 3, rs3, RT1); \
+ vpxor RT1, xy ## 0, xy ## 0; \
+ do_gather(1*32, 3, rs3, RT1); \
+ vpxor RT1, xy ## 1, xy ## 1;
#define g1_16(a, x) \
g16(a, RS0, RS1, RS2, RS3, x);
@@ -375,8 +375,23 @@ __twofish_enc_blk16:
*/
CFI_STARTPROC();
- pushq RROUND;
- CFI_PUSH(RROUND);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+ subq $(64 + 5 * 8), %rsp;
+ andq $-64, %rsp;
+
+ movq %rbx, (64 + 0 * 8)(%rsp);
+ movq %r12, (64 + 1 * 8)(%rsp);
+ movq %r13, (64 + 2 * 8)(%rsp);
+ movq %r14, (64 + 3 * 8)(%rsp);
+ movq %r15, (64 + 4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, 64 + 0 * 8);
+ CFI_REG_ON_STACK(r12, 64 + 1 * 8);
+ CFI_REG_ON_STACK(r13, 64 + 2 * 8);
+ CFI_REG_ON_STACK(r14, 64 + 3 * 8);
+ CFI_REG_ON_STACK(r15, 64 + 4 * 8);
init_round_constants();
@@ -400,8 +415,21 @@ __twofish_enc_blk16:
outunpack_enc16(RA, RB, RC, RD);
transpose4x4_16(RA, RB, RC, RD);
- popq RROUND;
- CFI_POP(RROUND);
+ movq (64 + 0 * 8)(%rsp), %rbx;
+ movq (64 + 1 * 8)(%rsp), %r12;
+ movq (64 + 2 * 8)(%rsp), %r13;
+ movq (64 + 3 * 8)(%rsp), %r14;
+ movq (64 + 4 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ vpxor RT0, RT0, RT0;
+ vmovdqa RT0, 0(%rsp);
+ vmovdqa RT0, 32(%rsp);
+ leave;
+ CFI_LEAVE();
ret_spec_stop;
CFI_ENDPROC();
@@ -420,8 +448,23 @@ __twofish_dec_blk16:
*/
CFI_STARTPROC();
- pushq RROUND;
- CFI_PUSH(RROUND);
+ pushq %rbp;
+ CFI_PUSH(%rbp);
+ movq %rsp, %rbp;
+ CFI_DEF_CFA_REGISTER(%rbp);
+ subq $(64 + 5 * 8), %rsp;
+ andq $-64, %rsp;
+
+ movq %rbx, (64 + 0 * 8)(%rsp);
+ movq %r12, (64 + 1 * 8)(%rsp);
+ movq %r13, (64 + 2 * 8)(%rsp);
+ movq %r14, (64 + 3 * 8)(%rsp);
+ movq %r15, (64 + 4 * 8)(%rsp);
+ CFI_REG_ON_STACK(rbx, 64 + 0 * 8);
+ CFI_REG_ON_STACK(r12, 64 + 1 * 8);
+ CFI_REG_ON_STACK(r13, 64 + 2 * 8);
+ CFI_REG_ON_STACK(r14, 64 + 3 * 8);
+ CFI_REG_ON_STACK(r15, 64 + 4 * 8);
init_round_constants();
@@ -444,8 +487,21 @@ __twofish_dec_blk16:
outunpack_dec16(RA, RB, RC, RD);
transpose4x4_16(RA, RB, RC, RD);
- popq RROUND;
- CFI_POP(RROUND);
+ movq (64 + 0 * 8)(%rsp), %rbx;
+ movq (64 + 1 * 8)(%rsp), %r12;
+ movq (64 + 2 * 8)(%rsp), %r13;
+ movq (64 + 3 * 8)(%rsp), %r14;
+ movq (64 + 4 * 8)(%rsp), %r15;
+ CFI_RESTORE(%rbx);
+ CFI_RESTORE(%r12);
+ CFI_RESTORE(%r13);
+ CFI_RESTORE(%r14);
+ CFI_RESTORE(%r15);
+ vpxor RT0, RT0, RT0;
+ vmovdqa RT0, 0(%rsp);
+ vmovdqa RT0, 32(%rsp);
+ leave;
+ CFI_LEAVE();
ret_spec_stop;
CFI_ENDPROC();
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 74061913..11a6e251 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -767,11 +767,7 @@ twofish_setkey (void *context, const byte *key, unsigned int keylen,
rc = do_twofish_setkey (ctx, key, keylen);
#ifdef USE_AVX2
- ctx->use_avx2 = 0;
- if ((hwfeatures & HWF_INTEL_AVX2) && (hwfeatures & HWF_INTEL_FAST_VPGATHER))
- {
- ctx->use_avx2 = 1;
- }
+ ctx->use_avx2 = (hwfeatures & HWF_INTEL_AVX2) != 0;
#endif
/* Setup bulk encryption routines. */
--
2.39.2
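
For readers following the g16 macro above: independent of how the loads are
performed, each 32-bit lane it produces is the XOR of four table lookups, one
per byte of the input word (the four 'do_gather' calls at byte offsets 0..3
against RS0..RS3). A scalar sketch of that per-lane computation, with assumed
C-level names for the key-dependent tables:

#include <stdint.h>

/* Sketch (assumed names) of the value one lane of g16 assembles:
 * one table lookup per input byte, XORed together. */
static uint32_t g16_one_lane(const uint32_t *s0, const uint32_t *s1,
                             const uint32_t *s2, const uint32_t *s3,
                             uint32_t x)
{
  return s0[(x >>  0) & 0xff] ^
         s1[(x >>  8) & 0xff] ^
         s2[(x >> 16) & 0xff] ^
         s3[(x >> 24) & 0xff];
}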