[PATCH] rijndael-aesni: use inline checksumming for OCB decryption
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun May 28 16:53:55 CEST 2023
* cipher/rijndael-aesni.c (aesni_ocb_checksum): Remove.
(aesni_ocb_dec): Add inline checksumming.
--
Inline checksumming is far faster than two-pass checksumming on Ryzen
processors in i386 builds.
Benchmark on AMD Ryzen 9 7900X (i386):
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
OCB dec | 0.180 ns/B 5292 MiB/s 0.847 c/B 4700
After (~2x faster):
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
OCB dec | 0.091 ns/B 10491 MiB/s 0.427 c/B 4700
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/rijndael-aesni.c | 220 ++++++++--------------------------------
1 file changed, 43 insertions(+), 177 deletions(-)
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index 906737a6..b33ef7ed 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -2710,174 +2710,6 @@ _gcry_aes_aesni_cbc_dec (RIJNDAEL_context *ctx, unsigned char *iv,
}
-static ASM_FUNC_ATTR_INLINE void
-aesni_ocb_checksum (gcry_cipher_hd_t c, const unsigned char *plaintext,
- size_t nblocks)
-{
- RIJNDAEL_context *ctx = (void *)&c->context.c;
-
- /* Calculate checksum */
- asm volatile ("movdqu %[checksum], %%xmm6\n\t"
- "pxor %%xmm1, %%xmm1\n\t"
- "pxor %%xmm2, %%xmm2\n\t"
- "pxor %%xmm3, %%xmm3\n\t"
- :
- :[checksum] "m" (*c->u_ctr.ctr)
- : "memory" );
-
- if (0) {}
-#if defined(HAVE_GCC_INLINE_ASM_AVX2)
- else if (nblocks >= 16 && ctx->use_avx2)
- {
- /* Use wider 256-bit registers for fast xoring of plaintext. */
- asm volatile ("vzeroupper\n\t"
- "vpxor %%xmm0, %%xmm0, %%xmm0\n\t"
- "vpxor %%xmm4, %%xmm4, %%xmm4\n\t"
- "vpxor %%xmm5, %%xmm5, %%xmm5\n\t"
- "vpxor %%xmm7, %%xmm7, %%xmm7\n\t"
- :
- :
- : "memory");
-
- for (;nblocks >= 16; nblocks -= 16)
- {
- asm volatile ("vpxor %[ptr0], %%ymm6, %%ymm6\n\t"
- "vpxor %[ptr1], %%ymm1, %%ymm1\n\t"
- "vpxor %[ptr2], %%ymm2, %%ymm2\n\t"
- "vpxor %[ptr3], %%ymm3, %%ymm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
- : "memory" );
- asm volatile ("vpxor %[ptr4], %%ymm0, %%ymm0\n\t"
- "vpxor %[ptr5], %%ymm4, %%ymm4\n\t"
- "vpxor %[ptr6], %%ymm5, %%ymm5\n\t"
- "vpxor %[ptr7], %%ymm7, %%ymm7\n\t"
- :
- : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
- [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
- [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
- [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
- : "memory" );
- plaintext += BLOCKSIZE * 16;
- }
-
- asm volatile ("vpxor %%ymm0, %%ymm6, %%ymm6\n\t"
- "vpxor %%ymm4, %%ymm1, %%ymm1\n\t"
- "vpxor %%ymm5, %%ymm2, %%ymm2\n\t"
- "vpxor %%ymm7, %%ymm3, %%ymm3\n\t"
- "vextracti128 $1, %%ymm6, %%xmm0\n\t"
- "vextracti128 $1, %%ymm1, %%xmm4\n\t"
- "vextracti128 $1, %%ymm2, %%xmm5\n\t"
- "vextracti128 $1, %%ymm3, %%xmm7\n\t"
- "vpxor %%xmm0, %%xmm6, %%xmm6\n\t"
- "vpxor %%xmm4, %%xmm1, %%xmm1\n\t"
- "vpxor %%xmm5, %%xmm2, %%xmm2\n\t"
- "vpxor %%xmm7, %%xmm3, %%xmm3\n\t"
- "vzeroupper\n\t"
- :
- :
- : "memory" );
- }
-#endif
-#if defined(HAVE_GCC_INLINE_ASM_AVX)
- else if (nblocks >= 16 && ctx->use_avx)
- {
- /* Same as AVX2, except using 256-bit floating point instructions. */
- asm volatile ("vzeroupper\n\t"
- "vxorpd %%xmm0, %%xmm0, %%xmm0\n\t"
- "vxorpd %%xmm4, %%xmm4, %%xmm4\n\t"
- "vxorpd %%xmm5, %%xmm5, %%xmm5\n\t"
- "vxorpd %%xmm7, %%xmm7, %%xmm7\n\t"
- :
- :
- : "memory");
-
- for (;nblocks >= 16; nblocks -= 16)
- {
- asm volatile ("vxorpd %[ptr0], %%ymm6, %%ymm6\n\t"
- "vxorpd %[ptr1], %%ymm1, %%ymm1\n\t"
- "vxorpd %[ptr2], %%ymm2, %%ymm2\n\t"
- "vxorpd %[ptr3], %%ymm3, %%ymm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE * 2)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE * 2)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE * 2)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE * 2))
- : "memory" );
- asm volatile ("vxorpd %[ptr4], %%ymm0, %%ymm0\n\t"
- "vxorpd %[ptr5], %%ymm4, %%ymm4\n\t"
- "vxorpd %[ptr6], %%ymm5, %%ymm5\n\t"
- "vxorpd %[ptr7], %%ymm7, %%ymm7\n\t"
- :
- : [ptr4] "m" (*(plaintext + 4 * BLOCKSIZE * 2)),
- [ptr5] "m" (*(plaintext + 5 * BLOCKSIZE * 2)),
- [ptr6] "m" (*(plaintext + 6 * BLOCKSIZE * 2)),
- [ptr7] "m" (*(plaintext + 7 * BLOCKSIZE * 2))
- : "memory" );
- plaintext += BLOCKSIZE * 16;
- }
-
- asm volatile ("vxorpd %%ymm0, %%ymm6, %%ymm6\n\t"
- "vxorpd %%ymm4, %%ymm1, %%ymm1\n\t"
- "vxorpd %%ymm5, %%ymm2, %%ymm2\n\t"
- "vxorpd %%ymm7, %%ymm3, %%ymm3\n\t"
- "vextractf128 $1, %%ymm6, %%xmm0\n\t"
- "vextractf128 $1, %%ymm1, %%xmm4\n\t"
- "vextractf128 $1, %%ymm2, %%xmm5\n\t"
- "vextractf128 $1, %%ymm3, %%xmm7\n\t"
- "vxorpd %%xmm0, %%xmm6, %%xmm6\n\t"
- "vxorpd %%xmm4, %%xmm1, %%xmm1\n\t"
- "vxorpd %%xmm5, %%xmm2, %%xmm2\n\t"
- "vxorpd %%xmm7, %%xmm3, %%xmm3\n\t"
- "vzeroupper\n\t"
- :
- :
- : "memory" );
- }
-#endif
-
- for (;nblocks >= 4; nblocks -= 4)
- {
- asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
- "movdqu %[ptr1], %%xmm4\n\t"
- "movdqu %[ptr2], %%xmm5\n\t"
- "movdqu %[ptr3], %%xmm7\n\t"
- "pxor %%xmm0, %%xmm6\n\t"
- "pxor %%xmm4, %%xmm1\n\t"
- "pxor %%xmm5, %%xmm2\n\t"
- "pxor %%xmm7, %%xmm3\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE)),
- [ptr1] "m" (*(plaintext + 1 * BLOCKSIZE)),
- [ptr2] "m" (*(plaintext + 2 * BLOCKSIZE)),
- [ptr3] "m" (*(plaintext + 3 * BLOCKSIZE))
- : "memory" );
- plaintext += BLOCKSIZE * 4;
- }
-
- for (;nblocks >= 1; nblocks -= 1)
- {
- asm volatile ("movdqu %[ptr0], %%xmm0\n\t"
- "pxor %%xmm0, %%xmm6\n\t"
- :
- : [ptr0] "m" (*(plaintext + 0 * BLOCKSIZE))
- : "memory" );
- plaintext += BLOCKSIZE;
- }
-
- asm volatile ("pxor %%xmm1, %%xmm6\n\t"
- "pxor %%xmm2, %%xmm6\n\t"
- "pxor %%xmm3, %%xmm6\n\t"
- "movdqu %%xmm6, %[checksum]\n\t"
- : [checksum] "=m" (*c->u_ctr.ctr)
- :
- : "memory" );
-}
-
-
static unsigned int ASM_FUNC_ATTR_NOINLINE
aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
const void *inbuf_arg, size_t nblocks)
@@ -3401,9 +3233,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
/* Preload Offset */
asm volatile ("movdqu %[iv], %%xmm5\n\t"
- : /* No output */
- : [iv] "m" (*c->u_iv.iv)
- : "memory" );
+ "movdqu %[ctr], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
for ( ;nblocks && n % 4; nblocks-- )
{
@@ -3424,6 +3258,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %%xmm5, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
: [outbuf] "=m" (*outbuf)
:
: "memory" );
@@ -3452,6 +3287,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %[first_key], %%xmm5\n\t"
"pxor %[first_key], %%xmm0\n\t"
"movdqa %%xmm0, %[lxfkey]\n\t"
+ /* Clear plaintext blocks */
+ "pxor %%xmm1, %%xmm1\n\t"
+ "pxor %%xmm2, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm9\n\t"
+ "pxor %%xmm10, %%xmm10\n\t"
+ "pxor %%xmm11, %%xmm11\n\t"
: [lxfkey] "=m" (*lxf_key)
: [l0] "m" (*c->u_mode.ocb.L[0]),
[last_key] "m" (ctx->keyschdec[ctx->rounds][0][0]),
@@ -3463,7 +3307,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
n += 4;
l = aes_ocb_get_l(c, n);
- asm volatile ("movdqu %[l0l1], %%xmm10\n\t"
+ asm volatile ("pxor %%xmm10, %%xmm1\n\t"
+ "pxor %%xmm11, %%xmm2\n\t"
+ "movdqu %[l0l1], %%xmm10\n\t"
"movdqu %[l1], %%xmm11\n\t"
"movdqu %[l3], %%xmm15\n\t"
:
@@ -3477,7 +3323,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
- asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
+ asm volatile ("pxor %%xmm1, %%xmm4\n\t"
+ "pxor %%xmm2, %%xmm8\n\t"
+ "pxor %%xmm3, %%xmm9\n\t"
+ "movdqu %[inbuf0], %%xmm1\n\t"
"movdqu %[inbuf1], %%xmm2\n\t"
"movdqu %[inbuf2], %%xmm3\n\t"
:
@@ -3485,8 +3334,11 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
[inbuf1] "m" (*(inbuf + 1 * BLOCKSIZE)),
[inbuf2] "m" (*(inbuf + 2 * BLOCKSIZE))
: "memory" );
- asm volatile ("movdqu %[inbuf3], %%xmm4\n\t"
+ asm volatile ("pxor %%xmm4, %%xmm7\n\t"
+ "movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm7\n\t"
"movdqu %[inbuf4], %%xmm8\n\t"
+ "pxor %%xmm9, %%xmm7\n\t"
"movdqu %[inbuf5], %%xmm9\n\t"
:
: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE)),
@@ -3722,6 +3574,15 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %[first_key], %%xmm5\n\t"
"pxor %%xmm0, %%xmm0\n\t"
"movdqu %%xmm0, %[lxfkey]\n\t"
+ /* Add plaintext blocks to checksum */
+ "pxor %%xmm1, %%xmm2\n\t"
+ "pxor %%xmm3, %%xmm4\n\t"
+ "pxor %%xmm9, %%xmm8\n\t"
+ "pxor %%xmm11, %%xmm10\n\t"
+ "pxor %%xmm2, %%xmm4\n\t"
+ "pxor %%xmm8, %%xmm10\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
+ "pxor %%xmm10, %%xmm7\n\t"
: [lxfkey] "=m" (*lxf_key)
: [first_key] "m" (ctx->keyschdec[0][0][0])
: "memory" );
@@ -3782,8 +3643,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %[tmpbuf0],%%xmm1\n\t"
"movdqu %%xmm1, %[outbuf0]\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
"pxor %[tmpbuf1],%%xmm2\n\t"
"movdqu %%xmm2, %[outbuf1]\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
: [outbuf0] "=m" (*(outbuf + 0 * BLOCKSIZE)),
[outbuf1] "=m" (*(outbuf + 1 * BLOCKSIZE))
: [tmpbuf0] "m" (*(tmpbuf + 0 * BLOCKSIZE)),
@@ -3791,8 +3654,10 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
: "memory" );
asm volatile ("pxor %[tmpbuf2],%%xmm3\n\t"
"movdqu %%xmm3, %[outbuf2]\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
"pxor %%xmm5, %%xmm4\n\t"
"movdqu %%xmm4, %[outbuf3]\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
: [outbuf2] "=m" (*(outbuf + 2 * BLOCKSIZE)),
[outbuf3] "=m" (*(outbuf + 3 * BLOCKSIZE))
: [tmpbuf2] "m" (*(tmpbuf + 2 * BLOCKSIZE))
@@ -3822,6 +3687,7 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %%xmm5, %%xmm0\n\t"
"movdqu %%xmm0, %[outbuf]\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
: [outbuf] "=m" (*outbuf)
:
: "memory" );
@@ -3832,7 +3698,9 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
c->u_mode.ocb.data_nblocks = n;
asm volatile ("movdqu %%xmm5, %[iv]\n\t"
- : [iv] "=m" (*c->u_iv.iv)
+ "movdqu %%xmm7, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
:
: "memory" );
@@ -3846,8 +3714,6 @@ aesni_ocb_dec (gcry_cipher_hd_t c, void *outbuf_arg,
:
: "memory" );
- aesni_ocb_checksum (c, outbuf_arg, nblocks_arg);
-
aesni_cleanup ();
aesni_cleanup_2_7 ();
--
2.39.2
More information about the Gcrypt-devel mailing list