[PATCH 2/3] AES-NI/OCB: Perform checksumming inline with encryption
Jussi Kivilinna
jussi.kivilinna at iki.fi
Thu Mar 28 21:13:39 CET 2019
* cipher/rijndael-aesni.c (aesni_ocb_enc): Remove call to
'aesni_ocb_checksum', instead perform checksumming inline with offset
calculations.
--
This patch reverts the OCB checksumming split for encryption to avoid
a performance issue seen on Intel CPUs.
Commit b42de67f34 "Optimizations for AES-NI OCB" changed the AES-NI/OCB
implementation to perform checksumming as a separate pass from encryption
and decryption. While this change improved performance for buffer
sizes 16 to 4096 bytes (buffer sizes used by bench-slope), it
introduced a performance anomaly with OCB encryption on Intel
processors. Below are large-buffer OCB encryption results on Intel
Haswell. There we can see that with buffer sizes larger than 32 KiB
performance starts dropping. Decryption does not suffer from the same
issue.
MiB/s Speed by Data Length (at 2 Ghz)
2800 +-------------------------------------------------------------+
2600 |-+ + + **.****.****+ + + +-|
| **.** *.****.****.**** |
2400 |-+ *.** *.*****.****|
2200 |-+ *** +-|
2000 |-+ *.* +-|
| ** |
1800 |-+ ** +-|
1600 |-+ *.* +-|
1400 |-+** +-|
|** |
1200 |*+ + + + + + + +-|
1000 +-------------------------------------------------------------+
1024 4096 16384 65536 262144 1048576
Data Length in Bytes
I've tested and reproduced this issue on Intel Ivy-Bridge, Haswell
and Skylake processors. Same performance drop on large buffers is not
seen on AMD Ryzen. Below is OCB decryption speed plot from Haswell for
reference, showing expected performance curve over increasing buffer
sizes.
MiB/s Speed by Data Length (at 2 Ghz)
2800 +-------------------------------------------------------------+
2600 |-+ + + **.****.****.****.****.****.*****.****|
| **.** |
2400 |-+ *.** +-|
2200 |-+ *** +-|
2000 |-+ *.* +-|
| ** |
1800 |-+ ** +-|
1600 |-+ *.* +-|
1400 |-+** +-|
|** |
1200 |*+ + + + + + + +-|
1000 +-------------------------------------------------------------+
1024 4096 16384 65536 262144 1048576
Data Length in Bytes
After this patch, bench-slope shows a ~2% reduction in performance on
Intel Haswell:
Before:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
OCB enc | 0.171 ns/B 5581 MiB/s 0.683 c/B 3998
After:
AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz
OCB enc | 0.174 ns/B 5468 MiB/s 0.697 c/B 3998
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
0 files changed
diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c
index b1f6b0c02..e9d9f680e 100644
--- a/cipher/rijndael-aesni.c
+++ b/cipher/rijndael-aesni.c
@@ -2381,23 +2381,25 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
aesni_prepare ();
aesni_prepare_2_7 ();
- aesni_ocb_checksum (c, inbuf_arg, nblocks);
-
/* Preload Offset */
asm volatile ("movdqu %[iv], %%xmm5\n\t"
- : /* No output */
- : [iv] "m" (*c->u_iv.iv)
- : "memory" );
+ "movdqu %[ctr], %%xmm7\n\t"
+ : /* No output */
+ : [iv] "m" (*c->u_iv.iv),
+ [ctr] "m" (*c->u_ctr.ctr)
+ : "memory" );
for ( ;nblocks && n % 4; nblocks-- )
{
l = aes_ocb_get_l(c, ++n);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
asm volatile ("movdqu %[l], %%xmm1\n\t"
"movdqu %[inbuf], %%xmm0\n\t"
"pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
: [l] "m" (*l),
@@ -2445,6 +2447,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
n += 4;
l = aes_ocb_get_l(c, n);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* P_i = Offset_i xor ENCIPHER(K, C_i xor Offset_i) */
asm volatile ("movdqu %[inbuf0], %%xmm1\n\t"
@@ -2465,28 +2468,34 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
: "memory" );
asm volatile ("movdqa %%xmm6, %%xmm12\n\t"
"pxor %%xmm5, %%xmm12\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
"pxor %%xmm12, %%xmm1\n\t"
"movdqa %%xmm10, %%xmm13\n\t"
"pxor %%xmm5, %%xmm13\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
"pxor %%xmm13, %%xmm2\n\t"
"movdqa %%xmm11, %%xmm14\n\t"
"pxor %%xmm5, %%xmm14\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
"pxor %%xmm14, %%xmm3\n\t"
"pxor %%xmm11, %%xmm5\n\t"
"pxor %%xmm15, %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
"pxor %%xmm5, %%xmm4\n\t"
"movdqa %%xmm5, %%xmm15\n\t"
"movdqa %%xmm5, %%xmm0\n\t"
"pxor %%xmm6, %%xmm0\n\t"
+ "pxor %%xmm8, %%xmm7\n\t"
"pxor %%xmm0, %%xmm8\n\t"
"movdqa %%xmm0, %[tmpbuf0]\n\t"
"movdqa %%xmm10, %%xmm0\n\t"
"pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm9, %%xmm7\n\t"
"pxor %%xmm0, %%xmm9\n\t"
"movdqa %%xmm0, %[tmpbuf1]\n\t"
: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE)),
@@ -2496,6 +2505,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("movdqu %[inbuf6], %%xmm10\n\t"
"movdqa %%xmm11, %%xmm0\n\t"
"pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm10, %%xmm7\n\t"
"pxor %%xmm0, %%xmm10\n\t"
"movdqa %%xmm0, %[tmpbuf2]\n\t"
: [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
@@ -2505,6 +2515,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
"pxor %%xmm11, %%xmm5\n\t"
"pxor %%xmm0, %%xmm5\n\t"
"movdqu %[inbuf7], %%xmm11\n\t"
+ "pxor %%xmm11, %%xmm7\n\t"
"pxor %%xmm5, %%xmm11\n\t"
:
: [l7] "m" (*l),
@@ -2555,6 +2566,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
n += 4;
l = aes_ocb_get_l(c, n);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
asm volatile ("movdqu %[l0], %%xmm0\n\t"
@@ -2568,6 +2580,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("movdqu %[l1], %%xmm4\n\t"
"movdqu %[l3], %%xmm6\n\t"
"pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm1, %%xmm7\n\t"
"pxor %%xmm0, %%xmm1\n\t"
"movdqa %%xmm0, %[tmpbuf0]\n\t"
: [tmpbuf0] "=m" (*(tmpbuf + 0 * BLOCKSIZE))
@@ -2576,6 +2589,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
: "memory" );
asm volatile ("movdqu %[inbuf1], %%xmm2\n\t"
"pxor %%xmm5, %%xmm3\n\t"
+ "pxor %%xmm2, %%xmm7\n\t"
"pxor %%xmm3, %%xmm2\n\t"
"movdqa %%xmm3, %[tmpbuf1]\n\t"
: [tmpbuf1] "=m" (*(tmpbuf + 1 * BLOCKSIZE))
@@ -2584,6 +2598,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("movdqa %%xmm4, %%xmm0\n\t"
"movdqu %[inbuf2], %%xmm3\n\t"
"pxor %%xmm5, %%xmm0\n\t"
+ "pxor %%xmm3, %%xmm7\n\t"
"pxor %%xmm0, %%xmm3\n\t"
"movdqa %%xmm0, %[tmpbuf2]\n\t"
: [tmpbuf2] "=m" (*(tmpbuf + 2 * BLOCKSIZE))
@@ -2593,6 +2608,7 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
asm volatile ("pxor %%xmm6, %%xmm5\n\t"
"pxor %%xmm4, %%xmm5\n\t"
"movdqu %[inbuf3], %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm7\n\t"
"pxor %%xmm5, %%xmm4\n\t"
:
: [inbuf3] "m" (*(inbuf + 3 * BLOCKSIZE))
@@ -2625,11 +2641,13 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
{
l = aes_ocb_get_l(c, ++n);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
/* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
/* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
asm volatile ("movdqu %[l], %%xmm1\n\t"
"movdqu %[inbuf], %%xmm0\n\t"
"pxor %%xmm1, %%xmm5\n\t"
+ "pxor %%xmm0, %%xmm7\n\t"
"pxor %%xmm5, %%xmm0\n\t"
:
: [l] "m" (*l),
@@ -2650,7 +2668,9 @@ aesni_ocb_enc (gcry_cipher_hd_t c, void *outbuf_arg,
c->u_mode.ocb.data_nblocks = n;
asm volatile ("movdqu %%xmm5, %[iv]\n\t"
- : [iv] "=m" (*c->u_iv.iv)
+ "movdqu %%xmm7, %[ctr]\n\t"
+ : [iv] "=m" (*c->u_iv.iv),
+ [ctr] "=m" (*c->u_ctr.ctr)
:
: "memory" );
More information about the Gcrypt-devel
mailing list