[PATCH 4/4] Add ARMv8-CE HW acceleration for GCM-SIV counter mode

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Aug 13 17:01:29 CEST 2021


* cipher/rijndael-armv8-aarch32-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-aarch64-ce.S
(_gcry_aes_ctr32le_enc_armv8_ce): New.
* cipher/rijndael-armv8-ce.c
(_gcry_aes_ctr32le_enc_armv8_ce)
(_gcry_aes_armv8_ce_ctr32le_enc): New.
* cipher/rijndael.c
(_gcry_aes_armv8_ce_ctr32le_enc): New prototype.
(do_setkey): Add setup of 'bulk_ops->ctr32le_enc' for ARMv8-CE.
--
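
The new bulk function is picked up transparently by the GCM-SIV AEAD
mode through the regular cipher API, so no caller-side changes are
needed.  A minimal usage sketch (error handling and AAD omitted for
brevity):

  #include <gcrypt.h>

  /* Encrypt 'len' bytes with AES-128-GCM-SIV; 'tag' receives the
   * 16-byte authentication tag. */
  static void
  gcm_siv_enc_example (const unsigned char key[16],
                       const unsigned char nonce[12],
                       const unsigned char *ptext, size_t len,
                       unsigned char *ctext, unsigned char tag[16])
  {
    gcry_cipher_hd_t hd;

    gcry_cipher_open (&hd, GCRY_CIPHER_AES128, GCRY_CIPHER_MODE_GCM_SIV, 0);
    gcry_cipher_setkey (hd, key, 16);
    gcry_cipher_setiv (hd, nonce, 12);  /* GCM-SIV uses a 96-bit nonce. */
    gcry_cipher_encrypt (hd, ctext, len, ptext, len);
    gcry_cipher_gettag (hd, tag, 16);
    gcry_cipher_close (hd);
  }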

Benchmark on Cortex-A53 (aarch64):

Before:
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |     11.77 ns/B     81.03 MiB/s      7.63 c/B     647.9
    GCM-SIV dec |     11.92 ns/B     79.98 MiB/s      7.73 c/B     647.9
   GCM-SIV auth |      2.99 ns/B     318.9 MiB/s      1.94 c/B     648.0

After (~2.5x faster):
 AES            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
    GCM-SIV enc |      4.66 ns/B     204.5 MiB/s      3.02 c/B     647.9
    GCM-SIV dec |      4.82 ns/B     198.0 MiB/s      3.12 c/B     647.9
   GCM-SIV auth |      3.00 ns/B     318.4 MiB/s      1.94 c/B     648.0
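
For reference: GCM-SIV (RFC 8452) drives AES in a counter mode that,
unlike plain CTR, increments only the first four bytes of the counter
block, treated as a little-endian 32-bit integer wrapping modulo 2^32.
The sketch below shows the per-block semantics the new assembly
implements; aes_encrypt_block is a hypothetical stand-in for a
single-block AES encryption, not a libgcrypt symbol.  The assembly
additionally processes four blocks per iteration to keep the AES
pipeline filled, falling back to one block at a time for the tail.

  #include <stddef.h>
  #include <stdint.h>

  /* Hypothetical single-block AES primitive. */
  void aes_encrypt_block (const void *keysched, uint8_t dst[16],
                          const uint8_t src[16]);

  static void
  ctr32le_enc_ref (const void *keysched, uint8_t *outbuf,
                   const uint8_t *inbuf, uint8_t ctr[16], size_t nblocks)
  {
    while (nblocks--)
      {
        uint8_t ks[16];
        uint32_t c;
        int i;

        /* Keystream = AES(counter block). */
        aes_encrypt_block (keysched, ks, ctr);

        /* XOR keystream into the data; same operation en-/decrypts. */
        for (i = 0; i < 16; i++)
          *outbuf++ = *inbuf++ ^ ks[i];

        /* Bump the 32-bit little-endian counter word; the remaining
         * 96 bits of the block stay fixed. */
        c = ((uint32_t)ctr[0] | ((uint32_t)ctr[1] << 8)
             | ((uint32_t)ctr[2] << 16) | ((uint32_t)ctr[3] << 24)) + 1;
        ctr[0] = (uint8_t)c;
        ctr[1] = (uint8_t)(c >> 8);
        ctr[2] = (uint8_t)(c >> 16);
        ctr[3] = (uint8_t)(c >> 24);
      }
  }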

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/rijndael-armv8-aarch32-ce.S | 121 +++++++++++++++++++++++++++++
 cipher/rijndael-armv8-aarch64-ce.S | 109 ++++++++++++++++++++++++++
 cipher/rijndael-armv8-ce.c         |  17 ++++
 cipher/rijndael.c                  |   5 ++
 4 files changed, 252 insertions(+)

diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S
index 66440bd4..6d78af0a 100644
--- a/cipher/rijndael-armv8-aarch32-ce.S
+++ b/cipher/rijndael-armv8-aarch32-ce.S
@@ -1016,6 +1016,127 @@ _gcry_aes_ctr_enc_armv8_ce:
 .size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;
 
 
+/*
+ * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+ *                                      unsigned char *outbuf,
+ *                                      const unsigned char *inbuf,
+ *                                      unsigned char *iv, size_t nblocks,
+ *                                      unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr32le_enc_armv8_ce
+.type  _gcry_aes_ctr32le_enc_armv8_ce,%function;
+_gcry_aes_ctr32le_enc_armv8_ce:
+  /* input:
+   *    r0: keysched
+   *    r1: outbuf
+   *    r2: inbuf
+   *    r3: iv
+   *    %st+0: nblocks => r4
+   *    %st+4: nrounds => r5
+   */
+
+  vpush {q4-q7}
+  push {r4-r12,lr} /* 4*16 + 4*10 = 104b */
+  ldr r4, [sp, #(104+0)]
+  ldr r5, [sp, #(104+4)]
+  cmp r4, #0
+  beq .Lctr32le_enc_skip
+
+  cmp r5, #12
+  vld1.8 {q0}, [r3] /* load IV */
+
+  aes_preload_keys(r0, r6);
+
+  beq .Lctr32le_enc_entry_192
+  bhi .Lctr32le_enc_entry_256
+
+#define CTR_ENC(bits, ...) \
+  .Lctr32le_enc_entry_##bits: \
+    cmp r4, #4; \
+    blo .Lctr32le_enc_loop_##bits; \
+    \
+  .Lctr32le_enc_loop4_##bits: \
+    veor q2, q2; \
+    sub r4, r4, #4; \
+    vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \
+    vmov q1, q0; \
+    vadd.u32 q3, q2, q2; /* q3 <= -2:0:0:0 */ \
+    vadd.u32 q0, q3, q3; /* q0 <= -4:0:0:0 */ \
+    vadd.u32 q4, q3, q2; /* q4 <= -3:0:0:0 */ \
+    vsub.u32 q0, q1, q0; \
+    vsub.u32 q2, q1, q2; \
+    vst1.8 {q0}, [r3]; \
+    vsub.u32 q3, q1, q3; \
+    vsub.u32 q4, q1, q4; \
+    \
+    cmp r4, #4; \
+    vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+    \
+    do_aes_4_##bits(e, mc, q1, q2, q3, q4, ##__VA_ARGS__); \
+    \
+    veor q1, q1, q0; \
+    vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+    vst1.8 {q1}, [r1]!; /* store plaintext */ \
+    vld1.8 {q1}, [r2]!; /* load ciphertext */ \
+    veor q2, q2, q0; \
+    veor q3, q3, q1; \
+    vld1.8 {q0}, [r2]!; /* load ciphertext */ \
+    vst1.8 {q2}, [r1]!; /* store plaintext */ \
+    veor q4, q4, q0; \
+    vld1.8 {q0}, [r3]; /* reload IV */ \
+    vst1.8 {q3-q4}, [r1]!; /* store plaintext */ \
+    \
+    bhs .Lctr32le_enc_loop4_##bits; \
+    cmp r4, #0; \
+    beq .Lctr32le_enc_done; \
+    \
+  .Lctr32le_enc_loop_##bits: \
+    \
+    veor q2, q2; \
+    vmov q1, q0; \
+    vmov.i64 d4, #0xffffffff; /* q2 <= -1:0:0:0 */ \
+    subs r4, r4, #1; \
+    vsub.u32 q0, q0, q2; \
+    vld1.8 {q2}, [r2]!; /* load ciphertext */ \
+    \
+    do_aes_one##bits(e, mc, q1, q1, ##__VA_ARGS__); \
+    \
+    veor q1, q2, q1; \
+    vst1.8 {q1}, [r1]!; /* store plaintext */ \
+    \
+    bne .Lctr32le_enc_loop_##bits; \
+    b .Lctr32le_enc_done;
+
+  CTR_ENC(128)
+  CTR_ENC(192, r0, r6)
+  CTR_ENC(256, r0, r6)
+
+#undef CTR_ENC
+
+.Lctr32le_enc_done:
+  vst1.8 {q0}, [r3] /* store IV */
+
+  CLEAR_REG(q0)
+  CLEAR_REG(q1)
+  CLEAR_REG(q2)
+  CLEAR_REG(q3)
+  CLEAR_REG(q8)
+  CLEAR_REG(q9)
+  CLEAR_REG(q10)
+  CLEAR_REG(q11)
+  CLEAR_REG(q12)
+  CLEAR_REG(q13)
+  CLEAR_REG(q14)
+
+.Lctr32le_enc_skip:
+  pop {r4-r12,lr}
+  vpop {q4-q7}
+  bx lr
+.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;
+
+
 /*
  * void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S
index 3af29e0d..a87d2ca5 100644
--- a/cipher/rijndael-armv8-aarch64-ce.S
+++ b/cipher/rijndael-armv8-aarch64-ce.S
@@ -676,6 +676,115 @@ _gcry_aes_ctr_enc_armv8_ce:
 ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;)
 
 
+/*
+ * void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+ *                                      unsigned char *outbuf,
+ *                                      const unsigned char *inbuf,
+ *                                      unsigned char *iv, size_t nblocks,
+ *                                      unsigned int nrounds);
+ */
+
+.align 3
+.globl _gcry_aes_ctr32le_enc_armv8_ce
+ELF(.type  _gcry_aes_ctr32le_enc_armv8_ce,%function;)
+_gcry_aes_ctr32le_enc_armv8_ce:
+  /* input:
+   *    x0: keysched
+   *    x1: outbuf
+   *    x2: inbuf
+   *    x3: iv
+   *    x4: nblocks
+   *    w5: nrounds
+   */
+  CFI_STARTPROC();
+
+  cbz x4, .Lctr32le_enc_skip
+
+  mov w6, #1
+  movi v16.16b, #0
+  mov v16.S[0], w6
+
+  /* load IV */
+  ld1 {v0.16b}, [x3]
+
+  aes_preload_keys(x0, w5);
+
+  b.eq .Lctr32le_enc_entry_192
+  b.hi .Lctr32le_enc_entry_256
+
+#define CTR_ENC(bits) \
+  .Lctr32le_enc_entry_##bits: \
+    cmp x4, #4; \
+    b.lo .Lctr32le_enc_loop_##bits; \
+    \
+  .Lctr32le_enc_loop4_##bits: \
+    sub x4, x4, #4; \
+    \
+    add v3.4s, v16.4s, v16.4s; /* 2 */ \
+    mov v1.16b, v0.16b; \
+    add v2.4s, v0.4s, v16.4s; \
+    add v4.4s, v3.4s, v16.4s;  /* 3 */ \
+    add v6.4s, v3.4s, v3.4s;   /* 4 */ \
+    add v3.4s, v0.4s, v3.4s; \
+    add v4.4s, v0.4s, v4.4s; \
+    add v0.4s, v0.4s, v6.4s; \
+    \
+    cmp x4, #4; \
+    ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \
+    \
+    do_aes_4_##bits(e, mc, v1, v2, v3, v4); \
+    \
+    eor v1.16b, v1.16b, v5.16b; \
+    ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \
+    eor v2.16b, v2.16b, v6.16b; \
+    eor v3.16b, v3.16b, v7.16b; \
+    eor v4.16b, v4.16b, v5.16b; \
+    st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \
+    \
+    b.hs .Lctr32le_enc_loop4_##bits; \
+    CLEAR_REG(v3); \
+    CLEAR_REG(v4); \
+    CLEAR_REG(v5); \
+    CLEAR_REG(v6); \
+    CLEAR_REG(v7); \
+    cbz x4, .Lctr32le_enc_done; \
+    \
+  .Lctr32le_enc_loop_##bits: \
+    \
+    mov v1.16b, v0.16b; \
+    ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \
+    sub x4, x4, #1; \
+    add v0.4s, v0.4s, v16.4s; \
+    \
+    do_aes_one##bits(e, mc, v1, v1); \
+    \
+    eor v1.16b, v2.16b, v1.16b; \
+    st1 {v1.16b}, [x1], #16; /* store plaintext */ \
+    \
+    cbnz x4, .Lctr32le_enc_loop_##bits; \
+    b .Lctr32le_enc_done;
+
+  CTR_ENC(128)
+  CTR_ENC(192)
+  CTR_ENC(256)
+
+#undef CTR_ENC
+
+.Lctr32le_enc_done:
+  aes_clear_keys(w5)
+
+  st1 {v0.16b}, [x3] /* store IV */
+
+  CLEAR_REG(v0)
+  CLEAR_REG(v1)
+  CLEAR_REG(v2)
+
+.Lctr32le_enc_skip:
+  ret
+  CFI_ENDPROC();
+ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;)
+
+
 /*
  * void _gcry_aes_cfb_enc_armv8_ce (const void *keysched,
  *                                  unsigned char *outbuf,
diff --git a/cipher/rijndael-armv8-ce.c b/cipher/rijndael-armv8-ce.c
index 6e46830e..b24ae3e9 100644
--- a/cipher/rijndael-armv8-ce.c
+++ b/cipher/rijndael-armv8-ce.c
@@ -75,6 +75,12 @@ extern void _gcry_aes_ctr_enc_armv8_ce (const void *keysched,
                                         unsigned char *iv, size_t nblocks,
                                         unsigned int nrounds);
 
+extern void _gcry_aes_ctr32le_enc_armv8_ce (const void *keysched,
+                                            unsigned char *outbuf,
+                                            const unsigned char *inbuf,
+                                            unsigned char *iv, size_t nblocks,
+                                            unsigned int nrounds);
+
 extern void _gcry_aes_ocb_enc_armv8_ce (const void *keysched,
                                         unsigned char *outbuf,
                                         const unsigned char *inbuf,
@@ -345,6 +351,17 @@ _gcry_aes_armv8_ce_ctr_enc (RIJNDAEL_context *ctx, unsigned char *iv,
   _gcry_aes_ctr_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
 }
 
+void
+_gcry_aes_armv8_ce_ctr32le_enc (RIJNDAEL_context *ctx, unsigned char *iv,
+                                unsigned char *outbuf,
+                                const unsigned char *inbuf, size_t nblocks)
+{
+  const void *keysched = ctx->keyschenc32;
+  unsigned int nrounds = ctx->rounds;
+
+  _gcry_aes_ctr32le_enc_armv8_ce(keysched, outbuf, inbuf, iv, nblocks, nrounds);
+}
+
 size_t
 _gcry_aes_armv8_ce_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
                               const void *inbuf_arg, size_t nblocks,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index c096321f..df41b911 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -209,6 +209,10 @@ extern void _gcry_aes_armv8_ce_cbc_enc (void *context, unsigned char *iv,
 extern void _gcry_aes_armv8_ce_ctr_enc (void *context, unsigned char *ctr,
                                         void *outbuf_arg, const void *inbuf_arg,
                                         size_t nblocks);
+extern void _gcry_aes_armv8_ce_ctr32le_enc (void *context, unsigned char *ctr,
+                                            void *outbuf_arg,
+                                            const void *inbuf_arg,
+                                            size_t nblocks);
 extern void _gcry_aes_armv8_ce_cfb_dec (void *context, unsigned char *iv,
                                         void *outbuf_arg, const void *inbuf_arg,
                                         size_t nblocks);
@@ -570,6 +574,7 @@ do_setkey (RIJNDAEL_context *ctx, const byte *key, const unsigned keylen,
       bulk_ops->cbc_enc = _gcry_aes_armv8_ce_cbc_enc;
       bulk_ops->cbc_dec = _gcry_aes_armv8_ce_cbc_dec;
       bulk_ops->ctr_enc = _gcry_aes_armv8_ce_ctr_enc;
+      bulk_ops->ctr32le_enc = _gcry_aes_armv8_ce_ctr32le_enc;
       bulk_ops->ocb_crypt = _gcry_aes_armv8_ce_ocb_crypt;
       bulk_ops->ocb_auth = _gcry_aes_armv8_ce_ocb_auth;
       bulk_ops->xts_crypt = _gcry_aes_armv8_ce_xts_crypt;
-- 
2.30.2
