[PATCH 1/4] Add 64-bit ARMv8/CE PMULL implementation of CRC

Jussi Kivilinna jussi.kivilinna at iki.fi
Fri Apr 26 18:33:31 CEST 2019


* cipher/Makefile.am: Add 'crc-armv8-ce.c' and
'crc-armv8-aarch64-ce.S'.
* cipher/asm-common-aarch64.h [HAVE_GCC_ASM_CFI_DIRECTIVES]: Add CFI
helper macros.
* cipher/crc-armv8-aarch64-ce.S: New.
* cipher/crc-armv8-ce.c: New.
* cipher/crc.c (USE_ARM_PMULL): New.
(CRC_CONTEXT) [USE_ARM_PMULL]: Add 'use_pmull'.
[USE_ARM_PMULL] (_gcry_crc32_armv8_ce_pmull)
(_gcry_crc24rfc2440_armv8_ce_pmull): New prototypes.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init): Enable ARM PMULL
implementations if supported by HW features.
(crc32_write, crc24rfc2440_write) [USE_ARM_PMULL]: Use ARM PMULL
implementations if enabled.
* configure.ac: Add 'crc-armv8-ce.lo' and 'crc-armv8-aarch64-ce.lo'.
--
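
For reference, the bulk code folds 64 bytes per iteration with PMULL using
precomputed constants of the form x^(32*y) mod P(x) (see the crc32_consts_s
tables in crc-armv8-ce.c), followed by a final Barrett reduction.  As a
minimal, hypothetical sketch (not part of the patch), such a constant can be
derived with plain shift-and-reduce over GF(2); the actual tables additionally
apply the bit-reflection and <<32 conventions noted in their comments:

  #include <stdint.h>

  /* Compute x^n mod P(x) over GF(2).  'p' encodes P(x) including its x^32
   * term; e.g. the CRC24RFC2440 polynomial multiplied by x^8 is 0x1864cfb00. */
  static uint64_t
  xpow_mod_p (unsigned int n, uint64_t p)
  {
    uint64_t r = 1;                 /* the polynomial "1", i.e. x^0 */

    while (n--)
      {
        r <<= 1;                    /* multiply by x */
        if (r & ((uint64_t)1 << 32))
          r ^= p;                   /* reduce modulo P(x) */
      }

    return r;                       /* x^n mod P(x), degree < 32 */
  }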

Benchmark on Cortex-A53 (at 1104 MHz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC32RFC1510   |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC24RFC2440   |      2.72 ns/B     350.8 MiB/s      3.00 c/B

After (crc32 ~8.4x faster, crc24 ~6.8x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.341 ns/B      2796 MiB/s     0.377 c/B
 CRC32RFC1510   |     0.342 ns/B      2792 MiB/s     0.377 c/B
 CRC24RFC2440   |     0.398 ns/B      2396 MiB/s     0.439 c/B
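
Not part of the patch, but for context: the accelerated path sits behind the
normal message-digest API, so it is exercised by ordinary CRC32 digest usage
like the sketch below (assumes a libgcrypt build with this patch applied;
selection between the table-based fallback, PCLMUL and PMULL code happens at
run time in crc32_init/crc32_write):

  #include <stdio.h>
  #include <string.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    gcry_md_hd_t hd;
    const char *data = "123456789";
    const unsigned char *crc;

    gcry_check_version (NULL);      /* initialize libgcrypt */

    if (gcry_md_open (&hd, GCRY_MD_CRC32, 0))
      return 1;

    gcry_md_write (hd, data, strlen (data));
    crc = gcry_md_read (hd, GCRY_MD_CRC32);   /* 4-byte CRC value */

    printf ("CRC32: %02x%02x%02x%02x\n", crc[0], crc[1], crc[2], crc[3]);
    gcry_md_close (hd);
    return 0;
  }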

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3f00ed4a8..2acd7cb38 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -78,7 +78,8 @@ EXTRA_libcipher_la_SOURCES = \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
-	crc.c crc-intel-pclmul.c \
+	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+	crc-armv8-aarch64-ce.S \
 	des.c des-amd64.S \
 	dsa.c \
 	elgamal.c \
diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S
new file mode 100644
index 000000000..497d00551
--- /dev/null
+++ b/cipher/crc-armv8-aarch64-ce.S
@@ -0,0 +1,492 @@
+/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "asm-common-aarch64.h"
+
+#if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+.cpu generic+simd+crypto
+
+.text
+
+#define GET_DATA_POINTER(reg, name) \
+		adrp    reg, :got:name ; \
+		ldr     reg, [reg, #:got_lo12:name] ;
+
+/* Structure of crc32_consts_s */
+
+#define consts_k(idx)    ((idx) * 8)
+#define consts_my_p(idx) (consts_k(6) + (idx) * 8)
+
+/* Constants */
+
+.align 6
+.Lcrc32_constants:
+.Lcrc32_partial_fold_input_mask:
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+.Lcrc32_refl_shuf_shift:
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
+  .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
+.Lcrc32_shuf_shift:
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+.Lcrc32_bswap_shuf:
+  .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
+  .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
+
+
+/*
+ * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ *                                  const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_bulk
+ELF(.type  _gcry_crc32r_armv8_ce_bulk,%function;)
+_gcry_crc32r_armv8_ce_bulk:
+  /* input:
+   *    x0: pcrc
+   *    x1: inbuf
+   *    x2: inlen
+   *    x3: consts
+   */
+
+  GET_DATA_POINTER(x7, .Lcrc32_constants)
+  add x9, x3, #consts_k(5 - 1)
+  cmp x2, #128
+
+  b.lo .Lcrc32r_fold_by_one_setup
+
+  eor v4.16b, v4.16b, v4.16b
+  add x4, x3, #consts_k(1 - 1)
+  ld1 {v4.s}[0], [x0]             /* load pcrc */
+  ld1 {v0.16b-v3.16b}, [x1], #64  /* load 64 bytes of input */
+  sub x2, x2, #64
+  ld1 {v6.16b}, [x4]
+  eor v0.16b, v0.16b, v4.16b
+
+  add x4, x3, #consts_k(3 - 1)
+  add x5, x3, #consts_my_p(0)
+
+.Lcrc32r_fold_by_four:
+
+  /* Fold by 4. */
+  ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+  sub x2, x2, #64
+  pmull v20.1q, v0.1d, v6.1d
+  pmull v21.1q, v1.1d, v6.1d
+  pmull v22.1q, v2.1d, v6.1d
+  pmull v23.1q, v3.1d, v6.1d
+  cmp x2, #64
+  pmull2 v24.1q, v0.2d, v6.2d
+  pmull2 v25.1q, v1.2d, v6.2d
+  pmull2 v26.1q, v2.2d, v6.2d
+  pmull2 v27.1q, v3.2d, v6.2d
+  eor v0.16b, v20.16b, v16.16b
+  eor v1.16b, v21.16b, v17.16b
+  eor v2.16b, v22.16b, v18.16b
+  eor v3.16b, v23.16b, v19.16b
+  eor v0.16b, v0.16b, v24.16b
+  eor v1.16b, v1.16b, v25.16b
+  eor v2.16b, v2.16b, v26.16b
+  eor v3.16b, v3.16b, v27.16b
+  b.hs .Lcrc32r_fold_by_four
+
+  ld1 {v6.16b}, [x4]
+  ld1 {v5.16b}, [x5]
+
+  cmp x2, #16
+
+  /* Fold 4 to 1. */
+
+  pmull v16.1q, v0.1d, v6.1d
+  pmull2 v4.1q, v0.2d, v6.2d
+  eor v0.16b, v16.16b, v1.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  pmull v16.1q, v0.1d, v6.1d
+  pmull2 v4.1q, v0.2d, v6.2d
+  eor v0.16b, v16.16b, v2.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  pmull v16.1q, v0.1d, v6.1d
+  pmull2 v4.1q, v0.2d, v6.2d
+  eor v0.16b, v16.16b, v3.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  b.lo .Lcrc32r_fold_by_one_done
+  b .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_setup:
+
+  eor v1.16b, v1.16b, v1.16b
+  add x4, x3, #consts_k(3 - 1)
+  add x5, x3, #consts_my_p(0)
+  sub x2, x2, #16
+  ld1 {v1.s}[0], [x0]             /* load pcrc */
+  ld1 {v0.16b}, [x1], #16         /* load 16 bytes of input */
+  cmp x2, #16
+  ld1 {v6.16b}, [x4]              /* load k3k4 */
+  ld1 {v5.16b}, [x5]              /* load my_p */
+  eor v0.16b, v0.16b, v1.16b
+  b.lo .Lcrc32r_fold_by_one_done
+
+.Lcrc32r_fold_by_one:
+  sub x2, x2, #16
+  ld1 {v2.16b}, [x1], #16         /* load 16 bytes of input */
+  pmull v3.1q, v0.1d, v6.1d
+  pmull2 v1.1q, v0.2d, v6.2d
+  cmp x2, #16
+  eor v0.16b, v3.16b, v2.16b
+  eor v0.16b, v0.16b, v1.16b
+
+  b.hs .Lcrc32r_fold_by_one
+
+.Lcrc32r_fold_by_one_done:
+
+  cmp x2, #0
+  b.eq .Lcrc32r_final_fold
+
+  /* Partial fold. */
+
+  add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants
+  add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16
+  add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+  sub x8, x2, #16
+  add x4, x4, x2
+  add x5, x5, x2
+  add x6, x6, x2
+  add x8, x1, x8
+
+  /* Load last input and add padding zeros. */
+  ld1 {v4.16b}, [x4]
+  eor x2, x2, x2
+  ld1 {v3.16b}, [x5]
+  ld1 {v2.16b}, [x6]
+  tbl v30.16b, {v0.16b}, v4.16b
+  ld1 {v4.16b}, [x8]
+  tbl v1.16b, {v0.16b}, v3.16b
+
+  pmull v0.1q, v30.1d, v6.1d
+  and v2.16b, v2.16b, v4.16b
+  pmull2 v31.1q, v30.2d, v6.2d
+  orr v2.16b, v2.16b, v1.16b
+  eor v0.16b, v0.16b, v31.16b
+  eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32r_final_fold:
+
+  /* Final fold. */
+
+  eor v2.16b, v2.16b, v2.16b      /* zero reg */
+  ld1 {v7.16b}, [x9]
+
+  /* reduce 128-bits to 96-bits */
+  ext v6.16b, v6.16b, v6.16b, #8  /* swap high and low parts */
+  mov v1.16b, v0.16b
+  pmull v0.1q, v0.1d, v6.1d
+  ext v6.16b, v5.16b, v5.16b, #8  /* swap high and low parts */
+  ext v1.16b, v1.16b, v2.16b, #8  /* high to low, high zeroed */
+  eor v3.16b, v0.16b, v1.16b
+
+  /* reduce 96-bits to 64-bits */
+  eor v1.16b, v1.16b, v1.16b
+  ext v0.16b, v3.16b, v2.16b, #4  /* [00][00][x2][x1] */
+  mov v1.s[0], v3.s[0]            /* [00][00][00][x0] */
+  eor v3.16b, v3.16b, v3.16b
+  pmull v1.1q, v1.1d, v7.1d       /* [00][00][xx][xx] */
+  eor v0.16b, v0.16b, v1.16b      /* top 64-bit are zero */
+
+  /* barrett reduction */
+  mov v3.s[1], v0.s[0]            /* [00][00][x1][00] */
+  ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */
+  pmull v1.1q, v3.1d, v5.1d       /* [00][xx][xx][00] */
+  pmull v1.1q, v1.1d, v6.1d       /* [00][xx][xx][00] */
+  eor v0.16b, v0.16b, v1.16b
+
+  /* store CRC */
+  st1 {v0.s}[2], [x0]
+
+  ret
+ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                        const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32r_armv8_ce_reduction_4
+ELF(.type  _gcry_crc32r_armv8_ce_reduction_4,%function;)
+_gcry_crc32r_armv8_ce_reduction_4:
+  /* input:
+   *    w0: data
+   *    w1: crc
+   *    x2: crc32 constants
+   */
+
+  eor v0.16b, v0.16b, v0.16b
+  add x2, x2, #consts_my_p(0)
+  eor v1.16b, v1.16b, v1.16b
+  ld1 {v5.16b}, [x2]
+
+  mov v0.s[0], w0
+  pmull v0.1q, v0.1d, v5.1d     /* [00][00][xx][xx] */
+  mov v1.s[1], w1
+  mov v0.s[2], v0.s[0]          /* [00][x0][x1][x0] */
+  pmull2 v0.1q, v0.2d, v5.2d    /* [00][00][xx][xx] */
+  eor v0.16b, v0.16b, v1.16b
+
+  mov w0, v0.s[1]
+
+  ret
+ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;)
+
+/*
+ * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+ *                                 const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_bulk
+ELF(.type  _gcry_crc32_armv8_ce_bulk,%function;)
+_gcry_crc32_armv8_ce_bulk:
+  /* input:
+   *    x0: pcrc
+   *    x1: inbuf
+   *    x2: inlen
+   *    x3: consts
+   */
+
+  GET_DATA_POINTER(x7, .Lcrc32_constants)
+  add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
+  cmp x2, #128
+  ld1 {v7.16b}, [x4]
+
+  b.lo .Lcrc32_fold_by_one_setup
+
+  eor v4.16b, v4.16b, v4.16b
+  add x4, x3, #consts_k(1 - 1)
+  ld1 {v4.s}[0], [x0]            /* load pcrc */
+  ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
+  sub x2, x2, #64
+  ld1 {v6.16b}, [x4]
+  eor v0.16b, v0.16b, v4.16b
+  ext v4.16b, v6.16b, v6.16b, #8
+  tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
+  tbl v1.16b, { v1.16b }, v7.16b /* byte swap */
+  tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
+  tbl v3.16b, { v3.16b }, v7.16b /* byte swap */
+
+  add x4, x3, #consts_k(3 - 1)
+  add x5, x3, #consts_my_p(0)
+
+.Lcrc32_fold_by_four:
+
+  /* Fold by 4. */
+  ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
+  sub x2, x2, #64
+  tbl v16.16b, { v16.16b }, v7.16b /* byte swap */
+  tbl v17.16b, { v17.16b }, v7.16b /* byte swap */
+  tbl v18.16b, { v18.16b }, v7.16b /* byte swap */
+  tbl v19.16b, { v19.16b }, v7.16b /* byte swap */
+  cmp x2, #64
+  pmull2 v20.1q, v0.2d, v4.2d
+  pmull2 v21.1q, v1.2d, v4.2d
+  pmull2 v22.1q, v2.2d, v4.2d
+  pmull2 v23.1q, v3.2d, v4.2d
+  pmull v24.1q, v0.1d, v4.1d
+  pmull v25.1q, v1.1d, v4.1d
+  pmull v26.1q, v2.1d, v4.1d
+  pmull v27.1q, v3.1d, v4.1d
+  eor v0.16b, v20.16b, v16.16b
+  eor v1.16b, v21.16b, v17.16b
+  eor v2.16b, v22.16b, v18.16b
+  eor v3.16b, v23.16b, v19.16b
+  eor v0.16b, v0.16b, v24.16b
+  eor v1.16b, v1.16b, v25.16b
+  eor v2.16b, v2.16b, v26.16b
+  eor v3.16b, v3.16b, v27.16b
+  b.hs .Lcrc32_fold_by_four
+
+  ld1 {v6.16b}, [x4]
+  ld1 {v5.16b}, [x5]
+  ext v6.16b, v6.16b, v6.16b, #8
+  ext v5.16b, v5.16b, v5.16b, #8
+
+  cmp x2, #16
+
+  /* Fold 4 to 1. */
+
+  pmull2 v16.1q, v0.2d, v6.2d
+  pmull v4.1q, v0.1d, v6.1d
+  eor v0.16b, v16.16b, v1.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  pmull2 v16.1q, v0.2d, v6.2d
+  pmull v4.1q, v0.1d, v6.1d
+  eor v0.16b, v16.16b, v2.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  pmull2 v16.1q, v0.2d, v6.2d
+  pmull v4.1q, v0.1d, v6.1d
+  eor v0.16b, v16.16b, v3.16b
+  eor v0.16b, v0.16b, v4.16b
+
+  b.lo .Lcrc32_fold_by_one_done
+  b .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_setup:
+
+  eor v1.16b, v1.16b, v1.16b
+  add x4, x3, #consts_k(3 - 1)
+  add x5, x3, #consts_my_p(0)
+  ld1 {v1.s}[0], [x0]            /* load pcrc */
+  sub x2, x2, #16
+  ld1 {v0.16b}, [x1], #16        /* load 16 bytes of input */
+  ld1 {v6.16b}, [x4]             /* load k3k4 */
+  ld1 {v5.16b}, [x5]             /* load my_p */
+  eor v0.16b, v0.16b, v1.16b
+  cmp x2, #16
+  ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
+  ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
+  tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
+  b.lo .Lcrc32_fold_by_one_done
+
+.Lcrc32_fold_by_one:
+  sub x2, x2, #16
+  ld1 {v2.16b}, [x1], #16        /* load 16 bytes of input */
+  pmull2 v3.1q, v0.2d, v6.2d
+  tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
+  pmull v1.1q, v0.1d, v6.1d
+  cmp x2, #16
+  eor v0.16b, v3.16b, v2.16b
+  eor v0.16b, v0.16b, v1.16b
+
+  b.hs .Lcrc32_fold_by_one
+
+.Lcrc32_fold_by_one_done:
+
+  cmp x2, #0
+  b.eq .Lcrc32_final_fold
+
+  /* Partial fold. */
+
+  add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32
+  add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16
+  add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
+  sub x8, x2, #16
+  sub x4, x4, x2
+  add x5, x5, x2
+  add x6, x6, x2
+  add x8, x1, x8
+
+  /* Load last input and add padding zeros. */
+  ld1 {v4.16b}, [x4]
+  eor x2, x2, x2
+  ld1 {v3.16b}, [x5]
+  ld1 {v2.16b}, [x6]
+  tbl v30.16b, {v0.16b}, v4.16b
+  ld1 {v4.16b}, [x8]
+  tbl v1.16b, {v0.16b}, v3.16b
+  and v2.16b, v2.16b, v4.16b
+
+  pmull2 v0.1q, v30.2d, v6.2d
+  orr v2.16b, v2.16b, v1.16b
+  pmull v1.1q, v30.1d, v6.1d
+  tbl v2.16b, {v2.16b}, v7.16b   /* byte swap */
+  eor v0.16b, v0.16b, v1.16b
+  eor v0.16b, v0.16b, v2.16b
+
+.Lcrc32_final_fold:
+
+  /* Final fold. */
+
+  eor v2.16b, v2.16b, v2.16b     /* zero reg */
+
+  /* reduce 128-bits to 96-bits */
+  add x4, x3, #consts_k(4)
+  ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
+  eor v6.16b, v6.16b, v6.16b
+  mov v1.16b, v0.16b
+  pmull2 v0.1q, v0.2d, v3.2d
+  ld1 {v6.d}[1], [x4]            /* load k4 */
+  ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */
+  eor v3.16b, v0.16b, v1.16b     /* bottom 32-bit are zero */
+
+  /* reduce 96-bits to 64-bits */
+  eor v0.16b, v0.16b, v0.16b
+  eor v1.16b, v1.16b, v1.16b
+  mov v0.s[1], v3.s[1]           /* [00][00][x1][00] */
+  mov v1.s[2], v3.s[3]           /* [00][x3][00][00] */
+  mov v0.s[2], v3.s[2]           /* [00][x2][x1][00] */
+  eor v3.16b, v3.16b, v3.16b
+  pmull2 v1.1q, v1.2d, v6.2d     /* [00][xx][xx][00] */
+  eor v0.16b, v0.16b, v1.16b     /* top and bottom 32-bit are zero */
+
+  /* barrett reduction */
+  mov v3.s[0], v0.s[1]           /* [00][00][00][x1] */
+  pmull2 v0.1q, v0.2d, v5.2d     /* [00][xx][xx][xx] */
+  ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */
+  pmull v0.1q, v0.1d, v5.1d
+  eor v0.16b, v0.16b, v3.16b
+
+  /* store CRC in input endian */
+  rev32 v0.8b, v0.8b             /* byte swap */
+  st1 {v0.s}[0], [x0]
+
+  ret
+ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)
+
+/*
+ * u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+ *                                       const struct crc32_consts_s *consts);
+ */
+.align 3
+.globl _gcry_crc32_armv8_ce_reduction_4
+ELF(.type  _gcry_crc32_armv8_ce_reduction_4,%function;)
+_gcry_crc32_armv8_ce_reduction_4:
+  /* input:
+   *    w0: data
+   *    w1: crc
+   *    x2: crc32 constants
+   */
+
+  eor v0.16b, v0.16b, v0.16b
+  add x2, x2, #consts_my_p(0)
+  eor v1.16b, v1.16b, v1.16b
+  ld1 {v5.16b}, [x2]
+
+  mov v0.s[1], w0
+  pmull v0.1q, v0.1d, v5.1d     /* [00][xx][xx][00] */
+  mov v1.s[0], w1
+  pmull2 v0.1q, v0.2d, v5.2d    /* [00][00][xx][xx] */
+  eor v0.16b, v0.16b, v1.16b
+
+  rev32 v0.8b, v0.8b            /* Return in input endian */
+  mov w0, v0.s[0]
+
+  ret
+ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;)
+
+#endif
diff --git a/cipher/crc-armv8-ce.c b/cipher/crc-armv8-ce.c
new file mode 100644
index 000000000..8dd07cce6
--- /dev/null
+++ b/cipher/crc-armv8-ce.c
@@ -0,0 +1,229 @@
+/* crc-armv8-ce.c - ARMv8-CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "g10lib.h"
+
+#include "bithelp.h"
+#include "bufhelp.h"
+
+
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+
+
+#define ALIGNED_16 __attribute__ ((aligned (16)))
+
+
+struct u16_unaligned_s
+{
+  u16 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+struct u32_unaligned_s
+{
+  u32 a;
+} __attribute__((packed, aligned (1), may_alias));
+
+
+/* Constants structure for generic reflected/non-reflected CRC32 PMULL
+ * functions. */
+struct crc32_consts_s
+{
+  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
+  u64 k[6];
+  /* my_p: { floor(x^64 / P(x)), P(x) } */
+  u64 my_p[2];
+};
+
+/* PMULL constants for CRC32 and CRC32RFC1510. */
+static const struct crc32_consts_s crc32_consts ALIGNED_16 =
+{
+  { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
+    U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
+    U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
+    U64_C(0x163cd6124), 0                   /* y = 2 */
+  },
+  { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
+    U64_C(0x1f7011641), U64_C(0x1db710641)
+  }
+};
+
+/* PMULL constants for CRC24RFC2440 (polynomial multiplied by x⁸). */
+static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 =
+{
+  { /* k[6] = x^(32*y) mod P(x) << 32 */
+    U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */
+    U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */
+    U64_C(0xd9fe8c00) << 32, 0                        /* y = 2 */
+  },
+  { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */
+    U64_C(0x1f845fe24), U64_C(0x1864cfb00)
+  }
+};
+
+
+u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
+				       const struct crc32_consts_s *consts);
+void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+                                 const struct crc32_consts_s *consts);
+
+u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
+				      const struct crc32_consts_s *consts);
+void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
+                                const struct crc32_consts_s *consts);
+
+
+static inline void
+crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+		     const struct crc32_consts_s *consts)
+{
+  u32 crc = *pcrc;
+  u32 data;
+
+  while (inlen >= 4)
+    {
+      data = ((const struct u32_unaligned_s *)inbuf)->a;
+      data ^= crc;
+
+      inlen -= 4;
+      inbuf += 4;
+
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, 0, consts);
+    }
+
+  switch (inlen)
+    {
+    case 0:
+      break;
+    case 1:
+      data = inbuf[0];
+      data ^= crc;
+      data <<= 24;
+      crc >>= 8;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 2:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data <<= 16;
+      crc >>= 16;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 3:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data |= inbuf[2] << 16;
+      data ^= crc;
+      data <<= 8;
+      crc >>= 24;
+      crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    }
+
+  *pcrc = crc;
+}
+
+static inline void
+crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen,
+		    const struct crc32_consts_s *consts)
+{
+  u32 crc = *pcrc;
+  u32 data;
+
+  while (inlen >= 4)
+    {
+      data = ((const struct u32_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data = _gcry_bswap32(data);
+
+      inlen -= 4;
+      inbuf += 4;
+
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, 0, consts);
+    }
+
+  switch (inlen)
+    {
+    case 0:
+      break;
+    case 1:
+      data = inbuf[0];
+      data ^= crc;
+      data = data & 0xffU;
+      crc = _gcry_bswap32(crc >> 8);
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 2:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data ^= crc;
+      data = _gcry_bswap32(data << 16);
+      crc = _gcry_bswap32(crc >> 16);
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    case 3:
+      data = ((const struct u16_unaligned_s *)inbuf)->a;
+      data |= inbuf[2] << 16;
+      data ^= crc;
+      data = _gcry_bswap32(data << 8);
+      crc = crc & 0xff000000U;
+      crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts);
+      break;
+    }
+
+  *pcrc = crc;
+}
+
+void
+_gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+  const struct crc32_consts_s *consts = &crc32_consts;
+
+  if (!inlen)
+    return;
+
+  if (inlen >= 16)
+    _gcry_crc32r_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+  else
+    crc32r_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+void
+_gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen)
+{
+  const struct crc32_consts_s *consts = &crc24rfc2440_consts;
+
+  if (!inlen)
+    return;
+
+  /* Note: *pcrc in input endian. */
+
+  if (inlen >= 16)
+    _gcry_crc32_armv8_ce_bulk (pcrc, inbuf, inlen, consts);
+  else
+    crc32_less_than_16 (pcrc, inbuf, inlen, consts);
+}
+
+#endif /* ENABLE_ARM_CRYPTO_SUPPORT */
diff --git a/cipher/crc.c b/cipher/crc.c
index 4457ff62f..2abbab288 100644
--- a/cipher/crc.c
+++ b/cipher/crc.c
@@ -42,12 +42,24 @@
 # endif
 #endif /* USE_INTEL_PCLMUL */
 
+/* USE_ARM_PMULL indicates whether to compile CRC with ARMv8 PMULL code. */
+#undef USE_ARM_PMULL
+#if defined(ENABLE_ARM_CRYPTO_SUPPORT)
+# if defined(__AARCH64EL__) && \
+    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)
+#  define USE_ARM_PMULL 1
+# endif
+#endif /* USE_ARM_PMULL */
 
 typedef struct
 {
   u32 CRC;
 #ifdef USE_INTEL_PCLMUL
   unsigned int use_pclmul:1;           /* Intel PCLMUL shall be used.  */
+#endif
+#ifdef USE_ARM_PMULL
+  unsigned int use_pmull:1;            /* ARMv8 PMULL shall be used. */
 #endif
   byte buf[4];
 }
@@ -61,6 +73,13 @@ void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf,
 				      size_t inlen);
 #endif
 
+#ifdef USE_ARM_PMULL
+/*-- crc-armv8-ce.c --*/
+void _gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen);
+void _gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf,
+					size_t inlen);
+#endif
+
 
 /*
  * Code generated by universal_crc by Danjel McGougan
@@ -361,13 +380,17 @@ static void
 crc32_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
 
   (void)flags;
+  (void)hwf;
 
   ctx->CRC = 0 ^ 0xffffffffL;
 }
@@ -386,6 +409,13 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen)
       return;
     }
 #endif
+#ifdef USE_ARM_PMULL
+  if (ctx->use_pmull)
+    {
+      _gcry_crc32_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+      return;
+    }
+#endif
 
   if (!inbuf || !inlen)
     return;
@@ -439,13 +469,17 @@ static void
 crc32rfc1510_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
 
   (void)flags;
+  (void)hwf;
 
   ctx->CRC = 0;
 }
@@ -769,12 +803,16 @@ static void
 crc24rfc2440_init (void *context, unsigned int flags)
 {
   CRC_CONTEXT *ctx = (CRC_CONTEXT *) context;
-#ifdef USE_INTEL_PCLMUL
   u32 hwf = _gcry_get_hw_features ();
 
+#ifdef USE_INTEL_PCLMUL
   ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL);
 #endif
+#ifdef USE_ARM_PMULL
+  ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL);
+#endif
 
+  (void)hwf;
   (void)flags;
 
   ctx->CRC = crc24_init();
@@ -794,6 +832,13 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen)
       return;
     }
 #endif
+#ifdef USE_ARM_PMULL
+  if (ctx->use_pmull)
+    {
+      _gcry_crc24rfc2440_armv8_ce_pmull(&ctx->CRC, inbuf, inlen);
+      return;
+    }
+#endif
 
   if (!inbuf || !inlen)
     return;
diff --git a/configure.ac b/configure.ac
index 1aafc320a..aa23a5010 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2409,6 +2409,11 @@ if test "$found" = "1" ; then
          # Build with the assembly implementation
          GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo"
       ;;
+      aarch64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo"
+         GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo"
+      ;;
    esac
 fi
 



