[PATCH] Add SM3 x86-64 AVX/BMI2 assembly implementation

Jussi Kivilinna jussi.kivilinna at iki.fi
Sun Dec 12 15:49:55 CET 2021


* cipher/Makefile.am: Add 'sm3-avx-bmi2-amd64.S'.
* cipher/sm3-avx-bmi2-amd64.S: New.
* cipher/sm3.c (USE_AVX_BMI2, ASM_FUNC_ABI, ASM_EXTRA_STACK): New.
(SM3_CONTEXT): Define 'h' as array instead of separate fields 'h0',
'h1', etc.
[USE_AVX_BMI2] (_gcry_sm3_transform_amd64_avx_bmi2)
(do_sm3_transform_amd64_avx_bmi2): New.
(sm3_init): Select AVX/BMI2 transform function if supported by HW; Update
to use 'hd->h' as array.
(transform_blk, sm3_final): Update to use 'hd->h' as array.
* configure.ac: Add 'sm3-avx-bmi2-amd64.lo'.
--
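No API change is involved: sm3_init() selects the accelerated 'bctx.bwrite'
handler at run-time when the required HWF feature flags are present, so
existing callers hit the new code path transparently.  For example, a
hypothetical test caller (names and the expected-output comment are
illustrative, not part of the patch):

  #include <stdio.h>
  #include <gcrypt.h>

  int
  main (void)
  {
    unsigned char digest[32];
    int i;

    gcry_check_version (NULL);   /* initialize libgcrypt */
    gcry_md_hash_buffer (GCRY_MD_SM3, digest, "abc", 3);
    for (i = 0; i < 32; i++)
      printf ("%02x", digest[i]);
    printf ("\n");
    /* SM3("abc") per the specification:
     * 66c7f0f462eeedd9d1f2d46bdc10e4e24167c4875cf2f7a2297da02b8f4ba8e0 */
    return 0;
  }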

Benchmark on AMD Zen3:

 Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      2.18 ns/B     436.6 MiB/s     10.59 c/B      4850

 After (~43% faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      1.52 ns/B     627.4 MiB/s      7.37 c/B      4850


Benchmark on Intel Skylake:

 Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      4.35 ns/B     219.2 MiB/s     13.48 c/B      3098

 After (~34% faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      3.24 ns/B     294.4 MiB/s     10.04 c/B      3098


Benchmark on AMD Zen2:

 Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      2.73 ns/B     348.9 MiB/s     11.86 c/B      4339

 After (~38% faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
 SM3            |      1.97 ns/B     483.0 MiB/s      8.52 c/B      4318
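
For reference, the per-block work done by the new assembly corresponds to the
self-contained scalar C sketch below (illustrative only, not part of the
patch; function and variable names are made up).  Rounds 0-15 map to the
R1/FF1/GG1 macros and rounds 16-63 to R2/FF2/GG2, the K0..K63 table holds
rol32(T, round & 31), and the "W1W2" values kept on the stack are
W[j] ^ W[j+4]:

  #include <stdint.h>

  static uint32_t rol32 (uint32_t x, unsigned int n)
  {
    n &= 31;
    return n ? (x << n) | (x >> (32 - n)) : x;
  }

  static uint32_t P0 (uint32_t x) { return x ^ rol32 (x, 9) ^ rol32 (x, 17); }
  static uint32_t P1 (uint32_t x) { return x ^ rol32 (x, 15) ^ rol32 (x, 23); }

  static void
  sm3_ref_compress (uint32_t h[8], const unsigned char *blk)
  {
    uint32_t W[68], a, b, c, d, e, f, g, hh;
    int j;

    /* Big-endian message load; the assembly does this with vpshufb. */
    for (j = 0; j < 16; j++)
      W[j] = ((uint32_t)blk[4*j] << 24) | ((uint32_t)blk[4*j+1] << 16)
             | ((uint32_t)blk[4*j+2] << 8) | blk[4*j+3];

    /* Message expansion; the SCHED_W_* macros compute this three words at
     * a time in XMM registers. */
    for (j = 16; j < 68; j++)
      W[j] = P1 (W[j-16] ^ W[j-9] ^ rol32 (W[j-3], 15))
             ^ rol32 (W[j-13], 7) ^ W[j-6];

    a = h[0]; b = h[1]; c = h[2]; d = h[3];
    e = h[4]; f = h[5]; g = h[6]; hh = h[7];

    for (j = 0; j < 64; j++)
      {
        uint32_t T = (j < 16) ? 0x79cc4519 : 0x7a879d8a;
        uint32_t ss1 = rol32 (rol32 (a, 12) + e + rol32 (T, j & 31), 7);
        uint32_t ss2 = ss1 ^ rol32 (a, 12);
        uint32_t ff = (j < 16) ? (a ^ b ^ c)
                               : ((a & b) | (a & c) | (b & c));
        uint32_t gg = (j < 16) ? (e ^ f ^ g) : ((e & f) | (~e & g));
        uint32_t tt1 = ff + d + ss2 + (W[j] ^ W[j + 4]);
        uint32_t tt2 = gg + hh + ss1 + W[j];
        d = c; c = rol32 (b, 9); b = a; a = tt1;
        hh = g; g = rol32 (f, 19); f = e; e = P0 (tt2);
      }

    h[0] ^= a; h[1] ^= b; h[2] ^= c; h[3] ^= d;
    h[4] ^= e; h[5] ^= f; h[6] ^= g; h[7] ^= hh;
  }

In the assembly the end-of-round shuffling of the eight state words is done
by rotating the register arguments across successive R1()/R2() invocations
instead of moving values, and P0 is applied directly to the updated 'h'
register.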


Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/Makefile.am          |   2 +-
 cipher/sm3-avx-bmi2-amd64.S | 560 ++++++++++++++++++++++++++++++++++++
 cipher/sm3.c                |  99 +++++--
 configure.ac                |  19 +-
 4 files changed, 647 insertions(+), 33 deletions(-)
 create mode 100644 cipher/sm3-avx-bmi2-amd64.S

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 6c5d48b9..73457a91 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -128,7 +128,7 @@ EXTRA_libcipher_la_SOURCES = \
 	sha512-avx2-bmi2-amd64.S \
 	sha512-armv7-neon.S sha512-arm.S \
 	sha512-ppc.c sha512-ssse3-i386.c \
-	sm3.c \
+	sm3.c sm3-avx-bmi2-amd64.S \
 	keccak.c keccak_permute_32.h keccak_permute_64.h keccak-armv7-neon.S \
 	stribog.c \
 	tiger.c \
diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S
new file mode 100644
index 00000000..5be83ca4
--- /dev/null
+++ b/cipher/sm3-avx-bmi2-amd64.S
@@ -0,0 +1,560 @@
+/* sm3-avx-bmi2-amd64.S - Intel AVX/BMI2 accelerated SM3 transform function
+ * Copyright (C) 2021 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+    defined(HAVE_GCC_INLINE_ASM_AVX) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+    defined(USE_SM3)
+
+#include "asm-common-amd64.h"
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+#define state_h5 20
+#define state_h6 24
+#define state_h7 28
+
+/* Constants */
+
+.text
+.align 16
+ELF(.type _gcry_sm3_avx2_consts,@object)
+_gcry_sm3_avx2_consts:
+.Lbe32mask:
+  .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+ELF(.size _gcry_sm3_avx2_consts,.-_gcry_sm3_avx2_consts)
+
+/* Round constant macros */
+
+#define K0  0x79cc4519
+#define K1  0xf3988a32
+#define K2  0xe7311465
+#define K3  0xce6228cb
+#define K4  0x9cc45197
+#define K5  0x3988a32f
+#define K6  0x7311465e
+#define K7  0xe6228cbc
+#define K8  0xcc451979
+#define K9  0x988a32f3
+#define K10 0x311465e7
+#define K11 0x6228cbce
+#define K12 0xc451979c
+#define K13 0x88a32f39
+#define K14 0x11465e73
+#define K15 0x228cbce6
+#define K16 0x9d8a7a87
+#define K17 0x3b14f50f
+#define K18 0x7629ea1e
+#define K19 0xec53d43c
+#define K20 0xd8a7a879
+#define K21 0xb14f50f3
+#define K22 0x629ea1e7
+#define K23 0xc53d43ce
+#define K24 0x8a7a879d
+#define K25 0x14f50f3b
+#define K26 0x29ea1e76
+#define K27 0x53d43cec
+#define K28 0xa7a879d8
+#define K29 0x4f50f3b1
+#define K30 0x9ea1e762
+#define K31 0x3d43cec5
+#define K32 0x7a879d8a
+#define K33 0xf50f3b14
+#define K34 0xea1e7629
+#define K35 0xd43cec53
+#define K36 0xa879d8a7
+#define K37 0x50f3b14f
+#define K38 0xa1e7629e
+#define K39 0x43cec53d
+#define K40 0x879d8a7a
+#define K41 0x0f3b14f5
+#define K42 0x1e7629ea
+#define K43 0x3cec53d4
+#define K44 0x79d8a7a8
+#define K45 0xf3b14f50
+#define K46 0xe7629ea1
+#define K47 0xcec53d43
+#define K48 0x9d8a7a87
+#define K49 0x3b14f50f
+#define K50 0x7629ea1e
+#define K51 0xec53d43c
+#define K52 0xd8a7a879
+#define K53 0xb14f50f3
+#define K54 0x629ea1e7
+#define K55 0xc53d43ce
+#define K56 0x8a7a879d
+#define K57 0x14f50f3b
+#define K58 0x29ea1e76
+#define K59 0x53d43cec
+#define K60 0xa7a879d8
+#define K61 0x4f50f3b1
+#define K62 0x9ea1e762
+#define K63 0x3d43cec5
+
+/* Register macros */
+
+#define RSTATE %rdi
+#define RDATA  %rsi
+#define RNBLKS %rdx
+
+#define t0 %eax
+#define t1 %ebx
+#define t2 %ecx
+
+#define a %r8d
+#define b %r9d
+#define c %r10d
+#define d %r11d
+#define e %r12d
+#define f %r13d
+#define g %r14d
+#define h %r15d
+
+#define W0 %xmm0
+#define W1 %xmm1
+#define W2 %xmm2
+#define W3 %xmm3
+#define W4 %xmm4
+#define W5 %xmm5
+
+#define XTMP0 %xmm6
+#define XTMP1 %xmm7
+#define XTMP2 %xmm8
+#define XTMP3 %xmm9
+#define XTMP4 %xmm10
+#define XTMP5 %xmm11
+#define XTMP6 %xmm12
+#define XTMP7 %xmm13
+#define XTMP8 %xmm14
+
+#define BSWAP_REG %xmm15
+
+/* Stack structure */
+
+#define STACK_W_SIZE        (32 * 2 * 3)
+#define STACK_REG_SAVE_SIZE (64)
+
+#define STACK_W             (0)
+#define STACK_REG_SAVE      (STACK_W + STACK_W_SIZE)
+#define STACK_SIZE          (STACK_REG_SAVE + STACK_REG_SAVE_SIZE)
+
+/* Instruction helpers. */
+
+#define roll2(v, reg) \
+        roll $(v), reg;
+
+#define roll3mov(v, src, dst) \
+        movl src, dst; \
+        roll $(v), dst;
+
+#define roll3(v, src, dst) \
+        rorxl $(32-(v)), src, dst;
+
+#define addl2(a, out) \
+        leal (a, out), out;
+
+#define addl3(a, b, out) \
+        leal (b, a), out;
+
+/* Round function macros. */
+
+#define GG1(x, y, z, o, t) \
+        movl x, o; \
+        xorl y, o; \
+        xorl z, o;
+
+#define FF1(x, y, z, o, t) GG1(x, y, z, o, t)
+
+#define GG2(x, y, z, o, t) \
+        andnl z, x, o; \
+        movl y, t; \
+        andl x, t; \
+        addl2(t, o);
+
+#define FF2(x, y, z, o, t) \
+        movl y, o; \
+        xorl x, o; \
+        movl y, t; \
+        andl x, t; \
+        andl z, o; \
+        xorl t, o;
+
+#define R(i, a, b, c, d, e, f, g, h, round, widx, wtype) \
+        /* rol(a, 12) => t0 */ \
+          roll3mov(12, a, t0); /* rorxl here would reduce perf by 6% on zen3 */ \
+        /* rol (t0 + e + t), 7) => t1 */ \
+          leal K##round(t0, e, 1), t1; \
+          roll2(7, t1); \
+        /* h + w1 => h */ \
+          addl wtype##_W1_ADDR(round, widx), h; \
+        /* h + t1 => h */ \
+          addl2(t1, h); \
+        /* t1 ^ t0 => t0 */ \
+          xorl t1, t0; \
+        /* w1w2 + d => d */ \
+          addl wtype##_W1W2_ADDR(round, widx), d; \
+        /* FF##i(a,b,c) => t1 */ \
+          FF##i(a, b, c, t1, t2); \
+        /* d + t1 => d */ \
+          addl2(t1, d); \
+        /* GG#i(e,f,g) => t2 */ \
+          GG##i(e, f, g, t2, t1); \
+        /* h + t2 => h */ \
+          addl2(t2, h); \
+        /* rol (f, 19) => f */ \
+          roll2(19, f); \
+        /* d + t0 => d */ \
+          addl2(t0, d); \
+        /* rol (b, 9) => b */ \
+          roll2(9, b); \
+        /* P0(h) => h */ \
+          roll3(9, h, t2); \
+          roll3(17, h, t1); \
+          xorl t2, h; \
+          xorl t1, h;
+
+#define R1(a, b, c, d, e, f, g, h, round, widx, wtype) \
+        R(1, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+#define R2(a, b, c, d, e, f, g, h, round, widx, wtype) \
+        R(2, a, b, c, d, e, f, g, h, round, widx, wtype)
+
+/* Input expansion macros. */
+
+/* Byte-swapped input address. */
+#define IW_W_ADDR(round, widx, offs) \
+        (STACK_W + ((round) / 4) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Expanded input address. */
+#define XW_W_ADDR(round, widx, offs) \
+        (STACK_W + ((((round) / 3) - 4) % 2) * 64 + (offs) + ((widx) * 4))(%rsp)
+
+/* Rounds 1-12, byte-swapped input block addresses. */
+#define IW_W1_ADDR(round, widx)   IW_W_ADDR(round, widx, 0)
+#define IW_W1W2_ADDR(round, widx) IW_W_ADDR(round, widx, 32)
+
+/* Rounds 1-12, expanded input block addresses. */
+#define XW_W1_ADDR(round, widx)   XW_W_ADDR(round, widx, 0)
+#define XW_W1W2_ADDR(round, widx) XW_W_ADDR(round, widx, 32)
+
+/* Input block loading. */
+#define LOAD_W_XMM_1() \
+        vmovdqu 0*16(RDATA), XTMP0; \
+        vmovdqu 1*16(RDATA), XTMP1; \
+        vmovdqu 2*16(RDATA), XTMP2; \
+        vmovdqu 3*16(RDATA), XTMP3; \
+        vpshufb BSWAP_REG, XTMP0, XTMP0; \
+        vpshufb BSWAP_REG, XTMP1, XTMP1; \
+        vpshufb BSWAP_REG, XTMP2, XTMP2; \
+        vpshufb BSWAP_REG, XTMP3, XTMP3; \
+        vpxor XTMP0, XTMP1, XTMP4; \
+        vpxor XTMP1, XTMP2, XTMP5; \
+        vpxor XTMP2, XTMP3, XTMP6; \
+        leaq 64(RDATA), RDATA; \
+        vmovdqa XTMP0, IW_W1_ADDR(0, 0); \
+        vmovdqa XTMP4, IW_W1W2_ADDR(0, 0); \
+        vmovdqa XTMP1, IW_W1_ADDR(4, 0); \
+        vmovdqa XTMP5, IW_W1W2_ADDR(4, 0);
+
+#define LOAD_W_XMM_2() \
+        vmovdqa XTMP2, IW_W1_ADDR(8, 0); \
+        vmovdqa XTMP6, IW_W1W2_ADDR(8, 0);
+
+#define LOAD_W_XMM_3() \
+        vpshufd $0b00000000, XTMP0, W0; \
+        vpshufd $0b11111001, XTMP0, W1; \
+        vmovdqa XTMP1, W2; \
+        vpalignr $12, XTMP1, XTMP2, W3; \
+        vpalignr $8, XTMP2, XTMP3, W4; \
+        vpshufd $0b11111001, XTMP3, W5;
+
+/* Message scheduling. Note: 3 words per XMM register. */
+#define SCHED_W_0(round, w0, w1, w2, w3, w4, w5) \
+        /* Load (w[i - 16]) => XTMP0 */ \
+        vpshufd $0b10111111, w0, XTMP0; \
+        vpalignr $12, XTMP0, w1, XTMP0; \
+        /* Load (w[i - 13]) => XTMP1 */ \
+        vpshufd $0b10111111, w1, XTMP1; \
+        vpalignr $12, XTMP1, w2, XTMP1; \
+        /* w[i - 9] == w3 */ \
+        /* XMM3 ^ XTMP0 => XTMP0 */ \
+        vpxor w3, XTMP0, XTMP0;
+
+#define SCHED_W_1(round, w0, w1, w2, w3, w4, w5) \
+        /* w[i - 3] == w5 */ \
+        /* rol(XMM5, 15) ^ XTMP0 => XTMP0 */ \
+        vpslld $15, w5, XTMP2; \
+        vpsrld $(32-15), w5, XTMP3; \
+        vpxor XTMP2, XTMP3, XTMP3; \
+        vpxor XTMP3, XTMP0, XTMP0; \
+        /* rol(XTMP1, 7) => XTMP1 */ \
+        vpslld $7, XTMP1, XTMP5; \
+        vpsrld $(32-7), XTMP1, XTMP1; \
+        vpxor XTMP5, XTMP1, XTMP1; \
+        /* XMM4 ^ XTMP1 => XTMP1 */ \
+        vpxor w4, XTMP1, XTMP1; \
+        /* w[i - 6] == XMM4 */ \
+        /* P1(XTMP0) ^ XTMP1 => XMM0 */ \
+        vpslld $15, XTMP0, XTMP5; \
+        vpsrld $(32-15), XTMP0, XTMP6; \
+        vpslld $23, XTMP0, XTMP2; \
+        vpsrld $(32-23), XTMP0, XTMP3; \
+        vpxor XTMP0, XTMP1, XTMP1; \
+        vpxor XTMP6, XTMP5, XTMP5; \
+        vpxor XTMP3, XTMP2, XTMP2; \
+        vpxor XTMP2, XTMP5, XTMP5; \
+        vpxor XTMP5, XTMP1, w0;
+
+#define SCHED_W_2(round, w0, w1, w2, w3, w4, w5) \
+        /* W1 in XMM12 */ \
+        vpshufd $0b10111111, w4, XTMP4; \
+        vpalignr $12, XTMP4, w5, XTMP4; \
+        vmovdqa XTMP4, XW_W1_ADDR((round), 0); \
+        /* W1 ^ W2 => XTMP1 */ \
+        vpxor w0, XTMP4, XTMP1; \
+        vmovdqa XTMP1, XW_W1W2_ADDR((round), 0);
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sm3_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data,
+ *                                     size_t nblks)
+ */
+.globl _gcry_sm3_transform_amd64_avx_bmi2
+ELF(.type _gcry_sm3_transform_amd64_avx_bmi2,@function)
+.align 16
+_gcry_sm3_transform_amd64_avx_bmi2:
+  /* input:
+   *	%rdi: ctx, CTX
+   *	%rsi: data (64*nblks bytes)
+   *	%rdx: nblks
+   */
+  CFI_STARTPROC();
+
+  vzeroupper;
+
+  pushq %rbp;
+  CFI_PUSH(%rbp);
+  movq %rsp, %rbp;
+  CFI_DEF_CFA_REGISTER(%rbp);
+
+  movq %rdx, RNBLKS;
+
+  subq $STACK_SIZE, %rsp;
+  andq $(~63), %rsp;
+
+  movq %rbx, (STACK_REG_SAVE + 0 * 8)(%rsp);
+  CFI_REL_OFFSET(%rbx, STACK_REG_SAVE + 0 * 8);
+  movq %r15, (STACK_REG_SAVE + 1 * 8)(%rsp);
+  CFI_REL_OFFSET(%r15, STACK_REG_SAVE + 1 * 8);
+  movq %r14, (STACK_REG_SAVE + 2 * 8)(%rsp);
+  CFI_REL_OFFSET(%r14, STACK_REG_SAVE + 2 * 8);
+  movq %r13, (STACK_REG_SAVE + 3 * 8)(%rsp);
+  CFI_REL_OFFSET(%r13, STACK_REG_SAVE + 3 * 8);
+  movq %r12, (STACK_REG_SAVE + 4 * 8)(%rsp);
+  CFI_REL_OFFSET(%r12, STACK_REG_SAVE + 4 * 8);
+
+  vmovdqa .Lbe32mask rRIP, BSWAP_REG;
+
+  /* Get the values of the chaining variables. */
+  movl state_h0(RSTATE), a;
+  movl state_h1(RSTATE), b;
+  movl state_h2(RSTATE), c;
+  movl state_h3(RSTATE), d;
+  movl state_h4(RSTATE), e;
+  movl state_h5(RSTATE), f;
+  movl state_h6(RSTATE), g;
+  movl state_h7(RSTATE), h;
+
+.align 16
+.Loop:
+  /* Load data part1. */
+  LOAD_W_XMM_1();
+
+  leaq -1(RNBLKS), RNBLKS;
+
+  /* Transform 0-3 + Load data part2. */
+  R1(a, b, c, d, e, f, g, h, 0, 0, IW); LOAD_W_XMM_2();
+  R1(d, a, b, c, h, e, f, g, 1, 1, IW);
+  R1(c, d, a, b, g, h, e, f, 2, 2, IW);
+  R1(b, c, d, a, f, g, h, e, 3, 3, IW); LOAD_W_XMM_3();
+
+  /* Transform 4-7 + Load data[48-63]. */
+  R1(a, b, c, d, e, f, g, h, 4, 0, IW);
+  R1(d, a, b, c, h, e, f, g, 5, 1, IW);
+  R1(c, d, a, b, g, h, e, f, 6, 2, IW); SCHED_W_0(12, W0, W1, W2, W3, W4, W5);
+  R1(b, c, d, a, f, g, h, e, 7, 3, IW); SCHED_W_1(12, W0, W1, W2, W3, W4, W5);
+
+  /* Transform 8-11 + Precalc 12-14. */
+  R1(a, b, c, d, e, f, g, h, 8, 0, IW); SCHED_W_2(12, W0, W1, W2, W3, W4, W5);
+  R1(d, a, b, c, h, e, f, g, 9, 1, IW); SCHED_W_0(15, W1, W2, W3, W4, W5, W0);
+  R1(c, d, a, b, g, h, e, f, 10, 2, IW); SCHED_W_1(15, W1, W2, W3, W4, W5, W0);
+  R1(b, c, d, a, f, g, h, e, 11, 3, IW); SCHED_W_2(15, W1, W2, W3, W4, W5, W0);
+
+  /* Transform 12-14 + Precalc 15-17 */
+  R1(a, b, c, d, e, f, g, h, 12, 0, XW); SCHED_W_0(18, W2, W3, W4, W5, W0, W1);
+  R1(d, a, b, c, h, e, f, g, 13, 1, XW); SCHED_W_1(18, W2, W3, W4, W5, W0, W1);
+  R1(c, d, a, b, g, h, e, f, 14, 2, XW); SCHED_W_2(18, W2, W3, W4, W5, W0, W1);
+
+  /* Transform 15-17 + Precalc 18-20 */
+  R1(b, c, d, a, f, g, h, e, 15, 0, XW); SCHED_W_0(21, W3, W4, W5, W0, W1, W2);
+  R2(a, b, c, d, e, f, g, h, 16, 1, XW); SCHED_W_1(21, W3, W4, W5, W0, W1, W2);
+  R2(d, a, b, c, h, e, f, g, 17, 2, XW); SCHED_W_2(21, W3, W4, W5, W0, W1, W2);
+
+  /* Transform 18-20 + Precalc 21-23 */
+  R2(c, d, a, b, g, h, e, f, 18, 0, XW); SCHED_W_0(24, W4, W5, W0, W1, W2, W3);
+  R2(b, c, d, a, f, g, h, e, 19, 1, XW); SCHED_W_1(24, W4, W5, W0, W1, W2, W3);
+  R2(a, b, c, d, e, f, g, h, 20, 2, XW); SCHED_W_2(24, W4, W5, W0, W1, W2, W3);
+
+  /* Transform 21-23 + Precalc 24-26 */
+  R2(d, a, b, c, h, e, f, g, 21, 0, XW); SCHED_W_0(27, W5, W0, W1, W2, W3, W4);
+  R2(c, d, a, b, g, h, e, f, 22, 1, XW); SCHED_W_1(27, W5, W0, W1, W2, W3, W4);
+  R2(b, c, d, a, f, g, h, e, 23, 2, XW); SCHED_W_2(27, W5, W0, W1, W2, W3, W4);
+
+  /* Transform 24-26 + Precalc 27-29 */
+  R2(a, b, c, d, e, f, g, h, 24, 0, XW); SCHED_W_0(30, W0, W1, W2, W3, W4, W5);
+  R2(d, a, b, c, h, e, f, g, 25, 1, XW); SCHED_W_1(30, W0, W1, W2, W3, W4, W5);
+  R2(c, d, a, b, g, h, e, f, 26, 2, XW); SCHED_W_2(30, W0, W1, W2, W3, W4, W5);
+
+  /* Transform 27-29 + Precalc 30-32 */
+  R2(b, c, d, a, f, g, h, e, 27, 0, XW); SCHED_W_0(33, W1, W2, W3, W4, W5, W0);
+  R2(a, b, c, d, e, f, g, h, 28, 1, XW); SCHED_W_1(33, W1, W2, W3, W4, W5, W0);
+  R2(d, a, b, c, h, e, f, g, 29, 2, XW); SCHED_W_2(33, W1, W2, W3, W4, W5, W0);
+
+  /* Transform 30-32 + Precalc 33-35 */
+  R2(c, d, a, b, g, h, e, f, 30, 0, XW); SCHED_W_0(36, W2, W3, W4, W5, W0, W1);
+  R2(b, c, d, a, f, g, h, e, 31, 1, XW); SCHED_W_1(36, W2, W3, W4, W5, W0, W1);
+  R2(a, b, c, d, e, f, g, h, 32, 2, XW); SCHED_W_2(36, W2, W3, W4, W5, W0, W1);
+
+  /* Transform 33-35 + Precalc 36-38 */
+  R2(d, a, b, c, h, e, f, g, 33, 0, XW); SCHED_W_0(39, W3, W4, W5, W0, W1, W2);
+  R2(c, d, a, b, g, h, e, f, 34, 1, XW); SCHED_W_1(39, W3, W4, W5, W0, W1, W2);
+  R2(b, c, d, a, f, g, h, e, 35, 2, XW); SCHED_W_2(39, W3, W4, W5, W0, W1, W2);
+
+  /* Transform 36-38 + Precalc 39-41 */
+  R2(a, b, c, d, e, f, g, h, 36, 0, XW); SCHED_W_0(42, W4, W5, W0, W1, W2, W3);
+  R2(d, a, b, c, h, e, f, g, 37, 1, XW); SCHED_W_1(42, W4, W5, W0, W1, W2, W3);
+  R2(c, d, a, b, g, h, e, f, 38, 2, XW); SCHED_W_2(42, W4, W5, W0, W1, W2, W3);
+
+  /* Transform 39-41 + Precalc 42-44 */
+  R2(b, c, d, a, f, g, h, e, 39, 0, XW); SCHED_W_0(45, W5, W0, W1, W2, W3, W4);
+  R2(a, b, c, d, e, f, g, h, 40, 1, XW); SCHED_W_1(45, W5, W0, W1, W2, W3, W4);
+  R2(d, a, b, c, h, e, f, g, 41, 2, XW); SCHED_W_2(45, W5, W0, W1, W2, W3, W4);
+
+  /* Transform 42-44 + Precalc 45-47 */
+  R2(c, d, a, b, g, h, e, f, 42, 0, XW); SCHED_W_0(48, W0, W1, W2, W3, W4, W5);
+  R2(b, c, d, a, f, g, h, e, 43, 1, XW); SCHED_W_1(48, W0, W1, W2, W3, W4, W5);
+  R2(a, b, c, d, e, f, g, h, 44, 2, XW); SCHED_W_2(48, W0, W1, W2, W3, W4, W5);
+
+  /* Transform 45-47 + Precalc 48-50 */
+  R2(d, a, b, c, h, e, f, g, 45, 0, XW); SCHED_W_0(51, W1, W2, W3, W4, W5, W0);
+  R2(c, d, a, b, g, h, e, f, 46, 1, XW); SCHED_W_1(51, W1, W2, W3, W4, W5, W0);
+  R2(b, c, d, a, f, g, h, e, 47, 2, XW); SCHED_W_2(51, W1, W2, W3, W4, W5, W0);
+
+  /* Transform 48-50 + Precalc 51-53 */
+  R2(a, b, c, d, e, f, g, h, 48, 0, XW); SCHED_W_0(54, W2, W3, W4, W5, W0, W1);
+  R2(d, a, b, c, h, e, f, g, 49, 1, XW); SCHED_W_1(54, W2, W3, W4, W5, W0, W1);
+  R2(c, d, a, b, g, h, e, f, 50, 2, XW); SCHED_W_2(54, W2, W3, W4, W5, W0, W1);
+
+  /* Transform 51-53 + Precalc 54-56 */
+  R2(b, c, d, a, f, g, h, e, 51, 0, XW); SCHED_W_0(57, W3, W4, W5, W0, W1, W2);
+  R2(a, b, c, d, e, f, g, h, 52, 1, XW); SCHED_W_1(57, W3, W4, W5, W0, W1, W2);
+  R2(d, a, b, c, h, e, f, g, 53, 2, XW); SCHED_W_2(57, W3, W4, W5, W0, W1, W2);
+
+  /* Transform 54-56 + Precalc 57-59 */
+  R2(c, d, a, b, g, h, e, f, 54, 0, XW); SCHED_W_0(60, W4, W5, W0, W1, W2, W3);
+  R2(b, c, d, a, f, g, h, e, 55, 1, XW); SCHED_W_1(60, W4, W5, W0, W1, W2, W3);
+  R2(a, b, c, d, e, f, g, h, 56, 2, XW); SCHED_W_2(60, W4, W5, W0, W1, W2, W3);
+
+  /* Transform 57-59 + Precalc 60-62 */
+  R2(d, a, b, c, h, e, f, g, 57, 0, XW); SCHED_W_0(63, W5, W0, W1, W2, W3, W4);
+  R2(c, d, a, b, g, h, e, f, 58, 1, XW);
+  R2(b, c, d, a, f, g, h, e, 59, 2, XW); SCHED_W_1(63, W5, W0, W1, W2, W3, W4);
+
+  /* Transform 60-62 + Precalc 63-65 */
+  R2(a, b, c, d, e, f, g, h, 60, 0, XW);
+  R2(d, a, b, c, h, e, f, g, 61, 1, XW); SCHED_W_2(63, W5, W0, W1, W2, W3, W4);
+  R2(c, d, a, b, g, h, e, f, 62, 2, XW);
+
+  /* Transform 63 */
+  R2(b, c, d, a, f, g, h, e, 63, 0, XW);
+
+  /* Update the chaining variables. */
+  xorl state_h0(RSTATE), a;
+  xorl state_h1(RSTATE), b;
+  xorl state_h2(RSTATE), c;
+  xorl state_h3(RSTATE), d;
+  movl a, state_h0(RSTATE);
+  movl b, state_h1(RSTATE);
+  movl c, state_h2(RSTATE);
+  movl d, state_h3(RSTATE);
+  xorl state_h4(RSTATE), e;
+  xorl state_h5(RSTATE), f;
+  xorl state_h6(RSTATE), g;
+  xorl state_h7(RSTATE), h;
+  movl e, state_h4(RSTATE);
+  movl f, state_h5(RSTATE);
+  movl g, state_h6(RSTATE);
+  movl h, state_h7(RSTATE);
+
+  cmpq $0, RNBLKS;
+  jne .Loop;
+
+.align 16
+.Lend:
+  vzeroall;
+
+  movq (STACK_REG_SAVE + 0 * 8)(%rsp), %rbx;
+  CFI_RESTORE(%rbx);
+  movq (STACK_REG_SAVE + 1 * 8)(%rsp), %r15;
+  CFI_RESTORE(%r15);
+  movq (STACK_REG_SAVE + 2 * 8)(%rsp), %r14;
+  CFI_RESTORE(%r14);
+  movq (STACK_REG_SAVE + 3 * 8)(%rsp), %r13;
+  CFI_RESTORE(%r13);
+  movq (STACK_REG_SAVE + 4 * 8)(%rsp), %r12;
+  CFI_RESTORE(%r12);
+
+  vmovdqa %xmm0, IW_W1_ADDR(0, 0);
+  vmovdqa %xmm0, IW_W1W2_ADDR(0, 0);
+  vmovdqa %xmm0, IW_W1_ADDR(4, 0);
+  vmovdqa %xmm0, IW_W1W2_ADDR(4, 0);
+  vmovdqa %xmm0, IW_W1_ADDR(8, 0);
+  vmovdqa %xmm0, IW_W1W2_ADDR(8, 0);
+  xorl %eax, %eax; /* stack burned */
+
+  leave;
+  CFI_LEAVE();
+  ret;
+  CFI_ENDPROC();
+ELF(.size _gcry_sm3_transform_amd64_avx_bmi2,
+          .-_gcry_sm3_transform_amd64_avx_bmi2;)
+
+#endif
+#endif
diff --git a/cipher/sm3.c b/cipher/sm3.c
index d52a7494..05b7b259 100644
--- a/cipher/sm3.c
+++ b/cipher/sm3.c
@@ -47,12 +47,54 @@
 #include "hash-common.h"
 
 
+/* USE_AVX_BMI2 indicates whether to compile with Intel AVX/BMI2 code. */
+#undef USE_AVX_BMI2
+#if defined(__x86_64__) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+    defined(HAVE_GCC_INLINE_ASM_BMI2) && \
+    (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+     defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS))
+# define USE_AVX_BMI2 1
+#endif
+
+
 typedef struct {
   gcry_md_block_ctx_t bctx;
-  u32  h0,h1,h2,h3,h4,h5,h6,h7;
+  u32 h[8];
 } SM3_CONTEXT;
 
 
+/* AMD64 assembly implementations use SystemV ABI, ABI conversion and additional
+ * stack to store XMM6-XMM15 needed on Win64. */
+#undef ASM_FUNC_ABI
+#undef ASM_EXTRA_STACK
+#if defined(USE_AVX_BMI2)
+# ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS
+#  define ASM_FUNC_ABI __attribute__((sysv_abi))
+#  define ASM_EXTRA_STACK (10 * 16 + 4 * sizeof(void *))
+# else
+#  define ASM_FUNC_ABI
+#  define ASM_EXTRA_STACK 0
+# endif
+#endif
+
+
+#ifdef USE_AVX_BMI2
+unsigned int _gcry_sm3_transform_amd64_avx_bmi2(void *state,
+                                                const void *input_data,
+                                                size_t num_blks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sm3_transform_amd64_avx_bmi2(void *context, const unsigned char *data,
+                                size_t nblks)
+{
+  SM3_CONTEXT *hd = context;
+  unsigned int nburn = _gcry_sm3_transform_amd64_avx_bmi2 (hd->h, data, nblks);
+  nburn += nburn ? ASM_EXTRA_STACK : 0;
+  return nburn;
+}
+#endif /* USE_AVX_BMI2 */
+
+
 static unsigned int
 transform (void *c, const unsigned char *data, size_t nblks);
 
@@ -65,14 +107,14 @@ sm3_init (void *context, unsigned int flags)
 
   (void)flags;
 
-  hd->h0 = 0x7380166f;
-  hd->h1 = 0x4914b2b9;
-  hd->h2 = 0x172442d7;
-  hd->h3 = 0xda8a0600;
-  hd->h4 = 0xa96f30bc;
-  hd->h5 = 0x163138aa;
-  hd->h6 = 0xe38dee4d;
-  hd->h7 = 0xb0fb0e4e;
+  hd->h[0] = 0x7380166f;
+  hd->h[1] = 0x4914b2b9;
+  hd->h[2] = 0x172442d7;
+  hd->h[3] = 0xda8a0600;
+  hd->h[4] = 0xa96f30bc;
+  hd->h[5] = 0x163138aa;
+  hd->h[6] = 0xe38dee4d;
+  hd->h[7] = 0xb0fb0e4e;
 
   hd->bctx.nblocks = 0;
   hd->bctx.nblocks_high = 0;
@@ -80,6 +122,11 @@ sm3_init (void *context, unsigned int flags)
   hd->bctx.blocksize_shift = _gcry_ctz(64);
   hd->bctx.bwrite = transform;
 
+#ifdef USE_AVX_BMI2
+  if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2))
+    hd->bctx.bwrite = do_sm3_transform_amd64_avx_bmi2;
+#endif
+
   (void)features;
 }
 
@@ -146,14 +193,14 @@ transform_blk (void *ctx, const unsigned char *data)
   u32 a,b,c,d,e,f,g,h,ss1,ss2;
   u32 w[16];
 
-  a = hd->h0;
-  b = hd->h1;
-  c = hd->h2;
-  d = hd->h3;
-  e = hd->h4;
-  f = hd->h5;
-  g = hd->h6;
-  h = hd->h7;
+  a = hd->h[0];
+  b = hd->h[1];
+  c = hd->h[2];
+  d = hd->h[3];
+  e = hd->h[4];
+  f = hd->h[5];
+  g = hd->h[6];
+  h = hd->h[7];
 
   R1(a, b, c, d, e, f, g, h, K[0], I(0), I(4));
   R1(d, a, b, c, h, e, f, g, K[1], I(1), I(5));
@@ -223,14 +270,14 @@ transform_blk (void *ctx, const unsigned char *data)
   R2(c, d, a, b, g, h, e, f, K[62], W1(62), W2(66));
   R2(b, c, d, a, f, g, h, e, K[63], W1(63), W2(67));
 
-  hd->h0 ^= a;
-  hd->h1 ^= b;
-  hd->h2 ^= c;
-  hd->h3 ^= d;
-  hd->h4 ^= e;
-  hd->h5 ^= f;
-  hd->h6 ^= g;
-  hd->h7 ^= h;
+  hd->h[0] ^= a;
+  hd->h[1] ^= b;
+  hd->h[2] ^= c;
+  hd->h[3] ^= d;
+  hd->h[4] ^= e;
+  hd->h[5] ^= f;
+  hd->h[6] ^= g;
+  hd->h[7] ^= h;
 
   return /*burn_stack*/ 26*4+32;
 }
@@ -313,7 +360,7 @@ sm3_final(void *context)
     }
 
   p = hd->bctx.buf;
-#define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0)
+#define X(a) do { buf_put_be32(p, hd->h[a]); p += 4; } while(0)
   X(0);
   X(1);
   X(2);
diff --git a/configure.ac b/configure.ac
index 952da248..50a52015 100644
--- a/configure.ac
+++ b/configure.ac
@@ -3026,6 +3026,19 @@ if test "$found" = "1" ; then
    esac
 fi
 
+LIST_MEMBER(sm3, $enabled_digests)
+if test "$found" = "1" ; then
+   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
+   AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
+
+   case "${host}" in
+      x86_64-*-*)
+         # Build with the assembly implementation
+         GCRYPT_ASM_DIGESTS="$GCRYPT_ASM_DIGESTS sm3-avx-bmi2-amd64.lo"
+      ;;
+   esac
+fi
+
 # SHA-1 needs to be included always for example because it is used by
 # random-csprng.c.
 GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1.lo"
@@ -3081,12 +3094,6 @@ case "${host}" in
   ;;
 esac
 
-LIST_MEMBER(sm3, $enabled_digests)
-if test "$found" = "1" ; then
-   GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
-   AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
-fi
-
 LIST_MEMBER(scrypt, $enabled_kdfs)
 if test "$found" = "1" ; then
    GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
-- 
2.32.0