[PATCH 1/7] Add AVX2/BMI2 implementation of SHA1
Jussi Kivilinna
jussi.kivilinna@iki.fi
Fri Apr 5 19:25:44 CEST 2019
* cipher/Makefile.am: Add 'sha1-avx2-bmi2-amd64.S'.
* cipher/hash-common.h (MD_BLOCK_CTX_BUFFER_SIZE): New.
(gcry_md_block_ctx): Change buffer length to MD_BLOCK_CTX_BUFFER_SIZE.
* cipher/sha1-avx-amd64.S: Add missing .size for transform function.
* cipher/sha1-ssse3-amd64.S: Add missing .size for transform function.
* cipher/sha1-avx-bmi2-amd64.S: Add missing .size for transform
function; tweak implementation for a small ~1% speed increase.
* cipher/sha1-avx2-bmi2-amd64.S: New.
* cipher/sha1.c (USE_AVX2, _gcry_sha1_transform_amd64_avx2_bmi2)
(do_sha1_transform_amd64_avx2_bmi2): New.
(sha1_init) [USE_AVX2]: Enable AVX2 implementation if supported by
HW features.
(sha1_final): Merge processing of the last two blocks when an extra
block is needed.
--
Benchmarks on Intel Haswell (4.0 GHz):
Before (AVX/BMI2):
| nanosecs/byte mebibytes/sec cycles/byte
SHA1 | 0.970 ns/B 983.2 MiB/s 3.88 c/B
After (AVX/BMI2, ~1% faster):
| nanosecs/byte mebibytes/sec cycles/byte
SHA1 | 0.960 ns/B 993.1 MiB/s 3.84 c/B
After (AVX2/BMI2, ~9% faster):
| nanosecs/byte mebibytes/sec cycles/byte
SHA1 | 0.890 ns/B 1071 MiB/s 3.56 c/B
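
The new AVX2 code (and, with the register roles arranged slightly
differently, the tweaked AVX/BMI2 code) defers the ROL(a,5) addition of
each round to the start of the following round, which is what the new
'ne' register carries between rounds.  A self-contained C model of that
round structure (illustration only; wk[] is assumed to already hold
W[i]+K as prepared by the precalc macros):

#include <stdint.h>

static inline uint32_t rol32 (uint32_t x, int n)
{
  return (x << n) | (x >> (32 - n));
}

static void
sha1_block_deferred (uint32_t h[5], const uint32_t wk[80])
{
  uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
  uint32_t ne = 0;  /* pending ROL(a,5) term; zero before round 0 */
  int i;

  for (i = 0; i < 80; i++)
    {
      uint32_t f = (i < 20) ? ((b & c) | (~b & d))
                 : (i < 40) ? (b ^ c ^ d)
                 : (i < 60) ? ((b & c) | (b & d) | (c & d))
                            : (b ^ c ^ d);
      uint32_t t;

      a += ne;            /* fold in the rotate deferred from round i-1 */
      e += wk[i] + f;     /* everything except ROL(a,5) */
      ne = rol32 (a, 5);  /* deferred; added to this e in round i+1 */
      b = rol32 (b, 30);

      t = e; e = d; d = c; c = b; b = a; a = t;
    }
  a += ne;                /* term still pending after round 79 */

  h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}
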
Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
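
A note on the sha1_final() change: with the context buffer enlarged to
MD_BLOCK_CTX_BUFFER_SIZE (128) bytes, the case that needs an extra block
can place the 0x80 pad byte, the zero padding and the 64-bit big-endian
bit count for both of the last two blocks into the buffer and hand them
to bctx.bwrite in a single two-block call, instead of flushing one block
and then writing another.  A sketch of the padding logic (an
illustration under the same layout, not the libgcrypt code; the helper
name is hypothetical):

#include <stdint.h>
#include <string.h>

/* Write the pad byte, zero padding and big-endian bit count into a
 * 128-byte buffer that already holds 'count' (0..63) message bytes,
 * and return how many 64-byte blocks (1 or 2) remain to be processed
 * with a single bwrite call. */
static unsigned int
sha1_pad_final (unsigned char buf[128], unsigned int count, uint64_t nbits)
{
  unsigned int blocks = (count < 56) ? 1 : 2;  /* room left for the count? */
  unsigned int len_at = blocks * 64 - 8;
  unsigned int i;

  buf[count++] = 0x80;
  memset (buf + count, 0, len_at - count);
  for (i = 0; i < 8; i++)
    buf[len_at + i] = (unsigned char)(nbits >> (56 - 8 * i));

  return blocks;
}
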
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 1e67771e5..3f00ed4a8 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -103,8 +103,8 @@ EXTRA_libcipher_la_SOURCES = \
serpent.c serpent-sse2-amd64.S \
serpent-avx2-amd64.S serpent-armv7-neon.S \
sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \
- sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \
- sha1-intel-shaext.c \
+ sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \
+ sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \
sha256-avx2-bmi2-amd64.S \
sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \
diff --git a/cipher/hash-common.h b/cipher/hash-common.h
index 23f81ed71..0b3ade11e 100644
--- a/cipher/hash-common.h
+++ b/cipher/hash-common.h
@@ -42,9 +42,12 @@ typedef unsigned int (*_gcry_md_block_write_t) (void *c,
# define MD_NBLOCKS_TYPE u32
#endif
+/* SHA1 needs 2x64 bytes and SHA-512 needs 128 bytes. */
+#define MD_BLOCK_CTX_BUFFER_SIZE 128
+
typedef struct gcry_md_block_ctx
{
- byte buf[MD_BLOCK_MAX_BLOCKSIZE];
+ byte buf[MD_BLOCK_CTX_BUFFER_SIZE];
MD_NBLOCKS_TYPE nblocks;
MD_NBLOCKS_TYPE nblocks_high;
int count;
diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S
index 5f5b9c0e4..143e4066d 100644
--- a/cipher/sha1-avx-amd64.S
+++ b/cipher/sha1-avx-amd64.S
@@ -421,6 +421,8 @@ _gcry_sha1_transform_amd64_avx:
.Lret:
ret;
+ELF(.size _gcry_sha1_transform_amd64_avx,
+ .-_gcry_sha1_transform_amd64_avx;)
#endif
#endif
diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S
index 8292c3afb..79ea24ef9 100644
--- a/cipher/sha1-avx-bmi2-amd64.S
+++ b/cipher/sha1-avx-bmi2-amd64.S
@@ -60,20 +60,15 @@
/* Constants */
.text
-#define K1 0x5A827999
-#define K2 0x6ED9EBA1
-#define K3 0x8F1BBCDC
-#define K4 0xCA62C1D6
.align 16
-.LK_XMM:
-.LK1: .long K1, K1, K1, K1
-.LK2: .long K2, K2, K2, K2
-.LK3: .long K3, K3, K3, K3
-.LK4: .long K4, K4, K4, K4
-
.Lbswap_shufb_ctl:
.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+.LK1: .long 0x5A827999
+.LK2: .long 0x6ED9EBA1
+.LK3: .long 0x8F1BBCDC
+.LK4: .long 0xCA62C1D6
+
/* Register macros */
@@ -82,14 +77,15 @@
#define ROLDSTACK %r10
#define RNBLKS %r11
-#define a %eax
-#define b %ebx
-#define c %ecx
+#define a %esi
+#define b %edi
+#define c %ebp
#define d %edx
-#define e %edi
+#define e %ecx
+#define ne %ebx
-#define RT0 %esi
-#define RT1 %ebp
+#define RT0 %eax
+#define RT1 %r12d
#define Wtmp0 %xmm0
#define Wtmp1 %xmm1
@@ -105,6 +101,11 @@
#define BSWAP_REG %xmm10
+#define K1 %xmm11
+#define K2 %xmm12
+#define K3 %xmm13
+#define K4 %xmm14
+
/* Round function macros. */
@@ -117,9 +118,9 @@
andl b, RT0; \
rorxl $2, b, b; \
addl RT1, e; \
- leal (RT0,e), e; \
- rorxl $27, a, RT1; \
- addl RT1, e;
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
#define R_F2(a,b,c,d,e,i) \
movl c, RT0; \
@@ -127,22 +128,22 @@
xorl b, RT0; \
rorxl $2, b, b; \
xorl d, RT0; \
- leal (RT0,e), e; \
- rorxl $27, a, RT1; \
- addl RT1, e;
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
#define R_F3(a,b,c,d,e,i) \
movl c, RT0; \
movl b, RT1; \
+ addl WK(i), e; \
xorl b, RT0; \
andl c, RT1; \
andl d, RT0; \
addl RT1, e; \
- addl WK(i), e; \
rorxl $2, b, b; \
- leal (RT0,e), e; \
- rorxl $27, a, RT1; \
- addl RT1, e;
+ addl ne, a; \
+ leal (RT0,e), ne; \
+ rorxl $27, a, e;
#define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i)
@@ -158,8 +159,8 @@
#define W_PRECALC_00_15_1(i, W, tmp0) \
vpshufb BSWAP_REG, tmp0, W;
-#define W_PRECALC_00_15_2(i, W, tmp0) \
- vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0;
+#define W_PRECALC_00_15_2(i, W, tmp0, K) \
+ vpaddd K, W, tmp0;
#define W_PRECALC_00_15_3(i, W, tmp0) \
vmovdqa tmp0, WK(i&~3);
@@ -181,10 +182,10 @@
vpsrld $30, tmp1, W; \
vpslld $2, tmp1, tmp1;
-#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \
vpxor W, tmp0, tmp0; \
vpxor tmp1, tmp0, W; \
- vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \
+ vpaddd K, W, tmp0; \
vmovdqa tmp0, WK((i)&~3);
#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
@@ -199,9 +200,9 @@
vpsrld $30, W, tmp0; \
vpslld $2, W, W;
-#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \
vpor W, tmp0, W; \
- vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \
+ vpaddd K, W, tmp0; \
vmovdqa tmp0, WK((i)&~3);
@@ -233,6 +234,7 @@ _gcry_sha1_transform_amd64_avx_bmi2:
movq %rsi, RDATA;
pushq %rbx;
pushq %rbp;
+ pushq %r12;
movq %rsp, ROLDSTACK;
@@ -245,25 +247,30 @@ _gcry_sha1_transform_amd64_avx_bmi2:
movl state_h2(RSTATE), c;
movl state_h3(RSTATE), d;
movl state_h4(RSTATE), e;
+ xorl ne, ne;
vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG;
+ vpbroadcastd .LK1 RIP, K1;
+ vpbroadcastd .LK2 RIP, K2;
+ vpbroadcastd .LK3 RIP, K3;
+ vpbroadcastd .LK4 RIP, K4;
/* Precalc 0-15. */
W_PRECALC_00_15_0(0, W0, Wtmp0);
W_PRECALC_00_15_1(1, W0, Wtmp0);
- W_PRECALC_00_15_2(2, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
W_PRECALC_00_15_3(3, W0, Wtmp0);
W_PRECALC_00_15_0(4, W7, Wtmp0);
W_PRECALC_00_15_1(5, W7, Wtmp0);
- W_PRECALC_00_15_2(6, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
W_PRECALC_00_15_3(7, W7, Wtmp0);
W_PRECALC_00_15_0(8, W6, Wtmp0);
W_PRECALC_00_15_1(9, W6, Wtmp0);
- W_PRECALC_00_15_2(10, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
W_PRECALC_00_15_3(11, W6, Wtmp0);
W_PRECALC_00_15_0(12, W5, Wtmp0);
W_PRECALC_00_15_1(13, W5, Wtmp0);
- W_PRECALC_00_15_2(14, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
W_PRECALC_00_15_3(15, W5, Wtmp0);
.align 8
@@ -274,69 +281,69 @@ _gcry_sha1_transform_amd64_avx_bmi2:
R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
- R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
- R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
- R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
- R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
/* Transform 16-63 + Precalc 32-79. */
R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2);
R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2);
R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3);
R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3);
R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
- R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3);
R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
- R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3);
R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
- R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3);
R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
- R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4);
R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
- R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4);
R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
- R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4);
R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
- R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4);
R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
- R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4);
decq RNBLKS;
jz .Lend;
@@ -344,21 +351,23 @@ _gcry_sha1_transform_amd64_avx_bmi2:
/* Transform 64-79 + Precalc 0-15 of next block. */
R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
- R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0);
+ R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
- R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0);
+ R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
- R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0);
+ R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
R( c, d, e, a, b, F4, 78 );
- addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0);
+ addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+ addl ne, a;
+ xorl ne, ne;
/* Update the chaining variables. */
addl state_h3(RSTATE), d;
@@ -396,6 +405,8 @@ _gcry_sha1_transform_amd64_avx_bmi2:
R( c, d, e, a, b, F4, 78 );
addl state_h0(RSTATE), a;
R( b, c, d, e, a, F4, 79 );
+ addl ne, a;
+ xorl ne, ne;
/* Update the chaining variables. */
addl state_h3(RSTATE), d;
@@ -411,14 +422,17 @@ _gcry_sha1_transform_amd64_avx_bmi2:
movq ROLDSTACK, %rsp;
+ popq %r12;
popq %rbp;
popq %rbx;
/* burn_stack */
- movl $(16*4 + 2*8 + 31), %eax;
+ movl $(16*4 + 3*8 + 31), %eax;
.Lret:
ret;
+ELF(.size _gcry_sha1_transform_amd64_avx_bmi2,
+ .-_gcry_sha1_transform_amd64_avx_bmi2;)
#endif
#endif
diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S
new file mode 100644
index 000000000..c666290f2
--- /dev/null
+++ b/cipher/sha1-avx2-bmi2-amd64.S
@@ -0,0 +1,570 @@
+/* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function
+ * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \
+ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \
+ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \
+ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1)
+
+#ifdef __PIC__
+# define RIP (%rip)
+#else
+# define RIP
+#endif
+
+
+#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS
+# define ELF(...) __VA_ARGS__
+#else
+# define ELF(...) /*_*/
+#endif
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define WK_STACK_WORDS (80 * 2)
+
+.text
+.align 16
+.Lbswap_shufb_ctl:
+ .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
+
+.LK1: .long 0x5A827999
+.LK2: .long 0x6ED9EBA1
+.LK3: .long 0x8F1BBCDC
+.LK4: .long 0xCA62C1D6
+
+
+/* Register macros */
+
+#define RSTATE %r8
+#define RDATA %r9
+#define ROLDSTACK %r10
+#define RNBLKS %r11
+
+#define a %eax
+#define b %ebx
+#define c %ecx
+#define d %edx
+#define e %edi
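+/* 'ne' holds the ROL(a,5) term of the current round; it is added to that
+ * round's e only at the start of the next round, once that value has
+ * rotated into the 'a' slot.  It must be zero before round 0, and the
+ * value still pending after round 79 is folded in with "addl ne, a". */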
+#define ne %r12d
+
+#define RT0 %esi
+#define RT1 %ebp
+
+#define Wtmp0 %ymm0
+#define Wtmp1 %ymm1
+#define Wtmp0x %xmm0
+#define Wtmp1x %xmm1
+
+#define W0 %ymm2
+#define W1 %ymm3
+#define W2 %ymm4
+#define W3 %ymm5
+#define W4 %ymm6
+#define W5 %ymm7
+#define W6 %ymm8
+#define W7 %ymm9
+
+#define BSWAP_REG %ymm10
+
+#define K1 %ymm11
+#define K2 %ymm12
+#define K3 %ymm13
+#define K4 %ymm14
+
+
+/* Round function macros. */
+
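+/* The W+K values of a block pair are interleaved on the stack: each
+ * 32-byte store issued by the precalc macros (PRE_WK) carries the words
+ * for four rounds of block 1 in its low 128-bit lane and the matching
+ * words of block 2 in its high lane; WK(i,block) addresses one 32-bit
+ * word within that layout. */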
+#define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp)
+#define PRE_WK(i) ((i) * 4 * 2)(%rsp)
+
+#define R_F1(a,b,c,d,e,i,block) \
+ movl c, RT0; \
+ andn d, b, RT1; \
+ addl WK(i,block), e; \
+ andl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ addl RT1, e; \
+ rorxl $27, a, ne; \
+ addl RT0, e;
+
+#define R_F2(a,b,c,d,e,i,block) \
+ addl WK(i,block), e; \
+ movl c, RT0; \
+ xorl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ xorl d, RT0; \
+ addl RT0, e; \
+ rorxl $27, a, ne;
+
+#define R_F3(a,b,c,d,e,i,block) \
+ movl c, RT0; \
+ addl WK(i,block), e; \
+ movl b, RT1; \
+ xorl b, RT0; \
+ leal (a,ne), a; \
+ rorxl $2, b, b; \
+ andl c, RT1; \
+ addl RT1, e; \
+ andl d, RT0; \
+ rorxl $27, a, ne; \
+ addl RT0, e;
+
+#define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block)
+
+#define R(a,b,c,d,e,f,i,block) \
+ R_##f(a,b,c,d,e,i,block)
+
+
+/* Input expansion macros. */
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ vmovdqu (4*(i))(RDATA), tmp0##x; \
+ vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0;
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ vpshufb BSWAP_REG, tmp0, W;
+
+#define W_PRECALC_00_15_2(i, W, tmp0, K) \
+ vpaddd K, W, tmp0;
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpalignr $8, W_m16, W_m12, W; \
+ vpsrldq $4, W_m04, tmp0; \
+ vpxor W_m08, W, W;
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpxor W_m16, tmp0, tmp0; \
+ vpxor tmp0, W, W; \
+ vpslld $1, W, tmp0; \
+ vpslldq $12, W, tmp1; \
+ vpsrld $31, W, W;
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ vpor W, tmp0, tmp0; \
+ vpsrld $30, tmp1, W; \
+ vpslld $2, tmp1, tmp1;
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \
+ vpxor W, tmp0, tmp0; \
+ vpxor tmp1, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m28, W, W; \
+ vpalignr $8, W_m08, W_m04, tmp0;
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpxor W_m16, W, W; \
+ vpxor tmp0, W, W;
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ vpsrld $30, W, tmp0; \
+ vpslld $2, W, W;
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \
+ vpor W, tmp0, W; \
+ vpaddd K, W, tmp0; \
+ vmovdqa tmp0, PRE_WK((i)&~3);
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data,
+ * size_t nblks)
+ */
+.globl _gcry_sha1_transform_amd64_avx2_bmi2
+ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2,@function)
+.align 16
+_gcry_sha1_transform_amd64_avx2_bmi2:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: data (64*nblks bytes)
+ * %rdx: nblks (multiple of 2, larger than 0)
+ */
+
+ vzeroupper;
+
+ movq %rdx, RNBLKS;
+ movq %rdi, RSTATE;
+ movq %rsi, RDATA;
+ pushq %rbx;
+ pushq %rbp;
+ pushq %r12;
+
+ movq %rsp, ROLDSTACK;
+
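+	/* Reserve space for the interleaved W+K values of a block pair and
+	 * align it to 64 bytes so the 32-byte vmovdqa stores issued by the
+	 * precalc macros are always aligned. */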
+ subq $(WK_STACK_WORDS*4), %rsp;
+ andq $(~63), %rsp;
+
+ /* Get the values of the chaining variables. */
+ movl state_h0(RSTATE), a;
+ movl state_h1(RSTATE), b;
+ movl state_h2(RSTATE), c;
+ movl state_h3(RSTATE), d;
+ movl state_h4(RSTATE), e;
+ xorl ne, ne;
+
+ vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG;
+ vpbroadcastd .LK1 RIP, K1;
+ vpbroadcastd .LK2 RIP, K2;
+ vpbroadcastd .LK3 RIP, K3;
+ vpbroadcastd .LK4 RIP, K4;
+
+ /* Precalc 0-31 for block 1 & 2. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+ W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
+ W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
+ W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
+ W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
+
+.align 8
+.Loop:
+ addq $(2 * 64), RDATA;
+
+ /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. */
+ R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2);
+ R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2);
+ R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3);
+ R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3);
+
+ /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. */
+ R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3);
+ R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3);
+ R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3);
+ R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4);
+ R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4);
+ R( e, a, b, c, d, F2, 36, 0 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4);
+ R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4);
+ R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4);
+
+ /* Transform 48-79 for block 1. */
+ R( c, d, e, a, b, F3, 48, 0 );
+ R( b, c, d, e, a, F3, 49, 0 );
+ R( a, b, c, d, e, F3, 50, 0 );
+ R( e, a, b, c, d, F3, 51, 0 );
+ R( d, e, a, b, c, F3, 52, 0 );
+ R( c, d, e, a, b, F3, 53, 0 );
+ R( b, c, d, e, a, F3, 54, 0 );
+ R( a, b, c, d, e, F3, 55, 0 );
+ R( e, a, b, c, d, F3, 56, 0 );
+ R( d, e, a, b, c, F3, 57, 0 );
+ R( c, d, e, a, b, F3, 58, 0 );
+ R( b, c, d, e, a, F3, 59, 0 );
+ R( a, b, c, d, e, F4, 60, 0 );
+ R( e, a, b, c, d, F4, 61, 0 );
+ R( d, e, a, b, c, F4, 62, 0 );
+ R( c, d, e, a, b, F4, 63, 0 );
+ R( b, c, d, e, a, F4, 64, 0 );
+ R( a, b, c, d, e, F4, 65, 0 );
+ R( e, a, b, c, d, F4, 66, 0 );
+ R( d, e, a, b, c, F4, 67, 0 );
+ R( c, d, e, a, b, F4, 68, 0 );
+ R( b, c, d, e, a, F4, 69, 0 );
+ R( a, b, c, d, e, F4, 70, 0 );
+ R( e, a, b, c, d, F4, 71, 0 );
+ R( d, e, a, b, c, F4, 72, 0 );
+ R( c, d, e, a, b, F4, 73, 0 );
+ R( b, c, d, e, a, F4, 74, 0 );
+ R( a, b, c, d, e, F4, 75, 0 );
+ R( e, a, b, c, d, F4, 76, 0 );
+ R( d, e, a, b, c, F4, 77, 0 );
+ R( c, d, e, a, b, F4, 78, 0 );
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79, 0 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ /* Transform 0-47 for block 2. */
+ R( a, b, c, d, e, F1, 0, 1 );
+ R( e, a, b, c, d, F1, 1, 1 );
+ R( d, e, a, b, c, F1, 2, 1 );
+ R( c, d, e, a, b, F1, 3, 1 );
+ R( b, c, d, e, a, F1, 4, 1 );
+ R( a, b, c, d, e, F1, 5, 1 );
+ R( e, a, b, c, d, F1, 6, 1 );
+ R( d, e, a, b, c, F1, 7, 1 );
+ R( c, d, e, a, b, F1, 8, 1 );
+ R( b, c, d, e, a, F1, 9, 1 );
+ R( a, b, c, d, e, F1, 10, 1 );
+ R( e, a, b, c, d, F1, 11, 1 );
+ R( d, e, a, b, c, F1, 12, 1 );
+ R( c, d, e, a, b, F1, 13, 1 );
+ R( b, c, d, e, a, F1, 14, 1 );
+ R( a, b, c, d, e, F1, 15, 1 );
+ R( e, a, b, c, d, F1, 16, 1 );
+ R( d, e, a, b, c, F1, 17, 1 );
+ R( c, d, e, a, b, F1, 18, 1 );
+ R( b, c, d, e, a, F1, 19, 1 );
+ R( a, b, c, d, e, F2, 20, 1 );
+ R( e, a, b, c, d, F2, 21, 1 );
+ R( d, e, a, b, c, F2, 22, 1 );
+ R( c, d, e, a, b, F2, 23, 1 );
+ R( b, c, d, e, a, F2, 24, 1 );
+ R( a, b, c, d, e, F2, 25, 1 );
+ R( e, a, b, c, d, F2, 26, 1 );
+ R( d, e, a, b, c, F2, 27, 1 );
+ R( c, d, e, a, b, F2, 28, 1 );
+ R( b, c, d, e, a, F2, 29, 1 );
+ R( a, b, c, d, e, F2, 30, 1 );
+ R( e, a, b, c, d, F2, 31, 1 );
+ R( d, e, a, b, c, F2, 32, 1 );
+ R( c, d, e, a, b, F2, 33, 1 );
+ R( b, c, d, e, a, F2, 34, 1 );
+ R( a, b, c, d, e, F2, 35, 1 );
+ R( e, a, b, c, d, F2, 36, 1 );
+ R( d, e, a, b, c, F2, 37, 1 );
+ R( c, d, e, a, b, F2, 38, 1 );
+ R( b, c, d, e, a, F2, 39, 1 );
+ R( a, b, c, d, e, F3, 40, 1 );
+ R( e, a, b, c, d, F3, 41, 1 );
+ R( d, e, a, b, c, F3, 42, 1 );
+ R( c, d, e, a, b, F3, 43, 1 );
+ R( b, c, d, e, a, F3, 44, 1 );
+ R( a, b, c, d, e, F3, 45, 1 );
+ R( e, a, b, c, d, F3, 46, 1 );
+ R( d, e, a, b, c, F3, 47, 1 );
+
+ addq $-2, RNBLKS;
+ jz .Lend;
+
+ /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. */
+ R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0);
+ R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0);
+ R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1);
+ R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0);
+ R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0);
+ R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0);
+ R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1);
+ R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0);
+ R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0);
+ R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0);
+ R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1);
+ R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0);
+ R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0);
+ R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0);
+ R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1);
+ R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0);
+ R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1);
+ R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2);
+ R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F4, 74, 1 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2);
+ R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2);
+ R( b, c, d, e, a, F4, 79, 1 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ jmp .Loop;
+
+.align 16
+.Lend:
+ vzeroall;
+
+ /* Transform 48-79 for block 2. */
+ R( c, d, e, a, b, F3, 48, 1 );
+ R( b, c, d, e, a, F3, 49, 1 );
+ R( a, b, c, d, e, F3, 50, 1 );
+ R( e, a, b, c, d, F3, 51, 1 );
+ R( d, e, a, b, c, F3, 52, 1 );
+ R( c, d, e, a, b, F3, 53, 1 );
+ R( b, c, d, e, a, F3, 54, 1 );
+ R( a, b, c, d, e, F3, 55, 1 );
+ R( e, a, b, c, d, F3, 56, 1 );
+ R( d, e, a, b, c, F3, 57, 1 );
+ R( c, d, e, a, b, F3, 58, 1 );
+ R( b, c, d, e, a, F3, 59, 1 );
+ R( a, b, c, d, e, F4, 60, 1 );
+ R( e, a, b, c, d, F4, 61, 1 );
+ R( d, e, a, b, c, F4, 62, 1 );
+ R( c, d, e, a, b, F4, 63, 1 );
+ R( b, c, d, e, a, F4, 64, 1 );
+ R( a, b, c, d, e, F4, 65, 1 );
+ R( e, a, b, c, d, F4, 66, 1 );
+ R( d, e, a, b, c, F4, 67, 1 );
+ R( c, d, e, a, b, F4, 68, 1 );
+ R( b, c, d, e, a, F4, 69, 1 );
+ R( a, b, c, d, e, F4, 70, 1 );
+ R( e, a, b, c, d, F4, 71, 1 );
+ R( d, e, a, b, c, F4, 72, 1 );
+ R( c, d, e, a, b, F4, 73, 1 );
+ R( b, c, d, e, a, F4, 74, 1 );
+ R( a, b, c, d, e, F4, 75, 1 );
+ R( e, a, b, c, d, F4, 76, 1 );
+ R( d, e, a, b, c, F4, 77, 1 );
+ R( c, d, e, a, b, F4, 78, 1 );
+ addl state_h0(RSTATE), a;
+ R( b, c, d, e, a, F4, 79, 1 );
+ addl ne, a;
+ xorl ne, ne;
+
+ /* Update the chaining variables. */
+ addl state_h3(RSTATE), d;
+ addl state_h2(RSTATE), c;
+ addl state_h1(RSTATE), b;
+ addl state_h4(RSTATE), e;
+
+ movl d, state_h3(RSTATE);
+ movl c, state_h2(RSTATE);
+ movl b, state_h1(RSTATE);
+ movl a, state_h0(RSTATE);
+ movl e, state_h4(RSTATE);
+
+ movq ROLDSTACK, %rsp;
+
+ popq %r12;
+ popq %rbp;
+ popq %rbx;
+
+ /* burn_stack */
+ movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax;
+
+ ret;
+ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2,
+ .-_gcry_sha1_transform_amd64_avx2_bmi2;)
+
+#endif
+#endif
diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S
index 2b4394765..421bebeca 100644
--- a/cipher/sha1-ssse3-amd64.S
+++ b/cipher/sha1-ssse3-amd64.S
@@ -429,6 +429,8 @@ _gcry_sha1_transform_amd64_ssse3:
.Lret:
ret;
+ELF(.size _gcry_sha1_transform_amd64_ssse3,
+ .-_gcry_sha1_transform_amd64_ssse3;)
#endif
#endif
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 76c486c7e..affabfb07 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -68,6 +68,12 @@
# define USE_BMI2 1
#endif
+/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */
+#undef USE_AVX2
+#if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2)
+# define USE_AVX2 1
+#endif
+
/* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. */
#undef USE_SHAEXT
#if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \
@@ -171,7 +177,37 @@ do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data,
return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks)
+ ASM_EXTRA_STACK;
}
-#endif
+
+#ifdef USE_AVX2
+unsigned int
+_gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data,
+ size_t nblks) ASM_FUNC_ABI;
+
+static unsigned int
+do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data,
+ size_t nblks)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+ /* The AVX2/BMI2 function only handles pairs of blocks, so nblks needs to
+  * be a multiple of 2, and it does not handle zero nblks. Use the AVX/BMI2
+  * code to handle these cases. */
+
+ if (nblks <= 1)
+ return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks);
+
+ if (nblks & 1)
+ {
+ (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1);
+ nblks--;
+ data += 64;
+ }
+
+ return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks)
+ + ASM_EXTRA_STACK;
+}
+#endif /* USE_AVX2 */
+#endif /* USE_BMI2 */
#ifdef USE_SHAEXT
/* Does not need ASM_FUNC_ABI */
@@ -258,6 +294,11 @@ sha1_init (void *context, unsigned int flags)
if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2))
hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2;
#endif
+#ifdef USE_AVX2
+ if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) &&
+ (features & HWF_INTEL_BMI2))
+ hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2;
+#endif
#ifdef USE_SHAEXT
if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1))
hd->bctx.bwrite = do_sha1_transform_intel_shaext;
@@ -494,22 +535,27 @@ sha1_final(void *context)
if( hd->bctx.count < 56 ) /* enough room */
{
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */
- while( hd->bctx.count < 56 )
- hd->bctx.buf[hd->bctx.count++] = 0; /* pad */
+ if (hd->bctx.count < 56)
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count);
+ hd->bctx.count = 56;
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 56, msb);
+ buf_put_be32(hd->bctx.buf + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 );
}
else /* need one extra block */
{
hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */
- while( hd->bctx.count < 64 )
- hd->bctx.buf[hd->bctx.count++] = 0;
- _gcry_md_block_write(hd, NULL, 0); /* flush */;
- memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
+ /* fill pad and next block with zeroes */
+ memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
+ hd->bctx.count = 64 + 56;
+
+ /* append the 64 bit count */
+ buf_put_be32(hd->bctx.buf + 64 + 56, msb);
+ buf_put_be32(hd->bctx.buf + 64 + 60, lsb);
+ burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 );
}
- /* append the 64 bit count */
- buf_put_be32(hd->bctx.buf + 56, msb);
- buf_put_be32(hd->bctx.buf + 60, lsb);
- burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 );
- _gcry_burn_stack (burn);
p = hd->bctx.buf;
#define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0)
@@ -520,6 +566,7 @@ sha1_final(void *context)
X(4);
#undef X
+ _gcry_burn_stack (burn);
}
static unsigned char *
diff --git a/configure.ac b/configure.ac
index bb3c666f4..0a931f952 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2541,6 +2541,7 @@ case "${host}" in
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo"
GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo"
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo"
;;
arm*-*-*)
# Build with the assembly implementation