[PATCH] SHA-1: Add SSSE3 implementation
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Dec 13 15:10:32 CET 2013
* cipher/Makefile.am: Add 'sha1-ssse3-amd64.c'.
* cipher/sha1-ssse3-amd64.c: New.
* cipher/sha1.c (USE_SSSE3): New.
(SHA1_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'.
(sha1_init) [USE_SSSE3]: Initialize 'use_ssse3'.
(transform): Rename to...
(_transform): this.
(transform): New.
* configure.ac [host=x86_64]: Add 'sha1-ssse3-amd64.lo'.
--
This patch adds an SSSE3 implementation based on the white paper "Improving the
Performance of the Secure Hash Algorithm (SHA-1)":
http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
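For reference, here is a minimal scalar sketch (not part of the patch) of the
SHA-1 message schedule that the W_PRECALC_* macros in the new file compute four
words at a time in XMM registers; the vector code additionally folds the round
constant into the stored value, so the round macro R() only has to add wk[]:

  #include <stdint.h>

  static inline uint32_t rol32 (uint32_t x, int n)
  {
    return (x << n) | (x >> (32 - n));
  }

  /* w[0..15] hold the big-endian-converted 64-byte input block; the
     remaining schedule words follow the FIPS 180 recurrence that the
     SSSE3 precalc macros evaluate in groups of four.  */
  static void sha1_schedule_scalar (uint32_t w[80])
  {
    int i;
    for (i = 16; i < 80; i++)
      w[i] = rol32 (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
  }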
Benchmarks:
 CPU                  Old        New        Diff
 Intel i5-4570         9.02 c/B   5.22 c/B  1.72x
 Intel i5-2450M       12.27 c/B   7.24 c/B  1.69x
 Intel Core2 T8100     7.94 c/B   6.76 c/B  1.17x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/Makefile.am | 2
cipher/sha1-ssse3-amd64.c | 319 +++++++++++++++++++++++++++++++++++++++++++++
cipher/sha1.c | 39 +++++-
configure.ac | 7 +
4 files changed, 365 insertions(+), 2 deletions(-)
create mode 100644 cipher/sha1-ssse3-amd64.c
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 7c85af2..0477772 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
scrypt.c \
seed.c \
serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
-sha1.c \
+sha1.c sha1-ssse3-amd64.c \
sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \
stribog.c \
diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c
new file mode 100644
index 0000000..1342235
--- /dev/null
+++ b/cipher/sha1-ssse3-amd64.c
@@ -0,0 +1,319 @@
+/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * Based on sha1.c:
+ * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ * "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
+
+#ifdef HAVE_STDINT_H
+# include <stdint.h> /* uintptr_t */
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#else
+/* In this case, uintptr_t is provided by config.h. */
+#endif
+
+#include "bithelp.h"
+
+
+/* Helper macro to force alignment to 16 bytes. */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16 __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+
+typedef struct
+{
+ u32 h0,h1,h2,h3,h4;
+} SHA1_STATE;
+
+
+/* Round function macros. */
+#define K1 0x5A827999L
+#define K2 0x6ED9EBA1L
+#define K3 0x8F1BBCDCL
+#define K4 0xCA62C1D6L
+#define F1(x,y,z) ( z ^ ( x & ( y ^ z ) ) )
+#define F2(x,y,z) ( x ^ y ^ z )
+#define F3(x,y,z) ( ( x & y ) | ( z & ( x | y ) ) )
+#define F4(x,y,z) ( x ^ y ^ z )
+#define R(a,b,c,d,e,f,wk) do { e += rol( a, 5 ) \
+ + f( b, c, d ) \
+ + wk; \
+ b = rol( b, 30 ); \
+ } while(0)
+
+#define WK(i) (wk[i & 15])
+
+
+static const u32 K_XMM[4][4] ATTR_ALIGNED_16 =
+ {
+ { K1, K1, K1, K1 },
+ { K2, K2, K2, K2 },
+ { K3, K3, K3, K3 },
+ { K4, K4, K4, K4 },
+ };
+static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 =
+ { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
+
+
+/*
+ * Transform 64 bytes (16 32-bit words) at DATA.
+ */
+unsigned int
+_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
+{
+ SHA1_STATE *state = ctx;
+ register u32 a, b, c, d, e; /* Local copies of the chaining variables. */
+ byte wk_unaligned[4*16+15]; /* The array we work on. */
+ u32 *wk = (u32 *)(wk_unaligned
+ + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15));
+
+ /* Get the values of the chaining variables. */
+ a = state->h0;
+ b = state->h1;
+ c = state->h2;
+ d = state->h3;
+ e = state->h4;
+
+#define Wtmp0 "xmm0"
+#define Wtmp1 "xmm1"
+
+#define W0 "xmm2"
+#define W1 "xmm3"
+#define W2 "xmm4"
+#define W3 "xmm5"
+#define W4 "xmm6"
+#define W5 "xmm7"
+#define W6 "xmm8"
+#define W7 "xmm9"
+
+#define BSWAP_REG "xmm10"
+
+ __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t"
+ :: [bswap] "m" (bswap_shufb_ctl[0]));
+
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+ __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \
+ ::[data] "m" (*(data+4*(i))));
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+ __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \
+ "movdqa %%"tmp0", %%"W";\n\t" \
+ ::: "cc");
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+ __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \
+ ::[k_xmm] "m" (K_XMM[i / 20][0]));
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+ __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \
+ :[wk] "=m" (WK(i&~3)));
+
+ /* Precalc 0-15. */
+ W_PRECALC_00_15_0(0, W0, Wtmp0);
+ W_PRECALC_00_15_1(1, W0, Wtmp0);
+ W_PRECALC_00_15_2(2, W0, Wtmp0);
+ W_PRECALC_00_15_3(3, W0, Wtmp0);
+ W_PRECALC_00_15_0(4, W7, Wtmp0);
+ W_PRECALC_00_15_1(5, W7, Wtmp0);
+ W_PRECALC_00_15_2(6, W7, Wtmp0);
+ W_PRECALC_00_15_3(7, W7, Wtmp0);
+ W_PRECALC_00_15_0(8, W6, Wtmp0);
+ W_PRECALC_00_15_1(9, W6, Wtmp0);
+ W_PRECALC_00_15_2(10, W6, Wtmp0);
+ W_PRECALC_00_15_3(11, W6, Wtmp0);
+ W_PRECALC_00_15_0(12, W5, Wtmp0);
+ W_PRECALC_00_15_1(13, W5, Wtmp0);
+ W_PRECALC_00_15_2(14, W5, Wtmp0);
+ W_PRECALC_00_15_3(15, W5, Wtmp0);
+
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \
+ "palignr $8, %%"W_m16", %%"W";\n\t" \
+ "movdqa %%"W_m04", %%"tmp0";\n\t" \
+ "psrldq $4, %%"tmp0";\n\t" \
+ "pxor %%"W_m08", %%"W";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \
+ "pxor %%"tmp0", %%"W";\n\t" \
+ "movdqa %%"W", %%"tmp1";\n\t" \
+ "movdqa %%"W", %%"tmp0";\n\t" \
+ "pslldq $12, %%"tmp1";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ __asm__ volatile ("psrld $31, %%"W";\n\t" \
+ "pslld $1, %%"tmp0";\n\t" \
+ "por %%"W", %%"tmp0";\n\t" \
+ "movdqa %%"tmp1", %%"W";\n\t" \
+ "psrld $30, %%"tmp1";\n\t" \
+ "pslld $2, %%"W";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+ __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \
+ "pxor %%"tmp1", %%"tmp0";\n\t" \
+ "movdqa %%"tmp0", %%"W";\n\t" \
+ "paddd %[k_xmm], %%"tmp0";\n\t" \
+ "movdqa %%"tmp0", %[wk];\n\t" \
+ : [wk] "=m" (WK(i&~3)) \
+ : [k_xmm] "m" (K_XMM[i / 20][0]));
+
+ /* Transform 0-15 + Precalc 16-31. */
+ R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+ R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+ R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \
+ "pxor %%"W_m28", %%"W";\n\t" \
+ "palignr $8, %%"W_m08", %%"tmp0";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \
+ "pxor %%"tmp0", %%"W";\n\t" \
+ "movdqa %%"W", %%"tmp0";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ __asm__ volatile ("psrld $30, %%"W";\n\t" \
+ "pslld $2, %%"tmp0";\n\t" \
+ "por %%"W", %%"tmp0";\n\t" \
+ :::"cc");
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+ __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \
+ "paddd %[k_xmm], %%"tmp0";\n\t" \
+ "movdqa %%"tmp0", %[wk];\n\t" \
+ : [wk] "=m" (WK(i&~3)) \
+ : [k_xmm] "m" (K_XMM[i / 20][0]));
+
+ /* Transform 16-63 + Precalc 32-79. */
+ R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+ R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+ R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+ R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+ R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+ R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+ R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+ R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+ R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc");
+
+ /* Transform 64-79 + Clear XMM registers. */
+ R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG);
+ R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0);
+ R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1);
+ R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0);
+ R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1);
+ R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2);
+ R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3);
+ R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4);
+ R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5);
+ R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6);
+ R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7);
+ R( a, b, c, d, e, F4, WK(75) );
+ R( e, a, b, c, d, F4, WK(76) );
+ R( d, e, a, b, c, F4, WK(77) );
+ R( c, d, e, a, b, F4, WK(78) );
+ R( b, c, d, e, a, F4, WK(79) );
+
+ /* Update the chaining variables. */
+ state->h0 += a;
+ state->h1 += b;
+ state->h2 += c;
+ state->h3 += d;
+ state->h4 += e;
+
+ return /* burn_stack */ 84+15;
+}
+
+#endif
+#endif
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 025b3ab..af57b19 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -43,6 +43,15 @@
#include "hash-common.h"
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+ defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_SSSE3 1
+#endif
+
+
/* A macro to test whether P is properly aligned for an u32 type.
Note that config.h provides a suitable replacement for uintptr_t if
it does not exist in stdint.h. */
@@ -56,6 +65,9 @@ typedef struct
{
gcry_md_block_ctx_t bctx;
u32 h0,h1,h2,h3,h4;
+#ifdef USE_SSSE3
+ unsigned int use_ssse3:1;
+#endif
} SHA1_CONTEXT;
static unsigned int
@@ -78,6 +90,10 @@ sha1_init (void *context)
hd->bctx.count = 0;
hd->bctx.blocksize = 64;
hd->bctx.bwrite = transform;
+
+#ifdef USE_SSSE3
+ hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+#endif
}
@@ -107,7 +123,7 @@ sha1_init (void *context)
* Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA.
*/
static unsigned int
-transform (void *ctx, const unsigned char *data)
+_transform (void *ctx, const unsigned char *data)
{
SHA1_CONTEXT *hd = ctx;
const u32 *idata = (const void *)data;
@@ -217,6 +233,27 @@ transform (void *ctx, const unsigned char *data)
}
+#ifdef USE_SSSE3
+unsigned int
+_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data);
+#endif
+
+
+static unsigned int
+transform (void *ctx, const unsigned char *data)
+{
+ SHA1_CONTEXT *hd = ctx;
+
+#ifdef USE_SSSE3
+ if (hd->use_ssse3)
+ return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data)
+ + 4 * sizeof(void*);
+#endif
+
+ return _transform (hd, data);
+}
+
+
/* The routine final terminates the computation and
* returns the digest.
* The handle is prepared for a new cycle, but adding bytes to the
diff --git a/configure.ac b/configure.ac
index b91240b..9088d02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1773,6 +1773,13 @@ GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo sha1.lo"
AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
AC_DEFINE(USE_SHA1, 1, [Defined if this module should be included])
+case "${host}" in
+ x86_64-*-*)
+ # Build with the assembly implementation
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
+ ;;
+esac
+
LIST_MEMBER(scrypt, $enabled_kdfs)
if test "$found" = "1" ; then
GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
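Usage note (not part of the patch): callers need no changes, since sha1_init()
enables the SSSE3 transform only when _gcry_get_hw_features() reports
HWF_INTEL_SSSE3. A minimal check through the public API, assuming a libgcrypt
built from this tree:

  #include <stdio.h>
  #include <gcrypt.h>

  int main (void)
  {
    unsigned char digest[20];
    const char msg[] = "abc";
    int i;

    gcry_check_version (NULL);   /* initialize libgcrypt */
    gcry_md_hash_buffer (GCRY_MD_SHA1, digest, msg, sizeof (msg) - 1);
    for (i = 0; i < 20; i++)
      printf ("%02x", digest[i]);
    putchar ('\n');   /* expect a9993e364706816aba3e25717850c26c9cd0d89d */
    return 0;
  }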