[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-440-gbe2238f

by Jussi Kivilinna cvs at cvs.gnupg.org
Fri Dec 13 15:51:09 CET 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  be2238f68abcc6f2b4e8c38ad9141376ce622a22 (commit)
      from  04615cc6803cdede25fa92e3ff697e252a23cd7a (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit be2238f68abcc6f2b4e8c38ad9141376ce622a22
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Fri Dec 13 12:47:56 2013 +0200

    SHA-1: Add SSSE3 implementation
    
    * cipher/Makefile.am: Add 'sha1-ssse3-amd64.c'.
    * cipher/sha1-ssse3-amd64.c: New.
    * cipher/sha1.c (USE_SSSE3): New.
    (SHA1_CONTEXT) [USE_SSSE3]: Add 'use_ssse3'.
    (sha1_init) [USE_SSSE3]: Initialize 'use_ssse3'.
    (transform): Rename to...
    (_transform): this.
    (transform): New.
    * configure.ac [host=x86_64]: Add 'sha1-ssse3-amd64.lo'.
    --
    
    This patch adds an SSSE3 implementation based on the white paper "Improving the
    Performance of the Secure Hash Algorithm (SHA-1)", available at
     http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
    
    Benchmarks:
    
    cpu                Old        New        Diff
    Intel i5-4570      9.02 c/B   5.22 c/B   1.72x
    Intel i5-2450M     12.27 c/B  7.24 c/B   1.69x
    Intel Core2 T8100  7.94 c/B   6.76 c/B   1.17x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 7c85af2..0477772 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -77,7 +77,7 @@ salsa20.c salsa20-amd64.S salsa20-armv7-neon.S \
 scrypt.c \
 seed.c \
 serpent.c serpent-sse2-amd64.S serpent-avx2-amd64.S \
-sha1.c \
+sha1.c sha1-ssse3-amd64.c \
 sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S sha256-avx2-bmi2-amd64.S \
 sha512.c sha512-ssse3-amd64.S sha512-armv7-neon.S \
 stribog.c \
diff --git a/cipher/sha1-ssse3-amd64.c b/cipher/sha1-ssse3-amd64.c
new file mode 100644
index 0000000..1342235
--- /dev/null
+++ b/cipher/sha1-ssse3-amd64.c
@@ -0,0 +1,319 @@
+/* sha1-ssse3-amd64.c - Intel SSSE3 accelerated SHA-1 transform function
+ * Copyright © 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ *
+ * Based on sha1.c:
+ *  Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc.
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+/*
+ * Intel SSSE3 accelerated SHA-1 implementation based on white paper:
+ *  "Improving the Performance of the Secure Hash Algorithm (SHA-1)"
+ *  http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1
+ */
+
+#ifdef __x86_64__
+#include <config.h>
+
+#if defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1)
+
+#ifdef HAVE_STDINT_H
+# include <stdint.h> /* uintptr_t */
+#elif defined(HAVE_INTTYPES_H)
+# include <inttypes.h>
+#else
+/* In this case, uintptr_t is provided by config.h. */
+#endif
+
+#include "bithelp.h"
+
+
+/* Helper macro to force alignment to 16 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_16  __attribute__ ((aligned (16)))
+#else
+# define ATTR_ALIGNED_16
+#endif
+
+/* The five chaining words, in the same order as h0..h4 in SHA1_CONTEXT (sha1.c), which passes &h0 as CTX.  */
+typedef struct
+{
+  u32           h0,h1,h2,h3,h4;
+} SHA1_STATE;
+
+
+/* Round constants K1..K4, boolean functions F1..F4 and the round step R (adds into E, rotates B by 30).  */
+#define K1  0x5A827999L
+#define K2  0x6ED9EBA1L
+#define K3  0x8F1BBCDCL
+#define K4  0xCA62C1D6L
+#define F1(x,y,z)   ( (z) ^ ( (x) & ( (y) ^ (z) ) ) )
+#define F2(x,y,z)   ( (x) ^ (y) ^ (z) )
+#define F3(x,y,z)   ( ( (x) & (y) ) | ( (z) & ( (x) | (y) ) ) )
+#define F4(x,y,z)   ( (x) ^ (y) ^ (z) )
+#define R(a,b,c,d,e,f,wk)  do { e += rol( a, 5 )       \
+                                      + f( b, c, d )   \
+                                      + (wk);          \
+                                 b = rol( b, 30 );     \
+                               } while(0)
+/* WK(i): slot i (mod 16) of the local wk[] ring; the argument is parenthesized so expressions like i&~3 are safe.  */
+#define WK(i) (wk[(i) & 15])
+/* K_XMM: round constants K1..K4, each repeated four times to fill one 16-byte vector.
+   bswap_shufb_ctl: pshufb control mask that reverses the byte order of each 32-bit word.  */
+static const u32 K_XMM[4][4] ATTR_ALIGNED_16 =
+  {
+    { K1, K1, K1, K1 },
+    { K2, K2, K2, K2 },
+    { K3, K3, K3, K3 },
+    { K4, K4, K4, K4 },
+  };
+static const u32 bswap_shufb_ctl[4] ATTR_ALIGNED_16 =
+  { 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f };
+
+
+/* Transform 64 bytes (16 32-bit words) at DATA, updating the five chaining
+ * words (h0..h4) that CTX points to.  Uses SSSE3 instructions (pshufb,
+ * palignr); returns the number of stack bytes the caller should burn.  */
+unsigned int
+_gcry_sha1_transform_amd64_ssse3 (void *ctx, const unsigned char *data)
+{
+  SHA1_STATE *state = ctx;
+  register u32 a, b, c, d, e; /* Local copies of the chaining variables.  */
+  byte wk_unaligned[4*16+15];  /* Room for 16 words plus 15 spare bytes to align wk. */
+  u32 *wk = (u32 *)(wk_unaligned
+                    + ((16 - ((uintptr_t)wk_unaligned & 15)) & 15));
+
+  /* Get the values of the chaining variables. */
+  a = state->h0;
+  b = state->h1;
+  c = state->h2;
+  d = state->h3;
+  e = state->h4;
+  /* XMM register roles: two scratch registers, eight rotating vectors of four W words, and the byte-swap mask.  */
+#define Wtmp0 "xmm0"
+#define Wtmp1 "xmm1"
+
+#define W0 "xmm2"
+#define W1 "xmm3"
+#define W2 "xmm4"
+#define W3 "xmm5"
+#define W4 "xmm6"
+#define W5 "xmm7"
+#define W6 "xmm8"
+#define W7 "xmm9"
+
+#define BSWAP_REG "xmm10"
+
+  __asm__ volatile ("movdqa %[bswap], %%"BSWAP_REG";\n\t"
+                    :: [bswap] "m" (bswap_shufb_ctl[0]));
+  /* W[0..15] in four steps: load 16 input bytes, byte-swap each word, add the round constant, store W+K to wk[].  */
+#define W_PRECALC_00_15_0(i, W, tmp0) \
+  __asm__ volatile ("movdqu %[data], %%"tmp0";\n\t" \
+                    ::[data] "m" (*(data+4*(i))));
+
+#define W_PRECALC_00_15_1(i, W, tmp0) \
+  __asm__ volatile ("pshufb %%"BSWAP_REG", %%"tmp0";\n\t" \
+                    "movdqa %%"tmp0", %%"W";\n\t" \
+                    ::: "cc");
+
+#define W_PRECALC_00_15_2(i, W, tmp0) \
+  __asm__ volatile ("paddd %[k_xmm], %%"tmp0";\n\t" \
+                    ::[k_xmm] "m" (K_XMM[i / 20][0]));
+
+#define W_PRECALC_00_15_3(i, W, tmp0) \
+  __asm__ volatile ("movdqa %%"tmp0", %[wk];\n\t" \
+                    :[wk] "=m" (WK(i&~3)) :: "memory");
+  /* NB: each movdqa store writes four wk[] words but its "=m" operand names only one, hence the "memory" clobber.  */
+  /* Precalc 0-15. */
+  W_PRECALC_00_15_0(0, W0, Wtmp0);
+  W_PRECALC_00_15_1(1, W0, Wtmp0);
+  W_PRECALC_00_15_2(2, W0, Wtmp0);
+  W_PRECALC_00_15_3(3, W0, Wtmp0);
+  W_PRECALC_00_15_0(4, W7, Wtmp0);
+  W_PRECALC_00_15_1(5, W7, Wtmp0);
+  W_PRECALC_00_15_2(6, W7, Wtmp0);
+  W_PRECALC_00_15_3(7, W7, Wtmp0);
+  W_PRECALC_00_15_0(8, W6, Wtmp0);
+  W_PRECALC_00_15_1(9, W6, Wtmp0);
+  W_PRECALC_00_15_2(10, W6, Wtmp0);
+  W_PRECALC_00_15_3(11, W6, Wtmp0);
+  W_PRECALC_00_15_0(12, W5, Wtmp0);
+  W_PRECALC_00_15_1(13, W5, Wtmp0);
+  W_PRECALC_00_15_2(14, W5, Wtmp0);
+  W_PRECALC_00_15_3(15, W5, Wtmp0);
+  /* W[16..31]: W[i] = rol(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1), four words per group; W+K is stored to wk[].  */
+#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+  __asm__ volatile ("movdqa %%"W_m12", %%"W";\n\t" \
+                    "palignr $8, %%"W_m16", %%"W";\n\t" \
+                    "movdqa %%"W_m04", %%"tmp0";\n\t" \
+                    "psrldq $4, %%"tmp0";\n\t" \
+                    "pxor %%"W_m08", %%"W";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+  __asm__ volatile ("pxor %%"W_m16", %%"tmp0";\n\t" \
+                    "pxor %%"tmp0", %%"W";\n\t" \
+                    "movdqa %%"W", %%"tmp1";\n\t" \
+                    "movdqa %%"W", %%"tmp0";\n\t" \
+                    "pslldq $12, %%"tmp1";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+  __asm__ volatile ("psrld $31, %%"W";\n\t" \
+                    "pslld $1, %%"tmp0";\n\t" \
+                    "por %%"W", %%"tmp0";\n\t" \
+                    "movdqa %%"tmp1", %%"W";\n\t" \
+                    "psrld $30, %%"tmp1";\n\t" \
+                    "pslld $2, %%"W";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \
+  __asm__ volatile ("pxor %%"W", %%"tmp0";\n\t" \
+                    "pxor %%"tmp1", %%"tmp0";\n\t" \
+                    "movdqa %%"tmp0", %%"W";\n\t" \
+                    "paddd %[k_xmm], %%"tmp0";\n\t" \
+                    "movdqa %%"tmp0", %[wk];\n\t" \
+                    : [wk] "=m" (WK(i&~3)) \
+                    : [k_xmm] "m" (K_XMM[i / 20][0]) : "memory");
+
+  /* Transform 0-15 + Precalc 16-31. */
+  R( a, b, c, d, e, F1, WK( 0) ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+  R( e, a, b, c, d, F1, WK( 1) ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+  R( d, e, a, b, c, F1, WK( 2) ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+  R( c, d, e, a, b, F1, WK( 3) ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1);
+  R( b, c, d, e, a, F1, WK( 4) ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+  R( a, b, c, d, e, F1, WK( 5) ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+  R( e, a, b, c, d, F1, WK( 6) ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+  R( d, e, a, b, c, F1, WK( 7) ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1);
+  R( c, d, e, a, b, F1, WK( 8) ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+  R( b, c, d, e, a, F1, WK( 9) ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+  R( a, b, c, d, e, F1, WK(10) ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+  R( e, a, b, c, d, F1, WK(11) ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1);
+  R( d, e, a, b, c, F1, WK(12) ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+  R( c, d, e, a, b, F1, WK(13) ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+  R( b, c, d, e, a, F1, WK(14) ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+  R( a, b, c, d, e, F1, WK(15) ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1);
+  /* W[32..79] via the equivalent form W[i] = rol(W[i-6] ^ W[i-16] ^ W[i-28] ^ W[i-32], 2); W enters holding W[i-32].  */
+#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+  __asm__ volatile ("movdqa %%"W_m04", %%"tmp0";\n\t" \
+                    "pxor %%"W_m28", %%"W";\n\t" \
+                    "palignr $8, %%"W_m08", %%"tmp0";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+  __asm__ volatile ("pxor %%"W_m16", %%"W";\n\t" \
+                    "pxor %%"tmp0", %%"W";\n\t" \
+                    "movdqa %%"W", %%"tmp0";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+  __asm__ volatile ("psrld $30, %%"W";\n\t" \
+                    "pslld $2, %%"tmp0";\n\t" \
+                    "por %%"W", %%"tmp0";\n\t" \
+                    :::"cc");
+
+#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \
+  __asm__ volatile ("movdqa %%"tmp0", %%"W";\n\t" \
+                    "paddd %[k_xmm], %%"tmp0";\n\t" \
+                    "movdqa %%"tmp0", %[wk];\n\t" \
+                    : [wk] "=m" (WK(i&~3)) \
+                    : [k_xmm] "m" (K_XMM[i / 20][0]) : "memory");
+
+  /* Transform 16-63 + Precalc 32-79. */
+  R( e, a, b, c, d, F1, WK(16) ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( d, e, a, b, c, F1, WK(17) ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( c, d, e, a, b, F1, WK(18) ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( b, c, d, e, a, F1, WK(19) ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( a, b, c, d, e, F2, WK(20) ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( e, a, b, c, d, F2, WK(21) ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( d, e, a, b, c, F2, WK(22) ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( c, d, e, a, b, F2, WK(23) ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( b, c, d, e, a, F2, WK(24) ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( a, b, c, d, e, F2, WK(25) ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( e, a, b, c, d, F2, WK(26) ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( d, e, a, b, c, F2, WK(27) ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( c, d, e, a, b, F2, WK(28) ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( b, c, d, e, a, F2, WK(29) ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( a, b, c, d, e, F2, WK(30) ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( e, a, b, c, d, F2, WK(31) ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( d, e, a, b, c, F2, WK(32) ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+  R( c, d, e, a, b, F2, WK(33) ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+  R( b, c, d, e, a, F2, WK(34) ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+  R( a, b, c, d, e, F2, WK(35) ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0);
+  R( e, a, b, c, d, F2, WK(36) ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+  R( d, e, a, b, c, F2, WK(37) ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+  R( c, d, e, a, b, F2, WK(38) ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+  R( b, c, d, e, a, F2, WK(39) ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0);
+  R( a, b, c, d, e, F3, WK(40) ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+  R( e, a, b, c, d, F3, WK(41) ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+  R( d, e, a, b, c, F3, WK(42) ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+  R( c, d, e, a, b, F3, WK(43) ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0);
+  R( b, c, d, e, a, F3, WK(44) ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+  R( a, b, c, d, e, F3, WK(45) ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+  R( e, a, b, c, d, F3, WK(46) ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+  R( d, e, a, b, c, F3, WK(47) ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0);
+  R( c, d, e, a, b, F3, WK(48) ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( b, c, d, e, a, F3, WK(49) ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( a, b, c, d, e, F3, WK(50) ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( e, a, b, c, d, F3, WK(51) ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0);
+  R( d, e, a, b, c, F3, WK(52) ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( c, d, e, a, b, F3, WK(53) ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( b, c, d, e, a, F3, WK(54) ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( a, b, c, d, e, F3, WK(55) ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0);
+  R( e, a, b, c, d, F3, WK(56) ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( d, e, a, b, c, F3, WK(57) ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( c, d, e, a, b, F3, WK(58) ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( b, c, d, e, a, F3, WK(59) ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0);
+  R( a, b, c, d, e, F4, WK(60) ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( e, a, b, c, d, F4, WK(61) ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( d, e, a, b, c, F4, WK(62) ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+  R( c, d, e, a, b, F4, WK(63) ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0);
+
+#define CLEAR_REG(reg) __asm__ volatile ("pxor %%"reg", %%"reg";\n\t":::"cc");
+
+  /* Transform 64-79 + Clear XMM registers. */
+  R( b, c, d, e, a, F4, WK(64) ); CLEAR_REG(BSWAP_REG);
+  R( a, b, c, d, e, F4, WK(65) ); CLEAR_REG(Wtmp0);
+  R( e, a, b, c, d, F4, WK(66) ); CLEAR_REG(Wtmp1);
+  R( d, e, a, b, c, F4, WK(67) ); CLEAR_REG(W0);
+  R( c, d, e, a, b, F4, WK(68) ); CLEAR_REG(W1);
+  R( b, c, d, e, a, F4, WK(69) ); CLEAR_REG(W2);
+  R( a, b, c, d, e, F4, WK(70) ); CLEAR_REG(W3);
+  R( e, a, b, c, d, F4, WK(71) ); CLEAR_REG(W4);
+  R( d, e, a, b, c, F4, WK(72) ); CLEAR_REG(W5);
+  R( c, d, e, a, b, F4, WK(73) ); CLEAR_REG(W6);
+  R( b, c, d, e, a, F4, WK(74) ); CLEAR_REG(W7);
+  R( a, b, c, d, e, F4, WK(75) );
+  R( e, a, b, c, d, F4, WK(76) );
+  R( d, e, a, b, c, F4, WK(77) );
+  R( c, d, e, a, b, F4, WK(78) );
+  R( b, c, d, e, a, F4, WK(79) );
+
+  /* Update the chaining variables. */
+  state->h0 += a;
+  state->h1 += b;
+  state->h2 += c;
+  state->h3 += d;
+  state->h4 += e;
+
+  return /* burn_stack */ 84+15;
+}
+
+#endif
+#endif
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 025b3ab..af57b19 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -43,6 +43,15 @@
 #include "hash-common.h"
 
 
+/* USE_SSSE3 indicates whether to compile with Intel SSSE3 code. */
+#undef USE_SSSE3
+#if defined(__x86_64__) && defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) && \
+    defined(HAVE_GCC_INLINE_ASM_SSSE3) && \
+    defined(HAVE_INTEL_SYNTAX_PLATFORM_AS)
+# define USE_SSSE3 1
+#endif
+
+
 /* A macro to test whether P is properly aligned for an u32 type.
    Note that config.h provides a suitable replacement for uintptr_t if
    it does not exist in stdint.h.  */
@@ -56,6 +65,9 @@ typedef struct
 {
   gcry_md_block_ctx_t bctx;
   u32           h0,h1,h2,h3,h4;
+#ifdef USE_SSSE3
+  unsigned int use_ssse3:1;
+#endif
 } SHA1_CONTEXT;
 
 static unsigned int
@@ -78,6 +90,10 @@ sha1_init (void *context)
   hd->bctx.count = 0;
   hd->bctx.blocksize = 64;
   hd->bctx.bwrite = transform;
+
+#ifdef USE_SSSE3
+  hd->use_ssse3 = (_gcry_get_hw_features () & HWF_INTEL_SSSE3) != 0;
+#endif
 }
 
 
@@ -107,7 +123,7 @@ sha1_init (void *context)
  * Transform NBLOCKS of each 64 bytes (16 32-bit words) at DATA.
  */
 static unsigned int
-transform (void *ctx, const unsigned char *data)
+_transform (void *ctx, const unsigned char *data)
 {
   SHA1_CONTEXT *hd = ctx;
   const u32 *idata = (const void *)data;
@@ -217,6 +233,27 @@ transform (void *ctx, const unsigned char *data)
 }
 
 
+#ifdef USE_SSSE3  /* Prototype of the routine defined in sha1-ssse3-amd64.c.  */
+unsigned int
+_gcry_sha1_transform_amd64_ssse3 (void *state, const unsigned char *data);
+#endif
+
+/* Process one 64-byte block at DATA: use the SSSE3 routine when sha1_init enabled it, else the generic _transform.  */
+static unsigned int
+transform (void *ctx, const unsigned char *data)
+{
+  SHA1_CONTEXT *hd = ctx;
+  /* The SSSE3 routine gets &h0 (it updates h0..h4) and returns its burn depth; the 4 extra pointer sizes presumably cover this wrapper's frame -- TODO confirm.  */
+#ifdef USE_SSSE3
+  if (hd->use_ssse3)
+    return _gcry_sha1_transform_amd64_ssse3 (&hd->h0, data)
+           + 4 * sizeof(void*);
+#endif
+
+  return _transform (hd, data);
+}
+
+
 /* The routine final terminates the computation and
  * returns the digest.
  * The handle is prepared for a new cycle, but adding bytes to the
diff --git a/configure.ac b/configure.ac
index b91240b..9088d02 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1773,6 +1773,13 @@ GCRYPT_DIGESTS="$GCRYPT_DIGESTS rmd160.lo sha1.lo"
 AC_DEFINE(USE_RMD160, 1, [Defined if this module should be included])
 AC_DEFINE(USE_SHA1, 1,   [Defined if this module should be included])
 
+case "${host}" in
+  x86_64-*-*)
+    # Build with the SSSE3 inline-assembly implementation
+    GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo"
+  ;;
+esac
+
 LIST_MEMBER(scrypt, $enabled_kdfs)
 if test "$found" = "1" ; then
    GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"

-----------------------------------------------------------------------

Summary of changes:
 cipher/Makefile.am        |    2 +-
 cipher/sha1-ssse3-amd64.c |  319 +++++++++++++++++++++++++++++++++++++++++++++
 cipher/sha1.c             |   39 +++++-
 configure.ac              |    7 +
 4 files changed, 365 insertions(+), 2 deletions(-)
 create mode 100644 cipher/sha1-ssse3-amd64.c


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list