[PATCH 1/1] Improved whirlpool hash performance

And Sch andsch at inbox.com
Thu Aug 28 20:02:17 CEST 2014


* cipher/whirlpool.c (whirlpool_transform, sbox, added macro): Added a macro and rearranged the round function to alternate between reading from and writing to different state and key variables. Two whirlpool_block_t variables were removed and the other two were replaced; the sizes of state and key doubled, so overall the burn stack stays the same. buffer_to_block and block_xor were combined into one operation. The sbox was converted to one large table, because it is faster than many small tables.
--

Benchmark on different systems:

Intel(R) Atom(TM) CPU N570   @ 1.66GHz
before:
Hash:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 WHIRLPOOL      |     63.40 ns/B     15.04 MiB/s         - c/B
after:
Hash:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 WHIRLPOOL      |     46.21 ns/B     20.64 MiB/s         - c/B

Intel(R) Core(TM) i5-4670 CPU @ 3.40GHz
before:
Hash:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 WHIRLPOOL      |      7.75 ns/B     123.0 MiB/s         - c/B
after:
Hash:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 WHIRLPOOL      |      6.70 ns/B     142.3 MiB/s         - c/B

This patch actually shows a greater relative improvement on the Atom system than on the Core i5.

Signed-off-by: Andrei Scherer <andsch at inbox.com>

---

diff -ruNp libgcrypt-1.6.2/cipher/whirlpool.c libgcrypt-1.6.3/cipher/whirlpool.c
--- libgcrypt-1.6.2/cipher/whirlpool.c	2014-08-21 07:50:39.000000000 -0500
+++ libgcrypt-1.6.3/cipher/whirlpool.c	2014-08-28 12:47:04.917824140 -0500
@@ -87,6 +87,17 @@ typedef struct {
   for (i = 0; i < 8; i++) \
     block_dst[i] ^= block_src[i];
 
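+/* XOR the eight lookup-box entries for output word SHIFT: byte k (counted from the most significant) of SRC[(SHIFT - k) & 7] indexes the k-th 256-entry slice of C. */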
+#define WHIRLPOOL_XOR(src, shift) \
+	C[((unsigned int)(src[ (shift)         ] >> 56)       )          ] ^ \
+	C[((unsigned int)(src[((shift) + 7) & 7] >> 48) & 0xff) +  256   ] ^ \
+	C[((unsigned int)(src[((shift) + 6) & 7] >> 40) & 0xff) + (256*2)] ^ \
+	C[((unsigned int)(src[((shift) + 5) & 7] >> 32) & 0xff) + (256*3)] ^ \
+	C[((unsigned int)(src[((shift) + 4) & 7] >> 24) & 0xff) + (256*4)] ^ \
+	C[((unsigned int)(src[((shift) + 3) & 7] >> 16) & 0xff) + (256*5)] ^ \
+	C[((unsigned int)(src[((shift) + 2) & 7] >>  8) & 0xff) + (256*6)] ^ \
+	C[((unsigned int)(src[((shift) + 1) & 7]      ) & 0xff) + (256*7)] \
+
 ?
 
 /* Round constants.  */
@@ -107,7 +118,7 @@ static const u64 rc[R] =
 ?
 
 /* Main lookup boxes.  */
-static const u64 C0[256] =
+static const u64 C[8*256] =
   {
     U64_C (0x18186018c07830d8), U64_C (0x23238c2305af4626),
     U64_C (0xc6c63fc67ef991b8), U64_C (0xe8e887e8136fcdfb),
@@ -237,10 +248,7 @@ static const u64 C0[256] =
     U64_C (0x98985a98b4c22d2c), U64_C (0xa4a4aaa4490e55ed),
     U64_C (0x2828a0285d885075), U64_C (0x5c5c6d5cda31b886),
     U64_C (0xf8f8c7f8933fed6b), U64_C (0x8686228644a411c2),
-  };
 
-static const u64 C1[256] =
-  {
     U64_C (0xd818186018c07830), U64_C (0x2623238c2305af46),
     U64_C (0xb8c6c63fc67ef991), U64_C (0xfbe8e887e8136fcd),
     U64_C (0xcb878726874ca113), U64_C (0x11b8b8dab8a9626d),
@@ -369,10 +377,7 @@ static const u64 C1[256] =
     U64_C (0x2c98985a98b4c22d), U64_C (0xeda4a4aaa4490e55),
     U64_C (0x752828a0285d8850), U64_C (0x865c5c6d5cda31b8),
     U64_C (0x6bf8f8c7f8933fed), U64_C (0xc28686228644a411),
-  };
 
-static const u64 C2[256] =
-  {
     U64_C (0x30d818186018c078), U64_C (0x462623238c2305af),
     U64_C (0x91b8c6c63fc67ef9), U64_C (0xcdfbe8e887e8136f),
     U64_C (0x13cb878726874ca1), U64_C (0x6d11b8b8dab8a962),
@@ -501,10 +506,7 @@ static const u64 C2[256] =
     U64_C (0x2d2c98985a98b4c2), U64_C (0x55eda4a4aaa4490e),
     U64_C (0x50752828a0285d88), U64_C (0xb8865c5c6d5cda31),
     U64_C (0xed6bf8f8c7f8933f), U64_C (0x11c28686228644a4),
-  };
 
-static const u64 C3[256] =
-  {
     U64_C (0x7830d818186018c0), U64_C (0xaf462623238c2305),
     U64_C (0xf991b8c6c63fc67e), U64_C (0x6fcdfbe8e887e813),
     U64_C (0xa113cb878726874c), U64_C (0x626d11b8b8dab8a9),
@@ -633,10 +635,7 @@ static const u64 C3[256] =
     U64_C (0xc22d2c98985a98b4), U64_C (0x0e55eda4a4aaa449),
     U64_C (0x8850752828a0285d), U64_C (0x31b8865c5c6d5cda),
     U64_C (0x3fed6bf8f8c7f893), U64_C (0xa411c28686228644),
-  };
 
-static const u64 C4[256] =
-  {
     U64_C (0xc07830d818186018), U64_C (0x05af462623238c23),
     U64_C (0x7ef991b8c6c63fc6), U64_C (0x136fcdfbe8e887e8),
     U64_C (0x4ca113cb87872687), U64_C (0xa9626d11b8b8dab8),
@@ -765,10 +764,7 @@ static const u64 C4[256] =
     U64_C (0xb4c22d2c98985a98), U64_C (0x490e55eda4a4aaa4),
     U64_C (0x5d8850752828a028), U64_C (0xda31b8865c5c6d5c),
     U64_C (0x933fed6bf8f8c7f8), U64_C (0x44a411c286862286),
-  };
 
-static const u64 C5[256] =
-  {
     U64_C (0x18c07830d8181860), U64_C (0x2305af462623238c),
     U64_C (0xc67ef991b8c6c63f), U64_C (0xe8136fcdfbe8e887),
     U64_C (0x874ca113cb878726), U64_C (0xb8a9626d11b8b8da),
@@ -897,10 +893,7 @@ static const u64 C5[256] =
     U64_C (0x98b4c22d2c98985a), U64_C (0xa4490e55eda4a4aa),
     U64_C (0x285d8850752828a0), U64_C (0x5cda31b8865c5c6d),
     U64_C (0xf8933fed6bf8f8c7), U64_C (0x8644a411c2868622),
-  };
 
-static const u64 C6[256] =
-  {
     U64_C (0x6018c07830d81818), U64_C (0x8c2305af46262323),
     U64_C (0x3fc67ef991b8c6c6), U64_C (0x87e8136fcdfbe8e8),
     U64_C (0x26874ca113cb8787), U64_C (0xdab8a9626d11b8b8),
@@ -1029,10 +1022,7 @@ static const u64 C6[256] =
     U64_C (0x5a98b4c22d2c9898), U64_C (0xaaa4490e55eda4a4),
     U64_C (0xa0285d8850752828), U64_C (0x6d5cda31b8865c5c),
     U64_C (0xc7f8933fed6bf8f8), U64_C (0x228644a411c28686),
-  };
 
-static const u64 C7[256] =
-  {
     U64_C (0x186018c07830d818), U64_C (0x238c2305af462623),
     U64_C (0xc63fc67ef991b8c6), U64_C (0xe887e8136fcdfbe8),
     U64_C (0x8726874ca113cb87), U64_C (0xb8dab8a9626d11b8),
@@ -1163,7 +1153,6 @@ static const u64 C7[256] =
     U64_C (0xf8c7f8933fed6bf8), U64_C (0x86228644a411c286),
   };
 
-
 ?
 /*
  * Transform block.
@@ -1172,97 +1161,36 @@ static unsigned int
 whirlpool_transform (void *ctx, const unsigned char *data)
 {
   whirlpool_context_t *context = ctx;
-  whirlpool_block_t data_block;
-  whirlpool_block_t key;
-  whirlpool_block_t state;
-  whirlpool_block_t block;
+  u64 key[2][BLOCK_SIZE / 8];
+  u64 state[2][BLOCK_SIZE / 8];
   unsigned int r;
   unsigned int i;
 
-  buffer_to_block (data, data_block, i);
-  block_copy (key, context->hash_state, i);
-  block_copy (state, context->hash_state, i);
-  block_xor (state, data_block, i);
+  /* buffer_to_block and block_xor at once */
+
+  for (i = 0; i < 8; i++)
+    state[0][i] = buf_get_be64((data) + i * 8) ^ context->hash_state[i];
+
+  block_copy (key[0], context->hash_state, i);
+  block_copy (context->hash_state, state[0], i);
 
-  for (r = 0; r < R; r++)
+  for (r = 0, i = 0; r < R; r++, i = !i)
     {
-      /* Compute round key K^r.  */
+      /* Compute round key K^r, and apply r-th round transformation, interleaved  */
 
-      block[0] = (C0[(key[0] >> 56) & 0xFF] ^ C1[(key[7] >> 48) & 0xFF] ^
-		  C2[(key[6] >> 40) & 0xFF] ^ C3[(key[5] >> 32) & 0xFF] ^
-		  C4[(key[4] >> 24) & 0xFF] ^ C5[(key[3] >> 16) & 0xFF] ^
-		  C6[(key[2] >>  8) & 0xFF] ^ C7[(key[1] >>  0) & 0xFF] ^ rc[r]);
-      block[1] = (C0[(key[1] >> 56) & 0xFF] ^ C1[(key[0] >> 48) & 0xFF] ^
-		  C2[(key[7] >> 40) & 0xFF] ^ C3[(key[6] >> 32) & 0xFF] ^
-		  C4[(key[5] >> 24) & 0xFF] ^ C5[(key[4] >> 16) & 0xFF] ^
-		  C6[(key[3] >>  8) & 0xFF] ^ C7[(key[2] >>  0) & 0xFF]);
-      block[2] = (C0[(key[2] >> 56) & 0xFF] ^ C1[(key[1] >> 48) & 0xFF] ^
-		  C2[(key[0] >> 40) & 0xFF] ^ C3[(key[7] >> 32) & 0xFF] ^
-		  C4[(key[6] >> 24) & 0xFF] ^ C5[(key[5] >> 16) & 0xFF] ^
-		  C6[(key[4] >>  8) & 0xFF] ^ C7[(key[3] >>  0) & 0xFF]);
-      block[3] = (C0[(key[3] >> 56) & 0xFF] ^ C1[(key[2] >> 48) & 0xFF] ^
-		  C2[(key[1] >> 40) & 0xFF] ^ C3[(key[0] >> 32) & 0xFF] ^
-		  C4[(key[7] >> 24) & 0xFF] ^ C5[(key[6] >> 16) & 0xFF] ^
-		  C6[(key[5] >>  8) & 0xFF] ^ C7[(key[4] >>  0) & 0xFF]);
-      block[4] = (C0[(key[4] >> 56) & 0xFF] ^ C1[(key[3] >> 48) & 0xFF] ^
-		  C2[(key[2] >> 40) & 0xFF] ^ C3[(key[1] >> 32) & 0xFF] ^
-		  C4[(key[0] >> 24) & 0xFF] ^ C5[(key[7] >> 16) & 0xFF] ^
-		  C6[(key[6] >>  8) & 0xFF] ^ C7[(key[5] >>  0) & 0xFF]);
-      block[5] = (C0[(key[5] >> 56) & 0xFF] ^ C1[(key[4] >> 48) & 0xFF] ^
-		  C2[(key[3] >> 40) & 0xFF] ^ C3[(key[2] >> 32) & 0xFF] ^
-		  C4[(key[1] >> 24) & 0xFF] ^ C5[(key[0] >> 16) & 0xFF] ^
-		  C6[(key[7] >>  8) & 0xFF] ^ C7[(key[6] >>  0) & 0xFF]);
-      block[6] = (C0[(key[6] >> 56) & 0xFF] ^ C1[(key[5] >> 48) & 0xFF] ^
-		  C2[(key[4] >> 40) & 0xFF] ^ C3[(key[3] >> 32) & 0xFF] ^
-		  C4[(key[2] >> 24) & 0xFF] ^ C5[(key[1] >> 16) & 0xFF] ^
-		  C6[(key[0] >>  8) & 0xFF] ^ C7[(key[7] >>  0) & 0xFF]);
-      block[7] = (C0[(key[7] >> 56) & 0xFF] ^ C1[(key[6] >> 48) & 0xFF] ^
-		  C2[(key[5] >> 40) & 0xFF] ^ C3[(key[4] >> 32) & 0xFF] ^
-		  C4[(key[3] >> 24) & 0xFF] ^ C5[(key[2] >> 16) & 0xFF] ^
-		  C6[(key[1] >>  8) & 0xFF] ^ C7[(key[0] >>  0) & 0xFF]);
-      block_copy (key, block, i);
-
-      /* Apply r-th round transformation.  */
-
-      block[0] = (C0[(state[0] >> 56) & 0xFF] ^ C1[(state[7] >> 48) & 0xFF] ^
-		  C2[(state[6] >> 40) & 0xFF] ^ C3[(state[5] >> 32) & 0xFF] ^
-		  C4[(state[4] >> 24) & 0xFF] ^ C5[(state[3] >> 16) & 0xFF] ^
-		  C6[(state[2] >>  8) & 0xFF] ^ C7[(state[1] >>  0) & 0xFF] ^ key[0]);
-      block[1] = (C0[(state[1] >> 56) & 0xFF] ^ C1[(state[0] >> 48) & 0xFF] ^
-		  C2[(state[7] >> 40) & 0xFF] ^ C3[(state[6] >> 32) & 0xFF] ^
-		  C4[(state[5] >> 24) & 0xFF] ^ C5[(state[4] >> 16) & 0xFF] ^
-		  C6[(state[3] >>  8) & 0xFF] ^ C7[(state[2] >>  0) & 0xFF] ^ key[1]);
-      block[2] = (C0[(state[2] >> 56) & 0xFF] ^ C1[(state[1] >> 48) & 0xFF] ^
-		  C2[(state[0] >> 40) & 0xFF] ^ C3[(state[7] >> 32) & 0xFF] ^
-		  C4[(state[6] >> 24) & 0xFF] ^ C5[(state[5] >> 16) & 0xFF] ^
-		  C6[(state[4] >>  8) & 0xFF] ^ C7[(state[3] >>  0) & 0xFF] ^ key[2]);
-      block[3] = (C0[(state[3] >> 56) & 0xFF] ^ C1[(state[2] >> 48) & 0xFF] ^
-		  C2[(state[1] >> 40) & 0xFF] ^ C3[(state[0] >> 32) & 0xFF] ^
-		  C4[(state[7] >> 24) & 0xFF] ^ C5[(state[6] >> 16) & 0xFF] ^
-		  C6[(state[5] >>  8) & 0xFF] ^ C7[(state[4] >>  0) & 0xFF] ^ key[3]);
-      block[4] = (C0[(state[4] >> 56) & 0xFF] ^ C1[(state[3] >> 48) & 0xFF] ^
-		  C2[(state[2] >> 40) & 0xFF] ^ C3[(state[1] >> 32) & 0xFF] ^
-		  C4[(state[0] >> 24) & 0xFF] ^ C5[(state[7] >> 16) & 0xFF] ^
-		  C6[(state[6] >>  8) & 0xFF] ^ C7[(state[5] >>  0) & 0xFF] ^ key[4]);
-      block[5] = (C0[(state[5] >> 56) & 0xFF] ^ C1[(state[4] >> 48) & 0xFF] ^
-		  C2[(state[3] >> 40) & 0xFF] ^ C3[(state[2] >> 32) & 0xFF] ^
-		  C4[(state[1] >> 24) & 0xFF] ^ C5[(state[0] >> 16) & 0xFF] ^
-		  C6[(state[7] >>  8) & 0xFF] ^ C7[(state[6] >>  0) & 0xFF] ^ key[5]);
-      block[6] = (C0[(state[6] >> 56) & 0xFF] ^ C1[(state[5] >> 48) & 0xFF] ^
-		  C2[(state[4] >> 40) & 0xFF] ^ C3[(state[3] >> 32) & 0xFF] ^
-		  C4[(state[2] >> 24) & 0xFF] ^ C5[(state[1] >> 16) & 0xFF] ^
-		  C6[(state[0] >>  8) & 0xFF] ^ C7[(state[7] >>  0) & 0xFF] ^ key[6]);
-      block[7] = (C0[(state[7] >> 56) & 0xFF] ^ C1[(state[6] >> 48) & 0xFF] ^
-		  C2[(state[5] >> 40) & 0xFF] ^ C3[(state[4] >> 32) & 0xFF] ^
-		  C4[(state[3] >> 24) & 0xFF] ^ C5[(state[2] >> 16) & 0xFF] ^
-		  C6[(state[1] >>  8) & 0xFF] ^ C7[(state[0] >>  0) & 0xFF] ^ key[7]);
-      block_copy (state, block, i);
+      state[!i][0] = WHIRLPOOL_XOR(state[i], 0) ^ (key[!i][0] = WHIRLPOOL_XOR(key[i], 0) ^ rc[r]);
+      state[!i][1] = WHIRLPOOL_XOR(state[i], 1) ^ (key[!i][1] = WHIRLPOOL_XOR(key[i], 1));
+      state[!i][2] = WHIRLPOOL_XOR(state[i], 2) ^ (key[!i][2] = WHIRLPOOL_XOR(key[i], 2));
+      state[!i][3] = WHIRLPOOL_XOR(state[i], 3) ^ (key[!i][3] = WHIRLPOOL_XOR(key[i], 3));
+      state[!i][4] = WHIRLPOOL_XOR(state[i], 4) ^ (key[!i][4] = WHIRLPOOL_XOR(key[i], 4));
+      state[!i][5] = WHIRLPOOL_XOR(state[i], 5) ^ (key[!i][5] = WHIRLPOOL_XOR(key[i], 5));
+      state[!i][6] = WHIRLPOOL_XOR(state[i], 6) ^ (key[!i][6] = WHIRLPOOL_XOR(key[i], 6));
+      state[!i][7] = WHIRLPOOL_XOR(state[i], 7) ^ (key[!i][7] = WHIRLPOOL_XOR(key[i], 7));
     }
 
   /* Compression.  */
 
-  block_xor (context->hash_state, data_block, i);
-  block_xor (context->hash_state, state, i);
+  block_xor (context->hash_state, state[0], i);
 
   return /*burn_stack*/ 4 * sizeof(whirlpool_block_t) + 2 * sizeof(int) +
                         4 * sizeof(void*);

____________________________________________________________
FREE ONLINE PHOTOSHARING - Share your photos online with your friends and family!
Visit http://www.inbox.com/photosharing to find out more!





More information about the Gcrypt-devel mailing list