[PATCH 1/1] Improved whirlpool hash performance
And Sch
andsch at inbox.com
Thu Aug 28 20:02:17 CEST 2014
* cipher/whirlpool.c (whirlpool_transform, sbox, added macro): Added macro and rearranged round function to alternate between reading to and writing from different state and key variables. Two whirlpool_context_t variables removed, two were replaced, the sizes of state and key doubled, so overall the burn stack stays the same. buffer_to_block and block_xor were combined into one operation. The sbox was converted to one large table, because it is faster than many small tables.
--
Benchmark on different systems:
Intel(R) Atom(TM) CPU N570 @ 1.66GHz
before:
Hash:
| nanosecs/byte mebibytes/sec cycles/byte
WHIRLPOOL | 63.40 ns/B 15.04 MiB/s - c/B
after:
Hash:
| nanosecs/byte mebibytes/sec cycles/byte
WHIRLPOOL | 46.21 ns/B 20.64 MiB/s - c/B
Intel(R) Core(TM) i5-4670 CPU @ 3.40GHz
before:
Hash:
| nanosecs/byte mebibytes/sec cycles/byte
WHIRLPOOL | 7.75 ns/B 123.0 MiB/s - c/B
after:
Hash:
| nanosecs/byte mebibytes/sec cycles/byte
WHIRLPOOL | 6.70 ns/B 142.3 MiB/s - c/B
This one actually shows greater improvement on the Atom system.
Signed-off-by: Andrei Scherer <andsch at inbox.com>
---
diff -ruNp libgcrypt-1.6.2/cipher/whirlpool.c libgcrypt-1.6.3/cipher/whirlpool.c
--- libgcrypt-1.6.2/cipher/whirlpool.c 2014-08-21 07:50:39.000000000 -0500
+++ libgcrypt-1.6.3/cipher/whirlpool.c 2014-08-28 12:47:04.917824140 -0500
@@ -87,6 +87,17 @@ typedef struct {
for (i = 0; i < 8; i++) \
block_dst[i] ^= block_src[i];
+/* XOR lookup boxes with index SRC [(SHIFT + n) & 7] >> x. */
+#define WHIRLPOOL_XOR(src, shift) \
+ C[((unsigned int)(src[ (shift) ] >> 56) ) ] ^ \
+ C[((unsigned int)(src[((shift) + 7) & 7] >> 48) & 0xff) + 256 ] ^ \
+ C[((unsigned int)(src[((shift) + 6) & 7] >> 40) & 0xff) + (256*2)] ^ \
+ C[((unsigned int)(src[((shift) + 5) & 7] >> 32) & 0xff) + (256*3)] ^ \
+ C[((unsigned int)(src[((shift) + 4) & 7] >> 24) & 0xff) + (256*4)] ^ \
+ C[((unsigned int)(src[((shift) + 3) & 7] >> 16) & 0xff) + (256*5)] ^ \
+ C[((unsigned int)(src[((shift) + 2) & 7] >> 8) & 0xff) + (256*6)] ^ \
+ C[((unsigned int)(src[((shift) + 1) & 7] ) & 0xff) + (256*7)] \
+
?
/* Round constants. */
@@ -107,7 +118,7 @@ static const u64 rc[R] =
?
/* Main lookup boxes. */
-static const u64 C0[256] =
+static const u64 C[8*256] =
{
U64_C (0x18186018c07830d8), U64_C (0x23238c2305af4626),
U64_C (0xc6c63fc67ef991b8), U64_C (0xe8e887e8136fcdfb),
@@ -237,10 +248,7 @@ static const u64 C0[256] =
U64_C (0x98985a98b4c22d2c), U64_C (0xa4a4aaa4490e55ed),
U64_C (0x2828a0285d885075), U64_C (0x5c5c6d5cda31b886),
U64_C (0xf8f8c7f8933fed6b), U64_C (0x8686228644a411c2),
- };
-static const u64 C1[256] =
- {
U64_C (0xd818186018c07830), U64_C (0x2623238c2305af46),
U64_C (0xb8c6c63fc67ef991), U64_C (0xfbe8e887e8136fcd),
U64_C (0xcb878726874ca113), U64_C (0x11b8b8dab8a9626d),
@@ -369,10 +377,7 @@ static const u64 C1[256] =
U64_C (0x2c98985a98b4c22d), U64_C (0xeda4a4aaa4490e55),
U64_C (0x752828a0285d8850), U64_C (0x865c5c6d5cda31b8),
U64_C (0x6bf8f8c7f8933fed), U64_C (0xc28686228644a411),
- };
-static const u64 C2[256] =
- {
U64_C (0x30d818186018c078), U64_C (0x462623238c2305af),
U64_C (0x91b8c6c63fc67ef9), U64_C (0xcdfbe8e887e8136f),
U64_C (0x13cb878726874ca1), U64_C (0x6d11b8b8dab8a962),
@@ -501,10 +506,7 @@ static const u64 C2[256] =
U64_C (0x2d2c98985a98b4c2), U64_C (0x55eda4a4aaa4490e),
U64_C (0x50752828a0285d88), U64_C (0xb8865c5c6d5cda31),
U64_C (0xed6bf8f8c7f8933f), U64_C (0x11c28686228644a4),
- };
-static const u64 C3[256] =
- {
U64_C (0x7830d818186018c0), U64_C (0xaf462623238c2305),
U64_C (0xf991b8c6c63fc67e), U64_C (0x6fcdfbe8e887e813),
U64_C (0xa113cb878726874c), U64_C (0x626d11b8b8dab8a9),
@@ -633,10 +635,7 @@ static const u64 C3[256] =
U64_C (0xc22d2c98985a98b4), U64_C (0x0e55eda4a4aaa449),
U64_C (0x8850752828a0285d), U64_C (0x31b8865c5c6d5cda),
U64_C (0x3fed6bf8f8c7f893), U64_C (0xa411c28686228644),
- };
-static const u64 C4[256] =
- {
U64_C (0xc07830d818186018), U64_C (0x05af462623238c23),
U64_C (0x7ef991b8c6c63fc6), U64_C (0x136fcdfbe8e887e8),
U64_C (0x4ca113cb87872687), U64_C (0xa9626d11b8b8dab8),
@@ -765,10 +764,7 @@ static const u64 C4[256] =
U64_C (0xb4c22d2c98985a98), U64_C (0x490e55eda4a4aaa4),
U64_C (0x5d8850752828a028), U64_C (0xda31b8865c5c6d5c),
U64_C (0x933fed6bf8f8c7f8), U64_C (0x44a411c286862286),
- };
-static const u64 C5[256] =
- {
U64_C (0x18c07830d8181860), U64_C (0x2305af462623238c),
U64_C (0xc67ef991b8c6c63f), U64_C (0xe8136fcdfbe8e887),
U64_C (0x874ca113cb878726), U64_C (0xb8a9626d11b8b8da),
@@ -897,10 +893,7 @@ static const u64 C5[256] =
U64_C (0x98b4c22d2c98985a), U64_C (0xa4490e55eda4a4aa),
U64_C (0x285d8850752828a0), U64_C (0x5cda31b8865c5c6d),
U64_C (0xf8933fed6bf8f8c7), U64_C (0x8644a411c2868622),
- };
-static const u64 C6[256] =
- {
U64_C (0x6018c07830d81818), U64_C (0x8c2305af46262323),
U64_C (0x3fc67ef991b8c6c6), U64_C (0x87e8136fcdfbe8e8),
U64_C (0x26874ca113cb8787), U64_C (0xdab8a9626d11b8b8),
@@ -1029,10 +1022,7 @@ static const u64 C6[256] =
U64_C (0x5a98b4c22d2c9898), U64_C (0xaaa4490e55eda4a4),
U64_C (0xa0285d8850752828), U64_C (0x6d5cda31b8865c5c),
U64_C (0xc7f8933fed6bf8f8), U64_C (0x228644a411c28686),
- };
-static const u64 C7[256] =
- {
U64_C (0x186018c07830d818), U64_C (0x238c2305af462623),
U64_C (0xc63fc67ef991b8c6), U64_C (0xe887e8136fcdfbe8),
U64_C (0x8726874ca113cb87), U64_C (0xb8dab8a9626d11b8),
@@ -1163,7 +1153,6 @@ static const u64 C7[256] =
U64_C (0xf8c7f8933fed6bf8), U64_C (0x86228644a411c286),
};
-
?
/*
* Transform block.
@@ -1172,97 +1161,36 @@ static unsigned int
whirlpool_transform (void *ctx, const unsigned char *data)
{
whirlpool_context_t *context = ctx;
- whirlpool_block_t data_block;
- whirlpool_block_t key;
- whirlpool_block_t state;
- whirlpool_block_t block;
+ u64 key[2][BLOCK_SIZE / 8];
+ u64 state[2][BLOCK_SIZE / 8];
unsigned int r;
unsigned int i;
- buffer_to_block (data, data_block, i);
- block_copy (key, context->hash_state, i);
- block_copy (state, context->hash_state, i);
- block_xor (state, data_block, i);
+ /* buffer_to_block and block_xor at once */
+
+ for (i = 0; i < 8; i++)
+ state[0][i] = buf_get_be64((data) + i * 8) ^ context->hash_state[i];
+
+ block_copy (key[0], context->hash_state, i);
+ block_copy (context->hash_state, state[0], i);
- for (r = 0; r < R; r++)
+ for (r = 0, i = 0; r < R; r++, i = !i)
{
- /* Compute round key K^r. */
+ /* Compute round key K^r, and apply r-th round transformation, interleaved */
- block[0] = (C0[(key[0] >> 56) & 0xFF] ^ C1[(key[7] >> 48) & 0xFF] ^
- C2[(key[6] >> 40) & 0xFF] ^ C3[(key[5] >> 32) & 0xFF] ^
- C4[(key[4] >> 24) & 0xFF] ^ C5[(key[3] >> 16) & 0xFF] ^
- C6[(key[2] >> 8) & 0xFF] ^ C7[(key[1] >> 0) & 0xFF] ^ rc[r]);
- block[1] = (C0[(key[1] >> 56) & 0xFF] ^ C1[(key[0] >> 48) & 0xFF] ^
- C2[(key[7] >> 40) & 0xFF] ^ C3[(key[6] >> 32) & 0xFF] ^
- C4[(key[5] >> 24) & 0xFF] ^ C5[(key[4] >> 16) & 0xFF] ^
- C6[(key[3] >> 8) & 0xFF] ^ C7[(key[2] >> 0) & 0xFF]);
- block[2] = (C0[(key[2] >> 56) & 0xFF] ^ C1[(key[1] >> 48) & 0xFF] ^
- C2[(key[0] >> 40) & 0xFF] ^ C3[(key[7] >> 32) & 0xFF] ^
- C4[(key[6] >> 24) & 0xFF] ^ C5[(key[5] >> 16) & 0xFF] ^
- C6[(key[4] >> 8) & 0xFF] ^ C7[(key[3] >> 0) & 0xFF]);
- block[3] = (C0[(key[3] >> 56) & 0xFF] ^ C1[(key[2] >> 48) & 0xFF] ^
- C2[(key[1] >> 40) & 0xFF] ^ C3[(key[0] >> 32) & 0xFF] ^
- C4[(key[7] >> 24) & 0xFF] ^ C5[(key[6] >> 16) & 0xFF] ^
- C6[(key[5] >> 8) & 0xFF] ^ C7[(key[4] >> 0) & 0xFF]);
- block[4] = (C0[(key[4] >> 56) & 0xFF] ^ C1[(key[3] >> 48) & 0xFF] ^
- C2[(key[2] >> 40) & 0xFF] ^ C3[(key[1] >> 32) & 0xFF] ^
- C4[(key[0] >> 24) & 0xFF] ^ C5[(key[7] >> 16) & 0xFF] ^
- C6[(key[6] >> 8) & 0xFF] ^ C7[(key[5] >> 0) & 0xFF]);
- block[5] = (C0[(key[5] >> 56) & 0xFF] ^ C1[(key[4] >> 48) & 0xFF] ^
- C2[(key[3] >> 40) & 0xFF] ^ C3[(key[2] >> 32) & 0xFF] ^
- C4[(key[1] >> 24) & 0xFF] ^ C5[(key[0] >> 16) & 0xFF] ^
- C6[(key[7] >> 8) & 0xFF] ^ C7[(key[6] >> 0) & 0xFF]);
- block[6] = (C0[(key[6] >> 56) & 0xFF] ^ C1[(key[5] >> 48) & 0xFF] ^
- C2[(key[4] >> 40) & 0xFF] ^ C3[(key[3] >> 32) & 0xFF] ^
- C4[(key[2] >> 24) & 0xFF] ^ C5[(key[1] >> 16) & 0xFF] ^
- C6[(key[0] >> 8) & 0xFF] ^ C7[(key[7] >> 0) & 0xFF]);
- block[7] = (C0[(key[7] >> 56) & 0xFF] ^ C1[(key[6] >> 48) & 0xFF] ^
- C2[(key[5] >> 40) & 0xFF] ^ C3[(key[4] >> 32) & 0xFF] ^
- C4[(key[3] >> 24) & 0xFF] ^ C5[(key[2] >> 16) & 0xFF] ^
- C6[(key[1] >> 8) & 0xFF] ^ C7[(key[0] >> 0) & 0xFF]);
- block_copy (key, block, i);
-
- /* Apply r-th round transformation. */
-
- block[0] = (C0[(state[0] >> 56) & 0xFF] ^ C1[(state[7] >> 48) & 0xFF] ^
- C2[(state[6] >> 40) & 0xFF] ^ C3[(state[5] >> 32) & 0xFF] ^
- C4[(state[4] >> 24) & 0xFF] ^ C5[(state[3] >> 16) & 0xFF] ^
- C6[(state[2] >> 8) & 0xFF] ^ C7[(state[1] >> 0) & 0xFF] ^ key[0]);
- block[1] = (C0[(state[1] >> 56) & 0xFF] ^ C1[(state[0] >> 48) & 0xFF] ^
- C2[(state[7] >> 40) & 0xFF] ^ C3[(state[6] >> 32) & 0xFF] ^
- C4[(state[5] >> 24) & 0xFF] ^ C5[(state[4] >> 16) & 0xFF] ^
- C6[(state[3] >> 8) & 0xFF] ^ C7[(state[2] >> 0) & 0xFF] ^ key[1]);
- block[2] = (C0[(state[2] >> 56) & 0xFF] ^ C1[(state[1] >> 48) & 0xFF] ^
- C2[(state[0] >> 40) & 0xFF] ^ C3[(state[7] >> 32) & 0xFF] ^
- C4[(state[6] >> 24) & 0xFF] ^ C5[(state[5] >> 16) & 0xFF] ^
- C6[(state[4] >> 8) & 0xFF] ^ C7[(state[3] >> 0) & 0xFF] ^ key[2]);
- block[3] = (C0[(state[3] >> 56) & 0xFF] ^ C1[(state[2] >> 48) & 0xFF] ^
- C2[(state[1] >> 40) & 0xFF] ^ C3[(state[0] >> 32) & 0xFF] ^
- C4[(state[7] >> 24) & 0xFF] ^ C5[(state[6] >> 16) & 0xFF] ^
- C6[(state[5] >> 8) & 0xFF] ^ C7[(state[4] >> 0) & 0xFF] ^ key[3]);
- block[4] = (C0[(state[4] >> 56) & 0xFF] ^ C1[(state[3] >> 48) & 0xFF] ^
- C2[(state[2] >> 40) & 0xFF] ^ C3[(state[1] >> 32) & 0xFF] ^
- C4[(state[0] >> 24) & 0xFF] ^ C5[(state[7] >> 16) & 0xFF] ^
- C6[(state[6] >> 8) & 0xFF] ^ C7[(state[5] >> 0) & 0xFF] ^ key[4]);
- block[5] = (C0[(state[5] >> 56) & 0xFF] ^ C1[(state[4] >> 48) & 0xFF] ^
- C2[(state[3] >> 40) & 0xFF] ^ C3[(state[2] >> 32) & 0xFF] ^
- C4[(state[1] >> 24) & 0xFF] ^ C5[(state[0] >> 16) & 0xFF] ^
- C6[(state[7] >> 8) & 0xFF] ^ C7[(state[6] >> 0) & 0xFF] ^ key[5]);
- block[6] = (C0[(state[6] >> 56) & 0xFF] ^ C1[(state[5] >> 48) & 0xFF] ^
- C2[(state[4] >> 40) & 0xFF] ^ C3[(state[3] >> 32) & 0xFF] ^
- C4[(state[2] >> 24) & 0xFF] ^ C5[(state[1] >> 16) & 0xFF] ^
- C6[(state[0] >> 8) & 0xFF] ^ C7[(state[7] >> 0) & 0xFF] ^ key[6]);
- block[7] = (C0[(state[7] >> 56) & 0xFF] ^ C1[(state[6] >> 48) & 0xFF] ^
- C2[(state[5] >> 40) & 0xFF] ^ C3[(state[4] >> 32) & 0xFF] ^
- C4[(state[3] >> 24) & 0xFF] ^ C5[(state[2] >> 16) & 0xFF] ^
- C6[(state[1] >> 8) & 0xFF] ^ C7[(state[0] >> 0) & 0xFF] ^ key[7]);
- block_copy (state, block, i);
+ state[!i][0] = WHIRLPOOL_XOR(state[i], 0) ^ (key[!i][0] = WHIRLPOOL_XOR(key[i], 0) ^ rc[r]);
+ state[!i][1] = WHIRLPOOL_XOR(state[i], 1) ^ (key[!i][1] = WHIRLPOOL_XOR(key[i], 1));
+ state[!i][2] = WHIRLPOOL_XOR(state[i], 2) ^ (key[!i][2] = WHIRLPOOL_XOR(key[i], 2));
+ state[!i][3] = WHIRLPOOL_XOR(state[i], 3) ^ (key[!i][3] = WHIRLPOOL_XOR(key[i], 3));
+ state[!i][4] = WHIRLPOOL_XOR(state[i], 4) ^ (key[!i][4] = WHIRLPOOL_XOR(key[i], 4));
+ state[!i][5] = WHIRLPOOL_XOR(state[i], 5) ^ (key[!i][5] = WHIRLPOOL_XOR(key[i], 5));
+ state[!i][6] = WHIRLPOOL_XOR(state[i], 6) ^ (key[!i][6] = WHIRLPOOL_XOR(key[i], 6));
+ state[!i][7] = WHIRLPOOL_XOR(state[i], 7) ^ (key[!i][7] = WHIRLPOOL_XOR(key[i], 7));
}
/* Compression. */
- block_xor (context->hash_state, data_block, i);
- block_xor (context->hash_state, state, i);
+ block_xor (context->hash_state, state[0], i);
return /*burn_stack*/ 4 * sizeof(whirlpool_block_t) + 2 * sizeof(int) +
4 * sizeof(void*);
____________________________________________________________
FREE ONLINE PHOTOSHARING - Share your photos online with your friends and family!
Visit http://www.inbox.com/photosharing to find out more!
More information about the Gcrypt-devel
mailing list