[PATCH 3/4] sha512: reduce stack use in transform function by 512 bytes

Sat Aug 31 12:36:19 CEST 2013

* cipher/sha512.c (transform): Change 'u64 w[80]' to 'u64 w[16]' and
inline input expansion to first 64 rounds.
(sha512_write, sha512_final): Reduce burn_stack depth by 512 bytes.
--

The input expansion to w[] array can be inlined with rounds and size of array
reduced from u64[80] to u64[16]. On Cortex-A8, this change gives small boost,
possibly thanks to reduced burn_stack depth.

New vs old (tests/benchmark md sha512 sha384):
SHA512	1.09x	1.11x	1.06x	1.09x	1.08x
SHA384	1.09x	1.11x	1.06x	1.09x	1.09x

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/sha512.c |  191 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 173 insertions(+), 18 deletions(-)

diff --git a/cipher/sha512.c b/cipher/sha512.c
index 2163e60..1bbcd11 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -135,7 +135,7 @@ static void
 transform (SHA512_CONTEXT *hd, const unsigned char *data)
 {
   u64 a, b, c, d, e, f, g, h;
-  u64 w[80];
+  u64 w[16];
   int t;
   static const u64 k[] =
     {
@@ -215,11 +215,8 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
 #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
 
-  for (t = 16; t < 80; t++)
-    w[t] = S1 (w[t - 2]) + w[t - 7] + S0 (w[t - 15]) + w[t - 16];
 
-
-  for (t = 0; t < 80; )
+  for (t = 0; t < 80 - 16; )
     {
       u64 t1, t2;
 
@@ -232,7 +229,125 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
          Unrolled with inline:      330ms
       */
 #if 0 /* Not unrolled.  */
-      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
+      w[t%16] += S1 (w[(t - 2)%16]) + w[(t - 7)%16] + S0 (w[(t - 15)%16]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      h = g;
+      g = f;
+      f = e;
+      e = d + t1;
+      d = c;
+      c = b;
+      b = a;
+      a = t1 + t2;
+      t++;
+#else /* Unrolled to interweave the chain variables.  */
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+      w[0] += S1 (w[14]) + w[9] + S0 (w[1]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+      w[1] += S1 (w[15]) + w[10] + S0 (w[2]);
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+      w[2] += S1 (w[0]) + w[11] + S0 (w[3]);
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+      w[3] += S1 (w[1]) + w[12] + S0 (w[4]);
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+      w[4] += S1 (w[2]) + w[13] + S0 (w[5]);
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+      w[5] += S1 (w[3]) + w[14] + S0 (w[6]);
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+      w[6] += S1 (w[4]) + w[15] + S0 (w[7]);
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+      w[7] += S1 (w[5]) + w[0] + S0 (w[8]);
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
+      w[8] += S1 (w[6]) + w[1] + S0 (w[9]);
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h  = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
+      w[9] += S1 (w[7]) + w[2] + S0 (w[10]);
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
+      w[10] += S1 (w[8]) + w[3] + S0 (w[11]);
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
+      w[11] += S1 (w[9]) + w[4] + S0 (w[12]);
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
+      w[12] += S1 (w[10]) + w[5] + S0 (w[13]);
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
+      w[13] += S1 (w[11]) + w[6] + S0 (w[14]);
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
+      w[14] += S1 (w[12]) + w[7] + S0 (w[15]);
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
+      w[15] += S1 (w[13]) + w[8] + S0 (w[0]);
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t += 16;
+#endif
+    }
+
+  for (; t < 80; )
+    {
+      u64 t1, t2;
+
+#if 0 /* Not unrolled.  */
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t%16];
       t2 = Sum0 (a) + Maj (a, b, c);
       h = g;
       g = f;
@@ -244,47 +359,87 @@ transform (SHA512_CONTEXT *hd, const unsigned char *data)
       a = t1 + t2;
       t++;
 #else /* Unrolled to interweave the chain variables.  */
-      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[0];
+      t2 = Sum0 (a) + Maj (a, b, c);
+      d += t1;
+      h  = t1 + t2;
+
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[1];
+      t2 = Sum0 (h) + Maj (h, a, b);
+      c += t1;
+      g  = t1 + t2;
+
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[2];
+      t2 = Sum0 (g) + Maj (g, h, a);
+      b += t1;
+      f  = t1 + t2;
+
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[3];
+      t2 = Sum0 (f) + Maj (f, g, h);
+      a += t1;
+      e  = t1 + t2;
+
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[4];
+      t2 = Sum0 (e) + Maj (e, f, g);
+      h += t1;
+      d  = t1 + t2;
+
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[5];
+      t2 = Sum0 (d) + Maj (d, e, f);
+      g += t1;
+      c  = t1 + t2;
+
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[6];
+      t2 = Sum0 (c) + Maj (c, d, e);
+      f += t1;
+      b  = t1 + t2;
+
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[7];
+      t2 = Sum0 (b) + Maj (b, c, d);
+      e += t1;
+      a  = t1 + t2;
+
+      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t+8] + w[8];
       t2 = Sum0 (a) + Maj (a, b, c);
       d += t1;
       h  = t1 + t2;
 
-      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[t+1];
+      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+9] + w[9];
       t2 = Sum0 (h) + Maj (h, a, b);
       c += t1;
       g  = t1 + t2;
 
-      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[t+2];
+      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+10] + w[10];
       t2 = Sum0 (g) + Maj (g, h, a);
       b += t1;
       f  = t1 + t2;
 
-      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[t+3];
+      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+11] + w[11];
       t2 = Sum0 (f) + Maj (f, g, h);
       a += t1;
       e  = t1 + t2;
 
-      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[t+4];
+      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+12] + w[12];
       t2 = Sum0 (e) + Maj (e, f, g);
       h += t1;
       d  = t1 + t2;
 
-      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[t+5];
+      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+13] + w[13];
       t2 = Sum0 (d) + Maj (d, e, f);
       g += t1;
       c  = t1 + t2;
 
-      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[t+6];
+      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+14] + w[14];
       t2 = Sum0 (c) + Maj (c, d, e);
       f += t1;
       b  = t1 + t2;
 
-      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[t+7];
+      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+15] + w[15];
       t2 = Sum0 (b) + Maj (b, c, d);
       e += t1;
       a  = t1 + t2;
 
-      t += 8;
+      t += 16;
 #endif
     }
 
@@ -312,7 +467,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
   if (hd->count == 128)
     {				/* flush the buffer */
       transform (hd, hd->buf);
-      _gcry_burn_stack (768);
+      _gcry_burn_stack (256);
       hd->count = 0;
       hd->nblocks++;
     }
@@ -335,7 +490,7 @@ sha512_write (void *context, const void *inbuf_arg, size_t inlen)
       inlen -= 128;
       inbuf += 128;
     }
-  _gcry_burn_stack (768);
+  _gcry_burn_stack (256);
   for (; inlen && hd->count < 128; inlen--)
     hd->buf[hd->count++] = *inbuf++;
 }
@@ -405,7 +560,7 @@ sha512_final (void *context)
   hd->buf[126] = lsb >> 8;
   hd->buf[127] = lsb;
   transform (hd, hd->buf);
-  _gcry_burn_stack (768);
+  _gcry_burn_stack (256);
 
   p = hd->buf;
 #ifdef WORDS_BIGENDIAN