[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-65-gfaec12e

by Jussi Kivilinna cvs at cvs.gnupg.org
Mon Nov 26 11:50:44 CET 2012


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  faec12e23f03c7cd1614594bfdd51f1302cadb42 (commit)
       via  fc37e805c6394c2e635d1a033670be961f36a6d2 (commit)
       via  35aff0cd43885b5f5c076432ec614698abeb63d8 (commit)
       via  5acd0e5ae2a58dda51c2b56c879b80a1a6d2c42f (commit)
       via  be3768994ad362dfc849a8cd0146b4c9bb287d20 (commit)
       via  3369d960158ab4231b83926a0f982e2a8819f173 (commit)
       via  55b96be08531664ed3f4230acebe0f45954bbc33 (commit)
      from  dfb4673da8ee52d95e0a62c9f49ca8599943f22e (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit faec12e23f03c7cd1614594bfdd51f1302cadb42
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:22:35 2012 +0200

    Optimize wipememory2 for i386 and x86-64
    
    * src/g10lib.h (wipememory2): Add call to fast_wipememory2.
    (fast_wipememory2): New macros for i386 and x86-64 architectures.
    Empty macro provided for other architectures.
    --
    
    Optimizing wipememory2 gives a broad range of speed improvements, as seen below.
    
    Cipher speed ratios, old-vs-new (AMD Phenom II, x86-64):
    
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    IDEA            1.32x   1.35x   1.29x   1.25x   1.30x   1.33x   1.33x   1.33x   1.22x   1.22x
    3DES            1.13x   1.10x   1.11x   1.12x   1.13x   1.16x   1.13x   1.13x   1.10x   1.12x
    CAST5           1.57x   1.51x   1.56x   1.43x   1.48x   1.50x   1.49x   1.51x   1.28x   1.27x
    BLOWFISH        1.53x   1.52x   1.56x   1.42x   1.50x   1.51x   1.49x   1.52x   1.27x   1.28x
    AES             1.33x   1.33x   1.00x   1.02x   1.04x   1.02x   1.26x   1.26x   1.00x   0.98x
    AES192          1.33x   1.36x   1.05x   1.00x   1.04x   1.00x   1.28x   1.24x   1.02x   1.00x
    AES256          1.22x   1.33x   0.98x   1.00x   1.03x   1.02x   1.28x   1.25x   1.00x   1.00x
    TWOFISH         1.34x   1.34x   1.44x   1.25x   1.35x   1.28x   1.37x   1.37x   1.14x   1.16x
    ARCFOUR         1.00x   1.00x
    DES             1.31x   1.30x   1.34x   1.25x   1.28x   1.28x   1.34x   1.26x   1.22x   1.24x
    TWOFISH128      1.41x   1.45x   1.46x   1.28x   1.32x   1.37x   1.34x   1.28x   1.16x   1.16x
    SERPENT128      1.16x   1.20x   1.22x   1.16x   1.16x   1.16x   1.18x   1.18x   1.14x   1.11x
    SERPENT192      1.16x   1.20x   1.23x   1.16x   1.19x   1.18x   1.16x   1.16x   1.10x   1.10x
    SERPENT256      1.18x   1.23x   1.23x   1.13x   1.18x   1.16x   1.18x   1.16x   1.11x   1.11x
    RFC2268_40      1.00x   1.00x   1.03x   0.96x   0.98x   1.00x   0.99x   1.00x   0.99x   0.98x
    SEED            1.20x   1.24x   1.25x   1.18x   1.19x   1.18x   1.21x   1.22x   1.14x   1.12x
    CAMELLIA128     1.60x   1.69x   1.56x   1.50x   1.60x   1.53x   1.64x   1.63x   1.29x   1.32x
    CAMELLIA192     1.55x   1.46x   1.44x   1.34x   1.42x   1.50x   1.46x   1.51x   1.26x   1.28x
    CAMELLIA256     1.52x   1.50x   1.47x   1.40x   1.51x   1.44x   1.41x   1.50x   1.28x   1.28x
    
    Cipher speed ratios, old-vs-new (AMD Phenom II, i386):
    
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    IDEA            1.15x   1.11x   1.10x   1.08x   1.09x   1.13x   1.16x   1.07x   1.10x   1.14x
    3DES            1.08x   1.08x   1.08x   1.07x   1.06x   1.06x   1.06x   1.05x   1.05x   1.05x
    CAST5           1.23x   1.25x   1.18x   1.17x   1.25x   1.21x   1.22x   1.17x   1.14x   1.12x
    BLOWFISH        1.25x   1.22x   1.21x   1.11x   1.23x   1.23x   1.24x   1.17x   1.14x   1.14x
    AES             1.13x   1.13x   1.02x   1.02x   0.98x   0.98x   1.16x   1.03x   1.02x   0.98x
    AES192          1.11x   1.12x   1.02x   0.99x   1.02x   0.95x   1.06x   1.00x   0.94x   0.91x
    AES256          1.05x   1.05x   0.97x   1.00x   1.00x   0.99x   1.11x   1.01x   0.99x   1.00x
    TWOFISH         1.11x   1.15x   1.16x   1.13x   1.12x   1.14x   1.13x   1.05x   1.07x   1.08x
    ARCFOUR         1.00x   0.97x
    DES             1.14x   1.14x   1.10x   1.07x   1.11x   1.12x   1.14x   1.08x   1.11x   1.17x
    TWOFISH128      1.16x   1.23x   1.18x   1.15x   1.14x   1.20x   1.15x   1.05x   1.08x   1.08x
    SERPENT128      1.08x   1.08x   1.08x   1.05x   1.06x   1.05x   1.09x   1.04x   1.05x   1.05x
    SERPENT192      1.07x   1.08x   1.08x   1.04x   1.04x   1.06x   1.08x   1.04x   1.01x   1.05x
    SERPENT256      1.06x   1.08x   1.05x   1.04x   1.05x   1.08x   1.07x   1.03x   1.06x   1.06x
    RFC2268_40      1.00x   0.99x   1.02x   1.01x   1.01x   1.00x   1.02x   0.99x   0.98x   0.99x
    SEED            1.12x   1.07x   1.12x   1.07x   1.09x   1.10x   1.10x   1.03x   1.07x   1.05x
    CAMELLIA128     1.24x   1.21x   1.16x   1.17x   1.16x   1.16x   1.21x   1.16x   1.13x   1.12x
    CAMELLIA192     1.19x   1.20x   1.14x   1.19x   1.20x   1.20x   1.18x   1.13x   1.13x   1.15x
    CAMELLIA256     1.21x   1.19x   1.14x   1.17x   1.17x   1.16x   1.17x   1.11x   1.12x   1.14x
    
    Hash speed ratios, old-vs-new (Intel Sandy-Bridge, x86-64):
    
    MD5             1.00x   1.47x   1.07x   1.00x   1.00x
    SHA1            1.06x   1.27x   1.06x   1.00x   1.00x
    RIPEMD160       1.04x   1.32x   1.11x   1.00x   1.00x
    TIGER192        1.05x   1.50x   1.15x   1.03x   1.05x
    SHA256          1.05x   1.38x   1.21x   1.04x   1.03x
    SHA384          1.15x   1.76x   1.25x   1.10x   1.04x
    SHA512          1.15x   1.76x   1.27x   1.08x   1.04x
    SHA224          1.05x   1.38x   1.21x   1.06x   1.00x
    MD4             1.17x   1.55x   1.06x   1.06x   1.00x
    CRC32           1.00x   1.00x   0.99x   1.04x   1.00x
    CRC32RFC1510    0.93x   1.00x   1.01x   1.00x   1.00x
    CRC24RFC2440    1.00x   1.00x   1.00x   0.99x   1.00x
    WHIRLPOOL       1.02x   1.00x   0.99x   1.00x   1.00x
    TIGER           1.05x   1.50x   1.15x   1.09x   1.05x
    TIGER2          1.05x   1.48x   1.16x   1.06x   0.95x
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
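
For illustration, the byte-replication trick used by the new macros can also be
written in portable C. The following is a minimal sketch (a hypothetical helper,
not the libgcrypt API), assuming the target tolerates unaligned word stores, as
x86 does:

    #include <stddef.h>
    #include <stdint.h>

    /* Spread the fill byte across a machine word and store word-sized
       chunks through a volatile pointer; tail bytes fall through to a
       byte-wise loop, just as fast_wipememory2 leaves its tail to
       wipememory2.  */
    static void
    sketch_wipememory (void *ptr, int set, size_t len)
    {
      volatile unsigned char *p = ptr;
      uintptr_t word = (unsigned char)set;

      word *= (uintptr_t)-1 / 0xff;           /* 0x01...01 * set */
      while (len >= sizeof (uintptr_t))
        {
          *(volatile uintptr_t *)p = word;    /* one word per store */
          p += sizeof (uintptr_t);
          len -= sizeof (uintptr_t);
        }
      while (len--)
        *p++ = (unsigned char)set;
    }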

diff --git a/src/g10lib.h b/src/g10lib.h
index c580c08..f1af399 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -238,11 +238,52 @@ void _gcry_burn_stack (int bytes);
 #define wipememory2(_ptr,_set,_len) do { \
               volatile char *_vptr=(volatile char *)(_ptr); \
               size_t _vlen=(_len); \
-              while(_vlen) { *_vptr=(_set); _vptr++; _vlen--; } \
+              unsigned char _vset=(_set); \
+              fast_wipememory2(_vptr,_vset,_vlen); \
+              while(_vlen) { *_vptr=(_vset); _vptr++; _vlen--; } \
                   } while(0)
 #define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
 
 
+/* Optimized fast_wipememory2 for i386 and x86-64 architectures.  May leave
+   tail bytes unhandled, in which case the tail bytes are handled by wipememory2.
+ */
+#if defined(__x86_64__) && __GNUC__ >= 4
+#define fast_wipememory2(_vptr,_vset,_vlen) do { \
+              unsigned long long int _vset8 = _vset; \
+              if (_vlen < 8) \
+                break; \
+              _vset8 *= 0x0101010101010101ULL; \
+              do { \
+                asm volatile("movq %[set], %[ptr]\n\t" \
+                             : /**/ \
+                             : [set] "Cr" (_vset8), \
+                               [ptr] "m" (*_vptr) \
+                             : "memory"); \
+                _vlen -= 8; \
+                _vptr += 8; \
+              } while (_vlen >= 8); \
+                  } while (0)
+#elif defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4
+#define fast_wipememory2(_vptr,_vset,_vlen) do { \
+              unsigned long _vset4 = _vset; \
+              if (_vlen < 4) \
+                break; \
+              _vset4 *= 0x01010101; \
+              do { \
+                asm volatile("movl %[set], %[ptr]\n\t" \
+                             : /**/ \
+                             : [set] "Cr" (_vset4), \
+                               [ptr] "m" (*_vptr) \
+                             : "memory"); \
+                _vlen -= 4; \
+                _vptr += 4; \
+              } while (_vlen >= 4); \
+                  } while (0)
+#else
+#define fast_wipememory2(_vptr,_vset,_vlen)
+#endif
+
 
 /* Digit predicates.  */
 

commit fc37e805c6394c2e635d1a033670be961f36a6d2
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:22:30 2012 +0200

    Fix missing 64-bit carry handling in AES-NI CTR mode
    
    * cipher/rijndael.c [USE_AESNI] (do_aesni_ctr, do_aesni_ctr_4): Add
    carry handling to 64-bit addition.
    (selftest_ctr_128): New function for testing IV handling in bulk CTR
    function.
    (selftest): Add call to selftest_ctr_128.
    --
    
    The carry handling checks whether the lower 64-bit half of the SSE register
    overflowed and, if it did, increments the upper half from that point on.
    Selftests are also added to verify correct operation.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
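
The failure mode: paddq adds only within a 64-bit lane, so a counter whose low
64 bits are all ones wrapped to zero without carrying into the high 64 bits. A
minimal C model of the corrected increment (illustrative helpers, not the patch
itself):

    #include <stdint.h>

    /* A CTR IV is a 16-byte big-endian integer; incrementing the low
       64-bit half must propagate a carry into the high half on wrap.  */
    static uint64_t
    load_be64 (const unsigned char *p)
    {
      uint64_t v = 0;
      int i;
      for (i = 0; i < 8; i++)
        v = (v << 8) | p[i];
      return v;
    }

    static void
    store_be64 (unsigned char *p, uint64_t v)
    {
      int i;
      for (i = 7; i >= 0; i--, v >>= 8)
        p[i] = v & 0xff;
    }

    static void
    ctr_increment (unsigned char ctr[16])
    {
      uint64_t lo = load_be64 (ctr + 8);
      uint64_t hi = load_be64 (ctr);

      if (++lo == 0)    /* low half wrapped: carry into high half */
        hi++;
      store_be64 (ctr + 8, lo);
      store_be64 (ctr, hi);
    }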

diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 34a0f8c..860dcf8 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1011,16 +1011,33 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
   static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
     { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
 
-  asm volatile ("movdqa %[ctr], %%xmm0\n\t"     /* xmm0, xmm2 := CTR   */
+  asm volatile ("movdqa (%[ctr]), %%xmm0\n\t"   /* xmm0, xmm2 := CTR   */
                 "movaps %%xmm0, %%xmm2\n\t"
                 "mov    $1, %%esi\n\t"          /* xmm2++ (big-endian) */
                 "movd   %%esi, %%xmm1\n\t"
+
+                "movl   12(%[ctr]), %%esi\n\t"  /* load lower parts of CTR */
+                "bswapl %%esi\n\t"
+                "movl   8(%[ctr]), %%edi\n\t"
+                "bswapl %%edi\n\t"
+
                 "pshufb %[mask], %%xmm2\n\t"
                 "paddq  %%xmm1, %%xmm2\n\t"
+
+                "addl   $1, %%esi\n\t"
+                "adcl   $0, %%edi\n\t"          /* detect 64bit overflow */
+                "jnc    .Lno_carry%=\n\t"
+
+                /* swap upper and lower halves */
+                "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
+                "paddq   %%xmm1, %%xmm2\n\t"	/* add carry to upper 64bits */
+
+                ".Lno_carry%=:\n\t"
+
                 "pshufb %[mask], %%xmm2\n\t"
-                "movdqa %%xmm2, %[ctr]\n"       /* Update CTR.         */
+                "movdqa %%xmm2, (%[ctr])\n"     /* Update CTR.         */
 
-                "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0]    */
+                "movdqa (%[key]), %%xmm1\n\t"   /* xmm1 := key[0]    */
                 "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
                 "movdqa 0x10(%[key]), %%xmm1\n\t"
                 aesenc_xmm1_xmm0
@@ -1060,12 +1077,13 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
                 "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR ^= input  */
                 "movdqu %%xmm0, %[dst]"          /* Store EncCTR.    */
 
-                : [ctr] "+m" (*ctr), [dst] "=m" (*b)
+                : [dst] "=m" (*b)
                 : [src] "m" (*a),
+                  [ctr] "r" (ctr),
                   [key] "r" (ctx->keyschenc),
                   [rounds] "g" (ctx->rounds),
                   [mask] "m" (*be_mask)
-                : "%esi", "cc", "memory");
+                : "%esi", "%edi", "cc", "memory");
 #undef aesenc_xmm1_xmm0
 #undef aesenclast_xmm1_xmm0
 }
@@ -1098,10 +1116,16 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
       xmm5  temp
    */
 
-  asm volatile ("movdqa %[ctr], %%xmm0\n\t"     /* xmm0, xmm2 := CTR   */
+  asm volatile ("movdqa (%[ctr]), %%xmm0\n\t"   /* xmm0, xmm2 := CTR   */
                 "movaps %%xmm0, %%xmm2\n\t"
                 "mov    $1, %%esi\n\t"          /* xmm1 := 1 */
                 "movd   %%esi, %%xmm1\n\t"
+
+                "movl   12(%[ctr]), %%esi\n\t"  /* load lower parts of CTR */
+                "bswapl %%esi\n\t"
+                "movl   8(%[ctr]), %%edi\n\t"
+                "bswapl %%edi\n\t"
+
                 "pshufb %[mask], %%xmm2\n\t"    /* xmm2 := le(xmm2) */
                 "paddq  %%xmm1, %%xmm2\n\t"     /* xmm2++           */
                 "movaps %%xmm2, %%xmm3\n\t"     /* xmm3 := xmm2     */
@@ -1110,11 +1134,39 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 "paddq  %%xmm1, %%xmm4\n\t"     /* xmm4++           */
                 "movaps %%xmm4, %%xmm5\n\t"     /* xmm5 := xmm4     */
                 "paddq  %%xmm1, %%xmm5\n\t"     /* xmm5++           */
+
+                /* swap upper and lower halves */
+                "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
+
+                "addl   $1, %%esi\n\t"
+                "adcl   $0, %%edi\n\t"          /* detect 64bit overflow */
+                "jc     .Lcarry_xmm2%=\n\t"
+                "addl   $1, %%esi\n\t"
+                "adcl   $0, %%edi\n\t"          /* detect 64bit overflow */
+                "jc     .Lcarry_xmm3%=\n\t"
+                "addl   $1, %%esi\n\t"
+                "adcl   $0, %%edi\n\t"          /* detect 64bit overflow */
+                "jc     .Lcarry_xmm4%=\n\t"
+                "addl   $1, %%esi\n\t"
+                "adcl   $0, %%edi\n\t"          /* detect 64bit overflow */
+                "jc     .Lcarry_xmm5%=\n\t"
+                "jmp    .Lno_carry%=\n\t"
+
+                ".Lcarry_xmm2%=:\n\t"
+                "paddq   %%xmm1, %%xmm2\n\t"
+                ".Lcarry_xmm3%=:\n\t"
+                "paddq   %%xmm1, %%xmm3\n\t"
+                ".Lcarry_xmm4%=:\n\t"
+                "paddq   %%xmm1, %%xmm4\n\t"
+                ".Lcarry_xmm5%=:\n\t"
+                "paddq   %%xmm1, %%xmm5\n\t"
+
+                ".Lno_carry%=:\n\t"
                 "pshufb %[mask], %%xmm2\n\t"    /* xmm2 := be(xmm2) */
                 "pshufb %[mask], %%xmm3\n\t"    /* xmm3 := be(xmm3) */
                 "pshufb %[mask], %%xmm4\n\t"    /* xmm4 := be(xmm4) */
                 "pshufb %[mask], %%xmm5\n\t"    /* xmm5 := be(xmm5) */
-                "movdqa %%xmm5, %[ctr]\n"       /* Update CTR.      */
+                "movdqa %%xmm5, (%[ctr])\n"     /* Update CTR.      */
 
                 "movdqa (%[key]), %%xmm1\n\t"    /* xmm1 := key[0]    */
                 "pxor   %%xmm1, %%xmm0\n\t"     /* xmm0 ^= key[0]    */
@@ -1198,28 +1250,30 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
                 aesenclast_xmm1_xmm3
                 aesenclast_xmm1_xmm4
 
-                "movdqu %[src], %%xmm1\n\t"      /* Get block 1.      */
+                "movdqu (%[src]), %%xmm1\n\t"    /* Get block 1.      */
                 "pxor %%xmm1, %%xmm0\n\t"        /* EncCTR-1 ^= input */
-                "movdqu %%xmm0, %[dst]\n\t"      /* Store block 1     */
+                "movdqu %%xmm0, (%[dst])\n\t"    /* Store block 1     */
 
-                "movdqu (16)%[src], %%xmm1\n\t"  /* Get block 2.      */
+                "movdqu 16(%[src]), %%xmm1\n\t"  /* Get block 2.      */
                 "pxor %%xmm1, %%xmm2\n\t"        /* EncCTR-2 ^= input */
-                "movdqu %%xmm2, (16)%[dst]\n\t"  /* Store block 2.    */
+                "movdqu %%xmm2, 16(%[dst])\n\t"  /* Store block 2.    */
 
-                "movdqu (32)%[src], %%xmm1\n\t"  /* Get block 3.      */
+                "movdqu 32(%[src]), %%xmm1\n\t"  /* Get block 3.      */
                 "pxor %%xmm1, %%xmm3\n\t"        /* EncCTR-3 ^= input */
-                "movdqu %%xmm3, (32)%[dst]\n\t"  /* Store block 3.    */
+                "movdqu %%xmm3, 32(%[dst])\n\t"  /* Store block 3.    */
 
-                "movdqu (48)%[src], %%xmm1\n\t"  /* Get block 4.      */
+                "movdqu 48(%[src]), %%xmm1\n\t"  /* Get block 4.      */
                 "pxor %%xmm1, %%xmm4\n\t"        /* EncCTR-4 ^= input */
-                "movdqu %%xmm4, (48)%[dst]"      /* Store block 4.   */
+                "movdqu %%xmm4, 48(%[dst])"      /* Store block 4.   */
 
-                : [ctr] "+m" (*ctr), [dst] "=m" (*b)
-                : [src] "m" (*a),
+                :
+                : [ctr] "r" (ctr),
+                  [src] "r" (a),
+                  [dst] "r" (b),
                   [key] "r" (ctx->keyschenc),
                   [rounds] "g" (ctx->rounds),
                   [mask] "m" (*be_mask)
-                : "%esi", "cc", "memory");
+                : "%esi", "%edi", "cc", "memory");
 #undef aesenc_xmm1_xmm0
 #undef aesenc_xmm1_xmm2
 #undef aesenc_xmm1_xmm3
@@ -1970,6 +2024,102 @@ selftest_basic_256 (void)
   return NULL;
 }
 
+
+/* Run the self-tests for AES-CTR-128; this tests the IV increment of the
+   bulk CTR encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+  RIJNDAEL_context ctx ATTR_ALIGNED_16;
+  unsigned char plaintext[7*16] ATTR_ALIGNED_16;
+  unsigned char ciphertext[7*16] ATTR_ALIGNED_16;
+  unsigned char plaintext2[7*16] ATTR_ALIGNED_16;
+  unsigned char iv[16] ATTR_ALIGNED_16;
+  unsigned char iv2[16] ATTR_ALIGNED_16;
+  int i, j, diff;
+
+  static const unsigned char key[16] ATTR_ALIGNED_16 = {
+      0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+      0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+    };
+  static char error_str[128];
+
+  rijndael_setkey (&ctx, key, sizeof (key));
+
+  /* Test single block code path */
+  memset(iv, 0xff, sizeof(iv));
+  for (i = 0; i < 16; i++)
+    plaintext[i] = i;
+
+  /* CTR manually.  */
+  rijndael_encrypt (&ctx, ciphertext, iv);
+  for (i = 0; i < 16; i++)
+    ciphertext[i] ^= plaintext[i];
+  for (i = 16; i > 0; i--)
+    {
+      iv[i-1]++;
+      if (iv[i-1])
+        break;
+    }
+
+  memset(iv2, 0xff, sizeof(iv2));
+  _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1);
+
+  if (memcmp(plaintext2, plaintext, 16))
+    return "AES-128-CTR test failed (plaintext mismatch)";
+
+  if (memcmp(iv2, iv, 16))
+    return "AES-128-CTR test failed (IV mismatch)";
+
+  /* Test parallelized code paths */
+  for (diff = 0; diff < 7; diff++) {
+    memset(iv, 0xff, sizeof(iv));
+    iv[15] -= diff;
+
+    for (i = 0; i < sizeof(plaintext); i++)
+      plaintext[i] = i;
+
+    /* Create CTR ciphertext manually.  */
+    for (i = 0; i < sizeof(plaintext); i+=16)
+      {
+        rijndael_encrypt (&ctx, &ciphertext[i], iv);
+        for (j = 0; j < 16; j++)
+          ciphertext[i+j] ^= plaintext[i+j];
+        for (j = 16; j > 0; j--)
+          {
+            iv[j-1]++;
+            if (iv[j-1])
+              break;
+          }
+      }
+
+    /* Decrypt using bulk CTR and compare result.  */
+    memset(iv2, 0xff, sizeof(iv2));
+    iv2[15] -= diff;
+
+    _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext,
+                       sizeof(ciphertext) / BLOCKSIZE);
+
+    if (memcmp(plaintext2, plaintext, sizeof(plaintext)))
+      {
+        snprintf(error_str, sizeof(error_str),
+                 "AES-128-CTR test failed (plaintext mismatch, diff: %d)",
+                 diff);
+        return error_str;
+      }
+    if (memcmp(iv2, iv, sizeof(iv)))
+      {
+        snprintf(error_str, sizeof(error_str),
+                 "AES-128-CTR test failed (IV mismatch, diff: %d)",
+                 diff);
+        return error_str;
+      }
+  }
+
+  return NULL;
+}
+
+
 /* Run all the self-tests and return NULL on success.  This function
    is used for the on-the-fly self-tests. */
 static const char *
@@ -1982,6 +2132,9 @@ selftest (void)
        || (r = selftest_basic_256 ()) )
     return r;
 
+  if ( (r = selftest_ctr_128 ()) )
+    return r;
+
   return r;
 }
 

commit 35aff0cd43885b5f5c076432ec614698abeb63d8
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:22:14 2012 +0200

    Add parallelized AES-NI CBC decryption
    
    * cipher/rijndael.c [USE_AESNI] (aesni_cleanup_5): New macro.
    [USE_AESNI] (do_aesni_dec_vec4): New function.
    (_gcry_aes_cbc_dec) [USE_AESNI]: Add parallelized CBC loop.
    (_gcry_aes_cbc_dec) [USE_AESNI]: Change IV storage register from xmm3
    to xmm5.
    --
    
    This gives a ~60% improvement in CBC decryption speed on Sandy Bridge
    (x86-64).  The overall speed improvement from this and the previous CBC
    patches is over 400%.
    
    Before:
    
    $ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
    Running each test 1000 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    AES            670ms   770ms  2920ms   720ms  1900ms   660ms  2260ms  2250ms   480ms   500ms
    AES192         860ms   930ms  3250ms   870ms  2210ms   830ms  2580ms  2580ms   570ms   570ms
    AES256        1020ms  1080ms  3580ms  1030ms  2550ms   970ms  2880ms  2870ms   660ms   660ms
    
    After:
    
    $ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
    Running each test 1000 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    AES            670ms   770ms  2130ms   450ms  1880ms   670ms  2250ms  2280ms   490ms   490ms
    AES192         880ms   920ms  2460ms   540ms  2210ms   830ms  2580ms  2570ms   580ms   570ms
    AES256        1020ms  1070ms  2800ms   620ms  2560ms   970ms  2880ms  2880ms   660ms   650ms
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
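
CBC decryption parallelizes because each plaintext block depends only on
ciphertext: P[i] = D(C[i]) ^ C[i-1], so the four block decryptions are
independent. A conceptual C sketch of the 4-way loop structure (block_decrypt
is a hypothetical stand-in for the AES-NI round sequence; the real code keeps
everything in xmm registers instead of stack buffers):

    #include <string.h>

    /* Hypothetical single-block decryption, standing in for AES-NI.  */
    void block_decrypt (const void *ctx, unsigned char out[16],
                        const unsigned char in[16]);

    static void
    cbc_dec_4way_sketch (const void *ctx, unsigned char *iv,
                         unsigned char *out, const unsigned char *in,
                         size_t nblocks)
    {
      unsigned char c[4][16], p[4][16];
      size_t i, j;

      for (; nblocks > 3; nblocks -= 4, in += 4 * 16, out += 4 * 16)
        {
          memcpy (c, in, sizeof c);        /* in and out may alias */
          for (i = 0; i < 4; i++)          /* independent: can overlap */
            block_decrypt (ctx, p[i], c[i]);
          for (i = 0; i < 4; i++)
            for (j = 0; j < 16; j++)
              out[i * 16 + j] = p[i][j] ^ (i ? c[i - 1][j] : iv[j]);
          memcpy (iv, c[3], 16);           /* last C block -> next IV */
        }
      /* Remaining blocks are handled one at a time, as in the patch.  */
    }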

diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 69e1df1..34a0f8c 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -822,6 +822,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
 }
 
 
+/* Decrypt four blocks using the Intel AES-NI instructions.  Blocks are input
+ * and output through SSE registers xmm1 to xmm4.  */
+static void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+  asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+                "pxor   %%xmm0, %%xmm1\n\t"     /* xmm1 ^= key[0] */
+                "pxor   %%xmm0, %%xmm2\n\t"     /* xmm2 ^= key[0] */
+                "pxor   %%xmm0, %%xmm3\n\t"     /* xmm3 ^= key[0] */
+                "pxor   %%xmm0, %%xmm4\n\t"     /* xmm4 ^= key[0] */
+                "movdqa 0x10(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x20(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x30(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x40(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x50(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x60(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x70(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x80(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0x90(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xa0(%[key]), %%xmm0\n\t"
+                "cmp $10, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xb0(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xc0(%[key]), %%xmm0\n\t"
+                "cmp $12, %[rounds]\n\t"
+                "jz .Ldeclast%=\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xd0(%[key]), %%xmm0\n\t"
+                aesdec_xmm0_xmm1
+                aesdec_xmm0_xmm2
+                aesdec_xmm0_xmm3
+                aesdec_xmm0_xmm4
+                "movdqa 0xe0(%[key]), %%xmm0\n"
+
+                ".Ldeclast%=:\n\t"
+                aesdeclast_xmm0_xmm1
+                aesdeclast_xmm0_xmm2
+                aesdeclast_xmm0_xmm3
+                aesdeclast_xmm0_xmm4
+                : /* no output */
+                : [key] "r" (ctx->keyschdec),
+                  [rounds] "r" (ctx->rounds)
+                : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
 /* Perform a CFB encryption or decryption round using the
    initialization vector IV and the input block A.  Write the result
    to the output block B and update IV.  IV needs to be 16 byte
@@ -1623,17 +1732,51 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
           ctx->decryption_prepared = 1;
         }
 
-      /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
-         storage, out-of-order CPUs see parallelism even over loop iterations
-         and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
-         improvements are possible with do_aesni_cbc_dec_4() when implemented.
-       */
       asm volatile
-        ("movdqu %[iv], %%xmm3\n\t"	/* use xmm3 as fast IV storage */
+        ("movdqu %[iv], %%xmm5\n\t"	/* use xmm5 as fast IV storage */
          : /* No output */
          : [iv] "m" (*iv)
          : "memory");
 
+      for ( ;nblocks > 3 ; nblocks -= 4 )
+        {
+          asm volatile
+            ("movdqu 0*16(%[inbuf]), %%xmm1\n\t"	/* load input blocks */
+             "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+             "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+             "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+             : /* No output */
+             : [inbuf] "r" (inbuf)
+             : "memory");
+
+          do_aesni_dec_vec4 (ctx);
+
+          asm volatile
+            ("pxor %%xmm5, %%xmm1\n\t"			/* xor IV with output */
+             "movdqu 0*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+             "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+             "pxor %%xmm5, %%xmm2\n\t"			/* xor IV with output */
+             "movdqu 1*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+             "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+             "pxor %%xmm5, %%xmm3\n\t"			/* xor IV with output */
+             "movdqu 2*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+             "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+             "pxor %%xmm5, %%xmm4\n\t"			/* xor IV with output */
+             "movdqu 3*16(%[inbuf]), %%xmm5\n\t"	/* load new IV */
+             "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+             : /* No output */
+             : [inbuf] "r" (inbuf),
+               [outbuf] "r" (outbuf)
+             : "memory");
+
+          outbuf += 4*BLOCKSIZE;
+          inbuf  += 4*BLOCKSIZE;
+        }
+
       for ( ;nblocks; nblocks-- )
         {
           asm volatile
@@ -1647,9 +1790,9 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
 
           asm volatile
             ("movdqu %[outbuf], %%xmm0\n\t"
-             "pxor %%xmm3, %%xmm0\n\t"		/* xor IV with output */
+             "pxor %%xmm5, %%xmm0\n\t"		/* xor IV with output */
              "movdqu %%xmm0, %[outbuf]\n\t"
-             "movdqu %%xmm2, %%xmm3\n\t"	/* store savebuf as new IV */
+             "movdqu %%xmm2, %%xmm5\n\t"	/* store savebuf as new IV */
              : /* No output */
              : [outbuf] "m" (*outbuf)
              : "memory");
@@ -1659,7 +1802,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
         }
 
       asm volatile
-        ("movdqu %%xmm3, %[iv]\n\t"	/* store IV */
+        ("movdqu %%xmm5, %[iv]\n\t"	/* store IV */
          : /* No output */
          : [iv] "m" (*iv)
          : "memory");

commit 5acd0e5ae2a58dda51c2b56c879b80a1a6d2c42f
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:22:09 2012 +0200

    Clear xmm5 after use in AES-NI CTR mode
    
    * cipher/rijndael.c [USE_AESNI]: Rename aesni_cleanup_2_4 to
    aesni_cleanup_2_5.
    [USE_AESNI] (aesni_cleanup_2_5): Clear xmm5 register.
    (_gcry_aes_ctr_enc, _gcry_aes_cbc_dec) [USE_AESNI]: Use
    aesni_cleanup_2_5 instead of aesni_cleanup_2_4.
    --
    
    The xmm5 register is used by the parallelized AES-NI CTR mode, so it needs
    to be cleared after use as well.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>

diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 982c54e..69e1df1 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -144,10 +144,11 @@ typedef struct
   do { asm volatile ("pxor %%xmm0, %%xmm0\n\t"                          \
                      "pxor %%xmm1, %%xmm1\n" :: );                      \
   } while (0)
-# define aesni_cleanup_2_4()                                            \
+# define aesni_cleanup_2_5()                                            \
   do { asm volatile ("pxor %%xmm2, %%xmm2\n\t"                          \
                      "pxor %%xmm3, %%xmm3\n"                            \
-                     "pxor %%xmm4, %%xmm4\n":: );                       \
+                     "pxor %%xmm4, %%xmm4\n"                            \
+                     "pxor %%xmm5, %%xmm5\n":: );                       \
   } while (0)
 #else
 # define aesni_prepare() do { } while (0)
@@ -1338,7 +1339,7 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
           inbuf  += BLOCKSIZE;
         }
       aesni_cleanup ();
-      aesni_cleanup_2_4 ();
+      aesni_cleanup_2_5 ();
     }
 #endif /*USE_AESNI*/
   else
@@ -1664,7 +1665,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
          : "memory");
 
       aesni_cleanup ();
-      aesni_cleanup_2_4 ();
+      aesni_cleanup_2_5 ();
     }
 #endif /*USE_AESNI*/
   else

commit be3768994ad362dfc849a8cd0146b4c9bb287d20
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:22:04 2012 +0200

    Optimize AES-NI CBC encryption
    
    * cipher/rijndael.c (_gcry_aes_cbc_enc) [USE_AESNI]: Add an AES-NI
    specific loop and use SSE2 assembly for XORing and copying of
    blocks.
    --
    
    This gives a ~35% improvement in 'tests/benchmark cipher aes' on a
    Sandy Bridge CPU (x86-64).
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
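
The XOR-and-copy step added below can also be rendered with SSE2 intrinsics. A
rough equivalent (aes_encrypt_block is a hypothetical stand-in for the AES-NI
block encryption; the patch itself uses inline assembly):

    #include <emmintrin.h>    /* SSE2 */
    #include <string.h>

    /* Hypothetical single-block encryption, standing in for AES-NI.  */
    void aes_encrypt_block (void *ctx, unsigned char *dst,
                            const unsigned char *src);

    static void
    cbc_enc_block_sketch (void *ctx, unsigned char *iv,
                          unsigned char *out, const unsigned char *in)
    {
      __m128i v = _mm_xor_si128 (_mm_loadu_si128 ((const __m128i *) iv),
                                 _mm_loadu_si128 ((const __m128i *) in));

      _mm_storeu_si128 ((__m128i *) out, v);  /* out = in ^ iv */
      aes_encrypt_block (ctx, out, out);      /* encrypt in place */
      memcpy (iv, out, 16);                   /* ciphertext becomes IV */
    }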

diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 104f869..982c54e 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1249,23 +1249,50 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
   aesni_prepare ();
   for ( ;nblocks; nblocks-- )
     {
-      for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
-        outbuf[i] = inbuf[i] ^ *ivp++;
-
       if (0)
         ;
-#ifdef USE_PADLOCK
-      else if (ctx->use_padlock)
-        do_padlock (ctx, 0, outbuf, outbuf);
-#endif /*USE_PADLOCK*/
 #ifdef USE_AESNI
       else if (ctx->use_aesni)
-        do_aesni (ctx, 0, outbuf, outbuf);
+        {
+          /* ~35% speed-up on Sandy Bridge when doing the XORing and copying
+             with SSE registers.  */
+          asm volatile ("movdqu %[iv], %%xmm0\n\t"
+                        "movdqu %[inbuf], %%xmm1\n\t"
+                        "pxor %%xmm0, %%xmm1\n\t"
+                        "movdqu %%xmm1, %[outbuf]\n\t"
+                        : /* No output */
+                        : [iv] "m" (*iv),
+                          [inbuf] "m" (*inbuf),
+                          [outbuf] "m" (*outbuf)
+                        : "memory" );
+
+          do_aesni (ctx, 0, outbuf, outbuf);
+
+          asm volatile ("movdqu %[outbuf], %%xmm0\n\t"
+                        "movdqu %%xmm0, %[iv]\n\t"
+                        : /* No output */
+                        : [outbuf] "m" (*outbuf),
+                          [iv] "m" (*iv)
+                        : "memory" );
+        }
 #endif /*USE_AESNI*/
       else
-        do_encrypt (ctx, outbuf, outbuf );
+        {
+          for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+            outbuf[i] = inbuf[i] ^ *ivp++;
+
+          if (0)
+            ;
+#ifdef USE_PADLOCK
+          else if (ctx->use_padlock)
+            do_padlock (ctx, 0, outbuf, outbuf);
+#endif /*USE_PADLOCK*/
+          else
+            do_encrypt (ctx, outbuf, outbuf );
+
+          memcpy (iv, outbuf, BLOCKSIZE);
+        }
 
-      memcpy (iv, outbuf, BLOCKSIZE);
       inbuf += BLOCKSIZE;
       if (!cbc_mac)
         outbuf += BLOCKSIZE;

commit 3369d960158ab4231b83926a0f982e2a8819f173
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:21:59 2012 +0200

    Improve parallelizability of CBC decryption for AES-NI
    
    * cipher/rijndael.c (_gcry_aes_cbc_dec) [USE_AESNI]: Add AES-NI
    specific CBC mode loop with temporary block and IV stored in free SSE
    registers.
    --
    
    Benchmark results on an Intel Core i5-2450M (x86-64) show a ~2.5x improvement:
    
    Before:
    
    $ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
    Running each test 1000 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    AES            690ms   780ms  2940ms  2110ms  1880ms   670ms  2250ms  2250ms   490ms   500ms
    AES192         890ms   930ms  3260ms  2390ms  2220ms   820ms  2580ms  2590ms   560ms   570ms
    AES256        1040ms  1070ms  3590ms  2640ms  2540ms   970ms  2880ms  2890ms   650ms   650ms
    
    After:
    
    $ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
    Running each test 1000 times.
                    ECB/Stream         CBC             CFB             OFB             CTR
                 --------------- --------------- --------------- --------------- ---------------
    AES            670ms   770ms  2920ms   720ms  1900ms   660ms  2260ms  2250ms   480ms   500ms
    AES192         860ms   930ms  3250ms   870ms  2210ms   830ms  2580ms  2580ms   570ms   570ms
    AES256        1020ms  1080ms  3580ms  1030ms  2550ms   970ms  2880ms  2870ms   660ms   660ms
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
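
Why this helps: each iteration needs C[i] both as the decryption input and as
the next IV, and in may alias out, so the old code saved the block with a
memcpy to the stack. Keeping the IV and the saved block in SSE registers
removes that copy. An intrinsics rendering of the new loop (aes_decrypt_block
is a hypothetical stand-in for the AES-NI block decryption):

    #include <stddef.h>
    #include <emmintrin.h>

    /* Hypothetical single-block decryption, standing in for AES-NI.  */
    __m128i aes_decrypt_block (const void *ctx, __m128i c);

    static void
    cbc_dec_sketch (const void *ctx, unsigned char *iv,
                    unsigned char *out, const unsigned char *in,
                    size_t nblocks)
    {
      __m128i ivreg = _mm_loadu_si128 ((const __m128i *) iv);  /* xmm3 */

      for (; nblocks; nblocks--, in += 16, out += 16)
        {
          __m128i c = _mm_loadu_si128 ((const __m128i *) in);  /* xmm2 */
          __m128i p = _mm_xor_si128 (aes_decrypt_block (ctx, c), ivreg);

          _mm_storeu_si128 ((__m128i *) out, p);
          ivreg = c;                     /* C[i] becomes the next IV */
        }
      _mm_storeu_si128 ((__m128i *) iv, ivreg);
    }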

diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index d081b42..104f869 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1582,33 +1582,86 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
   int i;
   unsigned char savebuf[BLOCKSIZE];
 
-  aesni_prepare ();
-  for ( ;nblocks; nblocks-- )
+  if (0)
+    ;
+#ifdef USE_AESNI
+  else if (ctx->use_aesni)
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy (savebuf, inbuf, BLOCKSIZE);
+      aesni_prepare ();
 
-      if (0)
-        ;
+      if (!ctx->decryption_prepared )
+        {
+          prepare_decryption ( ctx );
+          ctx->decryption_prepared = 1;
+        }
+
+      /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
+         storage, out-of-order CPUs see parallelism even over loop iterations
+         and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
+         improvements are possible with do_aesni_cbc_dec_4() when implemented.
+       */
+      asm volatile
+        ("movdqu %[iv], %%xmm3\n\t"	/* use xmm3 as fast IV storage */
+         : /* No output */
+         : [iv] "m" (*iv)
+         : "memory");
+
+      for ( ;nblocks; nblocks-- )
+        {
+          asm volatile
+            ("movdqu %[inbuf], %%xmm2\n\t"	/* use xmm2 as savebuf */
+             : /* No output */
+             : [inbuf] "m" (*inbuf)
+             : "memory");
+
+          /* uses only xmm0 and xmm1 */
+          do_aesni_dec_aligned (ctx, outbuf, inbuf);
+
+          asm volatile
+            ("movdqu %[outbuf], %%xmm0\n\t"
+             "pxor %%xmm3, %%xmm0\n\t"		/* xor IV with output */
+             "movdqu %%xmm0, %[outbuf]\n\t"
+             "movdqu %%xmm2, %%xmm3\n\t"	/* store savebuf as new IV */
+             : /* No output */
+             : [outbuf] "m" (*outbuf)
+             : "memory");
+
+          outbuf += BLOCKSIZE;
+          inbuf  += BLOCKSIZE;
+        }
+
+      asm volatile
+        ("movdqu %%xmm3, %[iv]\n\t"	/* store IV */
+         : /* No output */
+         : [iv] "m" (*iv)
+         : "memory");
+
+      aesni_cleanup ();
+      aesni_cleanup_2_4 ();
+    }
+#endif /*USE_AESNI*/
+  else
+    for ( ;nblocks; nblocks-- )
+      {
+        /* We need to save INBUF away because it may be identical to
+           OUTBUF.  */
+        memcpy (savebuf, inbuf, BLOCKSIZE);
+
+        if (0)
+          ;
 #ifdef USE_PADLOCK
-      else if (ctx->use_padlock)
-        do_padlock (ctx, 1, outbuf, inbuf);
+        else if (ctx->use_padlock)
+          do_padlock (ctx, 1, outbuf, inbuf);
 #endif /*USE_PADLOCK*/
-#ifdef USE_AESNI
-      else if (ctx->use_aesni)
-        do_aesni (ctx, 1, outbuf, inbuf);
-#endif /*USE_AESNI*/
-      else
-        do_decrypt (ctx, outbuf, inbuf);
+        else
+          do_decrypt (ctx, outbuf, inbuf);
 
-      for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
-        outbuf[i] ^= *ivp++;
-      memcpy (iv, savebuf, BLOCKSIZE);
-      inbuf += BLOCKSIZE;
-      outbuf += BLOCKSIZE;
-    }
-  aesni_cleanup ();
+        for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+          outbuf[i] ^= *ivp++;
+        memcpy (iv, savebuf, BLOCKSIZE);
+        inbuf += BLOCKSIZE;
+        outbuf += BLOCKSIZE;
+      }
 
   _gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*));
 }

commit 55b96be08531664ed3f4230acebe0f45954bbc33
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date:   Fri Nov 23 19:21:54 2012 +0200

    Extend test of chained modes for 128-bit ciphers
    
    * tests/basic.c (check_one_cipher_core, check_one_cipher): Increase
    input and output buffer sizes from 16 bytes to 1024+16=1040 bytes.
    (check_one_cipher_core): Add asserts to verify sizes of temporary
    buffers.
    --
    
    Currently check_one_cipher() uses a buffer size of 16 bytes, which is a
    single block for a 128-bit cipher.  As a result, the chained modes of
    128-bit ciphers are not well tested.  Increase the buffer size to 1040
    bytes so that iterations of the chained modes and the parallelized code
    paths (AES-NI CTR, etc.) are tested as well.  The extra 16 bytes after the
    first 1024 (1040 bytes is 65 blocks, so a 4-block parallel loop consumes
    64 of them and leaves one over) ensure that the transition from the
    parallelized code paths to the serialized code paths is tested too.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>

diff --git a/tests/basic.c b/tests/basic.c
index 8001e86..656d76c 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -1367,13 +1367,15 @@ check_one_cipher_core (int algo, int mode, int flags,
                        int bufshift, int pass)
 {
   gcry_cipher_hd_t hd;
-  unsigned char in_buffer[17], out_buffer[17];
+  unsigned char in_buffer[1040+1], out_buffer[1040+1];
   unsigned char *in, *out;
   int keylen;
   gcry_error_t err = 0;
 
   assert (nkey == 32);
-  assert (nplain == 16);
+  assert (nplain == 1040);
+  assert (sizeof(in_buffer) == nplain + 1);
+  assert (sizeof(out_buffer) == sizeof(in_buffer));
 
   if (!bufshift)
     {
@@ -1427,7 +1429,7 @@ check_one_cipher_core (int algo, int mode, int flags,
       return -1;
     }
 
-  err = gcry_cipher_encrypt (hd, out, 16, plain, 16);
+  err = gcry_cipher_encrypt (hd, out, nplain, plain, nplain);
   if (err)
     {
       fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
@@ -1438,7 +1440,7 @@ check_one_cipher_core (int algo, int mode, int flags,
 
   gcry_cipher_reset (hd);
 
-  err = gcry_cipher_decrypt (hd, in, 16, out, 16);
+  err = gcry_cipher_decrypt (hd, in, nplain, out, nplain);
   if (err)
     {
       fail ("pass %d, algo %d, mode %d, gcry_cipher_decrypt failed: %s\n",
@@ -1447,15 +1449,15 @@ check_one_cipher_core (int algo, int mode, int flags,
       return -1;
     }
 
-  if (memcmp (plain, in, 16))
+  if (memcmp (plain, in, nplain))
     fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n",
           pass, algo, mode);
 
   /* Again, using in-place encryption.  */
   gcry_cipher_reset (hd);
 
-  memcpy (out, plain, 16);
-  err = gcry_cipher_encrypt (hd, out, 16, NULL, 0);
+  memcpy (out, plain, nplain);
+  err = gcry_cipher_encrypt (hd, out, nplain, NULL, 0);
   if (err)
     {
       fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_encrypt failed:"
@@ -1467,7 +1469,7 @@ check_one_cipher_core (int algo, int mode, int flags,
 
   gcry_cipher_reset (hd);
 
-  err = gcry_cipher_decrypt (hd, out, 16, NULL, 0);
+  err = gcry_cipher_decrypt (hd, out, nplain, NULL, 0);
   if (err)
     {
       fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_decrypt failed:"
@@ -1477,7 +1479,7 @@ check_one_cipher_core (int algo, int mode, int flags,
       return -1;
     }
 
-  if (memcmp (plain, out, 16))
+  if (memcmp (plain, out, nplain))
     fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n",
           pass, algo, mode);
 
@@ -1492,34 +1494,43 @@ check_one_cipher_core (int algo, int mode, int flags,
 static void
 check_one_cipher (int algo, int mode, int flags)
 {
-  char key[33];
-  unsigned char plain[17];
-  int bufshift;
+  char key[32+1];
+  unsigned char plain[1040+1];
+  int bufshift, i;
 
   for (bufshift=0; bufshift < 4; bufshift++)
     {
       /* Pass 0: Standard test.  */
       memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF", 32);
       memcpy (plain, "foobar42FOOBAR17", 16);
-      if (check_one_cipher_core (algo, mode, flags, key, 32, plain, 16,
+      for (i = 16; i < 1040; i += 16)
+        {
+          memcpy (&plain[i], &plain[i-16], 16);
+          if (!++plain[i+7])
+            plain[i+6]++;
+          if (!++plain[i+15])
+            plain[i+14]++;
+        }
+
+      if (check_one_cipher_core (algo, mode, flags, key, 32, plain, 1040,
                                  bufshift, 0+10*bufshift))
         return;
 
       /* Pass 1: Key not aligned.  */
       memmove (key+1, key, 32);
-      if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain, 16,
+      if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain, 1040,
                                  bufshift, 1+10*bufshift))
         return;
 
       /* Pass 2: Key not aligned and data not aligned.  */
-      memmove (plain+1, plain, 16);
-      if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain+1, 16,
+      memmove (plain+1, plain, 1024);
+      if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain+1, 1040,
                                  bufshift, 2+10*bufshift))
         return;
 
       /* Pass 3: Key aligned and data not aligned.  */
       memmove (key, key+1, 32);
-      if (check_one_cipher_core (algo, mode, flags, key, 32, plain+1, 16,
+      if (check_one_cipher_core (algo, mode, flags, key, 32, plain+1, 1040,
                                  bufshift, 3+10*bufshift))
         return;
     }

-----------------------------------------------------------------------

Summary of changes:
 cipher/rijndael.c |  483 +++++++++++++++++++++++++++++++++++++++++++++++------
 src/g10lib.h      |   43 +++++-
 tests/basic.c     |   45 +++--
 3 files changed, 500 insertions(+), 71 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org



