[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-65-gfaec12e
by Jussi Kivilinna
cvs at cvs.gnupg.org
Mon Nov 26 11:50:44 CET 2012
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".
The branch, master has been updated
via faec12e23f03c7cd1614594bfdd51f1302cadb42 (commit)
via fc37e805c6394c2e635d1a033670be961f36a6d2 (commit)
via 35aff0cd43885b5f5c076432ec614698abeb63d8 (commit)
via 5acd0e5ae2a58dda51c2b56c879b80a1a6d2c42f (commit)
via be3768994ad362dfc849a8cd0146b4c9bb287d20 (commit)
via 3369d960158ab4231b83926a0f982e2a8819f173 (commit)
via 55b96be08531664ed3f4230acebe0f45954bbc33 (commit)
from dfb4673da8ee52d95e0a62c9f49ca8599943f22e (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit faec12e23f03c7cd1614594bfdd51f1302cadb42
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:22:35 2012 +0200
Optimize wipememory2 for i386 and x86-64
* src/g10lib.h (wipememory2): Add call to fast_wipememory2.
(fast_wipememory2): New macros for i386 and x86-64 architectures.
Empty macro provided for other architectures.
--
Optimizing wipememory2 gives a broad range of speed improvements, as seen below.
Cipher speed ratios, old-vs-new (AMD Phenom II, x86-64):
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
IDEA 1.32x 1.35x 1.29x 1.25x 1.30x 1.33x 1.33x 1.33x 1.22x 1.22x
3DES 1.13x 1.10x 1.11x 1.12x 1.13x 1.16x 1.13x 1.13x 1.10x 1.12x
CAST5 1.57x 1.51x 1.56x 1.43x 1.48x 1.50x 1.49x 1.51x 1.28x 1.27x
BLOWFISH 1.53x 1.52x 1.56x 1.42x 1.50x 1.51x 1.49x 1.52x 1.27x 1.28x
AES 1.33x 1.33x 1.00x 1.02x 1.04x 1.02x 1.26x 1.26x 1.00x 0.98x
AES192 1.33x 1.36x 1.05x 1.00x 1.04x 1.00x 1.28x 1.24x 1.02x 1.00x
AES256 1.22x 1.33x 0.98x 1.00x 1.03x 1.02x 1.28x 1.25x 1.00x 1.00x
TWOFISH 1.34x 1.34x 1.44x 1.25x 1.35x 1.28x 1.37x 1.37x 1.14x 1.16x
ARCFOUR 1.00x 1.00x
DES 1.31x 1.30x 1.34x 1.25x 1.28x 1.28x 1.34x 1.26x 1.22x 1.24x
TWOFISH128 1.41x 1.45x 1.46x 1.28x 1.32x 1.37x 1.34x 1.28x 1.16x 1.16x
SERPENT128 1.16x 1.20x 1.22x 1.16x 1.16x 1.16x 1.18x 1.18x 1.14x 1.11x
SERPENT192 1.16x 1.20x 1.23x 1.16x 1.19x 1.18x 1.16x 1.16x 1.10x 1.10x
SERPENT256 1.18x 1.23x 1.23x 1.13x 1.18x 1.16x 1.18x 1.16x 1.11x 1.11x
RFC2268_40 1.00x 1.00x 1.03x 0.96x 0.98x 1.00x 0.99x 1.00x 0.99x 0.98x
SEED 1.20x 1.24x 1.25x 1.18x 1.19x 1.18x 1.21x 1.22x 1.14x 1.12x
CAMELLIA128 1.60x 1.69x 1.56x 1.50x 1.60x 1.53x 1.64x 1.63x 1.29x 1.32x
CAMELLIA192 1.55x 1.46x 1.44x 1.34x 1.42x 1.50x 1.46x 1.51x 1.26x 1.28x
CAMELLIA256 1.52x 1.50x 1.47x 1.40x 1.51x 1.44x 1.41x 1.50x 1.28x 1.28x
Cipher speed ratios, old-vs-new (AMD Phenom II, i386):
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
IDEA 1.15x 1.11x 1.10x 1.08x 1.09x 1.13x 1.16x 1.07x 1.10x 1.14x
3DES 1.08x 1.08x 1.08x 1.07x 1.06x 1.06x 1.06x 1.05x 1.05x 1.05x
CAST5 1.23x 1.25x 1.18x 1.17x 1.25x 1.21x 1.22x 1.17x 1.14x 1.12x
BLOWFISH 1.25x 1.22x 1.21x 1.11x 1.23x 1.23x 1.24x 1.17x 1.14x 1.14x
AES 1.13x 1.13x 1.02x 1.02x 0.98x 0.98x 1.16x 1.03x 1.02x 0.98x
AES192 1.11x 1.12x 1.02x 0.99x 1.02x 0.95x 1.06x 1.00x 0.94x 0.91x
AES256 1.05x 1.05x 0.97x 1.00x 1.00x 0.99x 1.11x 1.01x 0.99x 1.00x
TWOFISH 1.11x 1.15x 1.16x 1.13x 1.12x 1.14x 1.13x 1.05x 1.07x 1.08x
ARCFOUR 1.00x 0.97x
DES 1.14x 1.14x 1.10x 1.07x 1.11x 1.12x 1.14x 1.08x 1.11x 1.17x
TWOFISH128 1.16x 1.23x 1.18x 1.15x 1.14x 1.20x 1.15x 1.05x 1.08x 1.08x
SERPENT128 1.08x 1.08x 1.08x 1.05x 1.06x 1.05x 1.09x 1.04x 1.05x 1.05x
SERPENT192 1.07x 1.08x 1.08x 1.04x 1.04x 1.06x 1.08x 1.04x 1.01x 1.05x
SERPENT256 1.06x 1.08x 1.05x 1.04x 1.05x 1.08x 1.07x 1.03x 1.06x 1.06x
RFC2268_40 1.00x 0.99x 1.02x 1.01x 1.01x 1.00x 1.02x 0.99x 0.98x 0.99x
SEED 1.12x 1.07x 1.12x 1.07x 1.09x 1.10x 1.10x 1.03x 1.07x 1.05x
CAMELLIA128 1.24x 1.21x 1.16x 1.17x 1.16x 1.16x 1.21x 1.16x 1.13x 1.12x
CAMELLIA192 1.19x 1.20x 1.14x 1.19x 1.20x 1.20x 1.18x 1.13x 1.13x 1.15x
CAMELLIA256 1.21x 1.19x 1.14x 1.17x 1.17x 1.16x 1.17x 1.11x 1.12x 1.14x
Hash speed ratios, old-vs-new (Intel Sandy-Bridge, x86-64):
MD5 1.00x 1.47x 1.07x 1.00x 1.00x
SHA1 1.06x 1.27x 1.06x 1.00x 1.00x
RIPEMD160 1.04x 1.32x 1.11x 1.00x 1.00x
TIGER192 1.05x 1.50x 1.15x 1.03x 1.05x
SHA256 1.05x 1.38x 1.21x 1.04x 1.03x
SHA384 1.15x 1.76x 1.25x 1.10x 1.04x
SHA512 1.15x 1.76x 1.27x 1.08x 1.04x
SHA224 1.05x 1.38x 1.21x 1.06x 1.00x
MD4 1.17x 1.55x 1.06x 1.06x 1.00x
CRC32 1.00x 1.00x 0.99x 1.04x 1.00x
CRC32RFC1510 0.93x 1.00x 1.01x 1.00x 1.00x
CRC24RFC2440 1.00x 1.00x 1.00x 0.99x 1.00x
WHIRLPOOL 1.02x 1.00x 0.99x 1.00x 1.00x
TIGER 1.05x 1.50x 1.15x 1.09x 1.05x
TIGER2 1.05x 1.48x 1.16x 1.06x 0.95x
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/src/g10lib.h b/src/g10lib.h
index c580c08..f1af399 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -238,11 +238,52 @@ void _gcry_burn_stack (int bytes);
#define wipememory2(_ptr,_set,_len) do { \
volatile char *_vptr=(volatile char *)(_ptr); \
size_t _vlen=(_len); \
- while(_vlen) { *_vptr=(_set); _vptr++; _vlen--; } \
+ unsigned char _vset=(_set); \
+ fast_wipememory2(_vptr,_vset,_vlen); \
+ while(_vlen) { *_vptr=(_vset); _vptr++; _vlen--; } \
} while(0)
#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
+/* Optimized fast_wipememory2 for i386 and x86-64 architectures. May leave
+ tail bytes unhandled, in which case tail bytes are handled by wipememory2.
+ */
+#if defined(__x86_64__) && __GNUC__ >= 4
+#define fast_wipememory2(_vptr,_vset,_vlen) do { \
+ unsigned long long int _vset8 = _vset; \
+ if (_vlen < 8) \
+ break; \
+ _vset8 *= 0x0101010101010101ULL; \
+ do { \
+ asm volatile("movq %[set], %[ptr]\n\t" \
+ : /**/ \
+ : [set] "Cr" (_vset8), \
+ [ptr] "m" (*_vptr) \
+ : "memory"); \
+ _vlen -= 8; \
+ _vptr += 8; \
+ } while (_vlen >= 8); \
+ } while (0)
+#elif defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4
+#define fast_wipememory2(_ptr,_set,_len) do { \
+ unsigned long _vset4 = _vset; \
+ if (_vlen < 4) \
+ break; \
+ _vset4 *= 0x01010101; \
+ do { \
+ asm volatile("movl %[set], %[ptr]\n\t" \
+ : /**/ \
+ : [set] "Cr" (_vset4), \
+ [ptr] "m" (*_vptr) \
+ : "memory"); \
+ _vlen -= 4; \
+ _vptr += 4; \
+ } while (_vlen >= 4); \
+ } while (0)
+#else
+#define fast_wipememory2(_ptr,_set,_len)
+#endif
+
/* Digit predicates. */
commit fc37e805c6394c2e635d1a033670be961f36a6d2
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:22:30 2012 +0200
Fix missing 64bit carry handling in AES-NI CTR mode
* cipher/rijndael.c [USE_AESNI] (do_aesni_ctr, do_aesni_ctr_4): Add
carry handling to 64-bit addition.
(selftest_ctr_128): New function for testing IV handling in bulk CTR
function.
(selftest): Add call to selftest_ctr_128.
--
Carry handling checks whether the lower 64-bit part of the SSE register overflowed and,
if it did, increments the upper parts from that point on. Also add selftests to verify
correct operation.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 34a0f8c..860dcf8 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1011,16 +1011,33 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
static unsigned char be_mask[16] __attribute__ ((aligned (16))) =
{ 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
- asm volatile ("movdqa %[ctr], %%xmm0\n\t" /* xmm0, xmm2 := CTR */
+ asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
"mov $1, %%esi\n\t" /* xmm2++ (big-endian) */
"movd %%esi, %%xmm1\n\t"
+
+ "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
+ "bswapl %%esi\n\t"
+ "movl 8(%[ctr]), %%edi\n\t"
+ "bswapl %%edi\n\t"
+
"pshufb %[mask], %%xmm2\n\t"
"paddq %%xmm1, %%xmm2\n\t"
+
+ "addl $1, %%esi\n\t"
+ "adcl $0, %%edi\n\t" /* detect 64bit overflow */
+ "jnc .Lno_carry%=\n\t"
+
+ /* swap upper and lower halves */
+ "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
+ "paddq %%xmm1, %%xmm2\n\t" /* add carry to upper 64bits */
+
+ ".Lno_carry%=:\n\t"
+
"pshufb %[mask], %%xmm2\n\t"
- "movdqa %%xmm2, %[ctr]\n" /* Update CTR. */
+ "movdqa %%xmm2, (%[ctr])\n" /* Update CTR. */
- "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
+ "movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
"movdqa 0x10(%[key]), %%xmm1\n\t"
aesenc_xmm1_xmm0
@@ -1060,12 +1077,13 @@ do_aesni_ctr (const RIJNDAEL_context *ctx,
"pxor %%xmm1, %%xmm0\n\t" /* EncCTR ^= input */
"movdqu %%xmm0, %[dst]" /* Store EncCTR. */
- : [ctr] "+m" (*ctr), [dst] "=m" (*b)
+ : [dst] "=m" (*b)
: [src] "m" (*a),
+ [ctr] "r" (ctr),
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "cc", "memory");
+ : "%esi", "%edi", "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenclast_xmm1_xmm0
}
@@ -1098,10 +1116,16 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
xmm5 temp
*/
- asm volatile ("movdqa %[ctr], %%xmm0\n\t" /* xmm0, xmm2 := CTR */
+ asm volatile ("movdqa (%[ctr]), %%xmm0\n\t" /* xmm0, xmm2 := CTR */
"movaps %%xmm0, %%xmm2\n\t"
"mov $1, %%esi\n\t" /* xmm1 := 1 */
"movd %%esi, %%xmm1\n\t"
+
+ "movl 12(%[ctr]), %%esi\n\t" /* load lower parts of CTR */
+ "bswapl %%esi\n\t"
+ "movl 8(%[ctr]), %%edi\n\t"
+ "bswapl %%edi\n\t"
+
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := le(xmm2) */
"paddq %%xmm1, %%xmm2\n\t" /* xmm2++ */
"movaps %%xmm2, %%xmm3\n\t" /* xmm3 := xmm2 */
@@ -1110,11 +1134,39 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
"paddq %%xmm1, %%xmm4\n\t" /* xmm4++ */
"movaps %%xmm4, %%xmm5\n\t" /* xmm5 := xmm4 */
"paddq %%xmm1, %%xmm5\n\t" /* xmm5++ */
+
+ /* swap upper and lower halves */
+ "pshufd $0x4e, %%xmm1, %%xmm1\n\t"
+
+ "addl $1, %%esi\n\t"
+ "adcl $0, %%edi\n\t" /* detect 64bit overflow */
+ "jc .Lcarry_xmm2%=\n\t"
+ "addl $1, %%esi\n\t"
+ "adcl $0, %%edi\n\t" /* detect 64bit overflow */
+ "jc .Lcarry_xmm3%=\n\t"
+ "addl $1, %%esi\n\t"
+ "adcl $0, %%edi\n\t" /* detect 64bit overflow */
+ "jc .Lcarry_xmm4%=\n\t"
+ "addl $1, %%esi\n\t"
+ "adcl $0, %%edi\n\t" /* detect 64bit overflow */
+ "jc .Lcarry_xmm5%=\n\t"
+ "jmp .Lno_carry%=\n\t"
+
+ ".Lcarry_xmm2%=:\n\t"
+ "paddq %%xmm1, %%xmm2\n\t"
+ ".Lcarry_xmm3%=:\n\t"
+ "paddq %%xmm1, %%xmm3\n\t"
+ ".Lcarry_xmm4%=:\n\t"
+ "paddq %%xmm1, %%xmm4\n\t"
+ ".Lcarry_xmm5%=:\n\t"
+ "paddq %%xmm1, %%xmm5\n\t"
+
+ ".Lno_carry%=:\n\t"
"pshufb %[mask], %%xmm2\n\t" /* xmm2 := be(xmm2) */
"pshufb %[mask], %%xmm3\n\t" /* xmm3 := be(xmm3) */
"pshufb %[mask], %%xmm4\n\t" /* xmm4 := be(xmm4) */
"pshufb %[mask], %%xmm5\n\t" /* xmm5 := be(xmm5) */
- "movdqa %%xmm5, %[ctr]\n" /* Update CTR. */
+ "movdqa %%xmm5, (%[ctr])\n" /* Update CTR. */
"movdqa (%[key]), %%xmm1\n\t" /* xmm1 := key[0] */
"pxor %%xmm1, %%xmm0\n\t" /* xmm0 ^= key[0] */
@@ -1198,28 +1250,30 @@ do_aesni_ctr_4 (const RIJNDAEL_context *ctx,
aesenclast_xmm1_xmm3
aesenclast_xmm1_xmm4
- "movdqu %[src], %%xmm1\n\t" /* Get block 1. */
+ "movdqu (%[src]), %%xmm1\n\t" /* Get block 1. */
"pxor %%xmm1, %%xmm0\n\t" /* EncCTR-1 ^= input */
- "movdqu %%xmm0, %[dst]\n\t" /* Store block 1 */
+ "movdqu %%xmm0, (%[dst])\n\t" /* Store block 1 */
- "movdqu (16)%[src], %%xmm1\n\t" /* Get block 2. */
+ "movdqu 16(%[src]), %%xmm1\n\t" /* Get block 2. */
"pxor %%xmm1, %%xmm2\n\t" /* EncCTR-2 ^= input */
- "movdqu %%xmm2, (16)%[dst]\n\t" /* Store block 2. */
+ "movdqu %%xmm2, 16(%[dst])\n\t" /* Store block 2. */
- "movdqu (32)%[src], %%xmm1\n\t" /* Get block 3. */
+ "movdqu 32(%[src]), %%xmm1\n\t" /* Get block 3. */
"pxor %%xmm1, %%xmm3\n\t" /* EncCTR-3 ^= input */
- "movdqu %%xmm3, (32)%[dst]\n\t" /* Store block 3. */
+ "movdqu %%xmm3, 32(%[dst])\n\t" /* Store block 3. */
- "movdqu (48)%[src], %%xmm1\n\t" /* Get block 4. */
+ "movdqu 48(%[src]), %%xmm1\n\t" /* Get block 4. */
"pxor %%xmm1, %%xmm4\n\t" /* EncCTR-4 ^= input */
- "movdqu %%xmm4, (48)%[dst]" /* Store block 4. */
+ "movdqu %%xmm4, 48(%[dst])" /* Store block 4. */
- : [ctr] "+m" (*ctr), [dst] "=m" (*b)
- : [src] "m" (*a),
+ :
+ : [ctr] "r" (ctr),
+ [src] "r" (a),
+ [dst] "r" (b),
[key] "r" (ctx->keyschenc),
[rounds] "g" (ctx->rounds),
[mask] "m" (*be_mask)
- : "%esi", "cc", "memory");
+ : "%esi", "%edi", "cc", "memory");
#undef aesenc_xmm1_xmm0
#undef aesenc_xmm1_xmm2
#undef aesenc_xmm1_xmm3
@@ -1970,6 +2024,102 @@ selftest_basic_256 (void)
return NULL;
}
+
+/* Run the self-tests for AES-CTR-128, tests IV increment of bulk CTR
+ encryption. Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+ RIJNDAEL_context ctx ATTR_ALIGNED_16;
+ unsigned char plaintext[7*16] ATTR_ALIGNED_16;
+ unsigned char ciphertext[7*16] ATTR_ALIGNED_16;
+ unsigned char plaintext2[7*16] ATTR_ALIGNED_16;
+ unsigned char iv[16] ATTR_ALIGNED_16;
+ unsigned char iv2[16] ATTR_ALIGNED_16;
+ int i, j, diff;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x06,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x21
+ };
+ static char error_str[128];
+
+ rijndael_setkey (&ctx, key, sizeof (key));
+
+ /* Test single block code path */
+ memset(iv, 0xff, sizeof(iv));
+ for (i = 0; i < 16; i++)
+ plaintext[i] = i;
+
+ /* CTR manually. */
+ rijndael_encrypt (&ctx, ciphertext, iv);
+ for (i = 0; i < 16; i++)
+ ciphertext[i] ^= plaintext[i];
+ for (i = 16; i > 0; i--)
+ {
+ iv[i-1]++;
+ if (iv[i-1])
+ break;
+ }
+
+ memset(iv2, 0xff, sizeof(iv2));
+ _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext, 1);
+
+ if (memcmp(plaintext2, plaintext, 16))
+ return "AES-128-CTR test failed (plaintext mismatch)";
+
+ if (memcmp(iv2, iv, 16))
+ return "AES-128-CTR test failed (IV mismatch)";
+
+ /* Test parallelized code paths */
+ for (diff = 0; diff < 7; diff++) {
+ memset(iv, 0xff, sizeof(iv));
+ iv[15] -= diff;
+
+ for (i = 0; i < sizeof(plaintext); i++)
+ plaintext[i] = i;
+
+ /* Create CTR ciphertext manually. */
+ for (i = 0; i < sizeof(plaintext); i+=16)
+ {
+ rijndael_encrypt (&ctx, &ciphertext[i], iv);
+ for (j = 0; j < 16; j++)
+ ciphertext[i+j] ^= plaintext[i+j];
+ for (j = 16; j > 0; j--)
+ {
+ iv[j-1]++;
+ if (iv[j-1])
+ break;
+ }
+ }
+
+ /* Decrypt using bulk CTR and compare result. */
+ memset(iv2, 0xff, sizeof(iv2));
+ iv2[15] -= diff;
+
+ _gcry_aes_ctr_enc (&ctx, iv2, plaintext2, ciphertext,
+ sizeof(ciphertext) / BLOCKSIZE);
+
+ if (memcmp(plaintext2, plaintext, sizeof(plaintext)))
+ {
+ snprintf(error_str, sizeof(error_str),
+ "AES-128-CTR test failed (plaintext mismatch, diff: %d)",
+ diff);
+ return error_str;
+ }
+ if (memcmp(iv2, iv, sizeof(iv)))
+ {
+ snprintf(error_str, sizeof(error_str),
+ "AES-128-CTR test failed (IV mismatch, diff: %d)",
+ diff);
+ return error_str;
+ }
+ }
+
+ return NULL;
+}
+
+
/* Run all the self-tests and return NULL on success. This function
is used for the on-the-fly self-tests. */
static const char *
@@ -1982,6 +2132,9 @@ selftest (void)
|| (r = selftest_basic_256 ()) )
return r;
+ if ( (r = selftest_ctr_128 ()) )
+ return r;
+
return r;
}
commit 35aff0cd43885b5f5c076432ec614698abeb63d8
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:22:14 2012 +0200
Add parallelized AES-NI CBC decryption
* cipher/rijndael.c [USE_AESNI] (aesni_cleanup_5): New macro.
[USE_AESNI] (do_aesni_dec_vec4): New function.
(_gcry_aes_cbc_dec) [USE_AESNI]: Add parallelized CBC loop.
(_gcry_aes_cbc_dec) [USE_AESNI]: Change IV storage register from xmm3
to xmm5.
--
This gives ~60% improvement in CBC decryption speed on sandy-bridge (x86-64).
Overall speed improvement with this and previous CBC patches is over 400%.
Before:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 670ms 770ms 2920ms 720ms 1900ms 660ms 2260ms 2250ms 480ms 500ms
AES192 860ms 930ms 3250ms 870ms 2210ms 830ms 2580ms 2580ms 570ms 570ms
AES256 1020ms 1080ms 3580ms 1030ms 2550ms 970ms 2880ms 2870ms 660ms 660ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 670ms 770ms 2130ms 450ms 1880ms 670ms 2250ms 2280ms 490ms 490ms
AES192 880ms 920ms 2460ms 540ms 2210ms 830ms 2580ms 2570ms 580ms 570ms
AES256 1020ms 1070ms 2800ms 620ms 2560ms 970ms 2880ms 2880ms 660ms 650ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 69e1df1..34a0f8c 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -822,6 +822,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
}
+/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static void
+do_aesni_dec_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesdec_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xde, 0xc8\n\t"
+#define aesdec_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd0\n\t"
+#define aesdec_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xde, 0xd8\n\t"
+#define aesdec_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xde, 0xe0\n\t"
+#define aesdeclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xc8\n\t"
+#define aesdeclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd0\n\t"
+#define aesdeclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xd8\n\t"
+#define aesdeclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdf, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmp $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmp $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesdec_xmm0_xmm1
+ aesdec_xmm0_xmm2
+ aesdec_xmm0_xmm3
+ aesdec_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesdeclast_xmm0_xmm1
+ aesdeclast_xmm0_xmm2
+ aesdeclast_xmm0_xmm3
+ aesdeclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschdec),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesdec_xmm0_xmm1
+#undef aesdec_xmm0_xmm2
+#undef aesdec_xmm0_xmm3
+#undef aesdec_xmm0_xmm4
+#undef aesdeclast_xmm0_xmm1
+#undef aesdeclast_xmm0_xmm2
+#undef aesdeclast_xmm0_xmm3
+#undef aesdeclast_xmm0_xmm4
+}
+
+
/* Perform a CFB encryption or decryption round using the
initialization vector IV and the input block A. Write the result
to the output block B and update IV. IV needs to be 16 byte
@@ -1623,17 +1732,51 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
ctx->decryption_prepared = 1;
}
- /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
- storage, out-of-order CPUs see parallellism even over loop iterations
- and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
- improvements are possible with do_aesni_cbc_dec_4() when implemented.
- */
asm volatile
- ("movdqu %[iv], %%xmm3\n\t" /* use xmm3 as fast IV storage */
+ ("movdqu %[iv], %%xmm5\n\t" /* use xmm5 as fast IV storage */
: /* No output */
: [iv] "m" (*iv)
: "memory");
+ for ( ;nblocks > 3 ; nblocks -= 4 )
+ {
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 1*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 3*16(%[inbuf]), %%xmm4\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf)
+ : "memory");
+
+ do_aesni_dec_vec4 (ctx);
+
+ asm volatile
+ ("pxor %%xmm5, %%xmm1\n\t" /* xor IV with output */
+ "movdqu 0*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm2\n\t" /* xor IV with output */
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm3\n\t" /* xor IV with output */
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "pxor %%xmm5, %%xmm4\n\t" /* xor IV with output */
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t" /* load new IV */
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
for ( ;nblocks; nblocks-- )
{
asm volatile
@@ -1647,9 +1790,9 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
asm volatile
("movdqu %[outbuf], %%xmm0\n\t"
- "pxor %%xmm3, %%xmm0\n\t" /* xor IV with output */
+ "pxor %%xmm5, %%xmm0\n\t" /* xor IV with output */
"movdqu %%xmm0, %[outbuf]\n\t"
- "movdqu %%xmm2, %%xmm3\n\t" /* store savebuf as new IV */
+ "movdqu %%xmm2, %%xmm5\n\t" /* store savebuf as new IV */
: /* No output */
: [outbuf] "m" (*outbuf)
: "memory");
@@ -1659,7 +1802,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
}
asm volatile
- ("movdqu %%xmm3, %[iv]\n\t" /* store IV */
+ ("movdqu %%xmm5, %[iv]\n\t" /* store IV */
: /* No output */
: [iv] "m" (*iv)
: "memory");
commit 5acd0e5ae2a58dda51c2b56c879b80a1a6d2c42f
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:22:09 2012 +0200
Clear xmm5 after use in AES-NI CTR mode
* cipher/rijndael.c [USE_AESNI]: Rename aesni_cleanup_2_4 to
aesni_cleanup_2_5.
[USE_AESNI] (aesni_cleanup_2_5): Clear xmm5 register.
(_gcry_aes_ctr_enc, _gcry_aes_cbc_dec) [USE_AESNI]: Use
aesni_cleanup_2_5 instead of aesni_cleanup_2_4.
--
xmm5 register is used by parallelized AES-NI CTR mode, so it should be cleaned
up after use too.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 982c54e..69e1df1 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -144,10 +144,11 @@ typedef struct
do { asm volatile ("pxor %%xmm0, %%xmm0\n\t" \
"pxor %%xmm1, %%xmm1\n" :: ); \
} while (0)
-# define aesni_cleanup_2_4() \
+# define aesni_cleanup_2_5() \
do { asm volatile ("pxor %%xmm2, %%xmm2\n\t" \
"pxor %%xmm3, %%xmm3\n" \
- "pxor %%xmm4, %%xmm4\n":: ); \
+ "pxor %%xmm4, %%xmm4\n" \
+ "pxor %%xmm5, %%xmm5\n":: ); \
} while (0)
#else
# define aesni_prepare() do { } while (0)
@@ -1338,7 +1339,7 @@ _gcry_aes_ctr_enc (void *context, unsigned char *ctr,
inbuf += BLOCKSIZE;
}
aesni_cleanup ();
- aesni_cleanup_2_4 ();
+ aesni_cleanup_2_5 ();
}
#endif /*USE_AESNI*/
else
@@ -1664,7 +1665,7 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
: "memory");
aesni_cleanup ();
- aesni_cleanup_2_4 ();
+ aesni_cleanup_2_5 ();
}
#endif /*USE_AESNI*/
else
commit be3768994ad362dfc849a8cd0146b4c9bb287d20
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:22:04 2012 +0200
Optimize AES-NI CBC encryption
* cipher/rijndael.c (_gcry_aes_cbc_enc) [USE_AESNI]: Add AES-NI
specific loop and use SSE2 assembler for xoring and copying of
blocks.
--
This gives ~35% improvement in 'tests/benchmark cipher aes' on Sandy-Bridge
CPU (x86-64).
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 104f869..982c54e 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1249,23 +1249,50 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
aesni_prepare ();
for ( ;nblocks; nblocks-- )
{
- for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
- outbuf[i] = inbuf[i] ^ *ivp++;
-
if (0)
;
-#ifdef USE_PADLOCK
- else if (ctx->use_padlock)
- do_padlock (ctx, 0, outbuf, outbuf);
-#endif /*USE_PADLOCK*/
#ifdef USE_AESNI
else if (ctx->use_aesni)
- do_aesni (ctx, 0, outbuf, outbuf);
+ {
+ /* ~35% speed up on Sandy-Bridge when doing xoring and copying with
+ SSE registers. */
+ asm volatile ("movdqu %[iv], %%xmm0\n\t"
+ "movdqu %[inbuf], %%xmm1\n\t"
+ "pxor %%xmm0, %%xmm1\n\t"
+ "movdqu %%xmm1, %[outbuf]\n\t"
+ : /* No output */
+ : [iv] "m" (*iv),
+ [inbuf] "m" (*inbuf),
+ [outbuf] "m" (*outbuf)
+ : "memory" );
+
+ do_aesni (ctx, 0, outbuf, outbuf);
+
+ asm volatile ("movdqu %[outbuf], %%xmm0\n\t"
+ "movdqu %%xmm0, %[iv]\n\t"
+ : /* No output */
+ : [outbuf] "m" (*outbuf),
+ [iv] "m" (*iv)
+ : "memory" );
+ }
#endif /*USE_AESNI*/
else
- do_encrypt (ctx, outbuf, outbuf );
+ {
+ for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+ outbuf[i] = inbuf[i] ^ *ivp++;
+
+ if (0)
+ ;
+#ifdef USE_PADLOCK
+ else if (ctx->use_padlock)
+ do_padlock (ctx, 0, outbuf, outbuf);
+#endif /*USE_PADLOCK*/
+ else
+ do_encrypt (ctx, outbuf, outbuf );
+
+ memcpy (iv, outbuf, BLOCKSIZE);
+ }
- memcpy (iv, outbuf, BLOCKSIZE);
inbuf += BLOCKSIZE;
if (!cbc_mac)
outbuf += BLOCKSIZE;
commit 3369d960158ab4231b83926a0f982e2a8819f173
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:21:59 2012 +0200
Improve parallelizability of CBC decryption for AES-NI
* cipher/rijndael.c (_gcry_aes_cbc_dec) [USE_AESNI]: Add AES-NI
specific CBC mode loop with temporary block and IV stored in free SSE
registers.
--
Benchmark results on Intel Core i5-2450M (x86-64) show ~2.5x improvement:
Before:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 690ms 780ms 2940ms 2110ms 1880ms 670ms 2250ms 2250ms 490ms 500ms
AES192 890ms 930ms 3260ms 2390ms 2220ms 820ms 2580ms 2590ms 560ms 570ms
AES256 1040ms 1070ms 3590ms 2640ms 2540ms 970ms 2880ms 2890ms 650ms 650ms
After:
$ tests/benchmark --cipher-repetitions 1000 cipher aes aes192 aes256
Running each test 1000 times.
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
AES 670ms 770ms 2920ms 720ms 1900ms 660ms 2260ms 2250ms 480ms 500ms
AES192 860ms 930ms 3250ms 870ms 2210ms 830ms 2580ms 2580ms 570ms 570ms
AES256 1020ms 1080ms 3580ms 1030ms 2550ms 970ms 2880ms 2870ms 660ms 660ms
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index d081b42..104f869 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -1582,33 +1582,86 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
int i;
unsigned char savebuf[BLOCKSIZE];
- aesni_prepare ();
- for ( ;nblocks; nblocks-- )
+ if (0)
+ ;
+#ifdef USE_AESNI
+ else if (ctx->use_aesni)
{
- /* We need to save INBUF away because it may be identical to
- OUTBUF. */
- memcpy (savebuf, inbuf, BLOCKSIZE);
+ aesni_prepare ();
- if (0)
- ;
+ if (!ctx->decryption_prepared )
+ {
+ prepare_decryption ( ctx );
+ ctx->decryption_prepared = 1;
+ }
+
+ /* As we avoid memcpy to/from stack by using xmm2 and xmm3 for temporary
+ storage, out-of-order CPUs see parallellism even over loop iterations
+ and see 2.5x to 2.9x speed up on Intel Sandy-Bridge. Further
+ improvements are possible with do_aesni_cbc_dec_4() when implemented.
+ */
+ asm volatile
+ ("movdqu %[iv], %%xmm3\n\t" /* use xmm3 as fast IV storage */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ for ( ;nblocks; nblocks-- )
+ {
+ asm volatile
+ ("movdqu %[inbuf], %%xmm2\n\t" /* use xmm2 as savebuf */
+ : /* No output */
+ : [inbuf] "m" (*inbuf)
+ : "memory");
+
+ /* uses only xmm0 and xmm1 */
+ do_aesni_dec_aligned (ctx, outbuf, inbuf);
+
+ asm volatile
+ ("movdqu %[outbuf], %%xmm0\n\t"
+ "pxor %%xmm3, %%xmm0\n\t" /* xor IV with output */
+ "movdqu %%xmm0, %[outbuf]\n\t"
+ "movdqu %%xmm2, %%xmm3\n\t" /* store savebuf as new IV */
+ : /* No output */
+ : [outbuf] "m" (*outbuf)
+ : "memory");
+
+ outbuf += BLOCKSIZE;
+ inbuf += BLOCKSIZE;
+ }
+
+ asm volatile
+ ("movdqu %%xmm3, %[iv]\n\t" /* store IV */
+ : /* No output */
+ : [iv] "m" (*iv)
+ : "memory");
+
+ aesni_cleanup ();
+ aesni_cleanup_2_4 ();
+ }
+#endif /*USE_AESNI*/
+ else
+ for ( ;nblocks; nblocks-- )
+ {
+ /* We need to save INBUF away because it may be identical to
+ OUTBUF. */
+ memcpy (savebuf, inbuf, BLOCKSIZE);
+
+ if (0)
+ ;
#ifdef USE_PADLOCK
- else if (ctx->use_padlock)
- do_padlock (ctx, 1, outbuf, inbuf);
+ else if (ctx->use_padlock)
+ do_padlock (ctx, 1, outbuf, inbuf);
#endif /*USE_PADLOCK*/
-#ifdef USE_AESNI
- else if (ctx->use_aesni)
- do_aesni (ctx, 1, outbuf, inbuf);
-#endif /*USE_AESNI*/
- else
- do_decrypt (ctx, outbuf, inbuf);
+ else
+ do_decrypt (ctx, outbuf, inbuf);
- for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
- outbuf[i] ^= *ivp++;
- memcpy (iv, savebuf, BLOCKSIZE);
- inbuf += BLOCKSIZE;
- outbuf += BLOCKSIZE;
- }
- aesni_cleanup ();
+ for (ivp=iv, i=0; i < BLOCKSIZE; i++ )
+ outbuf[i] ^= *ivp++;
+ memcpy (iv, savebuf, BLOCKSIZE);
+ inbuf += BLOCKSIZE;
+ outbuf += BLOCKSIZE;
+ }
_gcry_burn_stack (48 + 2*sizeof(int) + BLOCKSIZE + 4*sizeof (char*));
}
commit 55b96be08531664ed3f4230acebe0f45954bbc33
Author: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
Date: Fri Nov 23 19:21:54 2012 +0200
Extend test of chained modes for 128bit ciphers
* tests/basic.c (check_one_cipher_core, check_one_cipher): Increase
input and output buffer sizes from 16 bytes to 1024+16=1040 bytes.
(check_one_cipher_core): Add asserts to verify sizes of temporary
buffers.
--
Currently check_one_cipher() has buffer size of 16 bytes, which is one block
with 128bit cipher. As result chained modes for 128bit ciphers are not well
tested. Increase buffer size to 1040 bytes, so that iterations of chained
modes and parallelized code paths (AES-NI CTR, etc.) are also tested.
Extra 16 bytes after 1024 bytes to ensure that the loop transition from
parallelized code paths to serialized code paths get tested too.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at mbnet.fi>
diff --git a/tests/basic.c b/tests/basic.c
index 8001e86..656d76c 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -1367,13 +1367,15 @@ check_one_cipher_core (int algo, int mode, int flags,
int bufshift, int pass)
{
gcry_cipher_hd_t hd;
- unsigned char in_buffer[17], out_buffer[17];
+ unsigned char in_buffer[1040+1], out_buffer[1040+1];
unsigned char *in, *out;
int keylen;
gcry_error_t err = 0;
assert (nkey == 32);
- assert (nplain == 16);
+ assert (nplain == 1040);
+ assert (sizeof(in_buffer) == nplain + 1);
+ assert (sizeof(out_buffer) == sizeof(in_buffer));
if (!bufshift)
{
@@ -1427,7 +1429,7 @@ check_one_cipher_core (int algo, int mode, int flags,
return -1;
}
- err = gcry_cipher_encrypt (hd, out, 16, plain, 16);
+ err = gcry_cipher_encrypt (hd, out, nplain, plain, nplain);
if (err)
{
fail ("pass %d, algo %d, mode %d, gcry_cipher_encrypt failed: %s\n",
@@ -1438,7 +1440,7 @@ check_one_cipher_core (int algo, int mode, int flags,
gcry_cipher_reset (hd);
- err = gcry_cipher_decrypt (hd, in, 16, out, 16);
+ err = gcry_cipher_decrypt (hd, in, nplain, out, nplain);
if (err)
{
fail ("pass %d, algo %d, mode %d, gcry_cipher_decrypt failed: %s\n",
@@ -1447,15 +1449,15 @@ check_one_cipher_core (int algo, int mode, int flags,
return -1;
}
- if (memcmp (plain, in, 16))
+ if (memcmp (plain, in, nplain))
fail ("pass %d, algo %d, mode %d, encrypt-decrypt mismatch\n",
pass, algo, mode);
/* Again, using in-place encryption. */
gcry_cipher_reset (hd);
- memcpy (out, plain, 16);
- err = gcry_cipher_encrypt (hd, out, 16, NULL, 0);
+ memcpy (out, plain, nplain);
+ err = gcry_cipher_encrypt (hd, out, nplain, NULL, 0);
if (err)
{
fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_encrypt failed:"
@@ -1467,7 +1469,7 @@ check_one_cipher_core (int algo, int mode, int flags,
gcry_cipher_reset (hd);
- err = gcry_cipher_decrypt (hd, out, 16, NULL, 0);
+ err = gcry_cipher_decrypt (hd, out, nplain, NULL, 0);
if (err)
{
fail ("pass %d, algo %d, mode %d, in-place, gcry_cipher_decrypt failed:"
@@ -1477,7 +1479,7 @@ check_one_cipher_core (int algo, int mode, int flags,
return -1;
}
- if (memcmp (plain, out, 16))
+ if (memcmp (plain, out, nplain))
fail ("pass %d, algo %d, mode %d, in-place, encrypt-decrypt mismatch\n",
pass, algo, mode);
@@ -1492,34 +1494,43 @@ check_one_cipher_core (int algo, int mode, int flags,
static void
check_one_cipher (int algo, int mode, int flags)
{
- char key[33];
- unsigned char plain[17];
- int bufshift;
+ char key[32+1];
+ unsigned char plain[1040+1];
+ int bufshift, i;
for (bufshift=0; bufshift < 4; bufshift++)
{
/* Pass 0: Standard test. */
memcpy (key, "0123456789abcdef.,;/[]{}-=ABCDEF", 32);
memcpy (plain, "foobar42FOOBAR17", 16);
- if (check_one_cipher_core (algo, mode, flags, key, 32, plain, 16,
+ for (i = 16; i < 1040; i += 16)
+ {
+ memcpy (&plain[i], &plain[i-16], 16);
+ if (!++plain[i+7])
+ plain[i+6]++;
+ if (!++plain[i+15])
+ plain[i+14]++;
+ }
+
+ if (check_one_cipher_core (algo, mode, flags, key, 32, plain, 1040,
bufshift, 0+10*bufshift))
return;
/* Pass 1: Key not aligned. */
memmove (key+1, key, 32);
- if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain, 16,
+ if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain, 1040,
bufshift, 1+10*bufshift))
return;
/* Pass 2: Key not aligned and data not aligned. */
- memmove (plain+1, plain, 16);
- if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain+1, 16,
+ memmove (plain+1, plain, 1024);
+ if (check_one_cipher_core (algo, mode, flags, key+1, 32, plain+1, 1040,
bufshift, 2+10*bufshift))
return;
/* Pass 3: Key aligned and data not aligned. */
memmove (key, key+1, 32);
- if (check_one_cipher_core (algo, mode, flags, key, 32, plain+1, 16,
+ if (check_one_cipher_core (algo, mode, flags, key, 32, plain+1, 1040,
bufshift, 3+10*bufshift))
return;
}
-----------------------------------------------------------------------
Summary of changes:
cipher/rijndael.c | 483 +++++++++++++++++++++++++++++++++++++++++++++++------
src/g10lib.h | 43 +++++-
tests/basic.c | 45 +++--
3 files changed, 500 insertions(+), 71 deletions(-)
hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org
More information about the Gnupg-commits
mailing list