[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-141-g6deb0cc
by Jussi Kivilinna
cvs at cvs.gnupg.org
Thu May 23 17:59:37 CEST 2013
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".
The branch, master has been updated
via 6deb0ccdf718a0670f80e6762a3842caf76437d6 (commit)
via b60f06f70227c1e69e1010da8b47ea51ade48145 (commit)
via 319ee14f2aab8db56a830fd7ac8926f91b4f738a (commit)
from b402de8b9c4a9f269faf03ca952b1eb68a1f33c8 (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 6deb0ccdf718a0670f80e6762a3842caf76437d6
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Thu May 23 14:15:51 2013 +0300
serpent: add parallel processing for CFB decryption
* cipher/cipher.c (gcry_cipher_open): Add bulf CFB decryption function
for Serpent.
* cipher/serpent-sse2-amd64.S (_gcry_serpent_sse2_cfb_dec): New
function.
* cipher/serpent.c (_gcry_serpent_sse2_cfb_dec): New prototype.
(_gcry_serpent_cfb_dec) New function.
(selftest_cfb_128) New function.
(selftest) Call selftest_cfb_128.
* src/cipher.h (_gcry_serpent_cfb_dec): New prototype.
--
Patch makes Serpent-CFB decryption 4.0 times faster on Intel Sandy-Bridge and
2.7 times faster on AMD K10.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/cipher.c b/cipher/cipher.c
index e9a652f..652d795 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -732,6 +732,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
case GCRY_CIPHER_SERPENT192:
case GCRY_CIPHER_SERPENT256:
h->bulk.cbc_dec = _gcry_serpent_cbc_dec;
+ h->bulk.cfb_dec = _gcry_serpent_cfb_dec;
h->bulk.ctr_enc = _gcry_serpent_ctr_enc;
break;
#endif /*USE_SERPENT*/
diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S
index 8d8c8dd..5f9e9d2 100644
--- a/cipher/serpent-sse2-amd64.S
+++ b/cipher/serpent-sse2-amd64.S
@@ -822,5 +822,71 @@ _gcry_serpent_sse2_cbc_dec:
ret
.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;
+.align 8
+.global _gcry_serpent_sse2_cfb_dec
+.type _gcry_serpent_sse2_cfb_dec, at function;
+_gcry_serpent_sse2_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (8 blocks)
+ * %rdx: src (8 blocks)
+ * %rcx: iv
+ */
+
+ .set RA0, enc_in_a0
+ .set RA1, enc_in_a1
+ .set RA2, enc_in_a2
+ .set RA3, enc_in_a3
+ .set RB0, enc_in_b0
+ .set RB1, enc_in_b1
+ .set RB2, enc_in_b2
+ .set RB3, enc_in_b3
+
+ /* Load input */
+ movdqu (%rcx), RA0;
+ movdqu 0 * 16(%rdx), RA1;
+ movdqu 1 * 16(%rdx), RA2;
+ movdqu 2 * 16(%rdx), RA3;
+ movdqu 3 * 16(%rdx), RB0;
+ movdqu 4 * 16(%rdx), RB1;
+ movdqu 5 * 16(%rdx), RB2;
+ movdqu 6 * 16(%rdx), RB3;
+
+ /* Update IV */
+ movdqu 7 * 16(%rdx), RNOT;
+ movdqu RNOT, (%rcx);
+
+ call __serpent_enc_blk8;
+
+ .set RA0, enc_out_a0
+ .set RA1, enc_out_a1
+ .set RA2, enc_out_a2
+ .set RA3, enc_out_a3
+ .set RB0, enc_out_b0
+ .set RB1, enc_out_b1
+ .set RB2, enc_out_b2
+ .set RB3, enc_out_b3
+
+ pxor_u((0 * 16)(%rdx), RA0, RTMP0);
+ pxor_u((1 * 16)(%rdx), RA1, RTMP0);
+ pxor_u((2 * 16)(%rdx), RA2, RTMP0);
+ pxor_u((3 * 16)(%rdx), RA3, RTMP0);
+ pxor_u((4 * 16)(%rdx), RB0, RTMP0);
+ pxor_u((5 * 16)(%rdx), RB1, RTMP0);
+ pxor_u((6 * 16)(%rdx), RB2, RTMP0);
+ pxor_u((7 * 16)(%rdx), RB3, RTMP0);
+
+ movdqu RA0, (0 * 16)(%rsi);
+ movdqu RA1, (1 * 16)(%rsi);
+ movdqu RA2, (2 * 16)(%rsi);
+ movdqu RA3, (3 * 16)(%rsi);
+ movdqu RB0, (4 * 16)(%rsi);
+ movdqu RB1, (5 * 16)(%rsi);
+ movdqu RB2, (6 * 16)(%rsi);
+ movdqu RB3, (7 * 16)(%rsi);
+
+ ret
+.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;
+
#endif /*defined(USE_SERPENT)*/
#endif /*__x86_64*/
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 7b82b48..95ac7c1 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -74,6 +74,11 @@ extern void _gcry_serpent_sse2_cbc_dec(serpent_context_t *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
+
+extern void _gcry_serpent_sse2_cfb_dec(serpent_context_t *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
#endif
/* A prototype. */
@@ -916,6 +921,71 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_serpent_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks)
+{
+ serpent_context_t *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = 2 * sizeof (serpent_block_t);
+
+#ifdef USE_SSE2
+ {
+ int did_use_sse2 = 0;
+
+ /* Process data in 8 block chunks. */
+ while (nblocks >= 8)
+ {
+ _gcry_serpent_sse2_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 8;
+ outbuf += 8 * sizeof(serpent_block_t);
+ inbuf += 8 * sizeof(serpent_block_t);
+ did_use_sse2 = 1;
+ }
+
+ if (did_use_sse2)
+ {
+ /* clear SSE2 registers used by serpent-sse2 */
+ asm volatile (
+ "pxor %%xmm0, %%xmm0;\n"
+ "pxor %%xmm1, %%xmm1;\n"
+ "pxor %%xmm2, %%xmm2;\n"
+ "pxor %%xmm3, %%xmm3;\n"
+ "pxor %%xmm4, %%xmm4;\n"
+ "pxor %%xmm5, %%xmm5;\n"
+ "pxor %%xmm6, %%xmm6;\n"
+ "pxor %%xmm7, %%xmm7;\n"
+ "pxor %%xmm10, %%xmm10;\n"
+ "pxor %%xmm11, %%xmm11;\n"
+ "pxor %%xmm12, %%xmm12;\n"
+ "pxor %%xmm13, %%xmm13;\n"
+ :::);
+
+ /* serpent-sse2 assembly code does not use stack */
+ if (nblocks == 0)
+ burn_stack_depth = 0;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ serpent_encrypt_internal(ctx, iv, iv);
+ buf_xor_n_copy(outbuf, iv, inbuf, sizeof(serpent_block_t));
+ outbuf += sizeof(serpent_block_t);
+ inbuf += sizeof(serpent_block_t);
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Run the self-tests for SERPENT-CTR-128, tests IV increment of bulk CTR
@@ -948,6 +1018,21 @@ selftest_cbc_128 (void)
}
+/* Run the self-tests for SERPENT-CBC-128, tests bulk CBC decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = sizeof(serpent_block_t);
+ const int context_size = sizeof(serpent_context_t);
+
+ return _gcry_selftest_helper_cfb_128("SERPENT", &serpent_setkey,
+ &serpent_encrypt, &_gcry_serpent_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
+
/* Serpent test. */
static const char *
@@ -1034,6 +1119,9 @@ serpent_test (void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return NULL;
}
diff --git a/src/cipher.h b/src/cipher.h
index f28990d..9d6cc01 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -113,6 +113,9 @@ void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
void _gcry_serpent_cbc_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+void _gcry_serpent_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks);
/*-- dsa.c --*/
void _gcry_register_pk_dsa_progress (gcry_handler_progress_t cbc, void *cb_data);
commit b60f06f70227c1e69e1010da8b47ea51ade48145
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Thu May 23 14:15:46 2013 +0300
camellia: add parallel processing for CFB decryption
* cipher/camellia-aesni-avx-amd64.S
(_gcry_camellia_aesni_avx_cfb_dec): New function.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_cfb_dec): New
prototype.
(_gcry_camellia_cfb_dec): New function.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
* cipher/cipher.c (gry_cipher_open): Add bulk CFB decryption function
for Camellia.
* src/cipher.h (_gcry_camellia_cfb_dec): New prototype.
--
Patch makes Camellia-CFB decryption 4.7 times faster on Intel Sandy-Bridge.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index 2b1df17..95c96b8 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1116,5 +1116,70 @@ _gcry_camellia_aesni_avx_cbc_dec:
ret;
.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
+.align 8
+.global _gcry_camellia_aesni_avx_cfb_dec
+.type _gcry_camellia_aesni_avx_cfb_dec, at function;
+
+_gcry_camellia_aesni_avx_cfb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: iv
+ */
+
+ subq $(16 * 16), %rsp;
+ movq %rsp, %rax;
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm0;
+ vpshufb .Lpack_bswap RIP, %xmm0, %xmm0;
+ vpxor (%rcx), %xmm0, %xmm15;
+ vmovdqu 15 * 16(%rdx), %xmm1;
+ vmovdqu %xmm1, (%rcx); /* store new IV */
+ vpxor 0 * 16(%rdx), %xmm0, %xmm14;
+ vpxor 1 * 16(%rdx), %xmm0, %xmm13;
+ vpxor 2 * 16(%rdx), %xmm0, %xmm12;
+ vpxor 3 * 16(%rdx), %xmm0, %xmm11;
+ vpxor 4 * 16(%rdx), %xmm0, %xmm10;
+ vpxor 5 * 16(%rdx), %xmm0, %xmm9;
+ vpxor 6 * 16(%rdx), %xmm0, %xmm8;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm7;
+ vpxor 8 * 16(%rdx), %xmm0, %xmm6;
+ vpxor 9 * 16(%rdx), %xmm0, %xmm5;
+ vpxor 10 * 16(%rdx), %xmm0, %xmm4;
+ vpxor 11 * 16(%rdx), %xmm0, %xmm3;
+ vpxor 12 * 16(%rdx), %xmm0, %xmm2;
+ vpxor 13 * 16(%rdx), %xmm0, %xmm1;
+ vpxor 14 * 16(%rdx), %xmm0, %xmm0;
+
+ call __camellia_enc_blk16;
+
+ addq $(16 * 16), %rsp;
+
+ vpxor 0 * 16(%rdx), %xmm7, %xmm7;
+ vpxor 1 * 16(%rdx), %xmm6, %xmm6;
+ vpxor 2 * 16(%rdx), %xmm5, %xmm5;
+ vpxor 3 * 16(%rdx), %xmm4, %xmm4;
+ vpxor 4 * 16(%rdx), %xmm3, %xmm3;
+ vpxor 5 * 16(%rdx), %xmm2, %xmm2;
+ vpxor 6 * 16(%rdx), %xmm1, %xmm1;
+ vpxor 7 * 16(%rdx), %xmm0, %xmm0;
+ vpxor 8 * 16(%rdx), %xmm15, %xmm15;
+ vpxor 9 * 16(%rdx), %xmm14, %xmm14;
+ vpxor 10 * 16(%rdx), %xmm13, %xmm13;
+ vpxor 11 * 16(%rdx), %xmm12, %xmm12;
+ vpxor 12 * 16(%rdx), %xmm11, %xmm11;
+ vpxor 13 * 16(%rdx), %xmm10, %xmm10;
+ vpxor 14 * 16(%rdx), %xmm9, %xmm9;
+ vpxor 15 * 16(%rdx), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ ret;
+.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
+
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 4c724a9..f9bbb33 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -102,6 +102,11 @@ extern void _gcry_camellia_aesni_avx_cbc_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv);
+
+extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *iv);
#endif
static const char *selftest(void);
@@ -308,6 +313,58 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+/* Bulk decryption of complete blocks in CFB mode. This function is only
+ intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_camellia_cfb_dec(void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks)
+{
+ CAMELLIA_context *ctx = context;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ _gcry_camellia_aesni_avx_cfb_dec(ctx, outbuf, inbuf, iv);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ /* clear AVX registers */
+ asm volatile ("vzeroall;\n":::);
+
+ if (burn_stack_depth < 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *))
+ burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 2 * sizeof(void *);
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for ( ;nblocks; nblocks-- )
+ {
+ Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
+ buf_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ }
+
+ _gcry_burn_stack(burn_stack_depth);
+}
+
/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
encryption. Returns NULL on success. */
static const char*
@@ -336,6 +393,20 @@ selftest_cbc_128 (void)
context_size);
}
+/* Run the self-tests for CAMELLIA-CFB-128, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 16+2;
+ const int blocksize = CAMELLIA_BLOCK_SIZE;
+ const int context_size = sizeof(CAMELLIA_context);
+
+ return _gcry_selftest_helper_cfb_128("CAMELLIA", &camellia_setkey,
+ &camellia_encrypt, &_gcry_camellia_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
static const char *
selftest(void)
{
@@ -411,6 +482,9 @@ selftest(void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return NULL;
}
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 20ac2c7..e9a652f 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -723,6 +723,7 @@ gcry_cipher_open (gcry_cipher_hd_t *handle,
case GCRY_CIPHER_CAMELLIA192:
case GCRY_CIPHER_CAMELLIA256:
h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
+ h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
break;
#endif /*USE_CAMELLIA*/
diff --git a/src/cipher.h b/src/cipher.h
index 4e68487..f28990d 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -102,6 +102,9 @@ void _gcry_camellia_ctr_enc (void *context, unsigned char *ctr,
void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
unsigned int nblocks);
+void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
+ void *outbuf_arg, const void *inbuf_arg,
+ unsigned int nblocks);
/*-- serpent.c --*/
void _gcry_serpent_ctr_enc (void *context, unsigned char *ctr,
commit 319ee14f2aab8db56a830fd7ac8926f91b4f738a
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date: Thu May 23 14:15:41 2013 +0300
rinjdael: add parallel processing for CFB decryption with AES-NI
* cipher/cipher-selftest.c (_gcry_selftest_helper_cfb_128): New
function for CFB selftests.
* cipher/cipher-selftest.h (_gcry_selftest_helper_cfb_128): New
prototype.
* cipher/rijndael.c [USE_AESNI] (do_aesni_enc_vec4): New function.
(_gcry_aes_cfb_dec) [USE_AESNI]: Add parallelized CFB decryption.
(selftest_cfb_128): New function.
(selftest): Call selftest_cfb_128.
--
CFB decryption can be parallelized for additional performance. On Intel
Sandy-Bridge processor, this change makes CFB decryption 4.6 times faster.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
diff --git a/cipher/cipher-selftest.c b/cipher/cipher-selftest.c
index 439f3ae..41eb405 100644
--- a/cipher/cipher-selftest.c
+++ b/cipher/cipher-selftest.c
@@ -160,6 +160,119 @@ _gcry_selftest_helper_cbc_128 (const char *cipher,
return NULL;
}
+/* Run the self-tests for <block cipher>-CFB-128, tests bulk CFB
+ decryption. Returns NULL on success. */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher,
+ gcry_cipher_setkey_t setkey_func,
+ gcry_cipher_encrypt_t encrypt_one,
+ gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+ const int nblocks, const int blocksize,
+ const int context_size)
+{
+ int i, offs;
+ unsigned char *ctx, *plaintext, *plaintext2, *ciphertext, *iv, *iv2, *mem;
+ unsigned int ctx_aligned_size, memsize;
+
+ static const unsigned char key[16] ATTR_ALIGNED_16 = {
+ 0x11,0x9A,0x00,0x7F,0xC7,0x6A,0x45,0x9F,
+ 0x98,0xBA,0xF9,0x17,0xFE,0xDF,0x95,0x33
+ };
+
+ /* Allocate buffers, align elements to 16 bytes. */
+ ctx_aligned_size = context_size + 15;
+ ctx_aligned_size -= ctx_aligned_size & 0xf;
+
+ memsize = ctx_aligned_size + (blocksize * 2) + (blocksize * nblocks * 3) + 16;
+
+ mem = gcry_calloc (1, memsize);
+ if (!mem)
+ return "failed to allocate memory";
+
+ offs = (16 - ((uintptr_t)mem & 15)) & 15;
+ ctx = (void*)(mem + offs);
+ iv = ctx + ctx_aligned_size;
+ iv2 = iv + blocksize;
+ plaintext = iv2 + blocksize;
+ plaintext2 = plaintext + nblocks * blocksize;
+ ciphertext = plaintext2 + nblocks * blocksize;
+
+ /* Initialize ctx */
+ setkey_func (ctx, key, sizeof(key));
+
+ /* Test single block code path */
+ memset(iv, 0xd3, blocksize);
+ memset(iv2, 0xd3, blocksize);
+ for (i = 0; i < blocksize; i++)
+ plaintext[i] = i;
+
+ /* CFB manually. */
+ encrypt_one (ctx, ciphertext, iv);
+ buf_xor_2dst (iv, ciphertext, plaintext, blocksize);
+
+ /* CFB decrypt. */
+ bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, 1);
+ if (memcmp(plaintext2, plaintext, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (plaintext mismatch)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (IV mismatch)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ /* Test parallelized code paths */
+ memset(iv, 0xe6, blocksize);
+ memset(iv2, 0xe6, blocksize);
+
+ for (i = 0; i < nblocks * blocksize; i++)
+ plaintext[i] = i;
+
+ /* Create CFB ciphertext manually. */
+ for (i = 0; i < nblocks * blocksize; i+=blocksize)
+ {
+ encrypt_one (ctx, &ciphertext[i], iv);
+ buf_xor_2dst (iv, &ciphertext[i], &plaintext[i], blocksize);
+ }
+
+ /* Decrypt using bulk CBC and compare result. */
+ bulk_cfb_dec (ctx, iv2, plaintext2, ciphertext, nblocks);
+
+ if (memcmp(plaintext2, plaintext, nblocks * blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (plaintext mismatch, parallel path)",
+ cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+ if (memcmp(iv2, iv, blocksize))
+ {
+ gcry_free(mem);
+#ifdef HAVE_SYSLOG
+ syslog (LOG_USER|LOG_WARNING, "Libgcrypt warning: "
+ "%s-128-CFB test failed (IV mismatch, parallel path)", cipher);
+#endif
+ return "selftest for 128 bit CFB failed - see syslog for details";
+ }
+
+ gcry_free(mem);
+ return NULL;
+}
+
/* Run the self-tests for <block cipher>-CTR-128, tests IV increment of bulk CTR
encryption. Returns NULL on success. */
const char *
diff --git a/cipher/cipher-selftest.h b/cipher/cipher-selftest.h
index 89d79c2..30bc251 100644
--- a/cipher/cipher-selftest.h
+++ b/cipher/cipher-selftest.h
@@ -30,6 +30,11 @@ typedef void (*gcry_cipher_bulk_cbc_dec_t)(void *context, unsigned char *iv,
const void *inbuf_arg,
unsigned int nblocks);
+typedef void (*gcry_cipher_bulk_cfb_dec_t)(void *context, unsigned char *iv,
+ void *outbuf_arg,
+ const void *inbuf_arg,
+ unsigned int nblocks);
+
typedef void (*gcry_cipher_bulk_ctr_enc_t)(void *context, unsigned char *iv,
void *outbuf_arg,
const void *inbuf_arg,
@@ -43,6 +48,14 @@ _gcry_selftest_helper_cbc_128 (const char *cipher, gcry_cipher_setkey_t setkey,
const int nblocks, const int blocksize,
const int context_size);
+/* Helper function for bulk CFB decryption selftest */
+const char *
+_gcry_selftest_helper_cfb_128 (const char *cipher, gcry_cipher_setkey_t setkey,
+ gcry_cipher_encrypt_t encrypt_one,
+ gcry_cipher_bulk_cfb_dec_t bulk_cfb_dec,
+ const int nblocks, const int blocksize,
+ const int context_size);
+
/* Helper function for bulk CTR encryption selftest */
const char *
_gcry_selftest_helper_ctr_128 (const char *cipher, gcry_cipher_setkey_t setkey,
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index 4c81688..9f075ff 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -821,6 +821,115 @@ do_aesni_dec_aligned (const RIJNDAEL_context *ctx,
}
+/* Encrypt four blocks using the Intel AES-NI instructions. Blocks are input
+ * and output through SSE registers xmm1 to xmm4. */
+static void
+do_aesni_enc_vec4 (const RIJNDAEL_context *ctx)
+{
+#define aesenc_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc8\n\t"
+#define aesenc_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd0\n\t"
+#define aesenc_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xd8\n\t"
+#define aesenc_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xe0\n\t"
+#define aesenclast_xmm0_xmm1 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc8\n\t"
+#define aesenclast_xmm0_xmm2 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd0\n\t"
+#define aesenclast_xmm0_xmm3 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xd8\n\t"
+#define aesenclast_xmm0_xmm4 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xe0\n\t"
+ asm volatile ("movdqa (%[key]), %%xmm0\n\t"
+ "pxor %%xmm0, %%xmm1\n\t" /* xmm1 ^= key[0] */
+ "pxor %%xmm0, %%xmm2\n\t" /* xmm2 ^= key[0] */
+ "pxor %%xmm0, %%xmm3\n\t" /* xmm3 ^= key[0] */
+ "pxor %%xmm0, %%xmm4\n\t" /* xmm4 ^= key[0] */
+ "movdqa 0x10(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x20(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x30(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x40(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x50(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x60(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x70(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x80(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0x90(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xa0(%[key]), %%xmm0\n\t"
+ "cmpl $10, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xb0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xc0(%[key]), %%xmm0\n\t"
+ "cmpl $12, %[rounds]\n\t"
+ "jz .Ldeclast%=\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xd0(%[key]), %%xmm0\n\t"
+ aesenc_xmm0_xmm1
+ aesenc_xmm0_xmm2
+ aesenc_xmm0_xmm3
+ aesenc_xmm0_xmm4
+ "movdqa 0xe0(%[key]), %%xmm0\n"
+
+ ".Ldeclast%=:\n\t"
+ aesenclast_xmm0_xmm1
+ aesenclast_xmm0_xmm2
+ aesenclast_xmm0_xmm3
+ aesenclast_xmm0_xmm4
+ : /* no output */
+ : [key] "r" (ctx->keyschenc),
+ [rounds] "r" (ctx->rounds)
+ : "cc", "memory");
+#undef aesenc_xmm0_xmm1
+#undef aesenc_xmm0_xmm2
+#undef aesenc_xmm0_xmm3
+#undef aesenc_xmm0_xmm4
+#undef aesenclast_xmm0_xmm1
+#undef aesenclast_xmm0_xmm2
+#undef aesenclast_xmm0_xmm3
+#undef aesenclast_xmm0_xmm4
+}
+
+
/* Decrypt four blocks using the Intel AES-NI instructions. Blocks are input
* and output through SSE registers xmm1 to xmm4. */
static void
@@ -1685,7 +1794,7 @@ rijndael_decrypt (void *context, byte *b, const byte *a)
/* Bulk decryption of complete blocks in CFB mode. Caller needs to
- make sure that IV is aligned on an unisgned lonhg boundary. This
+ make sure that IV is aligned on an unsigned long boundary. This
function is only intended for the bulk encryption feature of
cipher.c. */
void
@@ -1716,6 +1825,50 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
else if (ctx->use_aesni)
{
aesni_prepare ();
+
+ /* CFB decryption can be parallelized */
+ for ( ;nblocks >= 4; nblocks -= 4)
+ {
+ asm volatile
+ ("movdqu (%[iv]), %%xmm1\n\t" /* load input blocks */
+ "movdqu 0*16(%[inbuf]), %%xmm2\n\t"
+ "movdqu 1*16(%[inbuf]), %%xmm3\n\t"
+ "movdqu 2*16(%[inbuf]), %%xmm4\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm0\n\t" /* update IV */
+ "movdqu %%xmm0, (%[iv])\n\t"
+ : /* No output */
+ : [inbuf] "r" (inbuf), [iv] "r" (iv)
+ : "memory");
+
+ do_aesni_enc_vec4 (ctx);
+
+ asm volatile
+ ("movdqu 0*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm1, 0*16(%[outbuf])\n\t"
+
+ "movdqu 1*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm2\n\t"
+ "movdqu %%xmm2, 1*16(%[outbuf])\n\t"
+
+ "movdqu 2*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm3, 2*16(%[outbuf])\n\t"
+
+ "movdqu 3*16(%[inbuf]), %%xmm5\n\t"
+ "pxor %%xmm5, %%xmm4\n\t"
+ "movdqu %%xmm4, 3*16(%[outbuf])\n\t"
+
+ : /* No output */
+ : [inbuf] "r" (inbuf),
+ [outbuf] "r" (outbuf)
+ : "memory");
+
+ outbuf += 4*BLOCKSIZE;
+ inbuf += 4*BLOCKSIZE;
+ }
+
for ( ;nblocks; nblocks-- )
{
do_aesni_cfb (ctx, 1, iv, outbuf, inbuf);
@@ -1723,6 +1876,7 @@ _gcry_aes_cfb_dec (void *context, unsigned char *iv,
inbuf += BLOCKSIZE;
}
aesni_cleanup ();
+ aesni_cleanup_2_5 ();
}
#endif /*USE_AESNI*/
else
@@ -2035,6 +2189,21 @@ selftest_cbc_128 (void)
}
+/* Run the self-tests for AES-CFB-128, tests bulk CFB decryption.
+ Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+ const int nblocks = 8+2;
+ const int blocksize = BLOCKSIZE;
+ const int context_size = sizeof(RIJNDAEL_context);
+
+ return _gcry_selftest_helper_cfb_128("AES", &rijndael_setkey,
+ &rijndael_encrypt, &_gcry_aes_cfb_dec, nblocks, blocksize,
+ context_size);
+}
+
+
/* Run all the self-tests and return NULL on success. This function
is used for the on-the-fly self-tests. */
static const char *
@@ -2053,6 +2222,9 @@ selftest (void)
if ( (r = selftest_cbc_128 ()) )
return r;
+ if ( (r = selftest_cfb_128 ()) )
+ return r;
+
return r;
}
-----------------------------------------------------------------------
Summary of changes:
cipher/camellia-aesni-avx-amd64.S | 65 ++++++++++++++
cipher/camellia-glue.c | 74 ++++++++++++++++
cipher/cipher-selftest.c | 113 ++++++++++++++++++++++++
cipher/cipher-selftest.h | 13 +++
cipher/cipher.c | 2 +
cipher/rijndael.c | 174 ++++++++++++++++++++++++++++++++++++-
cipher/serpent-sse2-amd64.S | 66 ++++++++++++++
cipher/serpent.c | 88 +++++++++++++++++++
src/cipher.h | 6 ++
9 files changed, 600 insertions(+), 1 deletions(-)
hooks/post-receive
--
The GNU crypto library
http://git.gnupg.org
More information about the Gnupg-commits
mailing list