[PATCH 6/7] camellia-avx2: add partial parallel block processing

Jussi Kivilinna <jussi.kivilinna@iki.fi>
Sun Apr 24 20:40:24 CEST 2022


* cipher/camellia-aesni-avx2-amd64.h: Remove unnecessary vzeroupper
from function entries.
(enc_blk1_32, dec_blk1_32): New.
* cipher/camellia-glue.c (avx_burn_stack_depth)
(avx2_burn_stack_depth): Move outside of bulk functions to deduplicate.
(_gcry_camellia_aesni_avx2_enc_blk1_32)
(_gcry_camellia_aesni_avx2_dec_blk1_32)
(_gcry_camellia_vaes_avx2_enc_blk1_32)
(_gcry_camellia_vaes_avx2_dec_blk1_32)
(_gcry_camellia_gfni_avx2_enc_blk1_32)
(_gcry_camellia_gfni_avx2_dec_blk1_32, camellia_encrypt_blk1_32)
(camellia_decrypt_blk1_32): New.
(_gcry_camellia_ctr_enc, _gcry_camellia_cbc_dec, _gcry_camellia_cfb_dec)
(_gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): Use the new bulk
processing helpers from 'bulkhelp.h' together with
'camellia_encrypt_blk1_32' and 'camellia_decrypt_blk1_32' for partial
parallel processing.
--
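
The core of the change is a pair of crypt_blk1_32-style callbacks that
can process any 1..32 block chunk: they dispatch to the 32-way AVX2
assembly when enough blocks are queued and otherwise fall back to the
plain C single-block function.  A condensed C sketch of that dispatch
follows (names, the threshold and the burn-depth constant are taken
from this patch; the GFNI/VAES variants and the decrypt twin are
omitted for brevity):

    /* Encrypt 1..32 blocks; prefer the 32-way AVX2 path when at least
     * 6 blocks are available, otherwise loop over the scalar C code. */
    static unsigned int
    enc_blk1_32_sketch (const CAMELLIA_context *ctx, byte *out,
                        const byte *in, unsigned int num_blks)
    {
      unsigned int burn = 0;

      if (ctx->use_aesni_avx2 && num_blks >= 6)
        {
          _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, out, in, num_blks);
          return avx2_burn_stack_depth;
        }

      while (num_blks--)
        {
          burn = camellia_encrypt ((void *)ctx, out, in);
          out += CAMELLIA_BLOCK_SIZE;
          in  += CAMELLIA_BLOCK_SIZE;
        }

      return burn;
    }

The bulk mode functions (CTR, CBC, CFB, OCB) then hand any tail that the
full 32-block assembly loops could not consume to the generic helpers in
'bulkhelp.h', which invoke a callback of this shape with whatever block
count remains.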

Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
---
 cipher/camellia-aesni-avx2-amd64.h | 209 +++++++++++++++++++--
 cipher/camellia-glue.c             | 292 ++++++++++++++++++++++-------
 2 files changed, 421 insertions(+), 80 deletions(-)

diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h
index 8cd4b1cd..9cc5621e 100644
--- a/cipher/camellia-aesni-avx2-amd64.h
+++ b/cipher/camellia-aesni-avx2-amd64.h
@@ -1152,8 +1152,6 @@ FUNC_NAME(ctr_enc):
 	movq 8(%rcx), %r11;
 	bswapq %r11;
 
-	vzeroupper;
-
 	cmpl $128, key_bitlength(CTX);
 	movl $32, %r8d;
 	movl $24, %eax;
@@ -1347,8 +1345,6 @@ FUNC_NAME(cbc_dec):
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	vzeroupper;
-
 	movq %rcx, %r9;
 
 	cmpl $128, key_bitlength(CTX);
@@ -1424,8 +1420,6 @@ FUNC_NAME(cfb_dec):
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	vzeroupper;
-
 	cmpl $128, key_bitlength(CTX);
 	movl $32, %r8d;
 	movl $24, %eax;
@@ -1510,8 +1504,6 @@ FUNC_NAME(ocb_enc):
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	vzeroupper;
-
 	subq $(16 * 32 + 4 * 8), %rsp;
 	andq $~63, %rsp;
 	movq %rsp, %rax;
@@ -1684,8 +1676,6 @@ FUNC_NAME(ocb_dec):
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	vzeroupper;
-
 	subq $(16 * 32 + 4 * 8), %rsp;
 	andq $~63, %rsp;
 	movq %rsp, %rax;
@@ -1880,8 +1870,6 @@ FUNC_NAME(ocb_auth):
 	movq %rsp, %rbp;
 	CFI_DEF_CFA_REGISTER(%rbp);
 
-	vzeroupper;
-
 	subq $(16 * 32 + 4 * 8), %rsp;
 	andq $~63, %rsp;
 	movq %rsp, %rax;
@@ -2032,4 +2020,201 @@ FUNC_NAME(ocb_auth):
 	CFI_ENDPROC();
 ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);)
 
+.align 8
+.globl FUNC_NAME(enc_blk1_32)
+ELF(.type   FUNC_NAME(enc_blk1_32),@function;)
+
+FUNC_NAME(enc_blk1_32):
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%ecx: nblocks (1 to 32)
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	movl %ecx, %r9d;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	subq $(16 * 32), %rsp;
+	andq $~63, %rsp;
+	movq %rsp, %rax;
+
+	cmpl $31, %ecx;
+	vpxor %xmm0, %xmm0, %xmm0;
+	ja 1f;
+	jb 2f;
+	  vmovdqu 15 * 32(%rdx), %xmm0;
+	  jmp 2f;
+	1:
+	  vmovdqu 15 * 32(%rdx), %ymm0;
+	2:
+	  vmovdqu %ymm0, (%rax);
+
+	vpbroadcastq (key_table)(CTX), %ymm0;
+	vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+#define LOAD_INPUT(offset, ymm) \
+	cmpl $(1 + 2 * (offset)), %ecx; \
+	jb 2f; \
+	ja 1f; \
+	  vmovdqu (offset) * 32(%rdx), %ymm##_x; \
+	  vpxor %ymm0, %ymm, %ymm; \
+	  jmp 2f; \
+	1: \
+	  vpxor (offset) * 32(%rdx), %ymm0, %ymm;
+
+	LOAD_INPUT(0, ymm15);
+	LOAD_INPUT(1, ymm14);
+	LOAD_INPUT(2, ymm13);
+	LOAD_INPUT(3, ymm12);
+	LOAD_INPUT(4, ymm11);
+	LOAD_INPUT(5, ymm10);
+	LOAD_INPUT(6, ymm9);
+	LOAD_INPUT(7, ymm8);
+	LOAD_INPUT(8, ymm7);
+	LOAD_INPUT(9, ymm6);
+	LOAD_INPUT(10, ymm5);
+	LOAD_INPUT(11, ymm4);
+	LOAD_INPUT(12, ymm3);
+	LOAD_INPUT(13, ymm2);
+	LOAD_INPUT(14, ymm1);
+	vpxor (%rax), %ymm0, %ymm0;
+
+2:
+	call __camellia_enc_blk32;
+
+#define STORE_OUTPUT(ymm, offset) \
+	cmpl $(1 + 2 * (offset)), %r9d; \
+	jb 2f; \
+	ja 1f; \
+	  vmovdqu %ymm##_x, (offset) * 32(%rsi); \
+	  jmp 2f; \
+	1: \
+	  vmovdqu %ymm, (offset) * 32(%rsi);
+
+	STORE_OUTPUT(ymm7, 0);
+	STORE_OUTPUT(ymm6, 1);
+	STORE_OUTPUT(ymm5, 2);
+	STORE_OUTPUT(ymm4, 3);
+	STORE_OUTPUT(ymm3, 4);
+	STORE_OUTPUT(ymm2, 5);
+	STORE_OUTPUT(ymm1, 6);
+	STORE_OUTPUT(ymm0, 7);
+	STORE_OUTPUT(ymm15, 8);
+	STORE_OUTPUT(ymm14, 9);
+	STORE_OUTPUT(ymm13, 10);
+	STORE_OUTPUT(ymm12, 11);
+	STORE_OUTPUT(ymm11, 12);
+	STORE_OUTPUT(ymm10, 13);
+	STORE_OUTPUT(ymm9, 14);
+	STORE_OUTPUT(ymm8, 15);
+
+2:
+	vzeroall;
+
+	leave;
+	CFI_LEAVE();
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size FUNC_NAME(enc_blk1_32),.-FUNC_NAME(enc_blk1_32);)
+
+.align 8
+.globl FUNC_NAME(dec_blk1_32)
+ELF(.type   FUNC_NAME(dec_blk1_32),@function;)
+
+FUNC_NAME(dec_blk1_32):
+	/* input:
+	 *	%rdi: ctx, CTX
+	 *	%rsi: dst (32 blocks)
+	 *	%rdx: src (32 blocks)
+	 *	%ecx: nblocks (1 to 32)
+	 */
+	CFI_STARTPROC();
+
+	pushq %rbp;
+	CFI_PUSH(%rbp);
+	movq %rsp, %rbp;
+	CFI_DEF_CFA_REGISTER(%rbp);
+
+	movl %ecx, %r9d;
+
+	cmpl $128, key_bitlength(CTX);
+	movl $32, %r8d;
+	movl $24, %eax;
+	cmovel %eax, %r8d; /* max */
+
+	subq $(16 * 32), %rsp;
+	andq $~63, %rsp;
+	movq %rsp, %rax;
+
+	cmpl $31, %ecx;
+	vpxor %xmm0, %xmm0, %xmm0;
+	ja 1f;
+	jb 2f;
+	  vmovdqu 15 * 32(%rdx), %xmm0;
+	  jmp 2f;
+	1:
+	  vmovdqu 15 * 32(%rdx), %ymm0;
+	2:
+	  vmovdqu %ymm0, (%rax);
+
+	vpbroadcastq (key_table)(CTX, %r8, 8), %ymm0;
+	vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0;
+
+	LOAD_INPUT(0, ymm15);
+	LOAD_INPUT(1, ymm14);
+	LOAD_INPUT(2, ymm13);
+	LOAD_INPUT(3, ymm12);
+	LOAD_INPUT(4, ymm11);
+	LOAD_INPUT(5, ymm10);
+	LOAD_INPUT(6, ymm9);
+	LOAD_INPUT(7, ymm8);
+	LOAD_INPUT(8, ymm7);
+	LOAD_INPUT(9, ymm6);
+	LOAD_INPUT(10, ymm5);
+	LOAD_INPUT(11, ymm4);
+	LOAD_INPUT(12, ymm3);
+	LOAD_INPUT(13, ymm2);
+	LOAD_INPUT(14, ymm1);
+	vpxor (%rax), %ymm0, %ymm0;
+
+2:
+	call __camellia_dec_blk32;
+
+	STORE_OUTPUT(ymm7, 0);
+	STORE_OUTPUT(ymm6, 1);
+	STORE_OUTPUT(ymm5, 2);
+	STORE_OUTPUT(ymm4, 3);
+	STORE_OUTPUT(ymm3, 4);
+	STORE_OUTPUT(ymm2, 5);
+	STORE_OUTPUT(ymm1, 6);
+	STORE_OUTPUT(ymm0, 7);
+	STORE_OUTPUT(ymm15, 8);
+	STORE_OUTPUT(ymm14, 9);
+	STORE_OUTPUT(ymm13, 10);
+	STORE_OUTPUT(ymm12, 11);
+	STORE_OUTPUT(ymm11, 12);
+	STORE_OUTPUT(ymm10, 13);
+	STORE_OUTPUT(ymm9, 14);
+	STORE_OUTPUT(ymm8, 15);
+
+2:
+	vzeroall;
+
+	leave;
+	CFI_LEAVE();
+	ret_spec_stop;
+	CFI_ENDPROC();
+ELF(.size FUNC_NAME(dec_blk1_32),.-FUNC_NAME(dec_blk1_32);)
+
 #endif /* GCRY_CAMELLIA_AESNI_AVX2_AMD64_H */
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 7f6e92d2..20ab7f7d 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -174,6 +174,10 @@ extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
 extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
 					    const unsigned char *key,
 					    unsigned int keylen) ASM_FUNC_ABI;
+
+static const int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE + 16 +
+                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
+
 #endif
 
 #ifdef USE_AESNI_AVX2
@@ -214,6 +218,22 @@ extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
 					       unsigned char *offset,
 					       unsigned char *checksum,
 					       const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                  unsigned char *out,
+                                                  const unsigned char *in,
+                                                  unsigned int nblocks)
+                                                  ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                  unsigned char *out,
+                                                  const unsigned char *in,
+                                                  unsigned int nblocks)
+                                                  ASM_FUNC_ABI;
+
+static const int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
+                                         2 * sizeof(void *) + ASM_EXTRA_STACK;
+
 #endif
 
 #ifdef USE_VAES_AVX2
@@ -254,6 +274,18 @@ extern void _gcry_camellia_vaes_avx2_ocb_auth(CAMELLIA_context *ctx,
 					      unsigned char *offset,
 					      unsigned char *checksum,
 					      const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+
+extern void _gcry_camellia_vaes_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
 #endif
 
 #ifdef USE_GFNI_AVX2
@@ -294,6 +326,18 @@ extern void _gcry_camellia_gfni_avx2_ocb_auth(CAMELLIA_context *ctx,
 					      unsigned char *offset,
 					      unsigned char *checksum,
 					      const u64 Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_enc_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
+
+extern void _gcry_camellia_gfni_avx2_dec_blk1_32(const CAMELLIA_context *ctx,
+                                                 unsigned char *out,
+                                                 const unsigned char *in,
+                                                 unsigned int nblocks)
+                                                 ASM_FUNC_ABI;
 #endif
 
 static const char *selftest(void);
@@ -475,6 +519,105 @@ camellia_decrypt(void *c, byte *outbuf, const byte *inbuf)
 
 #endif /*!USE_ARM_ASM*/
 
+
+static unsigned int
+camellia_encrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+                          unsigned int num_blks)
+{
+  const CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+
+  gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2 && num_blks >= 3)
+    {
+      /* 3 or more parallel block GFNI processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_gfni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_VAES_AVX2
+  if (ctx->use_vaes_avx2 && num_blks >= 6)
+    {
+      /* 6 or more parallel block VAES processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_vaes_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 && num_blks >= 6)
+    {
+      /* 6 or more parallel block AESNI processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_aesni_avx2_enc_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+
+  while (num_blks)
+    {
+      stack_burn_size = camellia_encrypt((void *)ctx, outbuf, inbuf);
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf += CAMELLIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  return stack_burn_size;
+}
+
+
+static unsigned int
+camellia_decrypt_blk1_32 (const void *priv, byte *outbuf, const byte *inbuf,
+                          unsigned int num_blks)
+{
+  const CAMELLIA_context *ctx = priv;
+  unsigned int stack_burn_size = 0;
+
+  gcry_assert (num_blks <= 32);
+
+#ifdef USE_GFNI_AVX2
+  if (ctx->use_gfni_avx2 && num_blks >= 3)
+    {
+      /* 3 or more parallel block GFNI processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_gfni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_VAES_AVX2
+  if (ctx->use_vaes_avx2 && num_blks >= 6)
+    {
+      /* 6 or more parallel block VAES processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_vaes_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+#ifdef USE_AESNI_AVX2
+  if (ctx->use_aesni_avx2 && num_blks >= 6)
+    {
+      /* 6 or more parallel block AESNI processing is faster than
+       * generic C implementation.  */
+      _gcry_camellia_aesni_avx2_dec_blk1_32 (ctx, outbuf, inbuf, num_blks);
+      return avx2_burn_stack_depth;
+    }
+#endif
+
+  while (num_blks)
+    {
+      stack_burn_size = camellia_decrypt((void *)ctx, outbuf, inbuf);
+      outbuf += CAMELLIA_BLOCK_SIZE;
+      inbuf += CAMELLIA_BLOCK_SIZE;
+      num_blks--;
+    }
+
+  return stack_burn_size;
+}
+
+
 /* Bulk encryption of complete blocks in CTR mode.  This function is only
    intended for the bulk encryption feature of cipher.c.  CTR is expected to be
    of size CAMELLIA_BLOCK_SIZE. */
@@ -486,8 +629,7 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  unsigned char tmpbuf[CAMELLIA_BLOCK_SIZE];
-  int burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+  int burn_stack_depth = 0;
 
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
@@ -517,9 +659,6 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
@@ -547,9 +686,6 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
@@ -559,20 +695,23 @@ _gcry_camellia_ctr_enc(void *context, unsigned char *ctr,
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      /* Encrypt the counter. */
-      Camellia_EncryptBlock(ctx->keybitlength, ctr, ctx->keytable, tmpbuf);
-      /* XOR the input with the encrypted counter and store in output.  */
-      cipher_block_xor(outbuf, tmpbuf, inbuf, CAMELLIA_BLOCK_SIZE);
-      outbuf += CAMELLIA_BLOCK_SIZE;
-      inbuf  += CAMELLIA_BLOCK_SIZE;
-      /* Increment the counter.  */
-      cipher_block_add(ctr, 1, CAMELLIA_BLOCK_SIZE);
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ctr_enc_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+                               nblocks, ctr, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
-  wipememory(tmpbuf, sizeof(tmpbuf));
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk decryption of complete blocks in CBC mode.  This function is only
@@ -585,8 +724,7 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  unsigned char savebuf[CAMELLIA_BLOCK_SIZE];
-  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+  int burn_stack_depth = 0;
 
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
@@ -616,9 +754,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
@@ -645,9 +780,6 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
@@ -656,20 +788,23 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      /* INBUF is needed later and it may be identical to OUTBUF, so store
-         the intermediate result to SAVEBUF.  */
-      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
 
-      cipher_block_xor_n_copy_2(outbuf, savebuf, iv, inbuf,
-                                CAMELLIA_BLOCK_SIZE);
-      inbuf += CAMELLIA_BLOCK_SIZE;
-      outbuf += CAMELLIA_BLOCK_SIZE;
+      nburn = bulk_cbc_dec_128(ctx, camellia_decrypt_blk1_32, outbuf, inbuf,
+                               nblocks, iv, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
-  wipememory(savebuf, sizeof(savebuf));
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk decryption of complete blocks in CFB mode.  This function is only
@@ -682,7 +817,7 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
   CAMELLIA_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth = CAMELLIA_decrypt_stack_burn_size;
+  int burn_stack_depth = 0;
 
 #ifdef USE_AESNI_AVX2
   if (ctx->use_aesni_avx2)
@@ -712,9 +847,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx2)
         {
-          int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE + 16 +
-                                        2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx2_burn_stack_depth)
             burn_stack_depth = avx2_burn_stack_depth;
         }
@@ -741,9 +873,6 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
 
       if (did_use_aesni_avx)
         {
-          int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-                                       2 * sizeof(void *) + ASM_EXTRA_STACK;
-
           if (burn_stack_depth < avx_burn_stack_depth)
             burn_stack_depth = avx_burn_stack_depth;
         }
@@ -752,15 +881,23 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
     }
 #endif
 
-  for ( ;nblocks; nblocks-- )
+  /* Process remaining blocks. */
+  if (nblocks)
     {
-      Camellia_EncryptBlock(ctx->keybitlength, iv, ctx->keytable, iv);
-      cipher_block_xor_n_copy(outbuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
-      outbuf += CAMELLIA_BLOCK_SIZE;
-      inbuf  += CAMELLIA_BLOCK_SIZE;
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_cfb_dec_128(ctx, camellia_encrypt_blk1_32, outbuf, inbuf,
+                               nblocks, iv, tmpbuf,
+                               sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE, &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
     }
 
-  _gcry_burn_stack(burn_stack_depth);
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
 }
 
 /* Bulk encryption/decryption of complete blocks in OCB mode. */
@@ -772,11 +909,9 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
   CAMELLIA_context *ctx = (void *)&c->context.c;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
-  int burn_stack_depth;
+  int burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.data_nblocks;
 
-  burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
-			      CAMELLIA_decrypt_stack_burn_size;
 #else
   (void)c;
   (void)outbuf_arg;
@@ -826,9 +961,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
       if (did_use_aesni_avx2)
 	{
-	  int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
-				      2 * sizeof(void *) + ASM_EXTRA_STACK;
-
 	  if (burn_stack_depth < avx2_burn_stack_depth)
 	    burn_stack_depth = avx2_burn_stack_depth;
 	}
@@ -870,9 +1002,6 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 
       if (did_use_aesni_avx)
 	{
-	  int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-				      2 * sizeof(void *) + ASM_EXTRA_STACK;
-
 	  if (burn_stack_depth < avx_burn_stack_depth)
 	    burn_stack_depth = avx_burn_stack_depth;
 	}
@@ -882,6 +1011,24 @@ _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 #endif
 
 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ocb_crypt_128 (c, ctx, encrypt ? camellia_encrypt_blk1_32
+                                                  : camellia_decrypt_blk1_32,
+                                  outbuf, inbuf, nblocks, &blkn, encrypt,
+                                  tmpbuf, sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                  &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+      nblocks = 0;
+    }
+
   c->u_mode.ocb.data_nblocks = blkn;
 
   if (burn_stack_depth)
@@ -899,10 +1046,8 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
   CAMELLIA_context *ctx = (void *)&c->context.c;
   const unsigned char *abuf = abuf_arg;
-  int burn_stack_depth;
+  int burn_stack_depth = 0;
   u64 blkn = c->u_mode.ocb.aad_nblocks;
-
-  burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
 #else
   (void)c;
   (void)abuf_arg;
@@ -948,9 +1093,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       if (did_use_aesni_avx2)
 	{
-	  int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
-				      2 * sizeof(void *) + ASM_EXTRA_STACK;
-
 	  if (burn_stack_depth < avx2_burn_stack_depth)
 	    burn_stack_depth = avx2_burn_stack_depth;
 	}
@@ -988,9 +1130,6 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 
       if (did_use_aesni_avx)
 	{
-	  int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
-				      2 * sizeof(void *) + ASM_EXTRA_STACK;
-
 	  if (burn_stack_depth < avx_burn_stack_depth)
 	    burn_stack_depth = avx_burn_stack_depth;
 	}
@@ -1000,6 +1139,23 @@ _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 #endif
 
 #if defined(USE_AESNI_AVX) || defined(USE_AESNI_AVX2)
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      byte tmpbuf[CAMELLIA_BLOCK_SIZE * 32];
+      unsigned int tmp_used = CAMELLIA_BLOCK_SIZE;
+      size_t nburn;
+
+      nburn = bulk_ocb_auth_128 (c, ctx, camellia_encrypt_blk1_32,
+                                 abuf, nblocks, &blkn, tmpbuf,
+                                 sizeof(tmpbuf) / CAMELLIA_BLOCK_SIZE,
+                                 &tmp_used);
+      burn_stack_depth = nburn > burn_stack_depth ? nburn : burn_stack_depth;
+
+      wipememory(tmpbuf, tmp_used);
+      nblocks = 0;
+    }
+
   c->u_mode.ocb.aad_nblocks = blkn;
 
   if (burn_stack_depth)
-- 
2.34.1