[PATCH 2/6] Add bulk OCB for Camellia AES-NI/AVX and AES-NI/AVX2 implementations
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Jul 27 11:04:20 CEST 2015
* cipher/camellia-aesni-avx-amd64.S: Add OCB assembly functions.
* cipher/camellia-aesni-avx2-amd64.S: Add OCB assembly functions.
* cipher/camellia-glue.c (_gcry_camellia_aesni_avx_ocb_enc)
(_gcry_camellia_aesni_avx_ocb_dec, _gcry_camellia_aesni_avx_ocb_auth)
(_gcry_camellia_aesni_avx2_ocb_enc, _gcry_camellia_aesni_avx2_ocb_dec)
(_gcry_camellia_aesni_avx2_ocb_auth): New prototypes.
(get_l, _gcry_camellia_ocb_crypt, _gcry_camellia_ocb_auth): New.
* cipher/cipher.c (_gcry_cipher_open_internal): Setup OCB bulk
functions for Camellia.
* src/cipher.h (_gcry_camellia_ocb_crypt)
(_gcry_camellia_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add test-vector for Camellia.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
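Background note for review (not part of the patch itself): the new bulk entry points evaluate the usual OCB per-block relations over 16 (AVX) or 32 (AVX2) blocks at a time, with the per-block L values precomputed into the Ls[] pointer array by the glue code. A minimal single-block C sketch of the encryption step being parallelized is below; ocb_enc_block() and block_encrypt() are placeholder names for illustration only, not functions from the patch.

/* Illustrative only: one OCB encryption step, as computed 16/32 blocks
 * at a time by the bulk assembly.  block_encrypt() stands in for the
 * raw Camellia block cipher; all buffers are 16 bytes. */
extern void block_encrypt (unsigned char *dst, const unsigned char *src);

static void
ocb_enc_block (unsigned char *c, const unsigned char *p,
               unsigned char *offset, unsigned char *checksum,
               const unsigned char *l)
{
  unsigned char tmp[16];
  int i;

  /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
  for (i = 0; i < 16; i++)
    offset[i] ^= l[i];
  /* Checksum_i = Checksum_{i-1} xor P_i */
  for (i = 0; i < 16; i++)
    checksum[i] ^= p[i];
  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
  for (i = 0; i < 16; i++)
    tmp[i] = p[i] ^ offset[i];
  block_encrypt (tmp, tmp);
  for (i = 0; i < 16; i++)
    c[i] = tmp[i] ^ offset[i];
}
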
 cipher/camellia-aesni-avx-amd64.S  |  424 ++++++++++++++++++++++++++++++
 cipher/camellia-aesni-avx2-amd64.S |  503 ++++++++++++++++++++++++++++++++++++
 cipher/camellia-glue.c             |  329 ++++++++++++++++++++++++
 cipher/cipher.c                    |    2
 src/cipher.h                       |    5
 tests/basic.c                      |    9 +
 6 files changed, 1266 insertions(+), 6 deletions(-)
diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index c047a21..5a3a3cb 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -1,6 +1,6 @@
/* camellia-avx-aesni-amd64.S - AES-NI/AVX implementation of Camellia cipher
*
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -1211,6 +1211,428 @@ _gcry_camellia_aesni_avx_cfb_dec:
ret;
ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;)
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rax);
+ movq %r11, (16 * 16 + 1 * 8)(%rax);
+ movq %r12, (16 * 16 + 2 * 8)(%rax);
+ movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+ vmovdqu (%rcx), %xmm14;
+ vmovdqu (%r8), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), %xmm14, %xmm14; \
+ vpxor xreg, %xmm15, %xmm15; \
+ vpxor xreg, %xmm14, xreg; \
+ vmovdqu %xmm14, (n * 16)(%rsi);
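+ /* The chained offsets are stored to the dst buffer as scratch space;
+  * they are XORed back into the cipher output after __camellia_enc_blk16. */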
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm0);
+ vmovdqu %xmm0, (14 * 16)(%rax);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rcx);
+ vmovdqu %xmm15, (%r8);
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor 14 * 16(%rax), %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rax), %r10;
+ movq (16 * 16 + 1 * 8)(%rax), %r11;
+ movq (16 * 16 + 2 * 8)(%rax), %r12;
+ movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (16 blocks)
+ * %rdx: src (16 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[16])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rax);
+ movq %r11, (16 * 16 + 1 * 8)(%rax);
+ movq %r12, (16 * 16 + 2 * 8)(%rax);
+ movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+ vmovdqu (%rcx), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rdx), xreg; \
+ vpxor (lreg), %xmm15, %xmm15; \
+ vpxor xreg, %xmm15, xreg; \
+ vmovdqu %xmm15, (n * 16)(%rsi);
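+ /* Offsets are again stored to dst as scratch; unlike the encryption
+  * path, the checksum is accumulated over the plaintext only after
+  * __camellia_dec_blk16, see below. */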
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm14);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm15, (%rcx);
+
+ movq %r8, %r10;
+
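+ /* Select the key_table offset of the subkey used for input whitening
+  * when decrypting: 24 for 128-bit keys, 32 for 192/256-bit keys. */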
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r9d;
+ cmovel %r9d, %r8d; /* max */
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX, %r8, 8), %xmm15;
+ vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor %xmm14, %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_dec_blk16;
+
+ vpxor 0 * 16(%rsi), %xmm7, %xmm7;
+ vpxor 1 * 16(%rsi), %xmm6, %xmm6;
+ vpxor 2 * 16(%rsi), %xmm5, %xmm5;
+ vpxor 3 * 16(%rsi), %xmm4, %xmm4;
+ vpxor 4 * 16(%rsi), %xmm3, %xmm3;
+ vpxor 5 * 16(%rsi), %xmm2, %xmm2;
+ vpxor 6 * 16(%rsi), %xmm1, %xmm1;
+ vpxor 7 * 16(%rsi), %xmm0, %xmm0;
+ vmovdqu %xmm7, (7 * 16)(%rax);
+ vpxor 8 * 16(%rsi), %xmm15, %xmm15;
+ vpxor 9 * 16(%rsi), %xmm14, %xmm14;
+ vpxor 10 * 16(%rsi), %xmm13, %xmm13;
+ vpxor 11 * 16(%rsi), %xmm12, %xmm12;
+ vpxor 12 * 16(%rsi), %xmm11, %xmm11;
+ vpxor 13 * 16(%rsi), %xmm10, %xmm10;
+ vpxor 14 * 16(%rsi), %xmm9, %xmm9;
+ vpxor 15 * 16(%rsi), %xmm8, %xmm8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vpxor (%r10), %xmm7, %xmm7;
+ vpxor %xmm6, %xmm7, %xmm7;
+ vpxor %xmm5, %xmm7, %xmm7;
+ vpxor %xmm4, %xmm7, %xmm7;
+ vpxor %xmm3, %xmm7, %xmm7;
+ vpxor %xmm2, %xmm7, %xmm7;
+ vpxor %xmm1, %xmm7, %xmm7;
+ vpxor %xmm0, %xmm7, %xmm7;
+ vpxor %xmm15, %xmm7, %xmm7;
+ vpxor %xmm14, %xmm7, %xmm7;
+ vpxor %xmm13, %xmm7, %xmm7;
+ vpxor %xmm12, %xmm7, %xmm7;
+ vpxor %xmm11, %xmm7, %xmm7;
+ vpxor %xmm10, %xmm7, %xmm7;
+ vpxor %xmm9, %xmm7, %xmm7;
+ vpxor %xmm8, %xmm7, %xmm7;
+ vmovdqu %xmm7, (%r10);
+ vmovdqu (7 * 16)(%rax), %xmm7;
+
+ write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
+ %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
+ %xmm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rax), %r10;
+ movq (16 * 16 + 1 * 8)(%rax), %r11;
+ movq (16 * 16 + 2 * 8)(%rax), %r12;
+ movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (16 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[16])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 16 + 4 * 8), %rsp;
+ andq $~31, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 16 + 0 * 8)(%rax);
+ movq %r11, (16 * 16 + 1 * 8)(%rax);
+ movq %r12, (16 * 16 + 2 * 8)(%rax);
+ movq %r13, (16 * 16 + 3 * 8)(%rax);
+
+ vmovdqu (%rdx), %xmm15;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, lreg, xreg) \
+ vmovdqu (n * 16)(%rsi), xreg; \
+ vpxor (lreg), %xmm15, %xmm15; \
+ vpxor xreg, %xmm15, xreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %xmm0);
+ vmovdqu %xmm0, (15 * 16)(%rax);
+ OCB_INPUT(1, %r11, %xmm14);
+ OCB_INPUT(2, %r12, %xmm13);
+ OCB_INPUT(3, %r13, %xmm12);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %xmm11);
+ OCB_INPUT(5, %r11, %xmm10);
+ OCB_INPUT(6, %r12, %xmm9);
+ OCB_INPUT(7, %r13, %xmm8);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(8, %r10, %xmm7);
+ OCB_INPUT(9, %r11, %xmm6);
+ OCB_INPUT(10, %r12, %xmm5);
+ OCB_INPUT(11, %r13, %xmm4);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(12, %r10, %xmm3);
+ OCB_INPUT(13, %r11, %xmm2);
+ OCB_INPUT(14, %r12, %xmm1);
+ OCB_INPUT(15, %r13, %xmm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm15, (%rdx);
+
+ movq %rcx, %r10;
+
+ /* inpack16_pre: */
+ vmovq (key_table)(CTX), %xmm15;
+ vpshufb .Lpack_bswap RIP, %xmm15, %xmm15;
+ vpxor %xmm0, %xmm15, %xmm0;
+ vpxor %xmm1, %xmm15, %xmm1;
+ vpxor %xmm2, %xmm15, %xmm2;
+ vpxor %xmm3, %xmm15, %xmm3;
+ vpxor %xmm4, %xmm15, %xmm4;
+ vpxor %xmm5, %xmm15, %xmm5;
+ vpxor %xmm6, %xmm15, %xmm6;
+ vpxor %xmm7, %xmm15, %xmm7;
+ vpxor %xmm8, %xmm15, %xmm8;
+ vpxor %xmm9, %xmm15, %xmm9;
+ vpxor %xmm10, %xmm15, %xmm10;
+ vpxor %xmm11, %xmm15, %xmm11;
+ vpxor %xmm12, %xmm15, %xmm12;
+ vpxor %xmm13, %xmm15, %xmm13;
+ vpxor %xmm14, %xmm15, %xmm14;
+ vpxor 15 * 16(%rax), %xmm15, %xmm15;
+
+ call __camellia_enc_blk16;
+
+ vpxor %xmm7, %xmm6, %xmm6;
+ vpxor %xmm5, %xmm4, %xmm4;
+ vpxor %xmm3, %xmm2, %xmm2;
+ vpxor %xmm1, %xmm0, %xmm0;
+ vpxor %xmm15, %xmm14, %xmm14;
+ vpxor %xmm13, %xmm12, %xmm12;
+ vpxor %xmm11, %xmm10, %xmm10;
+ vpxor %xmm9, %xmm8, %xmm8;
+
+ vpxor %xmm6, %xmm4, %xmm4;
+ vpxor %xmm2, %xmm0, %xmm0;
+ vpxor %xmm14, %xmm12, %xmm12;
+ vpxor %xmm10, %xmm8, %xmm8;
+
+ vpxor %xmm4, %xmm0, %xmm0;
+ vpxor %xmm12, %xmm8, %xmm8;
+
+ vpxor %xmm0, %xmm8, %xmm0;
+ vpxor (%r10), %xmm0, %xmm0;
+ vmovdqu %xmm0, (%r10);
+
+ vzeroall;
+
+ movq (16 * 16 + 0 * 8)(%rax), %r10;
+ movq (16 * 16 + 1 * 8)(%rax), %r11;
+ movq (16 * 16 + 2 * 8)(%rax), %r12;
+ movq (16 * 16 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;)
+
/*
* IN:
* ab: 64-bit AB state
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index a3fa229..26381df 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -1,6 +1,6 @@
/* camellia-avx2-aesni-amd64.S - AES-NI/AVX2 implementation of Camellia cipher
*
- * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna at iki.fi>
+ * Copyright (C) 2013-2015 Jussi Kivilinna <jussi.kivilinna at iki.fi>
*
* This file is part of Libgcrypt.
*
@@ -1127,8 +1127,8 @@ ELF(.type _gcry_camellia_aesni_avx2_cbc_dec,@function;)
_gcry_camellia_aesni_avx2_cbc_dec:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
* %rcx: iv
*/
@@ -1199,8 +1199,8 @@ ELF(.type _gcry_camellia_aesni_avx2_cfb_dec,@function;)
_gcry_camellia_aesni_avx2_cfb_dec:
/* input:
* %rdi: ctx, CTX
- * %rsi: dst (16 blocks)
- * %rdx: src (16 blocks)
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
* %rcx: iv
*/
@@ -1266,5 +1266,498 @@ _gcry_camellia_aesni_avx2_cfb_dec:
ret;
ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;)
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_enc
+ELF(.type _gcry_camellia_aesni_avx2_ocb_enc,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_enc:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[32])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rax);
+ movq %r11, (16 * 32 + 1 * 8)(%rax);
+ movq %r12, (16 * 32 + 2 * 8)(%rax);
+ movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+ vmovdqu (%rcx), %xmm14;
+ vmovdqu (%r8), %xmm13;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm13, %ymm13; \
+ vpxor yreg, %ymm15, yreg; \
+ vmovdqu %ymm15, (n * 32)(%rsi);
+
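+ /* Each ymm register holds two consecutive blocks, so two L pointers
+  * (l0reg/l1reg) are chained per macro call; the combined offsets are
+  * stored to the dst buffer as scratch space. */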
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (13 * 32)(%rax);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r9), %r10;
+ movq (17 * 8)(%r9), %r11;
+ movq (18 * 8)(%r9), %r12;
+ movq (19 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r9), %r10;
+ movq (21 * 8)(%r9), %r11;
+ movq (22 * 8)(%r9), %r12;
+ movq (23 * 8)(%r9), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r9), %r10;
+ movq (25 * 8)(%r9), %r11;
+ movq (26 * 8)(%r9), %r12;
+ movq (27 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r9), %r10;
+ movq (29 * 8)(%r9), %r11;
+ movq (30 * 8)(%r9), %r12;
+ movq (31 * 8)(%r9), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vextracti128 $1, %ymm13, %xmm15;
+ vmovdqu %xmm14, (%rcx);
+ vpxor %xmm13, %xmm15, %xmm15;
+ vmovdqu %xmm15, (%r8);
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor 13 * 32(%rax), %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rax), %r10;
+ movq (16 * 32 + 1 * 8)(%rax), %r11;
+ movq (16 * 32 + 2 * 8)(%rax), %r12;
+ movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_dec
+ELF(.type _gcry_camellia_aesni_avx2_ocb_dec,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_dec:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: dst (32 blocks)
+ * %rdx: src (32 blocks)
+ * %rcx: offset
+ * %r8 : checksum
+ * %r9 : L pointers (void *L[32])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rax);
+ movq %r11, (16 * 32 + 1 * 8)(%rax);
+ movq %r12, (16 * 32 + 2 * 8)(%rax);
+ movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+ vmovdqu (%rcx), %xmm14;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rdx), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm15, yreg; \
+ vmovdqu %ymm15, (n * 32)(%rsi);
+
+ movq (0 * 8)(%r9), %r10;
+ movq (1 * 8)(%r9), %r11;
+ movq (2 * 8)(%r9), %r12;
+ movq (3 * 8)(%r9), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r9), %r10;
+ movq (5 * 8)(%r9), %r11;
+ movq (6 * 8)(%r9), %r12;
+ movq (7 * 8)(%r9), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm13);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r9), %r10;
+ movq (9 * 8)(%r9), %r11;
+ movq (10 * 8)(%r9), %r12;
+ movq (11 * 8)(%r9), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r9), %r10;
+ movq (13 * 8)(%r9), %r11;
+ movq (14 * 8)(%r9), %r12;
+ movq (15 * 8)(%r9), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r9), %r10;
+ movq (17 * 8)(%r9), %r11;
+ movq (18 * 8)(%r9), %r12;
+ movq (19 * 8)(%r9), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r9), %r10;
+ movq (21 * 8)(%r9), %r11;
+ movq (22 * 8)(%r9), %r12;
+ movq (23 * 8)(%r9), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r9), %r10;
+ movq (25 * 8)(%r9), %r11;
+ movq (26 * 8)(%r9), %r12;
+ movq (27 * 8)(%r9), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r9), %r10;
+ movq (29 * 8)(%r9), %r11;
+ movq (30 * 8)(%r9), %r12;
+ movq (31 * 8)(%r9), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rcx);
+
+ movq %r8, %r10;
+
+ cmpl $128, key_bitlength(CTX);
+ movl $32, %r8d;
+ movl $24, %r9d;
+ cmovel %r9d, %r8d; /* max */
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15;
+ vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor %ymm13, %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_dec_blk32;
+
+ vpxor 0 * 32(%rsi), %ymm7, %ymm7;
+ vpxor 1 * 32(%rsi), %ymm6, %ymm6;
+ vpxor 2 * 32(%rsi), %ymm5, %ymm5;
+ vpxor 3 * 32(%rsi), %ymm4, %ymm4;
+ vpxor 4 * 32(%rsi), %ymm3, %ymm3;
+ vpxor 5 * 32(%rsi), %ymm2, %ymm2;
+ vpxor 6 * 32(%rsi), %ymm1, %ymm1;
+ vpxor 7 * 32(%rsi), %ymm0, %ymm0;
+ vmovdqu %ymm7, (7 * 32)(%rax);
+ vmovdqu %ymm6, (6 * 32)(%rax);
+ vpxor 8 * 32(%rsi), %ymm15, %ymm15;
+ vpxor 9 * 32(%rsi), %ymm14, %ymm14;
+ vpxor 10 * 32(%rsi), %ymm13, %ymm13;
+ vpxor 11 * 32(%rsi), %ymm12, %ymm12;
+ vpxor 12 * 32(%rsi), %ymm11, %ymm11;
+ vpxor 13 * 32(%rsi), %ymm10, %ymm10;
+ vpxor 14 * 32(%rsi), %ymm9, %ymm9;
+ vpxor 15 * 32(%rsi), %ymm8, %ymm8;
+
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+
+ vpxor %ymm5, %ymm7, %ymm7;
+ vpxor %ymm4, %ymm6, %ymm6;
+ vpxor %ymm3, %ymm7, %ymm7;
+ vpxor %ymm2, %ymm6, %ymm6;
+ vpxor %ymm1, %ymm7, %ymm7;
+ vpxor %ymm0, %ymm6, %ymm6;
+ vpxor %ymm15, %ymm7, %ymm7;
+ vpxor %ymm14, %ymm6, %ymm6;
+ vpxor %ymm13, %ymm7, %ymm7;
+ vpxor %ymm12, %ymm6, %ymm6;
+ vpxor %ymm11, %ymm7, %ymm7;
+ vpxor %ymm10, %ymm6, %ymm6;
+ vpxor %ymm9, %ymm7, %ymm7;
+ vpxor %ymm8, %ymm6, %ymm6;
+ vpxor %ymm7, %ymm6, %ymm7;
+
+ vextracti128 $1, %ymm7, %xmm6;
+ vpxor %xmm6, %xmm7, %xmm7;
+ vpxor (%r10), %xmm7, %xmm7;
+ vmovdqu %xmm7, (%r10);
+
+ vmovdqu 7 * 32(%rax), %ymm7;
+ vmovdqu 6 * 32(%rax), %ymm6;
+
+ write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
+ %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
+ %ymm8, %rsi);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rax), %r10;
+ movq (16 * 32 + 1 * 8)(%rax), %r11;
+ movq (16 * 32 + 2 * 8)(%rax), %r12;
+ movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;)
+
+.align 8
+.globl _gcry_camellia_aesni_avx2_ocb_auth
+ELF(.type _gcry_camellia_aesni_avx2_ocb_auth,@function;)
+
+_gcry_camellia_aesni_avx2_ocb_auth:
+ /* input:
+ * %rdi: ctx, CTX
+ * %rsi: abuf (32 blocks)
+ * %rdx: offset
+ * %rcx: checksum
+ * %r8 : L pointers (void *L[32])
+ */
+
+ pushq %rbp;
+ movq %rsp, %rbp;
+
+ vzeroupper;
+
+ subq $(16 * 32 + 4 * 8), %rsp;
+ andq $~63, %rsp;
+ movq %rsp, %rax;
+
+ movq %r10, (16 * 32 + 0 * 8)(%rax);
+ movq %r11, (16 * 32 + 1 * 8)(%rax);
+ movq %r12, (16 * 32 + 2 * 8)(%rax);
+ movq %r13, (16 * 32 + 3 * 8)(%rax);
+
+ vmovdqu (%rdx), %xmm14;
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+
+#define OCB_INPUT(n, l0reg, l1reg, yreg) \
+ vmovdqu (n * 32)(%rsi), yreg; \
+ vpxor (l0reg), %xmm14, %xmm15; \
+ vpxor (l1reg), %xmm15, %xmm14; \
+ vinserti128 $1, %xmm14, %ymm15, %ymm15; \
+ vpxor yreg, %ymm15, yreg;
+
+ movq (0 * 8)(%r8), %r10;
+ movq (1 * 8)(%r8), %r11;
+ movq (2 * 8)(%r8), %r12;
+ movq (3 * 8)(%r8), %r13;
+ OCB_INPUT(0, %r10, %r11, %ymm0);
+ vmovdqu %ymm0, (15 * 32)(%rax);
+ OCB_INPUT(1, %r12, %r13, %ymm0);
+ vmovdqu %ymm0, (14 * 32)(%rax);
+ movq (4 * 8)(%r8), %r10;
+ movq (5 * 8)(%r8), %r11;
+ movq (6 * 8)(%r8), %r12;
+ movq (7 * 8)(%r8), %r13;
+ OCB_INPUT(2, %r10, %r11, %ymm13);
+ OCB_INPUT(3, %r12, %r13, %ymm12);
+ movq (8 * 8)(%r8), %r10;
+ movq (9 * 8)(%r8), %r11;
+ movq (10 * 8)(%r8), %r12;
+ movq (11 * 8)(%r8), %r13;
+ OCB_INPUT(4, %r10, %r11, %ymm11);
+ OCB_INPUT(5, %r12, %r13, %ymm10);
+ movq (12 * 8)(%r8), %r10;
+ movq (13 * 8)(%r8), %r11;
+ movq (14 * 8)(%r8), %r12;
+ movq (15 * 8)(%r8), %r13;
+ OCB_INPUT(6, %r10, %r11, %ymm9);
+ OCB_INPUT(7, %r12, %r13, %ymm8);
+ movq (16 * 8)(%r8), %r10;
+ movq (17 * 8)(%r8), %r11;
+ movq (18 * 8)(%r8), %r12;
+ movq (19 * 8)(%r8), %r13;
+ OCB_INPUT(8, %r10, %r11, %ymm7);
+ OCB_INPUT(9, %r12, %r13, %ymm6);
+ movq (20 * 8)(%r8), %r10;
+ movq (21 * 8)(%r8), %r11;
+ movq (22 * 8)(%r8), %r12;
+ movq (23 * 8)(%r8), %r13;
+ OCB_INPUT(10, %r10, %r11, %ymm5);
+ OCB_INPUT(11, %r12, %r13, %ymm4);
+ movq (24 * 8)(%r8), %r10;
+ movq (25 * 8)(%r8), %r11;
+ movq (26 * 8)(%r8), %r12;
+ movq (27 * 8)(%r8), %r13;
+ OCB_INPUT(12, %r10, %r11, %ymm3);
+ OCB_INPUT(13, %r12, %r13, %ymm2);
+ movq (28 * 8)(%r8), %r10;
+ movq (29 * 8)(%r8), %r11;
+ movq (30 * 8)(%r8), %r12;
+ movq (31 * 8)(%r8), %r13;
+ OCB_INPUT(14, %r10, %r11, %ymm1);
+ OCB_INPUT(15, %r12, %r13, %ymm0);
+#undef OCB_INPUT
+
+ vmovdqu %xmm14, (%rdx);
+
+ movq %rcx, %r10;
+
+ /* inpack32_pre: */
+ vpbroadcastq (key_table)(CTX), %ymm15;
+ vpshufb .Lpack_bswap RIP, %ymm15, %ymm15;
+ vpxor %ymm0, %ymm15, %ymm0;
+ vpxor %ymm1, %ymm15, %ymm1;
+ vpxor %ymm2, %ymm15, %ymm2;
+ vpxor %ymm3, %ymm15, %ymm3;
+ vpxor %ymm4, %ymm15, %ymm4;
+ vpxor %ymm5, %ymm15, %ymm5;
+ vpxor %ymm6, %ymm15, %ymm6;
+ vpxor %ymm7, %ymm15, %ymm7;
+ vpxor %ymm8, %ymm15, %ymm8;
+ vpxor %ymm9, %ymm15, %ymm9;
+ vpxor %ymm10, %ymm15, %ymm10;
+ vpxor %ymm11, %ymm15, %ymm11;
+ vpxor %ymm12, %ymm15, %ymm12;
+ vpxor %ymm13, %ymm15, %ymm13;
+ vpxor 14 * 32(%rax), %ymm15, %ymm14;
+ vpxor 15 * 32(%rax), %ymm15, %ymm15;
+
+ call __camellia_enc_blk32;
+
+ vpxor %ymm7, %ymm6, %ymm6;
+ vpxor %ymm5, %ymm4, %ymm4;
+ vpxor %ymm3, %ymm2, %ymm2;
+ vpxor %ymm1, %ymm0, %ymm0;
+ vpxor %ymm15, %ymm14, %ymm14;
+ vpxor %ymm13, %ymm12, %ymm12;
+ vpxor %ymm11, %ymm10, %ymm10;
+ vpxor %ymm9, %ymm8, %ymm8;
+
+ vpxor %ymm6, %ymm4, %ymm4;
+ vpxor %ymm2, %ymm0, %ymm0;
+ vpxor %ymm14, %ymm12, %ymm12;
+ vpxor %ymm10, %ymm8, %ymm8;
+
+ vpxor %ymm4, %ymm0, %ymm0;
+ vpxor %ymm12, %ymm8, %ymm8;
+
+ vpxor %ymm0, %ymm8, %ymm0;
+
+ vextracti128 $1, %ymm0, %xmm1;
+ vpxor (%r10), %xmm0, %xmm0;
+ vpxor %xmm0, %xmm1, %xmm0;
+ vmovdqu %xmm0, (%r10);
+
+ vzeroall;
+
+ movq (16 * 32 + 0 * 8)(%rax), %r10;
+ movq (16 * 32 + 1 * 8)(%rax), %r11;
+ movq (16 * 32 + 2 * 8)(%rax), %r12;
+ movq (16 * 32 + 3 * 8)(%rax), %r13;
+
+ leave;
+ ret;
+ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;)
+
#endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/
#endif /*__x86_64*/
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index 5032321..197e1b3 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -63,6 +63,7 @@
#include "cipher.h"
#include "camellia.h"
#include "bufhelp.h"
+#include "cipher-internal.h"
#include "cipher-selftest.h"
/* Helper macro to force alignment to 16 bytes. */
@@ -135,6 +136,26 @@ extern void _gcry_camellia_aesni_avx_cfb_dec(CAMELLIA_context *ctx,
const unsigned char *in,
unsigned char *iv) ASM_FUNC_ABI;
+extern void _gcry_camellia_aesni_avx_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[16]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[16]) ASM_FUNC_ABI;
+
extern void _gcry_camellia_aesni_avx_keygen(CAMELLIA_context *ctx,
const unsigned char *key,
unsigned int keylen) ASM_FUNC_ABI;
@@ -158,6 +179,26 @@ extern void _gcry_camellia_aesni_avx2_cfb_dec(CAMELLIA_context *ctx,
unsigned char *out,
const unsigned char *in,
unsigned char *iv) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_enc(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_dec(CAMELLIA_context *ctx,
+ unsigned char *out,
+ const unsigned char *in,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[32]) ASM_FUNC_ABI;
+
+extern void _gcry_camellia_aesni_avx2_ocb_auth(CAMELLIA_context *ctx,
+ const unsigned char *abuf,
+ unsigned char *offset,
+ unsigned char *checksum,
+ const void *Ls[32]) ASM_FUNC_ABI;
#endif
static const char *selftest(void);
@@ -563,6 +604,294 @@ _gcry_camellia_cfb_dec(void *context, unsigned char *iv,
_gcry_burn_stack(burn_stack_depth);
}
+static inline const unsigned char *
+get_l (gcry_cipher_hd_t c, unsigned char *l_tmp, u64 i)
+{
+ unsigned int ntz = _gcry_ctz64 (i);
+
+ if (ntz < OCB_L_TABLE_SIZE)
+ return c->u_mode.ocb.L[ntz];
+ else
+ return _gcry_cipher_ocb_get_l (c, l_tmp, i);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode. */
+void
+_gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+ CAMELLIA_context *ctx = (void *)&c->context.c;
+ unsigned char *outbuf = outbuf_arg;
+ const unsigned char *inbuf = inbuf_arg;
+ unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
+ const unsigned char *l;
+ int burn_stack_depth;
+ u64 blkn = c->u_mode.ocb.data_nblocks;
+
+ burn_stack_depth = encrypt ? CAMELLIA_encrypt_stack_burn_size :
+ CAMELLIA_decrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+ const void *Ls[32];
+ int i;
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
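+ /* (get_l computes L into l_tmp only when ntz(blkn) >= OCB_L_TABLE_SIZE,
+ and at most one block index per chunk can satisfy that, so sharing a
+ single l_tmp buffer across the Ls[] entries is safe.) */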
+ for (i = 0; i < 32; i += 4)
+ {
+ Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 32;
+ outbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+ const void *Ls[16];
+ int i;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
+
+ if (encrypt)
+ _gcry_camellia_aesni_avx_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+ else
+ _gcry_camellia_aesni_avx_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
+ c->u_ctr.ctr, Ls);
+
+ nblocks -= 16;
+ outbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ inbuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ if (encrypt)
+ {
+ for (; nblocks; nblocks--)
+ {
+ l = get_l(c, l_tmp, ++blkn);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+ buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+ Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+ buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+ buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ }
+ }
+ else
+ {
+ for (; nblocks; nblocks--)
+ {
+ l = get_l(c, l_tmp, ++blkn);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ buf_xor_1 (c->u_iv.iv, l, CAMELLIA_BLOCK_SIZE);
+ buf_cpy (l_tmp, inbuf, CAMELLIA_BLOCK_SIZE);
+ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */
+ buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+ Camellia_DecryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+ buf_xor_1 (l_tmp, c->u_iv.iv, CAMELLIA_BLOCK_SIZE);
+ /* Checksum_i = Checksum_{i-1} xor P_i */
+ buf_xor_1 (c->u_ctr.ctr, l_tmp, CAMELLIA_BLOCK_SIZE);
+ buf_cpy (outbuf, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+ inbuf += CAMELLIA_BLOCK_SIZE;
+ outbuf += CAMELLIA_BLOCK_SIZE;
+ }
+ }
+
+ c->u_mode.ocb.data_nblocks = blkn;
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+void
+_gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks)
+{
+ CAMELLIA_context *ctx = (void *)&c->context.c;
+ const unsigned char *abuf = abuf_arg;
+ unsigned char l_tmp[CAMELLIA_BLOCK_SIZE];
+ const unsigned char *l;
+ int burn_stack_depth;
+ u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+ burn_stack_depth = CAMELLIA_encrypt_stack_burn_size;
+
+#ifdef USE_AESNI_AVX2
+ if (ctx->use_aesni_avx2)
+ {
+ int did_use_aesni_avx2 = 0;
+ const void *Ls[32];
+ int i;
+
+ /* Process data in 32 block chunks. */
+ while (nblocks >= 32)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ for (i = 0; i < 32; i += 4)
+ {
+ Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
+
+ _gcry_camellia_aesni_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 32;
+ abuf += 32 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx2 = 1;
+ }
+
+ if (did_use_aesni_avx2)
+ {
+ int avx2_burn_stack_depth = 32 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx2_burn_stack_depth)
+ burn_stack_depth = avx2_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+#ifdef USE_AESNI_AVX
+ if (ctx->use_aesni_avx)
+ {
+ int did_use_aesni_avx = 0;
+ const void *Ls[16];
+ int i;
+
+ /* Process data in 16 block chunks. */
+ while (nblocks >= 16)
+ {
+ /* l_tmp will be used only every 65536-th block. */
+ for (i = 0; i < 16; i += 4)
+ {
+ Ls[i + 0] = get_l(c, l_tmp, blkn + 1);
+ Ls[i + 1] = get_l(c, l_tmp, blkn + 2);
+ Ls[i + 2] = get_l(c, l_tmp, blkn + 3);
+ Ls[i + 3] = get_l(c, l_tmp, blkn + 4);
+ blkn += 4;
+ }
+
+ _gcry_camellia_aesni_avx_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
+ c->u_mode.ocb.aad_sum, Ls);
+
+ nblocks -= 16;
+ abuf += 16 * CAMELLIA_BLOCK_SIZE;
+ did_use_aesni_avx = 1;
+ }
+
+ if (did_use_aesni_avx)
+ {
+ int avx_burn_stack_depth = 16 * CAMELLIA_BLOCK_SIZE +
+ 2 * sizeof(void *) + ASM_EXTRA_STACK;
+
+ if (burn_stack_depth < avx_burn_stack_depth)
+ burn_stack_depth = avx_burn_stack_depth;
+ }
+
+ /* Use generic code to handle smaller chunks... */
+ }
+#endif
+
+ for (; nblocks; nblocks--)
+ {
+ l = get_l(c, l_tmp, ++blkn);
+
+ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+ buf_xor_1 (c->u_mode.ocb.aad_offset, l, CAMELLIA_BLOCK_SIZE);
+ /* Sum_i = Sum_{i-1} xor ENCIPHER(K, A_i xor Offset_i) */
+ buf_xor (l_tmp, c->u_mode.ocb.aad_offset, abuf, CAMELLIA_BLOCK_SIZE);
+ Camellia_EncryptBlock(ctx->keybitlength, l_tmp, ctx->keytable, l_tmp);
+ buf_xor_1 (c->u_mode.ocb.aad_sum, l_tmp, CAMELLIA_BLOCK_SIZE);
+
+ abuf += CAMELLIA_BLOCK_SIZE;
+ }
+
+ c->u_mode.ocb.aad_nblocks = blkn;
+
+ wipememory(&l_tmp, sizeof(l_tmp));
+
+ if (burn_stack_depth)
+ _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
+}
+
/* Run the self-tests for CAMELLIA-CTR-128, tests IV increment of bulk CTR
encryption. Returns NULL on success. */
static const char*
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 7a29824..2d2b0ad 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -535,6 +535,8 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
h->bulk.cbc_dec = _gcry_camellia_cbc_dec;
h->bulk.cfb_dec = _gcry_camellia_cfb_dec;
h->bulk.ctr_enc = _gcry_camellia_ctr_enc;
+ h->bulk.ocb_crypt = _gcry_camellia_ocb_crypt;
+ h->bulk.ocb_auth = _gcry_camellia_ocb_auth;
break;
#endif /*USE_CAMELLIA*/
#ifdef USE_DES
diff --git a/src/cipher.h b/src/cipher.h
index ef183fd..a0aac51 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -177,6 +177,11 @@ void _gcry_camellia_cbc_dec (void *context, unsigned char *iv,
void _gcry_camellia_cfb_dec (void *context, unsigned char *iv,
void *outbuf_arg, const void *inbuf_arg,
size_t nblocks);
+void _gcry_camellia_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+ const void *inbuf_arg, size_t nblocks,
+ int encrypt);
+void _gcry_camellia_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+ size_t nblocks);
/*-- des.c --*/
void _gcry_3des_ctr_enc (void *context, unsigned char *ctr,
diff --git a/tests/basic.c b/tests/basic.c
index 2c664c0..e3f4bfd 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -3335,6 +3335,15 @@ check_ocb_cipher (void)
check_ocb_cipher_largebuf(GCRY_CIPHER_AES256, 32,
"\xfa\x26\xa5\xbf\xf6\x7d\x3a\x8d"
"\xfe\x96\x67\xc9\xc8\x41\x03\x51");
+ check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA128, 16,
+ "\x28\x23\x38\x45\x2b\xfd\x42\x45"
+ "\x43\x64\x7e\x67\x7f\xf4\x8b\xcd");
+ check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA192, 24,
+ "\xee\xca\xe5\x39\x27\x2d\x33\xe7"
+ "\x79\x74\xb0\x1d\x37\x12\xd5\x6c");
+ check_ocb_cipher_largebuf(GCRY_CIPHER_CAMELLIA256, 32,
+ "\x39\x39\xd0\x2d\x05\x68\x74\xee"
+ "\x18\x6b\xea\x3d\x0b\xd3\x58\xae");
}