[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-404-g3ef21e7

by Jussi Kivilinna (cvs at cvs.gnupg.org)
Tue Nov 26 11:25:45 CET 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  3ef21e7e1b8003db9792155044db95f9d9ced184 (commit)
      from  a34448c929b13bfb7b66d69169c89e7319a18b31 (commit)

The revisions listed above that are new to this repository have not
appeared in any other notification email, so we list them in full below.

- Log -----------------------------------------------------------------
commit 3ef21e7e1b8003db9792155044db95f9d9ced184
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Sun Nov 24 17:54:15 2013 +0200

    Camellia: Tweaks for AES-NI implementations
    
    * cipher/camellia-aesni-avx-amd64.S: Align stack to 16 bytes; tweak
    key-setup for small speed up.
    * cipher/camellia-aesni-avx2-amd64.S: Use vmovdqu even with aligned
    stack; reorder vinserti128 instructions; use %rbp for stack frame.
    --
    
    Use of 'vmovdqa' with ymm registers produces noticeable scatter in timing
    measurements. By using 'vmovdqu' instead, repeated measurements produce
    more stable results.
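
    As a minimal illustration (not part of the patch; the register and
    address operands are placeholders), the substitution only changes the
    alignment requirement of the memory operand: 'vmovdqa' requires natural
    alignment (32 bytes for a ymm operand) and raises #GP otherwise, while
    'vmovdqu' accepts any address.

        /* aligned 256-bit store: (%rax) must be 32-byte aligned */
        vmovdqa %ymm0, 0 * 32(%rax);
        /* unaligned 256-bit store: any address is accepted */
        vmovdqu %ymm0, 0 * 32(%rax);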
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
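
For context, the prologue/epilogue pattern these hunks introduce in the AVX
paths is, in condensed form (an illustrative sketch assembled from the hunks
below, not an exact excerpt):

        pushq %rbp;             /* save caller's frame pointer */
        movq %rsp, %rbp;        /* remember the original stack pointer */

        subq $(16 * 16), %rsp;  /* reserve scratch space */
        andq $~31, %rsp;        /* round %rsp down to a 32-byte boundary */
        movq %rsp, %rax;        /* %rax = aligned scratch area */

        /* ... cipher body uses 0(%rax) .. (15 * 16)(%rax) ... */

        leave;                  /* equivalent to: movq %rbp, %rsp; popq %rbp */
        ret;

Because 'leave' restores %rsp from %rbp, the previous explicit
'addq $(16 * 16), %rsp' before writing the output is no longer needed. The
AVX2 paths follow the same pattern but reserve 16 * 32 bytes and use
'andq $~63, %rsp' for a 64-byte boundary.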

diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S
index ffb1aed..38ec7a3 100644
--- a/cipher/camellia-aesni-avx-amd64.S
+++ b/cipher/camellia-aesni-avx-amd64.S
@@ -958,9 +958,13 @@ _gcry_camellia_aesni_avx_ctr_enc:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	vzeroupper;
 
 	subq $(16 * 16), %rsp;
+	andq $~31, %rsp;
 	movq %rsp, %rax;
 
 	vmovdqa .Lbswap128_mask RIP, %xmm14;
@@ -1033,8 +1037,6 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
 	call __camellia_enc_blk16;
 
-	addq $(16 * 16), %rsp;
-
 	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
 	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
 	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
@@ -1058,6 +1060,7 @@ _gcry_camellia_aesni_avx_ctr_enc:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;
 
@@ -1073,6 +1076,9 @@ _gcry_camellia_aesni_avx_cbc_dec:
 	 *	%rcx: iv
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	vzeroupper;
 
 	movq %rcx, %r9;
@@ -1087,12 +1093,11 @@ _gcry_camellia_aesni_avx_cbc_dec:
 		     %xmm15, %rdx, (key_table)(CTX, %r8, 8));
 
 	subq $(16 * 16), %rsp;
+	andq $~31, %rsp;
 	movq %rsp, %rax;
 
 	call __camellia_dec_blk16;
 
-	addq $(16 * 16), %rsp;
-
 	/* XOR output with IV */
 	vpxor (%r9), %xmm7, %xmm7;
 	vpxor (0 * 16)(%rdx), %xmm6, %xmm6;
@@ -1112,6 +1117,7 @@ _gcry_camellia_aesni_avx_cbc_dec:
 	vpxor (14 * 16)(%rdx), %xmm8, %xmm8;
 	movq (15 * 16 + 0)(%rdx), %r10;
 	movq (15 * 16 + 8)(%rdx), %r11;
+
 	write_output(%xmm7, %xmm6, %xmm5, %xmm4, %xmm3, %xmm2, %xmm1, %xmm0,
 		     %xmm15, %xmm14, %xmm13, %xmm12, %xmm11, %xmm10, %xmm9,
 		     %xmm8, %rsi);
@@ -1122,6 +1128,7 @@ _gcry_camellia_aesni_avx_cbc_dec:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;
 
@@ -1137,9 +1144,13 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	 *	%rcx: iv
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	vzeroupper;
 
 	subq $(16 * 16), %rsp;
+	andq $~31, %rsp;
 	movq %rsp, %rax;
 
 	/* inpack16_pre: */
@@ -1166,8 +1177,6 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	call __camellia_enc_blk16;
 
-	addq $(16 * 16), %rsp;
-
 	vpxor 0 * 16(%rdx), %xmm7, %xmm7;
 	vpxor 1 * 16(%rdx), %xmm6, %xmm6;
 	vpxor 2 * 16(%rdx), %xmm5, %xmm5;
@@ -1191,6 +1200,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;
 
@@ -1199,7 +1209,7 @@ _gcry_camellia_aesni_avx_cfb_dec:
  *  ab: 64-bit AB state
  *  cd: 64-bit CD state
  */
-#define camellia_f(ab, x, t0, t1, t2, t3, t4, sbox2mask, sbox4mask, \
+#define camellia_f(ab, x, t0, t1, t2, t3, t4, inv_shift_row, sbox4mask, \
 		   _0f0f0f0fmask, pre_s1lo_mask, pre_s1hi_mask, key) \
 	vmovq key, t0; \
 	vpxor x, x, t3; \
@@ -1221,37 +1231,33 @@ _gcry_camellia_aesni_avx_cfb_dec:
 	\
 	vmovdqa .Lpost_tf_lo_s1 RIP, t0; \
 	vmovdqa .Lpost_tf_hi_s1 RIP, t1; \
-	vmovq .Lsbox3_output_mask RIP, t4; \
 	\
 	/* prefilter sboxes */ \
 	filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \
 	\
 	/* AES subbytes + AES shift rows + AES inv shift rows */ \
 	vaesenclast t3, x, x; \
-	vpshufb .Linv_shift_row RIP, x, x; \
 	\
 	/* postfilter sboxes */ \
 	filter_8bit(x, t0, t1, _0f0f0f0fmask, t2); \
 	\
 	/* output rotation for sbox2 (<<< 1) */ \
 	/* output rotation for sbox3 (>>> 1) */ \
-	vpor sbox2mask, t4, t2; \
-	vpand x, sbox2mask, t0; \
-	vpand x, t4, t1; \
-	vpaddb x, x, t2; \
-	vpshufb .Lsp1110111044044404mask RIP, x, t4; \
-	vpshufb .Lsp0044440410011110mask RIP, x, x; \
-	vpsrlw $7, t0, t0; \
+	vpshufb inv_shift_row, x, t1; \
+	vpshufb .Lsp0044440444044404mask RIP, x, t4; \
+	vpshufb .Lsp1110111010011110mask RIP, x, x; \
+	vpaddb t1, t1, t2; \
+	vpsrlw $7, t1, t0; \
 	vpsllw $7, t1, t3; \
-	vpsrlw $1, t1, t1; \
 	vpor t0, t2, t0; \
+	vpsrlw $1, t1, t1; \
 	vpshufb .Lsp0222022222000222mask RIP, t0, t0; \
 	vpor t1, t3, t1; \
-	vpshufb .Lsp3033303303303033mask RIP, t1, t1; \
 	\
 	vpxor x, t4, t4; \
-	vpxor t1, t0, t0; \
+	vpshufb .Lsp3033303303303033mask RIP, t1, t1; \
 	vpxor t4, t0, t0; \
+	vpxor t1, t0, t0; \
 	vpsrldq $8, t0, x; \
 	vpxor t0, x, x;
 
@@ -1270,22 +1276,21 @@ _gcry_camellia_aesni_avx_cfb_dec:
 .data
 
 .align 16
-.Lsp1110111044044404mask:
-	.long 0x000000ff, 0x000000ff;
-	.long 0x0101ff01, 0x0101ff01;
-.Lsp0044440410011110mask:
+.Linv_shift_row_and_unpcklbw:
+	.byte 0x00, 0xff, 0x0d, 0xff, 0x0a, 0xff, 0x07, 0xff
+	.byte 0x04, 0xff, 0x01, 0xff, 0x0e, 0xff, 0x0b, 0xff
+.Lsp0044440444044404mask:
 	.long 0xffff0404, 0x0404ff04;
-	.long 0x07ffff07, 0x070707ff;
+	.long 0x0d0dff0d, 0x0d0dff0d;
+.Lsp1110111010011110mask:
+	.long 0x000000ff, 0x000000ff;
+	.long 0x0bffff0b, 0x0b0b0bff;
 .Lsp0222022222000222mask:
-	.long 0xff030303, 0xff030303;
-	.long 0x0606ffff, 0xff060606;
+	.long 0xff060606, 0xff060606;
+	.long 0x0c0cffff, 0xff0c0c0c;
 .Lsp3033303303303033mask:
-	.long 0x02ff0202, 0x02ff0202;
-	.long 0xff0505ff, 0x05ff0505;
-.Lsbox2_output_mask:
-	.byte 0x00, 0x00, 0x00, 0xff, 0x00, 0x00, 0xff, 0x00;
-.Lsbox3_output_mask:
-	.byte 0x00, 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00;
+	.long 0x04ff0404, 0x04ff0404;
+	.long 0xff0a0aff, 0x0aff0a0a;
 .Lsbox4_input_mask:
 	.byte 0x00, 0xff, 0x00, 0x00, 0xff, 0x00, 0x00, 0x00;
 .Lsigma1:
@@ -1316,7 +1321,7 @@ __camellia_avx_setup128:
 
 	vpshufb .Lbswap128_mask RIP, KL128, KL128;
 
-	vmovq .Lsbox2_output_mask RIP, %xmm11;
+	vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11;
 	vmovq .Lsbox4_input_mask RIP, %xmm12;
 	vbroadcastss .L0f0f0f0f RIP, %xmm13;
 	vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14;
@@ -1663,7 +1668,7 @@ __camellia_avx_setup256:
 	vpshufb .Lbswap128_mask RIP, KL128, KL128;
 	vpshufb .Lbswap128_mask RIP, KR128, KR128;
 
-	vmovq .Lsbox2_output_mask RIP, %xmm11;
+	vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11;
 	vmovq .Lsbox4_input_mask RIP, %xmm12;
 	vbroadcastss .L0f0f0f0f RIP, %xmm13;
 	vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14;
diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S
index 65c923e..1a89ff4 100644
--- a/cipher/camellia-aesni-avx2-amd64.S
+++ b/cipher/camellia-aesni-avx2-amd64.S
@@ -124,15 +124,15 @@
 	vextracti128 $1, x5, t5##_x; \
 	vaesenclast t4##_x, x0##_x, x0##_x; \
 	vaesenclast t4##_x, t0##_x, t0##_x; \
-	vinserti128 $1, t0##_x, x0, x0; \
 	vaesenclast t4##_x, x7##_x, x7##_x; \
 	vaesenclast t4##_x, t1##_x, t1##_x; \
-	vinserti128 $1, t1##_x, x7, x7; \
 	vaesenclast t4##_x, x3##_x, x3##_x; \
 	vaesenclast t4##_x, t3##_x, t3##_x; \
-	vinserti128 $1, t3##_x, x3, x3; \
 	vaesenclast t4##_x, x6##_x, x6##_x; \
 	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t0##_x, x0, x0; \
+	vinserti128 $1, t1##_x, x7, x7; \
+	vinserti128 $1, t3##_x, x3, x3; \
 	vinserti128 $1, t2##_x, x6, x6; \
 	vextracti128 $1, x1, t3##_x; \
 	vextracti128 $1, x4, t2##_x; \
@@ -140,15 +140,15 @@
 	vbroadcasti128 .Lpost_tf_hi_s1 RIP, t1; \
 	vaesenclast t4##_x, x2##_x, x2##_x; \
 	vaesenclast t4##_x, t6##_x, t6##_x; \
-	vinserti128 $1, t6##_x, x2, x2; \
 	vaesenclast t4##_x, x5##_x, x5##_x; \
 	vaesenclast t4##_x, t5##_x, t5##_x; \
-	vinserti128 $1, t5##_x, x5, x5; \
 	vaesenclast t4##_x, x1##_x, x1##_x; \
 	vaesenclast t4##_x, t3##_x, t3##_x; \
-	vinserti128 $1, t3##_x, x1, x1; \
 	vaesenclast t4##_x, x4##_x, x4##_x; \
 	vaesenclast t4##_x, t2##_x, t2##_x; \
+	vinserti128 $1, t6##_x, x2, x2; \
+	vinserti128 $1, t5##_x, x5, x5; \
+	vinserti128 $1, t3##_x, x1, x1; \
 	vinserti128 $1, t2##_x, x4, x4; \
 	\
 	/* postfilter sboxes 1 and 4 */ \
@@ -249,14 +249,14 @@
 	roundsm32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		  y6, y7, mem_cd, (key_table + (i) * 8)(CTX)); \
 	\
-	vmovdqa x0, 4 * 32(mem_cd); \
-	vmovdqa x1, 5 * 32(mem_cd); \
-	vmovdqa x2, 6 * 32(mem_cd); \
-	vmovdqa x3, 7 * 32(mem_cd); \
-	vmovdqa x4, 0 * 32(mem_cd); \
-	vmovdqa x5, 1 * 32(mem_cd); \
-	vmovdqa x6, 2 * 32(mem_cd); \
-	vmovdqa x7, 3 * 32(mem_cd); \
+	vmovdqu x0, 4 * 32(mem_cd); \
+	vmovdqu x1, 5 * 32(mem_cd); \
+	vmovdqu x2, 6 * 32(mem_cd); \
+	vmovdqu x3, 7 * 32(mem_cd); \
+	vmovdqu x4, 0 * 32(mem_cd); \
+	vmovdqu x5, 1 * 32(mem_cd); \
+	vmovdqu x6, 2 * 32(mem_cd); \
+	vmovdqu x7, 3 * 32(mem_cd); \
 	\
 	roundsm32(x4, x5, x6, x7, x0, x1, x2, x3, y0, y1, y2, y3, y4, y5, \
 		  y6, y7, mem_ab, (key_table + ((i) + (dir)) * 8)(CTX)); \
@@ -267,14 +267,14 @@
 
 #define store_ab_state(x0, x1, x2, x3, x4, x5, x6, x7, mem_ab) \
 	/* Store new AB state */ \
-	vmovdqa x4, 4 * 32(mem_ab); \
-	vmovdqa x5, 5 * 32(mem_ab); \
-	vmovdqa x6, 6 * 32(mem_ab); \
-	vmovdqa x7, 7 * 32(mem_ab); \
-	vmovdqa x0, 0 * 32(mem_ab); \
-	vmovdqa x1, 1 * 32(mem_ab); \
-	vmovdqa x2, 2 * 32(mem_ab); \
-	vmovdqa x3, 3 * 32(mem_ab);
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab);
 
 #define enc_rounds32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \
 		      y6, y7, mem_ab, mem_cd, i) \
@@ -356,13 +356,13 @@
 	\
 	vpxor l4, t0, l4; \
 	vpbroadcastd krr, t0; /* only lowest 32-bit used */ \
-	vmovdqa l4, 4 * 32(l); \
+	vmovdqu l4, 4 * 32(l); \
 	vpxor l5, t1, l5; \
-	vmovdqa l5, 5 * 32(l); \
+	vmovdqu l5, 5 * 32(l); \
 	vpxor l6, t2, l6; \
-	vmovdqa l6, 6 * 32(l); \
+	vmovdqu l6, 6 * 32(l); \
 	vpxor l7, t3, l7; \
-	vmovdqa l7, 7 * 32(l); \
+	vmovdqu l7, 7 * 32(l); \
 	\
 	/* \
 	 * t2 = krr; \
@@ -387,11 +387,11 @@
 	vpxor 1 * 32(r), t1, t1; \
 	vpxor 2 * 32(r), t2, t2; \
 	vpxor 3 * 32(r), t3, t3; \
-	vmovdqa t0, 0 * 32(r); \
+	vmovdqu t0, 0 * 32(r); \
 	vpbroadcastd krl, t0; /* only lowest 32-bit used */ \
-	vmovdqa t1, 1 * 32(r); \
-	vmovdqa t2, 2 * 32(r); \
-	vmovdqa t3, 3 * 32(r); \
+	vmovdqu t1, 1 * 32(r); \
+	vmovdqu t2, 2 * 32(r); \
+	vmovdqu t3, 3 * 32(r); \
 	\
 	/* \
 	 * t2 = krl; \
@@ -417,11 +417,11 @@
 	vpxor 5 * 32(r), t1, t1; \
 	vpxor 6 * 32(r), t2, t2; \
 	vpxor 7 * 32(r), t3, t3; \
-	vmovdqa t0, 4 * 32(r); \
+	vmovdqu t0, 4 * 32(r); \
 	vpbroadcastd klr, t0; /* only lowest 32-bit used */ \
-	vmovdqa t1, 5 * 32(r); \
-	vmovdqa t2, 6 * 32(r); \
-	vmovdqa t3, 7 * 32(r); \
+	vmovdqu t1, 5 * 32(r); \
+	vmovdqu t2, 6 * 32(r); \
+	vmovdqu t3, 7 * 32(r); \
 	\
 	/* \
 	 * t0 = klr; \
@@ -443,13 +443,13 @@
 	vpor l7, t3, t3; \
 	\
 	vpxor l0, t0, l0; \
-	vmovdqa l0, 0 * 32(l); \
+	vmovdqu l0, 0 * 32(l); \
 	vpxor l1, t1, l1; \
-	vmovdqa l1, 1 * 32(l); \
+	vmovdqu l1, 1 * 32(l); \
 	vpxor l2, t2, l2; \
-	vmovdqa l2, 2 * 32(l); \
+	vmovdqu l2, 2 * 32(l); \
 	vpxor l3, t3, l3; \
-	vmovdqa l3, 3 * 32(l);
+	vmovdqu l3, 3 * 32(l);
 
 #define transpose_4x4(x0, x1, x2, x3, t1, t2) \
 	vpunpckhdq x1, x0, t2; \
@@ -466,20 +466,20 @@
 
 #define byteslice_16x16b_fast(a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, \
 			      a3, b3, c3, d3, st0, st1) \
-	vmovdqa d2, st0; \
-	vmovdqa d3, st1; \
+	vmovdqu d2, st0; \
+	vmovdqu d3, st1; \
 	transpose_4x4(a0, a1, a2, a3, d2, d3); \
 	transpose_4x4(b0, b1, b2, b3, d2, d3); \
-	vmovdqa st0, d2; \
-	vmovdqa st1, d3; \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
 	\
-	vmovdqa a0, st0; \
-	vmovdqa a1, st1; \
+	vmovdqu a0, st0; \
+	vmovdqu a1, st1; \
 	transpose_4x4(c0, c1, c2, c3, a0, a1); \
 	transpose_4x4(d0, d1, d2, d3, a0, a1); \
 	\
 	vbroadcasti128 .Lshufb_16x16b RIP, a0; \
-	vmovdqa st1, a1; \
+	vmovdqu st1, a1; \
 	vpshufb a0, a2, a2; \
 	vpshufb a0, a3, a3; \
 	vpshufb a0, b0, b0; \
@@ -495,22 +495,22 @@
 	vpshufb a0, d1, d1; \
 	vpshufb a0, d2, d2; \
 	vpshufb a0, d3, d3; \
-	vmovdqa d3, st1; \
-	vmovdqa st0, d3; \
+	vmovdqu d3, st1; \
+	vmovdqu st0, d3; \
 	vpshufb a0, d3, a0; \
-	vmovdqa d2, st0; \
+	vmovdqu d2, st0; \
 	\
 	transpose_4x4(a0, b0, c0, d0, d2, d3); \
 	transpose_4x4(a1, b1, c1, d1, d2, d3); \
-	vmovdqa st0, d2; \
-	vmovdqa st1, d3; \
+	vmovdqu st0, d2; \
+	vmovdqu st1, d3; \
 	\
-	vmovdqa b0, st0; \
-	vmovdqa b1, st1; \
+	vmovdqu b0, st0; \
+	vmovdqu b1, st1; \
 	transpose_4x4(a2, b2, c2, d2, b0, b1); \
 	transpose_4x4(a3, b3, c3, d3, b0, b1); \
-	vmovdqa st0, b0; \
-	vmovdqa st1, b1; \
+	vmovdqu st0, b0; \
+	vmovdqu st1, b1; \
 	/* does not adjust output bytes inside vectors */
 
 /* load blocks to registers and apply pre-whitening */
@@ -542,22 +542,22 @@
 	byteslice_16x16b_fast(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, \
 			      y4, y5, y6, y7, (mem_ab), (mem_cd)); \
 	\
-	vmovdqa x0, 0 * 32(mem_ab); \
-	vmovdqa x1, 1 * 32(mem_ab); \
-	vmovdqa x2, 2 * 32(mem_ab); \
-	vmovdqa x3, 3 * 32(mem_ab); \
-	vmovdqa x4, 4 * 32(mem_ab); \
-	vmovdqa x5, 5 * 32(mem_ab); \
-	vmovdqa x6, 6 * 32(mem_ab); \
-	vmovdqa x7, 7 * 32(mem_ab); \
-	vmovdqa y0, 0 * 32(mem_cd); \
-	vmovdqa y1, 1 * 32(mem_cd); \
-	vmovdqa y2, 2 * 32(mem_cd); \
-	vmovdqa y3, 3 * 32(mem_cd); \
-	vmovdqa y4, 4 * 32(mem_cd); \
-	vmovdqa y5, 5 * 32(mem_cd); \
-	vmovdqa y6, 6 * 32(mem_cd); \
-	vmovdqa y7, 7 * 32(mem_cd);
+	vmovdqu x0, 0 * 32(mem_ab); \
+	vmovdqu x1, 1 * 32(mem_ab); \
+	vmovdqu x2, 2 * 32(mem_ab); \
+	vmovdqu x3, 3 * 32(mem_ab); \
+	vmovdqu x4, 4 * 32(mem_ab); \
+	vmovdqu x5, 5 * 32(mem_ab); \
+	vmovdqu x6, 6 * 32(mem_ab); \
+	vmovdqu x7, 7 * 32(mem_ab); \
+	vmovdqu y0, 0 * 32(mem_cd); \
+	vmovdqu y1, 1 * 32(mem_cd); \
+	vmovdqu y2, 2 * 32(mem_cd); \
+	vmovdqu y3, 3 * 32(mem_cd); \
+	vmovdqu y4, 4 * 32(mem_cd); \
+	vmovdqu y5, 5 * 32(mem_cd); \
+	vmovdqu y6, 6 * 32(mem_cd); \
+	vmovdqu y7, 7 * 32(mem_cd);
 
 /* de-byteslice, apply post-whitening and store blocks */
 #define outunpack32(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, \
@@ -565,7 +565,7 @@
 	byteslice_16x16b_fast(y0, y4, x0, x4, y1, y5, x1, x5, y2, y6, x2, x6, \
 			      y3, y7, x3, x7, stack_tmp0, stack_tmp1); \
 	\
-	vmovdqa x0, stack_tmp0; \
+	vmovdqu x0, stack_tmp0; \
 	\
 	vpbroadcastq key, x0; \
 	vpshufb .Lpack_bswap RIP, x0, x0; \
@@ -800,14 +800,14 @@ __camellia_enc_blk32:
 
 .Lenc_done:
 	/* load CD for output */
-	vmovdqa 0 * 32(%rcx), %ymm8;
-	vmovdqa 1 * 32(%rcx), %ymm9;
-	vmovdqa 2 * 32(%rcx), %ymm10;
-	vmovdqa 3 * 32(%rcx), %ymm11;
-	vmovdqa 4 * 32(%rcx), %ymm12;
-	vmovdqa 5 * 32(%rcx), %ymm13;
-	vmovdqa 6 * 32(%rcx), %ymm14;
-	vmovdqa 7 * 32(%rcx), %ymm15;
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
 
 	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
@@ -887,14 +887,14 @@ __camellia_dec_blk32:
 		     %ymm15, %rax, %rcx, 0);
 
 	/* load CD for output */
-	vmovdqa 0 * 32(%rcx), %ymm8;
-	vmovdqa 1 * 32(%rcx), %ymm9;
-	vmovdqa 2 * 32(%rcx), %ymm10;
-	vmovdqa 3 * 32(%rcx), %ymm11;
-	vmovdqa 4 * 32(%rcx), %ymm12;
-	vmovdqa 5 * 32(%rcx), %ymm13;
-	vmovdqa 6 * 32(%rcx), %ymm14;
-	vmovdqa 7 * 32(%rcx), %ymm15;
+	vmovdqu 0 * 32(%rcx), %ymm8;
+	vmovdqu 1 * 32(%rcx), %ymm9;
+	vmovdqu 2 * 32(%rcx), %ymm10;
+	vmovdqu 3 * 32(%rcx), %ymm11;
+	vmovdqu 4 * 32(%rcx), %ymm12;
+	vmovdqu 5 * 32(%rcx), %ymm13;
+	vmovdqu 6 * 32(%rcx), %ymm14;
+	vmovdqu 7 * 32(%rcx), %ymm15;
 
 	outunpack32(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
 		    %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
@@ -937,14 +937,16 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	 *	%rcx: iv (big endian, 128bit)
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	movq 8(%rcx), %r11;
 	bswapq %r11;
 
 	vzeroupper;
 
-	movq %rsp, %r10;
 	subq $(16 * 32), %rsp;
-	andq $~31, %rsp;
+	andq $~63, %rsp;
 	movq %rsp, %rax;
 
 	vpcmpeqd %ymm15, %ymm15, %ymm15;
@@ -958,7 +960,7 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	vbroadcasti128 .Lbswap128_mask RIP, %ymm14;
 	vinserti128 $1, %xmm0, %ymm1, %ymm0;
 	vpshufb %ymm14, %ymm0, %ymm13;
-	vmovdqa %ymm13, 15 * 32(%rax);
+	vmovdqu %ymm13, 15 * 32(%rax);
 
 	/* check need for handling 64-bit overflow and carry */
 	cmpq $(0xffffffffffffffff - 32), %r11;
@@ -968,10 +970,10 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	vpaddq %ymm15, %ymm15, %ymm15; /* ab: -2:0 ; cd: -2:0 */
 	vpsubq %ymm15, %ymm0, %ymm0;
 	vpshufb %ymm14, %ymm0, %ymm13;
-	vmovdqa %ymm13, 14 * 32(%rax);
+	vmovdqu %ymm13, 14 * 32(%rax);
 	vpsubq %ymm15, %ymm0, %ymm0;
 	vpshufb %ymm14, %ymm0, %ymm13;
-	vmovdqa %ymm13, 13 * 32(%rax);
+	vmovdqu %ymm13, 13 * 32(%rax);
 	vpsubq %ymm15, %ymm0, %ymm0;
 	vpshufb %ymm14, %ymm0, %ymm12;
 	vpsubq %ymm15, %ymm0, %ymm0;
@@ -1010,11 +1012,11 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 	inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le1 ; cd: le2 */
 	inc_le128(%ymm0, %ymm15, %ymm13); /* ab: le2 ; cd: le3 */
 	vpshufb %ymm14, %ymm0, %ymm13;
-	vmovdqa %ymm13, 14 * 32(%rax);
+	vmovdqu %ymm13, 14 * 32(%rax);
 	inc_le128(%ymm0, %ymm15, %ymm13);
 	inc_le128(%ymm0, %ymm15, %ymm13);
 	vpshufb %ymm14, %ymm0, %ymm13;
-	vmovdqa %ymm13, 13 * 32(%rax);
+	vmovdqu %ymm13, 13 * 32(%rax);
 	inc_le128(%ymm0, %ymm15, %ymm13);
 	inc_le128(%ymm0, %ymm15, %ymm13);
 	vpshufb %ymm14, %ymm0, %ymm12;
@@ -1083,8 +1085,6 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
 	call __camellia_enc_blk32;
 
-	movq %r10, %rsp;
-
 	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
 	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
 	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
@@ -1109,6 +1109,7 @@ _gcry_camellia_aesni_avx2_ctr_enc:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;
 
@@ -1124,6 +1125,9 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	 *	%rcx: iv
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	vzeroupper;
 
 	movq %rcx, %r9;
@@ -1133,9 +1137,8 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	movl $24, %eax;
 	cmovel %eax, %r8d; /* max */
 
-	movq %rsp, %r10;
 	subq $(16 * 32), %rsp;
-	andq $~31, %rsp;
+	andq $~63, %rsp;
 	movq %rsp, %rax;
 
 	inpack32_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
@@ -1145,11 +1148,11 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	call __camellia_dec_blk32;
 
 	/* XOR output with IV */
-	vmovdqa %ymm8, (%rax);
+	vmovdqu %ymm8, (%rax);
 	vmovdqu (%r9), %xmm8;
 	vinserti128 $1, (%rdx), %ymm8, %ymm8;
 	vpxor %ymm8, %ymm7, %ymm7;
-	vmovdqa (%rax), %ymm8;
+	vmovdqu (%rax), %ymm8;
 	vpxor (0 * 32 + 16)(%rdx), %ymm6, %ymm6;
 	vpxor (1 * 32 + 16)(%rdx), %ymm5, %ymm5;
 	vpxor (2 * 32 + 16)(%rdx), %ymm4, %ymm4;
@@ -1168,8 +1171,6 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 	movq (15 * 32 + 16 + 0)(%rdx), %rax;
 	movq (15 * 32 + 16 + 8)(%rdx), %rcx;
 
-	movq %r10, %rsp;
-
 	write_output(%ymm7, %ymm6, %ymm5, %ymm4, %ymm3, %ymm2, %ymm1, %ymm0,
 		     %ymm15, %ymm14, %ymm13, %ymm12, %ymm11, %ymm10, %ymm9,
 		     %ymm8, %rsi);
@@ -1180,6 +1181,7 @@ _gcry_camellia_aesni_avx2_cbc_dec:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;
 
@@ -1195,11 +1197,13 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 	 *	%rcx: iv
 	 */
 
+	pushq %rbp;
+	movq %rsp, %rbp;
+
 	vzeroupper;
 
-	movq %rsp, %r10;
 	subq $(16 * 32), %rsp;
-	andq $~31, %rsp;
+	andq $~63, %rsp;
 	movq %rsp, %rax;
 
 	/* inpack16_pre: */
@@ -1228,8 +1232,6 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
 	call __camellia_enc_blk32;
 
-	movq %r10, %rsp;
-
 	vpxor 0 * 32(%rdx), %ymm7, %ymm7;
 	vpxor 1 * 32(%rdx), %ymm6, %ymm6;
 	vpxor 2 * 32(%rdx), %ymm5, %ymm5;
@@ -1253,6 +1255,7 @@ _gcry_camellia_aesni_avx2_cfb_dec:
 
 	vzeroall;
 
+	leave;
 	ret;
 .size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;
 

-----------------------------------------------------------------------

Summary of changes:
 cipher/camellia-aesni-avx-amd64.S  |   73 ++++++------
 cipher/camellia-aesni-avx2-amd64.S |  215 ++++++++++++++++++------------------
 2 files changed, 148 insertions(+), 140 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org


_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits