From jussi.kivilinna at iki.fi Fri Apr 5 19:25:49 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:49 +0300 Subject: [PATCH 2/7] Burn stack in transform functions for SHA1 AMD64 implementations In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448514938.14473.7912761314620953735.stgit@localhost.localdomain> * cipher/sha1-avx-amd64.S: Burn stack inside transform functions. * cipher/sha1-avx-bmi2-amd64.S: Ditto. * cipher/sha1-avx2-bmi2-amd64.S: Ditto. * cipher/sha1-ssse3-amd64.S: Ditto. -- This change reduces per call overhead for SHA1. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 143e4066d..5d674c151 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -380,7 +380,7 @@ _gcry_sha1_transform_amd64_avx: .Lend: vzeroall; - /* Transform 64-79. */ + /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); @@ -393,12 +393,15 @@ _gcry_sha1_transform_amd64_avx: R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + /* 16*4/16-1 = 3 */ + vmovdqa %xmm0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -416,8 +419,8 @@ _gcry_sha1_transform_amd64_avx: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 79ea24ef9..fe8901eff 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -387,7 +387,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: .Lend: vzeroall; - /* Transform 64-79. */ + /* Transform 64-79 + burn stack */ R( b, c, d, e, a, F4, 64 ); R( a, b, c, d, e, F4, 65 ); R( e, a, b, c, d, F4, 66 ); @@ -400,14 +400,17 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( c, d, e, a, b, F4, 73 ); R( b, c, d, e, a, F4, 74 ); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); vmovdqa %xmm0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); vmovdqa %xmm0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); vmovdqa %xmm0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); addl ne, a; xorl ne, ne; + /* 16*4/16-1 = 3 */ + vmovdqa %xmm0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -426,8 +429,8 @@ _gcry_sha1_transform_amd64_avx_bmi2: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 3*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index c666290f2..2a2f21a56 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -504,7 +504,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: .Lend: vzeroall; - /* Transform 48-79 for block 2. 
*/ + /* Transform 48-79 for block 2 + burn stack */ R( c, d, e, a, b, F3, 48, 1 ); R( b, c, d, e, a, F3, 49, 1 ); R( a, b, c, d, e, F3, 50, 1 ); @@ -517,30 +517,33 @@ _gcry_sha1_transform_amd64_avx2_bmi2: R( d, e, a, b, c, F3, 57, 1 ); R( c, d, e, a, b, F3, 58, 1 ); R( b, c, d, e, a, F3, 59, 1 ); - R( a, b, c, d, e, F4, 60, 1 ); - R( e, a, b, c, d, F4, 61, 1 ); - R( d, e, a, b, c, F4, 62, 1 ); - R( c, d, e, a, b, F4, 63, 1 ); - R( b, c, d, e, a, F4, 64, 1 ); - R( a, b, c, d, e, F4, 65, 1 ); - R( e, a, b, c, d, F4, 66, 1 ); - R( d, e, a, b, c, F4, 67, 1 ); - R( c, d, e, a, b, F4, 68, 1 ); - R( b, c, d, e, a, F4, 69, 1 ); - R( a, b, c, d, e, F4, 70, 1 ); - R( e, a, b, c, d, F4, 71, 1 ); - R( d, e, a, b, c, F4, 72, 1 ); - R( c, d, e, a, b, F4, 73, 1 ); - R( b, c, d, e, a, F4, 74, 1 ); - R( a, b, c, d, e, F4, 75, 1 ); - R( e, a, b, c, d, F4, 76, 1 ); - R( d, e, a, b, c, F4, 77, 1 ); - R( c, d, e, a, b, F4, 78, 1 ); + R( a, b, c, d, e, F4, 60, 1 ); vmovdqa %ymm0, (0*32)(%rsp); + R( e, a, b, c, d, F4, 61, 1 ); vmovdqa %ymm0, (1*32)(%rsp); + R( d, e, a, b, c, F4, 62, 1 ); vmovdqa %ymm0, (2*32)(%rsp); + R( c, d, e, a, b, F4, 63, 1 ); vmovdqa %ymm0, (3*32)(%rsp); + R( b, c, d, e, a, F4, 64, 1 ); vmovdqa %ymm0, (4*32)(%rsp); + R( a, b, c, d, e, F4, 65, 1 ); vmovdqa %ymm0, (5*32)(%rsp); + R( e, a, b, c, d, F4, 66, 1 ); vmovdqa %ymm0, (6*32)(%rsp); + R( d, e, a, b, c, F4, 67, 1 ); vmovdqa %ymm0, (7*32)(%rsp); + R( c, d, e, a, b, F4, 68, 1 ); vmovdqa %ymm0, (8*32)(%rsp); + R( b, c, d, e, a, F4, 69, 1 ); vmovdqa %ymm0, (9*32)(%rsp); + R( a, b, c, d, e, F4, 70, 1 ); vmovdqa %ymm0, (10*32)(%rsp); + R( e, a, b, c, d, F4, 71, 1 ); vmovdqa %ymm0, (11*32)(%rsp); + R( d, e, a, b, c, F4, 72, 1 ); vmovdqa %ymm0, (12*32)(%rsp); + R( c, d, e, a, b, F4, 73, 1 ); vmovdqa %ymm0, (13*32)(%rsp); + R( b, c, d, e, a, F4, 74, 1 ); vmovdqa %ymm0, (14*32)(%rsp); + R( a, b, c, d, e, F4, 75, 1 ); vmovdqa %ymm0, (15*32)(%rsp); + R( e, a, b, c, d, F4, 76, 1 ); vmovdqa %ymm0, (16*32)(%rsp); + R( d, e, a, b, c, F4, 77, 1 ); vmovdqa %ymm0, (17*32)(%rsp); + R( c, d, e, a, b, F4, 78, 1 ); vmovdqa %ymm0, (18*32)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79, 1 ); addl ne, a; xorl ne, ne; + /* WK_STACK_WORDS*4/32-1 = 19 */ + vmovdqa %ymm0, (19*32)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -559,8 +562,8 @@ _gcry_sha1_transform_amd64_avx2_bmi2: popq %rbp; popq %rbx; - /* burn_stack */ - movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; ret; ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 421bebeca..fff140345 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -388,7 +388,7 @@ _gcry_sha1_transform_amd64_ssse3: .align 16 .Lend: - /* Transform 64-79 + Clear XMM registers. */ + /* Transform 64-79 + Clear XMM registers + Burn stack. 
*/ R( b, c, d, e, a, F4, 64 ); CLEAR_REG(BSWAP_REG); R( a, b, c, d, e, F4, 65 ); CLEAR_REG(Wtmp0); R( e, a, b, c, d, F4, 66 ); CLEAR_REG(Wtmp1); @@ -401,12 +401,15 @@ _gcry_sha1_transform_amd64_ssse3: R( c, d, e, a, b, F4, 73 ); CLEAR_REG(W6); R( b, c, d, e, a, F4, 74 ); CLEAR_REG(W7); R( a, b, c, d, e, F4, 75 ); - R( e, a, b, c, d, F4, 76 ); - R( d, e, a, b, c, F4, 77 ); - R( c, d, e, a, b, F4, 78 ); + R( e, a, b, c, d, F4, 76 ); movdqa Wtmp0, (0*16)(%rsp); + R( d, e, a, b, c, F4, 77 ); movdqa Wtmp0, (1*16)(%rsp); + R( c, d, e, a, b, F4, 78 ); movdqa Wtmp0, (2*16)(%rsp); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + /* 16*4/16-1 = 3 */ + vmovdqa Wtmp0, (3*16)(%rsp); + /* Update the chaining variables. */ addl state_h3(RSTATE), d; addl state_h2(RSTATE), c; @@ -424,8 +427,8 @@ _gcry_sha1_transform_amd64_ssse3: popq %rbp; popq %rbx; - /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + /* stack already burned */ + xorl %eax, %eax; .Lret: ret; From jussi.kivilinna at iki.fi Fri Apr 5 19:25:59 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:59 +0300 Subject: [PATCH 4/7] tests/basic: add hash test for small block sizes In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448515972.14473.1366753321398503046.stgit@localhost.localdomain> Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/tests/basic.c b/tests/basic.c index 190b0060b..a28dc6997 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -7951,7 +7951,7 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen, gcry_md_hd_t hd, hd2; unsigned char *p; int mdlen; - int i; + int i, j; int xof = 0; gcry_error_t err = 0; @@ -7988,6 +7988,66 @@ check_one_md (int algo, const char *data, int len, const char *expect, int elen, } } + if (*data == '!' && !data[1] && !xof) + { + unsigned char *p1, *p2; + char buf[129]; + + /* Test hashing small input sizes first as full block, then byte-by-byte + * and check that resulting digests are the same. */ + + err = gcry_md_open (&hd2, algo, 0); + if (err) + { + gcry_md_close (hd); + fail ("algo %d, gcry_md_open failed: %s\n", algo, gpg_strerror (err)); + return; + } + + if (key && klen) + { + err = gcry_md_setkey (hd2, key, klen); + if (err) + { + gcry_md_close (hd); + gcry_md_close (hd2); + fail ("algo %d, gcry_md_setkey failed: %s\n", algo, gpg_strerror (err)); + return; + } + } + + for (i = 0; i < sizeof(buf); i++) + buf[i] = i; + + for (i = 1; i < sizeof(buf); i++) + { + gcry_md_reset (hd); + gcry_md_reset (hd2); + + gcry_md_write (hd, buf, i); + for (j = 0; j < i; j++) + gcry_md_write (hd2, &buf[j], 1); + + p1 = gcry_md_read (hd, algo); + p2 = gcry_md_read (hd2, algo); + if (memcmp (p1, p2, mdlen)) + { + printf ("full block (input length %d): ", i); + for (i = 0; i < mdlen; i++) + printf ("%02x ", p1[i] & 0xFF); + printf ("\nbyte-by-byte: "); + for (i = 0; i < mdlen; i++) + printf ("%02x ", p2[i] & 0xFF); + printf ("\n"); + + fail ("algo %d, digest mismatch\n", algo); + } + } + + gcry_md_close (hd2); + gcry_md_reset (hd); + } + if ((*data == '!' && !data[1]) || /* hash one million times a "a" */ (*data == '?' && !data[1])) /* hash million byte data-set with byte pattern 0x00,0x01,0x02,... 
*/ { From jussi.kivilinna at iki.fi Fri Apr 5 19:25:54 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:54 +0300 Subject: [PATCH 3/7] Burn stack in transform functions for SHA2 AMD64 implementations In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448515455.14473.5548704477061402190.stgit@localhost.localdomain> * cipher/sha256-avx-amd64.S: Burn stack inside transform functions. * cipher/sha256-avx2-bmi2-amd64.S: Ditto. * cipher/sha256-ssse3-amd64.S: Ditto. * cipher/sha512-avx-amd64.S: Ditto. * cipher/sha512-avx2-bmi2-amd64.S: Ditto. * cipher/sha512-ssse3-amd64.S: Ditto. -- This change reduces per call overhead for SHA256 & SHA512. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index 6953855bb..b8b01b15b 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -480,9 +480,12 @@ _gcry_sha256_transform_amd64_avx: cmp INP, [rsp + _INP_END] jne .Loop0 +.Ldone_hash: vzeroall -.Ldone_hash: + vmovdqa [rsp + _XFER], XFER + xor eax, eax + add rsp, STACK_SIZE pop r15 @@ -491,8 +494,6 @@ _gcry_sha256_transform_amd64_avx: pop rbp pop rbx - mov eax, STACK_SIZE + 5*8 - ret diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 85e663fef..598f93821 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -747,10 +747,29 @@ _gcry_sha256_transform_amd64_avx2: jmp .Ldo_last_block .Ldone_hash: - mov rsp, [rsp + _RSP] - vzeroall + /* burn stack */ + vmovdqa [rsp + _XFER + 0 * 32], ymm0 + vmovdqa [rsp + _XFER + 1 * 32], ymm0 + vmovdqa [rsp + _XFER + 2 * 32], ymm0 + vmovdqa [rsp + _XFER + 3 * 32], ymm0 + vmovdqa [rsp + _XFER + 4 * 32], ymm0 + vmovdqa [rsp + _XFER + 5 * 32], ymm0 + vmovdqa [rsp + _XFER + 6 * 32], ymm0 + vmovdqa [rsp + _XFER + 7 * 32], ymm0 + vmovdqa [rsp + _XFER + 8 * 32], ymm0 + vmovdqa [rsp + _XFER + 9 * 32], ymm0 + vmovdqa [rsp + _XFER + 10 * 32], ymm0 + vmovdqa [rsp + _XFER + 11 * 32], ymm0 + vmovdqa [rsp + _XFER + 12 * 32], ymm0 + vmovdqa [rsp + _XFER + 13 * 32], ymm0 + vmovdqa [rsp + _XFER + 14 * 32], ymm0 + vmovdqa [rsp + _XFER + 15 * 32], ymm0 + xor eax, eax + + mov rsp, [rsp + _RSP] + pop r15 pop r14 pop r13 @@ -758,9 +777,6 @@ _gcry_sha256_transform_amd64_avx2: pop rbp pop rbx - /* stack burn depth */ - mov eax, STACK_SIZE + 6*8 + 31 - ret .align 64 diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index a9213e419..ca5c9fd1d 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -503,6 +503,10 @@ _gcry_sha256_transform_amd64_ssse3: pxor xmm12, xmm12 .Ldone_hash: + pxor XFER, XFER + movdqa [rsp + _XFER], XFER + xor eax, eax + add rsp, STACK_SIZE pop r15 @@ -511,8 +515,6 @@ _gcry_sha256_transform_amd64_ssse3: pop rbp pop rbx - mov eax, STACK_SIZE + 5*8 - ret diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 446a8b4e5..534351e44 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -352,13 +352,19 @@ _gcry_sha512_transform_amd64_avx: mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] - /* Restore Stack Pointer */ - add rsp, frame_size - vzeroall - /* Return stack burn depth */ - mov rax, frame_size + /* Burn stack */ + t = 0 + .rept frame_W_size / 32 + vmovups [rsp + frame_W + (t) * 32], ymm0 + t = ((t)+1) + .endr + vmovdqu [rsp + frame_WK], xmm0 + xor eax, eax + + /* 
Restore Stack Pointer */ + add rsp, frame_size .Lnowork: ret diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 05bef64cf..914f920af 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -714,6 +714,7 @@ _gcry_sha512_transform_amd64_avx2: jne .Loop0 .Ldone_hash: + vzeroall /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] @@ -723,12 +724,12 @@ _gcry_sha512_transform_amd64_avx2: mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] + /* Burn stack */ + vmovdqa [rsp + frame_XFER], XFER + xor eax, eax + /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] - - vzeroall - - mov eax, frame_size + 31 .Lnowork: ret diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 51193b361..8e950e0e4 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -352,9 +352,6 @@ _gcry_sha512_transform_amd64_ssse3: mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] - /* Restore Stack Pointer */ - add rsp, frame_size - pxor xmm0, xmm0 pxor xmm1, xmm1 pxor xmm2, xmm2 @@ -362,8 +359,17 @@ _gcry_sha512_transform_amd64_ssse3: pxor xmm4, xmm4 pxor xmm5, xmm5 - /* Return stack burn depth */ - mov rax, frame_size + /* Burn stack */ + t = 0 + .rept frame_W_size / 16 + movdqu [rsp + frame_W + (t) * 16], xmm0 + t = ((t)+1) + .endr + movdqu [rsp + frame_WK], xmm0 + xor eax, eax + + /* Restore Stack Pointer */ + add rsp, frame_size .Lnowork: ret From jussi.kivilinna at iki.fi Fri Apr 5 19:26:05 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:05 +0300 Subject: [PATCH 5/7] Optimizations for digest final functions In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448516489.14473.7380936271683534052.stgit@localhost.localdomain> * cipher/md4.c (md4_final): Avoid byte-by-byte buffer setting when padding; Merge extra and last block processing. * cipher/md5.c (md5_final): Ditto. * cipher/rmd160.c (rmd160_final): Ditto. * cipher/sha1.c (sha1_final): Ditto. * cipher/sha256.c (sha256_final): Ditto. * cipher/sm3.c (sm3_final): Ditto. * cipher/tiger.c (tiger_final): Ditto. * cipher/sha512.c (sha512_final): Avoid byte-by-byte buffer setting when padding. * cipher/stribog.c (stribog_final): Ditto. * cipher/whirlpool.c (whirlpool_final): Ditto. 
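
In rough outline, the merged path now looks like this (a condensed sketch
only; byte order, offsets and the final state read-out differ per algorithm,
see the individual hunks below). The pad bytes are written with a single
memset and, when the 0x80 marker does not fit into the current block, the
buffered block and the padding block go through the transform in one call:

  else /* need one extra block */
    {
      hd->bctx.buf[hd->bctx.count++] = 0x80;   /* pad marker */
      /* zero the rest of this block and the pad area of the next one */
      memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56);
      hd->bctx.count = 64 + 56;
      /* 64-bit message length at the very end of the second block */
      buf_put_le32 (hd->bctx.buf + 64 + 56, lsb);
      buf_put_le32 (hd->bctx.buf + 64 + 60, msb);
      burn = transform (hd, hd->bctx.buf, 2);  /* two blocks, one call */
    }

Compared with the old code this avoids the byte-by-byte padding loop and
the separate flush of the buffered block before the padded block.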
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/md4.c b/cipher/md4.c index 098380801..997dbe0ce 100644 --- a/cipher/md4.c +++ b/cipher/md4.c @@ -234,25 +234,30 @@ md4_final( void *context ) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0) @@ -262,6 +267,7 @@ md4_final( void *context ) X(D); #undef X + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/md5.c b/cipher/md5.c index e35a500c4..c432502ff 100644 --- a/cipher/md5.c +++ b/cipher/md5.c @@ -258,25 +258,30 @@ md5_final( void *context) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } - else /* need one extra block */ + else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->a); p += 4; } while(0) @@ -286,6 +291,7 @@ md5_final( void *context) X(D); #undef X + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/rmd160.c b/cipher/rmd160.c index 2d2fae916..231640d27 100644 --- a/cipher/rmd160.c +++ b/cipher/rmd160.c @@ -431,25 +431,30 @@ rmd160_final( void *context ) msb <<= 3; msb |= t >> 29; - 
if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 1); } - else /* need one extra block */ + else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_le32(p, hd->h##a); p += 4; } while(0) @@ -459,6 +464,8 @@ rmd160_final( void *context ) X(3); X(4); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/sha256.c b/cipher/sha256.c index e82a9d902..327e1029f 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -498,25 +498,30 @@ sha256_final(void *context) msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) - { /* enough room */ + if (hd->bctx.count < 56) /* enough room */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 56) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); } - else - { /* need one extra block */ + else /* need one extra block */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 64) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 2); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = (*hd->bctx.bwrite) (hd, hd->bctx.buf, 1); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -529,6 +534,8 @@ sha256_final(void *context) X(6); X(7); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/sha512.c b/cipher/sha512.c index 721f34054..615b55357 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -659,14 +659,16 @@ sha512_final (void *context) if (hd->bctx.count < 112) { /* enough room */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 112) - 
hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 112) + memset (&hd->bctx.buf[hd->bctx.count], 0, 112 - hd->bctx.count); + hd->bctx.count = 112; } else { /* need one extra block */ hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 128) - hd->bctx.buf[hd->bctx.count++] = 0; + if (hd->bctx.count < 128) + memset (&hd->bctx.buf[hd->bctx.count], 0, 128 - hd->bctx.count); + hd->bctx.count = 128; _gcry_md_block_write (context, NULL, 0); /* flush */ ; memset (hd->bctx.buf, 0, 112); /* fill next block with zeroes */ } diff --git a/cipher/sm3.c b/cipher/sm3.c index c6f1a091d..7bfb37b95 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -291,25 +291,30 @@ sm3_final(void *context) msb <<= 3; msb |= t >> 29; - if (hd->bctx.count < 56) - { /* enough room */ + if (hd->bctx.count < 56) /* enough room */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while (hd->bctx.count < 56) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } - else - { /* need one extra block */ + else /* need one extra block */ + { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while (hd->bctx.count < 64) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = transform (hd, hd->bctx.buf, 1); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -322,6 +327,8 @@ sm3_final(void *context) X(6); X(7); #undef X + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/stribog.c b/cipher/stribog.c index 459e4db99..d31dddd37 100644 --- a/cipher/stribog.c +++ b/cipher/stribog.c @@ -1292,8 +1292,9 @@ stribog_final (void *context) i = hd->bctx.count; /* After flush we have at least one byte free) */ hd->bctx.buf[i++] = 1; - while (i < 64) - hd->bctx.buf[i++] = 0; + if (i < 64) + memset (&hd->bctx.buf[i], 0, 64 - i); + i = 64; transform_bits (hd, hd->bctx.buf, hd->bctx.count * 8); g (hd->h, hd->N, Z); diff --git a/cipher/tiger.c b/cipher/tiger.c index d24d1603b..0319b7115 100644 --- a/cipher/tiger.c +++ b/cipher/tiger.c @@ -760,22 +760,26 @@ tiger_final( void *context ) if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = pad; - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 56, lsb); + buf_put_le32(hd->bctx.buf + 60, msb); + burn = transform( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = pad; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush 
*/; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_le32(hd->bctx.buf + 64 + 56, lsb); + buf_put_le32(hd->bctx.buf + 64 + 60, msb); + burn = transform( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_le32(hd->bctx.buf + 56, lsb); - buf_put_le32(hd->bctx.buf + 60, msb); - burn = transform( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be64(p, hd->a); p += 8; } while(0) @@ -794,6 +798,8 @@ tiger_final( void *context ) } #undef X #undef Y + + _gcry_burn_stack (burn); } static byte * diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c index d52375ada..d9b79cf1a 100644 --- a/cipher/whirlpool.c +++ b/cipher/whirlpool.c @@ -1494,12 +1494,16 @@ whirlpool_final (void *ctx) if (context->bctx.count > 32) { /* An extra block is necessary. */ - while (context->bctx.count < 64) - context->bctx.buf[context->bctx.count++] = 0; + if (context->bctx.count < 64) + memset (&context->bctx.buf[context->bctx.count], 0, + 64 - context->bctx.count); + context->bctx.count = 64; whirlpool_write (context, NULL, 0); } - while (context->bctx.count < 32) - context->bctx.buf[context->bctx.count++] = 0; + if (context->bctx.count < 32) + memset (&context->bctx.buf[context->bctx.count], 0, + 32 - context->bctx.count); + context->bctx.count = 32; /* Add length of message. */ length = context->bctx.buf + context->bctx.count; From jussi.kivilinna at iki.fi Fri Apr 5 19:26:10 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:10 +0300 Subject: [PATCH 6/7] Remove extra buffer flush at begining of digest final functions In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448517006.14473.15729038755403086657.stgit@localhost.localdomain> * cipher/md2.c (md2_final): Remove _gcry_md_block_write flush call from entry. * cipher/md4.c (md4_final): Ditto. * cipher/md5.c (md5_final): Ditto. * cipher/rmd160.c (rmd160_final): Ditto. * cipher/sha1.c (sha1_final): Ditto. * cipher/sha256.c (sha256_final): Ditto. * cipher/sha512.c (sha512_final): Ditto. * cipher/sm3.c (sm3_final): Ditto. * cipher/stribog.c (stribog_final): Ditto. * cipher/tiger.c (tiger_final): Ditto. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/md2.c b/cipher/md2.c index b6f7e94f4..bf2fbee4c 100644 --- a/cipher/md2.c +++ b/cipher/md2.c @@ -146,8 +146,6 @@ md2_final (void *context) MD2_CONTEXT *hd = context; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - /* pad */ memset (hd->bctx.buf + hd->bctx.count, 16 - hd->bctx.count, 16 - hd->bctx.count); diff --git a/cipher/md4.c b/cipher/md4.c index 997dbe0ce..f6258893e 100644 --- a/cipher/md4.c +++ b/cipher/md4.c @@ -213,8 +213,6 @@ md4_final( void *context ) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/md5.c b/cipher/md5.c index c432502ff..67511ba01 100644 --- a/cipher/md5.c +++ b/cipher/md5.c @@ -237,8 +237,6 @@ md5_final( void *context) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/rmd160.c b/cipher/rmd160.c index 231640d27..f15eec225 100644 --- a/cipher/rmd160.c +++ b/cipher/rmd160.c @@ -410,8 +410,6 @@ rmd160_final( void *context ) byte *p; unsigned int burn; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/sha1.c b/cipher/sha1.c index affabfb07..23aceef32 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -511,8 +511,6 @@ sha1_final(void *context) unsigned char *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; @@ -532,7 +530,7 @@ sha1_final(void *context) msb <<= 3; msb |= t >> 29; - if( hd->bctx.count < 56 ) /* enough room */ + if (hd->bctx.count < 56) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ if (hd->bctx.count < 56) diff --git a/cipher/sha256.c b/cipher/sha256.c index 327e1029f..6c6833482 100644 --- a/cipher/sha256.c +++ b/cipher/sha256.c @@ -477,8 +477,6 @@ sha256_final(void *context) byte *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/sha512.c b/cipher/sha512.c index 615b55357..59e65f07a 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -635,8 +635,6 @@ sha512_final (void *context) u64 t, th, msb, lsb; byte *p; - _gcry_md_block_write (context, NULL, 0); /* flush */ ; - t = hd->bctx.nblocks; /* if (sizeof t == sizeof hd->bctx.nblocks) */ th = hd->bctx.nblocks_high; diff --git a/cipher/sm3.c b/cipher/sm3.c index 7bfb37b95..e76f32297 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -270,8 +270,6 @@ sm3_final(void *context) byte *p; unsigned int burn; - _gcry_md_block_write (hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; diff --git a/cipher/stribog.c b/cipher/stribog.c index d31dddd37..3eb077356 100644 --- a/cipher/stribog.c +++ b/cipher/stribog.c @@ -1287,7 +1287,6 @@ stribog_final (void *context) u64 Z[8] = {}; int i; - _gcry_md_block_write (context, NULL, 0); /* flush */ ; /* PAD. 
It does not count towards message length */ i = hd->bctx.count; /* After flush we have at least one byte free) */ diff --git a/cipher/tiger.c b/cipher/tiger.c index 0319b7115..c78e3ac35 100644 --- a/cipher/tiger.c +++ b/cipher/tiger.c @@ -736,8 +736,6 @@ tiger_final( void *context ) unsigned int burn; byte pad = hd->variant == 2? 0x80 : 0x01; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - t = hd->bctx.nblocks; if (sizeof t == sizeof hd->bctx.nblocks) th = hd->bctx.nblocks_high; From jussi.kivilinna at iki.fi Fri Apr 5 19:26:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:26:15 +0300 Subject: [PATCH 7/7] Add SHA512/224 and SHA512/256 algorithms In-Reply-To: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> References: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> Message-ID: <155448517522.14473.14391969797660348419.stgit@localhost.localdomain> * cipher/mac-hmac.c (map_mac_algo_to_md): Add mapping for SHA512/224 and SHA512/256. (_gcry_mac_type_spec_hmac_sha512_256) (_gcry_mac_type_spec_hmac_sha512_224): New. * cipher/mac-internal.h (_gcry_mac_type_spec_hmac_sha512_256) (_gcry_mac_type_spec_hmac_sha512_224): New. * cipher/mac.c (mac_list, mac_list_algo101): Add SHA512/224 and SHA512/256. * cipher/md.c (digest_list, digest_list_algo301) (prepare_macpads): Ditto. * cipher/sha512.c (run_selftests): Ditto. (sha512_init_common): Move common initialization here. (sha512_init, sha384_init): Use common initialization function. (sha512_224_init, sha512_256_init, _gcry_sha512_224_hash_buffer) (_gcry_sha512_224_hash_buffers, _gcry_sha512_256_hash_buffer) (_gcry_sha512_256_hash_buffers, selftests_sha512_224) (selftests_sha512_256, sha512_224_asn, oid_spec_sha512_224) (_gcry_digest_spec_sha512_224, sha512_256_asn, oid_spec_sha512_256) (_gcry_digest_spec_sha512_256): New. * doc/gcrypt.texi: Add SHA512/224 and SHA512/256; Add missing HMAC-BLAKE2s and HMAC-BLAKE2b. * src/cipher.h (_gcry_digest_spec_sha512_224) (_gcry_digest_spec_sha512_256): New. * src/gcrypt.h.in (GCRY_MD_SHA512_256, GCRY_MD_SHA512_224): New. (GCRY_MAC_HMAC_SHA512_256, GCRY_MAC_HMAC_SHA512_224): New. * tests/basic.c (check_digests): Add SHA512/224 and SHA512/256 test vectors. -- This change adds truncated SHA512/224 and SHA512/256 algorithms specified in FIPS 180-4. 
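
For illustration, the new algorithms are then used through the regular
digest and MAC interfaces like any other hash (a minimal usage sketch;
libgcrypt initialization and error handling omitted):

  #include <gcrypt.h>

  unsigned char digest[32];  /* SHA-512/256 yields 32 bytes, SHA-512/224 yields 28 */

  /* One-shot hashing via the new algorithm identifier. */
  gcry_md_hash_buffer (GCRY_MD_SHA512_256, digest, "abc", 3);

  /* The corresponding HMACs are available through the gcry_mac_*
   * interface as GCRY_MAC_HMAC_SHA512_224 and GCRY_MAC_HMAC_SHA512_256. */
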
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/mac-hmac.c b/cipher/mac-hmac.c index 86281acdf..e488d03aa 100644 --- a/cipher/mac-hmac.c +++ b/cipher/mac-hmac.c @@ -51,6 +51,10 @@ map_mac_algo_to_md (int mac_algo) return GCRY_MD_SHA384; case GCRY_MAC_HMAC_SHA512: return GCRY_MD_SHA512; + case GCRY_MAC_HMAC_SHA512_256: + return GCRY_MD_SHA512_256; + case GCRY_MAC_HMAC_SHA512_224: + return GCRY_MD_SHA512_224; case GCRY_MAC_HMAC_SHA3_224: return GCRY_MD_SHA3_224; case GCRY_MAC_HMAC_SHA3_256: @@ -260,6 +264,17 @@ gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384 = { GCRY_MAC_HMAC_SHA384, {0, 1}, "HMAC_SHA384", &hmac_ops }; + +gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256 = { + GCRY_MAC_HMAC_SHA512_256, {0, 1}, "HMAC_SHA512_256", + &hmac_ops +}; + +gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224 = { + GCRY_MAC_HMAC_SHA512_224, {0, 1}, "HMAC_SHA512_224", + &hmac_ops +}; + #endif #if USE_SHA3 gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224 = { diff --git a/cipher/mac-internal.h b/cipher/mac-internal.h index eb5467380..03f5b8da8 100644 --- a/cipher/mac-internal.h +++ b/cipher/mac-internal.h @@ -133,6 +133,8 @@ extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha224; #if USE_SHA512 extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512; extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha384; +extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_224; +extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha512_256; #endif #if USE_SHA3 extern gcry_mac_spec_t _gcry_mac_type_spec_hmac_sha3_224; diff --git a/cipher/mac.c b/cipher/mac.c index 1b79bf315..0bbac3e41 100644 --- a/cipher/mac.c +++ b/cipher/mac.c @@ -40,6 +40,8 @@ static gcry_mac_spec_t * const mac_list[] = { #if USE_SHA512 &_gcry_mac_type_spec_hmac_sha512, &_gcry_mac_type_spec_hmac_sha384, + &_gcry_mac_type_spec_hmac_sha512_256, + &_gcry_mac_type_spec_hmac_sha512_224, #endif #if USE_SHA3 &_gcry_mac_type_spec_hmac_sha3_224, @@ -230,9 +232,16 @@ static gcry_mac_spec_t * const mac_list_algo101[] = NULL, #endif #if USE_SM3 - &_gcry_mac_type_spec_hmac_sm3 + &_gcry_mac_type_spec_hmac_sm3, #else - NULL + NULL, +#endif +#if USE_SHA512 + &_gcry_mac_type_spec_hmac_sha512_256, + &_gcry_mac_type_spec_hmac_sha512_224, +#else + NULL, + NULL, #endif }; diff --git a/cipher/md.c b/cipher/md.c index 15e19a95f..6ca390ff6 100644 --- a/cipher/md.c +++ b/cipher/md.c @@ -48,6 +48,8 @@ static gcry_md_spec_t * const digest_list[] = #if USE_SHA512 &_gcry_digest_spec_sha512, &_gcry_digest_spec_sha384, + &_gcry_digest_spec_sha512_256, + &_gcry_digest_spec_sha512_224, #endif #if USE_SHA3 &_gcry_digest_spec_sha3_224, @@ -232,9 +234,16 @@ static gcry_md_spec_t * const digest_list_algo301[] = NULL, #endif #if USE_SM3 - &_gcry_digest_spec_sm3 + &_gcry_digest_spec_sm3, #else - NULL + NULL, +#endif +#if USE_SHA512 + &_gcry_digest_spec_sha512_256, + &_gcry_digest_spec_sha512_224, +#else + NULL, + NULL, #endif }; @@ -928,6 +937,8 @@ prepare_macpads (gcry_md_hd_t a, const unsigned char *key, size_t keylen) break; case GCRY_MD_SHA384: case GCRY_MD_SHA512: + case GCRY_MD_SHA512_256: + case GCRY_MD_SHA512_224: case GCRY_MD_BLAKE2B_512: case GCRY_MD_BLAKE2B_384: case GCRY_MD_BLAKE2B_256: diff --git a/cipher/sha512.c b/cipher/sha512.c index 59e65f07a..1a808f884 100644 --- a/cipher/sha512.c +++ b/cipher/sha512.c @@ -254,24 +254,13 @@ do_transform_generic (void *context, const unsigned char *data, size_t nblks); static void -sha512_init (void *context, unsigned int flags) +sha512_init_common (SHA512_CONTEXT *ctx, unsigned int flags) { - SHA512_CONTEXT *ctx = 
context; - SHA512_STATE *hd = &ctx->state; unsigned int features = _gcry_get_hw_features (); (void)flags; (void)k; - hd->h0 = U64_C(0x6a09e667f3bcc908); - hd->h1 = U64_C(0xbb67ae8584caa73b); - hd->h2 = U64_C(0x3c6ef372fe94f82b); - hd->h3 = U64_C(0xa54ff53a5f1d36f1); - hd->h4 = U64_C(0x510e527fade682d1); - hd->h5 = U64_C(0x9b05688c2b3e6c1f); - hd->h6 = U64_C(0x1f83d9abfb41bd6b); - hd->h7 = U64_C(0x5be0cd19137e2179); - ctx->bctx.nblocks = 0; ctx->bctx.nblocks_high = 0; ctx->bctx.count = 0; @@ -300,14 +289,30 @@ sha512_init (void *context, unsigned int flags) (void)features; } + static void -sha384_init (void *context, unsigned int flags) +sha512_init (void *context, unsigned int flags) { SHA512_CONTEXT *ctx = context; SHA512_STATE *hd = &ctx->state; - unsigned int features = _gcry_get_hw_features (); - (void)flags; + hd->h0 = U64_C(0x6a09e667f3bcc908); + hd->h1 = U64_C(0xbb67ae8584caa73b); + hd->h2 = U64_C(0x3c6ef372fe94f82b); + hd->h3 = U64_C(0xa54ff53a5f1d36f1); + hd->h4 = U64_C(0x510e527fade682d1); + hd->h5 = U64_C(0x9b05688c2b3e6c1f); + hd->h6 = U64_C(0x1f83d9abfb41bd6b); + hd->h7 = U64_C(0x5be0cd19137e2179); + + sha512_init_common (ctx, flags); +} + +static void +sha384_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; hd->h0 = U64_C(0xcbbb9d5dc1059ed8); hd->h1 = U64_C(0x629a292a367cd507); @@ -318,35 +323,49 @@ sha384_init (void *context, unsigned int flags) hd->h6 = U64_C(0xdb0c2e0d64f98fa7); hd->h7 = U64_C(0x47b5481dbefa4fa4); - ctx->bctx.nblocks = 0; - ctx->bctx.nblocks_high = 0; - ctx->bctx.count = 0; - ctx->bctx.blocksize = 128; + sha512_init_common (ctx, flags); +} - /* Order of feature checks is important here; last match will be - * selected. Keep slower implementations at the top and faster at - * the bottom. 
*/ - ctx->bctx.bwrite = do_transform_generic; -#ifdef USE_ARM_NEON_ASM - if ((features & HWF_ARM_NEON) != 0) - ctx->bctx.bwrite = do_sha512_transform_armv7_neon; -#endif -#ifdef USE_SSSE3 - if ((features & HWF_INTEL_SSSE3) != 0) - ctx->bctx.bwrite = do_sha512_transform_amd64_ssse3; -#endif -#ifdef USE_AVX - if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_FAST_SHLD)) - ctx->bctx.bwrite = do_sha512_transform_amd64_avx; -#endif -#ifdef USE_AVX2 - if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_BMI2)) - ctx->bctx.bwrite = do_sha512_transform_amd64_avx2; -#endif - (void)features; + +static void +sha512_256_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; + + hd->h0 = U64_C(0x22312194fc2bf72c); + hd->h1 = U64_C(0x9f555fa3c84c64c2); + hd->h2 = U64_C(0x2393b86b6f53b151); + hd->h3 = U64_C(0x963877195940eabd); + hd->h4 = U64_C(0x96283ee2a88effe3); + hd->h5 = U64_C(0xbe5e1e2553863992); + hd->h6 = U64_C(0x2b0199fc2c85b8aa); + hd->h7 = U64_C(0x0eb72ddc81c52ca2); + + sha512_init_common (ctx, flags); } +static void +sha512_224_init (void *context, unsigned int flags) +{ + SHA512_CONTEXT *ctx = context; + SHA512_STATE *hd = &ctx->state; + + hd->h0 = U64_C(0x8c3d37c819544da2); + hd->h1 = U64_C(0x73e1996689dcd4d6); + hd->h2 = U64_C(0x1dfab7ae32ff9c82); + hd->h3 = U64_C(0x679dd514582f9fcf); + hd->h4 = U64_C(0x0f6d2b697bd44da8); + hd->h5 = U64_C(0x77e36f7304c48942); + hd->h6 = U64_C(0x3f9d85a86a1d36c8); + hd->h7 = U64_C(0x1112e6ad91d692a1); + + sha512_init_common (ctx, flags); +} + + + #ifndef USE_ARM_ASM static inline u64 @@ -758,6 +777,68 @@ _gcry_sha384_hash_buffers (void *outbuf, const gcry_buffer_t *iov, int iovcnt) } + +/* Shortcut functions which puts the hash value of the supplied buffer + * into outbuf which must have a size of 32 bytes. */ +static void +_gcry_sha512_256_hash_buffer (void *outbuf, const void *buffer, size_t length) +{ + SHA512_CONTEXT hd; + + sha512_256_init (&hd, 0); + _gcry_md_block_write (&hd, buffer, length); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 32); +} + + +/* Variant of the above shortcut function using multiple buffers. */ +static void +_gcry_sha512_256_hash_buffers (void *outbuf, const gcry_buffer_t *iov, + int iovcnt) +{ + SHA512_CONTEXT hd; + + sha512_256_init (&hd, 0); + for (;iovcnt > 0; iov++, iovcnt--) + _gcry_md_block_write (&hd, + (const char*)iov[0].data + iov[0].off, iov[0].len); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 32); +} + + + +/* Shortcut functions which puts the hash value of the supplied buffer + * into outbuf which must have a size of 28 bytes. */ +static void +_gcry_sha512_224_hash_buffer (void *outbuf, const void *buffer, size_t length) +{ + SHA512_CONTEXT hd; + + sha512_224_init (&hd, 0); + _gcry_md_block_write (&hd, buffer, length); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 28); +} + + +/* Variant of the above shortcut function using multiple buffers. */ +static void +_gcry_sha512_224_hash_buffers (void *outbuf, const gcry_buffer_t *iov, + int iovcnt) +{ + SHA512_CONTEXT hd; + + sha512_224_init (&hd, 0); + for (;iovcnt > 0; iov++, iovcnt--) + _gcry_md_block_write (&hd, + (const char*)iov[0].data + iov[0].off, iov[0].len); + sha512_final (&hd); + memcpy (outbuf, hd.bctx.buf, 28); +} + + /* Self-test section. 
@@ -867,6 +948,102 @@ selftests_sha512 (int extended, selftest_report_func_t report) return GPG_ERR_SELFTEST_FAILED; } +static gpg_err_code_t +selftests_sha512_224 (int extended, selftest_report_func_t report) +{ + const char *what; + const char *errtxt; + + what = "short string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 0, + "abc", 3, + "\x46\x34\x27\x0F\x70\x7B\x6A\x54\xDA\xAE\x75\x30\x46\x08\x42\xE2" + "\x0E\x37\xED\x26\x5C\xEE\xE9\xA4\x3E\x89\x24\xAA", + 28); + if (errtxt) + goto failed; + + if (extended) + { + what = "long string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 0, + "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" + "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, + "\x23\xFE\xC5\xBB\x94\xD6\x0B\x23\x30\x81\x92\x64\x0B\x0C\x45\x33" + "\x35\xD6\x64\x73\x4F\xE4\x0E\x72\x68\x67\x4A\xF9", + 28); + if (errtxt) + goto failed; + + what = "one million \"a\""; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_224, 1, + NULL, 0, + "\x37\xab\x33\x1d\x76\xf0\xd3\x6d\xe4\x22\xbd\x0e\xde\xb2\x2a\x28" + "\xac\xcd\x48\x7b\x7a\x84\x53\xae\x96\x5d\xd2\x87", + 28); + if (errtxt) + goto failed; + } + + return 0; /* Succeeded. */ + + failed: + if (report) + report ("digest", GCRY_MD_SHA512_224, what, errtxt); + return GPG_ERR_SELFTEST_FAILED; +} + +static gpg_err_code_t +selftests_sha512_256 (int extended, selftest_report_func_t report) +{ + const char *what; + const char *errtxt; + + what = "short string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 0, + "abc", 3, + "\x53\x04\x8E\x26\x81\x94\x1E\xF9\x9B\x2E\x29\xB7\x6B\x4C\x7D\xAB" + "\xE4\xC2\xD0\xC6\x34\xFC\x6D\x46\xE0\xE2\xF1\x31\x07\xE7\xAF\x23", + 32); + if (errtxt) + goto failed; + + if (extended) + { + what = "long string"; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 0, + "abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmn" + "hijklmnoijklmnopjklmnopqklmnopqrlmnopqrsmnopqrstnopqrstu", 112, + "\x39\x28\xE1\x84\xFB\x86\x90\xF8\x40\xDA\x39\x88\x12\x1D\x31\xBE" + "\x65\xCB\x9D\x3E\xF8\x3E\xE6\x14\x6F\xEA\xC8\x61\xE1\x9B\x56\x3A", + 32); + if (errtxt) + goto failed; + + what = "one million \"a\""; + errtxt = _gcry_hash_selftest_check_one + (GCRY_MD_SHA512_256, 1, + NULL, 0, + "\x9a\x59\xa0\x52\x93\x01\x87\xa9\x70\x38\xca\xe6\x92\xf3\x07\x08" + "\xaa\x64\x91\x92\x3e\xf5\x19\x43\x94\xdc\x68\xd5\x6c\x74\xfb\x21", + 32); + if (errtxt) + goto failed; + } + + return 0; /* Succeeded. */ + + failed: + if (report) + report ("digest", GCRY_MD_SHA512_256, what, errtxt); + return GPG_ERR_SELFTEST_FAILED; +} + /* Run a full self-test for ALGO and return 0 on success. 
*/ static gpg_err_code_t @@ -882,6 +1059,12 @@ run_selftests (int algo, int extended, selftest_report_func_t report) case GCRY_MD_SHA512: ec = selftests_sha512 (extended, report); break; + case GCRY_MD_SHA512_224: + ec = selftests_sha512_224 (extended, report); + break; + case GCRY_MD_SHA512_256: + ec = selftests_sha512_256 (extended, report); + break; default: ec = GPG_ERR_DIGEST_ALGO; break; @@ -949,3 +1132,41 @@ gcry_md_spec_t _gcry_digest_spec_sha384 = sizeof (SHA512_CONTEXT), run_selftests }; + +static byte sha512_256_asn[] = { 0x30 }; + +static gcry_md_oid_spec_t oid_spec_sha512_256[] = + { + { "2.16.840.1.101.3.4.2.6" }, + + { NULL }, + }; + +gcry_md_spec_t _gcry_digest_spec_sha512_256 = + { + GCRY_MD_SHA512_256, {0, 1}, + "SHA512_256", sha512_256_asn, DIM (sha512_256_asn), oid_spec_sha512_256, 32, + sha512_256_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, + _gcry_sha512_256_hash_buffer, _gcry_sha512_256_hash_buffers, + sizeof (SHA512_CONTEXT), + run_selftests + }; + +static byte sha512_224_asn[] = { 0x30 }; + +static gcry_md_oid_spec_t oid_spec_sha512_224[] = + { + { "2.16.840.1.101.3.4.2.5" }, + + { NULL }, + }; + +gcry_md_spec_t _gcry_digest_spec_sha512_224 = + { + GCRY_MD_SHA512_224, {0, 1}, + "SHA512_224", sha512_224_asn, DIM (sha512_224_asn), oid_spec_sha512_224, 28, + sha512_224_init, _gcry_md_block_write, sha512_final, sha512_read, NULL, + _gcry_sha512_224_hash_buffer, _gcry_sha512_224_hash_buffers, + sizeof (SHA512_CONTEXT), + run_selftests + }; diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 8adf3a355..8b765ba80 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -3141,7 +3141,7 @@ are also supported. @c begin table of hash algorithms @cindex SHA-1 - at cindex SHA-224, SHA-256, SHA-384, SHA-512 + at cindex SHA-224, SHA-256, SHA-384, SHA-512, SHA-512/224, SHA-512/256 @cindex SHA3-224, SHA3-256, SHA3-384, SHA3-512, SHAKE128, SHAKE256 @cindex RIPE-MD-160 @cindex MD2, MD4, MD5 @@ -3214,9 +3214,17 @@ This is the SHA-384 algorithm which yields a message digest of 48 bytes. See FIPS 180-2 for the specification. @item GCRY_MD_SHA512 -This is the SHA-384 algorithm which yields a message digest of 64 bytes. +This is the SHA-512 algorithm which yields a message digest of 64 bytes. See FIPS 180-2 for the specification. + at item GCRY_MD_SHA512_224 +This is the SHA-512/224 algorithm which yields a message digest of 28 bytes. +See FIPS 180-4 for the specification. + + at item GCRY_MD_SHA512_256 +This is the SHA-512/256 algorithm which yields a message digest of 32 bytes. +See FIPS 180-4 for the specification. + @item GCRY_MD_SHA3_224 This is the SHA3-224 algorithm which yields a message digest of 28 bytes. See FIPS 202 for the specification. @@ -3680,6 +3688,7 @@ provided by Libgcrypt. @c begin table of MAC algorithms @cindex HMAC-SHA-1 @cindex HMAC-SHA-224, HMAC-SHA-256, HMAC-SHA-384, HMAC-SHA-512 + at cindex HMAC-SHA-512/224, HMAC-SHA-512/256 @cindex HMAC-SHA3-224, HMAC-SHA3-256, HMAC-SHA3-384, HMAC-SHA3-512 @cindex HMAC-RIPE-MD-160 @cindex HMAC-MD2, HMAC-MD4, HMAC-MD5 @@ -3687,6 +3696,7 @@ provided by Libgcrypt. @cindex HMAC-Whirlpool @cindex HMAC-Stribog-256, HMAC-Stribog-512 @cindex HMAC-GOSTR-3411-94 + at cindex HMAC-BLAKE2s, HMAC-BLAKE2b @table @code @item GCRY_MAC_NONE This is not a real algorithm but used by some functions as an error @@ -3724,6 +3734,14 @@ algorithm. This is HMAC message authentication algorithm based on the SHA3-384 hash algorithm. 
+ at item GCRY_MAC_HMAC_SHA512_224 +This is HMAC message authentication algorithm based on the SHA-512/224 hash +algorithm. + + at item GCRY_MAC_HMAC_SHA512_256 +This is HMAC message authentication algorithm based on the SHA-512/256 hash +algorithm. + @item GCRY_MAC_HMAC_SHA1 This is HMAC message authentication algorithm based on the SHA-1 hash algorithm. @@ -3756,6 +3774,38 @@ algorithm described in GOST R 34.11-2012. This is HMAC message authentication algorithm based on the 512-bit hash algorithm described in GOST R 34.11-2012. + at item GCRY_MAC_HMAC_BLAKE2B_512 +This is HMAC message authentication algorithm based on the BLAKE2b-512 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_384 +This is HMAC message authentication algorithm based on the BLAKE2b-384 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_256 +This is HMAC message authentication algorithm based on the BLAKE2b-256 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2B_160 +This is HMAC message authentication algorithm based on the BLAKE2b-160 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_256 +This is HMAC message authentication algorithm based on the BLAKE2s-256 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_224 +This is HMAC message authentication algorithm based on the BLAKE2s-224 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_160 +This is HMAC message authentication algorithm based on the BLAKE2s-160 hash +algorithm. + + at item GCRY_MAC_HMAC_BLAKE2S_128 +This is HMAC message authentication algorithm based on the BLAKE2s-128 hash +algorithm. + @item GCRY_MAC_CMAC_AES This is CMAC (Cipher-based MAC) message authentication algorithm based on the AES block cipher algorithm. diff --git a/src/cipher.h b/src/cipher.h index 6e89be3da..5aac19f17 100644 --- a/src/cipher.h +++ b/src/cipher.h @@ -318,6 +318,8 @@ extern gcry_md_spec_t _gcry_digest_spec_sha224; extern gcry_md_spec_t _gcry_digest_spec_sha256; extern gcry_md_spec_t _gcry_digest_spec_sha384; extern gcry_md_spec_t _gcry_digest_spec_sha512; +extern gcry_md_spec_t _gcry_digest_spec_sha512_224; +extern gcry_md_spec_t _gcry_digest_spec_sha512_256; extern gcry_md_spec_t _gcry_digest_spec_sha3_224; extern gcry_md_spec_t _gcry_digest_spec_sha3_256; extern gcry_md_spec_t _gcry_digest_spec_sha3_512; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 36bbf200c..8346ce151 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1248,6 +1248,8 @@ enum gcry_md_algos GCRY_MD_BLAKE2S_160 = 324, GCRY_MD_BLAKE2S_128 = 325, GCRY_MD_SM3 = 326, + GCRY_MD_SHA512_256 = 327, + GCRY_MD_SHA512_224 = 328, }; /* Flags used with the open function. 
*/ @@ -1443,6 +1445,8 @@ enum gcry_mac_algos GCRY_MAC_HMAC_BLAKE2S_160 = 126, GCRY_MAC_HMAC_BLAKE2S_128 = 127, GCRY_MAC_HMAC_SM3 = 128, + GCRY_MAC_HMAC_SHA512_256 = 129, + GCRY_MAC_HMAC_SHA512_224 = 130, GCRY_MAC_CMAC_AES = 201, GCRY_MAC_CMAC_3DES = 202, diff --git a/tests/basic.c b/tests/basic.c index a28dc6997..3d6e8fc1e 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -8536,6 +8536,18 @@ check_digests (void) "\x74\xee\x78\xeb\x79\x1f\x94\x38\x5b\x73\xef\xf8\xfd\x5d\x74\xd8" "\x51\x36\xfe\x63\x52\xde\x07\x70\x95\xd6\x78\x2b\x7b\x46\x8a\x2c" "\x30\x0f\x48\x0c\x74\x43\x06\xdb\xa3\x8d\x64\x3d\xe9\xa1\xa7\x72" }, + { GCRY_MD_SHA512_256, "abc", + "\x53\x04\x8E\x26\x81\x94\x1E\xF9\x9B\x2E\x29\xB7\x6B\x4C\x7D\xAB" + "\xE4\xC2\xD0\xC6\x34\xFC\x6D\x46\xE0\xE2\xF1\x31\x07\xE7\xAF\x23" }, + { GCRY_MD_SHA512_256, "!", + "\x9a\x59\xa0\x52\x93\x01\x87\xa9\x70\x38\xca\xe6\x92\xf3\x07\x08" + "\xaa\x64\x91\x92\x3e\xf5\x19\x43\x94\xdc\x68\xd5\x6c\x74\xfb\x21" }, + { GCRY_MD_SHA512_224, "abc", + "\x46\x34\x27\x0F\x70\x7B\x6A\x54\xDA\xAE\x75\x30\x46\x08\x42\xE2" + "\x0E\x37\xED\x26\x5C\xEE\xE9\xA4\x3E\x89\x24\xAA" }, + { GCRY_MD_SHA512_224, "!", + "\x37\xab\x33\x1d\x76\xf0\xd3\x6d\xe4\x22\xbd\x0e\xde\xb2\x2a\x28" + "\xac\xcd\x48\x7b\x7a\x84\x53\xae\x96\x5d\xd2\x87" }, { GCRY_MD_SHA3_224, "abc", "\xe6\x42\x82\x4c\x3f\x8c\xf2\x4a\xd0\x92\x34\xee\x7d\x3c\x76\x6f" "\xc9\xa3\xa5\x16\x8d\x0c\x94\xad\x73\xb4\x6f\xdf" }, From jussi.kivilinna at iki.fi Fri Apr 5 19:25:44 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 5 Apr 2019 20:25:44 +0300 Subject: [PATCH 1/7] Add AVX2/BMI2 implementation of SHA1 Message-ID: <155448514419.14473.7900660968388649981.stgit@localhost.localdomain> * cipher/Makefile.am: Add 'sha1-avx2-bmi2-amd64.S'. * cipher/hash-common.h (MD_BLOCK_CTX_BUFFER_SIZE): New. (gcry_md_block_ctx): Change buffer length to MD_BLOCK_CTX_BUFFER_SIZE. * cipher/sha1-avx-amd64.S: Add missing .size for transform function. * cipher/sha1-ssse3-amd64.S: Add missing .size for transform function. * cipher/sha1-avx-bmi2-amd64.S: Add missing .size for transform function; Tweak implementation for small ~1% speed increase. * cipher/sha1-avx2-bmi2-amd64.S: New. * cipher/sha1.c (USE_AVX2, _gcry_sha1_transform_amd64_avx2_bmi2) (do_sha1_transform_amd64_avx2_bmi2): New. (sha1_init) [USE_AVX2]: Enable AVX2 implementation if supported by HW features. (sha1_final): Merge processing of two last blocks when extra block is needed. 
-- Benchmarks on Intel Haswell (4.0 Ghz): Before (AVX/BMI2): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.970 ns/B 983.2 MiB/s 3.88 c/B After (AVX/BMI2, ~1% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.960 ns/B 993.1 MiB/s 3.84 c/B After (AVX2/BMI2, ~9% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA1 | 0.890 ns/B 1071 MiB/s 3.56 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 1e67771e5..3f00ed4a8 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -103,8 +103,8 @@ EXTRA_libcipher_la_SOURCES = \ serpent.c serpent-sse2-amd64.S \ serpent-avx2-amd64.S serpent-armv7-neon.S \ sha1.c sha1-ssse3-amd64.S sha1-avx-amd64.S sha1-avx-bmi2-amd64.S \ - sha1-armv7-neon.S sha1-armv8-aarch32-ce.S sha1-armv8-aarch64-ce.S \ - sha1-intel-shaext.c \ + sha1-avx2-bmi2-amd64.S sha1-armv7-neon.S sha1-armv8-aarch32-ce.S \ + sha1-armv8-aarch64-ce.S sha1-intel-shaext.c \ sha256.c sha256-ssse3-amd64.S sha256-avx-amd64.S \ sha256-avx2-bmi2-amd64.S \ sha256-armv8-aarch32-ce.S sha256-armv8-aarch64-ce.S \ diff --git a/cipher/hash-common.h b/cipher/hash-common.h index 23f81ed71..0b3ade11e 100644 --- a/cipher/hash-common.h +++ b/cipher/hash-common.h @@ -42,9 +42,12 @@ typedef unsigned int (*_gcry_md_block_write_t) (void *c, # define MD_NBLOCKS_TYPE u32 #endif +/* SHA1 needs 2x64 bytes and SHA-512 needs 128 bytes. */ +#define MD_BLOCK_CTX_BUFFER_SIZE 128 + typedef struct gcry_md_block_ctx { - byte buf[MD_BLOCK_MAX_BLOCKSIZE]; + byte buf[MD_BLOCK_CTX_BUFFER_SIZE]; MD_NBLOCKS_TYPE nblocks; MD_NBLOCKS_TYPE nblocks_high; int count; diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 5f5b9c0e4..143e4066d 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -421,6 +421,8 @@ _gcry_sha1_transform_amd64_avx: .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_avx, + .-_gcry_sha1_transform_amd64_avx;) #endif #endif diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 8292c3afb..79ea24ef9 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -60,20 +60,15 @@ /* Constants */ .text -#define K1 0x5A827999 -#define K2 0x6ED9EBA1 -#define K3 0x8F1BBCDC -#define K4 0xCA62C1D6 .align 16 -.LK_XMM: -.LK1: .long K1, K1, K1, K1 -.LK2: .long K2, K2, K2, K2 -.LK3: .long K3, K3, K3, K3 -.LK4: .long K4, K4, K4, K4 - .Lbswap_shufb_ctl: .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f +.LK1: .long 0x5A827999 +.LK2: .long 0x6ED9EBA1 +.LK3: .long 0x8F1BBCDC +.LK4: .long 0xCA62C1D6 + /* Register macros */ @@ -82,14 +77,15 @@ #define ROLDSTACK %r10 #define RNBLKS %r11 -#define a %eax -#define b %ebx -#define c %ecx +#define a %esi +#define b %edi +#define c %ebp #define d %edx -#define e %edi +#define e %ecx +#define ne %ebx -#define RT0 %esi -#define RT1 %ebp +#define RT0 %eax +#define RT1 %r12d #define Wtmp0 %xmm0 #define Wtmp1 %xmm1 @@ -105,6 +101,11 @@ #define BSWAP_REG %xmm10 +#define K1 %xmm11 +#define K2 %xmm12 +#define K3 %xmm13 +#define K4 %xmm14 + /* Round function macros. 
*/ @@ -117,9 +118,9 @@ andl b, RT0; \ rorxl $2, b, b; \ addl RT1, e; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F2(a,b,c,d,e,i) \ movl c, RT0; \ @@ -127,22 +128,22 @@ xorl b, RT0; \ rorxl $2, b, b; \ xorl d, RT0; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F3(a,b,c,d,e,i) \ movl c, RT0; \ movl b, RT1; \ + addl WK(i), e; \ xorl b, RT0; \ andl c, RT1; \ andl d, RT0; \ addl RT1, e; \ - addl WK(i), e; \ rorxl $2, b, b; \ - leal (RT0,e), e; \ - rorxl $27, a, RT1; \ - addl RT1, e; + addl ne, a; \ + leal (RT0,e), ne; \ + rorxl $27, a, e; #define R_F4(a,b,c,d,e,i) R_F2(a,b,c,d,e,i) @@ -158,8 +159,8 @@ #define W_PRECALC_00_15_1(i, W, tmp0) \ vpshufb BSWAP_REG, tmp0, W; -#define W_PRECALC_00_15_2(i, W, tmp0) \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; +#define W_PRECALC_00_15_2(i, W, tmp0, K) \ + vpaddd K, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); @@ -181,10 +182,10 @@ vpsrld $30, tmp1, W; \ vpslld $2, tmp1, tmp1; -#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -199,9 +200,9 @@ vpsrld $30, W, tmp0; \ vpslld $2, W, W; -#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ vpor W, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd K, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); @@ -233,6 +234,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq %rsi, RDATA; pushq %rbx; pushq %rbp; + pushq %r12; movq %rsp, ROLDSTACK; @@ -245,25 +247,30 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl state_h2(RSTATE), c; movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; + xorl ne, ne; vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + vpbroadcastd .LK1 RIP, K1; + vpbroadcastd .LK2 RIP, K2; + vpbroadcastd .LK3 RIP, K3; + vpbroadcastd .LK4 RIP, K4; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); W_PRECALC_00_15_1(1, W0, Wtmp0); - W_PRECALC_00_15_2(2, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0, K1); W_PRECALC_00_15_3(3, W0, Wtmp0); W_PRECALC_00_15_0(4, W7, Wtmp0); W_PRECALC_00_15_1(5, W7, Wtmp0); - W_PRECALC_00_15_2(6, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0, K1); W_PRECALC_00_15_3(7, W7, Wtmp0); W_PRECALC_00_15_0(8, W6, Wtmp0); W_PRECALC_00_15_1(9, W6, Wtmp0); - W_PRECALC_00_15_2(10, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0, K1); W_PRECALC_00_15_3(11, W6, Wtmp0); W_PRECALC_00_15_0(12, W5, Wtmp0); W_PRECALC_00_15_1(13, W5, Wtmp0); - W_PRECALC_00_15_2(14, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0, K1); W_PRECALC_00_15_3(15, W5, Wtmp0); .align 8 @@ -274,69 +281,69 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( a, b, c, d, e, F1, 0 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); R( d, e, a, b, c, F1, 2 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); - R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( c, d, e, a, b, F1, 3 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); R( b, c, d, e, a, F1, 4 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 5 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); R( e, a, b, c, d, F1, 6 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); - R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( d, e, a, b, c, F1, 7 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); R( c, d, e, a, b, F1, 8 ); W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 9 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); R( a, b, c, d, e, F1, 10 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); - R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( e, a, b, c, d, F1, 11 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); R( d, e, a, b, c, F1, 12 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( c, d, e, a, b, F1, 13 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); R( b, c, d, e, a, F1, 14 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); - R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( a, b, c, d, e, F1, 15 ); W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); /* Transform 16-63 + Precalc 32-79. 
*/ R( e, a, b, c, d, F1, 16 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( d, e, a, b, c, F1, 17 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( c, d, e, a, b, F1, 18 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F1, 19 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); R( a, b, c, d, e, F2, 20 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( e, a, b, c, d, F2, 21 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( d, e, a, b, c, F2, 22 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 23 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); R( b, c, d, e, a, F2, 24 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( a, b, c, d, e, F2, 25 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( e, a, b, c, d, F2, 26 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F2, 27 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); R( c, d, e, a, b, F2, 28 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( b, c, d, e, a, F2, 29 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( a, b, c, d, e, F2, 30 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F2, 31 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); R( d, e, a, b, c, F2, 32 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( c, d, e, a, b, F2, 33 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); R( b, c, d, e, a, F2, 34 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); - R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( a, b, c, d, e, F2, 35 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); R( e, a, b, c, d, F2, 36 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( d, e, a, b, c, F2, 37 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); R( c, d, e, a, b, F2, 38 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); - R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( b, c, d, e, a, F2, 39 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); R( a, b, c, d, e, F3, 40 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( e, a, b, c, d, F3, 41 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); R( d, e, a, b, c, F3, 42 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); - R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( c, d, e, a, b, F3, 43 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); R( b, c, d, e, a, F3, 44 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( a, b, c, d, e, F3, 45 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); R( e, a, b, c, d, F3, 46 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); - R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, 
W7, W0, Wtmp0); + R( d, e, a, b, c, F3, 47 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); R( c, d, e, a, b, F3, 48 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( b, c, d, e, a, F3, 49 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); R( a, b, c, d, e, F3, 50 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); - R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F3, 51 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); R( d, e, a, b, c, F3, 52 ); W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( c, d, e, a, b, F3, 53 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); R( b, c, d, e, a, F3, 54 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); - R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F3, 55 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); R( e, a, b, c, d, F3, 56 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( d, e, a, b, c, F3, 57 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); R( c, d, e, a, b, F3, 58 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); - R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F3, 59 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); R( a, b, c, d, e, F4, 60 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( e, a, b, c, d, F4, 61 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); R( d, e, a, b, c, F4, 62 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); - R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F4, 63 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); decq RNBLKS; jz .Lend; @@ -344,21 +351,23 @@ _gcry_sha1_transform_amd64_avx_bmi2: /* Transform 64-79 + Precalc 0-15 of next block. */ R( b, c, d, e, a, F4, 64 ); W_PRECALC_00_15_0(0, W0, Wtmp0); R( a, b, c, d, e, F4, 65 ); W_PRECALC_00_15_1(1, W0, Wtmp0); - R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0); + R( e, a, b, c, d, F4, 66 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); R( d, e, a, b, c, F4, 67 ); W_PRECALC_00_15_3(3, W0, Wtmp0); R( c, d, e, a, b, F4, 68 ); W_PRECALC_00_15_0(4, W7, Wtmp0); R( b, c, d, e, a, F4, 69 ); W_PRECALC_00_15_1(5, W7, Wtmp0); - R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0); + R( a, b, c, d, e, F4, 70 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); R( e, a, b, c, d, F4, 71 ); W_PRECALC_00_15_3(7, W7, Wtmp0); R( d, e, a, b, c, F4, 72 ); W_PRECALC_00_15_0(8, W6, Wtmp0); R( c, d, e, a, b, F4, 73 ); W_PRECALC_00_15_1(9, W6, Wtmp0); - R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0); + R( b, c, d, e, a, F4, 74 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); R( a, b, c, d, e, F4, 75 ); W_PRECALC_00_15_3(11, W6, Wtmp0); R( e, a, b, c, d, F4, 76 ); W_PRECALC_00_15_0(12, W5, Wtmp0); R( d, e, a, b, c, F4, 77 ); W_PRECALC_00_15_1(13, W5, Wtmp0); R( c, d, e, a, b, F4, 78 ); - addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0); + addl state_h0(RSTATE), a; W_PRECALC_00_15_2(14, W5, Wtmp0, K1); R( b, c, d, e, a, F4, 79 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + addl ne, a; + xorl ne, ne; /* Update the chaining variables. 
*/ addl state_h3(RSTATE), d; @@ -396,6 +405,8 @@ _gcry_sha1_transform_amd64_avx_bmi2: R( c, d, e, a, b, F4, 78 ); addl state_h0(RSTATE), a; R( b, c, d, e, a, F4, 79 ); + addl ne, a; + xorl ne, ne; /* Update the chaining variables. */ addl state_h3(RSTATE), d; @@ -411,14 +422,17 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq ROLDSTACK, %rsp; + popq %r12; popq %rbp; popq %rbx; /* burn_stack */ - movl $(16*4 + 2*8 + 31), %eax; + movl $(16*4 + 3*8 + 31), %eax; .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, + .-_gcry_sha1_transform_amd64_avx_bmi2;) #endif #endif diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S new file mode 100644 index 000000000..c666290f2 --- /dev/null +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -0,0 +1,570 @@ +/* sha1-avx2-bmi2-amd64.S - Intel AVX2/BMI2 accelerated SHA-1 transform function + * Copyright (C) 2019 Jussi Kivilinna + * + * Based on sha1.c: + * Copyright (C) 1998, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +/* + * Intel SSSE3 accelerated SHA-1 implementation based on white paper: + * "Improving the Performance of the Secure Hash Algorithm (SHA-1)" + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 + */ + +#ifdef __x86_64__ +#include + +#if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ + defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ + defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ + defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) + +#ifdef __PIC__ +# define RIP (%rip) +#else +# define RIP +#endif + + +#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS +# define ELF(...) __VA_ARGS__ +#else +# define ELF(...) /*_*/ +#endif + + +/* Context structure */ + +#define state_h0 0 +#define state_h1 4 +#define state_h2 8 +#define state_h3 12 +#define state_h4 16 + + +/* Constants */ + +#define WK_STACK_WORDS (80 * 2) + +.text +.align 16 +.Lbswap_shufb_ctl: + .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f + +.LK1: .long 0x5A827999 +.LK2: .long 0x6ED9EBA1 +.LK3: .long 0x8F1BBCDC +.LK4: .long 0xCA62C1D6 + + +/* Register macros */ + +#define RSTATE %r8 +#define RDATA %r9 +#define ROLDSTACK %r10 +#define RNBLKS %r11 + +#define a %eax +#define b %ebx +#define c %ecx +#define d %edx +#define e %edi +#define ne %r12d + +#define RT0 %esi +#define RT1 %ebp + +#define Wtmp0 %ymm0 +#define Wtmp1 %ymm1 +#define Wtmp0x %xmm0 +#define Wtmp1x %xmm1 + +#define W0 %ymm2 +#define W1 %ymm3 +#define W2 %ymm4 +#define W3 %ymm5 +#define W4 %ymm6 +#define W5 %ymm7 +#define W6 %ymm8 +#define W7 %ymm9 + +#define BSWAP_REG %ymm10 + +#define K1 %ymm11 +#define K2 %ymm12 +#define K3 %ymm13 +#define K4 %ymm14 + + +/* Round function macros. 
*/ + +#define WK(i,block) ((block) * 16 + ((i) / 4) * 32 + ((i) % 4) * 4)(%rsp) +#define PRE_WK(i) ((i) * 4 * 2)(%rsp) + +#define R_F1(a,b,c,d,e,i,block) \ + movl c, RT0; \ + andn d, b, RT1; \ + addl WK(i,block), e; \ + andl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + addl RT1, e; \ + rorxl $27, a, ne; \ + addl RT0, e; + +#define R_F2(a,b,c,d,e,i,block) \ + addl WK(i,block), e; \ + movl c, RT0; \ + xorl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + xorl d, RT0; \ + addl RT0, e; \ + rorxl $27, a, ne; + +#define R_F3(a,b,c,d,e,i,block) \ + movl c, RT0; \ + addl WK(i,block), e; \ + movl b, RT1; \ + xorl b, RT0; \ + leal (a,ne), a; \ + rorxl $2, b, b; \ + andl c, RT1; \ + addl RT1, e; \ + andl d, RT0; \ + rorxl $27, a, ne; \ + addl RT0, e; + +#define R_F4(a,b,c,d,e,i,block) R_F2(a,b,c,d,e,i,block) + +#define R(a,b,c,d,e,f,i,block) \ + R_##f(a,b,c,d,e,i,block) + + +/* Input expansion macros. */ + +#define W_PRECALC_00_15_0(i, W, tmp0) \ + vmovdqu (4*(i))(RDATA), tmp0##x; \ + vinserti128 $1, (4*(i) + 64)(RDATA), tmp0, tmp0; + +#define W_PRECALC_00_15_1(i, W, tmp0) \ + vpshufb BSWAP_REG, tmp0, W; + +#define W_PRECALC_00_15_2(i, W, tmp0, K) \ + vpaddd K, W, tmp0; + +#define W_PRECALC_00_15_3(i, W, tmp0) \ + vmovdqa tmp0, PRE_WK((i)&~3); + +#define W_PRECALC_16_31_0(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpalignr $8, W_m16, W_m12, W; \ + vpsrldq $4, W_m04, tmp0; \ + vpxor W_m08, W, W; + +#define W_PRECALC_16_31_1(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpxor W_m16, tmp0, tmp0; \ + vpxor tmp0, W, W; \ + vpslld $1, W, tmp0; \ + vpslldq $12, W, tmp1; \ + vpsrld $31, W, W; + +#define W_PRECALC_16_31_2(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ + vpor W, tmp0, tmp0; \ + vpsrld $30, tmp1, W; \ + vpslld $2, tmp1, tmp1; + +#define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1, K) \ + vpxor W, tmp0, tmp0; \ + vpxor tmp1, tmp0, W; \ + vpaddd K, W, tmp0; \ + vmovdqa tmp0, PRE_WK((i)&~3); + +#define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m28, W, W; \ + vpalignr $8, W_m08, W_m04, tmp0; + +#define W_PRECALC_32_79_1(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpxor W_m16, W, W; \ + vpxor tmp0, W, W; + +#define W_PRECALC_32_79_2(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ + vpsrld $30, W, tmp0; \ + vpslld $2, W, W; + +#define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0, K) \ + vpor W, tmp0, W; \ + vpaddd K, W, tmp0; \ + vmovdqa tmp0, PRE_WK((i)&~3); + + +/* + * Transform 2*nblks*64 bytes (2*nblks*16 32-bit words) at DATA. + * + * unsigned int + * _gcry_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, + * size_t nblks) + */ +.globl _gcry_sha1_transform_amd64_avx2_bmi2 +ELF(.type _gcry_sha1_transform_amd64_avx2_bmi2, at function) +.align 16 +_gcry_sha1_transform_amd64_avx2_bmi2: + /* input: + * %rdi: ctx, CTX + * %rsi: data (64*nblks bytes) + * %rdx: nblks (multiple of 2, larger than 0) + */ + + vzeroupper; + + movq %rdx, RNBLKS; + movq %rdi, RSTATE; + movq %rsi, RDATA; + pushq %rbx; + pushq %rbp; + pushq %r12; + + movq %rsp, ROLDSTACK; + + subq $(WK_STACK_WORDS*4), %rsp; + andq $(~63), %rsp; + + /* Get the values of the chaining variables. 
*/ + movl state_h0(RSTATE), a; + movl state_h1(RSTATE), b; + movl state_h2(RSTATE), c; + movl state_h3(RSTATE), d; + movl state_h4(RSTATE), e; + xorl ne, ne; + + vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG; + vpbroadcastd .LK1 RIP, K1; + vpbroadcastd .LK2 RIP, K2; + vpbroadcastd .LK3 RIP, K3; + vpbroadcastd .LK4 RIP, K4; + + /* Precalc 0-31 for block 1 & 2. */ + W_PRECALC_00_15_0(0, W0, Wtmp0); + W_PRECALC_00_15_1(1, W0, Wtmp0); + W_PRECALC_00_15_2(2, W0, Wtmp0, K1); + W_PRECALC_00_15_3(3, W0, Wtmp0); + W_PRECALC_00_15_0(4, W7, Wtmp0); + W_PRECALC_00_15_1(5, W7, Wtmp0); + W_PRECALC_00_15_2(6, W7, Wtmp0, K1); + W_PRECALC_00_15_3(7, W7, Wtmp0); + W_PRECALC_00_15_0(8, W6, Wtmp0); + W_PRECALC_00_15_1(9, W6, Wtmp0); + W_PRECALC_00_15_2(10, W6, Wtmp0, K1); + W_PRECALC_00_15_3(11, W6, Wtmp0); + W_PRECALC_00_15_0(12, W5, Wtmp0); + W_PRECALC_00_15_1(13, W5, Wtmp0); + W_PRECALC_00_15_2(14, W5, Wtmp0, K1); + W_PRECALC_00_15_3(15, W5, Wtmp0); + W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); + W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); + W_PRECALC_16_31_0(24, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); + W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); + +.align 8 +.Loop: + addq $(2 * 64), RDATA; + + /* Transform 0-15 for block 1 + Precalc 32-47 for block 1 & 2. 
*/ + R( a, b, c, d, e, F1, 0, 0 ); W_PRECALC_32_79_0(32, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( e, a, b, c, d, F1, 1, 0 ); W_PRECALC_32_79_1(33, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( d, e, a, b, c, F1, 2, 0 ); W_PRECALC_32_79_2(34, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F1, 3, 0 ); W_PRECALC_32_79_3(35, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K2); + R( b, c, d, e, a, F1, 4, 0 ); W_PRECALC_32_79_0(36, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( a, b, c, d, e, F1, 5, 0 ); W_PRECALC_32_79_1(37, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( e, a, b, c, d, F1, 6, 0 ); W_PRECALC_32_79_2(38, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F1, 7, 0 ); W_PRECALC_32_79_3(39, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K2); + R( c, d, e, a, b, F1, 8, 0 ); W_PRECALC_32_79_0(40, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( b, c, d, e, a, F1, 9, 0 ); W_PRECALC_32_79_1(41, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( a, b, c, d, e, F1, 10, 0 ); W_PRECALC_32_79_2(42, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F1, 11, 0 ); W_PRECALC_32_79_3(43, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K3); + R( d, e, a, b, c, F1, 12, 0 ); W_PRECALC_32_79_0(44, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( c, d, e, a, b, F1, 13, 0 ); W_PRECALC_32_79_1(45, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( b, c, d, e, a, F1, 14, 0 ); W_PRECALC_32_79_2(46, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F1, 15, 0 ); W_PRECALC_32_79_3(47, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K3); + + /* Transform 16-47 for block 1 + Precalc 48-79 for block 1 & 2. */ + R( e, a, b, c, d, F1, 16, 0 ); W_PRECALC_32_79_0(48, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( d, e, a, b, c, F1, 17, 0 ); W_PRECALC_32_79_1(49, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( c, d, e, a, b, F1, 18, 0 ); W_PRECALC_32_79_2(50, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0); + R( b, c, d, e, a, F1, 19, 0 ); W_PRECALC_32_79_3(51, W4, W5, W6, W7, W0, W1, W2, W3, Wtmp0, K3); + R( a, b, c, d, e, F2, 20, 0 ); W_PRECALC_32_79_0(52, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( e, a, b, c, d, F2, 21, 0 ); W_PRECALC_32_79_1(53, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( d, e, a, b, c, F2, 22, 0 ); W_PRECALC_32_79_2(54, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0); + R( c, d, e, a, b, F2, 23, 0 ); W_PRECALC_32_79_3(55, W3, W4, W5, W6, W7, W0, W1, W2, Wtmp0, K3); + R( b, c, d, e, a, F2, 24, 0 ); W_PRECALC_32_79_0(56, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( a, b, c, d, e, F2, 25, 0 ); W_PRECALC_32_79_1(57, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( e, a, b, c, d, F2, 26, 0 ); W_PRECALC_32_79_2(58, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0); + R( d, e, a, b, c, F2, 27, 0 ); W_PRECALC_32_79_3(59, W2, W3, W4, W5, W6, W7, W0, W1, Wtmp0, K3); + R( c, d, e, a, b, F2, 28, 0 ); W_PRECALC_32_79_0(60, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( b, c, d, e, a, F2, 29, 0 ); W_PRECALC_32_79_1(61, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( a, b, c, d, e, F2, 30, 0 ); W_PRECALC_32_79_2(62, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0); + R( e, a, b, c, d, F2, 31, 0 ); W_PRECALC_32_79_3(63, W1, W2, W3, W4, W5, W6, W7, W0, Wtmp0, K4); + R( d, e, a, b, c, F2, 32, 0 ); W_PRECALC_32_79_0(64, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( c, d, e, a, b, F2, 33, 0 ); W_PRECALC_32_79_1(65, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( b, c, d, e, a, F2, 34, 0 ); W_PRECALC_32_79_2(66, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0); + R( a, b, c, d, e, F2, 35, 0 ); W_PRECALC_32_79_3(67, W0, W1, W2, W3, W4, W5, W6, W7, Wtmp0, K4); + R( e, a, b, c, d, F2, 36, 0 ); 
W_PRECALC_32_79_0(68, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( d, e, a, b, c, F2, 37, 0 ); W_PRECALC_32_79_1(69, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( c, d, e, a, b, F2, 38, 0 ); W_PRECALC_32_79_2(70, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0); + R( b, c, d, e, a, F2, 39, 0 ); W_PRECALC_32_79_3(71, W7, W0, W1, W2, W3, W4, W5, W6, Wtmp0, K4); + R( a, b, c, d, e, F3, 40, 0 ); W_PRECALC_32_79_0(72, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( e, a, b, c, d, F3, 41, 0 ); W_PRECALC_32_79_1(73, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( d, e, a, b, c, F3, 42, 0 ); W_PRECALC_32_79_2(74, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0); + R( c, d, e, a, b, F3, 43, 0 ); W_PRECALC_32_79_3(75, W6, W7, W0, W1, W2, W3, W4, W5, Wtmp0, K4); + R( b, c, d, e, a, F3, 44, 0 ); W_PRECALC_32_79_0(76, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( a, b, c, d, e, F3, 45, 0 ); W_PRECALC_32_79_1(77, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( e, a, b, c, d, F3, 46, 0 ); W_PRECALC_32_79_2(78, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0); + R( d, e, a, b, c, F3, 47, 0 ); W_PRECALC_32_79_3(79, W5, W6, W7, W0, W1, W2, W3, W4, Wtmp0, K4); + + /* Transform 48-79 for block 1. */ + R( c, d, e, a, b, F3, 48, 0 ); + R( b, c, d, e, a, F3, 49, 0 ); + R( a, b, c, d, e, F3, 50, 0 ); + R( e, a, b, c, d, F3, 51, 0 ); + R( d, e, a, b, c, F3, 52, 0 ); + R( c, d, e, a, b, F3, 53, 0 ); + R( b, c, d, e, a, F3, 54, 0 ); + R( a, b, c, d, e, F3, 55, 0 ); + R( e, a, b, c, d, F3, 56, 0 ); + R( d, e, a, b, c, F3, 57, 0 ); + R( c, d, e, a, b, F3, 58, 0 ); + R( b, c, d, e, a, F3, 59, 0 ); + R( a, b, c, d, e, F4, 60, 0 ); + R( e, a, b, c, d, F4, 61, 0 ); + R( d, e, a, b, c, F4, 62, 0 ); + R( c, d, e, a, b, F4, 63, 0 ); + R( b, c, d, e, a, F4, 64, 0 ); + R( a, b, c, d, e, F4, 65, 0 ); + R( e, a, b, c, d, F4, 66, 0 ); + R( d, e, a, b, c, F4, 67, 0 ); + R( c, d, e, a, b, F4, 68, 0 ); + R( b, c, d, e, a, F4, 69, 0 ); + R( a, b, c, d, e, F4, 70, 0 ); + R( e, a, b, c, d, F4, 71, 0 ); + R( d, e, a, b, c, F4, 72, 0 ); + R( c, d, e, a, b, F4, 73, 0 ); + R( b, c, d, e, a, F4, 74, 0 ); + R( a, b, c, d, e, F4, 75, 0 ); + R( e, a, b, c, d, F4, 76, 0 ); + R( d, e, a, b, c, F4, 77, 0 ); + R( c, d, e, a, b, F4, 78, 0 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79, 0 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + /* Transform 0-47 for block 2. 
*/ + R( a, b, c, d, e, F1, 0, 1 ); + R( e, a, b, c, d, F1, 1, 1 ); + R( d, e, a, b, c, F1, 2, 1 ); + R( c, d, e, a, b, F1, 3, 1 ); + R( b, c, d, e, a, F1, 4, 1 ); + R( a, b, c, d, e, F1, 5, 1 ); + R( e, a, b, c, d, F1, 6, 1 ); + R( d, e, a, b, c, F1, 7, 1 ); + R( c, d, e, a, b, F1, 8, 1 ); + R( b, c, d, e, a, F1, 9, 1 ); + R( a, b, c, d, e, F1, 10, 1 ); + R( e, a, b, c, d, F1, 11, 1 ); + R( d, e, a, b, c, F1, 12, 1 ); + R( c, d, e, a, b, F1, 13, 1 ); + R( b, c, d, e, a, F1, 14, 1 ); + R( a, b, c, d, e, F1, 15, 1 ); + R( e, a, b, c, d, F1, 16, 1 ); + R( d, e, a, b, c, F1, 17, 1 ); + R( c, d, e, a, b, F1, 18, 1 ); + R( b, c, d, e, a, F1, 19, 1 ); + R( a, b, c, d, e, F2, 20, 1 ); + R( e, a, b, c, d, F2, 21, 1 ); + R( d, e, a, b, c, F2, 22, 1 ); + R( c, d, e, a, b, F2, 23, 1 ); + R( b, c, d, e, a, F2, 24, 1 ); + R( a, b, c, d, e, F2, 25, 1 ); + R( e, a, b, c, d, F2, 26, 1 ); + R( d, e, a, b, c, F2, 27, 1 ); + R( c, d, e, a, b, F2, 28, 1 ); + R( b, c, d, e, a, F2, 29, 1 ); + R( a, b, c, d, e, F2, 30, 1 ); + R( e, a, b, c, d, F2, 31, 1 ); + R( d, e, a, b, c, F2, 32, 1 ); + R( c, d, e, a, b, F2, 33, 1 ); + R( b, c, d, e, a, F2, 34, 1 ); + R( a, b, c, d, e, F2, 35, 1 ); + R( e, a, b, c, d, F2, 36, 1 ); + R( d, e, a, b, c, F2, 37, 1 ); + R( c, d, e, a, b, F2, 38, 1 ); + R( b, c, d, e, a, F2, 39, 1 ); + R( a, b, c, d, e, F3, 40, 1 ); + R( e, a, b, c, d, F3, 41, 1 ); + R( d, e, a, b, c, F3, 42, 1 ); + R( c, d, e, a, b, F3, 43, 1 ); + R( b, c, d, e, a, F3, 44, 1 ); + R( a, b, c, d, e, F3, 45, 1 ); + R( e, a, b, c, d, F3, 46, 1 ); + R( d, e, a, b, c, F3, 47, 1 ); + + addq $-2, RNBLKS; + jz .Lend; + + /* Transform 48-79 for block 2 + Precalc 0-31 for next two blocks. */ + R( c, d, e, a, b, F3, 48, 1 ); W_PRECALC_00_15_0(0, W0, Wtmp0); + R( b, c, d, e, a, F3, 49, 1 ); W_PRECALC_00_15_1(1, W0, Wtmp0); + R( a, b, c, d, e, F3, 50, 1 ); W_PRECALC_00_15_2(2, W0, Wtmp0, K1); + R( e, a, b, c, d, F3, 51, 1 ); W_PRECALC_00_15_3(3, W0, Wtmp0); + R( d, e, a, b, c, F3, 52, 1 ); W_PRECALC_00_15_0(4, W7, Wtmp0); + R( c, d, e, a, b, F3, 53, 1 ); W_PRECALC_00_15_1(5, W7, Wtmp0); + R( b, c, d, e, a, F3, 54, 1 ); W_PRECALC_00_15_2(6, W7, Wtmp0, K1); + R( a, b, c, d, e, F3, 55, 1 ); W_PRECALC_00_15_3(7, W7, Wtmp0); + R( e, a, b, c, d, F3, 56, 1 ); W_PRECALC_00_15_0(8, W6, Wtmp0); + R( d, e, a, b, c, F3, 57, 1 ); W_PRECALC_00_15_1(9, W6, Wtmp0); + R( c, d, e, a, b, F3, 58, 1 ); W_PRECALC_00_15_2(10, W6, Wtmp0, K1); + R( b, c, d, e, a, F3, 59, 1 ); W_PRECALC_00_15_3(11, W6, Wtmp0); + R( a, b, c, d, e, F4, 60, 1 ); W_PRECALC_00_15_0(12, W5, Wtmp0); + R( e, a, b, c, d, F4, 61, 1 ); W_PRECALC_00_15_1(13, W5, Wtmp0); + R( d, e, a, b, c, F4, 62, 1 ); W_PRECALC_00_15_2(14, W5, Wtmp0, K1); + R( c, d, e, a, b, F4, 63, 1 ); W_PRECALC_00_15_3(15, W5, Wtmp0); + R( b, c, d, e, a, F4, 64, 1 ); W_PRECALC_16_31_0(16, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 65, 1 ); W_PRECALC_16_31_1(17, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( e, a, b, c, d, F4, 66, 1 ); W_PRECALC_16_31_2(18, W4, W5, W6, W7, W0, Wtmp0, Wtmp1); + R( d, e, a, b, c, F4, 67, 1 ); W_PRECALC_16_31_3(19, W4, W5, W6, W7, W0, Wtmp0, Wtmp1, K1); + R( c, d, e, a, b, F4, 68, 1 ); W_PRECALC_16_31_0(20, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( b, c, d, e, a, F4, 69, 1 ); W_PRECALC_16_31_1(21, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 70, 1 ); W_PRECALC_16_31_2(22, W3, W4, W5, W6, W7, Wtmp0, Wtmp1); + R( e, a, b, c, d, F4, 71, 1 ); W_PRECALC_16_31_3(23, W3, W4, W5, W6, W7, Wtmp0, Wtmp1, K2); + R( d, e, a, b, c, F4, 72, 1 ); W_PRECALC_16_31_0(24, W2, W3, 
W4, W5, W6, Wtmp0, Wtmp1); + R( c, d, e, a, b, F4, 73, 1 ); W_PRECALC_16_31_1(25, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( b, c, d, e, a, F4, 74, 1 ); W_PRECALC_16_31_2(26, W2, W3, W4, W5, W6, Wtmp0, Wtmp1); + R( a, b, c, d, e, F4, 75, 1 ); W_PRECALC_16_31_3(27, W2, W3, W4, W5, W6, Wtmp0, Wtmp1, K2); + R( e, a, b, c, d, F4, 76, 1 ); W_PRECALC_16_31_0(28, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( d, e, a, b, c, F4, 77, 1 ); W_PRECALC_16_31_1(29, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + R( c, d, e, a, b, F4, 78, 1 ); W_PRECALC_16_31_2(30, W1, W2, W3, W4, W5, Wtmp0, Wtmp1); + addl state_h0(RSTATE), a; W_PRECALC_16_31_3(31, W1, W2, W3, W4, W5, Wtmp0, Wtmp1, K2); + R( b, c, d, e, a, F4, 79, 1 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + jmp .Loop; + +.align 16 +.Lend: + vzeroall; + + /* Transform 48-79 for block 2. */ + R( c, d, e, a, b, F3, 48, 1 ); + R( b, c, d, e, a, F3, 49, 1 ); + R( a, b, c, d, e, F3, 50, 1 ); + R( e, a, b, c, d, F3, 51, 1 ); + R( d, e, a, b, c, F3, 52, 1 ); + R( c, d, e, a, b, F3, 53, 1 ); + R( b, c, d, e, a, F3, 54, 1 ); + R( a, b, c, d, e, F3, 55, 1 ); + R( e, a, b, c, d, F3, 56, 1 ); + R( d, e, a, b, c, F3, 57, 1 ); + R( c, d, e, a, b, F3, 58, 1 ); + R( b, c, d, e, a, F3, 59, 1 ); + R( a, b, c, d, e, F4, 60, 1 ); + R( e, a, b, c, d, F4, 61, 1 ); + R( d, e, a, b, c, F4, 62, 1 ); + R( c, d, e, a, b, F4, 63, 1 ); + R( b, c, d, e, a, F4, 64, 1 ); + R( a, b, c, d, e, F4, 65, 1 ); + R( e, a, b, c, d, F4, 66, 1 ); + R( d, e, a, b, c, F4, 67, 1 ); + R( c, d, e, a, b, F4, 68, 1 ); + R( b, c, d, e, a, F4, 69, 1 ); + R( a, b, c, d, e, F4, 70, 1 ); + R( e, a, b, c, d, F4, 71, 1 ); + R( d, e, a, b, c, F4, 72, 1 ); + R( c, d, e, a, b, F4, 73, 1 ); + R( b, c, d, e, a, F4, 74, 1 ); + R( a, b, c, d, e, F4, 75, 1 ); + R( e, a, b, c, d, F4, 76, 1 ); + R( d, e, a, b, c, F4, 77, 1 ); + R( c, d, e, a, b, F4, 78, 1 ); + addl state_h0(RSTATE), a; + R( b, c, d, e, a, F4, 79, 1 ); + addl ne, a; + xorl ne, ne; + + /* Update the chaining variables. */ + addl state_h3(RSTATE), d; + addl state_h2(RSTATE), c; + addl state_h1(RSTATE), b; + addl state_h4(RSTATE), e; + + movl d, state_h3(RSTATE); + movl c, state_h2(RSTATE); + movl b, state_h1(RSTATE); + movl a, state_h0(RSTATE); + movl e, state_h4(RSTATE); + + movq ROLDSTACK, %rsp; + + popq %r12; + popq %rbp; + popq %rbx; + + /* burn_stack */ + movl $((WK_STACK_WORDS)*4 + 3*8 + 31), %eax; + + ret; +ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, + .-_gcry_sha1_transform_amd64_avx2_bmi2;) + +#endif +#endif diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index 2b4394765..421bebeca 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -429,6 +429,8 @@ _gcry_sha1_transform_amd64_ssse3: .Lret: ret; +ELF(.size _gcry_sha1_transform_amd64_ssse3, + .-_gcry_sha1_transform_amd64_ssse3;) #endif #endif diff --git a/cipher/sha1.c b/cipher/sha1.c index 76c486c7e..affabfb07 100644 --- a/cipher/sha1.c +++ b/cipher/sha1.c @@ -68,6 +68,12 @@ # define USE_BMI2 1 #endif +/* USE_AVX2 indicates whether to compile with Intel AVX2/BMI2 code. */ +#undef USE_AVX2 +#if defined(USE_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX2) +# define USE_AVX2 1 +#endif + /* USE_SHAEXT indicates whether to compile with Intel SHA Extension code. 
*/ #undef USE_SHAEXT #if defined(HAVE_GCC_INLINE_ASM_SHAEXT) && \ @@ -171,7 +177,37 @@ do_sha1_transform_amd64_avx_bmi2 (void *ctx, const unsigned char *data, return _gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, nblks) + ASM_EXTRA_STACK; } -#endif + +#ifdef USE_AVX2 +unsigned int +_gcry_sha1_transform_amd64_avx2_bmi2 (void *state, const unsigned char *data, + size_t nblks) ASM_FUNC_ABI; + +static unsigned int +do_sha1_transform_amd64_avx2_bmi2 (void *ctx, const unsigned char *data, + size_t nblks) +{ + SHA1_CONTEXT *hd = ctx; + + /* AVX2/BMI2 function only handles pair of blocks so nblks needs to be + * multiple of 2 and function does not handle zero nblks. Use AVX/BMI2 + * code to handle these cases. */ + + if (nblks <= 1) + return do_sha1_transform_amd64_avx_bmi2 (ctx, data, nblks); + + if (nblks & 1) + { + (void)_gcry_sha1_transform_amd64_avx_bmi2 (&hd->h0, data, 1); + nblks--; + data += 64; + } + + return _gcry_sha1_transform_amd64_avx2_bmi2 (&hd->h0, data, nblks) + + ASM_EXTRA_STACK; +} +#endif /* USE_AVX2 */ +#endif /* USE_BMI2 */ #ifdef USE_SHAEXT /* Does not need ASM_FUNC_ABI */ @@ -258,6 +294,11 @@ sha1_init (void *context, unsigned int flags) if ((features & HWF_INTEL_AVX) && (features & HWF_INTEL_BMI2)) hd->bctx.bwrite = do_sha1_transform_amd64_avx_bmi2; #endif +#ifdef USE_AVX2 + if ((features & HWF_INTEL_AVX2) && (features & HWF_INTEL_AVX) && + (features & HWF_INTEL_BMI2)) + hd->bctx.bwrite = do_sha1_transform_amd64_avx2_bmi2; +#endif #ifdef USE_SHAEXT if ((features & HWF_INTEL_SHAEXT) && (features & HWF_INTEL_SSE4_1)) hd->bctx.bwrite = do_sha1_transform_intel_shaext; @@ -494,22 +535,27 @@ sha1_final(void *context) if( hd->bctx.count < 56 ) /* enough room */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad */ - while( hd->bctx.count < 56 ) - hd->bctx.buf[hd->bctx.count++] = 0; /* pad */ + if (hd->bctx.count < 56) + memset (&hd->bctx.buf[hd->bctx.count], 0, 56 - hd->bctx.count); + hd->bctx.count = 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 56, msb); + buf_put_be32(hd->bctx.buf + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); } else /* need one extra block */ { hd->bctx.buf[hd->bctx.count++] = 0x80; /* pad character */ - while( hd->bctx.count < 64 ) - hd->bctx.buf[hd->bctx.count++] = 0; - _gcry_md_block_write(hd, NULL, 0); /* flush */; - memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */ + /* fill pad and next block with zeroes */ + memset (&hd->bctx.buf[hd->bctx.count], 0, 64 - hd->bctx.count + 56); + hd->bctx.count = 64 + 56; + + /* append the 64 bit count */ + buf_put_be32(hd->bctx.buf + 64 + 56, msb); + buf_put_be32(hd->bctx.buf + 64 + 60, lsb); + burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 2 ); } - /* append the 64 bit count */ - buf_put_be32(hd->bctx.buf + 56, msb); - buf_put_be32(hd->bctx.buf + 60, lsb); - burn = (*hd->bctx.bwrite) ( hd, hd->bctx.buf, 1 ); - _gcry_burn_stack (burn); p = hd->bctx.buf; #define X(a) do { buf_put_be32(p, hd->h##a); p += 4; } while(0) @@ -520,6 +566,7 @@ sha1_final(void *context) X(4); #undef X + _gcry_burn_stack (burn); } static unsigned char * diff --git a/configure.ac b/configure.ac index bb3c666f4..0a931f952 100644 --- a/configure.ac +++ b/configure.ac @@ -2541,6 +2541,7 @@ case "${host}" in GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-ssse3-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-amd64.lo" GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx-bmi2-amd64.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sha1-avx2-bmi2-amd64.lo" ;; arm*-*-*) # Build with the assembly implementation From jussi.kivilinna 
at iki.fi Sun Apr 7 22:07:28 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sun, 7 Apr 2019 23:07:28 +0300 Subject: [PATCH] Tune SHA-512/AVX2 and SHA-256/AVX2 implementations Message-ID: <155466764826.17126.15898822326526391088.stgit@localhost.localdomain> * cipher/sha256-avx2-bmi2-amd64.S (ONE_ROUND_PART1, ONE_ROUND_PART2) (ONE_ROUND): New round function. (FOUR_ROUNDS_AND_SCHED, FOUR_ROUNDS): Use new round function. (_gcry_sha256_transform_amd64_avx2): Exit early if number of blocks is zero; Writing XFER to stack earlier and handle XREF writing in FOUR_ROUNDS_AND_SCHED. * cipher/sha512-avx2-bmi2-amd64.S (MASK_YMM_LO, MASK_YMM_LOx): New. (ONE_ROUND_PART1, ONE_ROUND_PART2, ONE_ROUND): New round function. (FOUR_ROUNDS_AND_SCHED, FOUR_ROUNDS): Use new round function. (_gcry_sha512_transform_amd64_avx2): Writing XFER to stack earlier and handle XREF writing in FOUR_ROUNDS_AND_SCHED. -- Benchmark on Intel Haswell (4.0Ghz): Before: | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 2.17 ns/B 439.0 MiB/s 8.68 c/B SHA512 | 1.56 ns/B 612.5 MiB/s 6.23 c/B After (~4-6% faster): | nanosecs/byte mebibytes/sec cycles/byte SHA256 | 2.05 ns/B 465.9 MiB/s 8.18 c/B SHA512 | 1.49 ns/B 640.3 MiB/s 5.95 c/B Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 598f93821..5fc402cd1 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -176,379 +176,128 @@ b = a a = TMP_ .endm -.macro FOUR_ROUNDS_AND_SCHED XFER -/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - - add h, [\XFER+0*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ +.macro ONE_ROUND_PART1 XFER + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); + * d += h; + * h += Sum0 (a) + Maj (a, b, c); + * + * Ch(x, y, z) => ((x & y) + (~x & z)) + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) + */ + + mov y3, e + add h, [\XFER] + and y3, f + rorx y0, e, 25 + rorx y1, e, 11 + lea h, [h + y3] + andn y3, e, g + rorx T1, a, 13 + xor y0, y1 + lea h, [h + y3] +.endm +.macro ONE_ROUND_PART2 + rorx y2, a, 22 + rorx y1, e, 6 + mov y3, a + xor T1, y2 + xor y0, y1 + xor y3, b + lea h, [h + y0] + mov y0, a + rorx y2, a, 2 + add d, h + and y3, c + xor T1, y2 + lea h, [h + y3] + lea h, [h + T1] + and y0, b + lea h, [h + y0] +.endm - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ +.macro ONE_ROUND XFER + ONE_ROUND_PART1 \XFER + ONE_ROUND_PART2 +.endm - and y2, e /* y2 = (f^g)&e ; CH */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - add d, h /* d = k + w + h + d ; -- */ +.macro FOUR_ROUNDS_AND_SCHED XFER, XFEROUT +/* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - and y3, b /* y3 = (a|c)&b ; MAJA */ + vpalignr XTMP0, X3, X2, 4 /* XTMP0 = W[-7] */ + vpaddd XTMP0, XTMP0, X0 /* XTMP0 = W[-7] + W[-16]; y1 = (e >> 6); S1 */ vpalignr XTMP1, X1, X0, 4 /* XTMP1 = W[-15] */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ vpsrld XTMP2, XTMP1, 7 - xor y1, T1 /* y1 = (a>>22) ^ 
(a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - - add y2, y0 /* y2 = S1 + CH ; -- */ vpslld XTMP3, XTMP1, (32-7) - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ vpor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 */ - vpsrld XTMP2, XTMP1,18 - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - -ROTATE_ARGS + ONE_ROUND 0*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - add h, [\XFER+1*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - vpsrld XTMP4, XTMP1, 3 /* XTMP4 = W[-15] >> 3 */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - vpslld XTMP1, XTMP1, (32-18) - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - vpxor XTMP3, XTMP3, XTMP1 - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpxor XTMP3, XTMP3, XTMP2 /* XTMP3 = W[-15] ror 7 ^ W[-15] ror 18 */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - vpxor XTMP1, XTMP3, XTMP4 /* XTMP1 = s0 */ vpshufd XTMP2, X3, 0b11111010 /* XTMP2 = W[-2] {BBAA} */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - vpaddd XTMP0, XTMP0, XTMP1 /* XTMP0 = W[-16] + W[-7] + s0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - vpsrld XTMP4, XTMP2, 10 /* XTMP4 = W[-2] >> 10 {BBAA} */ - -ROTATE_ARGS + ONE_ROUND 1*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - add h, [\XFER+2*4] /* h = k + w + h ; -- */ - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xBxA} */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - xor y2, g /* y2 = f^g ; CH */ - - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xBxA} */ - and y2, e /* y2 = (f^g)&e ; CH */ - - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ vpxor XTMP2, XTMP2, XTMP3 - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ vpxor XTMP4, XTMP4, XTMP2 /* XTMP4 = s1 {xBxA} */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpshufb XTMP4, XTMP4, SHUF_00BA /* XTMP4 = s1 {00BA} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ vpaddd XTMP0, XTMP0, XTMP4 /* XTMP0 = {..., ..., W[1], W[0]} */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ vpshufd XTMP2, 
XTMP0, 0b1010000 /* XTMP2 = W[-2] {DDCC} */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - - -ROTATE_ARGS + ONE_ROUND 2*4+\XFER + ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - add h, [\XFER+3*4] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - vpsrld XTMP5, XTMP2, 10 /* XTMP5 = W[-2] >> 10 {DDCC} */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - vpsrlq XTMP3, XTMP2, 19 /* XTMP3 = W[-2] ror 19 {xDxC} */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - vpsrlq XTMP2, XTMP2, 17 /* XTMP2 = W[-2] ror 17 {xDxC} */ - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - vpxor XTMP2, XTMP2, XTMP3 - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - add y2, y0 /* y2 = S1 + CH ; -- */ - vpxor XTMP5, XTMP5, XTMP2 /* XTMP5 = s1 {xDxC} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ vpshufb XTMP5, XTMP5, SHUF_DC00 /* XTMP5 = s1 {DC00} */ - vpaddd X0, XTMP5, XTMP0 /* X0 = {W[3], W[2], W[1], W[0]} */ - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - - add h, y1 /* h = k + w + h + S0 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ + vpaddd XFER, X0, [TBL + \XFEROUT] -ROTATE_ARGS -rotate_Xs + ONE_ROUND_PART1 3*4+\XFER + vmovdqa [rsp + _XFER + \XFEROUT], XFER + ONE_ROUND_PART2 + ROTATE_ARGS + rotate_Xs .endm .macro DO_4ROUNDS XFER /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*0] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 0*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add 
old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*1] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 1*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*2] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /* add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - /* lea h, [h + y3] ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 2*4+\XFER ROTATE_ARGS /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 25 /* y0 = e >> 25 ; S1A */ - rorx y1, e, 11 /* y1 = e >> 11 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ; S1 */ - rorx y1, e, 6 /* y1 = (e >> 6) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>25) ^ (e>>11) ^ (e>>6) ; S1 */ - rorx T1, a, 13 /* T1 = a >> 13 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 22 /* y1 = a >> 22 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ; S0 */ - rorx T1, a, 2 /* T1 = (a >> 2) ; S0 */ - add h, [\XFER + 4*3] /* h = k + w + h ; -- */ - or y3, c /* 
y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>22) ^ (a>>13) ^ (a>>2) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0; -- */ - - lea h, [h + y3] /* h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND 3*4+\XFER ROTATE_ARGS .endm @@ -565,6 +314,11 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_avx2, at function) .align 32 _gcry_sha256_transform_amd64_avx2: + xor eax, eax + + cmp rdx, 0 + je .Lnowork + push rbx push rbp push r12 @@ -574,19 +328,19 @@ _gcry_sha256_transform_amd64_avx2: vzeroupper + vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] + vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] + mov rax, rsp sub rsp, STACK_SIZE - and rsp, -32 + and rsp, ~63 mov [rsp + _RSP], rax shl NUM_BLKS, 6 /* convert to bytes */ - jz .Ldone_hash lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ mov [rsp + _INP_END], NUM_BLKS - cmp INP, NUM_BLKS - je .Lonly_one_block - /* ; load initial digest */ mov a,[4*0 + CTX] mov b,[4*1 + CTX] @@ -597,10 +351,6 @@ _gcry_sha256_transform_amd64_avx2: mov g,[4*6 + CTX] mov h,[4*7 + CTX] - vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] - vmovdqa SHUF_00BA, [.L_SHUF_00BA ADD_RIP] - vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] - mov [rsp + _CTX], CTX .Loop0: @@ -631,43 +381,31 @@ _gcry_sha256_transform_amd64_avx2: /* ; schedule 48 input dwords, by doing 3 rounds of 12 each */ xor SRND, SRND + vpaddd XFER, X0, [TBL + 0*32] + vmovdqa [rsp + _XFER + 0*32], XFER + vpaddd XFER, X1, [TBL + 1*32] + vmovdqa [rsp + _XFER + 1*32], XFER + vpaddd XFER, X2, [TBL + 2*32] + vmovdqa [rsp + _XFER + 2*32], XFER + vpaddd XFER, X3, [TBL + 3*32] + vmovdqa [rsp + _XFER + 3*32], XFER + .align 16 .Loop1: - vpaddd XFER, X0, [TBL + SRND + 0*32] - vmovdqa [rsp + _XFER + SRND + 0*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32 - - vpaddd XFER, X0, [TBL + SRND + 1*32] - vmovdqa [rsp + _XFER + SRND + 1*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32 - - vpaddd XFER, X0, [TBL + SRND + 2*32] - vmovdqa [rsp + _XFER + SRND + 2*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32 - - vpaddd XFER, X0, [TBL + SRND + 3*32] - vmovdqa [rsp + _XFER + SRND + 3*32], XFER - FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 0*32, SRND + 4*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 1*32, SRND + 5*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 2*32, SRND + 6*32 + FOUR_ROUNDS_AND_SCHED rsp + _XFER + SRND + 3*32, SRND + 7*32 add SRND, 4*32 cmp SRND, 3 * 4*32 jb .Loop1 -.Loop2: /* ; Do last 16 rounds with no scheduling */ - vpaddd XFER, X0, [TBL + SRND + 0*32] - vmovdqa [rsp + _XFER + SRND + 0*32], XFER - DO_4ROUNDS rsp + _XFER + SRND + 0*32 - vpaddd XFER, X1, [TBL + SRND + 1*32] - vmovdqa [rsp + _XFER + SRND + 1*32], XFER - DO_4ROUNDS rsp + _XFER + SRND + 1*32 - add SRND, 2*32 - - vmovdqa X0, X2 - vmovdqa X1, X3 - - cmp SRND, 4 * 4*32 - jb .Loop2 + DO_4ROUNDS rsp + _XFER + (3*4*32 + 0*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 1*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 2*32) + DO_4ROUNDS rsp + _XFER + (3*4*32 + 3*32) mov CTX, [rsp + _CTX] mov INP, [rsp + _INP] @@ -777,6 +515,7 @@ _gcry_sha256_transform_amd64_avx2: pop rbp pop rbx 
+.Lnowork: ret .align 64 diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 914f920af..32cfceb0b 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -79,6 +79,8 @@ YTMP4 = ymm8 XFER = YTMP0 BYTE_FLIP_MASK = ymm9 +MASK_YMM_LO = ymm10 +MASK_YMM_LOx = xmm10 INP = rdi /* 1st arg */ CTX = rsi /* 2nd arg */ @@ -96,7 +98,7 @@ b = rbx f = r9 g = r10 h = r11 -old_h = r11 +old_h = rax T1 = r12 y0 = r13 @@ -107,14 +109,14 @@ y4 = r12 /* Local variables (stack frame) */ #define frame_XFER 0 -#define frame_XFER_size (4*8) +#define frame_XFER_size (4*4*8) #define frame_SRND (frame_XFER + frame_XFER_size) #define frame_SRND_size (1*8) #define frame_INP (frame_SRND + frame_SRND_size) #define frame_INP_size (1*8) -#define frame_INPEND (frame_INP + frame_INP_size) -#define frame_INPEND_size (1*8) -#define frame_RSPSAVE (frame_INPEND + frame_INPEND_size) +#define frame_NBLKS (frame_INP + frame_INP_size) +#define frame_NBLKS_size (1*8) +#define frame_RSPSAVE (frame_NBLKS + frame_NBLKS_size) #define frame_RSPSAVE_size (1*8) #define frame_GPRSAVE (frame_RSPSAVE + frame_RSPSAVE_size) #define frame_GPRSAVE_size (6*8) @@ -168,7 +170,51 @@ y4 = r12 vpalignr \YDST, \YDST, \YSRC2, \RVAL /* YDST = {YDS1, YS2} >> RVAL*8 */ .endm -.macro FOUR_ROUNDS_AND_SCHED +.macro ONE_ROUND_PART1 XFER + /* h += Sum1 (e) + Ch (e, f, g) + (k[t] + w[0]); + * d += h; + * h += Sum0 (a) + Maj (a, b, c); + * + * Ch(x, y, z) => ((x & y) + (~x & z)) + * Maj(x, y, z) => ((x & y) + (z & (x ^ y))) + */ + + mov y3, e + add h, [\XFER] + and y3, f + rorx y0, e, 41 + rorx y1, e, 18 + lea h, [h + y3] + andn y3, e, g + rorx T1, a, 34 + xor y0, y1 + lea h, [h + y3] +.endm +.macro ONE_ROUND_PART2 + rorx y2, a, 39 + rorx y1, e, 14 + mov y3, a + xor T1, y2 + xor y0, y1 + xor y3, b + lea h, [h + y0] + mov y0, a + rorx y2, a, 28 + add d, h + and y3, c + xor T1, y2 + lea h, [h + y3] + lea h, [h + T1] + and y0, b + lea h, [h + y0] +.endm + +.macro ONE_ROUND XFER + ONE_ROUND_PART1 \XFER + ONE_ROUND_PART2 +.endm + +.macro FOUR_ROUNDS_AND_SCHED X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /* Extract w[t-7] */ @@ -187,43 +233,8 @@ y4 = r12 /* Calculate w[t-15] shr 7 */ vpsrlq YTMP4, YTMP1, 7 /* YTMP4 = W[-15] >> 7 */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - - add h, [rsp+frame_XFER+0*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - - and y2, e /* y2 = (f^g)&e ; CH */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - add d, h /* d = k + w + h + d ; -- */ - - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - - add y2, y0 /* y2 = S1 + CH ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState + ONE_ROUND rsp+frame_XFER+0*8+\X*32 + RotateState 
/*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -243,7 +254,7 @@ RotateState /* Move to appropriate lanes for calculating w[16] and w[17] */ vperm2f128 Y_0, YTMP0, YTMP0, 0x0 /* Y_0 = W[-16] + W[-7] + s0 {BABA} */ /* Move to appropriate lanes for calculating w[18] and w[19] */ - vpand YTMP0, YTMP0, [.LMASK_YMM_LO ADD_RIP] /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ + vpand YTMP0, YTMP0, MASK_YMM_LO /* YTMP0 = W[-16] + W[-7] + s0 {DC00} */ /* Calculate w[16] and w[17] in both 128 bit lanes */ @@ -251,48 +262,8 @@ RotateState vperm2f128 YTMP2, Y_3, Y_3, 0x11 /* YTMP2 = W[-2] {BABA} */ vpsrlq YTMP4, YTMP2, 6 /* YTMP4 = W[-2] >> 6 {BABA} */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - add h, [rsp+frame_XFER+1*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - - and y3, b /* y3 = (a|c)&b ; MAJA */ - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState - - - + ONE_ROUND rsp+frame_XFER+1*8+\X*32 + RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -314,43 +285,8 @@ RotateState /* Calculate sigma1 for w[18] and w[19] for upper 128 bit lane */ vpsrlq YTMP4, Y_0, 6 /* YTMP4 = W[-2] >> 6 {DC--} */ - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - add h, [rsp+frame_XFER+2*8] /* h = k + w + h ; -- */ - - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - or y3, c /* y3 = a|c ; MAJA */ - mov y2, f /* y2 = f ; CH */ - xor y2, g /* y2 = f^g ; CH */ - - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState + ONE_ROUND rsp+frame_XFER+2*8+\X*32 + RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ @@ -370,225 +306,35 @@ 
RotateState /* Form w[19, w[18], w17], w[16] */ vpblendd Y_0, Y_0, YTMP2, 0xF0 /* Y_0 = {W[3], W[2], W[1], W[0]} */ -/* vperm2f128 Y_0, Y_0, YTMP2, 0x30 */ - - mov y3, a /* y3 = a ; MAJA */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - add h, [rsp+frame_XFER+3*8] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - - mov y2, f /* y2 = f ; CH */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - xor y2, g /* y2 = f^g ; CH */ - - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add d, h /* d = k + w + h + d ; -- */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and T1, c /* T1 = a&c ; MAJB */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - - add h, y1 /* h = k + w + h + S0 ; -- */ - add h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - -RotateState - -rotate_Ys + ONE_ROUND_PART1 rsp+frame_XFER+3*8+\X*32 + vpaddq XFER, Y_0, [TBL + (4+\X)*32] + vmovdqa [rsp + frame_XFER + \X*32], XFER + ONE_ROUND_PART2 + RotateState + rotate_Ys .endm -.macro DO_4ROUNDS +.macro DO_4ROUNDS X /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 0 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*0] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+0*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 
= a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*1] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+1*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 2 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*2] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - /*add h, y2 ; h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - - /*add h, y3 ; h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+2*8+\X*32 RotateState /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; RND N + 3 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ - add old_h, y2 /* h = k + w + h + S0 + S1 + CH = t1 + S0 ; -- */ - mov y2, f /* y2 = f ; CH */ - rorx y0, e, 41 /* y0 = e >> 41 ; S1A */ - rorx y1, e, 18 /* y1 = e >> 18 ; S1B */ - xor y2, g /* y2 = f^g ; CH */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ; S1 */ - rorx y1, e, 14 /* y1 = (e >> 14) ; S1 */ - and y2, e /* y2 = (f^g)&e ; CH */ - add old_h, y3 /* h = t1 + S0 + MAJ ; -- */ - - xor y0, y1 /* y0 = (e>>41) ^ (e>>18) ^ (e>>14) ; S1 */ - rorx T1, a, 34 /* T1 = a >> 34 ; S0B */ - xor y2, g /* y2 = CH = ((f^g)&e)^g ; CH */ - rorx y1, a, 39 /* y1 = a >> 39 ; S0A */ - mov y3, a /* y3 = a ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ; S0 */ - rorx T1, a, 28 /* T1 = (a >> 28) ; S0 */ - add h, [rsp + frame_XFER + 8*3] /* h = k + w + h ; -- */ - or y3, c /* y3 = a|c ; MAJA */ - - xor y1, T1 /* y1 = (a>>39) ^ (a>>34) ^ (a>>28) ; S0 */ - mov T1, a /* T1 = a ; MAJB */ - and y3, b /* y3 = (a|c)&b ; MAJA */ - and T1, c /* T1 = a&c ; MAJB */ - add y2, y0 /* y2 = S1 + CH ; -- */ - - - add d, h /* d = k + w + h + d ; -- */ - or y3, T1 /* y3 = MAJ = (a|c)&b)|(a&c) ; MAJ */ - add h, y1 /* h = k + w + h + S0 ; -- */ - - add d, y2 /* d = k + w + h + d + S1 + CH = d + t1 ; -- */ - - - add h, y2 /* h = k + w + h + S0 + 
S1 + CH = t1 + S0 ; -- */ - - add h, y3 /* h = t1 + S0 + MAJ ; -- */ - + ONE_ROUND rsp+frame_XFER+3*8+\X*32 RotateState .endm @@ -616,7 +362,7 @@ _gcry_sha512_transform_amd64_avx2: /* Allocate Stack Space */ mov rax, rsp sub rsp, frame_size - and rsp, ~(0x20 - 1) + and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax /* Save GPRs */ @@ -627,13 +373,7 @@ _gcry_sha512_transform_amd64_avx2: mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 - vpblendd xmm0, xmm0, xmm1, 0xf0 - vpblendd ymm0, ymm0, ymm1, 0xf0 - - shl NUM_BLKS, 7 /* convert to bytes */ - jz .Ldone_hash - add NUM_BLKS, INP /* pointer to end of data */ - mov [rsp + frame_INPEND], NUM_BLKS + mov [rsp + frame_NBLKS], NUM_BLKS /*; load initial digest */ mov a,[8*0 + CTX] @@ -646,8 +386,8 @@ _gcry_sha512_transform_amd64_avx2: mov h,[8*7 + CTX] vmovdqa BYTE_FLIP_MASK, [.LPSHUFFLE_BYTE_FLIP_MASK ADD_RIP] + vmovdqa MASK_YMM_LO, [.LMASK_YMM_LO ADD_RIP] -.Loop0: lea TBL,[.LK512 ADD_RIP] /*; byte swap first 16 dwords */ @@ -656,48 +396,60 @@ _gcry_sha512_transform_amd64_avx2: COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK + add INP, 128 mov [rsp + frame_INP], INP + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER + 0*32], XFER + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER + 1*32], XFER + vpaddq XFER, Y_2, [TBL + 2*32] + vmovdqa [rsp + frame_XFER + 2*32], XFER + vpaddq XFER, Y_3, [TBL + 3*32] + vmovdqa [rsp + frame_XFER + 3*32], XFER + /*; schedule 64 input dwords, by doing 12 rounds of 4 each */ movq [rsp + frame_SRND],4 .align 16 -.Loop1: - vpaddq XFER, Y_0, [TBL + 0*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED +.Loop0: + FOUR_ROUNDS_AND_SCHED 0 + FOUR_ROUNDS_AND_SCHED 1 + FOUR_ROUNDS_AND_SCHED 2 + FOUR_ROUNDS_AND_SCHED 3 + add TBL, 4*32 - vpaddq XFER, Y_0, [TBL + 1*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED + subq [rsp + frame_SRND], 1 + jne .Loop0 - vpaddq XFER, Y_0, [TBL + 2*32] - vmovdqa [rsp + frame_XFER], XFER - FOUR_ROUNDS_AND_SCHED + subq [rsp + frame_NBLKS], 1 + je .Ldone_hash - vpaddq XFER, Y_0, [TBL + 3*32] - vmovdqa [rsp + frame_XFER], XFER - add TBL, 4*32 - FOUR_ROUNDS_AND_SCHED + mov INP, [rsp + frame_INP] - subq [rsp + frame_SRND], 1 - jne .Loop1 + lea TBL,[.LK512 ADD_RIP] - movq [rsp + frame_SRND], 2 -.Loop2: - vpaddq XFER, Y_0, [TBL + 0*32] - vmovdqa [rsp + frame_XFER], XFER - DO_4ROUNDS - vpaddq XFER, Y_1, [TBL + 1*32] - vmovdqa [rsp + frame_XFER], XFER - add TBL, 2*32 - DO_4ROUNDS + /* load next block and byte swap */ + COPY_YMM_AND_BSWAP Y_0, [INP + 0*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_1, [INP + 1*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_2, [INP + 2*32], BYTE_FLIP_MASK + COPY_YMM_AND_BSWAP Y_3, [INP + 3*32], BYTE_FLIP_MASK - vmovdqa Y_0, Y_2 - vmovdqa Y_1, Y_3 + add INP, 128 + mov [rsp + frame_INP], INP - subq [rsp + frame_SRND], 1 - jne .Loop2 + DO_4ROUNDS 0 + vpaddq XFER, Y_0, [TBL + 0*32] + vmovdqa [rsp + frame_XFER + 0*32], XFER + DO_4ROUNDS 1 + vpaddq XFER, Y_1, [TBL + 1*32] + vmovdqa [rsp + frame_XFER + 1*32], XFER + DO_4ROUNDS 2 + vpaddq XFER, Y_2, [TBL + 2*32] + vmovdqa [rsp + frame_XFER + 2*32], XFER + DO_4ROUNDS 3 + vpaddq XFER, Y_3, [TBL + 3*32] + vmovdqa [rsp + frame_XFER + 3*32], XFER addm [8*0 + CTX],a addm [8*1 + CTX],b @@ -708,14 +460,33 @@ _gcry_sha512_transform_amd64_avx2: addm [8*6 + CTX],g addm [8*7 + CTX],h - mov INP, [rsp + frame_INP] - add INP, 128 - cmp INP, [rsp + frame_INPEND] - jne .Loop0 + /*; schedule 64 input dwords, by doing 12 rounds 
of 4 each */ + movq [rsp + frame_SRND],4 + + jmp .Loop0 .Ldone_hash: vzeroall + DO_4ROUNDS 0 + vmovdqa [rsp + frame_XFER + 0*32], ymm0 /* burn stack */ + DO_4ROUNDS 1 + vmovdqa [rsp + frame_XFER + 1*32], ymm0 /* burn stack */ + DO_4ROUNDS 2 + vmovdqa [rsp + frame_XFER + 2*32], ymm0 /* burn stack */ + DO_4ROUNDS 3 + vmovdqa [rsp + frame_XFER + 3*32], ymm0 /* burn stack */ + + addm [8*0 + CTX],a + xor eax, eax /* burn stack */ + addm [8*1 + CTX],b + addm [8*2 + CTX],c + addm [8*3 + CTX],d + addm [8*4 + CTX],e + addm [8*5 + CTX],f + addm [8*6 + CTX],g + addm [8*7 + CTX],h + /* Restore GPRs */ mov rbp, [rsp + frame_GPRSAVE + 8 * 0] mov rbx, [rsp + frame_GPRSAVE + 8 * 1] @@ -724,10 +495,6 @@ _gcry_sha512_transform_amd64_avx2: mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] - /* Burn stack */ - vmovdqa [rsp + frame_XFER], XFER - xor eax, eax - /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] .Lnowork: From jussi.kivilinna at iki.fi Mon Apr 8 18:07:00 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 8 Apr 2019 19:07:00 +0300 Subject: [PATCH] Disable SM3 in FIPS mode Message-ID: <155473962034.28973.13794332078146246445.stgit@localhost.localdomain> * cipher/sm3.h (_gcry_digest_spec_sm3): Set flags.fips to zero. -- Signed-off-by: Jussi Kivilinna --- cipher/sm3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cipher/sm3.c b/cipher/sm3.c index e76f32297..b6f0ab28c 100644 --- a/cipher/sm3.c +++ b/cipher/sm3.c @@ -464,7 +464,7 @@ static gcry_md_oid_spec_t oid_spec_sm3[] = gcry_md_spec_t _gcry_digest_spec_sm3 = { - GCRY_MD_SM3, {0, 1}, + GCRY_MD_SM3, {0, 0}, "SM3", asn_sm3, DIM (asn_sm3), oid_spec_sm3, 32, sm3_init, _gcry_md_block_write, sm3_final, sm3_read, NULL, _gcry_sm3_hash_buffer, _gcry_sm3_hash_buffers, From jussi.kivilinna at iki.fi Mon Apr 8 20:01:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Mon, 8 Apr 2019 21:01:15 +0300 Subject: [PATCH] Use getauxval system function for detecting ARM HW features Message-ID: <155474647569.15904.17215999863429401661.stgit@localhost.localdomain> * configure.ac: Add header check for 'sys/auxv.h'. * src/hwf-arm.c [HAVE_SYS_AUXV_H]: Include 'sys/auxv.h'. (AT_HWCAP, AT_HWCAP2, HWCAP_NEON, HWCAP2_AES, HWCAP2_PMULL) (HWCAP2_SHA1, HWCAP2_SHA2, HWCAP_ASIMD, HWCAP_AES) (HWCAP_PMULL, HWCAP_SHA1, HWCAP_SHA2): Define these macros only if not already defined. (get_hwcap) [HAVE_SYS_AUXV_H]: Use 'getauxval' to fetch HW capability flags. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/configure.ac b/configure.ac index 0a931f952..63a275079 100644 --- a/configure.ac +++ b/configure.ac @@ -805,7 +805,7 @@ AC_SEARCH_LIBS(setsockopt, [nsl]) ################################## AC_HEADER_STDC -AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h) +AC_CHECK_HEADERS(unistd.h sys/select.h sys/msg.h sys/auxv.h) INSERT_SYS_SELECT_H= if test x"$ac_cv_header_sys_select_h" = xyes; then INSERT_SYS_SELECT_H=" include " diff --git a/src/hwf-arm.c b/src/hwf-arm.c index a762b5eab..efbbd0c2d 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -23,6 +23,10 @@ #include #include #include +#include +#ifdef HAVE_SYS_AUXV_H +#include +#endif #include "g10lib.h" #include "hwf-common.h" @@ -47,15 +51,29 @@ struct feature_map_s { #ifdef __arm__ -#define AT_HWCAP 16 -#define AT_HWCAP2 26 +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 26 +#endif -#define HWCAP_NEON 4096 +#ifndef HWCAP_NEON +# define HWCAP_NEON 4096 +#endif -#define HWCAP2_AES 1 -#define HWCAP2_PMULL 2 -#define HWCAP2_SHA1 3 -#define HWCAP2_SHA2 4 +#ifndef HWCAP2_AES +# define HWCAP2_AES 1 +#endif +#ifndef HWCAP2_PMULL +# define HWCAP2_PMULL 2 +#endif +#ifndef HWCAP2_SHA1 +# define HWCAP2_SHA1 3 +#endif +#ifndef HWCAP2_SHA2 +# define HWCAP2_SHA2 4 +#endif static const struct feature_map_s arm_features[] = { @@ -72,14 +90,28 @@ static const struct feature_map_s arm_features[] = #elif defined(__aarch64__) -#define AT_HWCAP 16 -#define AT_HWCAP2 -1 +#ifndef AT_HWCAP +# define AT_HWCAP 16 +#endif +#ifndef AT_HWCAP2 +# define AT_HWCAP2 -1 +#endif -#define HWCAP_ASIMD 2 -#define HWCAP_AES 8 -#define HWCAP_PMULL 16 -#define HWCAP_SHA1 32 -#define HWCAP_SHA2 64 +#ifndef HWCAP_ASIMD +# define HWCAP_ASIMD 2 +#endif +#ifndef HWCAP_AES +# define HWCAP_AES 8 +#endif +#ifndef HWCAP_PMULL +# define HWCAP_PMULL 16 +#endif +#ifndef HWCAP_SHA1 +# define HWCAP_SHA1 32 +#endif +#ifndef HWCAP_SHA2 +# define HWCAP_SHA2 64 +#endif static const struct feature_map_s arm_features[] = { @@ -113,6 +145,34 @@ get_hwcap(unsigned int *hwcap, unsigned int *hwcap2) return 0; } +#ifdef HAVE_SYS_AUXV_H + errno = 0; + auxv.a_val = getauxval (AT_HWCAP); + if (errno == 0) + { + stored_hwcap |= auxv.a_val; + hwcap_initialized = 1; + } + + if (AT_HWCAP2 >= 0) + { + errno = 0; + auxv.a_val = getauxval (AT_HWCAP2); + if (errno == 0) + { + stored_hwcap2 |= auxv.a_val; + hwcap_initialized = 1; + } + } + + if (hwcap_initialized && (stored_hwcap || stored_hwcap2)) + { + *hwcap = stored_hwcap; + *hwcap2 = stored_hwcap2; + return 0; + } +#endif + f = fopen("/proc/self/auxv", "r"); if (!f) { @@ -125,13 +185,13 @@ get_hwcap(unsigned int *hwcap, unsigned int *hwcap2) { if (auxv.a_type == AT_HWCAP) { - stored_hwcap = auxv.a_val; + stored_hwcap |= auxv.a_val; hwcap_initialized = 1; } if (auxv.a_type == AT_HWCAP2) { - stored_hwcap2 = auxv.a_val; + stored_hwcap2 |= auxv.a_val; hwcap_initialized = 1; } } From jussi.kivilinna at iki.fi Tue Apr 9 19:11:04 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 9 Apr 2019 20:11:04 +0300 Subject: [PATCH] Use FreeBSD's elf_aux_info for detecting ARM HW features Message-ID: <155482986392.14628.10419192865532577551.stgit@localhost.localdomain> * configure.ac: Add function check for 'elf_aux_info'. * src/hwf-arm.c [HAVE_ELF_AUX_INFO]: Include 'sys/auxv.h'. [HAVE_ELF_AUX_INFO && !HAVE_GETAUXVAL] (HAVE_GETAUXVAL) (getauxval): New. 
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/configure.ac b/configure.ac index b0d7f8903..b54b212b3 100644 --- a/configure.ac +++ b/configure.ac @@ -1806,7 +1806,7 @@ AC_CHECK_FUNCS(strtoul memmove stricmp atexit raise) # Other checks AC_CHECK_FUNCS(strerror rand mmap getpagesize sysconf waitpid wait4) AC_CHECK_FUNCS(gettimeofday getrusage gethrtime clock_gettime syslog) -AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval) +AC_CHECK_FUNCS(syscall fcntl ftruncate flockfile getauxval elf_aux_info) AC_CHECK_FUNCS(explicit_bzero getentropy) GNUPG_CHECK_MLOCK diff --git a/src/hwf-arm.c b/src/hwf-arm.c index 0f8f83f61..1d19ea86b 100644 --- a/src/hwf-arm.c +++ b/src/hwf-arm.c @@ -24,7 +24,8 @@ #include #include #include -#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL) +#if defined(HAVE_SYS_AUXV_H) && (defined(HAVE_GETAUXVAL) || \ + defined(HAVE_ELF_AUX_INFO)) #include #endif @@ -35,6 +36,30 @@ # error Module build for wrong CPU. #endif + +#if defined(HAVE_SYS_AUXV_H) && defined(HAVE_ELF_AUX_INFO) && \ + !defined(HAVE_GETAUXVAL) && defined(AT_HWCAP) +#define HAVE_GETAUXVAL +static unsigned long getauxval(unsigned long type) +{ + unsigned long auxval = 0; + int err; + + /* FreeBSD provides 'elf_aux_info' function that does the same as + * 'getauxval' on Linux. */ + + err = elf_aux_info (type, &auxval, sizeof(auxval)); + if (err) + { + errno = err; + auxval = 0; + } + + return auxval; +} +#endif + + #undef HAS_SYS_AT_HWCAP #if defined(__linux__) || \ (defined(HAVE_SYS_AUXV_H) && defined(HAVE_GETAUXVAL)) @@ -49,6 +74,7 @@ struct feature_map_s { #ifdef __arm__ +/* Note: These macros have same values on Linux and FreeBSD. */ #ifndef AT_HWCAP # define AT_HWCAP 16 #endif @@ -88,6 +114,7 @@ static const struct feature_map_s arm_features[] = #elif defined(__aarch64__) +/* Note: These macros have same values on Linux and FreeBSD. */ #ifndef AT_HWCAP # define AT_HWCAP 16 #endif From peter at lekensteyn.nl Wed Apr 10 00:18:10 2019 From: peter at lekensteyn.nl (Peter Wu) Date: Tue, 9 Apr 2019 23:18:10 +0100 Subject: Blowfish actually supports more than 128-bit keys Message-ID: <20190409221810.GA10760@al> Hi, The current cipher documentation[1] reports that GCRY_CIPHER_BLOWFISH only supports 128-bit keys. In the long past, a "BLOWFISH160" variant seems to have existed which supported 160-bit keys as reported through gcry_cipher_get_algo_keylen. One of our users would like to use Blowfish with 576 bit keys (don't ask). Based on the documentation it would not be possible. The source code (cipher/blowfish.c) however shows that do_bf_setkey function does not limit the key size. In fact it is designed to support any number of bytes (up to 72 bytes / 576 bits). Could this be documented such that we can rely on it? Attached are two test programs: - One using test vectors from OpenSSL (key lengths 8..200 bits). - One using Eric Young's test vector as linked by Schneier[2] (key lengths 8..192 bits). Rejecting key lengths above 576 bits (72 bytes) might be a good idea. Rejecting 0 bytes would also be good to avoid a buffer overrun by one byte. I have no idea why someone would like to use a very short key though... 
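To make the suggestion concrete, here is a rough sketch of the kind of bounds
check I have in mind for do_bf_setkey (the constant names are made up for
illustration; the byte limits correspond to 8 and 576 bits):

  #define BLOWFISH_KEY_MIN_BYTES  1   /* 8 bits */
  #define BLOWFISH_KEY_MAX_BYTES 72   /* 576 bits */

  /* At the start of do_bf_setkey(), before the key schedule is run: */
  if (keylen < BLOWFISH_KEY_MIN_BYTES || keylen > BLOWFISH_KEY_MAX_BYTES)
    return GPG_ERR_INV_KEYLEN;
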
-- Kind regards, Peter Wu https://lekensteyn.nl [1]: https://gnupg.org/documentation/manuals/gcrypt/Available-ciphers.html [2]: https://www.schneier.com/academic/blowfish/ -------------- next part -------------- /* cc gcry-bf-test.c -lgcrypt && ./a.out */ #include // from openssl test/bftest.c # define KEY_TEST_NUM 25 static unsigned char key_test[KEY_TEST_NUM] = { 0xf0, 0xe1, 0xd2, 0xc3, 0xb4, 0xa5, 0x96, 0x87, 0x78, 0x69, 0x5a, 0x4b, 0x3c, 0x2d, 0x1e, 0x0f, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, 0x88 }; static unsigned char key_data[8] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 }; static unsigned char key_out[KEY_TEST_NUM][8] = { {0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E}, {0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6}, {0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3}, {0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05}, {0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E}, {0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D}, {0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D}, {0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82}, {0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77}, {0x12, 0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0}, {0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6}, {0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2}, {0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4}, {0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68}, {0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F}, {0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C}, {0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B}, {0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B}, {0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8}, {0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F}, {0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F}, {0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F}, {0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD}, {0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80}, }; int main(void) { gcry_cipher_hd_t hd; gcry_error_t err; err = gcry_cipher_open(&hd, GCRY_CIPHER_BLOWFISH, GCRY_CIPHER_MODE_ECB, 0); if (err != 0) { printf("open: %s\n", gcry_strerror(err)); return 1; } for (unsigned i = 0; i < KEY_TEST_NUM - 1; i++) { err = gcry_cipher_setkey(hd, key_test, i + 1); if (err != 0) { printf("setkey %d: %s\n", i, gcry_strerror(err)); goto end; } unsigned char out[8]; err = gcry_cipher_encrypt(hd, out, sizeof(out), key_data, sizeof(key_data)); if (err != 0) { printf("decrypt %d: %s\n", i, gcry_strerror(err)); goto end; } if (memcmp(out, key_out[i], 8) != 0) { printf("Test failure: %d\n", i); goto end; } } puts("Passed."); end: gcry_cipher_close(hd); return err != 0; } -------------- next part -------------- /* cc gcry-bf-test.c -lgcrypt && ./a.out */ #include // test vectors from https://www.schneier.com/code/vectors.txt # define KEY_TEST_NUM 24 static unsigned char key_test[KEY_TEST_NUM] = { 0xF0, 0xE1, 0xD2, 0xC3, 0xB4, 0xA5, 0x96, 0x87, 0x78, 0x69, 0x5A, 0x4B, 0x3C, 0x2D, 0x1E, 0x0F, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 }; static unsigned char key_data[8] = { 0xFE, 0xDC, 0xBA, 0x98, 0x76, 0x54, 0x32, 0x10 }; static unsigned char key_out[KEY_TEST_NUM][8] = { { 0xF9, 0xAD, 0x59, 0x7C, 0x49, 0xDB, 0x00, 0x5E }, { 0xE9, 0x1D, 0x21, 0xC1, 0xD9, 0x61, 0xA6, 0xD6 }, { 0xE9, 0xC2, 0xB7, 0x0A, 0x1B, 0xC6, 0x5C, 0xF3 }, { 0xBE, 0x1E, 0x63, 0x94, 0x08, 0x64, 0x0F, 0x05 }, { 0xB3, 0x9E, 0x44, 0x48, 0x1B, 0xDB, 0x1E, 0x6E }, { 0x94, 0x57, 0xAA, 0x83, 0xB1, 0x92, 0x8C, 0x0D }, { 0x8B, 0xB7, 0x70, 0x32, 0xF9, 0x60, 0x62, 0x9D }, { 0xE8, 0x7A, 0x24, 0x4E, 0x2C, 0xC8, 0x5E, 0x82 }, { 0x15, 0x75, 0x0E, 0x7A, 0x4F, 0x4E, 0xC5, 0x77 }, { 0x12, 
0x2B, 0xA7, 0x0B, 0x3A, 0xB6, 0x4A, 0xE0 }, { 0x3A, 0x83, 0x3C, 0x9A, 0xFF, 0xC5, 0x37, 0xF6 }, { 0x94, 0x09, 0xDA, 0x87, 0xA9, 0x0F, 0x6B, 0xF2 }, { 0x88, 0x4F, 0x80, 0x62, 0x50, 0x60, 0xB8, 0xB4 }, { 0x1F, 0x85, 0x03, 0x1C, 0x19, 0xE1, 0x19, 0x68 }, { 0x79, 0xD9, 0x37, 0x3A, 0x71, 0x4C, 0xA3, 0x4F }, { 0x93, 0x14, 0x28, 0x87, 0xEE, 0x3B, 0xE1, 0x5C }, { 0x03, 0x42, 0x9E, 0x83, 0x8C, 0xE2, 0xD1, 0x4B }, { 0xA4, 0x29, 0x9E, 0x27, 0x46, 0x9F, 0xF6, 0x7B }, { 0xAF, 0xD5, 0xAE, 0xD1, 0xC1, 0xBC, 0x96, 0xA8 }, { 0x10, 0x85, 0x1C, 0x0E, 0x38, 0x58, 0xDA, 0x9F }, { 0xE6, 0xF5, 0x1E, 0xD7, 0x9B, 0x9D, 0xB2, 0x1F }, { 0x64, 0xA6, 0xE1, 0x4A, 0xFD, 0x36, 0xB4, 0x6F }, { 0x80, 0xC7, 0xD7, 0xD4, 0x5A, 0x54, 0x79, 0xAD }, { 0x05, 0x04, 0x4B, 0x62, 0xFA, 0x52, 0xD0, 0x80 } }; int main(void) { gcry_cipher_hd_t hd; gcry_error_t err; err = gcry_cipher_open(&hd, GCRY_CIPHER_BLOWFISH, GCRY_CIPHER_MODE_ECB, 0); if (err != 0) { printf("open: %s\n", gcry_strerror(err)); return 1; } for (unsigned i = 0; i < KEY_TEST_NUM - 1; i++) { err = gcry_cipher_setkey(hd, key_test, i + 1); if (err != 0) { printf("setkey %d: %s\n", i, gcry_strerror(err)); goto end; } unsigned char out[8]; err = gcry_cipher_encrypt(hd, out, sizeof(out), key_data, sizeof(key_data)); if (err != 0) { printf("decrypt %d: %s\n", i, gcry_strerror(err)); goto end; } if (memcmp(out, key_out[i], 8) != 0) { printf("Test failure: %d\n", i); goto end; } } puts("Passed."); end: gcry_cipher_close(hd); return err != 0; } From jussi.kivilinna at iki.fi Tue Apr 16 22:03:33 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 16 Apr 2019 23:03:33 +0300 Subject: [PATCH] twofish-amd64: do not use xchg instruction Message-ID: <155544501358.19493.14951648802162799382.stgit@localhost.localdomain> * cipher/twofish-amd64.S (g1g2_3): Swap ab and cd registers using 'movq' instructions instead of 'xchgq'. -- Avoiding xchg instruction improves three block parallel performance by ~3% on Intel Haswell. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 7a836463c..134d6401e 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -368,15 +368,21 @@ ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block; /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ - xchgq cd ## 0, ab ## 0; \ + movq ab ## 0, RT0; \ + movq cd ## 0, ab ## 0; \ + movq RT0, cd ## 0; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ - xchgq cd ## 1, ab ## 1; \ + movq ab ## 1, RT0; \ + movq cd ## 1, ab ## 1; \ + movq RT0, cd ## 1; \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ - xchgq cd ## 2, ab ## 2; + movq ab ## 2, RT0; \ + movq cd ## 2, ab ## 2; \ + movq RT0, cd ## 2; #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ From jussi.kivilinna at iki.fi Wed Apr 17 18:43:24 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 17 Apr 2019 19:43:24 +0300 Subject: Blowfish actually supports more than 128-bit keys In-Reply-To: <20190409221810.GA10760@al> References: <20190409221810.GA10760@al> Message-ID: <42183823-d2f4-8750-8727-19d8d9c87cc9@iki.fi> Hello, On 10.4.2019 1.18, Peter Wu wrote: > Hi, > > The current cipher documentation[1] reports that GCRY_CIPHER_BLOWFISH > only supports 128-bit keys. 
In the long past, a "BLOWFISH160" variant > seems to have existed which supported 160-bit keys as reported through > gcry_cipher_get_algo_keylen. > > One of our users would like to use Blowfish with 576 bit keys (don't > ask). Based on the documentation it would not be possible. The source > code (cipher/blowfish.c) however shows that do_bf_setkey function does > not limit the key size. In fact it is designed to support any number of > bytes (up to 72 bytes / 576 bits). > > Could this be documented such that we can rely on it? Attached are two > test programs: > - One using test vectors from OpenSSL (key lengths 8..200 bits). > - One using Eric Young's test vector as linked by Schneier[2] (key > lengths 8..192 bits). I guess it would make sense to update documentation to match existing implementation. It would be good to have test vector for maximum key size. I've tried to search for existing test vectors for key lengths of 448-bit and 576-bit, but have not yet found one for the latter. > > Rejecting key lengths above 576 bits (72 bytes) might be a good idea. > Rejecting 0 bytes would also be good to avoid a buffer overrun by one > byte. I have no idea why someone would like to use a very short key > though... > Yes, limiting to supported key length would be a good thing. -Jussi From jussi.kivilinna at iki.fi Wed Apr 17 22:16:17 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Wed, 17 Apr 2019 23:16:17 +0300 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits Message-ID: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) (BLOWFISH_KEY_MAX_BITS): New. (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. * doc/gcrypt.texi: Update supported Blowfish key lengths. * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors for different key lengths. (check_cipher_modes): Call 'check_ecb_cipher'. -- As noted by Peter Wu, Blowfish cipher implementation already supports key lengths 8 to 576 bits [1]. This change updates documentation to reflect that and adds new test vectors to check handling of different key lengths. [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/blowfish.c b/cipher/blowfish.c index ea6e64a7b..a1d81d310 100644 --- a/cipher/blowfish.c +++ b/cipher/blowfish.c @@ -41,6 +41,8 @@ #include "cipher-selftest.h" #define BLOWFISH_BLOCKSIZE 8 +#define BLOWFISH_KEY_MIN_BITS 8 +#define BLOWFISH_KEY_MAX_BITS 576 /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) if( selftest_failed ) return GPG_ERR_SELFTEST_FAILED; + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || + keylen > BLOWFISH_KEY_MAX_BITS / 8) + return GPG_ERR_INV_KEYLEN; + memset(hset, 0, sizeof(hset)); for(i=0; i < 16+2; i++ ) diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi index 8b765ba80..d7bfa4c27 100644 --- a/doc/gcrypt.texi +++ b/doc/gcrypt.texi @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. @cindex Triple-DES @cindex DES-EDE @cindex Digital Encryption Standard -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but you have to pass 192 bits because the most significant bits of each byte are ignored. @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. 
@item GCRY_CIPHER_BLOWFISH @cindex Blowfish -The blowfish algorithm. The current implementation allows only for a key -size of 128 bits. +The blowfish algorithm. The supported key sizes are 8 to 576 bits in +8 bit increments. @item GCRY_CIPHER_SAFER_SK128 Reserved and not currently implemented. diff --git a/tests/basic.c b/tests/basic.c index 3d6e8fc1e..792b7737b 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); } +static void +check_ecb_cipher (void) +{ + /* ECB cipher check. Mainly for testing underlying block cipher. */ + static const struct tv + { + int algo; + const char *key; + struct + { + const char *plaintext; + int keylen; + int inlen; + const char *out; + } data[MAX_DATA_LEN]; + } tv[] = + { + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ + { GCRY_CIPHER_BLOWFISH, + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 1, + 8, + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 2, + 8, + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 3, + 8, + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 4, + 8, + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 5, + 8, + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 6, + 8, + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 7, + 8, + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 8, + 8, + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 9, + 8, + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 10, + 8, + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 11, + 8, + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 12, + 8, + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 13, + 8, + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 14, + 8, + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 15, + 8, + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 16, + 8, + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 17, + 8, + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 18, + 8, + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 19, + 8, + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 20, + 8, + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 21, + 8, + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 22, + 8, + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 23, + 8, + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", + 24, + 8, + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } + } + }, + /* Test vector from Linux kernel for key length of 448 bits */ + { GCRY_CIPHER_BLOWFISH, + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" + "\xff\xff\xff\xff\xff\xff\xff\xff", + { { 
"\xfe\xdc\xba\x98\x76\x54\x32\x10", + 56, + 8, + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } + }, + }; + gcry_cipher_hd_t hde, hdd; + unsigned char out[MAX_DATA_LEN]; + int i, j, keylen, algo; + gcry_error_t err = 0; + + if (verbose) + fprintf (stderr, " Starting ECB checks.\n"); + + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) + { + algo = tv[i].algo; + + if (gcry_cipher_test_algo (algo) && in_fips_mode) + { + if (verbose) + fprintf (stderr, " algorithm %d not available in fips mode\n", + algo); + continue; + } + + if (verbose) + fprintf (stderr, " checking ECB mode for %s [%i]\n", + gcry_cipher_algo_name (algo), + algo); + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); + if (!err) + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); + if (err) + { + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, + gpg_strerror (err)); + return; + } + + for (j = 0; tv[i].data[j].inlen; j++) + { + keylen = tv[i].data[j].keylen; + if (!keylen) + { + keylen = gcry_cipher_get_algo_keylen(algo); + if (!keylen) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", + algo, i, j); + return; + } + } + + err = gcry_cipher_setkey (hde, tv[i].key, keylen); + if (!err) + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_setkey failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN, + tv[i].data[j].plaintext, + tv[i].data[j].inlen); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_encrypt failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].data[j].out, out, tv[i].data[j].inlen)) + { + fail ("ecb-algo:%d-tv:%d-data:%d, encrypt mismatch entry\n", + algo, i, j); + } + + err = gcry_cipher_decrypt (hdd, out, tv[i].data[j].inlen, NULL, 0); + if (err) + { + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_decrypt failed: %s\n", + algo, i, j, gpg_strerror (err)); + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + return; + } + + if (memcmp (tv[i].data[j].plaintext, out, tv[i].data[j].inlen)) + { + fail ("ecb-algo:%d-tv:%d-data:%d, decrypt mismatch entry\n", + algo, i, j); + } + } + + gcry_cipher_close (hde); + gcry_cipher_close (hdd); + } + if (verbose) + fprintf (stderr, " Completed ECB checks.\n"); +} + static void check_ctr_cipher (void) { @@ -7916,6 +8149,7 @@ check_cipher_modes(void) if (verbose) fprintf (stderr, "Starting Cipher Mode checks.\n"); + check_ecb_cipher (); check_aes128_cbc_cts_cipher (); check_cbc_mac_cipher (); check_ctr_cipher (); From peter at lekensteyn.nl Wed Apr 17 23:50:36 2019 From: peter at lekensteyn.nl (Peter Wu) Date: Wed, 17 Apr 2019 22:50:36 +0100 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits In-Reply-To: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> Message-ID: <20190417215036.GA20903@al> Hi Jussi, Just some notes below on the tests, documentation looks good to me. Additionally, indentation in this file is a bit of a mess with mixed tabs and spaces. Should gcry_cipher_get_algo_keylen be modified as well to return "the maximum supported key length"? Hopefully it does not break stuff that assumed this to be fixed. 
On Wed, Apr 17, 2019 at 11:16:17PM +0300, Jussi Kivilinna wrote: > * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) > (BLOWFISH_KEY_MAX_BITS): New. > (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. > * doc/gcrypt.texi: Update supported Blowfish key lengths. > * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors > for different key lengths. > (check_cipher_modes): Call 'check_ecb_cipher'. > -- > > As noted by Peter Wu, Blowfish cipher implementation already supports key > lengths 8 to 576 bits [1]. This change updates documentation to reflect > that and adds new test vectors to check handling of different key lengths. > > [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html > > Signed-off-by: Jussi Kivilinna > --- > 0 files changed > > diff --git a/cipher/blowfish.c b/cipher/blowfish.c > index ea6e64a7b..a1d81d310 100644 > --- a/cipher/blowfish.c > +++ b/cipher/blowfish.c > @@ -41,6 +41,8 @@ > #include "cipher-selftest.h" > > #define BLOWFISH_BLOCKSIZE 8 > +#define BLOWFISH_KEY_MIN_BITS 8 > +#define BLOWFISH_KEY_MAX_BITS 576 > > > /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ > @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) > if( selftest_failed ) > return GPG_ERR_SELFTEST_FAILED; > > + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || > + keylen > BLOWFISH_KEY_MAX_BITS / 8) > + return GPG_ERR_INV_KEYLEN; > + > memset(hset, 0, sizeof(hset)); > > for(i=0; i < 16+2; i++ ) > diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi > index 8b765ba80..d7bfa4c27 100644 > --- a/doc/gcrypt.texi > +++ b/doc/gcrypt.texi > @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. > @cindex Triple-DES > @cindex DES-EDE > @cindex Digital Encryption Standard > -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but > +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but > you have to pass 192 bits because the most significant bits of each byte > are ignored. > > @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. > > @item GCRY_CIPHER_BLOWFISH > @cindex Blowfish > -The blowfish algorithm. The current implementation allows only for a key > -size of 128 bits. > +The blowfish algorithm. The supported key sizes are 8 to 576 bits in > +8 bit increments. > > @item GCRY_CIPHER_SAFER_SK128 > Reserved and not currently implemented. > diff --git a/tests/basic.c b/tests/basic.c > index 3d6e8fc1e..792b7737b 100644 > --- a/tests/basic.c > +++ b/tests/basic.c > @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) > fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); > } > > +static void > +check_ecb_cipher (void) > +{ > + /* ECB cipher check. Mainly for testing underlying block cipher. 
*/ > + static const struct tv > + { > + int algo; > + const char *key; > + struct > + { > + const char *plaintext; > + int keylen; > + int inlen; > + const char *out; > + } data[MAX_DATA_LEN]; > + } tv[] = > + { > + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ > + { GCRY_CIPHER_BLOWFISH, > + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" > + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", > + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 1, > + 8, > + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 2, > + 8, > + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 3, > + 8, > + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 4, > + 8, > + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 5, > + 8, > + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 6, > + 8, > + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 7, > + 8, > + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 8, > + 8, > + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 9, > + 8, > + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 10, > + 8, > + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 11, > + 8, > + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 12, > + 8, > + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 13, > + 8, > + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 14, > + 8, > + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 15, > + 8, > + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 16, > + 8, > + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 17, > + 8, > + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 18, > + 8, > + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 19, > + 8, > + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 20, > + 8, > + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 21, > + 8, > + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 22, > + 8, > + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 23, > + 8, > + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, > + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 24, > + 8, > + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } > + } > + }, > + /* Test vector from Linux kernel for key length of 448 bits */ > + { GCRY_CIPHER_BLOWFISH, > + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" > + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" > + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" > + "\xff\xff\xff\xff\xff\xff\xff\xff", > + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", > + 56, > + 8, > + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } > + }, > + }; > + gcry_cipher_hd_t hde, hdd; > + unsigned char out[MAX_DATA_LEN]; > + int i, j, keylen, algo; > + gcry_error_t err = 0; > + > + if (verbose) > + fprintf (stderr, " Starting ECB checks.\n"); > + > + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) > + { > + algo = tv[i].algo; > + > 
+ if (gcry_cipher_test_algo (algo) && in_fips_mode) > + { > + if (verbose) > + fprintf (stderr, " algorithm %d not available in fips mode\n", > + algo); > + continue; > + } > + > + if (verbose) > + fprintf (stderr, " checking ECB mode for %s [%i]\n", > + gcry_cipher_algo_name (algo), > + algo); > + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); > + if (!err) > + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, > + gpg_strerror (err)); You do close the cipher handle below in the error case. For consistency, should you do it here (and below) as well? > + return; > + } > + > + for (j = 0; tv[i].data[j].inlen; j++) The arrays are not terminated with an empty element, this probably trips over a buffer overflow error if you run it with AddressSanitizer. > + { > + keylen = tv[i].data[j].keylen; > + if (!keylen) > + { > + keylen = gcry_cipher_get_algo_keylen(algo); > + if (!keylen) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", > + algo, i, j); > + return; > + } > + } This check is dead code, the key length is always specified here. > + > + err = gcry_cipher_setkey (hde, tv[i].key, keylen); > + if (!err) > + err = gcry_cipher_setkey (hdd, tv[i].key, keylen); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_setkey failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + err = gcry_cipher_encrypt (hde, out, MAX_DATA_LEN, > + tv[i].data[j].plaintext, > + tv[i].data[j].inlen); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_encrypt failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + if (memcmp (tv[i].data[j].out, out, tv[i].data[j].inlen)) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, encrypt mismatch entry\n", > + algo, i, j); > + } > + > + err = gcry_cipher_decrypt (hdd, out, tv[i].data[j].inlen, NULL, 0); > + if (err) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_decrypt failed: %s\n", > + algo, i, j, gpg_strerror (err)); > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + return; > + } > + > + if (memcmp (tv[i].data[j].plaintext, out, tv[i].data[j].inlen)) > + { > + fail ("ecb-algo:%d-tv:%d-data:%d, decrypt mismatch entry\n", > + algo, i, j); > + } > + } > + > + gcry_cipher_close (hde); > + gcry_cipher_close (hdd); > + } > + if (verbose) > + fprintf (stderr, " Completed ECB checks.\n"); > +} > + > static void > check_ctr_cipher (void) > { > @@ -7916,6 +8149,7 @@ check_cipher_modes(void) > if (verbose) > fprintf (stderr, "Starting Cipher Mode checks.\n"); > > + check_ecb_cipher (); > check_aes128_cbc_cts_cipher (); > check_cbc_mac_cipher (); > check_ctr_cipher (); > > From jussi.kivilinna at iki.fi Thu Apr 18 17:38:48 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 18:38:48 +0300 Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits In-Reply-To: <20190417215036.GA20903@al> References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> <20190417215036.GA20903@al> Message-ID: <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi> Hello, On 18.4.2019 0.50, Peter Wu wrote: > Hi Jussi, > > Just some notes below on the tests, documentation looks good to me. > Additionally, indentation in this file is a bit of a mess with mixed > tabs and spaces. 
> > Should gcry_cipher_get_algo_keylen be modified as well to return "the > maximum supported key length"? Hopefully it does not break stuff that > assumed this to be fixed. I think it's better not to change the return value for gcry_cipher_get_algo_keylen as existing users might depend it to stay fixed to 128bits. > > On Wed, Apr 17, 2019 at 11:16:17PM +0300, Jussi Kivilinna wrote: >> * cipher/blowfish.c (BLOWFISH_KEY_MIN_BITS) >> (BLOWFISH_KEY_MAX_BITS): New. >> (do_bf_setkey): Check input key length to MIN_BITS and MAX_BITS. >> * doc/gcrypt.texi: Update supported Blowfish key lengths. >> * tests/basic.c (check_ecb_cipher): New, with Blowfish test vectors >> for different key lengths. >> (check_cipher_modes): Call 'check_ecb_cipher'. >> -- >> >> As noted by Peter Wu, Blowfish cipher implementation already supports key >> lengths 8 to 576 bits [1]. This change updates documentation to reflect >> that and adds new test vectors to check handling of different key lengths. >> >> [1] https://lists.gnupg.org/pipermail/gcrypt-devel/2019-April/004680.html >> >> Signed-off-by: Jussi Kivilinna >> --- >> 0 files changed >> >> diff --git a/cipher/blowfish.c b/cipher/blowfish.c >> index ea6e64a7b..a1d81d310 100644 >> --- a/cipher/blowfish.c >> +++ b/cipher/blowfish.c >> @@ -41,6 +41,8 @@ >> #include "cipher-selftest.h" >> >> #define BLOWFISH_BLOCKSIZE 8 >> +#define BLOWFISH_KEY_MIN_BITS 8 >> +#define BLOWFISH_KEY_MAX_BITS 576 >> >> >> /* USE_AMD64_ASM indicates whether to use AMD64 assembly code. */ >> @@ -1018,6 +1020,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen) >> if( selftest_failed ) >> return GPG_ERR_SELFTEST_FAILED; >> >> + if (keylen < BLOWFISH_KEY_MIN_BITS / 8 || >> + keylen > BLOWFISH_KEY_MAX_BITS / 8) >> + return GPG_ERR_INV_KEYLEN; >> + >> memset(hset, 0, sizeof(hset)); >> >> for(i=0; i < 16+2; i++ ) >> diff --git a/doc/gcrypt.texi b/doc/gcrypt.texi >> index 8b765ba80..d7bfa4c27 100644 >> --- a/doc/gcrypt.texi >> +++ b/doc/gcrypt.texi >> @@ -1538,7 +1538,7 @@ This is the IDEA algorithm. >> @cindex Triple-DES >> @cindex DES-EDE >> @cindex Digital Encryption Standard >> -Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 but >> +Triple-DES with 3 Keys as EDE. The key size of this algorithm is 168 bits but >> you have to pass 192 bits because the most significant bits of each byte >> are ignored. >> >> @@ -1548,8 +1548,8 @@ CAST128-5 block cipher algorithm. The key size is 128 bits. >> >> @item GCRY_CIPHER_BLOWFISH >> @cindex Blowfish >> -The blowfish algorithm. The current implementation allows only for a key >> -size of 128 bits. >> +The blowfish algorithm. The supported key sizes are 8 to 576 bits in >> +8 bit increments. >> >> @item GCRY_CIPHER_SAFER_SK128 >> Reserved and not currently implemented. >> diff --git a/tests/basic.c b/tests/basic.c >> index 3d6e8fc1e..792b7737b 100644 >> --- a/tests/basic.c >> +++ b/tests/basic.c >> @@ -446,6 +446,239 @@ check_aes128_cbc_cts_cipher (void) >> fprintf (stderr, " Completed AES128 CBC CTS checks.\n"); >> } >> >> +static void >> +check_ecb_cipher (void) >> +{ >> + /* ECB cipher check. Mainly for testing underlying block cipher. 
*/ >> + static const struct tv >> + { >> + int algo; >> + const char *key; >> + struct >> + { >> + const char *plaintext; >> + int keylen; >> + int inlen; >> + const char *out; >> + } data[MAX_DATA_LEN]; >> + } tv[] = >> + { >> + /* Test vectors from OpenSSL for key lengths of 8 to 200 bits */ >> + { GCRY_CIPHER_BLOWFISH, >> + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" >> + "\x00\x11\x22\x33\x44\x55\x66\x77\x88", >> + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 1, >> + 8, >> + "\xf9\xad\x59\x7c\x49\xdb\x00\x5e" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 2, >> + 8, >> + "\xe9\x1d\x21\xc1\xd9\x61\xa6\xd6" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 3, >> + 8, >> + "\xe9\xc2\xb7\x0a\x1b\xc6\x5c\xf3" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 4, >> + 8, >> + "\xbe\x1e\x63\x94\x08\x64\x0f\x05" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 5, >> + 8, >> + "\xb3\x9e\x44\x48\x1b\xdb\x1e\x6e" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 6, >> + 8, >> + "\x94\x57\xaa\x83\xb1\x92\x8c\x0d" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 7, >> + 8, >> + "\x8b\xb7\x70\x32\xf9\x60\x62\x9d" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 8, >> + 8, >> + "\xe8\x7a\x24\x4e\x2c\xc8\x5e\x82" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 9, >> + 8, >> + "\x15\x75\x0e\x7a\x4f\x4e\xc5\x77" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 10, >> + 8, >> + "\x12\x2b\xa7\x0b\x3a\xb6\x4a\xe0" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 11, >> + 8, >> + "\x3a\x83\x3c\x9a\xff\xc5\x37\xf6" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 12, >> + 8, >> + "\x94\x09\xda\x87\xa9\x0f\x6b\xf2" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 13, >> + 8, >> + "\x88\x4f\x80\x62\x50\x60\xb8\xb4" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 14, >> + 8, >> + "\x1f\x85\x03\x1c\x19\xe1\x19\x68" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 15, >> + 8, >> + "\x79\xd9\x37\x3a\x71\x4c\xa3\x4f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 16, >> + 8, >> + "\x93\x14\x28\x87\xee\x3b\xe1\x5c" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 17, >> + 8, >> + "\x03\x42\x9e\x83\x8c\xe2\xd1\x4b" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 18, >> + 8, >> + "\xa4\x29\x9e\x27\x46\x9f\xf6\x7b" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 19, >> + 8, >> + "\xaf\xd5\xae\xd1\xc1\xbc\x96\xa8" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 20, >> + 8, >> + "\x10\x85\x1c\x0e\x38\x58\xda\x9f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 21, >> + 8, >> + "\xe6\xf5\x1e\xd7\x9b\x9d\xb2\x1f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 22, >> + 8, >> + "\x64\xa6\xe1\x4a\xfd\x36\xb4\x6f" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 23, >> + 8, >> + "\x80\xc7\xd7\xd4\x5a\x54\x79\xad" }, >> + { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 24, >> + 8, >> + "\x05\x04\x4b\x62\xfa\x52\xd0\x80" } >> + } >> + }, >> + /* Test vector from Linux kernel for key length of 448 bits */ >> + { GCRY_CIPHER_BLOWFISH, >> + "\xf0\xe1\xd2\xc3\xb4\xa5\x96\x87\x78\x69\x5a\x4b\x3c\x2d\x1e\x0f" >> + "\x00\x11\x22\x33\x44\x55\x66\x77\x04\x68\x91\x04\xc2\xfd\x3b\x2f" >> + "\x58\x40\x23\x64\x1a\xba\x61\x76\x1f\x1f\x1f\x1f\x0e\x0e\x0e\x0e" >> + "\xff\xff\xff\xff\xff\xff\xff\xff", >> + { { "\xfe\xdc\xba\x98\x76\x54\x32\x10", >> + 56, >> + 8, >> + "\xc0\x45\x04\x01\x2e\x4e\x1f\x53" } } >> + }, >> + }; >> + gcry_cipher_hd_t hde, hdd; >> + unsigned char out[MAX_DATA_LEN]; >> + int i, j, keylen, algo; >> + gcry_error_t err = 0; >> + >> + if (verbose) >> + 
fprintf (stderr, " Starting ECB checks.\n"); >> + >> + for (i = 0; i < sizeof (tv) / sizeof (tv[0]); i++) >> + { >> + algo = tv[i].algo; >> + >> + if (gcry_cipher_test_algo (algo) && in_fips_mode) >> + { >> + if (verbose) >> + fprintf (stderr, " algorithm %d not available in fips mode\n", >> + algo); >> + continue; >> + } >> + >> + if (verbose) >> + fprintf (stderr, " checking ECB mode for %s [%i]\n", >> + gcry_cipher_algo_name (algo), >> + algo); >> + err = gcry_cipher_open (&hde, algo, GCRY_CIPHER_MODE_ECB, 0); >> + if (!err) >> + err = gcry_cipher_open (&hdd, algo, GCRY_CIPHER_MODE_ECB, 0); >> + if (err) >> + { >> + fail ("ecb-algo:%d-tv:%d, gcry_cipher_open failed: %s\n", algo, i, >> + gpg_strerror (err)); > > You do close the cipher handle below in the error case. For consistency, > should you do it here (and below) as well? Yes, handles should be closed here too. > >> + return; >> + } >> + >> + for (j = 0; tv[i].data[j].inlen; j++) > > The arrays are not terminated with an empty element, this probably trips > over a buffer overflow error if you run it with AddressSanitizer. Need to add terminating last entry. > >> + { >> + keylen = tv[i].data[j].keylen; > >> + if (!keylen) >> + { >> + keylen = gcry_cipher_get_algo_keylen(algo); >> + if (!keylen) >> + { >> + fail ("ecb-algo:%d-tv:%d-data:%d, gcry_cipher_get_algo_keylen failed\n", >> + algo, i, j); >> + return; >> + } >> + } > > This check is dead code, the key length is always specified here. For now all test vectors specify key length, but if new vectors are add they could use default key length (and also test that gcry_cipher_get_algo_keylen returns expected value). -Jussi From jussi.kivilinna at iki.fi Thu Apr 18 18:30:12 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 19:30:12 +0300 Subject: [PATCH 1/2] hwf-x86: make stack unwinding work at i386 cpuid functions Message-ID: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> * src/hwf-x86.c (FORCE_FUNC_FRAME_POINTER): New. [__i386__] (is_cpuid_available): Force use of stack frame pointer as inline assembly modifies stack register; Add 'memory' constraint for inline assembly. [__i386__] (get_cpuid): Avoid push/pop instruction when preserving %ebx register over cpuid. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/src/hwf-x86.c b/src/hwf-x86.c index b644eda1f..796e874f0 100644 --- a/src/hwf-x86.c +++ b/src/hwf-x86.c @@ -39,7 +39,14 @@ #if defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && defined (__GNUC__) # define HAS_X86_CPUID 1 -static int +#if _GCRY_GCC_VERSION >= 40700 /* 4.7 */ +# define FORCE_FUNC_FRAME_POINTER \ + __attribute__ ((optimize("no-omit-frame-pointer"))) +#else +# define FORCE_FUNC_FRAME_POINTER +#endif + +static FORCE_FUNC_FRAME_POINTER int is_cpuid_available(void) { int has_cpuid = 0; @@ -63,7 +70,7 @@ is_cpuid_available(void) ".Lno_cpuid%=:\n\t" : "+r" (has_cpuid) : - : "%eax", "%ecx", "cc" + : "%eax", "%ecx", "cc", "memory" ); return has_cpuid; @@ -76,14 +83,14 @@ get_cpuid(unsigned int in, unsigned int *eax, unsigned int *ebx, unsigned int regs[4]; asm volatile - ("pushl %%ebx\n\t" /* Save GOT register. */ - "movl %1, %%ebx\n\t" + ("movl %%ebx, %%edi\n\t" /* Save GOT register. */ + "xorl %%ebx, %%ebx\n\t" "cpuid\n\t" "movl %%ebx, %1\n\t" - "popl %%ebx\n\t" /* Restore GOT register. */ - : "=a" (regs[0]), "=D" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) - : "0" (in), "1" (0), "2" (0), "3" (0) - : "cc" + "movl %%edi, %%ebx\n\t" /* Restore GOT register. 
*/ + : "=a" (regs[0]), "=g" (regs[1]), "=c" (regs[2]), "=d" (regs[3]) + : "0" (in), "2" (0), "3" (0) + : "cc", "edi" ); if (eax) From jussi.kivilinna at iki.fi Thu Apr 18 18:30:17 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Thu, 18 Apr 2019 19:30:17 +0300 Subject: [PATCH 2/2] mpi: make stack unwinding work at i386 mpi functions In-Reply-To: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> References: <155560501207.19038.8971332154736111401.stgit@localhost.localdomain> Message-ID: <155560501731.19038.16204103468981344016.stgit@localhost.localdomain> * mpi/i386/syntax.h: Include 'config.h'. (CFI_STARTPROC, CFI_ENDPROC, CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET) (CFI_RESTORE, CFI_PUSH, CFI_POP): New. * mpi/i386/mpih-add1.S: Add CFI directives. * mpi/i386/mpih-lshift.S: Add CFI directives. * mpi/i386/mpih-mul1.S: Add CFI directives. * mpi/i386/mpih-mul2.S: Add CFI directives. * mpi/i386/mpih-mul3.S: Add CFI directives. * mpi/i386/mpih-rshift.S: Add CFI directives. * mpi/i386/mpih-sub1.S: Add CFI directives. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/mpi/i386/mpih-add1.S b/mpi/i386/mpih-add1.S index 652b23218..32091f340 100644 --- a/mpi/i386/mpih-add1.S +++ b/mpi/i386/mpih-add1.S @@ -44,8 +44,11 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_add_n) C_SYMBOL_NAME(_gcry_mpih_add_n:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ @@ -111,6 +114,9 @@ Loop: movl (%esi),%eax negl %eax popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-lshift.S b/mpi/i386/mpih-lshift.S index bf8ed9d4c..55da0678d 100644 --- a/mpi/i386/mpih-lshift.S +++ b/mpi/i386/mpih-lshift.S @@ -42,9 +42,13 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_lshift) C_SYMBOL_NAME(_gcry_mpih_lshift:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) pushl %ebx + CFI_PUSH(%ebx) movl 16(%esp),%edi /* res_ptr */ movl 20(%esp),%esi /* s_ptr */ @@ -88,7 +92,11 @@ Lend: shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx + CFI_POP(%ebx) popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul1.S b/mpi/i386/mpih-mul1.S index c9760ef92..9679ea622 100644 --- a/mpi/i386/mpih-mul1.S +++ b/mpi/i386/mpih-mul1.S @@ -49,10 +49,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) C_SYMBOL_NAME(_gcry_mpih_mul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -77,8 +82,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul2.S b/mpi/i386/mpih-mul2.S index 9794e1108..fe4129c43 100644 --- a/mpi/i386/mpih-mul2.S +++ b/mpi/i386/mpih-mul2.S @@ -50,10 +50,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) C_SYMBOL_NAME(_gcry_mpih_addmul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -79,8 +84,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) 
INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul3.S b/mpi/i386/mpih-mul3.S index 6df201763..87577d54c 100644 --- a/mpi/i386/mpih-mul3.S +++ b/mpi/i386/mpih-mul3.S @@ -50,10 +50,15 @@ GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) C_SYMBOL_NAME(_gcry_mpih_submul_1:) + CFI_STARTPROC() INSN1(push,l ,R(edi)) + CFI_PUSH(%edi) INSN1(push,l ,R(esi)) + CFI_PUSH(%esi) INSN1(push,l ,R(ebx)) + CFI_PUSH(%ebx) INSN1(push,l ,R(ebp)) + CFI_PUSH(%ebp) INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) @@ -79,8 +84,13 @@ Loop: INSN2(mov,l ,R(eax),R(ebx)) INSN1(pop,l ,R(ebp)) + CFI_POP(%ebp) INSN1(pop,l ,R(ebx)) + CFI_POP(%ebx) INSN1(pop,l ,R(esi)) + CFI_POP(%esi) INSN1(pop,l ,R(edi)) + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-rshift.S b/mpi/i386/mpih-rshift.S index 2920e55d8..35a8201f3 100644 --- a/mpi/i386/mpih-rshift.S +++ b/mpi/i386/mpih-rshift.S @@ -43,9 +43,13 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_rshift) C_SYMBOL_NAME(_gcry_mpih_rshift:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) pushl %ebx + CFI_PUSH(%ebx) movl 16(%esp),%edi /* wp */ movl 20(%esp),%esi /* up */ @@ -67,7 +71,7 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:) movl %ebx,%eax ALIGN (3) -Loop2: movl (%esi,%edx,4),%ebx /* load next higher limb */ +Loop2: movl (%esi,%edx,4),%ebx /* load next higher limb */ shrdl %cl,%ebx,%eax /* compute result limb */ movl %eax,(%edi,%edx,4) /* store it */ incl %edx @@ -91,7 +95,11 @@ Lend2: shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ popl %ebx + CFI_POP(%ebx) popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/mpih-sub1.S b/mpi/i386/mpih-sub1.S index f447f7a66..501c4a9fd 100644 --- a/mpi/i386/mpih-sub1.S +++ b/mpi/i386/mpih-sub1.S @@ -45,8 +45,11 @@ ALIGN (3) .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) C_SYMBOL_NAME(_gcry_mpih_sub_n:) + CFI_STARTPROC() pushl %edi + CFI_PUSH(%edi) pushl %esi + CFI_PUSH(%esi) movl 12(%esp),%edi /* res_ptr */ movl 16(%esp),%esi /* s1_ptr */ @@ -112,6 +115,9 @@ Loop: movl (%esi),%eax negl %eax popl %esi + CFI_POP(%esi) popl %edi + CFI_POP(%edi) ret + CFI_ENDPROC() diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index 39ede988f..9101585a8 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -26,6 +26,30 @@ * to avoid revealing of sensitive data due to paging etc. */ +#include + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(4); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-4); CFI_RESTORE(reg) +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +#endif + #undef ALIGN #if defined (BSD_SYNTAX) || defined (ELF_SYNTAX) From jussi.kivilinna at iki.fi Tue Apr 16 22:04:23 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 16 Apr 2019 23:04:23 +0300 Subject: [PATCH] Add CFI unwind assembly directives for AMD64 Message-ID: <155544506311.19850.12838764214519531613.stgit@localhost.localdomain> * configure.ac (gcry_cv_gcc_asm_cfi_directives): New. 
* cipher/asm-common-amd64.h (ADD_RIP, CFI_STARTPROC, CFI_ENDPROC) (CFI_REMEMBER_STATE, CFI_RESTORE_STATE, CFI_ADJUST_CFA_OFFSET) (CFI_REL_OFFSET, CFI_DEF_CFA_REGISTER, CFI_REGISTER, CFI_RESTORE) (CFI_PUSH, CFI_POP, CFI_POP_TMP_REG, CFI_LEAVE, DW_REGNO) (DW_SLEB128_7BIT, DW_SLEB128_28BIT, CFI_CFA_ON_STACK) (CFI_REG_ON_STACK): New. (ENTER_SYSV_FUNCPARAMS_0_4, EXIT_SYSV_FUNC): Add CFI directives. * cipher/arcfour-amd64.S: Add CFI directives. * cipher/blake2b-amd64-avx2.S: Add CFI directives. * cipher/blake2s-amd64-avx.S: Add CFI directives. * cipher/blowfish-amd64.S: Add CFI directives. * cipher/camellia-aesni-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/camellia-aesni-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/cast5-amd64.S: Add CFI directives. * cipher/chacha20-amd64-avx2.S: Add CFI directives. * cipher/chacha20-amd64-ssse3.S: Add CFI directives. * cipher/des-amd64.S: Add CFI directives. * cipher/rijndael-amd64.S: Add CFI directives. * cipher/rijndael-ssse3-amd64-asm.S: Add CFI directives. * cipher/salsa20-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/serpent-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/serpent-sse2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha1-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha256-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-avx-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-avx2-bmi2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/sha512-ssse3-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/twofish-amd64.S: Add CFI directives. * cipher/twofish-avx2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * cipher/whirlpool-sse2-amd64.S: Add CFI directives; Use 'asm-common-amd64.h'. * mpi/amd64/func_abi.h: Include 'config.h'. (CFI_STARTPROC, CFI_ENDPROC, CFI_ADJUST_CFA_OFFSET, CFI_REL_OFFSET) (CFI_RESTORE, CFI_PUSH, CFI_POP): New. (FUNC_ENTRY, FUNC_EXIT): Add CFI directives. -- This commit adds CFI directives that add DWARF unwinding information for debugger to backtrace when executing code from AMD64 assembly files. 
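To make the effect of these macros concrete, here is a minimal sketch of what CFI_PUSH()/CFI_POP() expand to around a single callee-saved register when HAVE_GCC_ASM_CFI_DIRECTIVES is defined. The function name and body below are made up purely for illustration; only the .cfi_* directives themselves are the standard GNU as ones used by the macros:

	.text
	.globl	example_fn
example_fn:
	.cfi_startproc
	pushq	%rbx
	.cfi_adjust_cfa_offset 8	/* the push moved %rsp down by 8, so the
					   CFA is now 16 bytes above %rsp */
	.cfi_rel_offset %rbx, 0		/* caller's %rbx is saved at (%rsp) */

	xorl	%eax, %eax		/* dummy function body */

	popq	%rbx
	.cfi_adjust_cfa_offset -8	/* CFA is back to 8 bytes above %rsp */
	.cfi_restore %rbx		/* %rbx holds the caller's value again */
	ret
	.cfi_endproc

When HAVE_GCC_ASM_CFI_DIRECTIVES is not defined, all of the macros expand to nothing, so assemblers without .cfi support still produce the same machine code, just without the DWARF unwind data.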
Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index c08f3453b..221dfeff7 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -25,9 +25,12 @@ .globl _gcry_arcfour_amd64 ELF(.type _gcry_arcfour_amd64, at function) _gcry_arcfour_amd64: + CFI_STARTPROC() ENTER_SYSV_FUNC_PARAMS_0_4 push %rbp + CFI_PUSH(%rbp) push %rbx + CFI_PUSH(%rbx) mov %rdi, %rbp # key = ARG(key) mov %rsi, %rbx # rbx = ARG(len) mov %rdx, %rsi # in = ARG(in) @@ -92,9 +95,12 @@ _gcry_arcfour_amd64: movb %cl, (4*256)(%rbp) # key->y = y movb %dl, (4*256+4)(%rbp) # key->x = x pop %rbx + CFI_POP(%rbx) pop %rbp + CFI_POP(%rbp) EXIT_SYSV_FUNC ret + CFI_ENDPROC() .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 7eb426495..9d4a028a0 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -41,6 +41,12 @@ # define RIP #endif +#ifdef __PIC__ +# define ADD_RIP +rip +#else +# define ADD_RIP +#endif + #if defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS) || !defined(__PIC__) # define GET_EXTERN_POINTER(name, reg) movabsq $name, reg #else @@ -60,10 +66,101 @@ # endif #endif +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. */ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +# define CFI_POP_TMP_REG() \ + CFI_ADJUST_CFA_OFFSET(-8); +# define CFI_LEAVE() \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_DEF_CFA_REGISTER(%rsp) + +/* CFA expressions are used for pointing CFA and registers to + * %rsp relative offsets. */ +# define DW_REGNO_rax 0 +# define DW_REGNO_rdx 1 +# define DW_REGNO_rcx 2 +# define DW_REGNO_rbx 3 +# define DW_REGNO_rsi 4 +# define DW_REGNO_rdi 5 +# define DW_REGNO_rbp 6 +# define DW_REGNO_rsp 7 +# define DW_REGNO_r8 8 +# define DW_REGNO_r9 9 +# define DW_REGNO_r10 10 +# define DW_REGNO_r11 11 +# define DW_REGNO_r12 12 +# define DW_REGNO_r13 13 +# define DW_REGNO_r14 14 +# define DW_REGNO_r15 15 + +# define DW_REGNO(reg) DW_REGNO_ ## reg + +/* Fixed length encoding used for integers for now. 
*/ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+8) + +# define CFI_REG_ON_STACK(reg,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(DW_REGNO(reg)), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x77, /* DW_OP_breg7, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +# define CFI_POP_TMP_REG() +# define CFI_LEAVE() + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + #ifdef HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS # define ENTER_SYSV_FUNC_PARAMS_0_4 \ pushq %rdi; \ + CFI_PUSH(%rdi); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ movq %rcx, %rdi; \ movq %rdx, %rsi; \ movq %r8, %rdx; \ @@ -79,7 +176,9 @@ # define EXIT_SYSV_FUNC \ popq %rsi; \ - popq %rdi; + CFI_POP(%rsi); \ + popq %rdi; \ + CFI_POP(%rdi); #else # define ENTER_SYSV_FUNC_PARAMS_0_4 # define ENTER_SYSV_FUNC_PARAMS_5 diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 6bcc5652d..08c816cdf 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -207,6 +207,7 @@ _gcry_blake2b_transform_amd64_avx2: * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; @@ -291,6 +292,7 @@ _gcry_blake2b_transform_amd64_avx2: xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index f7312dbd0..198373262 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -191,6 +191,7 @@ _gcry_blake2s_transform_amd64_avx: * %rsi: blks * %rdx: num_blks */ + CFI_STARTPROC(); vzeroupper; @@ -269,6 +270,7 @@ _gcry_blake2s_transform_amd64_avx: xor %eax, %eax; vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index 02d3b7102..bdb361d7e 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -133,7 +133,9 @@ __blowfish_enc_blk1: * output: * RX0: output plaintext block */ + CFI_STARTPROC(); movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); load_roundkey_enc(0); round_enc(2); @@ -147,8 +149,10 @@ __blowfish_enc_blk1: add_roundkey_enc(); movq %r11, %rbp; + CFI_RESTORE(%rbp) ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) .align 8 @@ -161,6 +165,7 @@ _gcry_blowfish_amd64_do_encrypt: * %rsi: u32 *ret_xl * %rdx: u32 *ret_xr */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movl (%rdx), RX0d; @@ -178,6 +183,7 @@ _gcry_blowfish_amd64_do_encrypt: EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) .align 8 @@ -190,6 +196,7 @@ _gcry_blowfish_amd64_encrypt_block: * %rsi: dst * %rdx: 
src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rsi, %r10; @@ -204,6 +211,7 @@ _gcry_blowfish_amd64_encrypt_block: EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) .align 8 @@ -216,9 +224,11 @@ _gcry_blowfish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 movq %rbp, %r11; + CFI_REGISTER(%rbp, %r11); movq %rsi, %r10; movq %rdx, RIO; @@ -240,9 +250,11 @@ _gcry_blowfish_amd64_decrypt_block: write_block(); movq %r11, %rbp; + CFI_RESTORE(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) /********************************************************************** @@ -340,6 +352,7 @@ __blowfish_enc_blk4: * output: * RX0,RX1,RX2,RX3: four output ciphertext blocks */ + CFI_STARTPROC(); preload_roundkey_enc(0); round_enc4(0); @@ -355,6 +368,7 @@ __blowfish_enc_blk4: outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) .align 8 @@ -367,6 +381,7 @@ __blowfish_dec_blk4: * output: * RX0,RX1,RX2,RX3: four output plaintext blocks */ + CFI_STARTPROC(); preload_roundkey_dec(17); inbswap_block4(); @@ -384,6 +399,7 @@ __blowfish_dec_blk4: outbswap_block4(); ret; + CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) .align 8 @@ -396,12 +412,17 @@ _gcry_blowfish_amd64_ctr_enc: * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ @@ -438,12 +459,17 @@ _gcry_blowfish_amd64_ctr_enc: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) .align 8 @@ -456,12 +482,17 @@ _gcry_blowfish_amd64_cbc_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_dec_blk4 */ movq %rsi, %r11; /*dst*/ @@ -489,12 +520,17 @@ _gcry_blowfish_amd64_cbc_dec: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) .align 8 @@ -507,12 +543,17 @@ _gcry_blowfish_amd64_cfb_dec: * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); /* %r11-%r13 are not used by __blowfish_enc_blk4 */ movq %rcx, %r13; /*iv*/ @@ -543,12 +584,17 @@ _gcry_blowfish_amd64_cfb_dec: movq RX3, 3 * 8(%r11); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) #endif /*defined(USE_BLOWFISH)*/ diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 8022934fb..e16d4f613 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ 
-24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -75,10 +65,10 @@ /* \ * S-function with AES subbytes \ */ \ - vmovdqa .Linv_shift_row RIP, t4; \ - vbroadcastss .L0f0f0f0f RIP, t7; \ - vmovdqa .Lpre_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpre_tf_hi_s1 RIP, t1; \ + vmovdqa .Linv_shift_row rRIP, t4; \ + vbroadcastss .L0f0f0f0f rRIP, t7; \ + vmovdqa .Lpre_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpre_tf_hi_s1 rRIP, t1; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -91,8 +81,8 @@ vpshufb t4, x6, x6; \ \ /* prefilter sboxes 1, 2 and 3 */ \ - vmovdqa .Lpre_tf_lo_s4 RIP, t2; \ - vmovdqa .Lpre_tf_hi_s4 RIP, t3; \ + vmovdqa .Lpre_tf_lo_s4 rRIP, t2; \ + vmovdqa .Lpre_tf_hi_s4 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x1, t0, t1, t7, t6); \ @@ -106,8 +96,8 @@ filter_8bit(x6, t2, t3, t7, t6); \ \ /* AES subbytes + AES shift rows */ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4, x0, x0; \ vaesenclast t4, x7, x7; \ vaesenclast t4, x1, x1; \ @@ -118,16 +108,16 @@ vaesenclast t4, x6, x6; \ \ /* postfilter sboxes 1 and 4 */ \ - vmovdqa .Lpost_tf_lo_s3 RIP, t2; \ - vmovdqa .Lpost_tf_hi_s3 RIP, t3; \ + vmovdqa .Lpost_tf_lo_s3 rRIP, t2; \ + vmovdqa .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t6); \ filter_8bit(x7, t0, t1, t7, t6); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vmovdqa .Lpost_tf_lo_s2 RIP, t4; \ - vmovdqa .Lpost_tf_hi_s2 RIP, t5; \ + vmovdqa .Lpost_tf_lo_s2 rRIP, t4; \ + vmovdqa .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -442,7 +432,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vmovdqu .Lshufb_16x16b RIP, a0; \ + vmovdqu .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -508,7 +498,7 @@ vpunpcklwd t1, t3, e; \ vpunpckhwd t1, t3, f; \ \ - vmovdqa .Ltranspose_8x8_shuf RIP, t3; \ + vmovdqa .Ltranspose_8x8_shuf rRIP, t3; \ \ vpunpcklwd g, c, d; \ vpunpckhwd g, c, c; \ @@ -540,7 +530,7 @@ #define inpack16_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 16(rio), x0, y7; \ vpxor 1 * 16(rio), x0, y6; \ @@ -591,7 +581,7 @@ vmovdqu x0, stack_tmp0; \ \ vmovq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ @@ -786,6 +776,7 @@ __camellia_enc_blk16: * %xmm0..%xmm15: 16 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; @@ -859,6 +850,7 @@ __camellia_enc_blk16: %xmm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) .align 8 @@ -874,6 +866,7 @@ __camellia_dec_blk16: * %xmm0..%xmm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 16(%rax), %rcx; @@ -944,6 +937,7 @@ 
__camellia_dec_blk16: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -963,9 +957,12 @@ _gcry_camellia_aesni_avx_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -973,7 +970,7 @@ _gcry_camellia_aesni_avx_ctr_enc: andq $~31, %rsp; movq %rsp, %rax; - vmovdqa .Lbswap128_mask RIP, %xmm14; + vmovdqa .Lbswap128_mask rRIP, %xmm14; /* load IV and byteswap */ vmovdqu (%rcx), %xmm15; @@ -1018,12 +1015,12 @@ _gcry_camellia_aesni_avx_ctr_enc: vmovdqa %xmm0, %xmm13; vpshufb %xmm14, %xmm0, %xmm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; /* le => be */ + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; /* le => be */ vmovdqu %xmm13, (%rcx); /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1067,7 +1064,9 @@ _gcry_camellia_aesni_avx_ctr_enc: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) .align 8 @@ -1081,9 +1080,12 @@ _gcry_camellia_aesni_avx_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1135,7 +1137,9 @@ _gcry_camellia_aesni_avx_cbc_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) .align 8 @@ -1149,9 +1153,12 @@ _gcry_camellia_aesni_avx_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1161,7 +1168,7 @@ _gcry_camellia_aesni_avx_cfb_dec: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm0; - vpshufb .Lpack_bswap RIP, %xmm0, %xmm0; + vpshufb .Lpack_bswap rRIP, %xmm0, %xmm0; vpxor (%rcx), %xmm0, %xmm15; vmovdqu 15 * 16(%rdx), %xmm1; vmovdqu %xmm1, (%rcx); /* store new IV */ @@ -1207,7 +1214,9 @@ _gcry_camellia_aesni_avx_cfb_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) .align 8 @@ -1223,9 +1232,12 @@ _gcry_camellia_aesni_avx_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1233,10 +1245,14 @@ _gcry_camellia_aesni_avx_ocb_enc: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm15; @@ -1292,7 +1308,7 @@ _gcry_camellia_aesni_avx_ocb_enc: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; 
@@ -1335,13 +1351,19 @@ _gcry_camellia_aesni_avx_ocb_enc: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) .align 8 @@ -1357,9 +1379,12 @@ _gcry_camellia_aesni_avx_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1367,10 +1392,14 @@ _gcry_camellia_aesni_avx_ocb_dec: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rcx), %xmm15; @@ -1428,7 +1457,7 @@ _gcry_camellia_aesni_avx_ocb_dec: /* inpack16_pre: */ vmovq (key_table)(CTX, %r8, 8), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1493,13 +1522,19 @@ _gcry_camellia_aesni_avx_ocb_dec: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq (16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) .align 8 @@ -1514,9 +1549,12 @@ _gcry_camellia_aesni_avx_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1524,10 +1562,14 @@ _gcry_camellia_aesni_avx_ocb_auth: andq $~31, %rsp; movq %rsp, %rax; - movq %r10, (16 * 16 + 0 * 8)(%rax); - movq %r11, (16 * 16 + 1 * 8)(%rax); - movq %r12, (16 * 16 + 2 * 8)(%rax); - movq %r13, (16 * 16 + 3 * 8)(%rax); + movq %r10, (16 * 16 + 0 * 8)(%rsp); + movq %r11, (16 * 16 + 1 * 8)(%rsp); + movq %r12, (16 * 16 + 2 * 8)(%rsp); + movq %r13, (16 * 16 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 16 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 16 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 16 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 16 + 3 * 8); vmovdqu (%rdx), %xmm15; @@ -1580,7 +1622,7 @@ _gcry_camellia_aesni_avx_ocb_auth: /* inpack16_pre: */ vmovq (key_table)(CTX), %xmm15; - vpshufb .Lpack_bswap RIP, %xmm15, %xmm15; + vpshufb .Lpack_bswap rRIP, %xmm15, %xmm15; vpxor %xmm0, %xmm15, %xmm0; vpxor %xmm1, %xmm15, %xmm1; vpxor %xmm2, %xmm15, %xmm2; @@ -1623,13 +1665,19 @@ _gcry_camellia_aesni_avx_ocb_auth: vzeroall; - movq (16 * 16 + 0 * 8)(%rax), %r10; - movq (16 * 16 + 1 * 8)(%rax), %r11; - movq (16 * 16 + 2 * 8)(%rax), %r12; - movq 
(16 * 16 + 3 * 8)(%rax), %r13; + movq (16 * 16 + 0 * 8)(%rsp), %r10; + movq (16 * 16 + 1 * 8)(%rsp), %r11; + movq (16 * 16 + 2 * 8)(%rsp), %r12; + movq (16 * 16 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) /* @@ -1657,8 +1705,8 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; vpand sbox4mask, t0, t0; \ vpor t0, x, x; \ \ - vmovdqa .Lpost_tf_lo_s1 RIP, t0; \ - vmovdqa .Lpost_tf_hi_s1 RIP, t1; \ + vmovdqa .Lpost_tf_lo_s1 rRIP, t0; \ + vmovdqa .Lpost_tf_hi_s1 rRIP, t1; \ \ /* prefilter sboxes */ \ filter_8bit(x, pre_s1lo_mask, pre_s1hi_mask, _0f0f0f0fmask, t2); \ @@ -1672,18 +1720,18 @@ ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth; /* output rotation for sbox2 (<<< 1) */ \ /* output rotation for sbox3 (>>> 1) */ \ vpshufb inv_shift_row, x, t1; \ - vpshufb .Lsp0044440444044404mask RIP, x, t4; \ - vpshufb .Lsp1110111010011110mask RIP, x, x; \ + vpshufb .Lsp0044440444044404mask rRIP, x, t4; \ + vpshufb .Lsp1110111010011110mask rRIP, x, x; \ vpaddb t1, t1, t2; \ vpsrlw $7, t1, t0; \ vpsllw $7, t1, t3; \ vpor t0, t2, t0; \ vpsrlw $1, t1, t1; \ - vpshufb .Lsp0222022222000222mask RIP, t0, t0; \ + vpshufb .Lsp0222022222000222mask rRIP, t0, t0; \ vpor t1, t3, t1; \ \ vpxor x, t4, t4; \ - vpshufb .Lsp3033303303303033mask RIP, t1, t1; \ + vpshufb .Lsp3033303303303033mask rRIP, t1, t1; \ vpxor t4, t0, t0; \ vpxor t1, t0, t0; \ vpsrldq $8, t0, x; \ @@ -1741,17 +1789,19 @@ __camellia_avx_setup128: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0: key */ + CFI_STARTPROC(); + #define cmll_sub(n, ctx) (key_table+((n)*8))(ctx) #define KL128 %xmm0 #define KA128 %xmm2 - vpshufb .Lbswap128_mask RIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA @@ -1763,18 +1813,18 @@ __camellia_avx_setup128: camellia_f(%xmm2, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); camellia_f(%xmm2, %xmm3, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm1, %xmm5, %xmm6, %xmm7, %xmm8, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; @@ -2076,6 +2126,7 @@ __camellia_avx_setup128: vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) .align 8 @@ -2086,19 +2137,21 @@ __camellia_avx_setup256: * %rdi: ctx, CTX; subkey storage at key_table(CTX) * %xmm0 & %xmm1: key */ + CFI_STARTPROC(); + #define KL128 %xmm0 #define KR128 
%xmm1 #define KA128 %xmm2 #define KB128 %xmm3 - vpshufb .Lbswap128_mask RIP, KL128, KL128; - vpshufb .Lbswap128_mask RIP, KR128, KR128; + vpshufb .Lbswap128_mask rRIP, KL128, KL128; + vpshufb .Lbswap128_mask rRIP, KR128, KR128; - vmovdqa .Linv_shift_row_and_unpcklbw RIP, %xmm11; - vmovq .Lsbox4_input_mask RIP, %xmm12; - vbroadcastss .L0f0f0f0f RIP, %xmm13; - vmovdqa .Lpre_tf_lo_s1 RIP, %xmm14; - vmovdqa .Lpre_tf_hi_s1 RIP, %xmm15; + vmovdqa .Linv_shift_row_and_unpcklbw rRIP, %xmm11; + vmovq .Lsbox4_input_mask rRIP, %xmm12; + vbroadcastss .L0f0f0f0f rRIP, %xmm13; + vmovdqa .Lpre_tf_lo_s1 rRIP, %xmm14; + vmovdqa .Lpre_tf_hi_s1 rRIP, %xmm15; /* * Generate KA @@ -2111,20 +2164,20 @@ __camellia_avx_setup256: camellia_f(%xmm2, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma1 rRIP); vpxor %xmm4, %xmm3, %xmm3; camellia_f(%xmm3, %xmm2, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma2 rRIP); vpxor %xmm6, %xmm2, %xmm2; camellia_f(%xmm2, %xmm3, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma3 rRIP); vpxor %xmm4, %xmm3, %xmm3; vpxor KR128, %xmm3, %xmm3; camellia_f(%xmm3, %xmm4, %xmm5, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma4 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm4, %xmm2, %xmm2; @@ -2142,12 +2195,12 @@ __camellia_avx_setup256: camellia_f(%xmm4, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma5 rRIP); vpxor %xmm5, %xmm3, %xmm3; camellia_f(%xmm3, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9, %xmm10, - %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 RIP); + %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, .Lsigma6 rRIP); vpslldq $8, %xmm3, %xmm3; vpxor %xmm5, %xmm4, %xmm4; vpsrldq $8, %xmm3, %xmm3; @@ -2553,6 +2606,7 @@ __camellia_avx_setup256: vzeroall; ret; + CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) .align 8 @@ -2565,6 +2619,7 @@ _gcry_camellia_aesni_avx_keygen: * %rsi: key * %rdx: keylen */ + CFI_STARTPROC(); vzeroupper; @@ -2585,6 +2640,7 @@ _gcry_camellia_aesni_avx_keygen: vpor %xmm2, %xmm1, %xmm1; jmp __camellia_avx_setup256; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_keygen,.-_gcry_camellia_aesni_avx_keygen;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX_SUPPORT)*/ diff --git a/cipher/camellia-aesni-avx2-amd64.S b/cipher/camellia-aesni-avx2-amd64.S index 897e4aeec..cc01c7743 100644 --- a/cipher/camellia-aesni-avx2-amd64.S +++ b/cipher/camellia-aesni-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -92,12 +82,12 @@ /* \ * S-function with AES subbytes \ */ \ - vbroadcasti128 .Linv_shift_row RIP, t4; \ - vpbroadcastd .L0f0f0f0f RIP, t7; \ - vbroadcasti128 .Lpre_tf_lo_s1 RIP, t5; \ - vbroadcasti128 .Lpre_tf_hi_s1 RIP, t6; \ - vbroadcasti128 .Lpre_tf_lo_s4 RIP, t2; \ - vbroadcasti128 .Lpre_tf_hi_s4 RIP, t3; \ + vbroadcasti128 .Linv_shift_row rRIP, t4; \ + vpbroadcastd .L0f0f0f0f rRIP, t7; \ + vbroadcasti128 .Lpre_tf_lo_s1 rRIP, t5; \ + vbroadcasti128 .Lpre_tf_hi_s1 rRIP, t6; \ + vbroadcasti128 .Lpre_tf_lo_s4 rRIP, t2; \ + vbroadcasti128 .Lpre_tf_hi_s4 rRIP, t3; \ \ /* AES inverse shift rows */ \ vpshufb t4, x0, x0; \ @@ -143,8 +133,8 @@ vinserti128 $1, t2##_x, x6, x6; \ vextracti128 $1, x1, t3##_x; \ vextracti128 $1, x4, t2##_x; \ - vbroadcasti128 .Lpost_tf_lo_s1 RIP, t0; \ - vbroadcasti128 .Lpost_tf_hi_s1 RIP, t1; \ + vbroadcasti128 .Lpost_tf_lo_s1 rRIP, t0; \ + vbroadcasti128 .Lpost_tf_hi_s1 rRIP, t1; \ vaesenclast t4##_x, x2##_x, x2##_x; \ vaesenclast t4##_x, t6##_x, t6##_x; \ vaesenclast t4##_x, x5##_x, x5##_x; \ @@ -159,16 +149,16 @@ vinserti128 $1, t2##_x, x4, x4; \ \ /* postfilter sboxes 1 and 4 */ \ - vbroadcasti128 .Lpost_tf_lo_s3 RIP, t2; \ - vbroadcasti128 .Lpost_tf_hi_s3 RIP, t3; \ + vbroadcasti128 .Lpost_tf_lo_s3 rRIP, t2; \ + vbroadcasti128 .Lpost_tf_hi_s3 rRIP, t3; \ filter_8bit(x0, t0, t1, t7, t4); \ filter_8bit(x7, t0, t1, t7, t4); \ filter_8bit(x3, t0, t1, t7, t6); \ filter_8bit(x6, t0, t1, t7, t6); \ \ /* postfilter sbox 3 */ \ - vbroadcasti128 .Lpost_tf_lo_s2 RIP, t4; \ - vbroadcasti128 .Lpost_tf_hi_s2 RIP, t5; \ + vbroadcasti128 .Lpost_tf_lo_s2 rRIP, t4; \ + vbroadcasti128 .Lpost_tf_hi_s2 rRIP, t5; \ filter_8bit(x2, t2, t3, t7, t6); \ filter_8bit(x5, t2, t3, t7, t6); \ \ @@ -485,7 +475,7 @@ transpose_4x4(c0, c1, c2, c3, a0, a1); \ transpose_4x4(d0, d1, d2, d3, a0, a1); \ \ - vbroadcasti128 .Lshufb_16x16b RIP, a0; \ + vbroadcasti128 .Lshufb_16x16b rRIP, a0; \ vmovdqu st1, a1; \ vpshufb a0, a2, a2; \ vpshufb a0, a3, a3; \ @@ -524,7 +514,7 @@ #define inpack32_pre(x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, \ y6, y7, rio, key) \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor 0 * 32(rio), x0, y7; \ vpxor 1 * 32(rio), x0, y6; \ @@ -575,7 +565,7 @@ vmovdqu x0, stack_tmp0; \ \ vpbroadcastq key, x0; \ - vpshufb .Lpack_bswap RIP, x0, x0; \ + vpshufb .Lpack_bswap rRIP, x0, x0; \ \ vpxor x0, y7, y7; \ vpxor x0, y6, y6; \ @@ -765,6 +755,7 @@ __camellia_enc_blk32: * %ymm0..%ymm15: 32 encrypted blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; @@ -838,6 +829,7 @@ __camellia_enc_blk32: %ymm15, %rax, %rcx, 24); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) .align 8 @@ -853,6 +845,7 @@ __camellia_dec_blk32: * %ymm0..%ymm15: 16 plaintext blocks, order swapped: * 7, 8, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8 */ + CFI_STARTPROC(); leaq 8 * 32(%rax), %rcx; @@ -923,6 +916,7 @@ __camellia_dec_blk32: ((key_table + (24) * 8) + 4)(CTX)); jmp .Ldec_max24; + CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) #define inc_le128(x, minus_one, tmp) \ @@ -942,9 +936,12 @@ _gcry_camellia_aesni_avx2_ctr_enc: * %rdx: src (32 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); movq 8(%rcx), %r11; bswapq %r11; @@ 
-960,10 +957,10 @@ _gcry_camellia_aesni_avx2_ctr_enc: /* load IV and byteswap */ vmovdqu (%rcx), %xmm0; - vpshufb .Lbswap128_mask RIP, %xmm0, %xmm0; + vpshufb .Lbswap128_mask rRIP, %xmm0, %xmm0; vmovdqa %xmm0, %xmm1; inc_le128(%xmm0, %xmm15, %xmm14); - vbroadcasti128 .Lbswap128_mask RIP, %ymm14; + vbroadcasti128 .Lbswap128_mask rRIP, %ymm14; vinserti128 $1, %xmm0, %ymm1, %ymm0; vpshufb %ymm14, %ymm0, %ymm13; vmovdqu %ymm13, 15 * 32(%rax); @@ -1064,14 +1061,14 @@ _gcry_camellia_aesni_avx2_ctr_enc: vextracti128 $1, %ymm0, %xmm13; vpshufb %ymm14, %ymm0, %ymm0; inc_le128(%xmm13, %xmm15, %xmm14); - vpshufb .Lbswap128_mask RIP, %xmm13, %xmm13; + vpshufb .Lbswap128_mask rRIP, %xmm13, %xmm13; vmovdqu %xmm13, (%rcx); .align 4 .Lload_ctr_done: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1116,7 +1113,9 @@ _gcry_camellia_aesni_avx2_ctr_enc: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ctr_enc,.-_gcry_camellia_aesni_avx2_ctr_enc;) .align 8 @@ -1130,9 +1129,12 @@ _gcry_camellia_aesni_avx2_cbc_dec: * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1188,7 +1190,9 @@ _gcry_camellia_aesni_avx2_cbc_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cbc_dec,.-_gcry_camellia_aesni_avx2_cbc_dec;) .align 8 @@ -1202,9 +1206,12 @@ _gcry_camellia_aesni_avx2_cfb_dec: * %rdx: src (32 blocks) * %rcx: iv */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1214,7 +1221,7 @@ _gcry_camellia_aesni_avx2_cfb_dec: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm0; - vpshufb .Lpack_bswap RIP, %ymm0, %ymm0; + vpshufb .Lpack_bswap rRIP, %ymm0, %ymm0; vmovdqu (%rcx), %xmm15; vinserti128 $1, (%rdx), %ymm15, %ymm15; vpxor %ymm15, %ymm0, %ymm15; @@ -1262,7 +1269,9 @@ _gcry_camellia_aesni_avx2_cfb_dec: vzeroall; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_cfb_dec,.-_gcry_camellia_aesni_avx2_cfb_dec;) .align 8 @@ -1278,9 +1287,12 @@ _gcry_camellia_aesni_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1288,10 +1300,14 @@ _gcry_camellia_aesni_avx2_ocb_enc: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; vmovdqu (%r8), %xmm13; @@ -1369,7 +1385,7 @@ _gcry_camellia_aesni_avx2_ocb_enc: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1412,13 +1428,19 @@ _gcry_camellia_aesni_avx2_ocb_enc: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq 
(16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_enc,.-_gcry_camellia_aesni_avx2_ocb_enc;) .align 8 @@ -1434,9 +1456,12 @@ _gcry_camellia_aesni_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[32]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1444,10 +1469,14 @@ _gcry_camellia_aesni_avx2_ocb_dec: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rcx), %xmm14; @@ -1525,7 +1554,7 @@ _gcry_camellia_aesni_avx2_ocb_dec: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX, %r8, 8), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1596,13 +1625,19 @@ _gcry_camellia_aesni_avx2_ocb_dec: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 * 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_dec,.-_gcry_camellia_aesni_avx2_ocb_dec;) .align 8 @@ -1617,9 +1652,12 @@ _gcry_camellia_aesni_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -1627,10 +1665,14 @@ _gcry_camellia_aesni_avx2_ocb_auth: andq $~63, %rsp; movq %rsp, %rax; - movq %r10, (16 * 32 + 0 * 8)(%rax); - movq %r11, (16 * 32 + 1 * 8)(%rax); - movq %r12, (16 * 32 + 2 * 8)(%rax); - movq %r13, (16 * 32 + 3 * 8)(%rax); + movq %r10, (16 * 32 + 0 * 8)(%rsp); + movq %r11, (16 * 32 + 1 * 8)(%rsp); + movq %r12, (16 * 32 + 2 * 8)(%rsp); + movq %r13, (16 * 32 + 3 * 8)(%rsp); + CFI_REG_ON_STACK(r10, 16 * 32 + 0 * 8); + CFI_REG_ON_STACK(r11, 16 * 32 + 1 * 8); + CFI_REG_ON_STACK(r12, 16 * 32 + 2 * 8); + CFI_REG_ON_STACK(r13, 16 * 32 + 3 * 8); vmovdqu (%rdx), %xmm14; @@ -1703,7 +1745,7 @@ _gcry_camellia_aesni_avx2_ocb_auth: /* inpack16_pre: */ vpbroadcastq (key_table)(CTX), %ymm15; - vpshufb .Lpack_bswap RIP, %ymm15, %ymm15; + vpshufb .Lpack_bswap rRIP, %ymm15, %ymm15; vpxor %ymm0, %ymm15, %ymm0; vpxor %ymm1, %ymm15, %ymm1; vpxor %ymm2, %ymm15, %ymm2; @@ -1749,13 +1791,19 @@ _gcry_camellia_aesni_avx2_ocb_auth: vzeroall; - movq (16 * 32 + 0 * 8)(%rax), %r10; - movq (16 * 32 + 1 * 8)(%rax), %r11; - movq (16 * 32 + 2 * 8)(%rax), %r12; - movq (16 * 32 + 3 * 8)(%rax), %r13; + movq (16 * 32 + 0 * 8)(%rsp), %r10; + movq (16 * 32 + 1 * 8)(%rsp), %r11; + movq (16 * 32 + 2 
* 8)(%rsp), %r12; + movq (16 * 32 + 3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx2_ocb_auth,.-_gcry_camellia_aesni_avx2_ocb_auth;) #endif /*defined(ENABLE_AESNI_SUPPORT) && defined(ENABLE_AVX2_SUPPORT)*/ diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 1a1d43fd5..82f678901 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -183,10 +183,13 @@ _gcry_cast5_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; @@ -211,10 +214,13 @@ _gcry_cast5_amd64_encrypt_block: write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) .align 8 @@ -227,10 +233,13 @@ _gcry_cast5_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); movq %rsi, %r10; @@ -255,10 +264,13 @@ _gcry_cast5_amd64_decrypt_block: write_block(); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) /********************************************************************** @@ -371,6 +383,7 @@ __cast5_enc_blk4: * output: * RLR0,RLR1,RLR2,RLR3: four output ciphertext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); get_round_km(0, RKM0d); @@ -387,6 +400,7 @@ __cast5_enc_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); ret; + CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) .align 8 @@ -399,6 +413,7 @@ __cast5_dec_blk4: * output: * RLR0,RLR1,RLR2,RLR3: four output plaintext blocks */ + CFI_STARTPROC(); GET_EXTERN_POINTER(_gcry_cast5_s1to4, RTAB); inbswap_block4(RLR0, RLR1, RLR2, RLR3); @@ -416,6 +431,7 @@ __cast5_dec_blk4: round_dec_last4(1, F4_2, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); + CFI_ENDPROC(); ret; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) @@ -425,20 +441,28 @@ ELF(.type _gcry_cast5_amd64_ctr_enc, at function;) _gcry_cast5_amd64_ctr_enc: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (big endian, 64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load IV and byteswap */ movq (%rcx), RX0; @@ -458,7 +482,9 @@ _gcry_cast5_amd64_ctr_enc: call __cast5_enc_blk4; popq %r14; /*src*/ + CFI_POP_TMP_REG(); popq %r13; /*dst*/ + CFI_POP_TMP_REG(); /* XOR key-stream with plaintext */ xorq 0 * 8(%r14), RLR0; @@ -471,13 +497,19 @@ _gcry_cast5_amd64_ctr_enc: movq RLR3, 3 * 8(%r13); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) .align 8 @@ -486,21 +518,30 @@ ELF(.type _gcry_cast5_amd64_cbc_dec, at function;) _gcry_cast5_amd64_cbc_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + 
CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rcx; + CFI_PUSH(%rcx); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* load input */ movq 0 * 8(%rdx), RLR0; @@ -511,8 +552,11 @@ _gcry_cast5_amd64_cbc_dec: call __cast5_dec_blk4; popq RX0; /*src*/ + CFI_POP_TMP_REG(); popq RX1; /*dst*/ + CFI_POP_TMP_REG(); popq RX2; /*iv*/ + CFI_POP_TMP_REG(); movq 3 * 8(RX0), %r14; xorq (RX2), RLR0; @@ -527,14 +571,19 @@ _gcry_cast5_amd64_cbc_dec: movq RLR3, 3 * 8(RX1); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) .align 8 @@ -543,20 +592,28 @@ ELF(.type _gcry_cast5_amd64_cfb_dec, at function;) _gcry_cast5_amd64_cfb_dec: /* input: * %rdi: ctx, CTX - * %rsi: dst (8 blocks) - * %rdx: src (8 blocks) + * %rsi: dst (4 blocks) + * %rdx: src (4 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %rsi; + CFI_PUSH(%rsi); pushq %rdx; + CFI_PUSH(%rdx); /* Load input */ movq (%rcx), RLR0; @@ -573,7 +630,9 @@ _gcry_cast5_amd64_cfb_dec: call __cast5_enc_blk4; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rcx; /*dst*/ + CFI_POP_TMP_REG(); xorq 0 * 8(%rdx), RLR0; xorq 1 * 8(%rdx), RLR1; @@ -585,14 +644,19 @@ _gcry_cast5_amd64_cfb_dec: movq RLR3, 3 * 8(%rcx); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; - + CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) #endif /*defined(USE_CAST5)*/ diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 94c8e8cf7..de6263b69 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -179,11 +179,14 @@ _gcry_chacha20_amd64_avx2_blocks8: * %rdx: src * %rcx: nblks (multiple of 8) */ + CFI_STARTPROC(); vzeroupper; pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~31, %rsp; @@ -318,7 +321,9 @@ _gcry_chacha20_amd64_avx2_blocks8: /* eax zeroed by round loop. 
*/ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) @@ -339,9 +344,12 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); vzeroupper; @@ -353,6 +361,11 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST @@ -752,10 +765,17 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 1657f7712..6bbf12fc1 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -175,9 +175,12 @@ _gcry_chacha20_amd64_ssse3_blocks4: * %rdx: src * %rcx: nblks (multiple of 4) */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $STACK_MAX, %rsp; andq $~15, %rsp; @@ -329,7 +332,9 @@ _gcry_chacha20_amd64_ssse3_blocks4: /* eax zeroed by round loop. */ leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) @@ -372,6 +377,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: * %rdx: src * %rcx: nblks */ + CFI_STARTPROC(); /* Load constants */ movdqa .Lcounter1 rRIP, X4; @@ -497,6 +503,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: /* eax zeroed by round loop. 
*/ ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) @@ -517,9 +524,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8) + STACK_MAX + 16, %rsp; andq $~15, %rsp; @@ -529,6 +539,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: movq %r13, (STACK_MAX + 2 * 8)(%rsp); movq %r14, (STACK_MAX + 3 * 8)(%rsp); movq %r15, (STACK_MAX + 4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, STACK_MAX + 0 * 8); + CFI_REG_ON_STACK(r12, STACK_MAX + 1 * 8); + CFI_REG_ON_STACK(r13, STACK_MAX + 2 * 8); + CFI_REG_ON_STACK(r14, STACK_MAX + 3 * 8); + CFI_REG_ON_STACK(r15, STACK_MAX + 4 * 8); movq %rdx, (STACK_MAX + 5 * 8)(%rsp); # SRC movq %rsi, (STACK_MAX + 6 * 8)(%rsp); # DST @@ -901,10 +916,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: movq (STACK_MAX + 2 * 8)(%rsp), %r13; movq (STACK_MAX + 3 * 8)(%rsp), %r14; movq (STACK_MAX + 4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) @@ -925,8 +947,12 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: * %r9: poly1305-state * %r8: poly1305-src */ + CFI_STARTPROC(); + pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, %rbp; + CFI_DEF_CFA_REGISTER(%rbp); subq $(8 * 8), %rsp; movq %rbx, (0 * 8)(%rsp); @@ -934,6 +960,11 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: movq %r13, (2 * 8)(%rsp); movq %r14, (3 * 8)(%rsp); movq %r15, (4 * 8)(%rsp); + CFI_REG_ON_STACK(rbx, 0 * 8); + CFI_REG_ON_STACK(r12, 1 * 8); + CFI_REG_ON_STACK(r13, 2 * 8); + CFI_REG_ON_STACK(r14, 3 * 8); + CFI_REG_ON_STACK(r15, 4 * 8); movq %rdx, (5 * 8)(%rsp); # SRC movq %rsi, (6 * 8)(%rsp); # DST @@ -1206,10 +1237,17 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: movq (2 * 8)(%rsp), %r13; movq (3 * 8)(%rsp), %r14; movq (4 * 8)(%rsp), %r15; + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); xorl %eax, %eax; leave; + CFI_LEAVE(); ret; + CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index f25573d99..a211dac38 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -190,15 +190,23 @@ _gcry_3des_amd64_crypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); leaq .L_s1 rRIP, SBOXES; @@ -259,18 +267,26 @@ _gcry_3des_amd64_crypt_block: round1(32+15, RL0, RR0, dummy2); popq RW2; /*dst*/ + CFI_POP_TMP_REG(); final_permutation(RR0, RL0); write_block(RW2, RR0, RL0); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) /*********************************************************************** @@ -465,6 +481,7 @@ _gcry_3des_amd64_crypt_blk3: * RL0d, RR0d, RL1d, RR1d, RL2d, RR2d: 3 input blocks * RR0d, RL0d, RR1d, RL1d, RR2d, RL2d: 3 output blocks */ + 
CFI_STARTPROC(); leaq .L_s1 rRIP, SBOXES; @@ -528,6 +545,7 @@ _gcry_3des_amd64_crypt_blk3: final_permutation3(RR, RL); ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) .align 8 @@ -540,18 +558,28 @@ _gcry_3des_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); pushq %rcx; /*iv*/ + CFI_PUSH(%rcx); /* load input */ movl 0 * 4(%rdx), RL0d; @@ -571,8 +599,11 @@ _gcry_3des_amd64_cbc_dec: call _gcry_3des_amd64_crypt_blk3; popq %rcx; /*iv*/ + CFI_POP_TMP_REG(); popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -598,14 +629,21 @@ _gcry_3des_amd64_cbc_dec: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 @@ -618,17 +656,26 @@ _gcry_3des_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* load IV and byteswap */ @@ -654,7 +701,9 @@ _gcry_3des_amd64_ctr_enc: call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -678,14 +727,21 @@ _gcry_3des_amd64_ctr_enc: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) .align 8 @@ -698,17 +754,26 @@ _gcry_3des_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (64bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 pushq %rbp; + CFI_PUSH(%rbp); pushq %rbx; + CFI_PUSH(%rbx); pushq %r12; + CFI_PUSH(%r12); pushq %r13; + CFI_PUSH(%r13); pushq %r14; + CFI_PUSH(%r14); pushq %r15; + CFI_PUSH(%r15); pushq %rsi; /*dst*/ + CFI_PUSH(%rsi); pushq %rdx; /*src*/ + CFI_PUSH(%rdx); movq %rcx, RW2; /* Load input */ @@ -733,7 +798,9 @@ _gcry_3des_amd64_cfb_dec: call _gcry_3des_amd64_crypt_blk3; popq %rdx; /*src*/ + CFI_POP_TMP_REG(); popq %rsi; /*dst*/ + CFI_POP_TMP_REG(); bswapl RR0d; bswapl RL0d; @@ -757,14 +824,21 @@ _gcry_3des_amd64_cfb_dec: movl RL2d, 5 * 4(%rsi); popq %r15; + CFI_POP(%r15); popq %r14; + CFI_POP(%r14); popq %r13; + CFI_POP(%r13); popq %r12; + CFI_POP(%r12); popq %rbx; + CFI_POP(%rbx); popq %rbp; + CFI_POP(%rbp); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) .align 16 diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 798ff51af..3dcaa856b 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -212,14 +212,19 @@ _gcry_aes_amd64_encrypt_block: * %ecx: number of rounds.. 
10, 12 or 14 * %r8: encryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; @@ -251,16 +256,23 @@ _gcry_aes_amd64_encrypt_block: movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Lenc_not_128: je .Lenc_192 @@ -280,6 +292,7 @@ _gcry_aes_amd64_encrypt_block: lastencround(11); jmp .Lenc_done; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_encrypt_block,.-_gcry_aes_amd64_encrypt_block;) #define do_decround(next_r) \ @@ -376,14 +389,19 @@ _gcry_aes_amd64_decrypt_block: * %ecx: number of rounds.. 10, 12 or 14 * %r8: decryption tables */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(5 * 8); movq %rsi, (0 * 8)(%rsp); movl %ecx, (1 * 8)(%rsp); movq %rbp, (2 * 8)(%rsp); movq %rbx, (3 * 8)(%rsp); movq %r12, (4 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 2 * 8); + CFI_REL_OFFSET(%rbx, 3 * 8); + CFI_REL_OFFSET(%r12, 4 * 8); leaq (%r8), RTAB; @@ -416,16 +434,23 @@ _gcry_aes_amd64_decrypt_block: movl RCd, 2 * 4(%rsi); movl RDd, 3 * 4(%rsi); + CFI_REMEMBER_STATE(); + movq (4 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %rbx; movq (2 * 8)(%rsp), %rbp; + CFI_RESTORE(%r12); + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(5 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-5 * 8); movl $(6 * 8), %eax; EXIT_SYSV_FUNC ret; + CFI_RESTORE_STATE(); .align 4 .Ldec_256: je .Ldec_192; @@ -445,6 +470,7 @@ _gcry_aes_amd64_decrypt_block: decround(9); jmp .Ldec_tail; + CFI_ENDPROC(); ELF(.size _gcry_aes_amd64_decrypt_block,.-_gcry_aes_amd64_decrypt_block;) #endif /*USE_AES*/ diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index ffce5df2f..8124eb219 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -50,6 +50,7 @@ ELF(.type _gcry_aes_ssse3_enc_preload, at function) .globl _gcry_aes_ssse3_enc_preload _gcry_aes_ssse3_enc_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F @@ -61,6 +62,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ## @@ -69,6 +71,7 @@ ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) ELF(.type _gcry_aes_ssse3_dec_preload, at function) .globl _gcry_aes_ssse3_dec_preload _gcry_aes_ssse3_dec_preload: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 lea .Laes_consts(%rip), %rax movdqa (%rax), %xmm9 # 0F @@ -81,6 +84,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbe (%rax), %xmm8 # sbeu EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) ## @@ -111,6 +115,7 @@ ELF(.type _gcry_aes_ssse3_encrypt_core, at function) .globl _gcry_aes_ssse3_encrypt_core _gcry_aes_ssse3_encrypt_core: _aes_encrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx leaq -1(%rsi), %rax @@ -190,6 +195,7 @@ _aes_encrypt_core: pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size 
_aes_encrypt_core,.-_aes_encrypt_core) ## @@ -202,6 +208,7 @@ ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) ELF(.type _gcry_aes_ssse3_decrypt_core, at function) _gcry_aes_ssse3_decrypt_core: _aes_decrypt_core: + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 mov %rdi, %rdx lea .Laes_consts(%rip), %rcx @@ -297,6 +304,7 @@ _aes_decrypt_core: pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) ######################################################## @@ -315,6 +323,7 @@ _aes_schedule_core: # rdx = buffer # rcx = direction. 0=encrypt, 1=decrypt # r8 = rotoffs + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 # load the tables @@ -671,6 +680,7 @@ _aes_schedule_core: pxor %xmm8, %xmm8 EXIT_SYSV_FUNC ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) ######################################################## diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index 470c32aad..ae8f27155 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -28,11 +28,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SALSA20) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -40,6 +36,7 @@ .globl _gcry_salsa20_amd64_keysetup ELF(.type _gcry_salsa20_amd64_keysetup, at function;) _gcry_salsa20_amd64_keysetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%r9d movl 8(%rsi),%eax @@ -87,11 +84,13 @@ _gcry_salsa20_amd64_keysetup: movl %r8d,12(%rdi) .L_keysetupdone: ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_ivsetup ELF(.type _gcry_salsa20_amd64_ivsetup, at function;) _gcry_salsa20_amd64_ivsetup: + CFI_STARTPROC(); movl 0(%rsi),%r8d movl 4(%rsi),%esi mov $0,%r9 @@ -101,6 +100,7 @@ _gcry_salsa20_amd64_ivsetup: movl %r9d,32(%rdi) movl %eax,52(%rdi) ret + CFI_ENDPROC(); .align 8 .globl _gcry_salsa20_amd64_encrypt_blocks @@ -112,13 +112,15 @@ _gcry_salsa20_amd64_encrypt_blocks: * - Length is input as number of blocks, so don't handle tail bytes * (this is done in salsa20.c). */ + CFI_STARTPROC(); push %rbx + CFI_PUSH(%rbx); shlq $6, %rcx /* blocks to bytes */ mov %r8, %rbx mov %rsp,%r11 - and $31,%r11 - add $384,%r11 - sub %r11,%rsp + CFI_DEF_CFA_REGISTER(%r11); + sub $384,%rsp + and $~31,%rsp mov %rdi,%r8 mov %rsi,%rsi mov %rdx,%rdi @@ -916,15 +918,22 @@ _gcry_salsa20_amd64_encrypt_blocks: cmp $64,%rdx ja .L_bytes_are_128_or_192 .L_done: - add %r11,%rsp + CFI_REMEMBER_STATE(); mov %r11,%rax + sub %rsp,%rax + mov %r11,%rsp + CFI_REGISTER(%r11, %rsp) + CFI_DEF_CFA_REGISTER(%rsp) pop %rbx + CFI_POP(%rbx) ret + CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx add $64,%rdi add $64,%rsi jmp .L_bytes_are_64_128_or_192 + CFI_ENDPROC(); ELF(.size _gcry_salsa20_amd64_encrypt_blocks,.-_gcry_salsa20_amd64_encrypt_blocks;) #endif /*defined(USE_SALSA20)*/ diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index 8d60a159e..9b17c2bd1 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 @@ -421,6 +411,7 @@ __serpent_enc_blk16: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; @@ -496,6 +487,7 @@ __serpent_enc_blk16: transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) .align 8 @@ -509,6 +501,7 @@ __serpent_dec_blk16: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: sixteen parallel * plaintext blocks */ + CFI_STARTPROC(); vpcmpeqd RNOT, RNOT, RNOT; @@ -586,6 +579,7 @@ __serpent_dec_blk16: transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -604,13 +598,14 @@ _gcry_serpent_avx2_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ @@ -701,7 +696,8 @@ _gcry_serpent_avx2_ctr_enc: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) .align 8 @@ -714,6 +710,7 @@ _gcry_serpent_avx2_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -752,7 +749,8 @@ _gcry_serpent_avx2_cbc_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) .align 8 @@ -765,6 +763,7 @@ _gcry_serpent_avx2_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -805,7 +804,8 @@ _gcry_serpent_avx2_cfb_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) .align 8 @@ -821,15 +821,21 @@ _gcry_serpent_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; @@ -882,10 +888,15 @@ _gcry_serpent_avx2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA4, RA4; vpxor (1 * 32)(%rsi), RA1, RA1; @@ -908,6 +919,7 @@ _gcry_serpent_avx2_ocb_enc: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) .align 8 @@ -923,15 +935,21 @@ _gcry_serpent_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; @@ -978,10 +996,15 @@ _gcry_serpent_avx2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + 
CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vmovdqu (%r8), RTMP1x; @@ -1020,6 +1043,7 @@ _gcry_serpent_avx2_ocb_dec: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) .align 8 @@ -1034,15 +1058,21 @@ _gcry_serpent_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; @@ -1088,10 +1118,15 @@ _gcry_serpent_avx2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA4, RB4, RA4; vpxor RA1, RB1, RA1; @@ -1111,6 +1146,7 @@ _gcry_serpent_avx2_ocb_auth: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) .align 16 diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index b149af24e..39cba0029 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_SERPENT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" /* struct serpent_context: */ #define ctx_keys 0 @@ -444,6 +434,7 @@ __serpent_enc_blk8: * RA4, RA1, RA2, RA0, RB4, RB1, RB2, RB0: eight parallel * ciphertext blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; @@ -519,6 +510,7 @@ __serpent_enc_blk8: transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) .align 8 @@ -532,6 +524,7 @@ __serpent_dec_blk8: * RA0, RA1, RA2, RA3, RB0, RB1, RB2, RB3: eight parallel plaintext * blocks */ + CFI_STARTPROC(); pcmpeqd RNOT, RNOT; @@ -609,6 +602,7 @@ __serpent_dec_blk8: transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); ret; + CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) .align 8 @@ -621,6 +615,7 @@ _gcry_serpent_sse2_ctr_enc: * %rdx: src (8 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); /* load IV and byteswap */ movdqu (%rcx), RA0; @@ -738,7 +733,8 @@ _gcry_serpent_sse2_ctr_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) .align 8 @@ -751,6 +747,7 @@ _gcry_serpent_sse2_cbc_dec: * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); movdqu (0 * 16)(%rdx), RA0; movdqu (1 * 16)(%rdx), RA1; @@ -799,7 +796,8 @@ _gcry_serpent_sse2_cbc_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) .align 8 @@ -812,6 +810,7 @@ _gcry_serpent_sse2_cfb_dec: * %rdx: src (8 blocks) * %rcx: iv */ + CFI_STARTPROC(); /* Load input */ movdqu (%rcx), RA0; @@ -863,7 +862,8 @@ _gcry_serpent_sse2_cfb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret + ret; + CFI_ENDPROC(); ELF(.size 
_gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) .align 8 @@ -879,13 +879,19 @@ _gcry_serpent_sse2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; movdqu (%r8), RTMP1; @@ -926,10 +932,15 @@ _gcry_serpent_sse2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); pxor_u((0 * 16)(%rsi), RA4, RTMP0); pxor_u((1 * 16)(%rsi), RA1, RTMP0); @@ -966,6 +977,7 @@ _gcry_serpent_sse2_ocb_enc: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) .align 8 @@ -981,13 +993,19 @@ _gcry_serpent_sse2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rcx), RTMP0; @@ -1024,10 +1042,15 @@ _gcry_serpent_sse2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_dec_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%r8), RTMP0; @@ -1078,6 +1101,7 @@ _gcry_serpent_sse2_ocb_dec: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) .align 8 @@ -1092,13 +1116,19 @@ _gcry_serpent_sse2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[8]) */ + CFI_STARTPROC(); subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); movdqu (%rdx), RTMP0; @@ -1134,10 +1164,15 @@ _gcry_serpent_sse2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __serpent_enc_blk8; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); movdqu (%rcx), RTMP0; pxor RB4, RA4; @@ -1169,6 +1204,7 @@ _gcry_serpent_sse2_ocb_auth: pxor RNOT, RNOT; ret; + CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) #endif /*defined(USE_SERPENT)*/ diff --git a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 5d674c151..85876ad41 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -33,18 +33,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -161,7 +150,7 @@ vpshufb BSWAP_REG, tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ vmovdqa tmp0, WK(i&~3); @@ -186,7 +175,7 @@ #define W_PRECALC_16_31_3(i, W, W_m04, W_m08, W_m12, W_m16, tmp0, tmp1) \ vpxor W, tmp0, tmp0; \ vpxor tmp1, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -203,7 +192,7 @@ #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ vpor W, tmp0, W; \ - vpaddd (.LK_XMM + ((i)/20)*16) RIP, W, tmp0; \ + vpaddd (.LK_XMM + ((i)/20)*16) rRIP, W, tmp0; \ vmovdqa tmp0, WK((i)&~3); @@ -223,6 +212,7 @@ _gcry_sha1_transform_amd64_avx: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -234,9 +224,12 @@ _gcry_sha1_transform_amd64_avx: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -248,7 +241,7 @@ _gcry_sha1_transform_amd64_avx: movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -415,15 +408,20 @@ _gcry_sha1_transform_amd64_avx: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index fe8901eff..5dfcdca97 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -34,18 +34,7 @@ defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -222,6 +211,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -233,10 +223,14 @@ _gcry_sha1_transform_amd64_avx_bmi2: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -249,11 +243,11 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl state_h4(RSTATE), e; xorl ne, ne; - vmovdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vmovdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-15. 
*/ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -424,16 +418,22 @@ _gcry_sha1_transform_amd64_avx_bmi2: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 2a2f21a56..938632305 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -34,18 +34,7 @@ defined(HAVE_GCC_INLINE_ASM_BMI2) && defined(HAVE_GCC_INLINE_ASM_AVX) && \ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -228,6 +217,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: * %rsi: data (64*nblks bytes) * %rdx: nblks (multiple of 2, larger than 0) */ + CFI_STARTPROC(); vzeroupper; @@ -235,10 +225,14 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); pushq %r12; + CFI_PUSH(%r12); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(WK_STACK_WORDS*4), %rsp; andq $(~63), %rsp; @@ -251,11 +245,11 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movl state_h4(RSTATE), e; xorl ne, ne; - vbroadcasti128 .Lbswap_shufb_ctl RIP, BSWAP_REG; - vpbroadcastd .LK1 RIP, K1; - vpbroadcastd .LK2 RIP, K2; - vpbroadcastd .LK3 RIP, K3; - vpbroadcastd .LK4 RIP, K4; + vbroadcasti128 .Lbswap_shufb_ctl rRIP, BSWAP_REG; + vpbroadcastd .LK1 rRIP, K1; + vpbroadcastd .LK2 rRIP, K2; + vpbroadcastd .LK3 rRIP, K3; + vpbroadcastd .LK4 rRIP, K4; /* Precalc 0-31 for block 1 & 2. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -557,15 +551,21 @@ _gcry_sha1_transform_amd64_avx2_bmi2: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %r12; + CFI_POP(%r12); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index fff140345..7e32b0f4b 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -33,18 +33,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA1) -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif - - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" /* Context structure */ @@ -162,7 +151,7 @@ movdqa tmp0, W; #define W_PRECALC_00_15_2(i, W, tmp0) \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; #define W_PRECALC_00_15_3(i, W, tmp0) \ movdqa tmp0, WK(i&~3); @@ -193,7 +182,7 @@ pxor W, tmp0; \ pxor tmp1, tmp0; \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define W_PRECALC_32_79_0(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ @@ -213,7 +202,7 @@ #define W_PRECALC_32_79_3(i, W, W_m04, W_m08, W_m12, W_m16, W_m20, W_m24, W_m28, tmp0) \ movdqa tmp0, W; \ - paddd (.LK_XMM + ((i)/20)*16) RIP, tmp0; \ + paddd (.LK_XMM + ((i)/20)*16) rRIP, tmp0; \ movdqa tmp0, WK((i)&~3); #define CLEAR_REG(reg) pxor reg, reg; @@ -235,6 +224,7 @@ _gcry_sha1_transform_amd64_ssse3: * %rsi: data (64*nblks bytes) * %rdx: nblks */ + CFI_STARTPROC(); xorl %eax, %eax; cmpq $0, %rdx; @@ -244,9 +234,12 @@ _gcry_sha1_transform_amd64_ssse3: movq %rdi, RSTATE; movq %rsi, RDATA; pushq %rbx; + CFI_PUSH(%rbx); pushq %rbp; + CFI_PUSH(%rbp); movq %rsp, ROLDSTACK; + CFI_DEF_CFA_REGISTER(ROLDSTACK); subq $(16*4), %rsp; andq $(~31), %rsp; @@ -258,7 +251,7 @@ _gcry_sha1_transform_amd64_ssse3: movl state_h3(RSTATE), d; movl state_h4(RSTATE), e; - movdqa .Lbswap_shufb_ctl RIP, BSWAP_REG; + movdqa .Lbswap_shufb_ctl rRIP, BSWAP_REG; /* Precalc 0-15. */ W_PRECALC_00_15_0(0, W0, Wtmp0); @@ -423,15 +416,20 @@ _gcry_sha1_transform_amd64_ssse3: movl e, state_h4(RSTATE); movq ROLDSTACK, %rsp; + CFI_REGISTER(ROLDSTACK, %rsp); + CFI_DEF_CFA_REGISTER(%rsp); popq %rbp; + CFI_POP(%rbp); popq %rbx; + CFI_POP(%rbx); /* stack already burned */ xorl %eax, %eax; .Lret: ret; + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index b8b01b15b..77143ff0e 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -59,17 +59,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -380,15 +370,22 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_avx, at function;) .align 16 _gcry_sha256_transform_amd64_avx: + CFI_STARTPROC() vzeroupper push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash @@ -487,14 +484,21 @@ _gcry_sha256_transform_amd64_avx: xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index 5fc402cd1..52be1a07b 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -60,17 +60,7 @@ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) 
__VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -314,17 +304,24 @@ a = TMP_ ELF(.type _gcry_sha256_transform_amd64_avx2, at function) .align 32 _gcry_sha256_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 je .Lnowork push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r12 + CFI_PUSH(r12) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) vzeroupper @@ -333,9 +330,11 @@ _gcry_sha256_transform_amd64_avx2: vmovdqa SHUF_DC00, [.L_SHUF_DC00 ADD_RIP] mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, STACK_SIZE and rsp, ~63 mov [rsp + _RSP], rax + CFI_CFA_ON_STACK(_RSP, 6 * 8) shl NUM_BLKS, 6 /* convert to bytes */ lea NUM_BLKS, [NUM_BLKS + INP - 64] /* pointer to last block */ @@ -507,16 +506,24 @@ _gcry_sha256_transform_amd64_avx2: xor eax, eax mov rsp, [rsp + _RSP] + CFI_DEF_CFA_REGISTER(rsp) pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop r12 + CFI_POP(r12) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) .Lnowork: ret + CFI_ENDPROC() .align 64 .LK256: diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index ca5c9fd1d..0fb94c1b3 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -60,17 +60,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA256) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -386,13 +376,20 @@ rotate_Xs ELF(.type _gcry_sha256_transform_amd64_ssse3, at function;) .align 16 _gcry_sha256_transform_amd64_ssse3: + CFI_STARTPROC() push rbx + CFI_PUSH(rbx) push rbp + CFI_PUSH(rbp) push r13 + CFI_PUSH(r13) push r14 + CFI_PUSH(r14) push r15 + CFI_PUSH(r15) sub rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(STACK_SIZE); shl NUM_BLKS, 6 /* convert to bytes */ jz .Ldone_hash @@ -508,14 +505,21 @@ _gcry_sha256_transform_amd64_ssse3: xor eax, eax add rsp, STACK_SIZE + CFI_ADJUST_CFA_OFFSET(-STACK_SIZE); pop r15 + CFI_POP(r15) pop r14 + CFI_POP(r14) pop r13 + CFI_POP(r13) pop rbp + CFI_POP(rbp) pop rbx + CFI_POP(rbx) ret + CFI_ENDPROC() .align 16 diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 534351e44..991fd6395 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -46,17 +46,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_AVX) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -269,6 +259,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) ELF(.type _gcry_sha512_transform_amd64_avx, at function;) .align 16 _gcry_sha512_transform_amd64_avx: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 @@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_avx: /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx @@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_avx: mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: @@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_avx: mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) vzeroall @@ -365,9 +367,11 @@ _gcry_sha512_transform_amd64_avx: /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 32cfceb0b..3b28ab6c6 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -49,17 +49,7 @@ defined(HAVE_GCC_INLINE_ASM_AVX2) && defined(HAVE_GCC_INLINE_ASM_BMI2) && \ defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -352,6 +342,7 @@ y4 = r12 ELF(.type _gcry_sha512_transform_amd64_avx2, at function;) .align 16 _gcry_sha512_transform_amd64_avx2: + CFI_STARTPROC() xor eax, eax cmp rdx, 0 @@ -361,9 +352,11 @@ _gcry_sha512_transform_amd64_avx2: /* Allocate Stack Space */ mov rax, rsp + CFI_DEF_CFA_REGISTER(rax); sub rsp, frame_size and rsp, ~(0x40 - 1) mov [rsp + frame_RSPSAVE], rax + CFI_CFA_ON_STACK(frame_RSPSAVE, 0) /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbp @@ -372,6 +365,12 @@ _gcry_sha512_transform_amd64_avx2: mov [rsp + frame_GPRSAVE + 8 * 3], r13 mov [rsp + frame_GPRSAVE + 8 * 4], r14 mov [rsp + frame_GPRSAVE + 8 * 5], r15 + CFI_REG_ON_STACK(rbp, frame_GPRSAVE + 8 * 0) + CFI_REG_ON_STACK(rbx, frame_GPRSAVE + 8 * 1) + CFI_REG_ON_STACK(r12, frame_GPRSAVE + 8 * 2) + CFI_REG_ON_STACK(r13, frame_GPRSAVE + 8 * 3) + CFI_REG_ON_STACK(r14, frame_GPRSAVE + 8 * 4) + CFI_REG_ON_STACK(r15, frame_GPRSAVE + 8 * 5) mov [rsp + frame_NBLKS], NUM_BLKS @@ -494,11 +493,20 @@ _gcry_sha512_transform_amd64_avx2: mov r13, [rsp + frame_GPRSAVE + 8 * 3] mov r14, [rsp + frame_GPRSAVE + 8 * 4] mov r15, [rsp + frame_GPRSAVE + 8 * 5] + CFI_RESTORE(rbp) + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) /* Restore Stack Pointer */ mov rsp, [rsp + frame_RSPSAVE] + CFI_DEF_CFA_REGISTER(rsp) + .Lnowork: ret + CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ /*;; Binary Data */ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 8e950e0e4..39bfe3625 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -49,17 +49,7 @@ defined(HAVE_INTEL_SYNTAX_PLATFORM_AS) && \ defined(HAVE_GCC_INLINE_ASM_SSSE3) && defined(USE_SHA512) -#ifdef __PIC__ -# define ADD_RIP +rip -#else -# define ADD_RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif +#include "asm-common-amd64.h" .intel_syntax noprefix @@ -271,6 +261,7 @@ frame_size = ((frame_GPRSAVE) + (frame_GPRSAVE_size)) ELF(.type _gcry_sha512_transform_amd64_ssse3, at function;) .align 16 _gcry_sha512_transform_amd64_ssse3: + CFI_STARTPROC() xor eax, eax cmp msglen, 0 @@ -278,6 +269,7 @@ _gcry_sha512_transform_amd64_ssse3: /* Allocate Stack Space */ sub rsp, frame_size + CFI_ADJUST_CFA_OFFSET(frame_size); /* Save GPRs */ mov [rsp + frame_GPRSAVE + 8 * 0], rbx @@ -285,6 +277,11 @@ _gcry_sha512_transform_amd64_ssse3: mov [rsp + frame_GPRSAVE + 8 * 2], r13 mov [rsp + frame_GPRSAVE + 8 * 3], r14 mov [rsp + frame_GPRSAVE + 8 * 4], r15 + CFI_REL_OFFSET(rbx, frame_GPRSAVE + 8 * 0); + CFI_REL_OFFSET(r12, frame_GPRSAVE + 8 * 1); + CFI_REL_OFFSET(r13, frame_GPRSAVE + 8 * 2); + CFI_REL_OFFSET(r14, frame_GPRSAVE + 8 * 3); + CFI_REL_OFFSET(r15, frame_GPRSAVE + 8 * 4); .Lupdateblock: @@ -351,6 +348,11 @@ _gcry_sha512_transform_amd64_ssse3: mov r13, [rsp + frame_GPRSAVE + 8 * 2] mov r14, [rsp + frame_GPRSAVE + 8 * 3] mov r15, [rsp + frame_GPRSAVE + 8 * 4] + CFI_RESTORE(rbx) + CFI_RESTORE(r12) + CFI_RESTORE(r13) + CFI_RESTORE(r14) + CFI_RESTORE(r15) pxor xmm0, xmm0 pxor xmm1, xmm1 @@ -370,9 +372,11 @@ _gcry_sha512_transform_amd64_ssse3: /* Restore Stack Pointer */ add rsp, frame_size + CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: ret + CFI_ENDPROC() /* ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 134d6401e..3cb734317 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -171,12 +171,16 @@ _gcry_twofish_amd64_encrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RAd, 0); @@ -201,10 +205,14 @@ _gcry_twofish_amd64_encrypt_block: movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) .align 8 @@ -217,12 +225,16 @@ _gcry_twofish_amd64_decrypt_block: * %rsi: dst * %rdx: src */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(3 * 8); movq %rsi, (0 * 8)(%rsp); movq %rbp, (1 * 8)(%rsp); movq %rbx, (2 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 1 * 8); + CFI_REL_OFFSET(%rbx, 2 * 8); movq %rdx, RX; inpack(RX, 0, RCd, 4); @@ -247,10 +259,14 @@ _gcry_twofish_amd64_decrypt_block: movq (2 * 8)(%rsp), %rbx; movq (1 * 8)(%rsp), %rbp; + CFI_RESTORE(%rbx); + CFI_RESTORE(%rbp); addq $(3 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) #undef CTX @@ -480,6 +496,8 @@ __twofish_enc_blk3: * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three ciphertext blocks */ + CFI_STARTPROC(); + inpack_enc3(); encrypt_cycle3(RAB, RCD, 0); @@ -494,6 +512,7 @@ __twofish_enc_blk3: outunpack_enc3(); ret; + CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) .align 8 @@ -506,6 +525,8 @@ __twofish_dec_blk3: * output: * RCD0,RAB0,RCD1,RAB1,RCD2,RAB2: three plaintext blocks */ + CFI_STARTPROC(); + inpack_dec3(); decrypt_cycle3(RAB, RCD, 7); @@ -520,6 +541,7 @@ __twofish_dec_blk3: outunpack_dec3(); ret; + 
CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) .align 8 @@ -532,15 +554,23 @@ _gcry_twofish_amd64_ctr_enc: * %rdx: src (3 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -601,10 +631,18 @@ _gcry_twofish_amd64_ctr_enc: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) .align 8 @@ -617,15 +655,23 @@ _gcry_twofish_amd64_cbc_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(9 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -670,10 +716,18 @@ _gcry_twofish_amd64_cbc_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(9 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) .align 8 @@ -686,15 +740,23 @@ _gcry_twofish_amd64_cfb_dec: * %rdx: src (3 blocks) * %rcx: iv (128bit) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_0_4 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, (7 * 8)(%rsp); @@ -739,10 +801,18 @@ _gcry_twofish_amd64_cfb_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) .align 8 @@ -757,15 +827,23 @@ _gcry_twofish_amd64_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + 
CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %rdx, RX0; @@ -849,10 +927,18 @@ _gcry_twofish_amd64_ocb_enc: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) .align 8 @@ -867,15 +953,23 @@ _gcry_twofish_amd64_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_6 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rsi, (6 * 8)(%rsp); movq %r8, (7 * 8)(%rsp); @@ -967,10 +1061,18 @@ _gcry_twofish_amd64_ocb_dec: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) .align 8 @@ -984,15 +1086,23 @@ _gcry_twofish_amd64_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[3]) */ + CFI_STARTPROC(); ENTER_SYSV_FUNC_PARAMS_5 subq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(8 * 8); movq %rbp, (0 * 8)(%rsp); movq %rbx, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); movq %r14, (4 * 8)(%rsp); movq %r15, (5 * 8)(%rsp); + CFI_REL_OFFSET(%rbp, 0 * 8); + CFI_REL_OFFSET(%rbx, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); + CFI_REL_OFFSET(%r14, 4 * 8); + CFI_REL_OFFSET(%r15, 5 * 8); movq %rcx, (6 * 8)(%rsp); movq %rsi, RX0; @@ -1056,10 +1166,18 @@ _gcry_twofish_amd64_ocb_auth: movq (3 * 8)(%rsp), %r13; movq (4 * 8)(%rsp), %r14; movq (5 * 8)(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $(8 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) #endif /*USE_TWOFISH*/ diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index db6e21826..74cad3558 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -24,17 +24,7 @@ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_TWOFISH) && \ defined(ENABLE_AVX2_SUPPORT) -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) 
/*_*/ -#endif - -#ifdef __PIC__ -# define RIP (%rip) -#else -# define RIP -#endif +#include "asm-common-amd64.h" .text @@ -423,6 +413,7 @@ __twofish_enc_blk16: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); @@ -441,6 +432,7 @@ __twofish_enc_blk16: transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) .align 8 @@ -454,6 +446,7 @@ __twofish_dec_blk16: * RA0, RB0, RC0, RD0, RA1, RB1, RC1, RD1: sixteen parallel * ciphertext blocks */ + CFI_STARTPROC(); init_round_constants(); transpose4x4_16(RA, RB, RC, RD); @@ -472,6 +465,7 @@ __twofish_dec_blk16: transpose4x4_16(RA, RB, RC, RD); ret; + CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) #define inc_le128(x, minus_one, tmp) \ @@ -490,13 +484,14 @@ _gcry_twofish_avx2_ctr_enc: * %rdx: src (16 blocks) * %rcx: iv (big endian, 128bit) */ + CFI_STARTPROC(); movq 8(%rcx), %rax; bswapq %rax; vzeroupper; - vbroadcasti128 .Lbswap128_mask RIP, RTMP3; + vbroadcasti128 .Lbswap128_mask rRIP, RTMP3; vpcmpeqd RNOT, RNOT, RNOT; vpsrldq $8, RNOT, RNOT; /* ab: -1:0 ; cd: -1:0 */ vpaddq RNOT, RNOT, RTMP2; /* ab: -2:0 ; cd: -2:0 */ @@ -587,7 +582,8 @@ _gcry_twofish_avx2_ctr_enc: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) .align 8 @@ -600,6 +596,7 @@ _gcry_twofish_avx2_cbc_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -638,7 +635,8 @@ _gcry_twofish_avx2_cbc_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) .align 8 @@ -651,6 +649,7 @@ _gcry_twofish_avx2_cfb_dec: * %rdx: src (16 blocks) * %rcx: iv */ + CFI_STARTPROC(); vzeroupper; @@ -691,7 +690,8 @@ _gcry_twofish_avx2_cfb_dec: vzeroall; - ret + ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) .align 8 @@ -707,15 +707,21 @@ _gcry_twofish_avx2_ocb_enc: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; vmovdqu (%r8), RTMP1x; @@ -768,10 +774,15 @@ _gcry_twofish_avx2_ocb_enc: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor (0 * 32)(%rsi), RA0, RA0; vpxor (1 * 32)(%rsi), RB0, RB0; @@ -794,6 +805,7 @@ _gcry_twofish_avx2_ocb_enc: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) .align 8 @@ -809,15 +821,21 @@ _gcry_twofish_avx2_ocb_dec: * %r8 : checksum * %r9 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rcx), RTMP0x; @@ -865,6 +883,10 @@ _gcry_twofish_avx2_ocb_dec: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + 
CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_dec_blk16; @@ -880,6 +902,7 @@ _gcry_twofish_avx2_ocb_dec: vpxor (7 * 32)(%rsi), RD1, RD1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); /* Checksum_i = Checksum_{i-1} xor P_i */ @@ -907,6 +930,7 @@ _gcry_twofish_avx2_ocb_dec: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) .align 8 @@ -921,15 +945,21 @@ _gcry_twofish_avx2_ocb_auth: * %rcx: checksum * %r8 : L pointers (void *L[16]) */ + CFI_STARTPROC(); vzeroupper; subq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(4 * 8); movq %r10, (0 * 8)(%rsp); movq %r11, (1 * 8)(%rsp); movq %r12, (2 * 8)(%rsp); movq %r13, (3 * 8)(%rsp); + CFI_REL_OFFSET(%r10, 0 * 8); + CFI_REL_OFFSET(%r11, 1 * 8); + CFI_REL_OFFSET(%r12, 2 * 8); + CFI_REL_OFFSET(%r13, 3 * 8); vmovdqu (%rdx), RTMP0x; @@ -975,6 +1005,10 @@ _gcry_twofish_avx2_ocb_auth: movq (1 * 8)(%rsp), %r11; movq (2 * 8)(%rsp), %r12; movq (3 * 8)(%rsp), %r13; + CFI_RESTORE(%r10); + CFI_RESTORE(%r11); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); call __twofish_enc_blk16; @@ -987,6 +1021,7 @@ _gcry_twofish_avx2_ocb_auth: vpxor RA1, RC1, RA1; addq $(4 * 8), %rsp; + CFI_ADJUST_CFA_OFFSET(-4 * 8); vpxor RA1, RA0, RTMP1; @@ -998,6 +1033,7 @@ _gcry_twofish_avx2_ocb_auth: vzeroall; ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) .align 16 diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index e98b831c0..5631dc567 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -23,17 +23,7 @@ #if (defined(HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS) || \ defined(HAVE_COMPATIBLE_GCC_WIN64_PLATFORM_AS)) && defined(USE_WHIRLPOOL) -#ifdef __PIC__ -# define RIP %rip -#else -# define RIP -#endif - -#ifdef HAVE_COMPATIBLE_GCC_AMD64_PLATFORM_AS -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "asm-common-amd64.h" .text @@ -173,16 +163,24 @@ _gcry_whirlpool_transform_amd64: * %rdx: nblks * %rcx: look-up tables */ + CFI_STARTPROC(); cmp $0, %rdx; je .Lskip; subq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(STACK_MAX); movq %rbp, STACK_RBP(%rsp); movq %rbx, STACK_RBX(%rsp); movq %r12, STACK_R12(%rsp); movq %r13, STACK_R13(%rsp); movq %r14, STACK_R14(%rsp); movq %r15, STACK_R15(%rsp); + CFI_REL_OFFSET(%rbp, STACK_RBP); + CFI_REL_OFFSET(%rbx, STACK_RBX); + CFI_REL_OFFSET(%r12, STACK_R12); + CFI_REL_OFFSET(%r13, STACK_R13); + CFI_REL_OFFSET(%r14, STACK_R14); + CFI_REL_OFFSET(%r15, STACK_R15); movq %rdx, STACK_NBLKS(%rsp); movq %rdi, STACK_STATEP(%rsp); @@ -332,10 +330,18 @@ _gcry_whirlpool_transform_amd64: movq STACK_R13(%rsp), %r13; movq STACK_R14(%rsp), %r14; movq STACK_R15(%rsp), %r15; + CFI_RESTORE(%rbp); + CFI_RESTORE(%rbx); + CFI_RESTORE(%r12); + CFI_RESTORE(%r13); + CFI_RESTORE(%r14); + CFI_RESTORE(%r15); addq $STACK_MAX, %rsp; + CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; ret; + CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) #endif diff --git a/configure.ac b/configure.ac index b54b212b3..75949f942 100644 --- a/configure.ac +++ b/configure.ac @@ -1171,6 +1171,32 @@ if test "$gcry_cv_gcc_aarch64_platform_as_ok" = "yes" ; then fi +# +# Check whether GCC assembler supports for CFI directives. 
+# +AC_CACHE_CHECK([whether GCC assembler supports for CFI directives], + [gcry_cv_gcc_asm_cfi_directives], + [gcry_cv_gcc_asm_cfi_directives=no + AC_COMPILE_IFELSE([AC_LANG_SOURCE( + [[__asm__( + ".cfi_startproc\n\t" + ".cfi_remember_state\n\t" + ".cfi_adjust_cfa_offset 8\n\t" + ".cfi_rel_offset 0, 8\n\t" + ".cfi_def_cfa_register 1\n\t" + ".cfi_register 2, 3\n\t" + ".cfi_restore 2\n\t" + ".cfi_escape 0x0f, 0x02, 0x11, 0x00\n\t" + ".cfi_restore_state\n\t" + ".cfi_endproc\n\t" + );]])], + [gcry_cv_gcc_asm_cfi_directives=yes])]) +if test "$gcry_cv_gcc_asm_cfi_directives" = "yes" ; then + AC_DEFINE(HAVE_GCC_ASM_CFI_DIRECTIVES,1, + [Defined if underlying assembler supports for CFI directives]) +fi + + # # Check whether underscores in symbols are required. This needs to be # done before setting up the assembler stuff. @@ -1617,7 +1643,6 @@ if test "$gcry_cv_gcc_platform_as_ok_for_intel_syntax" = "yes" ; then [Defined if underlying assembler is compatible with Intel syntax assembly implementations]) fi - # # Check whether compiler is configured for ARMv6 or newer architecture # diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index ce4467441..37d5722af 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,9 +1,36 @@ +#include + +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_RESTORE(reg) .cfi_restore reg + +# define CFI_PUSH(reg) \ + CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) +# define CFI_POP(reg) \ + CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_RESTORE(reg) + +# define CFI_PUSH(reg) +# define CFI_POP(reg) +#endif + #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to * SYSV ABI. */ #define FUNC_ENTRY() \ + CFI_STARTPROC(); \ pushq %rsi; \ + CFI_PUSH(%rsi); \ pushq %rdi; \ + CFI_PUSH(%rdi); \ movq %rdx, %rsi; \ movq %rcx, %rdi; \ movq %r8, %rdx; \ @@ -12,8 +39,16 @@ /* Restore registers. */ #define FUNC_EXIT() \ popq %rdi; \ - popq %rsi; + CFI_POP(%rdi); \ + popq %rsi; \ + CFI_POP(%rsi); \ + ret; \ + CFI_ENDPROC(); #else - #define FUNC_ENTRY() /**/ - #define FUNC_EXIT() /**/ + #define FUNC_ENTRY() \ + CFI_STARTPROC(); + + #define FUNC_EXIT() \ + ret; \ + CFI_ENDPROC(); #endif diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 6a9026219..157e5f1e0 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -62,4 +62,3 @@ C_SYMBOL_NAME(_gcry_mpih_add_n:) adcq %rax, %rax FUNC_EXIT() ret - \ No newline at end of file From devnexen at gmail.com Wed Apr 17 21:08:33 2019 From: devnexen at gmail.com (devnexen at gmail.com) Date: Wed, 17 Apr 2019 15:08:33 -0400 Subject: PATCH: NetBSD - explicit_memset support Message-ID: -----BEGIN This message is encrypted: Open Message Alternatively copy and paste the following link: https://flowcrypt.com/qSJwWL0c2h -------------- next part -------------- An HTML attachment was scrubbed... 
URL: 

From wk at gnupg.org  Tue Apr 23 17:16:43 2019
From: wk at gnupg.org (Werner Koch)
Date: Tue, 23 Apr 2019 17:16:43 +0200
Subject: [PATCH] Limit and document Blowfish key lengths to 8-576 bits
In-Reply-To: <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi> (Jussi Kivilinna's message of "Thu, 18 Apr 2019 18:38:48 +0300")
References: <155553217686.31071.12317723959985459255.stgit@localhost.localdomain> <20190417215036.GA20903@al> <6fea47bc-24af-cad3-282b-9947accd464e@iki.fi>
Message-ID: <871s1syco4.fsf@wheatstone.g10code.de>

On Thu, 18 Apr 2019 18:38, jussi.kivilinna at iki.fi said:

> gcry_cipher_get_algo_keylen as existing users might depend it to stay
> fixed to 128bits.

Yes, this is the case.  We can't change that.  The variable keylength
is anyway very specific to Blowfish and does not justify a new
interface.

Salam-Shalom,

   Werner

-- 
Thoughts are free.  Exceptions are regulated by a federal law.

From jussi.kivilinna at iki.fi  Fri Apr 26 18:33:31 2019
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Fri, 26 Apr 2019 19:33:31 +0300
Subject: [PATCH 1/4] Add 64-bit ARMv8/CE PMULL implementation of CRC
Message-ID: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain>

* cipher/Makefile.am: Add 'crc-armv8-ce.c' and 'crc-armv8-aarch64-ce.S'.
* cipher/asm-common-aarch64.h [HAVE_GCC_ASM_CFI_DIRECTIVES]: Add CFI
helper macros.
* cipher/crc-armv8-aarch64-ce.S: New.
* cipher/crc-armv8-ce.c: New.
* cipher/crc.c (USE_ARM_PMULL): New.
(CRC_CONTEXT) [USE_ARM_PMULL]: Add 'use_pmull'.
[USE_ARM_PMULL] (_gcry_crc32_armv8_ce_pmull)
(_gcry_crc24rfc2440_armv8_ce_pmull): New prototypes.
(crc32_init, crc32rfc1510_init, crc24rfc2440_init): Enable ARM PMULL
implementations if supported by HW features.
(crc32_write, crc24rfc2440_write) [USE_ARM_PMULL]: Use ARM PMULL
implementations if enabled.
* configure.ac: Add 'crc-armv8-ce.lo' and 'crc-armv8-aarch64-ce.lo'.
--

Benchmark on Cortex-A53 (at 1104 Mhz):

Before:
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC32RFC1510   |      2.89 ns/B     330.2 MiB/s      3.19 c/B
 CRC24RFC2440   |      2.72 ns/B     350.8 MiB/s      3.00 c/B

After (crc32 ~8.4x faster, crc24 ~6.8x faster):
                |  nanosecs/byte   mebibytes/sec   cycles/byte
 CRC32          |     0.341 ns/B      2796 MiB/s     0.377 c/B
 CRC32RFC1510   |     0.342 ns/B      2792 MiB/s     0.377 c/B
 CRC24RFC2440   |     0.398 ns/B      2396 MiB/s     0.439 c/B

Signed-off-by: Jussi Kivilinna
---
 0 files changed

diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3f00ed4a8..2acd7cb38 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -78,7 +78,8 @@ EXTRA_libcipher_la_SOURCES = \
 	cast5.c cast5-amd64.S cast5-arm.S \
 	chacha20.c chacha20-amd64-ssse3.S chacha20-amd64-avx2.S \
 	chacha20-armv7-neon.S chacha20-aarch64.S \
-	crc.c crc-intel-pclmul.c \
+	crc.c crc-intel-pclmul.c crc-armv8-ce.c \
+	crc-armv8-aarch64-ce.S \
 	des.c des-amd64.S \
 	dsa.c \
 	elgamal.c \
diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S
new file mode 100644
index 000000000..497d00551
--- /dev/null
+++ b/cipher/crc-armv8-aarch64-ce.S
@@ -0,0 +1,492 @@
+/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
+ * Copyright (C) 2019 Jussi Kivilinna
+ *
+ * This file is part of Libgcrypt.
+ * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + */ + +#include "asm-common-aarch64.h" + +#if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) + +.cpu generic+simd+crypto + +.text + +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; + +/* Structure of crc32_consts_s */ + +#define consts_k(idx) ((idx) * 8) +#define consts_my_p(idx) (consts_k(6) + (idx) * 8) + +/* Constants */ + +.align 6 +.Lcrc32_constants: +.Lcrc32_partial_fold_input_mask: + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 +.Lcrc32_refl_shuf_shift: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 + .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f +.Lcrc32_shuf_shift: + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.Lcrc32_bswap_shuf: + .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 + .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + + +/* + * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32r_armv8_ce_bulk +ELF(.type _gcry_crc32r_armv8_ce_bulk,%function;) +_gcry_crc32r_armv8_ce_bulk: + /* input: + * x0: pcrc + * x1: inbuf + * x2: inlen + * x3: consts + */ + + GET_DATA_POINTER(x7, .Lcrc32_constants) + add x9, x3, #consts_k(5 - 1) + cmp x2, #128 + + b.lo .Lcrc32r_fold_by_one_setup + + eor v4.16b, v4.16b, v4.16b + add x4, x3, #consts_k(1 - 1) + ld1 {v4.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + ld1 {v6.16b}, [x4] + eor v0.16b, v0.16b, v4.16b + + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + +.Lcrc32r_fold_by_four: + + /* Fold by 4. */ + ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + pmull v20.1q, v0.1d, v6.1d + pmull v21.1q, v1.1d, v6.1d + pmull v22.1q, v2.1d, v6.1d + pmull v23.1q, v3.1d, v6.1d + cmp x2, #64 + pmull2 v24.1q, v0.2d, v6.2d + pmull2 v25.1q, v1.2d, v6.2d + pmull2 v26.1q, v2.2d, v6.2d + pmull2 v27.1q, v3.2d, v6.2d + eor v0.16b, v20.16b, v16.16b + eor v1.16b, v21.16b, v17.16b + eor v2.16b, v22.16b, v18.16b + eor v3.16b, v23.16b, v19.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v25.16b + eor v2.16b, v2.16b, v26.16b + eor v3.16b, v3.16b, v27.16b + b.hs .Lcrc32r_fold_by_four + + ld1 {v6.16b}, [x4] + ld1 {v5.16b}, [x5] + + cmp x2, #16 + + /* Fold 4 to 1. 
*/ + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v2.16b + eor v0.16b, v0.16b, v4.16b + + pmull v16.1q, v0.1d, v6.1d + pmull2 v4.1q, v0.2d, v6.2d + eor v0.16b, v16.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + + b.lo .Lcrc32r_fold_by_one_done + b .Lcrc32r_fold_by_one + +.Lcrc32r_fold_by_one_setup: + + eor v1.16b, v1.16b, v1.16b + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + sub x2, x2, #16 + ld1 {v1.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ + cmp x2, #16 + ld1 {v6.16b}, [x4] /* load k3k4 */ + ld1 {v5.16b}, [x5] /* load my_p */ + eor v0.16b, v0.16b, v1.16b + b.lo .Lcrc32r_fold_by_one_done + +.Lcrc32r_fold_by_one: + sub x2, x2, #16 + ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ + pmull v3.1q, v0.1d, v6.1d + pmull2 v1.1q, v0.2d, v6.2d + cmp x2, #16 + eor v0.16b, v3.16b, v2.16b + eor v0.16b, v0.16b, v1.16b + + b.hs .Lcrc32r_fold_by_one + +.Lcrc32r_fold_by_one_done: + + cmp x2, #0 + b.eq .Lcrc32r_final_fold + + /* Partial fold. */ + + add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16 + add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants + sub x8, x2, #16 + add x4, x4, x2 + add x5, x5, x2 + add x6, x6, x2 + add x8, x1, x8 + + /* Load last input and add padding zeros. */ + ld1 {v4.16b}, [x4] + eor x2, x2, x2 + ld1 {v3.16b}, [x5] + ld1 {v2.16b}, [x6] + tbl v30.16b, {v0.16b}, v4.16b + ld1 {v4.16b}, [x8] + tbl v1.16b, {v0.16b}, v3.16b + + pmull v0.1q, v30.1d, v6.1d + and v2.16b, v2.16b, v4.16b + pmull2 v31.1q, v30.2d, v6.2d + orr v2.16b, v2.16b, v1.16b + eor v0.16b, v0.16b, v31.16b + eor v0.16b, v0.16b, v2.16b + +.Lcrc32r_final_fold: + + /* Final fold. 
*/ + + eor v2.16b, v2.16b, v2.16b /* zero reg */ + ld1 {v7.16b}, [x9] + + /* reduce 128-bits to 96-bits */ + ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + mov v1.16b, v0.16b + pmull v0.1q, v0.1d, v6.1d + ext v6.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ + ext v1.16b, v1.16b, v2.16b, #8 /* high to low, high zeroed */ + eor v3.16b, v0.16b, v1.16b + + /* reduce 96-bits to 64-bits */ + eor v1.16b, v1.16b, v1.16b + ext v0.16b, v3.16b, v2.16b, #4 /* [00][00][x2][x1] */ + mov v1.s[0], v3.s[0] /* [00][00][00][x0] */ + eor v3.16b, v3.16b, v3.16b + pmull v1.1q, v1.1d, v7.1d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b /* top 64-bit are zero */ + + /* barrett reduction */ + mov v3.s[1], v0.s[0] /* [00][00][x1][00] */ + ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */ + pmull v1.1q, v3.1d, v5.1d /* [00][xx][xx][00] */ + pmull v1.1q, v1.1d, v6.1d /* [00][xx][xx][00] */ + eor v0.16b, v0.16b, v1.16b + + /* store CRC */ + st1 {v0.s}[2], [x0] + + ret +ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) + +/* + * void _gcry_crc32r_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32r_armv8_ce_reduction_4 +ELF(.type _gcry_crc32r_armv8_ce_reduction_4,%function;) +_gcry_crc32r_armv8_ce_reduction_4: + /* input: + * w0: data + * w1: crc + * x2: crc32 constants + */ + + eor v0.16b, v0.16b, v0.16b + add x2, x2, #consts_my_p(0) + eor v1.16b, v1.16b, v1.16b + ld1 {v5.16b}, [x2] + + mov v0.s[0], w0 + pmull v0.1q, v0.1d, v5.1d /* [00][00][xx][xx] */ + mov v1.s[1], w1 + mov v0.s[2], v0.s[0] /* [00][x0][x1][x0] */ + pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b + + mov w0, v0.s[1] + + ret +ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) + +/* + * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32_armv8_ce_bulk +ELF(.type _gcry_crc32_armv8_ce_bulk,%function;) +_gcry_crc32_armv8_ce_bulk: + /* input: + * x0: pcrc + * x1: inbuf + * x2: inlen + * x3: consts + */ + + GET_DATA_POINTER(x7, .Lcrc32_constants) + add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants + cmp x2, #128 + ld1 {v7.16b}, [x4] + + b.lo .Lcrc32_fold_by_one_setup + + eor v4.16b, v4.16b, v4.16b + add x4, x3, #consts_k(1 - 1) + ld1 {v4.s}[0], [x0] /* load pcrc */ + ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + ld1 {v6.16b}, [x4] + eor v0.16b, v0.16b, v4.16b + ext v4.16b, v6.16b, v6.16b, #8 + tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ + tbl v1.16b, { v1.16b }, v7.16b /* byte swap */ + tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ + tbl v3.16b, { v3.16b }, v7.16b /* byte swap */ + + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + +.Lcrc32_fold_by_four: + + /* Fold by 4. 
*/ + ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */ + sub x2, x2, #64 + tbl v16.16b, { v16.16b }, v7.16b /* byte swap */ + tbl v17.16b, { v17.16b }, v7.16b /* byte swap */ + tbl v18.16b, { v18.16b }, v7.16b /* byte swap */ + tbl v19.16b, { v19.16b }, v7.16b /* byte swap */ + cmp x2, #64 + pmull2 v20.1q, v0.2d, v4.2d + pmull2 v21.1q, v1.2d, v4.2d + pmull2 v22.1q, v2.2d, v4.2d + pmull2 v23.1q, v3.2d, v4.2d + pmull v24.1q, v0.1d, v4.1d + pmull v25.1q, v1.1d, v4.1d + pmull v26.1q, v2.1d, v4.1d + pmull v27.1q, v3.1d, v4.1d + eor v0.16b, v20.16b, v16.16b + eor v1.16b, v21.16b, v17.16b + eor v2.16b, v22.16b, v18.16b + eor v3.16b, v23.16b, v19.16b + eor v0.16b, v0.16b, v24.16b + eor v1.16b, v1.16b, v25.16b + eor v2.16b, v2.16b, v26.16b + eor v3.16b, v3.16b, v27.16b + b.hs .Lcrc32_fold_by_four + + ld1 {v6.16b}, [x4] + ld1 {v5.16b}, [x5] + ext v6.16b, v6.16b, v6.16b, #8 + ext v5.16b, v5.16b, v5.16b, #8 + + cmp x2, #16 + + /* Fold 4 to 1. */ + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v1.16b + eor v0.16b, v0.16b, v4.16b + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v2.16b + eor v0.16b, v0.16b, v4.16b + + pmull2 v16.1q, v0.2d, v6.2d + pmull v4.1q, v0.1d, v6.1d + eor v0.16b, v16.16b, v3.16b + eor v0.16b, v0.16b, v4.16b + + b.lo .Lcrc32_fold_by_one_done + b .Lcrc32_fold_by_one + +.Lcrc32_fold_by_one_setup: + + eor v1.16b, v1.16b, v1.16b + add x4, x3, #consts_k(3 - 1) + add x5, x3, #consts_my_p(0) + ld1 {v1.s}[0], [x0] /* load pcrc */ + sub x2, x2, #16 + ld1 {v0.16b}, [x1], #16 /* load 16 bytes of input */ + ld1 {v6.16b}, [x4] /* load k3k4 */ + ld1 {v5.16b}, [x5] /* load my_p */ + eor v0.16b, v0.16b, v1.16b + cmp x2, #16 + ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */ + tbl v0.16b, { v0.16b }, v7.16b /* byte swap */ + b.lo .Lcrc32_fold_by_one_done + +.Lcrc32_fold_by_one: + sub x2, x2, #16 + ld1 {v2.16b}, [x1], #16 /* load 16 bytes of input */ + pmull2 v3.1q, v0.2d, v6.2d + tbl v2.16b, { v2.16b }, v7.16b /* byte swap */ + pmull v1.1q, v0.1d, v6.1d + cmp x2, #16 + eor v0.16b, v3.16b, v2.16b + eor v0.16b, v0.16b, v1.16b + + b.hs .Lcrc32_fold_by_one + +.Lcrc32_fold_by_one_done: + + cmp x2, #0 + b.eq .Lcrc32_final_fold + + /* Partial fold. */ + + add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32 + add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16 + add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants + sub x8, x2, #16 + sub x4, x4, x2 + add x5, x5, x2 + add x6, x6, x2 + add x8, x1, x8 + + /* Load last input and add padding zeros. */ + ld1 {v4.16b}, [x4] + eor x2, x2, x2 + ld1 {v3.16b}, [x5] + ld1 {v2.16b}, [x6] + tbl v30.16b, {v0.16b}, v4.16b + ld1 {v4.16b}, [x8] + tbl v1.16b, {v0.16b}, v3.16b + and v2.16b, v2.16b, v4.16b + + pmull2 v0.1q, v30.2d, v6.2d + orr v2.16b, v2.16b, v1.16b + pmull v1.1q, v30.1d, v6.1d + tbl v2.16b, {v2.16b}, v7.16b /* byte swap */ + eor v0.16b, v0.16b, v1.16b + eor v0.16b, v0.16b, v2.16b + +.Lcrc32_final_fold: + + /* Final fold. 
*/ + + eor v2.16b, v2.16b, v2.16b /* zero reg */ + + /* reduce 128-bits to 96-bits */ + add x4, x3, #consts_k(4) + ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */ + eor v6.16b, v6.16b, v6.16b + mov v1.16b, v0.16b + pmull2 v0.1q, v0.2d, v3.2d + ld1 {v6.d}[1], [x4] /* load k4 */ + ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */ + eor v3.16b, v0.16b, v1.16b /* bottom 32-bit are zero */ + + /* reduce 96-bits to 64-bits */ + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + mov v0.s[1], v3.s[1] /* [00][00][x1][00] */ + mov v1.s[2], v3.s[3] /* [00][x3][00][00] */ + mov v0.s[2], v3.s[2] /* [00][x2][x1][00] */ + eor v3.16b, v3.16b, v3.16b + pmull2 v1.1q, v1.2d, v6.2d /* [00][xx][xx][00] */ + eor v0.16b, v0.16b, v1.16b /* top and bottom 32-bit are zero */ + + /* barrett reduction */ + mov v3.s[0], v0.s[1] /* [00][00][00][x1] */ + pmull2 v0.1q, v0.2d, v5.2d /* [00][xx][xx][xx] */ + ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */ + pmull v0.1q, v0.1d, v5.1d + eor v0.16b, v0.16b, v3.16b + + /* store CRC in input endian */ + rev32 v0.8b, v0.8b /* byte swap */ + st1 {v0.s}[0], [x0] + + ret +ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) + +/* + * void _gcry_crc32_armv8_ce_reduction_4 (u32 *pcrc, u32 data, u32 crc, + * const struct crc32_consts_s *consts); + */ +.align 3 +.globl _gcry_crc32_armv8_ce_reduction_4 +ELF(.type _gcry_crc32_armv8_ce_reduction_4,%function;) +_gcry_crc32_armv8_ce_reduction_4: + /* input: + * w0: data + * w1: crc + * x2: crc32 constants + */ + + eor v0.16b, v0.16b, v0.16b + add x2, x2, #consts_my_p(0) + eor v1.16b, v1.16b, v1.16b + ld1 {v5.16b}, [x2] + + mov v0.s[1], w0 + pmull v0.1q, v0.1d, v5.1d /* [00][xx][xx][00] */ + mov v1.s[0], w1 + pmull2 v0.1q, v0.2d, v5.2d /* [00][00][xx][xx] */ + eor v0.16b, v0.16b, v1.16b + + rev32 v0.8b, v0.8b /* Return in input endian */ + mov w0, v0.s[0] + + ret +ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) + +#endif diff --git a/cipher/crc-armv8-ce.c b/cipher/crc-armv8-ce.c new file mode 100644 index 000000000..8dd07cce6 --- /dev/null +++ b/cipher/crc-armv8-ce.c @@ -0,0 +1,229 @@ +/* crc-armv8-ce.c - ARMv8-CE PMULL accelerated CRC implementation + * Copyright (C) 2019 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA + * + */ + +#include +#include +#include +#include + +#include "g10lib.h" + +#include "bithelp.h" +#include "bufhelp.h" + + +#if defined(ENABLE_ARM_CRYPTO_SUPPORT) && defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) + + +#define ALIGNED_16 __attribute__ ((aligned (16))) + + +struct u16_unaligned_s +{ + u16 a; +} __attribute__((packed, aligned (1), may_alias)); + +struct u32_unaligned_s +{ + u32 a; +} __attribute__((packed, aligned (1), may_alias)); + + +/* Constants structure for generic reflected/non-reflected CRC32 PMULL + * functions. */ +struct crc32_consts_s +{ + /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */ + u64 k[6]; + /* my_p: { floor(x^64 / P(x)), P(x) } */ + u64 my_p[2]; +}; + +/* PMULL constants for CRC32 and CRC32RFC1510. */ +static const struct crc32_consts_s crc32_consts ALIGNED_16 = +{ + { /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */ + U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */ + U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */ + U64_C(0x163cd6124), 0 /* y = 2 */ + }, + { /* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */ + U64_C(0x1f7011641), U64_C(0x1db710641) + } +}; + +/* PMULL constants for CRC24RFC2440 (polynomial multiplied with x?). */ +static const struct crc32_consts_s crc24rfc2440_consts ALIGNED_16 = +{ + { /* k[6] = x^(32*y) mod P(x) << 32*/ + U64_C(0x08289a00) << 32, U64_C(0x74b44a00) << 32, /* y = { 17, 15 } */ + U64_C(0xc4b14d00) << 32, U64_C(0xfd7e0c00) << 32, /* y = { 5, 3 } */ + U64_C(0xd9fe8c00) << 32, 0 /* y = 2 */ + }, + { /* my_p[2] = { floor(x^64 / P(x)), P(x) } */ + U64_C(0x1f845fe24), U64_C(0x1864cfb00) + } +}; + + +u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc, + const struct crc32_consts_s *consts); +void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts); + +u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc, + const struct crc32_consts_s *consts); +void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts); + + +static inline void +crc32r_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u32 crc = *pcrc; + u32 data; + + while (inlen >= 4) + { + data = ((const struct u32_unaligned_s *)inbuf)->a; + data ^= crc; + + inlen -= 4; + inbuf += 4; + + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, 0, consts); + } + + switch (inlen) + { + case 0: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data <<= 24; + crc >>= 8; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + case 2: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data ^= crc; + data <<= 16; + crc >>= 16; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + case 3: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data |= inbuf[2] << 16; + data ^= crc; + data <<= 8; + crc >>= 24; + crc = _gcry_crc32r_armv8_ce_reduction_4 (data, crc, consts); + break; + } + + *pcrc = crc; +} + +static inline void +crc32_less_than_16 (u32 *pcrc, const byte *inbuf, size_t inlen, + const struct crc32_consts_s *consts) +{ + u32 crc = *pcrc; + u32 data; + + while (inlen >= 4) + { + data = ((const struct 
u32_unaligned_s *)inbuf)->a; + data ^= crc; + data = _gcry_bswap32(data); + + inlen -= 4; + inbuf += 4; + + crc = _gcry_crc32_armv8_ce_reduction_4 (data, 0, consts); + } + + switch (inlen) + { + case 0: + break; + case 1: + data = inbuf[0]; + data ^= crc; + data = data & 0xffU; + crc = _gcry_bswap32(crc >> 8); + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + case 2: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data ^= crc; + data = _gcry_bswap32(data << 16); + crc = _gcry_bswap32(crc >> 16); + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + case 3: + data = ((const struct u16_unaligned_s *)inbuf)->a; + data |= inbuf[2] << 16; + data ^= crc; + data = _gcry_bswap32(data << 8); + crc = crc & 0xff000000U; + crc = _gcry_crc32_armv8_ce_reduction_4 (data, crc, consts); + break; + } + + *pcrc = crc; +} + +void +_gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc32_consts; + + if (!inlen) + return; + + if (inlen >= 16) + _gcry_crc32r_armv8_ce_bulk (pcrc, inbuf, inlen, consts); + else + crc32r_less_than_16 (pcrc, inbuf, inlen, consts); +} + +void +_gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen) +{ + const struct crc32_consts_s *consts = &crc24rfc2440_consts; + + if (!inlen) + return; + + /* Note: *pcrc in input endian. */ + + if (inlen >= 16) + _gcry_crc32_armv8_ce_bulk (pcrc, inbuf, inlen, consts); + else + crc32_less_than_16 (pcrc, inbuf, inlen, consts); +} + +#endif /* USE_INTEL_PCLMUL */ diff --git a/cipher/crc.c b/cipher/crc.c index 4457ff62f..2abbab288 100644 --- a/cipher/crc.c +++ b/cipher/crc.c @@ -42,12 +42,24 @@ # endif #endif /* USE_INTEL_PCLMUL */ +/* USE_ARM_PMULL indicates whether to compile GCM with ARMv8 PMULL code. */ +#undef USE_ARM_PMULL +#if defined(ENABLE_ARM_CRYPTO_SUPPORT) +# if defined(__AARCH64EL__) && \ + defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \ + defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) +# define USE_ARM_PMULL 1 +# endif +#endif /* USE_ARM_PMULL */ typedef struct { u32 CRC; #ifdef USE_INTEL_PCLMUL unsigned int use_pclmul:1; /* Intel PCLMUL shall be used. */ +#endif +#ifdef USE_ARM_PMULL + unsigned int use_pmull:1; /* ARMv8 PMULL shall be used. 
*/ #endif byte buf[4]; } @@ -61,6 +73,13 @@ void _gcry_crc24rfc2440_intel_pclmul (u32 *pcrc, const byte *inbuf, size_t inlen); #endif +#ifdef USE_ARM_PMULL +/*-- crc-armv8-ce.c --*/ +void _gcry_crc32_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, size_t inlen); +void _gcry_crc24rfc2440_armv8_ce_pmull (u32 *pcrc, const byte *inbuf, + size_t inlen); +#endif + /* * Code generated by universal_crc by Danjel McGougan @@ -361,13 +380,17 @@ static void crc32_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif (void)flags; + (void)hwf; ctx->CRC = 0 ^ 0xffffffffL; } @@ -386,6 +409,13 @@ crc32_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_ARM_PMULL + if (ctx->use_pmull) + { + _gcry_crc32_armv8_ce_pmull(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; @@ -439,13 +469,17 @@ static void crc32rfc1510_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif (void)flags; + (void)hwf; ctx->CRC = 0; } @@ -769,12 +803,16 @@ static void crc24rfc2440_init (void *context, unsigned int flags) { CRC_CONTEXT *ctx = (CRC_CONTEXT *) context; -#ifdef USE_INTEL_PCLMUL u32 hwf = _gcry_get_hw_features (); +#ifdef USE_INTEL_PCLMUL ctx->use_pclmul = (hwf & HWF_INTEL_SSE4_1) && (hwf & HWF_INTEL_PCLMUL); #endif +#ifdef USE_ARM_PMULL + ctx->use_pmull = (hwf & HWF_ARM_NEON) && (hwf & HWF_ARM_PMULL); +#endif + (void)hwf; (void)flags; ctx->CRC = crc24_init(); @@ -794,6 +832,13 @@ crc24rfc2440_write (void *context, const void *inbuf_arg, size_t inlen) return; } #endif +#ifdef USE_ARM_PMULL + if (ctx->use_pmull) + { + _gcry_crc24rfc2440_armv8_ce_pmull(&ctx->CRC, inbuf, inlen); + return; + } +#endif if (!inbuf || !inlen) return; diff --git a/configure.ac b/configure.ac index 1aafc320a..aa23a5010 100644 --- a/configure.ac +++ b/configure.ac @@ -2409,6 +2409,11 @@ if test "$found" = "1" ; then # Build with the assembly implementation GCRYPT_DIGESTS="$GCRYPT_DIGESTS crc-intel-pclmul.lo" ;; + aarch64-*-*) + # Build with the assembly implementation + GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-ce.lo" + GCRYPT_CIPHERS="$GCRYPT_CIPHERS crc-armv8-aarch64-ce.lo" + ;; esac fi From jussi.kivilinna at iki.fi Fri Apr 26 18:33:41 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:41 +0300 Subject: [PATCH 3/4] Move data pointer macro for 64-bit ARM assembly to common header In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629642157.14985.4393579564393301810.stgit@localhost.localdomain> * cipher/asm-common-aarch64.h (GET_DATA_POINTER): New. * cipher/chacha20-aarch64.S (GET_DATA_POINTER): Remove. * cipher/cipher-gcm-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/crc-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/rijndael-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. * cipher/sha1-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. 
* cipher/sha256-armv8-aarch64-ce.S (GET_DATA_POINTER): Remove. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 502c35aeb..4ffc1b711 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,6 +29,16 @@ # define ELF(...) /*_*/ #endif +#ifdef _WIN32 +#define GET_DATA_POINTER(reg, name) \ + adrp reg, name ; \ + add reg, reg, #:lo12:name ; +#else +#define GET_DATA_POINTER(reg, name) \ + adrp reg, :got:name ; \ + ldr reg, [reg, #:got_lo12:name] ; +#endif + #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES /* CFI directives to emit DWARF stack unwinding information. */ # define CFI_STARTPROC() .cfi_startproc diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index adb9b1f29..07b4bb5c0 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -38,15 +38,6 @@ .text -#ifdef _WIN32 -#define GET_DATA_POINTER(reg, name) \ - adrp reg, name ; \ - add reg, reg, #:lo12:name ; -#else -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; -#endif /* register macros */ #define INPUT x0 diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 7c6be94ed..b0c2cccc6 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -27,10 +27,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - /* Constants */ diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index f269b74a3..060abdfe9 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -27,9 +27,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; /* Structure of crc32_consts_s */ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 71b45b856..3af29e0d0 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Register macros */ #define vk0 v17 diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 7dc26c0f1..223268cad 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ #define K1 0x5A827999 diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 706e0dfd9..f57cae290 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -28,11 +28,6 @@ .text -#define GET_DATA_POINTER(reg, name) \ - adrp reg, :got:name ; \ - ldr reg, [reg, #:got_lo12:name] ; - - /* Constants */ .align 4 From jussi.kivilinna at iki.fi Fri Apr 26 18:33:36 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:36 +0300 Subject: [PATCH 2/4] Add CFI unwind assembly directives for 64-bit ARM assembly In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629641640.14985.5996707259227955897.stgit@localhost.localdomain> * cipher/asm-common-aarch64.h (CFI_STARTPROC, CFI_ENDPROC) (CFI_REMEMBER_STATE, CFI_RESTORE_STATE, CFI_ADJUST_CFA_OFFSET) (CFI_REL_OFFSET, CFI_DEF_CFA_REGISTER, 
CFI_REGISTER, CFI_RESTORE) (DW_REGNO_SP, DW_SLEB128_7BIT, DW_SLEB128_28BIT, CFI_CFA_ON_STACK) (CFI_REG_ON_STACK): New. * cipher/camellia-aarch64.S: Add CFI directives. * cipher/chacha20-aarch64.S: Add CFI directives. * cipher/cipher-gcm-armv8-aarch64-ce.S: Add CFI directives. * cipher/crc-armv8-aarch64-ce.S: Add CFI directives. * cipher/rijndael-aarch64.S: Add CFI directives. * cipher/rijndael-armv8-aarch64-ce.S: Add CFI directives. * cipher/sha1-armv8-aarch64-ce.S: Add CFI directives. * cipher/sha256-armv8-aarch64-ce.S: Add CFI directives. * cipher/twofish-aarch64.S: Add CFI directives. * mpi/aarch64/mpih-add1.S: Add CFI directives. * mpi/aarch64/mpih-mul1.S: Add CFI directives. * mpi/aarch64/mpih-mul2.S: Add CFI directives. * mpi/aarch64/mpih-mul3.S: Add CFI directives. * mpi/aarch64/mpih-sub1.S: Add CFI directives. * mpi/asm-common-aarch64.h: Include "../cipher/asm-common-aarch64.h". (ELF): Remove. -- This commit adds CFI directives that add DWARF unwinding information for debugger to backtrace when executing code from 64-bit ARM assembly files. Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index 814b7ad16..502c35aeb 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -29,4 +29,62 @@ # define ELF(...) /*_*/ #endif +#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES +/* CFI directives to emit DWARF stack unwinding information. */ +# define CFI_STARTPROC() .cfi_startproc +# define CFI_ENDPROC() .cfi_endproc +# define CFI_REMEMBER_STATE() .cfi_remember_state +# define CFI_RESTORE_STATE() .cfi_restore_state +# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off +# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off +# define CFI_DEF_CFA_REGISTER(reg) .cfi_def_cfa_register reg +# define CFI_REGISTER(ro,rn) .cfi_register ro, rn +# define CFI_RESTORE(reg) .cfi_restore reg + +/* CFA expressions are used for pointing CFA and registers to + * SP relative offsets. */ +# define DW_REGNO_SP 31 + +/* Fixed length encoding used for integers for now. 
*/ +# define DW_SLEB128_7BIT(value) \ + 0x00|((value) & 0x7f) +# define DW_SLEB128_28BIT(value) \ + 0x80|((value)&0x7f), \ + 0x80|(((value)>>7)&0x7f), \ + 0x80|(((value)>>14)&0x7f), \ + 0x00|(((value)>>21)&0x7f) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) \ + .cfi_escape \ + 0x0f, /* DW_CFA_def_cfa_expression */ \ + DW_SLEB128_7BIT(11), /* length */ \ + 0x8f, /* DW_OP_breg31, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs), \ + 0x06, /* DW_OP_deref */ \ + 0x23, /* DW_OP_plus_constu */ \ + DW_SLEB128_28BIT((cfa_depth)+8) + +# define CFI_REG_ON_STACK(regno,rsp_offs) \ + .cfi_escape \ + 0x10, /* DW_CFA_expression */ \ + DW_SLEB128_7BIT(regno), \ + DW_SLEB128_7BIT(5), /* length */ \ + 0x8f, /* DW_OP_breg31, rsp + constant */ \ + DW_SLEB128_28BIT(rsp_offs) + +#else +# define CFI_STARTPROC() +# define CFI_ENDPROC() +# define CFI_REMEMBER_STATE() +# define CFI_RESTORE_STATE() +# define CFI_ADJUST_CFA_OFFSET(off) +# define CFI_REL_OFFSET(reg,off) +# define CFI_DEF_CFA_REGISTER(reg) +# define CFI_REGISTER(ro,rn) +# define CFI_RESTORE(reg) + +# define CFI_CFA_ON_STACK(rsp_offs,cfa_depth) +# define CFI_REG_ON_STACK(reg,rsp_offs) +#endif + #endif /* GCRY_ASM_COMMON_AARCH64_H */ diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index 5c6ab020a..f49808621 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -201,7 +201,12 @@ ELF(.type _gcry_camellia_arm_encrypt_block, at function;) _gcry_camellia_arm_encrypt_block: + CFI_STARTPROC() stp x19, x30, [sp, #-16]! + CFI_ADJUST_CFA_OFFSET(16) + CFI_REG_ON_STACK(19, 0) + CFI_REG_ON_STACK(30, 8) + /* input: * x0: keytable * x1: dst @@ -228,8 +233,13 @@ _gcry_camellia_arm_encrypt_block: outunpack(24); + CFI_REMEMBER_STATE() ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_RESTORE_STATE() .ltorg .Lenc_256: @@ -239,7 +249,11 @@ _gcry_camellia_arm_encrypt_block: outunpack(32); ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) @@ -247,7 +261,12 @@ ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) ELF(.type _gcry_camellia_arm_decrypt_block, at function;) _gcry_camellia_arm_decrypt_block: + CFI_STARTPROC() stp x19, x30, [sp, #-16]! 
+ CFI_ADJUST_CFA_OFFSET(16) + CFI_REG_ON_STACK(19, 0) + CFI_REG_ON_STACK(30, 8) + /* input: * x0: keytable * x1: dst @@ -275,8 +294,13 @@ _gcry_camellia_arm_decrypt_block: outunpack(0); + CFI_REMEMBER_STATE() ldp x19, x30, [sp], #16 + CFI_ADJUST_CFA_OFFSET(-16) + CFI_RESTORE(x19) + CFI_RESTORE(x30) ret; + CFI_RESTORE_STATE() .ltorg .Ldec_256: @@ -285,6 +309,7 @@ _gcry_camellia_arm_decrypt_block: dec_fls(24); b .Ldec_128; + CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_decrypt_block,.-_gcry_camellia_arm_decrypt_block;) diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 3844d4e10..adb9b1f29 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -163,6 +163,7 @@ _gcry_chacha20_aarch64_blocks4: * x2: src * x3: nblks (multiple of 4) */ + CFI_STARTPROC() GET_DATA_POINTER(CTR, .Linc_counter); add INPUT_CTR, INPUT, #(12*4); @@ -309,6 +310,7 @@ _gcry_chacha20_aarch64_blocks4: eor x0, x0, x0 ret + CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) #endif diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index b6c4f59d3..7c6be94ed 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -157,15 +157,23 @@ gcry_gcm_reduction_constant: #define VPUSH_ABI \ stp d8, d9, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ stp d10, d11, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); \ stp d12, d13, [sp, #-16]!; \ - stp d14, d15, [sp, #-16]!; + CFI_ADJUST_CFA_OFFSET(16); \ + stp d14, d15, [sp, #-16]!; \ + CFI_ADJUST_CFA_OFFSET(16); #define VPOP_ABI \ ldp d14, d15, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ ldp d12, d13, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); \ ldp d10, d11, [sp], #16; \ - ldp d8, d9, [sp], #16; + CFI_ADJUST_CFA_OFFSET(-16); \ + ldp d8, d9, [sp], #16; \ + CFI_ADJUST_CFA_OFFSET(-16); /* * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, @@ -183,6 +191,8 @@ _gcry_ghash_armv8_ce_pmull: * x3: nblocks * x4: gcm_table */ + CFI_STARTPROC(); + cbz x3, .Ldo_nothing; GET_DATA_POINTER(x5, .Lrconst) @@ -360,6 +370,7 @@ _gcry_ghash_armv8_ce_pmull: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC() ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) @@ -374,6 +385,7 @@ _gcry_ghash_setup_armv8_ce_pmull: * x0: gcm_key * x1: gcm_table */ + CFI_STARTPROC() GET_DATA_POINTER(x2, .Lrconst) @@ -408,6 +420,7 @@ _gcry_ghash_setup_armv8_ce_pmull: st1 {rh5.16b-rh6.16b}, [x1] ret + CFI_ENDPROC() ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;) #endif diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index 497d00551..f269b74a3 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -72,6 +72,7 @@ _gcry_crc32r_armv8_ce_bulk: * x2: inlen * x3: consts */ + CFI_STARTPROC() GET_DATA_POINTER(x7, .Lcrc32_constants) add x9, x3, #consts_k(5 - 1) @@ -230,6 +231,7 @@ _gcry_crc32r_armv8_ce_bulk: st1 {v0.s}[2], [x0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) /* @@ -245,6 +247,7 @@ _gcry_crc32r_armv8_ce_reduction_4: * w1: crc * x2: crc32 constants */ + CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) @@ -261,6 +264,7 @@ _gcry_crc32r_armv8_ce_reduction_4: mov w0, v0.s[1] ret + CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) /* @@ -277,6 +281,7 @@ _gcry_crc32_armv8_ce_bulk: * x2: inlen * x3: consts */ + CFI_STARTPROC() GET_DATA_POINTER(x7, 
.Lcrc32_constants) add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants @@ -456,6 +461,7 @@ _gcry_crc32_armv8_ce_bulk: st1 {v0.s}[0], [x0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) /* @@ -471,6 +477,7 @@ _gcry_crc32_armv8_ce_reduction_4: * w1: crc * x2: crc32 constants */ + CFI_STARTPROC() eor v0.16b, v0.16b, v0.16b add x2, x2, #consts_my_p(0) @@ -487,6 +494,7 @@ _gcry_crc32_armv8_ce_reduction_4: mov w0, v0.s[0] ret + CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) #endif diff --git a/cipher/rijndael-aarch64.S b/cipher/rijndael-aarch64.S index aad748753..e77dd4e0b 100644 --- a/cipher/rijndael-aarch64.S +++ b/cipher/rijndael-aarch64.S @@ -216,6 +216,7 @@ _gcry_aes_arm_encrypt_block: * %w3: number of rounds.. 10, 12 or 14 * %x4: encryption table */ + CFI_STARTPROC(); /* read input block */ @@ -285,6 +286,7 @@ _gcry_aes_arm_encrypt_block: lastencround(11, RNA, RNB, RNC, RND, RA, RB, RC, RD); b .Lenc_done; + CFI_ENDPROC(); ELF(.size _gcry_aes_arm_encrypt_block,.-_gcry_aes_arm_encrypt_block;) #define addroundkey_dec(round, ra, rb, rc, rd, rna, rnb, rnc, rnd) \ @@ -439,6 +441,7 @@ _gcry_aes_arm_decrypt_block: * %w3: number of rounds.. 10, 12 or 14 * %x4: decryption table */ + CFI_STARTPROC(); /* read input block */ @@ -504,6 +507,7 @@ _gcry_aes_arm_decrypt_block: decround(9, RA, RB, RC, RD, RNA, RNB, RNC, RND, preload_first_key); b .Ldec_tail; + CFI_ENDPROC(); ELF(.size _gcry_aes_arm_decrypt_block,.-_gcry_aes_arm_decrypt_block;) #endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/ diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index f0012c20a..71b45b856 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -247,6 +247,7 @@ _gcry_aes_enc_armv8_ce: * x2: src * w3: nrounds */ + CFI_STARTPROC(); aes_preload_keys(x0, w3); @@ -291,6 +292,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(vk13) CLEAR_REG(vk14) b .Lenc1_tail + CFI_ENDPROC(); ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;) @@ -309,6 +311,7 @@ _gcry_aes_dec_armv8_ce: * x2: src * w3: nrounds */ + CFI_STARTPROC(); aes_preload_keys(x0, w3); @@ -353,6 +356,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(vk13) CLEAR_REG(vk14) b .Ldec1_tail + CFI_ENDPROC(); ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) @@ -377,6 +381,7 @@ _gcry_aes_cbc_enc_armv8_ce: * w5: cbc_mac * w6: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcbc_enc_skip @@ -419,6 +424,7 @@ _gcry_aes_cbc_enc_armv8_ce: .Lcbc_enc_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) /* @@ -440,6 +446,7 @@ _gcry_aes_cbc_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcbc_dec_skip @@ -515,6 +522,7 @@ _gcry_aes_cbc_dec_armv8_ce: .Lcbc_dec_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;) @@ -537,6 +545,7 @@ _gcry_aes_ctr_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lctr_enc_skip @@ -668,7 +677,7 @@ _gcry_aes_ctr_enc_armv8_ce: .Lctr_enc_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) @@ -691,6 +700,7 @@ _gcry_aes_cfb_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lcfb_enc_skip @@ -732,6 +742,7 @@ _gcry_aes_cfb_enc_armv8_ce: .Lcfb_enc_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;) @@ -754,6 +765,7 @@ _gcry_aes_cfb_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); 
cbz x4, .Lcfb_dec_skip @@ -829,6 +841,7 @@ _gcry_aes_cfb_dec_armv8_ce: .Lcfb_dec_skip: ret + CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) @@ -859,6 +872,7 @@ _gcry_aes_ocb_enc_armv8_ce: * w7: nrounds * %st+0: blkn => w12 */ + CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ @@ -979,6 +993,7 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1009,6 +1024,7 @@ _gcry_aes_ocb_dec_armv8_ce: * w7: nrounds * %st+0: blkn => w12 */ + CFI_STARTPROC(); ldr w12, [sp] ld1 {v0.16b}, [x3] /* load offset */ @@ -1129,6 +1145,7 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1157,6 +1174,8 @@ _gcry_aes_ocb_auth_armv8_ce: * w6: nrounds => w7 * w7: blkn => w12 */ + CFI_STARTPROC(); + mov w12, w7 mov w7, w6 mov x6, x5 @@ -1273,6 +1292,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v16) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) @@ -1297,6 +1317,7 @@ _gcry_aes_xts_enc_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lxts_enc_skip @@ -1411,7 +1432,7 @@ _gcry_aes_xts_enc_armv8_ce: .Lxts_enc_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;) @@ -1436,6 +1457,7 @@ _gcry_aes_xts_dec_armv8_ce: * x4: nblocks * w5: nrounds */ + CFI_STARTPROC(); cbz x4, .Lxts_dec_skip @@ -1550,7 +1572,7 @@ _gcry_aes_xts_dec_armv8_ce: .Lxts_dec_skip: ret - + CFI_ENDPROC(); ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;) @@ -1564,6 +1586,7 @@ _gcry_aes_sbox4_armv8_ce: /* See "Gouv?a, C. P. L. & L?pez, J. Implementing GCM on ARMv8. Topics in * Cryptology ? CT-RSA 2015" for details. 
*/ + CFI_STARTPROC(); movi v0.16b, #0x52 movi v1.16b, #0 mov v0.S[0], w0 @@ -1572,6 +1595,7 @@ _gcry_aes_sbox4_armv8_ce: mov w0, v0.S[0] CLEAR_REG(v0) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) @@ -1582,11 +1606,13 @@ ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) .globl _gcry_aes_invmixcol_armv8_ce ELF(.type _gcry_aes_invmixcol_armv8_ce,%function;) _gcry_aes_invmixcol_armv8_ce: + CFI_STARTPROC(); ld1 {v0.16b}, [x1] aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) ret + CFI_ENDPROC(); ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;) #endif diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index aeb67a128..7dc26c0f1 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -110,6 +110,7 @@ _gcry_sha1_transform_armv8_ce: * x1: data (64*nblks bytes) * x2: nblks */ + CFI_STARTPROC(); cbz x2, .Ldo_nothing; @@ -199,6 +200,7 @@ _gcry_sha1_transform_armv8_ce: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;) #endif diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 6b3ad32d8..706e0dfd9 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -120,6 +120,7 @@ _gcry_sha256_transform_armv8_ce: * r1: data (64*nblks bytes) * r2: nblks */ + CFI_STARTPROC(); cbz x2, .Ldo_nothing; @@ -213,6 +214,7 @@ _gcry_sha256_transform_armv8_ce: .Ldo_nothing: mov x0, #0 ret + CFI_ENDPROC(); ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;) #endif diff --git a/cipher/twofish-aarch64.S b/cipher/twofish-aarch64.S index adee412d7..9f35b5cde 100644 --- a/cipher/twofish-aarch64.S +++ b/cipher/twofish-aarch64.S @@ -225,6 +225,7 @@ _gcry_twofish_arm_encrypt_block: * x1: dst * x2: src */ + CFI_STARTPROC(); add CTXw, CTX, #(w); @@ -262,6 +263,7 @@ _gcry_twofish_arm_encrypt_block: str_output_le(RDST, RC, RD, RA, RB, RT0, RT1); ret; + CFI_ENDPROC(); .ltorg ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;) @@ -274,6 +276,7 @@ _gcry_twofish_arm_decrypt_block: * %r1: dst * %r2: src */ + CFI_STARTPROC(); add CTXw, CTX, #(w); @@ -311,6 +314,7 @@ _gcry_twofish_arm_decrypt_block: str_output_le(RDST, RA, RB, RC, RD, RT0, RT1); ret; + CFI_ENDPROC(); ELF(.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;) #endif /*HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS*/ diff --git a/mpi/aarch64/mpih-add1.S b/mpi/aarch64/mpih-add1.S index 3370320e0..bc62cf987 100644 --- a/mpi/aarch64/mpih-add1.S +++ b/mpi/aarch64/mpih-add1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_add_n ELF(.type _gcry_mpih_add_n,%function) _gcry_mpih_add_n: + CFI_STARTPROC() and w5, w3, #3; adds xzr, xzr, xzr; /* clear carry flag */ @@ -69,4 +70,5 @@ _gcry_mpih_add_n: .Lend: adc x0, xzr, xzr; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_add_n,.-_gcry_mpih_add_n;) diff --git a/mpi/aarch64/mpih-mul1.S b/mpi/aarch64/mpih-mul1.S index 8830845a7..92fcd141b 100644 --- a/mpi/aarch64/mpih-mul1.S +++ b/mpi/aarch64/mpih-mul1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_mul_1 ELF(.type _gcry_mpih_mul_1,%function) _gcry_mpih_mul_1: + CFI_STARTPROC() and w5, w2, #3; mov x4, xzr; @@ -94,4 +95,5 @@ _gcry_mpih_mul_1: .Lend: mov x0, x4; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_mul_1,.-_gcry_mpih_mul_1;) diff --git a/mpi/aarch64/mpih-mul2.S b/mpi/aarch64/mpih-mul2.S index 5d736990e..aa0e5a2d5 100644 --- a/mpi/aarch64/mpih-mul2.S +++ b/mpi/aarch64/mpih-mul2.S @@ -37,6 
+37,7 @@ .globl _gcry_mpih_addmul_1 ELF(.type _gcry_mpih_addmul_1,%function) _gcry_mpih_addmul_1: + CFI_STARTPROC() and w5, w2, #3; mov x6, xzr; mov x7, xzr; @@ -106,4 +107,5 @@ _gcry_mpih_addmul_1: .Lend: mov x0, x6; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_addmul_1,.-_gcry_mpih_addmul_1;) diff --git a/mpi/aarch64/mpih-mul3.S b/mpi/aarch64/mpih-mul3.S index f785e5e42..5a40b354c 100644 --- a/mpi/aarch64/mpih-mul3.S +++ b/mpi/aarch64/mpih-mul3.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_submul_1 ELF(.type _gcry_mpih_submul_1,%function) _gcry_mpih_submul_1: + CFI_STARTPROC() and w5, w2, #3; mov x7, xzr; cbz w5, .Large_loop; @@ -119,4 +120,5 @@ _gcry_mpih_submul_1: .Loop_end: cinc x0, x7, cc; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_submul_1,.-_gcry_mpih_submul_1;) diff --git a/mpi/aarch64/mpih-sub1.S b/mpi/aarch64/mpih-sub1.S index 45a7b0417..4f279a123 100644 --- a/mpi/aarch64/mpih-sub1.S +++ b/mpi/aarch64/mpih-sub1.S @@ -37,6 +37,7 @@ .globl _gcry_mpih_sub_n ELF(.type _gcry_mpih_sub_n,%function) _gcry_mpih_sub_n: + CFI_STARTPROC() and w5, w3, #3; subs xzr, xzr, xzr; /* prepare carry flag for sub */ @@ -69,4 +70,5 @@ _gcry_mpih_sub_n: .Lend: cset x0, cc; ret; + CFI_ENDPROC() ELF(.size _gcry_mpih_sub_n,.-_gcry_mpih_sub_n;) diff --git a/mpi/asm-common-aarch64.h b/mpi/asm-common-aarch64.h index 126941307..cf4bdb852 100644 --- a/mpi/asm-common-aarch64.h +++ b/mpi/asm-common-aarch64.h @@ -21,10 +21,6 @@ #ifndef MPI_ASM_COMMON_AARCH64_H #define MPI_ASM_COMMON_AARCH64_H -#ifdef __ELF__ -# define ELF(...) __VA_ARGS__ -#else -# define ELF(...) /*_*/ -#endif +#include "../cipher/asm-common-aarch64.h" #endif /* MPI_ASM_COMMON_AARCH64_H */ From jussi.kivilinna at iki.fi Fri Apr 26 18:33:46 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 26 Apr 2019 19:33:46 +0300 Subject: [PATCH 4/4] Optimizations for GCM Intel/PCLMUL implementation In-Reply-To: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> References: <155629641122.14985.2140559336495629831.stgit@localhost.localdomain> Message-ID: <155629642673.14985.4669553340338086876.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): New. (glmul_pclmul): Include shifting to left into pclmul operations; Use 'reduction' helper function. (gfmul_pclmul_aggr4): Reorder instructions and adjust register usage to free up registers; Use 'reduction' helper function; Include shifting to left into pclmul operations. (gcm_lsh): New. (_gcry_ghash_setup_intel_pclmul): Left shift H values to left by one. (_gcry_ghash_intel_pclmul) [__x86_64__]: Preload H values to unused registers. -- Benchmark on Intel Haswell (amd64): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.206 ns/B 4624 MiB/s 0.825 c/B 3998 After (+12% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.184 ns/B 5195 MiB/s 0.734 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 60ae7aa9a..da309aead 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,6 +1,6 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation - * Copyright (C) 2013-2014 Jussi Kivilinna + * Copyright (C) 2013-2014,2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -47,6 +47,35 @@ "Intel? Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. 
*/ +static inline void reduction(void) +{ + /* input: */ + + asm volatile (/* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ + "pxor %%xmm3, %%xmm6\n\t" + "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pshufd $0xae, %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */ + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */ + "pxor %%xmm3, %%xmm6\n\t" + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */ + "pxor %%xmm3, %%xmm1\n\t" + "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "memory" ); +} + static inline void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. @@ -60,65 +89,22 @@ static inline void gfmul_pclmul(void) "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */ "movdqa %%xmm3, %%xmm5\n\t" "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + ::: "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); + reduction(); } @@ -136,117 +122,92 @@ static 
inline void gfmul_pclmul_aggr4(void) Input must be converted to little-endian. */ asm volatile (/* perform clmul and merge results... */ - "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm10, %%xmm5\n\t" "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ + "movdqa %%xmm10, %%xmm4\n\t" + "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" + "pshufd $78, %%xmm2, %%xmm5\n\t" "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 3:b0+b1 */ "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd $78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + "pclmulqdq $0, %%xmm13, %%xmm5\n\t" /* xmm5 holds 3:(a0+a1)*(b0+b1) */ "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm5, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ + "pshufd $78, %%xmm8, %%xmm5\n\t" + "pshufd $78, %%xmm3, %%xmm2\n\t" + "pxor %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a0+a1 */ + "pxor %%xmm3, %%xmm2\n\t" /* xmm2 holds 2:b0+b1 */ + "movdqa %%xmm8, %%xmm4\n\t" + "pclmulqdq $0, %%xmm3, %%xmm4\n\t" /* xmm4 holds 2:a0*b0 */ "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm5, %%xmm2\n\t" /* xmm2 holds 2:(a0+a1)*(b0+b1) */ - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ + "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm12, %%xmm2\n\t" /* xmm2 holds 2+3+4:(a0+a1)*(b0+b1) */ + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm6, %%xmm4\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ + "pxor %%xmm6, %%xmm4\n\t" /* xmm4 holds 1:b0+b1 */ "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm11, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor 
%%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm6, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm2, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ /* aggregated reduction... */ "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + :::"memory"); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); + reduction(); } #endif +static inline void gcm_lsh(void *h, unsigned int hoffs) +{ + static const u64 pconst[2] __attribute__ ((aligned (16))) = + { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; + + asm volatile ("movdqu (%[h]), %%xmm2\n\t" + "pshufd $0xff, %%xmm2, %%xmm3\n\t" + "movdqa %%xmm2, %%xmm4\n\t" + "psrad $31, %%xmm3\n\t" + "pslldq $8, %%xmm4\n\t" + "pand %[pconst], %%xmm3\n\t" + "paddq %%xmm2, %%xmm2\n\t" + "psrlq $63, %%xmm4\n\t" + "pxor %%xmm3, %%xmm2\n\t" + "pxor %%xmm4, %%xmm2\n\t" + "movdqu %%xmm2, (%[h])\n\t" + : + : [pconst] "m" (pconst), + [h] "r" ((byte *)h + hoffs) + : "memory" ); +} void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) @@ -274,13 +235,16 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) [be_mask] "m" (*be_mask) : "memory"); + gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ + #ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" + "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : - : + : [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); - gfmul_pclmul (); /* H?H => H? */ + gfmul_pclmul (); /* H<<<1?H => H? 
*/ asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" "movdqa %%xmm1, %%xmm8\n\t" @@ -288,22 +252,26 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H?H? => H? */ + gcm_lsh(c->u_mode.gcm.gcm_table, 0); /* H? <<< 1 */ + gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqa %%xmm8, %%xmm0\n\t" "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" + "movdqu 0*16(%[h_234]), %%xmm1\n\t" /* load H? <<< 1 */ : : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H??H? => H? */ + gfmul_pclmul (); /* H?<<<1?H? => H? */ asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" : : [h_234] "r" (c->u_mode.gcm.gcm_table) : "memory"); + gcm_lsh(c->u_mode.gcm.gcm_table, 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 32); /* H? <<< 1 */ + #ifdef __WIN64__ /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -329,7 +297,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); + ::: "memory" ); #endif #endif } @@ -372,32 +340,36 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); #ifdef __x86_64__ if (nblocks >= 4) { + asm volatile (/* Load H2, H3, H4, be_mask. */ + "movdqu 2*16(%[h_234]), %%xmm10\n\t" + "movdqu 1*16(%[h_234]), %%xmm9\n\t" + "movdqu 0*16(%[h_234]), %%xmm8\n\t" + "movdqa %[be_mask], %%xmm14\n\t" + : + : [h_234] "r" (c->u_mode.gcm.gcm_table), + [be_mask] "m" (*be_mask) + : "memory" ); + do { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" + asm volatile ("movdqu 0*16(%[buf]), %%xmm5\n\t" "movdqu 1*16(%[buf]), %%xmm2\n\t" "movdqu 2*16(%[buf]), %%xmm3\n\t" "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. */ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - + "pshufb %%xmm14, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm2\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm3\n\t" /* be => le */ "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ + "pshufb %%xmm14, %%xmm6\n\t" /* be => le */ : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); + : [buf] "r" (buf) + : "memory" ); gfmul_pclmul_aggr4 (); @@ -416,29 +388,32 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm13, %%xmm13\n\t" "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); + ::: "memory" ); #endif } #endif - while (nblocks--) + while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ "pxor %%xmm2, %%xmm1\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); gfmul_pclmul (); buf += blocksize; + nblocks--; } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); + : [be_mask] "m" (*be_mask) + : "memory" ); #ifdef __WIN64__ /* Clear/restore used registers. 
*/ @@ -471,7 +446,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm5, %%xmm5\n\t" "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); + ::: "memory" ); #endif return 0; From jussi.kivilinna at iki.fi Sat Apr 27 16:37:33 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 17:37:33 +0300 Subject: [PATCH v2] Optimizations for GCM Intel/PCLMUL implementation Message-ID: <155637585351.6430.2160747288549673210.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): New. (glmul_pclmul): Include shifting to left into pclmul operations; Use 'reduction' helper function. [__x86_64__] (gfmul_pclmul_aggr4): Reorder instructions and adjust register usage to free up registers; Use 'reduction' helper function; Include shifting to left into pclmul operations; Moving load H values and input from caller into this function. [__x86_64__] (gfmul_pclmul_aggr8): New. (gcm_lsh): New. (_gcry_ghash_setup_intel_pclmul): Left shift H values to left by one; Preserve XMM6-XMM15 registers on WIN64. (_gcry_ghash_intel_pclmul) [__x86_64__]: Use 8 block aggregated reduction function. -- Benchmark on Intel Haswell (amd64): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.206 ns/B 4624 MiB/s 0.825 c/B 3998 After (+50% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.137 ns/B 6953 MiB/s 0.548 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 60ae7aa9a..46af77eac 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -1,6 +1,6 @@ /* cipher-gcm-intel-pclmul.c - Intel PCLMUL accelerated Galois Counter Mode * implementation - * Copyright (C) 2013-2014 Jussi Kivilinna + * Copyright (C) 2013-2014,2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -47,6 +47,35 @@ "Intel? Carry-Less Multiplication Instruction and its Usage for Computing the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis. */ +static inline void reduction(void) +{ + /* input: */ + + asm volatile (/* first phase of the reduction */ + "movdqa %%xmm3, %%xmm6\n\t" + "movdqa %%xmm3, %%xmm7\n\t" + "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ + "pxor %%xmm3, %%xmm6\n\t" + "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ + "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pshufd $0xae, %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + complete */ + + /* second phase of the reduction */ + "pxor %%xmm3, %%xmm1\n\t" /* xor the shifted versions */ + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 1 */ + "pxor %%xmm3, %%xmm6\n\t" + "psrlq $1, %%xmm3\n\t" /* packed left shifting >> 2 */ + "pxor %%xmm3, %%xmm1\n\t" + "psrlq $5, %%xmm3\n\t" /* packed left shifting >> 7 */ + "pxor %%xmm3, %%xmm6\n\t" + "pxor %%xmm6, %%xmm1\n\t" /* the result is in xmm1 */ + ::: "memory" ); +} + static inline void gfmul_pclmul(void) { /* Input: XMM0 and XMM1, Output: XMM1. Input XMM0 stays unmodified. 
@@ -60,193 +89,304 @@ static inline void gfmul_pclmul(void) "movdqa %%xmm0, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds a0*b0 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $17, %%xmm1, %%xmm6\n\t" /* xmm6 holds a1*b1 */ + "pclmulqdq $17, %%xmm0, %%xmm1\n\t" /* xmm6 holds a1*b1 */ "movdqa %%xmm3, %%xmm5\n\t" "pclmulqdq $0, %%xmm2, %%xmm4\n\t" /* xmm4 holds (a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + ::: "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - ::: "cc" ); + reduction(); } - #ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(void) +static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) { /* Input: - H?: XMM0 X_i : XMM6 - H?: XMM8 X_(i-1) : XMM3 - H?: XMM9 X_(i-2) : XMM2 - H?: XMM10 X_(i-3)?Y_(i-4): XMM1 + H?: XMM0 + bemask: XMM15 + Hash: XMM1 Output: - Y_i: XMM1 - Inputs XMM0 stays unmodified. - Input must be converted to little-endian. + Hash: XMM1 + Inputs XMM0 and XMM14 stays unmodified. */ - asm volatile (/* perform clmul and merge results... 
*/ - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm1, %%xmm12\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm12\n\t" /* xmm12 holds 4:b0+b1 */ - - "pshufd $78, %%xmm9, %%xmm13\n\t" - "pshufd $78, %%xmm2, %%xmm14\n\t" - "pxor %%xmm9, %%xmm13\n\t" /* xmm13 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm14\n\t" /* xmm14 holds 3:b0+b1 */ - - "pshufd $78, %%xmm8, %%xmm5\n\t" - "pshufd $78, %%xmm3, %%xmm15\n\t" - "pxor %%xmm8, %%xmm5\n\t" /* xmm1 holds 2:a0+a1 */ - "pxor %%xmm3, %%xmm15\n\t" /* xmm2 holds 2:b0+b1 */ - - "movdqa %%xmm10, %%xmm4\n\t" - "movdqa %%xmm9, %%xmm7\n\t" - "pclmulqdq $0, %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:a0*b0 */ - "pclmulqdq $0, %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:a0*b0 */ + asm volatile (/* Load H2, H3, H4. */ + "movdqu 2*16(%[h_table]), %%xmm10\n\t" + "movdqu 1*16(%[h_table]), %%xmm9\n\t" + "movdqu 0*16(%[h_table]), %%xmm8\n\t" + + /* perform clmul and merge results... */ + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pxor %%xmm5, %%xmm1\n\t" + + "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ + "movdqa %%xmm10, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm9 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm12\n\t" /* xmm12 holds 4:(a0+a1)*(b0+b1) */ - "pclmulqdq $0, %%xmm13, %%xmm14\n\t" /* xmm14 holds 3:(a0+a1)*(b0+b1) */ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm0, %%xmm10\n\t" - "pshufd 
$78, %%xmm6, %%xmm11\n\t" - "pxor %%xmm0, %%xmm10\n\t" /* xmm10 holds 1:a0+a1 */ - "pxor %%xmm6, %%xmm11\n\t" /* xmm11 holds 1:b0+b1 */ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ - "pxor %%xmm4, %%xmm7\n\t" /* xmm7 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm14, %%xmm12\n\t" /* xmm12 holds 3+4:(a0+a1)*(b0+b1) */ + /* aggregated reduction... */ + "movdqa %%xmm3, %%xmm5\n\t" + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ + "movdqa %%xmm4, %%xmm5\n\t" + "psrldq $8, %%xmm4\n\t" + "pslldq $8, %%xmm5\n\t" + "pxor %%xmm5, %%xmm3\n\t" + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the + carry-less multiplication of xmm0 + by xmm1 */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - "movdqa %%xmm8, %%xmm13\n\t" - "pclmulqdq $0, %%xmm3, %%xmm13\n\t" /* xmm13 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm3\n\t" /* xmm3 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm15\n\t" /* xmm15 holds 2:(a0+a1)*(b0+b1) */ + reduction(); +} - "pxor %%xmm13, %%xmm7\n\t" /* xmm7 holds 2+3+4:a0*b0 */ - "pxor %%xmm3, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm15, %%xmm12\n\t" /* xmm12 holds 2+3+4:(a0+a1)*(b0+b1) */ +static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) +{ + /* Input: + H?: XMM0 + bemask: XMM15 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs XMM0 and XMM14 stays unmodified. + */ + asm volatile (/* Load H6, H7, H8. */ + "movdqu 6*16(%[h_table]), %%xmm10\n\t" + "movdqu 5*16(%[h_table]), %%xmm9\n\t" + "movdqu 4*16(%[h_table]), %%xmm8\n\t" + + /* perform clmul and merge results... 
*/ + "movdqu 0*16(%[buf]), %%xmm5\n\t" + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pxor %%xmm5, %%xmm1\n\t" + + "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm1, %%xmm4\n\t" + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ + "movdqa %%xmm10, %%xmm3\n\t" + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ + + /* Load H3, H4, H5. 
*/ + "movdqu 3*16(%[h_table]), %%xmm10\n\t" + "movdqu 2*16(%[h_table]), %%xmm9\n\t" + "movdqu 1*16(%[h_table]), %%xmm8\n\t" + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ - "movdqa %%xmm0, %%xmm3\n\t" - "pclmulqdq $0, %%xmm6, %%xmm3\n\t" /* xmm3 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm6\n\t" /* xmm6 holds 1:a1*b1 */ - "movdqa %%xmm11, %%xmm4\n\t" - "pclmulqdq $0, %%xmm10, %%xmm4\n\t" /* xmm4 holds 1:(a0+a1)*(b0+b1) */ + "pshufd $78, %%xmm10, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ + "movdqa %%xmm10, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ + "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ + + "movdqu 4*16(%[buf]), %%xmm5\n\t" + "movdqu 5*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm9, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ + "movdqa %%xmm9, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "movdqu 6*16(%[buf]), %%xmm5\n\t" + "movdqu 7*16(%[buf]), %%xmm2\n\t" + "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ + "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm5, %%xmm7\n\t" + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ + "movdqa %%xmm8, %%xmm6\n\t" + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ + + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ + + "pshufd $78, %%xmm0, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm7\n\t" + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 
3:b0+b1 */ + "movdqa %%xmm0, %%xmm6\n\t" + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - "pxor %%xmm7, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm1, %%xmm6\n\t" /* xmm6 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm12, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ /* aggregated reduction... */ "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm6, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ "movdqa %%xmm4, %%xmm5\n\t" "psrldq $8, %%xmm4\n\t" "pslldq $8, %%xmm5\n\t" "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm6\n\t" /* holds the result of the + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the carry-less multiplication of xmm0 by xmm1 */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - /* shift the result by one bit position to the left cope for - the fact that bits are reversed */ - "movdqa %%xmm3, %%xmm4\n\t" - "movdqa %%xmm6, %%xmm5\n\t" - "pslld $1, %%xmm3\n\t" - "pslld $1, %%xmm6\n\t" - "psrld $31, %%xmm4\n\t" - "psrld $31, %%xmm5\n\t" - "movdqa %%xmm4, %%xmm1\n\t" - "pslldq $4, %%xmm5\n\t" - "pslldq $4, %%xmm4\n\t" - "psrldq $12, %%xmm1\n\t" - "por %%xmm4, %%xmm3\n\t" - "por %%xmm5, %%xmm6\n\t" - "por %%xmm6, %%xmm1\n\t" - - /* first phase of the reduction */ - "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" - "pslld $31, %%xmm6\n\t" /* packed right shifting << 31 */ - "movdqa %%xmm3, %%xmm5\n\t" - "pslld $30, %%xmm7\n\t" /* packed right shifting shift << 30 */ - "pslld $25, %%xmm5\n\t" /* packed right shifting shift << 25 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm6\n\t" - "movdqa %%xmm6, %%xmm7\n\t" - "pslldq $12, %%xmm6\n\t" - "psrldq $4, %%xmm7\n\t" - "pxor %%xmm6, %%xmm3\n\t" /* first phase of the reduction - complete */ - - /* second phase of the reduction */ - "movdqa %%xmm3, %%xmm2\n\t" - "movdqa %%xmm3, %%xmm4\n\t" - "psrld $1, %%xmm2\n\t" /* packed left shifting >> 1 */ - "movdqa %%xmm3, %%xmm5\n\t" - "psrld $2, %%xmm4\n\t" /* packed left shifting >> 2 */ - "psrld $7, %%xmm5\n\t" /* packed left shifting >> 7 */ - "pxor %%xmm4, %%xmm2\n\t" /* xor the shifted versions */ - "pxor %%xmm5, %%xmm2\n\t" - "pxor %%xmm7, %%xmm2\n\t" - "pxor %%xmm2, %%xmm3\n\t" - "pxor %%xmm3, %%xmm1\n\t" /* the result is in xmm1 */ - :::"cc"); + reduction(); } #endif +static inline void gcm_lsh(void *h, unsigned int hoffs) +{ + static const u64 pconst[2] __attribute__ ((aligned (16))) = + { U64_C(0x0000000000000001), U64_C(0xc200000000000000) }; + + asm volatile ("movdqu (%[h]), %%xmm2\n\t" + "pshufd $0xff, %%xmm2, %%xmm3\n\t" + "movdqa %%xmm2, %%xmm4\n\t" + "psrad $31, %%xmm3\n\t" + "pslldq $8, %%xmm4\n\t" + "pand %[pconst], %%xmm3\n\t" + "paddq %%xmm2, %%xmm2\n\t" + "psrlq $63, %%xmm4\n\t" + "pxor %%xmm3, %%xmm2\n\t" + "pxor %%xmm4, %%xmm2\n\t" + "movdqu %%xmm2, (%[h])\n\t" + : + : [pconst] "m" (pconst), + [h] "r" ((byte *)h + hoffs) + : "memory" ); +} void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) @@ -254,15 +394,22 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) static const unsigned char be_mask[16] __attribute__ 
((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; #if defined(__x86_64__) && defined(__WIN64__) - char win64tmp[3 * 16]; + char win64tmp[10 * 16]; - /* XMM6-XMM8 need to be restored after use. */ - asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" - "movdqu %%xmm7, 1*16(%0)\n\t" - "movdqu %%xmm8, 2*16(%0)\n\t" + /* XMM6-XMM15 need to be restored after use. */ + asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" + "movdqu %%xmm7, 1*16(%0)\n\t" + "movdqu %%xmm8, 2*16(%0)\n\t" + "movdqu %%xmm9, 3*16(%0)\n\t" + "movdqu %%xmm10, 4*16(%0)\n\t" + "movdqu %%xmm11, 5*16(%0)\n\t" + "movdqu %%xmm12, 6*16(%0)\n\t" + "movdqu %%xmm13, 7*16(%0)\n\t" + "movdqu %%xmm14, 8*16(%0)\n\t" + "movdqu %%xmm15, 9*16(%0)\n\t" : : "r" (win64tmp) - : "memory"); + : "memory" ); #endif /* Swap endianness of hsub. */ @@ -274,36 +421,82 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) [be_mask] "m" (*be_mask) : "memory"); + gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ + #ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" + "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : - : + : [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); - gfmul_pclmul (); /* H?H => H? */ + gfmul_pclmul (); /* H<<<1?H => H? */ - asm volatile ("movdqu %%xmm1, 0*16(%[h_234])\n\t" + asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" "movdqa %%xmm1, %%xmm8\n\t" : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H?H? => H? */ + gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H? <<< 1 */ + gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqa %%xmm8, %%xmm0\n\t" - "movdqu %%xmm1, 1*16(%[h_234])\n\t" - "movdqa %%xmm8, %%xmm1\n\t" + "movdqu %%xmm1, 1*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); - gfmul_pclmul (); /* H??H? => H? */ + gfmul_pclmul (); /* H?<<<1?H? => H? */ - asm volatile ("movdqu %%xmm1, 2*16(%[h_234])\n\t" + asm volatile ("movdqu %%xmm1, 2*16(%[h_table])\n\t" + "movdqa %%xmm1, %%xmm0\n\t" + "movdqu (%[key]), %%xmm1\n\t" /* load H <<< 1 */ : - : [h_234] "r" (c->u_mode.gcm.gcm_table) + : [h_table] "r" (c->u_mode.gcm.gcm_table), + [key] "r" (c->u_mode.gcm.u_ghash_key.key) : "memory"); + gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */ + + gfmul_pclmul (); /* H<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" + "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 4*16(%[h_table])\n\t" + "movdqu 1*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 5*16(%[h_table])\n\t" + "movdqu 2*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gfmul_pclmul (); /* H?<<<1?H? => H? */ + + asm volatile ("movdqu %%xmm1, 6*16(%[h_table])\n\t" + : + : [h_table] "r" (c->u_mode.gcm.gcm_table) + : "memory"); + + gcm_lsh(c->u_mode.gcm.gcm_table, 3 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 4 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 5 * 16); /* H? <<< 1 */ + gcm_lsh(c->u_mode.gcm.gcm_table, 6 * 16); /* H? 
<<< 1 */ + #ifdef __WIN64__ /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -315,9 +508,16 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "movdqu 0*16(%0), %%xmm6\n\t" "movdqu 1*16(%0), %%xmm7\n\t" "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" : : "r" (win64tmp) - : "memory"); + : "memory" ); #else /* Clear used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" @@ -329,7 +529,14 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" "pxor %%xmm8, %%xmm8\n\t" - ::: "cc" ); + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "memory" ); #endif #endif } @@ -342,15 +549,15 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; -#ifdef __WIN64__ +#if defined(__x86_64__) && defined(__WIN64__) char win64tmp[10 * 16]; #endif if (nblocks == 0) return 0; -#ifdef __WIN64__ - /* XMM8-XMM15 need to be restored after use. */ +#if defined(__x86_64__) && defined(__WIN64__) + /* XMM6-XMM15 need to be restored after use. */ asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" "movdqu %%xmm7, 1*16(%0)\n\t" "movdqu %%xmm8, 2*16(%0)\n\t" @@ -367,44 +574,39 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, #endif /* Preload hash and H1. */ - asm volatile ("movdqu %[hash], %%xmm1\n\t" + asm volatile ("movdqa %[be_mask], %%xmm7\n\t" + "movdqu %[hash], %%xmm1\n\t" "movdqa %[hsub], %%xmm0\n\t" - "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "pshufb %%xmm7, %%xmm1\n\t" /* be => le */ : - : [hash] "m" (*result), [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key)); + : [hash] "m" (*result), + [be_mask] "m" (*be_mask), + [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); #ifdef __x86_64__ if (nblocks >= 4) { - do + asm volatile ("movdqa %%xmm7, %%xmm15\n\t" + : + : + : "memory" ); + + while (nblocks >= 8) { - asm volatile ("movdqa %[be_mask], %%xmm4\n\t" - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "movdqu 2*16(%[buf]), %%xmm3\n\t" - "movdqu 3*16(%[buf]), %%xmm6\n\t" - "pshufb %%xmm4, %%xmm5\n\t" /* be => le */ - - /* Load H2, H3, H4. */ - "movdqu 2*16(%[h_234]), %%xmm10\n\t" - "movdqu 1*16(%[h_234]), %%xmm9\n\t" - "movdqu 0*16(%[h_234]), %%xmm8\n\t" - - "pxor %%xmm5, %%xmm1\n\t" - "pshufb %%xmm4, %%xmm2\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm3\n\t" /* be => le */ - "pshufb %%xmm4, %%xmm6\n\t" /* be => le */ - : - : [buf] "r" (buf), [be_mask] "m" (*be_mask), - [h_234] "r" (c->u_mode.gcm.gcm_table)); - - gfmul_pclmul_aggr4 (); + gfmul_pclmul_aggr8 (buf, c->u_mode.gcm.gcm_table); + + buf += 8 * blocksize; + nblocks -= 8; + } + + if (nblocks >= 4) + { + gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.gcm_table); buf += 4 * blocksize; nblocks -= 4; } - while (nblocks >= 4); #ifndef __WIN64__ /* Clear used x86-64/XMM registers. 
*/ @@ -416,31 +618,34 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm13, %%xmm13\n\t" "pxor %%xmm14, %%xmm14\n\t" "pxor %%xmm15, %%xmm15\n\t" - ::: "cc" ); + ::: "memory" ); #endif } #endif - while (nblocks--) + while (nblocks) { asm volatile ("movdqu %[buf], %%xmm2\n\t" "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ "pxor %%xmm2, %%xmm1\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask)); + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); gfmul_pclmul (); buf += blocksize; + nblocks--; } /* Store hash. */ asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ "movdqu %%xmm1, %[hash]\n\t" : [hash] "=m" (*result) - : [be_mask] "m" (*be_mask)); + : [be_mask] "m" (*be_mask) + : "memory" ); -#ifdef __WIN64__ +#if defined(__x86_64__) && defined(__WIN64__) /* Clear/restore used registers. */ asm volatile( "pxor %%xmm0, %%xmm0\n\t" "pxor %%xmm1, %%xmm1\n\t" @@ -471,7 +676,7 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, "pxor %%xmm5, %%xmm5\n\t" "pxor %%xmm6, %%xmm6\n\t" "pxor %%xmm7, %%xmm7\n\t" - ::: "cc" ); + ::: "memory" ); #endif return 0; diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 970aa9860..fdec0a1bd 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -313,7 +313,10 @@ struct gcry_cipher_handle /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES - #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) + #if defined(__x86_64__) && defined(GCM_USE_INTEL_PCLMUL) + #define GCM_TABLES_USE_U64 1 + u64 gcm_table[7 * 16]; /* Extra table space for PCLMUL aggr8 */ + #elif (SIZEOF_UNSIGNED_LONG == 8) || defined(__x86_64__) #define GCM_TABLES_USE_U64 1 u64 gcm_table[2 * 16]; #else From jussi.kivilinna at iki.fi Sat Apr 27 22:03:04 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:04 +0300 Subject: [PATCH 2/4] Prefetch GCM look-up tables In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639538390.3345.9222163520460838914.stgit@localhost.localdomain> * cipher/cipher-gcm.c (prefetch_table, do_prefetch_tables) (prefetch_tables): New. (ghash_internal): Call prefetch_tables. 
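For readers skimming the change: the prefetch helpers added below walk the per-key
gcm_table and the shared gcmR table once before hashing, reading one byte out of every
32-byte chunk (plus the last byte), so the tables are pulled into cache before the
per-block loop runs. A minimal standalone sketch of that access pattern follows; the
name warm_table is illustrative only and not from the patch, and the patch's
prefetch_table additionally unrolls the loop by eight:

    #include <stddef.h>

    /* Touch one byte per 32-byte chunk; the volatile-qualified pointer
       keeps the compiler from dropping the otherwise unused reads. */
    static void warm_table (const void *tab, size_t len)
    {
      const volatile unsigned char *vtab = tab;
      size_t i;

      for (i = 0; i < len; i += 32)
        (void)vtab[i];

      if (len)
        (void)vtab[len - 1];
    }

In the patch itself, ghash_internal() calls prefetch_tables(c) a single time before its
block loop, so the table walk is paid once per GHASH invocation rather than once per block.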
-- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index c19f09f27..11f119aa7 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -118,6 +118,34 @@ static const u16 gcmR[256] = { 0xbbf0, 0xba32, 0xb874, 0xb9b6, 0xbcf8, 0xbd3a, 0xbf7c, 0xbebe, }; +static inline +void prefetch_table(const void *tab, size_t len) +{ + const volatile byte *vtab = tab; + size_t i; + + for (i = 0; i < len; i += 8 * 32) + { + (void)vtab[i + 0 * 32]; + (void)vtab[i + 1 * 32]; + (void)vtab[i + 2 * 32]; + (void)vtab[i + 3 * 32]; + (void)vtab[i + 4 * 32]; + (void)vtab[i + 5 * 32]; + (void)vtab[i + 6 * 32]; + (void)vtab[i + 7 * 32]; + } + + (void)vtab[len - 1]; +} + +static inline void +do_prefetch_tables (const void *gcmM, size_t gcmM_size) +{ + prefetch_table(gcmM, gcmM_size); + prefetch_table(gcmR, sizeof(gcmR)); +} + #ifdef GCM_TABLES_USE_U64 static void bshift (u64 * b0, u64 * b1) @@ -365,6 +393,8 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) #define fillM(c) \ do_fillM (c->u_mode.gcm.u_ghash_key.key, c->u_mode.gcm.gcm_table) #define GHASH(c, result, buf) do_ghash (result, buf, c->u_mode.gcm.gcm_table) +#define prefetch_tables(c) \ + do_prefetch_tables(c->u_mode.gcm.gcm_table, sizeof(c->u_mode.gcm.gcm_table)) #else @@ -430,6 +460,7 @@ do_ghash (unsigned char *hsub, unsigned char *result, const unsigned char *buf) #define fillM(c) do { } while (0) #define GHASH(c, result, buf) do_ghash (c->u_mode.gcm.u_ghash_key.key, result, buf) +#define prefetch_tables(c) do {} while (0) #endif /* !GCM_USE_TABLES */ @@ -441,6 +472,8 @@ ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; unsigned int burn = 0; + prefetch_tables (c); + while (nblocks) { burn = GHASH (c, result, buf); From jussi.kivilinna at iki.fi Sat Apr 27 22:02:58 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:02:58 +0300 Subject: [PATCH 1/4] Optimizations for generic table-based GCM implementations Message-ID: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> * cipher/cipher-gcm.c [GCM_TABLES_USE_U64] (do_fillM): Precalculate M[32..63] values. [GCM_TABLES_USE_U64] (do_ghash): Split processing of two 64-bit halfs of the input to two separate loops; Use precalculated M[] values. [GCM_USE_TABLES && !GCM_TABLES_USE_U64] (do_fillM): Precalculate M[64..127] values. [GCM_USE_TABLES && !GCM_TABLES_USE_U64] (do_ghash): Use precalculated M[] values. [GCM_USE_TABLES] (bshift): Avoid conditional execution for mask calculation. * cipher/cipher-internal.h (gcry_cipher_handle): Double gcm_table size. 
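One small detail that is easy to miss in the bshift() change noted above: the mask that
selects the 0xe1 reduction constant is now computed without conditional execution. The
diff below replaces

    mask = t[1] & 1 ? 0xe1 : 0;

with

    mask = -(t[1] & 1) & 0xe1;

and does the same for the u32 variant. A minimal illustration of why the two forms agree;
the helper name is made up for the example and is not part of the patch:

    #include <stdint.h>

    /* (t & 1) is 0 or 1.  Negating it in unsigned arithmetic gives
       either 0 or an all-ones word, so ANDing with 0xe1 yields the
       same 0 or 0xe1 as the conditional expression, but without a
       conditional. */
    static uint64_t reduction_mask (uint64_t t)
    {
      return -(t & 1) & 0xe1;
    }

As in the existing code, the caller then shifts the mask into the top byte (mask <<= 56)
before folding it into the shifted halves.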
-- Benchmark on Intel Haswell (amd64, --disable-hwf all): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 2.79 ns/B 341.3 MiB/s 11.17 c/B 3998 After (~36% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 2.05 ns/B 464.7 MiB/s 8.20 c/B 3998 Benchmark on Intel Haswell (win32, --disable-hwf all): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 4.90 ns/B 194.8 MiB/s 19.57 c/B 3997 After (~36% faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 3.58 ns/B 266.4 MiB/s 14.31 c/B 3999 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index cbda87be2..c19f09f27 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -1,6 +1,6 @@ /* cipher-gcm.c - Generic Galois Counter Mode implementation * Copyright (C) 2013 Dmitry Eremin-Solenikov - * Copyright (C) 2013, 2018 Jussi Kivilinna + * Copyright (C) 2013, 2018-2019 Jussi Kivilinna * * This file is part of Libgcrypt. * @@ -126,7 +126,7 @@ bshift (u64 * b0, u64 * b1) t[0] = *b0; t[1] = *b1; - mask = t[1] & 1 ? 0xe1 : 0; + mask = -(t[1] & 1) & 0xe1; mask <<= 56; *b1 = (t[1] >> 1) ^ (t[0] << 63); @@ -158,6 +158,12 @@ do_fillM (unsigned char *h, u64 *M) M[(i + j) + 0] = M[i + 0] ^ M[j + 0]; M[(i + j) + 16] = M[i + 16] ^ M[j + 16]; } + + for (i = 0; i < 16; i++) + { + M[i + 32] = (M[i + 0] >> 4) ^ ((u64) gcmR[(M[i + 16] & 0xf) << 4] << 48); + M[i + 48] = (M[i + 16] >> 4) ^ (M[i + 0] << 60); + } } static inline unsigned int @@ -175,20 +181,18 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM) V[1] = be_bswap64 (V[1]); /* First round can be manually tweaked based on fact that 'tmp' is zero. */ - i = 15; - - M = &gcmM[(V[1] & 0xf)]; + M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; - tmp[0] = (M[0] >> 4) ^ ((u64) gcmR[(M[16] & 0xf) << 4] << 48); - tmp[1] = (M[16] >> 4) ^ (M[0] << 60); + tmp[0] = M[0]; + tmp[1] = M[16]; tmp[0] ^= gcmM[(V[1] & 0xf) + 0]; tmp[1] ^= gcmM[(V[1] & 0xf) + 16]; V[1] >>= 4; - --i; + i = 6; while (1) { - M = &gcmM[(V[1] & 0xf)]; + M = &gcmM[(V[1] & 0xf) + 32]; V[1] >>= 4; A = tmp[1] & 0xff; @@ -196,15 +200,34 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u64 *gcmM) tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[1] & 0xf) + 0]; tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[1] & 0xf) + 16]; - tmp[0] ^= (M[0] >> 4) ^ ((u64) gcmR[(M[16] & 0xf) << 4] << 48); - tmp[1] ^= (M[16] >> 4) ^ (M[0] << 60); + tmp[0] ^= M[0]; + tmp[1] ^= M[16]; + + if (i == 0) + break; + + V[1] >>= 4; + --i; + } + + i = 7; + while (1) + { + M = &gcmM[(V[0] & 0xf) + 32]; + V[0] >>= 4; + + A = tmp[1] & 0xff; + T = tmp[0]; + tmp[0] = (T >> 8) ^ ((u64) gcmR[A] << 48) ^ gcmM[(V[0] & 0xf) + 0]; + tmp[1] = (T << 56) ^ (tmp[1] >> 8) ^ gcmM[(V[0] & 0xf) + 16]; + + tmp[0] ^= M[0]; + tmp[1] ^= M[16]; if (i == 0) break; - else if (i == 8) - V[1] = V[0]; - else - V[1] >>= 4; + + V[0] >>= 4; --i; } @@ -226,7 +249,7 @@ bshift (u32 * M, int i) t[1] = M[i * 4 + 1]; t[2] = M[i * 4 + 2]; t[3] = M[i * 4 + 3]; - mask = t[3] & 1 ? 
0xe1 : 0; + mask = -(t[3] & 1) & 0xe1; M[i * 4 + 3] = (t[3] >> 1) ^ (t[2] << 31); M[i * 4 + 2] = (t[2] >> 1) ^ (t[1] << 31); @@ -267,6 +290,15 @@ do_fillM (unsigned char *h, u32 *M) M[(i + j) * 4 + 2] = M[i * 4 + 2] ^ M[j * 4 + 2]; M[(i + j) * 4 + 3] = M[i * 4 + 3] ^ M[j * 4 + 3]; } + + for (i = 0; i < 4 * 16; i += 4) + { + M[i + 0 + 64] = (M[i + 0] >> 4) + ^ ((u64) gcmR[(M[i + 3] << 4) & 0xf0] << 16); + M[i + 1 + 64] = (M[i + 1] >> 4) ^ (M[i + 0] << 28); + M[i + 2 + 64] = (M[i + 2] >> 4) ^ (M[i + 1] << 28); + M[i + 3 + 64] = (M[i + 3] >> 4) ^ (M[i + 2] << 28); + } } static inline unsigned int @@ -285,19 +317,19 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) i = 15; v = V[i]; - M = &gcmM[(v & 0xf) * 4]; + M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; v = V[--i]; - tmp[0] = (M[0] >> 4) ^ ((u64) gcmR[(M[3] << 4) & 0xf0] << 16) ^ m[0]; - tmp[1] = (M[1] >> 4) ^ (M[0] << 28) ^ m[1]; - tmp[2] = (M[2] >> 4) ^ (M[1] << 28) ^ m[2]; - tmp[3] = (M[3] >> 4) ^ (M[2] << 28) ^ m[3]; + tmp[0] = M[0] ^ m[0]; + tmp[1] = M[1] ^ m[1]; + tmp[2] = M[2] ^ m[2]; + tmp[3] = M[3] ^ m[3]; while (1) { - M = &gcmM[(v & 0xf) * 4]; + M = &gcmM[(v & 0xf) * 4 + 64]; v = (v & 0xf0) >> 4; m = &gcmM[v * 4]; @@ -309,10 +341,10 @@ do_ghash (unsigned char *result, const unsigned char *buf, const u32 *gcmM) tmp[2] = (T[1] << 24) ^ (tmp[2] >> 8) ^ m[2]; tmp[3] = (T[2] << 24) ^ (tmp[3] >> 8) ^ m[3]; - tmp[0] ^= (M[0] >> 4) ^ ((u64) gcmR[(M[3] << 4) & 0xf0] << 16); - tmp[1] ^= (M[1] >> 4) ^ (M[0] << 28); - tmp[2] ^= (M[2] >> 4) ^ (M[1] << 28); - tmp[3] ^= (M[3] >> 4) ^ (M[2] << 28); + tmp[0] ^= M[0]; + tmp[1] ^= M[1]; + tmp[2] ^= M[2]; + tmp[3] ^= M[3]; if (i == 0) break; diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 970aa9860..47b7b6f9e 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -315,10 +315,10 @@ struct gcry_cipher_handle #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) #define GCM_TABLES_USE_U64 1 - u64 gcm_table[2 * 16]; + u64 gcm_table[4 * 16]; #else #undef GCM_TABLES_USE_U64 - u32 gcm_table[4 * 16]; + u32 gcm_table[8 * 16]; #endif #endif } gcm; From jussi.kivilinna at iki.fi Sat Apr 27 22:03:09 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:09 +0300 Subject: [PATCH 3/4] Enable four block aggregated GCM Intel PCLMUL implementation on i386 In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639538930.3345.2103897558871858741.stgit@localhost.localdomain> * cipher/cipher-gcm-intel-pclmul.c (reduction): Change "%%xmm7" to "%%xmm5". (gfmul_pclmul_aggr4): Move outside [__x86_64__] block; Remove usage of XMM8-XMM15 registers; Do not preload H-values and be_mask to reduce register usage for i386. (_gcry_ghash_setup_intel_pclmul): Enable calculation of H2, H3 and H4 on i386. (_gcry_ghash_intel_pclmul): Adjust to above gfmul_pclmul_aggr4 changes; Move 'aggr4' code path outside [__x86_64__] block. 
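For readers following the register reshuffling described above, it may help to restate
what the aggregated path computes. gfmul_pclmul_aggr4() relies on the usual GHASH
aggregation identity: with xor standing for GF(2) addition and * for the carry-less
GF(2^128) multiply, four input blocks X1..X4 are folded into the running hash Y as

    Y <- (Y xor X1)*H^4  xor  X2*H^3  xor  X3*H^2  xor  X4*H

which gives the same result as four serial steps Y <- (Y xor Xi)*H. Because the final
reduction is linear, the code accumulates the partial products of all four
multiplications and performs a single reduction at the end (the reduction() helper).
This patch only changes which XMM registers carry the intermediate values and where
H1..H4 and the be_mask are loaded from, not the formula itself.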
-- Benchmark on Intel Haswell (win32): Before: | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.446 ns/B 2140 MiB/s 1.78 c/B 3998 After (~2.38x faster): | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GMAC_AES | 0.187 ns/B 5107 MiB/s 0.747 c/B 3998 Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 46af77eac..8e109ba3c 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -53,15 +53,15 @@ static inline void reduction(void) asm volatile (/* first phase of the reduction */ "movdqa %%xmm3, %%xmm6\n\t" - "movdqa %%xmm3, %%xmm7\n\t" + "movdqa %%xmm3, %%xmm5\n\t" "psllq $1, %%xmm6\n\t" /* packed right shifting << 63 */ "pxor %%xmm3, %%xmm6\n\t" - "psllq $57, %%xmm7\n\t" /* packed right shifting << 57 */ + "psllq $57, %%xmm5\n\t" /* packed right shifting << 57 */ "psllq $62, %%xmm6\n\t" /* packed right shifting << 62 */ - "pxor %%xmm7, %%xmm6\n\t" /* xor the shifted versions */ - "pshufd $0x6a, %%xmm6, %%xmm7\n\t" + "pxor %%xmm5, %%xmm6\n\t" /* xor the shifted versions */ + "pshufd $0x6a, %%xmm6, %%xmm5\n\t" "pshufd $0xae, %%xmm6, %%xmm6\n\t" - "pxor %%xmm7, %%xmm3\n\t" /* first phase of the reduction + "pxor %%xmm5, %%xmm3\n\t" /* first phase of the reduction complete */ /* second phase of the reduction */ @@ -107,77 +107,83 @@ static inline void gfmul_pclmul(void) reduction(); } -#ifdef __x86_64__ -static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) +static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_1, + const void *h_table, + const unsigned char *be_mask) { /* Input: - H?: XMM0 - bemask: XMM15 Hash: XMM1 Output: Hash: XMM1 - Inputs XMM0 and XMM14 stays unmodified. */ - asm volatile (/* Load H2, H3, H4. */ - "movdqu 2*16(%[h_table]), %%xmm10\n\t" - "movdqu 1*16(%[h_table]), %%xmm9\n\t" - "movdqu 0*16(%[h_table]), %%xmm8\n\t" - - /* perform clmul and merge results... */ + asm volatile (/* perform clmul and merge results... 
*/ + "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ "pxor %%xmm5, %%xmm1\n\t" - "pshufd $78, %%xmm10, %%xmm5\n\t" + "pshufd $78, %%xmm2, %%xmm5\n\t" "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ - "movdqa %%xmm10, %%xmm3\n\t" + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ + "movdqa %%xmm2, %%xmm3\n\t" "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ + "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm9, %%xmm11\n\t" + "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ + "movdqu 1*16(%[buf]), %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + + "pshufd $78, %%xmm5, %%xmm0\n\t" "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm9, %%xmm6\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ + "movdqa %%xmm5, %%xmm6\n\t" "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ + + "movdqu 2*16(%[buf]), %%xmm5\n\t" + "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ + "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ - "pshufd $78, %%xmm8, %%xmm11\n\t" + "pshufd $78, %%xmm2, %%xmm0\n\t" "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 2:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ + "movdqa %%xmm2, %%xmm6\n\t" "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ + "movdqu 3*16(%[buf]), %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + : + : [buf] "r" (buf), + [h_table] "r" (h_table), + [be_mask] "m" (*be_mask) + : "memory" ); + + asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ - "pshufd $78, %%xmm0, %%xmm11\n\t" + "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ + + "pshufd $78, %%xmm5, %%xmm0\n\t" "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor 
%%xmm0, %%xmm11\n\t" /* xmm11 holds 1:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ - "movdqa %%xmm0, %%xmm6\n\t" + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ + "movdqa %%xmm5, %%xmm6\n\t" "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ @@ -195,13 +201,13 @@ static inline void gfmul_pclmul_aggr4(const void *buf, const void *h_table) carry-less multiplication of xmm0 by xmm1 */ : - : [buf] "r" (buf), - [h_table] "r" (h_table) + : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); reduction(); } +#ifdef __x86_64__ static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) { /* Input: @@ -210,7 +216,7 @@ static inline void gfmul_pclmul_aggr8(const void *buf, const void *h_table) Hash: XMM1 Output: Hash: XMM1 - Inputs XMM0 and XMM14 stays unmodified. + Inputs XMM0 and XMM15 stays unmodified. */ asm volatile (/* Load H6, H7, H8. */ "movdqu 6*16(%[h_table]), %%xmm10\n\t" @@ -423,7 +429,6 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.u_ghash_key.key, 0); /* H <<< 1 */ -#ifdef __x86_64__ asm volatile ("movdqa %%xmm0, %%xmm1\n\t" "movdqu (%[key]), %%xmm0\n\t" /* load H <<< 1 */ : @@ -433,7 +438,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gfmul_pclmul (); /* H<<<1?H => H? */ asm volatile ("movdqu %%xmm1, 0*16(%[h_table])\n\t" - "movdqa %%xmm1, %%xmm8\n\t" + "movdqa %%xmm1, %%xmm7\n\t" : : [h_table] "r" (c->u_mode.gcm.gcm_table) : "memory"); @@ -441,7 +446,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.gcm_table, 0 * 16); /* H? <<< 1 */ gfmul_pclmul (); /* H<<<1?H? => H? */ - asm volatile ("movdqa %%xmm8, %%xmm0\n\t" + asm volatile ("movdqa %%xmm7, %%xmm0\n\t" "movdqu %%xmm1, 1*16(%[h_table])\n\t" "movdqu 0*16(%[h_table]), %%xmm1\n\t" /* load H? <<< 1 */ : @@ -461,6 +466,7 @@ _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c) gcm_lsh(c->u_mode.gcm.gcm_table, 1 * 16); /* H? <<< 1 */ gcm_lsh(c->u_mode.gcm.gcm_table, 2 * 16); /* H? <<< 1 */ +#ifdef __x86_64__ gfmul_pclmul (); /* H<<<1?H? => H? */ asm volatile ("movdqu %%xmm1, 3*16(%[h_table])\n\t" @@ -573,23 +579,23 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, : "memory" ); #endif - /* Preload hash and H1. */ + /* Preload hash. */ asm volatile ("movdqa %[be_mask], %%xmm7\n\t" "movdqu %[hash], %%xmm1\n\t" - "movdqa %[hsub], %%xmm0\n\t" "pshufb %%xmm7, %%xmm1\n\t" /* be => le */ : : [hash] "m" (*result), - [be_mask] "m" (*be_mask), - [hsub] "m" (*c->u_mode.gcm.u_ghash_key.key) + [be_mask] "m" (*be_mask) : "memory" ); #ifdef __x86_64__ - if (nblocks >= 4) + if (nblocks >= 8) { + /* Preload H1. 
*/ asm volatile ("movdqa %%xmm7, %%xmm15\n\t" + "movdqa %[h_1], %%xmm0\n\t" : - : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); while (nblocks >= 8) @@ -599,15 +605,6 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, buf += 8 * blocksize; nblocks -= 8; } - - if (nblocks >= 4) - { - gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.gcm_table); - - buf += 4 * blocksize; - nblocks -= 4; - } - #ifndef __WIN64__ /* Clear used x86-64/XMM registers. */ asm volatile( "pxor %%xmm8, %%xmm8\n\t" @@ -623,19 +620,37 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, } #endif - while (nblocks) + while (nblocks >= 4) { - asm volatile ("movdqu %[buf], %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - "pxor %%xmm2, %%xmm1\n\t" + gfmul_pclmul_aggr4 (buf, c->u_mode.gcm.u_ghash_key.key, + c->u_mode.gcm.gcm_table, be_mask); + + buf += 4 * blocksize; + nblocks -= 4; + } + + if (nblocks) + { + /* Preload H1. */ + asm volatile ("movdqa %[h_1], %%xmm0\n\t" : - : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) : "memory" ); - gfmul_pclmul (); + while (nblocks) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf), [be_mask] "m" (*be_mask) + : "memory" ); + + gfmul_pclmul (); - buf += blocksize; - nblocks--; + buf += blocksize; + nblocks--; + } } /* Store hash. */ From jussi.kivilinna at iki.fi Sat Apr 27 22:03:15 2019 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 27 Apr 2019 23:03:15 +0300 Subject: [PATCH 4/4] Fix CFI_PUSH/CFI_POP redefine build warning with AMD64 MPI In-Reply-To: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> References: <155639537849.3345.2780323880296165039.stgit@localhost.localdomain> Message-ID: <155639539470.3345.14491782800257755337.stgit@localhost.localdomain> * mpi/amd64/func_abi.h: Move CFI macros into [__x86_64__] block. * mpi/i386/syntax.h: Move CFI macros into [__i386__] block. -- Signed-off-by: Jussi Kivilinna --- 0 files changed diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index 37d5722af..a60363e4e 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,5 +1,6 @@ #include +#ifdef __x86_64__ #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc @@ -21,6 +22,7 @@ # define CFI_PUSH(reg) # define CFI_POP(reg) #endif +#endif #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index 9101585a8..dd3003199 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -28,6 +28,7 @@ #include +#ifdef __i386__ #ifdef HAVE_GCC_ASM_CFI_DIRECTIVES # define CFI_STARTPROC() .cfi_startproc # define CFI_ENDPROC() .cfi_endproc @@ -49,6 +50,7 @@ # define CFI_PUSH(reg) # define CFI_POP(reg) #endif +#endif #undef ALIGN