From jussi.kivilinna at iki.fi Tue Nov 2 20:44:15 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 2 Nov 2021 21:44:15 +0200 Subject: [PATCH] Add intel-pclmul accelerated POLYVAL for GCM-SIV Message-ID: <20211102194415.993948-1-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-intel-pclmul.c (gfmul_pclmul_aggr4) (gfmul_pclmul_aggr8): Move assembly to new GFMUL_AGGRx_ASM* macros. (GFMUL_AGGR4_ASM_1, GFMUL_AGGR4_ASM_2, gfmul_pclmul_aggr4_le) (GFMUL_AGGR8_ASM, gfmul_pclmul_aggr8_le) (_gcry_polyval_intel_pclmul): New. * cipher/cipher-gcm-siv.c (do_polyval_buf): Use polyval function if available. * cipher/cipher-gcm.c (_gcry_polyval_intel_pclmul): New. (setupM): Setup 'c->u_mode.gcm.polyval_fn' with accelerated polyval function if available. * cipher/cipher-internal.h (gcry_cipher_handle): Add member 'u_mode.gcm.polyval_fn'. -- Benchmark on AMD Ryzen 7 5800X: Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV enc | 0.150 ns/B 6337 MiB/s 0.730 c/B 4849 GCM-SIV dec | 0.163 ns/B 5862 MiB/s 0.789 c/B 4850 GCM-SIV auth | 0.119 ns/B 8022 MiB/s 0.577 c/B 4850 After (enc/dec ~26% faster, auth ~43% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV enc | 0.117 ns/B 8138 MiB/s 0.568 c/B 4850 GCM-SIV dec | 0.128 ns/B 7429 MiB/s 0.623 c/B 4850 GCM-SIV auth | 0.083 ns/B 11507 MiB/s 0.402 c/B 4851 Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-intel-pclmul.c | 642 ++++++++++++++++++++----------- cipher/cipher-gcm-siv.c | 35 +- cipher/cipher-gcm.c | 7 + cipher/cipher-internal.h | 3 + 4 files changed, 459 insertions(+), 228 deletions(-) diff --git a/cipher/cipher-gcm-intel-pclmul.c b/cipher/cipher-gcm-intel-pclmul.c index 334c89cd..daf807d0 100644 --- a/cipher/cipher-gcm-intel-pclmul.c +++ b/cipher/cipher-gcm-intel-pclmul.c @@ -114,6 +114,91 @@ static ASM_FUNC_ATTR_INLINE void gfmul_pclmul(void) reduction(); } +#define GFMUL_AGGR4_ASM_1(be_to_le) \ + /* perform clmul and merge results... 
*/ \ + "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ \ + "movdqu 0*16(%[buf]), %%xmm5\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ + "pxor %%xmm5, %%xmm1\n\t" \ + \ + "pshufd $78, %%xmm2, %%xmm5\n\t" \ + "pshufd $78, %%xmm1, %%xmm4\n\t" \ + "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ \ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ \ + "movdqa %%xmm2, %%xmm3\n\t" \ + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ \ + "movdqu 1*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm5, %%xmm0\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm5, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 2*16(%[buf]), %%xmm5\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm5\n\t") /* be => le */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ \ + \ + "pshufd $78, %%xmm2, %%xmm0\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ \ + "movdqa %%xmm2, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ \ + "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 3*16(%[buf]), 
%%xmm2\n\t" \ + be_to_le("pshufb %[be_mask], %%xmm2\n\t") /* be => le */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ + +#define GFMUL_AGGR4_ASM_2() \ + "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ \ + \ + "pshufd $78, %%xmm5, %%xmm0\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ \ + "movdqa %%xmm5, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ \ + "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ \ + "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ \ + \ + /* aggregated reduction... */ \ + "movdqa %%xmm3, %%xmm5\n\t" \ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "movdqa %%xmm4, %%xmm5\n\t" \ + "psrldq $8, %%xmm4\n\t" \ + "pslldq $8, %%xmm5\n\t" \ + "pxor %%xmm5, %%xmm3\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ + carry-less multiplication of xmm0 \ + by xmm1 */ + +#define be_to_le(...) __VA_ARGS__ +#define le_to_le(...) /*_*/ + static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, const unsigned char *be_mask) @@ -123,90 +208,36 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, Output: Hash: XMM1 */ - asm volatile (/* perform clmul and merge results... 
*/ - "movdqu 2*16(%[h_table]), %%xmm2\n\t" /* Load H4 */ - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ - "pxor %%xmm5, %%xmm1\n\t" - - "pshufd $78, %%xmm2, %%xmm5\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm2, %%xmm5\n\t" /* xmm5 holds 4:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 4:b0+b1 */ - "movdqa %%xmm2, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm2, %%xmm1\n\t" /* xmm1 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 4:(a0+a1)*(b0+b1) */ - - "movdqu 1*16(%[h_table]), %%xmm5\n\t" /* Load H3 */ - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm5, %%xmm0\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm5, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "pshufb %[be_mask], %%xmm5\n\t" /* be => le */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4:(a0+a1)*(b0+b1) */ - - "movdqu 0*16(%[h_table]), %%xmm2\n\t" /* Load H2 */ - - "pshufd $78, %%xmm2, %%xmm0\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm2, %%xmm0\n\t" /* xmm0 holds 2:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 2:b0+b1 */ - "movdqa %%xmm2, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 2:a0*b0 */ - "pclmulqdq $17, %%xmm2, %%xmm5\n\t" /* xmm5 holds 2:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 2:(a0+a1)*(b0+b1) */ - - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %[be_mask], %%xmm2\n\t" /* be => le */ + asm volatile (GFMUL_AGGR4_ASM_1(be_to_le) : : [buf] "r" (buf), 
[h_table] "r" (h_table), [be_mask] "m" (*be_mask) : "memory" ); - asm volatile ("pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4:(a0+a1)*(b0+b1) */ - - "movdqu %[h_1], %%xmm5\n\t" /* Load H1 */ + asm volatile (GFMUL_AGGR4_ASM_2() + : + : [h_1] "m" (*(const unsigned char *)h_1) + : "memory" ); - "pshufd $78, %%xmm5, %%xmm0\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm5, %%xmm0\n\t" /* xmm0 holds 1:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 1:b0+b1 */ - "movdqa %%xmm5, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 1:a0*b0 */ - "pclmulqdq $17, %%xmm5, %%xmm2\n\t" /* xmm2 holds 1:a1*b1 */ - "pclmulqdq $0, %%xmm0, %%xmm7\n\t" /* xmm7 holds 1:(a0+a1)*(b0+b1) */ + reduction(); +} - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+4:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+4:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+4:(a0+a1)*(b0+b1) */ +static ASM_FUNC_ATTR_INLINE void +gfmul_pclmul_aggr4_le(const void *buf, const void *h_1, const void *h_table) +{ + /* Input: + Hash: XMM1 + Output: + Hash: XMM1 + */ + asm volatile (GFMUL_AGGR4_ASM_1(le_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); - /* aggregated reduction... */ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ + asm volatile (GFMUL_AGGR4_ASM_2() : : [h_1] "m" (*(const unsigned char *)h_1) : "memory" ); @@ -215,6 +246,154 @@ gfmul_pclmul_aggr4(const void *buf, const void *h_1, const void *h_table, } #ifdef __x86_64__ + +#define GFMUL_AGGR8_ASM(be_to_le) \ + /* Load H6, H7, H8. 
*/ \ + "movdqu 6*16(%[h_table]), %%xmm10\n\t" \ + "movdqu 5*16(%[h_table]), %%xmm9\n\t" \ + "movdqu 4*16(%[h_table]), %%xmm8\n\t" \ + \ + /* perform clmul and merge results... */ \ + "movdqu 0*16(%[buf]), %%xmm5\n\t" \ + "movdqu 1*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + "pxor %%xmm5, %%xmm1\n\t" \ + \ + "pshufd $78, %%xmm10, %%xmm5\n\t" \ + "pshufd $78, %%xmm1, %%xmm4\n\t" \ + "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ \ + "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ \ + "movdqa %%xmm10, %%xmm3\n\t" \ + "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ \ + "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ \ + "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm9, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ \ + "movdqa %%xmm9, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ \ + "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 2*16(%[buf]), %%xmm5\n\t" \ + "movdqu 3*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ \ + 
"pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ \ + \ + /* Load H3, H4, H5. */ \ + "movdqu 3*16(%[h_table]), %%xmm10\n\t" \ + "movdqu 2*16(%[h_table]), %%xmm9\n\t" \ + "movdqu 1*16(%[h_table]), %%xmm8\n\t" \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm10, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ \ + "movdqa %%xmm10, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ \ + "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 4*16(%[buf]), %%xmm5\n\t" \ + "movdqu 5*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm9, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ + "movdqa %%xmm9, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 
3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "movdqu 6*16(%[buf]), %%xmm5\n\t" \ + "movdqu 7*16(%[buf]), %%xmm2\n\t" \ + be_to_le("pshufb %%xmm15, %%xmm5\n\t") /* be => le */ \ + be_to_le("pshufb %%xmm15, %%xmm2\n\t") /* be => le */ \ + \ + "pshufd $78, %%xmm8, %%xmm11\n\t" \ + "pshufd $78, %%xmm5, %%xmm7\n\t" \ + "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ \ + "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ \ + "movdqa %%xmm8, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ \ + "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ \ + \ + "pshufd $78, %%xmm0, %%xmm11\n\t" \ + "pshufd $78, %%xmm2, %%xmm7\n\t" \ + "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ \ + "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ \ + "movdqa %%xmm0, %%xmm6\n\t" \ + "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ \ + "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ \ + "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ \ + \ + "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ \ + "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ \ + "pxor %%xmm7, 
%%xmm4\n\t"/* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */\ + \ + /* aggregated reduction... */ \ + "movdqa %%xmm3, %%xmm5\n\t" \ + "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ \ + "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ \ + "movdqa %%xmm4, %%xmm5\n\t" \ + "psrldq $8, %%xmm4\n\t" \ + "pslldq $8, %%xmm5\n\t" \ + "pxor %%xmm5, %%xmm3\n\t" \ + "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the \ + carry-less multiplication of xmm0 \ + by xmm1 */ + static ASM_FUNC_ATTR_INLINE void gfmul_pclmul_aggr8(const void *buf, const void *h_table) { @@ -226,151 +405,26 @@ gfmul_pclmul_aggr8(const void *buf, const void *h_table) Hash: XMM1 Inputs XMM0 and XMM15 stays unmodified. */ - asm volatile (/* Load H6, H7, H8. */ - "movdqu 6*16(%[h_table]), %%xmm10\n\t" - "movdqu 5*16(%[h_table]), %%xmm9\n\t" - "movdqu 4*16(%[h_table]), %%xmm8\n\t" - - /* perform clmul and merge results... */ - "movdqu 0*16(%[buf]), %%xmm5\n\t" - "movdqu 1*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - "pxor %%xmm5, %%xmm1\n\t" - - "pshufd $78, %%xmm10, %%xmm5\n\t" - "pshufd $78, %%xmm1, %%xmm4\n\t" - "pxor %%xmm10, %%xmm5\n\t" /* xmm5 holds 8:a0+a1 */ - "pxor %%xmm1, %%xmm4\n\t" /* xmm4 holds 8:b0+b1 */ - "movdqa %%xmm10, %%xmm3\n\t" - "pclmulqdq $0, %%xmm1, %%xmm3\n\t" /* xmm3 holds 8:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm1\n\t" /* xmm1 holds 8:a1*b1 */ - "pclmulqdq $0, %%xmm5, %%xmm4\n\t" /* xmm4 holds 8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm9, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 7:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 7:b0+b1 */ - "movdqa %%xmm9, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 7:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm2\n\t" /* xmm2 holds 7:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 7:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 7+8:a0*b0 */ - 
"pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 7+8:(a0+a1)*(b0+b1) */ - - "movdqu 2*16(%[buf]), %%xmm5\n\t" - "movdqu 3*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 6:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 6:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 6:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 6:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 6:(a0+a1)*(b0+b1) */ - - /* Load H3, H4, H5. */ - "movdqu 3*16(%[h_table]), %%xmm10\n\t" - "movdqu 2*16(%[h_table]), %%xmm9\n\t" - "movdqu 1*16(%[h_table]), %%xmm8\n\t" - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm10, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm10, %%xmm11\n\t" /* xmm11 holds 5:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 5:b0+b1 */ - "movdqa %%xmm10, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 5:a0*b0 */ - "pclmulqdq $17, %%xmm10, %%xmm2\n\t" /* xmm2 holds 5:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 5:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 5+6+7+8:(a0+a1)*(b0+b1) */ - - "movdqu 4*16(%[buf]), %%xmm5\n\t" - "movdqu 5*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm9, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm9, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ - "movdqa %%xmm9, 
%%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm9, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 4+5+6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, %%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "movdqu 0*16(%[h_table]), %%xmm8\n\t" /* Load H2 */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "movdqu 6*16(%[buf]), %%xmm5\n\t" - "movdqu 7*16(%[buf]), %%xmm2\n\t" - "pshufb %%xmm15, %%xmm5\n\t" /* be => le */ - "pshufb %%xmm15, %%xmm2\n\t" /* be => le */ - - "pshufd $78, %%xmm8, %%xmm11\n\t" - "pshufd $78, %%xmm5, %%xmm7\n\t" - "pxor %%xmm8, %%xmm11\n\t" /* xmm11 holds 4:a0+a1 */ - "pxor %%xmm5, %%xmm7\n\t" /* xmm7 holds 4:b0+b1 */ - "movdqa %%xmm8, %%xmm6\n\t" - "pclmulqdq $0, %%xmm5, %%xmm6\n\t" /* xmm6 holds 4:a0*b0 */ - "pclmulqdq $17, %%xmm8, %%xmm5\n\t" /* xmm5 holds 4:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 4:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 2+3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm5, %%xmm1\n\t" /* xmm1 holds 2+3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 2+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - "pshufd $78, %%xmm0, %%xmm11\n\t" - "pshufd $78, %%xmm2, %%xmm7\n\t" - "pxor %%xmm0, %%xmm11\n\t" /* xmm11 holds 3:a0+a1 */ - "pxor %%xmm2, 
%%xmm7\n\t" /* xmm7 holds 3:b0+b1 */ - "movdqa %%xmm0, %%xmm6\n\t" - "pclmulqdq $0, %%xmm2, %%xmm6\n\t" /* xmm6 holds 3:a0*b0 */ - "pclmulqdq $17, %%xmm0, %%xmm2\n\t" /* xmm2 holds 3:a1*b1 */ - "pclmulqdq $0, %%xmm11, %%xmm7\n\t" /* xmm7 holds 3:(a0+a1)*(b0+b1) */ - - "pxor %%xmm6, %%xmm3\n\t" /* xmm3 holds 1+2+3+3+4+5+6+7+8:a0*b0 */ - "pxor %%xmm2, %%xmm1\n\t" /* xmm1 holds 1+2+3+3+4+5+6+7+8:a1*b1 */ - "pxor %%xmm7, %%xmm4\n\t" /* xmm4 holds 1+2+3+3+4+5+6+7+8:(a0+a1)*(b0+b1) */ - - /* aggregated reduction... */ - "movdqa %%xmm3, %%xmm5\n\t" - "pxor %%xmm1, %%xmm5\n\t" /* xmm5 holds a0*b0+a1*b1 */ - "pxor %%xmm5, %%xmm4\n\t" /* xmm4 holds a0*b0+a1*b1+(a0+a1)*(b0+b1) */ - "movdqa %%xmm4, %%xmm5\n\t" - "psrldq $8, %%xmm4\n\t" - "pslldq $8, %%xmm5\n\t" - "pxor %%xmm5, %%xmm3\n\t" - "pxor %%xmm4, %%xmm1\n\t" /* holds the result of the - carry-less multiplication of xmm0 - by xmm1 */ + asm volatile (GFMUL_AGGR8_ASM(be_to_le) + : + : [buf] "r" (buf), + [h_table] "r" (h_table) + : "memory" ); + + reduction(); +} + +static ASM_FUNC_ATTR_INLINE void +gfmul_pclmul_aggr8_le(const void *buf, const void *h_table) +{ + /* Input: + H?: XMM0 + Hash: XMM1 + Output: + Hash: XMM1 + Inputs XMM0 and XMM15 stays unmodified. + */ + asm volatile (GFMUL_AGGR8_ASM(le_to_le) : : [buf] "r" (buf), [h_table] "r" (h_table) @@ -705,6 +759,154 @@ _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, return 0; } +unsigned int ASM_FUNC_ATTR +_gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + static const unsigned char be_mask[16] __attribute__ ((aligned (16))) = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + const unsigned int blocksize = GCRY_GCM_BLOCK_LEN; +#if defined(__x86_64__) && defined(__WIN64__) + char win64tmp[10 * 16]; +#endif + + if (nblocks == 0) + return 0; + +#if defined(__x86_64__) && defined(__WIN64__) + /* XMM6-XMM15 need to be restored after use. 
*/ + asm volatile ("movdqu %%xmm6, 0*16(%0)\n\t" + "movdqu %%xmm7, 1*16(%0)\n\t" + "movdqu %%xmm8, 2*16(%0)\n\t" + "movdqu %%xmm9, 3*16(%0)\n\t" + "movdqu %%xmm10, 4*16(%0)\n\t" + "movdqu %%xmm11, 5*16(%0)\n\t" + "movdqu %%xmm12, 6*16(%0)\n\t" + "movdqu %%xmm13, 7*16(%0)\n\t" + "movdqu %%xmm14, 8*16(%0)\n\t" + "movdqu %%xmm15, 9*16(%0)\n\t" + : + : "r" (win64tmp) + : "memory" ); +#endif + + /* Preload hash. */ + asm volatile ("pxor %%xmm7, %%xmm7\n\t" + "movdqu %[hash], %%xmm1\n\t" + "pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + : + : [hash] "m" (*result), + [be_mask] "m" (*be_mask) + : "memory" ); + +#ifdef __x86_64__ + if (nblocks >= 8) + { + /* Preload H1. */ + asm volatile ("pxor %%xmm15, %%xmm15\n\t" + "movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); + + while (nblocks >= 8) + { + gfmul_pclmul_aggr8_le (buf, c->u_mode.gcm.gcm_table); + + buf += 8 * blocksize; + nblocks -= 8; + } +#ifndef __WIN64__ + /* Clear used x86-64/XMM registers. */ + asm volatile( "pxor %%xmm8, %%xmm8\n\t" + "pxor %%xmm9, %%xmm9\n\t" + "pxor %%xmm10, %%xmm10\n\t" + "pxor %%xmm11, %%xmm11\n\t" + "pxor %%xmm12, %%xmm12\n\t" + "pxor %%xmm13, %%xmm13\n\t" + "pxor %%xmm14, %%xmm14\n\t" + "pxor %%xmm15, %%xmm15\n\t" + ::: "memory" ); +#endif + } +#endif + + while (nblocks >= 4) + { + gfmul_pclmul_aggr4_le (buf, c->u_mode.gcm.u_ghash_key.key, + c->u_mode.gcm.gcm_table); + + buf += 4 * blocksize; + nblocks -= 4; + } + + if (nblocks) + { + /* Preload H1. */ + asm volatile ("movdqa %[h_1], %%xmm0\n\t" + : + : [h_1] "m" (*c->u_mode.gcm.u_ghash_key.key) + : "memory" ); + + while (nblocks) + { + asm volatile ("movdqu %[buf], %%xmm2\n\t" + "pxor %%xmm2, %%xmm1\n\t" + : + : [buf] "m" (*buf) + : "memory" ); + + gfmul_pclmul (); + + buf += blocksize; + nblocks--; + } + } + + /* Store hash. 
*/ + asm volatile ("pshufb %[be_mask], %%xmm1\n\t" /* be => le */ + "movdqu %%xmm1, %[hash]\n\t" + : [hash] "=m" (*result) + : [be_mask] "m" (*be_mask) + : "memory" ); + +#if defined(__x86_64__) && defined(__WIN64__) + /* Clear/restore used registers. */ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "movdqu 0*16(%0), %%xmm6\n\t" + "movdqu 1*16(%0), %%xmm7\n\t" + "movdqu 2*16(%0), %%xmm8\n\t" + "movdqu 3*16(%0), %%xmm9\n\t" + "movdqu 4*16(%0), %%xmm10\n\t" + "movdqu 5*16(%0), %%xmm11\n\t" + "movdqu 6*16(%0), %%xmm12\n\t" + "movdqu 7*16(%0), %%xmm13\n\t" + "movdqu 8*16(%0), %%xmm14\n\t" + "movdqu 9*16(%0), %%xmm15\n\t" + : + : "r" (win64tmp) + : "memory" ); +#else + /* Clear used registers. */ + asm volatile( "pxor %%xmm0, %%xmm0\n\t" + "pxor %%xmm1, %%xmm1\n\t" + "pxor %%xmm2, %%xmm2\n\t" + "pxor %%xmm3, %%xmm3\n\t" + "pxor %%xmm4, %%xmm4\n\t" + "pxor %%xmm5, %%xmm5\n\t" + "pxor %%xmm6, %%xmm6\n\t" + "pxor %%xmm7, %%xmm7\n\t" + ::: "memory" ); +#endif + + return 0; +} + #if __clang__ # pragma clang attribute pop #endif diff --git a/cipher/cipher-gcm-siv.c b/cipher/cipher-gcm-siv.c index 813cf579..9ebc0036 100644 --- a/cipher/cipher-gcm-siv.c +++ b/cipher/cipher-gcm-siv.c @@ -96,6 +96,7 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, unsigned int blocksize = GCRY_SIV_BLOCK_LEN; unsigned int unused = c->u_mode.gcm.mac_unused; ghash_fn_t ghash_fn = c->u_mode.gcm.ghash_fn; + ghash_fn_t polyval_fn = c->u_mode.gcm.polyval_fn; byte tmp_blocks[16][GCRY_SIV_BLOCK_LEN]; size_t nblocks, n; unsigned int burn = 0, nburn; @@ -137,9 +138,17 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, gcry_assert (unused == blocksize); /* Process one block from macbuf. 
*/ - cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf, - blocksize); - nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); + if (polyval_fn) + { + nburn = polyval_fn (c, hash, c->u_mode.gcm.macbuf, 1); + } + else + { + cipher_block_bswap (c->u_mode.gcm.macbuf, c->u_mode.gcm.macbuf, + blocksize); + nburn = ghash_fn (c, hash, c->u_mode.gcm.macbuf, 1); + } + burn = nburn > burn ? nburn : burn; unused = 0; } @@ -148,12 +157,22 @@ do_polyval_buf(gcry_cipher_hd_t c, byte *hash, const byte *buf, while (nblocks) { - for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++) - cipher_block_bswap (tmp_blocks[n], buf + n * blocksize, blocksize); - - num_blks_used = n > num_blks_used ? n : num_blks_used; + if (polyval_fn) + { + n = nblocks; + nburn = polyval_fn (c, hash, buf, n); + } + else + { + for (n = 0; n < (nblocks > 16 ? 16 : nblocks); n++) + cipher_block_bswap (tmp_blocks[n], buf + n * blocksize, + blocksize); + + num_blks_used = n > num_blks_used ? n : num_blks_used; + + nburn = ghash_fn (c, hash, tmp_blocks[0], n); + } - nburn = ghash_fn (c, hash, tmp_blocks[0], n); burn = nburn > burn ? 
nburn : burn; buf += n * blocksize; buflen -= n * blocksize; diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 6b14cce7..d3ed9cf6 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -43,6 +43,11 @@ extern void _gcry_ghash_setup_intel_pclmul (gcry_cipher_hd_t c); extern unsigned int _gcry_ghash_intel_pclmul (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); + +extern unsigned int _gcry_polyval_intel_pclmul (gcry_cipher_hd_t c, + byte *result, + const byte *buf, + size_t nblocks); #endif #ifdef GCM_USE_ARM_PMULL @@ -570,6 +575,7 @@ setupM (gcry_cipher_hd_t c) #endif c->u_mode.gcm.ghash_fn = NULL; + c->u_mode.gcm.polyval_fn = NULL; if (0) { } @@ -577,6 +583,7 @@ setupM (gcry_cipher_hd_t c) else if (features & HWF_INTEL_PCLMUL) { c->u_mode.gcm.ghash_fn = _gcry_ghash_intel_pclmul; + c->u_mode.gcm.polyval_fn = _gcry_polyval_intel_pclmul; _gcry_ghash_setup_intel_pclmul (c); } #endif diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index 0bc85b1a..edb29628 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -350,6 +350,9 @@ struct gcry_cipher_handle /* GHASH implementation in use. */ ghash_fn_t ghash_fn; + /* POLYVAL implementation in use (GCM-SIV). */ + ghash_fn_t polyval_fn; + /* Key length used for GCM-SIV key generating key. */ unsigned int siv_keylen; } gcm; -- 2.32.0 From guidovranken at gmail.com Tue Nov 30 14:21:43 2021 From: guidovranken at gmail.com (Guido Vranken) Date: Tue, 30 Nov 2021 14:21:43 +0100 Subject: gcry_mpi_sub_ui result is positive when it should be negative Message-ID: In the program below, the result of the computation -5 - 2 should be -7, but it is 7. Compare with the behavior of gcry_mpi_add_ui and gcry_mpi_mul_ui (commented), which do produce a negative result, as they should. This was tested on the latest repository checkout. 
#include <gcrypt.h> #define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; } int main(void) { gcry_mpi_t A; gcry_mpi_t res; gcry_error_t err; char *buf; CF_CHECK_EQ(err = gcry_mpi_scan(&A, GCRYMPI_FMT_HEX, "-5", 0, NULL), 0); CF_CHECK_EQ(err = gcry_mpi_scan(&res, GCRYMPI_FMT_HEX, "0", 0, NULL), 0); gcry_mpi_sub_ui(res, A, 2); //gcry_mpi_add_ui(res, A, 2); //gcry_mpi_mul_ui(res, A, 2); CF_CHECK_EQ(err = gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char**)&buf, NULL, res), 0); printf("%s\n", buf); end: return 0; } -------------- next part -------------- An HTML attachment was scrubbed... URL: From jussi.kivilinna at iki.fi Tue Nov 30 21:23:45 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Nov 2021 22:23:45 +0200 Subject: [PATCH] gcry_mpi_sub_ui: fix subtracting from negative value Message-ID: <20211130202345.877377-1-jussi.kivilinna@iki.fi> * mpi/mpi-add.c (_gcry_mpi_sub_ui): Set output sign bit when 'u' is negative. * tests/mpitests.c (test_add): Additional tests for mpi_add_ui; Check test output and fail if output does not match expected. (test_sub): Additional tests for mpi_sub_ui; Check test output and fail if output does not match expected. (test_mul): Additional tests for mpi_mul_ui; Check test output and fail if output does not match expected. -- Signed-off-by: Jussi Kivilinna --- mpi/mpi-add.c | 1 + tests/mpitests.c | 119 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 113 insertions(+), 7 deletions(-) diff --git a/mpi/mpi-add.c b/mpi/mpi-add.c index 53f476e0..38dd352f 100644 --- a/mpi/mpi-add.c +++ b/mpi/mpi-add.c @@ -191,6 +191,7 @@ _gcry_mpi_sub_ui(gcry_mpi_t w, gcry_mpi_t u, unsigned long v ) cy = _gcry_mpih_add_1(wp, up, usize, v); wp[usize] = cy; wsize = usize + cy; + wsign = 1; } else { /* The signs are different. Need exact comparison to determine * which operand to subtract from which. 
*/ diff --git a/tests/mpitests.c b/tests/mpitests.c index 96e01551..48ea18b2 100644 --- a/tests/mpitests.c +++ b/tests/mpitests.c @@ -378,7 +378,8 @@ test_add (void) gcry_mpi_t two; gcry_mpi_t ff; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&one, GCRYMPI_FMT_USG, ones, sizeof(ones), NULL); gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); @@ -386,21 +387,47 @@ test_add (void) result = gcry_mpi_new(0); gcry_mpi_add(result, one, two); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of one plus two:\n%s\n", pc); + if (strcmp (pc, "030303030303030303030303030303030303030303030303" + "030303030303030303030303030303030303030303030303") != 0) + fail ("mpi_add failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_add(result, ff, one); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of ff plus one:\n%s\n", pc); + if (strcmp (pc, "010101010101010101010101010101010101010101010101" + "01010101010101010101010101010101010101010101010100") != 0) + fail ("mpi_add failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_scan(&minusfive, GCRYMPI_FMT_HEX, "-5", 0, NULL); + gcry_mpi_add_ui (result, minusfive, 2); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five plus two:\n%s\n", pc); + if (strcmp (pc, "-03") != 0) + fail ("mpi_add_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_add_ui (result, result, 3); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus three plus three:\n%s\n", pc); + if (strcmp (pc, "00") != 0) + fail ("mpi_add_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(one); gcry_mpi_release(two); 
gcry_mpi_release(ff); gcry_mpi_release(result); + gcry_mpi_release(minusfive); return 1; } @@ -408,24 +435,76 @@ test_add (void) static int test_sub (void) { + gcry_mpi_t zero; gcry_mpi_t one; gcry_mpi_t two; + gcry_mpi_t five; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&one, GCRYMPI_FMT_USG, ones, sizeof(ones), NULL); gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); result = gcry_mpi_new(0); gcry_mpi_sub(result, two, one); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of two minus one:\n%s\n", pc); + if (strcmp (pc, "010101010101010101010101010101010101010101010101" + "010101010101010101010101010101010101010101010101") != 0) + fail ("mpi_sub failed at line %d", __LINE__); + gcry_free(pc); + + zero = gcry_mpi_new(0); + five = gcry_mpi_new(0); + minusfive = gcry_mpi_new(0); + gcry_mpi_set_ui (zero, 0); + gcry_mpi_set_ui (one, 1); + gcry_mpi_set_ui (two, 2); + gcry_mpi_set_ui (five, 5); + gcry_mpi_sub (minusfive, zero, five); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, minusfive); + if (debug) + gcry_log_debug ("Result of zero minus five:\n%s\n", pc); + if (strcmp (pc, "-05") != 0) + fail ("mpi_sub failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, five, 2); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of five minus two:\n%s\n", pc); + if (strcmp (pc, "03") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, one, 10); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of one minus ten:\n%s\n", pc); + if (strcmp (pc, "-09") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_sub_ui (result, minusfive, 2); + + 
gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five minus two:\n%s\n", pc); + if (strcmp (pc, "-07") != 0) + fail ("mpi_sub_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(one); gcry_mpi_release(two); gcry_mpi_release(result); + gcry_mpi_release(zero); + gcry_mpi_release(five); + gcry_mpi_release(minusfive); return 1; } @@ -436,21 +515,47 @@ test_mul (void) gcry_mpi_t two; gcry_mpi_t three; gcry_mpi_t result; - unsigned char* pc; + gcry_mpi_t minusfive; + char *pc; gcry_mpi_scan(&two, GCRYMPI_FMT_USG, twos, sizeof(twos), NULL); gcry_mpi_scan(&three, GCRYMPI_FMT_USG, threes, sizeof(threes), NULL); result = gcry_mpi_new(0); gcry_mpi_mul(result, two, three); - gcry_mpi_aprint(GCRYMPI_FMT_HEX, &pc, NULL, result); + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); if (debug) gcry_log_debug ("Result of two mul three:\n%s\n", pc); + if (strcmp (pc, "060C12181E242A30363C42484E545A60666C72787E848A90" + "969CA2A8AEB4BAC0C6CCD2D8DEE4EAF0F6FD03090F151B21" + "1B150F0902FCF6F0EAE4DED8D2CCC6C0BAB4AEA8A29C9690" + "8A847E78726C66605A544E48423C36302A241E18120C06") != 0) + fail ("mpi_mul failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_scan(&minusfive, GCRYMPI_FMT_HEX, "-5", 0, NULL); + gcry_mpi_mul_ui (result, minusfive, 3); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus five mul three:\n%s\n", pc); + if (strcmp (pc, "-0F") != 0) + fail ("mpi_mul_ui failed at line %d", __LINE__); + gcry_free(pc); + + gcry_mpi_mul_ui (result, result, 0); + + gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char **)&pc, NULL, result); + if (debug) + gcry_log_debug ("Result of minus fifteen mul zero:\n%s\n", pc); + if (strcmp (pc, "00") != 0) + fail ("mpi_mul_ui failed at line %d", __LINE__); gcry_free(pc); gcry_mpi_release(two); gcry_mpi_release(three); gcry_mpi_release(result); + gcry_mpi_release(minusfive); 
return 1; } -- 2.32.0 From jussi.kivilinna at iki.fi Tue Nov 30 21:34:02 2021 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 30 Nov 2021 22:34:02 +0200 Subject: gcry_mpi_sub_ui result is positive when it should be negative In-Reply-To: References: Message-ID: <2e4bbf20-16ab-759a-5e3d-0d732efddf5c@iki.fi> Hello, On 30.11.2021 15.21, Guido Vranken via Gcrypt-devel wrote: > In the program below, the result of the computation -5 - 2 should be -7, but it is 7. > > Compare with the behavior of gcry_mpi_add_ui and gcry_mpi_mul_ui (commented), which do produce a negative result, as they should. > > This was tested on the latest repository checkout. Thanks for report. This looks to go all the way back to code from 1997, commit "initial checkin" (4b5e71ca4e84e61e595dec19e1c7cab0c0a73f24). -Jussi > > #include > > #define CF_CHECK_EQ(expr, res) if ( (expr) != (res) ) { goto end; } > > int main(void) > { > ? ? gcry_mpi_t A; > ? ? gcry_mpi_t res; > ? ? gcry_error_t err; > ? ? char *buf; > > ? ? CF_CHECK_EQ(err = gcry_mpi_scan(&A, GCRYMPI_FMT_HEX, "-5", 0, NULL), 0); > ? ? CF_CHECK_EQ(err = gcry_mpi_scan(&res, GCRYMPI_FMT_HEX, "0", 0, NULL), 0); > ? ? gcry_mpi_sub_ui(res, A, 2); > ? ? //gcry_mpi_add_ui(res, A, 2); > ? ? //gcry_mpi_mul_ui(res, A, 2); > ? ? CF_CHECK_EQ(err = gcry_mpi_aprint(GCRYMPI_FMT_HEX, (unsigned char**)&buf, NULL, res), 0); > ? ? printf("%s\n", buf); > end: > > ? ? return 0; > } > > _______________________________________________ > Gcrypt-devel mailing list > Gcrypt-devel at gnupg.org > http://lists.gnupg.org/mailman/listinfo/gcrypt-devel >