From jussi.kivilinna at iki.fi Sat Jan 8 12:06:11 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 13:06:11 +0200 Subject: [PATCH 2/3] Add armv8/pmull accelerated POLYVAL for GCM-SIV In-Reply-To: <20220108110612.141943-1-jussi.kivilinna@iki.fi> References: <20220108110612.141943-1-jussi.kivilinna@iki.fi> Message-ID: <20220108110612.141943-2-jussi.kivilinna@iki.fi> * cipher/cipher-gcm-armv8-aarch32-ce.S (_gcry_polyval_armv8_ce_pmull): New. * cipher/cipher-gcm-armv8-aarch64-ce.S (_gcry_polyval_armv8_ce_pmull): New. * cipher/cipher-gcm.c (_gcry_polyval_armv8_ce_pmull) (polyval_armv8_ce_pmull): New. (setupM) [GCM_USE_ARM_PMULL]: Setup 'polyval_armv8_ce_pmull' as POLYVAL function. -- Benchmark on Cortex-A53 (aarch64): Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV auth | 1.74 ns/B 547.6 MiB/s 2.01 c/B 1152 After (76% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz GCM-SIV auth | 0.990 ns/B 963.2 MiB/s 1.14 c/B 1152 Signed-off-by: Jussi Kivilinna --- cipher/cipher-gcm-armv8-aarch32-ce.S | 155 ++++++++++++++++++ cipher/cipher-gcm-armv8-aarch64-ce.S | 228 +++++++++++++++++++++++++++ cipher/cipher-gcm.c | 14 ++ 3 files changed, 397 insertions(+) diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S index fb51b339..00c547de 100644 --- a/cipher/cipher-gcm-armv8-aarch32-ce.S +++ b/cipher/cipher-gcm-armv8-aarch32-ce.S @@ -358,6 +358,161 @@ _gcry_ghash_armv8_ce_pmull: .size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull; +/* + * unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + * const byte *buf, size_t nblocks, + * void *gcm_table); + */ +.align 3 +.globl _gcry_polyval_armv8_ce_pmull +.type _gcry_polyval_armv8_ce_pmull,%function; +_gcry_polyval_armv8_ce_pmull: + /* input: + * r0: gcm_key + * r1: result/hash + * r2: buf + * r3: nblocks + * %st+0: gcm_table + */ + push {r4-r6, lr} + + cmp r3, #0 + beq .Lpolyval_do_nothing + + GET_DATA_POINTER(r4, .Lrconst64, lr) + + vld1.64 {rhash}, [r1] + vld1.64 {rh1}, [r0] + + vrev64.8 rhash, rhash /* byte-swap */ + vld1.64 {rrconst_h}, [r4] + vext.8 rhash, rhash, rhash, #8 + + cmp r3, #4 + blo .Lpolyval_less_than_4 + + /* Bulk processing of 4 blocks per loop iteration. */ + + ldr r5, [sp, #(4*4)]; + add r6, r5, #32 + + vpush {q4-q7} + + vld1.64 {rh2-rh3}, [r5] + vld1.64 {rh4}, [r6] + + vld1.64 {rbuf-rbuf1}, [r2]! + sub r3, r3, #4 + vld1.64 {rbuf2-rbuf3}, [r2]! + + cmp r3, #4 + veor rhash, rhash, rbuf /* in0 ^ hash */ + + blo .Lpolyval_end_4 + +.Lpolyval_loop_4: + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in1) * H? => rr0:rr1 */ + PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __) + + vld1.64 {rbuf-rbuf1}, [r2]! + sub r3, r3, #4 + veor rr0, rr0, rr2 + veor rr1, rr1, rr3 + + /* (in2) * H? => rr2:rr3 */ + /* (in3) * H? => rhash:rbuf3 */ + PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1, __) + + vld1.64 {rbuf2}, [r2]! + + veor rr0, rr0, rr2 + veor rr1, rr1, rr3 + + cmp r3, #4 + + veor rr0, rr0, rhash + veor rr1, rr1, rbuf3 + + vld1.64 {rbuf3}, [r2]! + + REDUCTION(rhash, rr0, rr1, rrconst_h, rt1, __) + + veor rhash, rhash, rbuf /* in0 ^ hash */ + + bhs .Lpolyval_loop_4 + +.Lpolyval_end_4: + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in1) * H? => rr0:rr1 */ + PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __) + + /* (in2) * H? => rhash:rbuf */ + /* (in3) * H? 
=> rbuf1:rbuf2 */ + PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1, + _(veor rr0, rr0, rr2; + veor rr1, rr1, rr3)) + + veor rr0, rr0, rhash + veor rr1, rr1, rbuf + + veor rr0, rr0, rbuf1 + veor rr1, rr1, rbuf2 + + REDUCTION(rhash, rr0, rr1, rrconst_h, rt1, + _(CLEAR_REG(rr2); + CLEAR_REG(rr3); + CLEAR_REG(rbuf1); + CLEAR_REG(rbuf2); + CLEAR_REG(rbuf3); + CLEAR_REG(rh2); + CLEAR_REG(rh3); + CLEAR_REG(rh4))) + + vpop {q4-q7} + + cmp r3, #0 + beq .Lpolyval_done + +.Lpolyval_less_than_4: + /* Handle remaining blocks. */ + + vld1.64 {rbuf}, [r2]! + subs r3, r3, #1 + + veor rhash, rhash, rbuf + + beq .Lpolyval_end + +.Lpolyval_loop: + vld1.64 {rbuf}, [r2]! + subs r3, r3, #1 + PMUL_128x128(rr0, rr1, rhash, rh1, rt0, __) + REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, __) + veor rhash, rhash, rbuf + + bne .Lpolyval_loop + +.Lpolyval_end: + PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf))) + REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1))) + +.Lpolyval_done: + CLEAR_REG(rr1) + vrev64.8 rhash, rhash /* byte-swap */ + CLEAR_REG(rt0) + CLEAR_REG(rr0) + vext.8 rhash, rhash, rhash, #8 + CLEAR_REG(rt1) + vst1.64 {rhash}, [r1] + CLEAR_REG(rhash) + +.Lpolyval_do_nothing: + mov r0, #0 + pop {r4-r6, pc} +.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull; + + /* * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); */ diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 13ee83ed..2c619f9b 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -370,6 +370,234 @@ _gcry_ghash_armv8_ce_pmull: ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) +/* + * unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + * const byte *buf, size_t nblocks, + * void *gcm_table); + */ +.align 3 +.globl _gcry_polyval_armv8_ce_pmull +ELF(.type _gcry_polyval_armv8_ce_pmull,%function;) +_gcry_polyval_armv8_ce_pmull: + /* input: + * x0: gcm_key + * x1: result/hash + * x2: buf + * x3: nblocks + * x4: gcm_table + */ + CFI_STARTPROC(); + + cbz x3, .Lpolyval_do_nothing; + + GET_DATA_POINTER(x5, .Lrconst) + + eor vZZ.16b, vZZ.16b, vZZ.16b + ld1 {rhash.16b}, [x1] + ld1 {rh1.16b}, [x0] + + rbit rhash.16b, rhash.16b /* bit-swap */ + ld1r {rrconst.2d}, [x5] + + cmp x3, #6 + b.lo .Lpolyval_less_than_6 + + add x6, x4, #64 + VPUSH_ABI + + ld1 {rh2.16b-rh5.16b}, [x4] + ld1 {rh6.16b}, [x6] + + sub x3, x3, #6 + + ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) + ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + rev64 rbuf1.16b, rbuf1.16b /* byte-swap */ + rev64 rbuf2.16b, rbuf2.16b /* byte-swap */ + rev64 rbuf3.16b, rbuf3.16b /* byte-swap */ + rev64 rbuf4.16b, rbuf4.16b /* byte-swap */ + rev64 rbuf5.16b, rbuf5.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + ext rbuf1.16b, rbuf1.16b, rbuf1.16b, #8 /* byte-swap */ + ext rbuf2.16b, rbuf2.16b, rbuf2.16b, #8 /* byte-swap */ + ext rbuf3.16b, rbuf3.16b, rbuf3.16b, #8 /* byte-swap */ + ext rbuf4.16b, rbuf4.16b, rbuf4.16b, #8 /* byte-swap */ + ext rbuf5.16b, rbuf5.16b, rbuf5.16b, #8 /* byte-swap */ + rbit rbuf.16b, rbuf.16b /* bit-swap */ + rbit rbuf1.16b, rbuf1.16b /* bit-swap */ + rbit rbuf2.16b, rbuf2.16b /* bit-swap */ + rbit rbuf3.16b, rbuf3.16b /* bit-swap */ + rbit rbuf4.16b, rbuf4.16b /* bit-swap */ + rbit rbuf5.16b, rbuf5.16b /* bit-swap */ + eor rhash.16b, rhash.16b, rbuf.16b + + cmp x3, #6 + b.lo .Lpolyval_end_6 + +.Lpolyval_loop_6: + + /* (in1) 
* H? => rr0:rr1 */ + /* (in2) * H? => rr2:rr3 */ + /* (in0 ^ hash) * H? => rr4:rr5 */ + PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, + rr2, rr3, rbuf2, rh4, t2, t3, + rr4, rr5, rhash, rh6, t4, t5, + _(sub x3, x3, #6)) + + ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16) + cmp x3, #6 + + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + + /* (in3) * H? => rr2:rr3 */ + /* (in4) * H? => rr6:rr7 */ + /* (in5) * H? => rr8:rr9 */ + PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1, + rr6, rr7, rbuf4, rh2, t2, t3, + rr8, rr9, rbuf5, rh1, t4, t5, + _(eor rr0.16b, rr0.16b, rr4.16b; + eor rr1.16b, rr1.16b, rr5.16b)) + + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + rev64 rbuf1.16b, rbuf1.16b /* byte-swap */ + rev64 rbuf2.16b, rbuf2.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + ext rbuf1.16b, rbuf1.16b, rbuf1.16b, #8 /* byte-swap */ + ext rbuf2.16b, rbuf2.16b, rbuf2.16b, #8 /* byte-swap */ + + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + rbit rbuf.16b, rbuf.16b /* bit-swap */ + eor rr0.16b, rr0.16b, rr6.16b + eor rr1.16b, rr1.16b, rr7.16b + rbit rbuf1.16b, rbuf1.16b /* bit-swap */ + eor rr0.16b, rr0.16b, rr8.16b + eor rr1.16b, rr1.16b, rr9.16b + ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16) + + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(rbit rbuf2.16b, rbuf2.16b), /* bit-swap */ + _(rev64 rbuf3.16b, rbuf3.16b), /* byte-swap */ + _(rev64 rbuf4.16b, rbuf4.16b)) /* byte-swap */ + + rev64 rbuf5.16b, rbuf5.16b /* byte-swap */ + ext rbuf3.16b, rbuf3.16b, rbuf3.16b, #8 /* byte-swap */ + + eor rhash.16b, rhash.16b, rbuf.16b + + ext rbuf4.16b, rbuf4.16b, rbuf4.16b, #8 /* byte-swap */ + ext rbuf5.16b, rbuf5.16b, rbuf5.16b, #8 /* byte-swap */ + rbit rbuf3.16b, rbuf3.16b /* bit-swap */ + rbit rbuf4.16b, rbuf4.16b /* bit-swap */ + rbit rbuf5.16b, rbuf5.16b /* bit-swap */ + + b.hs .Lpolyval_loop_6 + +.Lpolyval_end_6: + + /* (in1) * H? => rr0:rr1 */ + /* (in0 ^ hash) * H? => rr2:rr3 */ + /* (in2) * H? => rr4:rr5 */ + PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1, + rr2, rr3, rhash, rh6, t2, t3, + rr4, rr5, rbuf2, rh4, t4, t5, + __) + eor rr0.16b, rr0.16b, rr2.16b + eor rr1.16b, rr1.16b, rr3.16b + eor rr0.16b, rr0.16b, rr4.16b + eor rr1.16b, rr1.16b, rr5.16b + + /* (in3) * H? => rhash:rbuf */ + /* (in4) * H? => rr6:rr7 */ + /* (in5) * H? => rr8:rr9 */ + PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1, + rr6, rr7, rbuf4, rh2, t2, t3, + rr8, rr9, rbuf5, rh1, t4, t5, + _(CLEAR_REG(rh4); + CLEAR_REG(rh5); + CLEAR_REG(rh6))) + eor rr0.16b, rr0.16b, rhash.16b + eor rr1.16b, rr1.16b, rbuf.16b + eor rr0.16b, rr0.16b, rr6.16b + eor rr1.16b, rr1.16b, rr7.16b + eor rr0.16b, rr0.16b, rr8.16b + eor rr1.16b, rr1.16b, rr9.16b + + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(CLEAR_REG(rh2); + CLEAR_REG(rh3); + CLEAR_REG(rr2); + CLEAR_REG(rbuf2); + CLEAR_REG(rbuf3)), + _(CLEAR_REG(rr3); + CLEAR_REG(rr4); + CLEAR_REG(rr5); + CLEAR_REG(rr6); + CLEAR_REG(rr7)), + _(CLEAR_REG(rr8); + CLEAR_REG(rr9); + CLEAR_REG(rbuf1); + CLEAR_REG(rbuf2))) + + CLEAR_REG(rbuf4) + CLEAR_REG(rbuf5) + CLEAR_REG(t2) + CLEAR_REG(t3) + CLEAR_REG(t4) + CLEAR_REG(t5) + + VPOP_ABI + + cbz x3, .Lpolyval_done + +.Lpolyval_less_than_6: + /* Handle remaining blocks. 
*/ + + ld1 {rbuf.16b}, [x2], #16 + sub x3, x3, #1 + + rev64 rbuf.16b, rbuf.16b /* byte-swap */ + ext rbuf.16b, rbuf.16b, rbuf.16b, #8 /* byte-swap */ + rbit rbuf.16b, rbuf.16b /* bit-swap */ + + eor rhash.16b, rhash.16b, rbuf.16b + + cbz x3, .Lpolyval_end + +.Lpolyval_loop: + PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16)) + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, + _(sub x3, x3, #1; + rev64 rbuf.16b, rbuf.16b), /* byte-swap */ + _(ext rbuf.16b, rbuf.16b, rbuf.16b, #8), /* byte-swap */ + _(rbit rbuf.16b, rbuf.16b)) /* bit-swap */ + eor rhash.16b, rhash.16b, rbuf.16b + + cbnz x3, .Lpolyval_loop + +.Lpolyval_end: + PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf))) + REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __) + +.Lpolyval_done: + CLEAR_REG(rr1) + CLEAR_REG(rr0) + rbit rhash.16b, rhash.16b /* bit-swap */ + CLEAR_REG(t0) + CLEAR_REG(t1) + + st1 {rhash.2d}, [x1] + CLEAR_REG(rhash) + +.Lpolyval_do_nothing: + mov x0, #0 + ret + CFI_ENDPROC() +ELF(.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;) + + /* * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table); */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index d3ed9cf6..a039c5e9 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -57,6 +57,11 @@ extern unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result, const byte *buf, size_t nblocks, void *gcm_table); +extern unsigned int _gcry_polyval_armv8_ce_pmull (void *gcm_key, byte *result, + const byte *buf, + size_t nblocks, + void *gcm_table); + static void ghash_setup_armv8_ce_pmull (gcry_cipher_hd_t c) { @@ -71,6 +76,14 @@ ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, return _gcry_ghash_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, buf, nblocks, c->u_mode.gcm.gcm_table); } + +static unsigned int +polyval_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + return _gcry_polyval_armv8_ce_pmull(c->u_mode.gcm.u_ghash_key.key, result, + buf, nblocks, c->u_mode.gcm.gcm_table); +} #endif /* GCM_USE_ARM_PMULL */ #ifdef GCM_USE_ARM_NEON @@ -591,6 +604,7 @@ setupM (gcry_cipher_hd_t c) else if (features & HWF_ARM_PMULL) { c->u_mode.gcm.ghash_fn = ghash_armv8_ce_pmull; + c->u_mode.gcm.polyval_fn = polyval_armv8_ce_pmull; ghash_setup_armv8_ce_pmull (c); } #endif -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 12:06:10 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 13:06:10 +0200 Subject: [PATCH 1/3] Use 'vmov' and 'movi' for vector register clearing in ARM assembly Message-ID: <20220108110612.141943-1-jussi.kivilinna@iki.fi> * cipher/chacha20-aarch64.S (clear): Use 'movi'. * cipher/chacha20-armv7-neon.S (clear): Use 'vmov'. * cipher/cipher-gcm-armv7-neon.S (clear): Use 'vmov'. * cipher/cipher-gcm-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/cipher-gcm-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/rijndael-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha1-armv7-neon.S (clear): Use 'vmov'. * cipher/sha1-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha1-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/sha256-armv8-aarch32-ce.S (CLEAR_REG): Use 'vmov'. * cipher/sha256-armv8-aarch64-ce.S (CLEAR_REG): Use 'movi'. * cipher/sha512-armv7-neon.S (CLEAR_REG): New using 'vmov'. (_gcry_sha512_transform_armv7_neon): Use CLEAR_REG for clearing registers. 
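
As a stand-alone illustration of the idiom change listed above (rationale
below), here is a minimal AArch64 sketch. The 'clear_demo' symbol and the
CLEAR_REG_EOR/CLEAR_REG_MOVI names are made up for this example only; the
patch itself simply redefines each file's existing CLEAR_REG/clear macro,
with 'vmov.i8 reg, #0' as the 32-bit counterpart:

/* Old idiom: the eor reads the register it clears, so on cores that do
 * not special-case this zeroing pattern the instruction depends on
 * whatever last wrote that register. */
#define CLEAR_REG_EOR(reg)   eor reg.16b, reg.16b, reg.16b;

/* New idiom: immediate move with no source operand, hence no dependency. */
#define CLEAR_REG_MOVI(reg)  movi reg.16b, #0;

	.text
	.globl clear_demo
	.type clear_demo, %function
clear_demo:
	CLEAR_REG_MOVI(v0)
	CLEAR_REG_MOVI(v1)
	ret
	.size clear_demo, .-clear_demo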
-- Use 'vmov reg, #0' on 32-bit and 'movi reg.16b, #0' instead of self-xoring register to break false register dependency. Signed-off-by: Jussi Kivilinna --- cipher/chacha20-aarch64.S | 2 +- cipher/chacha20-armv7-neon.S | 2 +- cipher/cipher-gcm-armv7-neon.S | 2 +- cipher/cipher-gcm-armv8-aarch32-ce.S | 2 +- cipher/cipher-gcm-armv8-aarch64-ce.S | 2 +- cipher/rijndael-armv8-aarch32-ce.S | 2 +- cipher/sha1-armv7-neon.S | 2 +- cipher/sha1-armv8-aarch32-ce.S | 2 +- cipher/sha1-armv8-aarch64-ce.S | 2 +- cipher/sha256-armv8-aarch32-ce.S | 2 +- cipher/sha256-armv8-aarch64-ce.S | 2 +- cipher/sha512-armv7-neon.S | 26 ++++++++++++++------------ 12 files changed, 25 insertions(+), 23 deletions(-) diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index b8f9724a..4f76834b 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -110,7 +110,7 @@ vpunpcklqdq(x2, t2, x2); #define clear(x) \ - eor x.16b, x.16b, x.16b; + movi x.16b, #0; /********************************************************************** 4-way chacha20 diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S index 33a43df1..a862be4e 100644 --- a/cipher/chacha20-armv7-neon.S +++ b/cipher/chacha20-armv7-neon.S @@ -132,7 +132,7 @@ vswp _q0##h, _q2##l; \ vswp _q1##h, _q3##l; -#define clear(x) veor x,x,x; +#define clear(x) vmov.i8 x, #0; /********************************************************************** 4-way chacha20 diff --git a/cipher/cipher-gcm-armv7-neon.S b/cipher/cipher-gcm-armv7-neon.S index a801a5e5..16502b4a 100644 --- a/cipher/cipher-gcm-armv7-neon.S +++ b/cipher/cipher-gcm-armv7-neon.S @@ -210,7 +210,7 @@ gcry_gcm_reduction_constant: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S index 1de66a16..fb51b339 100644 --- a/cipher/cipher-gcm-armv8-aarch32-ce.S +++ b/cipher/cipher-gcm-armv8-aarch32-ce.S @@ -180,7 +180,7 @@ gcry_gcm_reduction_constant: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 877207d3..13ee83ed 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -149,7 +149,7 @@ gcry_gcm_reduction_constant: #define _(...) 
__VA_ARGS__ #define __ _() -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; #define VPUSH_ABI \ stp d8, d9, [sp, #-16]!; \ diff --git a/cipher/rijndael-armv8-aarch32-ce.S b/cipher/rijndael-armv8-aarch32-ce.S index 6d78af0a..1eafa93e 100644 --- a/cipher/rijndael-armv8-aarch32-ce.S +++ b/cipher/rijndael-armv8-aarch32-ce.S @@ -249,7 +249,7 @@ /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv7-neon.S b/cipher/sha1-armv7-neon.S index 61cc541c..2de678b8 100644 --- a/cipher/sha1-armv7-neon.S +++ b/cipher/sha1-armv7-neon.S @@ -303,7 +303,7 @@ gcry_sha1_armv7_neon_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv8-aarch32-ce.S b/cipher/sha1-armv8-aarch32-ce.S index bf2b233b..059b9a85 100644 --- a/cipher/sha1-armv8-aarch32-ce.S +++ b/cipher/sha1-armv8-aarch32-ce.S @@ -100,7 +100,7 @@ gcry_sha1_aarch32_ce_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 223268ca..8ea1486b 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -88,7 +88,7 @@ gcry_sha1_aarch64_ce_K_VEC: /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; /* diff --git a/cipher/sha256-armv8-aarch32-ce.S b/cipher/sha256-armv8-aarch32-ce.S index 2b17ab1b..95778b40 100644 --- a/cipher/sha256-armv8-aarch32-ce.S +++ b/cipher/sha256-armv8-aarch32-ce.S @@ -111,7 +111,7 @@ gcry_sha256_aarch32_ce_K: /* Other functional macros */ -#define CLEAR_REG(reg) veor reg, reg; +#define CLEAR_REG(reg) vmov.i8 reg, #0; /* diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index f57cae29..5c39e83e 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -98,7 +98,7 @@ gcry_sha256_aarch64_ce_K: /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; /* diff --git a/cipher/sha512-armv7-neon.S b/cipher/sha512-armv7-neon.S index 6596f2cd..2b186b47 100644 --- a/cipher/sha512-armv7-neon.S +++ b/cipher/sha512-armv7-neon.S @@ -91,6 +91,8 @@ #define RW1213q q14 #define RW1415q q15 +#define CLEAR_REG(reg) vmov.i8 reg, #0; + /*********************************************************************** * ARM assembly implementation of sha512 transform ***********************************************************************/ @@ -426,22 +428,22 @@ _gcry_sha512_transform_armv7_neon: /* Clear used registers */ /* d16-d31 */ - veor.u64 RW01q, RW01q; - veor.u64 RW23q, RW23q; - veor.u64 RW45q, RW45q; - veor.u64 RW67q, RW67q; + CLEAR_REG(RW01q); + CLEAR_REG(RW23q); + CLEAR_REG(RW45q); + CLEAR_REG(RW67q); vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */ - veor.u64 RW89q, RW89q; - veor.u64 RW1011q, RW1011q; - veor.u64 RW1213q, RW1213q; - veor.u64 RW1415q, RW1415q; + CLEAR_REG(RW89q); + CLEAR_REG(RW1011q); + CLEAR_REG(RW1213q); + CLEAR_REG(RW1415q); /* d8-d15 */ vpop {RT0-RT7}; /* d0-d7 (q0-q3) */ - veor.u64 %q0, %q0; - veor.u64 %q1, %q1; - veor.u64 %q2, %q2; - veor.u64 %q3, %q3; + CLEAR_REG(%q0); + CLEAR_REG(%q1); + CLEAR_REG(%q2); + CLEAR_REG(%q3); eor %r0, %r0; pop {%pc}; -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 12:06:12 2022 From: 
jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sat, 8 Jan 2022 13:06:12 +0200
Subject: [PATCH 3/3] Optimizations for AES aarch64-ce assembly implementation
In-Reply-To: <20220108110612.141943-1-jussi.kivilinna@iki.fi>
References: <20220108110612.141943-1-jussi.kivilinna@iki.fi>
Message-ID: <20220108110612.141943-3-jussi.kivilinna@iki.fi>

* cipher/rijndael-armv8-aarch64-ce.S (vk14): Remove.
(vklast, __, _): New.
(aes_preload_keys): Setup vklast.
(do_aes_one128/192/256): Split to ...
(do_aes_one_part1, do_aes_one_part2_128/192/256): ... these and add
interleave ops.
(do_aes_one128/192/256): New using above part1 and part2 macros.
(aes_round_4): Rename to ...
(aes_round_4_multikey): ... this and allow a different key to be used
for each parallel block.
(aes_round_4): New using above multikey macro.
(aes_lastround_4): Reorder AES round and xor instructions, allow a
different last key for each parallel block.
(do_aes_4_128/192/256): Split to ...
(do_aes_4_part1_multikey, do_aes_4_part1)
(do_aes_4_part2_128/192/256): ... these.
(do_aes_4_128/192/256): New using above part1 and part2 macros.
(CLEAR_REG): Use movi for clearing registers.
(aes_clear_keys): Remove branching and clear all key registers.
(_gcry_aes_enc_armv8_ce, _gcry_aes_dec_armv8_ce): Adjust to macro
changes.
(_gcry_aes_cbc_enc_armv8_ce, _gcry_aes_cbc_dec_armv8_ce)
(_gcry_aes_cfb_enc_armv8_ce, _gcry_aes_cfb_dec_armv8_ce)
(_gcry_aes_ctr32le_enc_armv8_ce): Apply entry/loop-body/exit
optimization for better interleaving of input/output processing;
First/last round key and input/output xoring optimization to reduce
critical path length.
(_gcry_aes_ctr_enc_armv8_ce): Add fast path for counter incrementing
without byte-swaps when the counter does not overflow its low 8 bits;
Apply entry/loop-body/exit optimization for better interleaving of
input/output processing; First/last round key and input/output xoring
optimization to reduce critical path length.
(_gcry_aes_ocb_enc_armv8_ce, _gcry_aes_ocb_dec_armv8_ce): Add aligned
processing for nblk and OCB offsets; Apply entry/loop-body/exit
optimization for better interleaving of input/output processing;
First/last round key and input/output xoring optimization to reduce
critical path length; Change to use same function body macro for both
encryption and decryption.
(_gcry_aes_xts_enc_armv8_ce, _gcry_aes_xts_dec_armv8_ce): Apply
entry/loop-body/exit optimization for better interleaving of
input/output processing; First/last round key and input/output xoring
optimization to reduce critical path length; Change to use same
function body macro for both encryption and decryption.
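
The CTR fast path listed above hinges on one carry check: as long as adding
the block count cannot overflow the low 8 bits of the big-endian counter, the
per-iteration counters can be produced by plain vector adds, with no
'rev64'/'ext' byte-swapping. A minimal stand-alone sketch of that test follows;
the helper name and its C-style signature are made up for illustration only,
the patch performs the same check inline with 'adds x11, x11, x12' / 'b.cs':

/* int ctr_low8_would_wrap (uint64_t ctr_lo64, uint64_t nblocks);
 * Returns 1 when adding nblocks (assumed < 256) carries out of the low
 * 8 bits of the big-endian counter, i.e. when the slow path that
 * byte-swaps and propagates the carry through all 128 bits is needed. */
	.text
	.globl ctr_low8_would_wrap
	.type ctr_low8_would_wrap, %function
ctr_low8_would_wrap:
	lsl	x2, x0, #56		/* move counter LSB into bits 63:56 */
	adds	x2, x2, x1, lsl #56	/* add block count at the same position */
	cset	w0, cs			/* carry set <=> low byte wrapped */
	ret
	.size ctr_low8_would_wrap, .-ctr_low8_would_wrap

In the loop itself the shifted counter stays live in a general register and is
bumped by '4 << 56' per iteration, so the common no-wrap case increments the
counter blocks directly in the vector lanes.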
-- Benchmark on AWS Graviton2 (2500Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte CBC enc | 0.663 ns/B 1439 MiB/s 1.66 c/B CBC dec | 0.288 ns/B 3310 MiB/s 0.720 c/B CFB enc | 0.657 ns/B 1453 MiB/s 1.64 c/B CFB dec | 0.288 ns/B 3313 MiB/s 0.720 c/B CTR dec | 0.314 ns/B 3039 MiB/s 0.785 c/B XTS enc | 0.357 ns/B 2674 MiB/s 0.891 c/B XTS dec | 0.358 ns/B 2666 MiB/s 0.894 c/B OCB enc | 0.343 ns/B 2784 MiB/s 0.856 c/B OCB dec | 0.341 ns/B 2795 MiB/s 0.853 c/B GCM-SIV enc | 0.526 ns/B 1813 MiB/s 1.31 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte perf increase CBC enc | 0.500 ns/B 1906 MiB/s 1.25 c/B +33% CBC dec | 0.263 ns/B 3622 MiB/s 0.658 c/B +9% CFB enc | 0.500 ns/B 1906 MiB/s 1.25 c/B +31% CFB dec | 0.263 ns/B 3620 MiB/s 0.658 c/B +9% CTR enc | 0.264 ns/B 3618 MiB/s 0.659 c/B +19% XTS enc | 0.350 ns/B 2722 MiB/s 0.876 c/B +2% OCB enc | 0.275 ns/B 3468 MiB/s 0.687 c/B +25% OCB dec | 0.276 ns/B 3459 MiB/s 0.689 c/B +24% GCM-SIV enc | 0.494 ns/B 1929 MiB/s 1.24 c/B +6% Benchmark on Cortex-A53 (1152Mhz): Before: AES | nanosecs/byte mebibytes/sec cycles/byte CBC enc | 1.41 ns/B 675.9 MiB/s 1.63 c/B CBC dec | 0.910 ns/B 1048 MiB/s 1.05 c/B CFB enc | 1.30 ns/B 732.2 MiB/s 1.50 c/B CFB dec | 0.910 ns/B 1048 MiB/s 1.05 c/B CTR enc | 1.03 ns/B 924.4 MiB/s 1.19 c/B XTS enc | 1.25 ns/B 763.0 MiB/s 1.44 c/B OCB enc | 1.21 ns/B 789.5 MiB/s 1.39 c/B OCB dec | 1.21 ns/B 788.9 MiB/s 1.39 c/B GCM-SIV enc | 1.92 ns/B 496.6 MiB/s 2.21 c/B After: AES | nanosecs/byte mebibytes/sec cycles/byte perf increase CBC enc | 1.14 ns/B 836.6 MiB/s 1.31 c/B +24% CBC dec | 0.843 ns/B 1132 MiB/s 0.971 c/B +8% CFB enc | 1.19 ns/B 798.8 MiB/s 1.38 c/B +9% CFB dec | 0.842 ns/B 1132 MiB/s 0.970 c/B +8% CTR enc | 0.898 ns/B 1062 MiB/s 1.03 c/B +16% XTS enc | 1.22 ns/B 779.9 MiB/s 1.41 c/B +2% OCB enc | 0.992 ns/B 961.0 MiB/s 1.14 c/B +22% OCB dec | 0.993 ns/B 960.5 MiB/s 1.14 c/B +22% GCM-SIV enc | 1.88 ns/B 507.3 MiB/s 2.17 c/B +2% Signed-off-by: Jussi Kivilinna --- cipher/rijndael-armv8-aarch64-ce.S | 1227 ++++++++++++++++------------ 1 file changed, 713 insertions(+), 514 deletions(-) diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index a87d2ca5..9f8d9d49 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -44,7 +44,13 @@ #define vk11 v28 #define vk12 v29 #define vk13 v30 -#define vk14 v31 +#define vklast v31 + + +/* Helper macros */ + +#define __ /*_*/ +#define _(...) 
__VA_ARGS__ /* AES macros */ @@ -54,39 +60,40 @@ ld1 {vk0.16b-vk3.16b}, [keysched], #64; \ ld1 {vk4.16b-vk7.16b}, [keysched], #64; \ ld1 {vk8.16b-vk10.16b}, [keysched], #48; \ + mov vklast.16b, vk10.16b; \ b.lo 1f; \ ld1 {vk11.16b-vk12.16b}, [keysched], #32; \ + mov vklast.16b, vk12.16b; \ b.eq 1f; \ - ld1 {vk13.16b-vk14.16b}, [keysched]; \ + ld1 {vk13.16b-vklast.16b}, [keysched]; \ 1: ; -#define do_aes_one128(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ +#define do_aes_one_part1(ed, mcimc, vb, vkfirst) \ + aes##ed vb.16b, vkfirst.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ + aes##mcimc vb.16b, vb.16b; + +#define do_aes_one_part2_128(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk4.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk9.16b; \ - eor vo.16b, vb.16b, vk10.16b; + aes##ed vb.16b, vk9.16b; -#define do_aes_one192(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ - aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ +#define do_aes_one_part2_192(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ @@ -95,24 +102,21 @@ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk5.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk6.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk11.16b; \ - eor vo.16b, vb.16b, vk12.16b; + aes##ed vb.16b, vk11.16b; -#define do_aes_one256(ed, mcimc, vo, vb) \ - aes##ed vb.16b, vk0.16b; \ - aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk1.16b; \ - aes##mcimc vb.16b, vb.16b; \ +#define do_aes_one_part2_256(ed, mcimc, vb, iop1, iop2) \ aes##ed vb.16b, vk2.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk3.16b; \ @@ -125,56 +129,78 @@ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk7.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop1; \ aes##ed vb.16b, vk8.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk9.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk10.16b; \ aes##mcimc vb.16b, vb.16b; \ + iop2; \ aes##ed vb.16b, vk11.16b; \ aes##mcimc vb.16b, vb.16b; \ aes##ed vb.16b, vk12.16b; \ aes##mcimc vb.16b, vb.16b; \ - aes##ed vb.16b, vk13.16b; \ - eor vo.16b, vb.16b, vk14.16b; + aes##ed vb.16b, vk13.16b; -#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ - aes##ed b0.16b, key.16b; \ +#define do_aes_one128(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_128(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define do_aes_one192(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_192(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define do_aes_one256(ed, mcimc, vo, vb, vkfirst) \ + do_aes_one_part1(ed, mcimc, vb, vkfirst); \ + do_aes_one_part2_256(ed, mcimc, vb, __, __); \ + eor vo.16b, vb.16b, vklast.16b; + +#define aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3) \ 
+ aes##ed b0.16b, key0.16b; \ aes##mcimc b0.16b, b0.16b; \ - aes##ed b1.16b, key.16b; \ + aes##ed b1.16b, key1.16b; \ aes##mcimc b1.16b, b1.16b; \ - aes##ed b2.16b, key.16b; \ + aes##ed b2.16b, key2.16b; \ aes##mcimc b2.16b, b2.16b; \ - aes##ed b3.16b, key.16b; \ + aes##ed b3.16b, key3.16b; \ aes##mcimc b3.16b, b3.16b; -#define aes_lastround_4(ed, b0, b1, b2, b3, key1, key2) \ +#define aes_round_4(ed, mcimc, b0, b1, b2, b3, key) \ + aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key, key, key, key); + +#define aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, key1, b0_key2, b1_key2, b2_key2, b3_key2) \ aes##ed b0.16b, key1.16b; \ - eor b0.16b, b0.16b, key2.16b; \ aes##ed b1.16b, key1.16b; \ - eor b1.16b, b1.16b, key2.16b; \ aes##ed b2.16b, key1.16b; \ - eor b2.16b, b2.16b, key2.16b; \ aes##ed b3.16b, key1.16b; \ - eor b3.16b, b3.16b, key2.16b; + eor o0.16b, b0.16b, b0_key2.16b; \ + eor o1.16b, b1.16b, b1_key2.16b; \ + eor o2.16b, b2.16b, b2_key2.16b; \ + eor o3.16b, b3.16b, b3_key2.16b; -#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ +#define do_aes_4_part1_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3) \ + aes_round_4_multikey(ed, mcimc, b0, b1, b2, b3, key0, key1, key2, key3); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ + aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); + +#define do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vkfirst) \ + do_aes_4_part1_multikey(ed, mcimc, b0, b1, b2, b3, vkfirst, vkfirst, vkfirst, vkfirst); + +#define do_aes_4_part2_128(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk7); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk9, vk10); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk9, b0_key, b1_key, b2_key, b3_key); -#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ +#define do_aes_4_part2_192(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ @@ -182,13 +208,10 @@ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk8); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk9); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk11, vk12); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk11, b0_key, b1_key, b2_key, b3_key); -#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk0); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk1); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk2); \ - aes_round_4(ed, mcimc, b0, b1, b2, b3, vk3); \ +#define do_aes_4_part2_256(ed, mcimc, o0, o1, o2, o3, b0, b1, b2, b3, \ + b0_key, b1_key, b2_key, b3_key) \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk4); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk5); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk6); \ @@ -198,15 +221,25 @@ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk10); \ aes_round_4(ed, mcimc, b0, b1, b2, b3, vk11); \ aes_round_4(ed, mcimc, b0, b1, b2, 
b3, vk12); \ - aes_lastround_4(ed, b0, b1, b2, b3, vk13, vk14); + aes_lastround_4(ed, o0, o1, o2, o3, b0, b1, b2, b3, vk13, b0_key, b1_key, b2_key, b3_key); +#define do_aes_4_128(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_128(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); + +#define do_aes_4_192(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_192(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); + +#define do_aes_4_256(ed, mcimc, b0, b1, b2, b3) \ + do_aes_4_part1(ed, mcimc, b0, b1, b2, b3, vk0); \ + do_aes_4_part2_256(ed, mcimc, b0, b1, b2, b3, b0, b1, b2, b3, vklast, vklast, vklast, vklast); /* Other functional macros */ -#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; +#define CLEAR_REG(reg) movi reg.16b, #0; #define aes_clear_keys(nrounds) \ - cmp nrounds, #12; \ CLEAR_REG(vk0); \ CLEAR_REG(vk1); \ CLEAR_REG(vk2); \ @@ -218,13 +251,10 @@ CLEAR_REG(vk9); \ CLEAR_REG(vk8); \ CLEAR_REG(vk10); \ - b.lo 1f; \ CLEAR_REG(vk11); \ CLEAR_REG(vk12); \ - b.eq 1f; \ CLEAR_REG(vk13); \ - CLEAR_REG(vk14); \ -1: ; + CLEAR_REG(vklast); /* @@ -252,7 +282,7 @@ _gcry_aes_enc_armv8_ce: b.eq .Lenc1_192 .Lenc1_128: - do_aes_one128(e, mc, v0, v0); + do_aes_one128(e, mc, v0, v0, vk0); .Lenc1_tail: CLEAR_REG(vk0) @@ -266,6 +296,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) + CLEAR_REG(vklast) st1 {v0.16b}, [x1] CLEAR_REG(v0) @@ -273,19 +304,18 @@ _gcry_aes_enc_armv8_ce: ret .Lenc1_192: - do_aes_one192(e, mc, v0, v0); + do_aes_one192(e, mc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Lenc1_tail .Lenc1_256: - do_aes_one256(e, mc, v0, v0); + do_aes_one256(e, mc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) - CLEAR_REG(vk14) b .Lenc1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_enc_armv8_ce,.-_gcry_aes_enc_armv8_ce;) @@ -316,7 +346,7 @@ _gcry_aes_dec_armv8_ce: b.eq .Ldec1_192 .Ldec1_128: - do_aes_one128(d, imc, v0, v0); + do_aes_one128(d, imc, v0, v0, vk0); .Ldec1_tail: CLEAR_REG(vk0) @@ -330,6 +360,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(vk8) CLEAR_REG(vk9) CLEAR_REG(vk10) + CLEAR_REG(vklast) st1 {v0.16b}, [x1] CLEAR_REG(v0) @@ -337,19 +368,18 @@ _gcry_aes_dec_armv8_ce: ret .Ldec1_192: - do_aes_one192(d, imc, v0, v0); + do_aes_one192(d, imc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) b .Ldec1_tail .Ldec1_256: - do_aes_one256(d, imc, v0, v0); + do_aes_one256(d, imc, v0, v0, vk0); CLEAR_REG(vk11) CLEAR_REG(vk12) CLEAR_REG(vk13) - CLEAR_REG(vk14) b .Ldec1_tail CFI_ENDPROC(); ELF(.size _gcry_aes_dec_armv8_ce,.-_gcry_aes_dec_armv8_ce;) @@ -381,26 +411,38 @@ _gcry_aes_cbc_enc_armv8_ce: cbz x4, .Lcbc_enc_skip cmp w5, #0 - ld1 {v1.16b}, [x3] /* load IV */ - cset x5, eq + ld1 {v4.16b}, [x3] /* load IV */ + csetm x5, eq aes_preload_keys(x0, w6); - lsl x5, x5, #4 + and x5, x5, #16 + + ld1 {v3.16b}, [x2], #16; /* load plaintext */ + mov v0.16b, vk0.16b; + sub x4, x4, #1; + eor v16.16b, vk0.16b, vklast.16b; + eor v4.16b, v4.16b, v3.16b; + do_aes_one_part1(e, mc, v4, v0); - b.eq .Lcbc_enc_loop192 - b.hi .Lcbc_enc_loop256 + b.eq .Lcbc_enc_entry_192 + b.hi .Lcbc_enc_entry_256 #define CBC_ENC(bits) \ - .Lcbc_enc_loop##bits: \ - ld1 {v0.16b}, [x2], #16; /* load plaintext */ \ - eor v1.16b, v0.16b, v1.16b; \ - sub x4, x4, #1; \ - \ - do_aes_one##bits(e, mc, v1, v1); \ + .Lcbc_enc_entry_##bits: \ + cbz x4, .Lcbc_enc_done_##bits; \ \ - st1 {v1.16b}, [x1], x5; /* store ciphertext */ \ + .Lcbc_enc_loop_##bits: \ + 
do_aes_one_part2_##bits(e, mc, v4, \ + _(ld1 {v0.16b}, [x2], #16 /* load plaintext */), \ + _(eor v0.16b, v0.16b, v16.16b)); \ + sub x4, x4, #1; \ + eor v3.16b, v4.16b, vklast.16b; \ + do_aes_one_part1(e, mc, v4, v0); \ + st1 {v3.16b}, [x1], x5; /* store ciphertext */ \ + cbnz x4, .Lcbc_enc_loop_##bits; \ \ - cbnz x4, .Lcbc_enc_loop##bits; \ + .Lcbc_enc_done_##bits: \ + do_aes_one_part2_##bits(e, mc, v4, __, __); \ b .Lcbc_enc_done; CBC_ENC(128) @@ -410,11 +452,14 @@ _gcry_aes_cbc_enc_armv8_ce: #undef CBC_ENC .Lcbc_enc_done: + eor v3.16b, v4.16b, vklast.16b; + st1 {v3.16b}, [x1]; /* store ciphertext */ aes_clear_keys(w6) + st1 {v3.16b}, [x3] /* store IV */ - st1 {v1.16b}, [x3] /* store IV */ - - CLEAR_REG(v1) + CLEAR_REG(v16) + CLEAR_REG(v4) + CLEAR_REG(v3) CLEAR_REG(v0) .Lcbc_enc_skip: @@ -445,7 +490,10 @@ _gcry_aes_cbc_dec_armv8_ce: cbz x4, .Lcbc_dec_skip - ld1 {v0.16b}, [x3] /* load IV */ + add sp, sp, #-64; + CFI_ADJUST_CFA_OFFSET(64); + + ld1 {v16.16b}, [x3] /* load IV */ aes_preload_keys(x0, w5); @@ -457,44 +505,61 @@ _gcry_aes_cbc_dec_armv8_ce: cmp x4, #4; \ b.lo .Lcbc_dec_loop_##bits; \ \ - .Lcbc_dec_loop4_##bits: \ - \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load ciphertext */ \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ sub x4, x4, #4; \ - mov v5.16b, v1.16b; \ - mov v6.16b, v2.16b; \ - mov v7.16b, v3.16b; \ - mov v16.16b, v4.16b; \ - cmp x4, #4; \ + eor v4.16b, v16.16b, vklast.16b; \ + eor v5.16b, v0.16b, vklast.16b; \ + eor v6.16b, v1.16b, vklast.16b; \ + eor v7.16b, v2.16b, vklast.16b; \ + mov v16.16b, v3.16b; /* next IV */ \ \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ + do_aes_4_part1(d, imc, v0, v1, v2, v3, vk0); \ + b.lo .Lcbc_dec_done4_##bits; \ \ - eor v1.16b, v1.16b, v0.16b; \ - eor v2.16b, v2.16b, v5.16b; \ - st1 {v1.16b-v2.16b}, [x1], #32; /* store plaintext */ \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - mov v0.16b, v16.16b; /* next IV */ \ - st1 {v3.16b-v4.16b}, [x1], #32; /* store plaintext */ \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + .Lcbc_dec_loop4_##bits: \ + do_aes_4_part2_##bits(d, imc, v8, v9, v10, v11, v0, v1, v2, v3, v4, v5, v6, v7); \ + ld1 {v0.16b-v3.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ + sub x4, x4, #4; \ + eor v4.16b, v16.16b, vklast.16b; \ + eor v5.16b, v0.16b, vklast.16b; \ + eor v6.16b, v1.16b, vklast.16b; \ + eor v7.16b, v2.16b, vklast.16b; \ + mov v16.16b, v3.16b; /* next IV */ \ + \ + do_aes_4_part1(d, imc, v0, v1, v2, v3, vk0); \ + st1 {v8.16b-v11.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcbc_dec_loop4_##bits; \ - CLEAR_REG(v3); \ + \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ + \ + .Lcbc_dec_done4_##bits: \ + do_aes_4_part2_##bits(d, imc, v0, v1, v2, v3, v0, v1, v2, v3, v4, v5, v6, v7); \ + \ CLEAR_REG(v4); \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ - CLEAR_REG(v16); \ + st1 {v0.16b-v3.16b}, [x1], #64; /* store plaintext */ \ + CLEAR_REG(v0); \ + CLEAR_REG(v3); \ cbz x4, .Lcbc_dec_done; \ \ .Lcbc_dec_loop_##bits: \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ sub x4, x4, #1; \ + eor v16.16b, v16.16b, vklast.16b; \ mov v2.16b, v1.16b; \ \ - do_aes_one##bits(d, imc, v1, v1); \ + do_aes_one_part1(d, imc, v1, vk0); \ + do_aes_one_part2_##bits(d, imc, v1, __, __); \ + eor v1.16b, v1.16b, v16.16b; \ \ - eor v1.16b, v1.16b, v0.16b; \ - mov v0.16b, v2.16b; \ + mov v16.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lcbc_dec_loop_##bits; \ @@ -509,12 +574,15 @@ 
_gcry_aes_cbc_dec_armv8_ce: .Lcbc_dec_done: aes_clear_keys(w5) - st1 {v0.16b}, [x3] /* store IV */ + st1 {v16.16b}, [x3] /* store IV */ - CLEAR_REG(v0) + CLEAR_REG(v16) CLEAR_REG(v1) CLEAR_REG(v2) + add sp, sp, #64; + CFI_ADJUST_CFA_OFFSET(-64); + .Lcbc_dec_skip: ret CFI_ENDPROC(); @@ -544,9 +612,13 @@ _gcry_aes_ctr_enc_armv8_ce: cbz x4, .Lctr_enc_skip - mov x6, #1 + add x8, sp, #-64 + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + + mov w6, #(1 << 24) movi v16.16b, #0 - mov v16.D[1], x6 + mov v16.S[3], w6 /* 1 */ /* load IV */ ldp x9, x10, [x3] @@ -554,6 +626,9 @@ _gcry_aes_ctr_enc_armv8_ce: rev x9, x9 rev x10, x10 + mov x12, #(4 << 56) + lsl x11, x10, #56 + aes_preload_keys(x0, w5); b.eq .Lctr_enc_entry_192 @@ -564,73 +639,71 @@ _gcry_aes_ctr_enc_armv8_ce: cmp x4, #4; \ b.lo .Lctr_enc_loop_##bits; \ \ - .Lctr_enc_loop4_##bits: \ - cmp x10, #0xfffffffffffffffc; \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + adds x11, x11, x12; \ + add v9.4s, v16.4s, v16.4s; /* 2 */ \ + add v10.4s, v16.4s, v9.4s; /* 3 */ \ + add v11.4s, v9.4s, v9.4s; /* 4 */ \ + mov x7, #1; \ sub x4, x4, #4; \ - b.lo .Lctr_enc_loop4_##bits##_nocarry; \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + b.cs .Lctr_enc_carry4_##bits; \ \ - adds x10, x10, #1; \ mov v1.16b, v0.16b; \ - adc x9, x9, xzr; \ - mov v2.D[1], x10; \ - mov v2.D[0], x9; \ - \ - adds x10, x10, #1; \ - rev64 v2.16b, v2.16b; \ - adc x9, x9, xzr; \ - mov v3.D[1], x10; \ - mov v3.D[0], x9; \ - \ - adds x10, x10, #1; \ - rev64 v3.16b, v3.16b; \ - adc x9, x9, xzr; \ - mov v4.D[1], x10; \ - mov v4.D[0], x9; \ + add x10, x10, #4; \ + add v2.16b, v0.16b, v16.16b; \ + add v3.8h, v0.8h, v9.8h; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.2d, v0.2d, v11.2d; \ \ - adds x10, x10, #1; \ - rev64 v4.16b, v4.16b; \ - adc x9, x9, xzr; \ - mov v0.D[1], x10; \ - mov v0.D[0], x9; \ - rev64 v0.16b, v0.16b; \ + .Lctr_enc_entry4_##bits##_carry_done: \ + mov x7, #0; \ + cmp x4, #4; \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lctr_enc_done4_##bits; \ \ - b .Lctr_enc_loop4_##bits##_store_ctr; \ + st1 {v12.16b-v15.16b}, [x8]; /* store callee saved registers */ \ \ - .Lctr_enc_loop4_##bits##_nocarry: \ + .Lctr_enc_loop4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + adds x11, x11, x12; \ + sub x4, x4, #4; \ + b.cs .Lctr_enc_carry4_##bits; \ \ - add v3.2d, v16.2d, v16.2d; /* 2 */ \ - rev64 v6.16b, v0.16b; \ + mov v1.16b, v0.16b; \ add x10, x10, #4; \ - add v4.2d, v3.2d, v16.2d; /* 3 */ \ - add v0.2d, v3.2d, v3.2d; /* 4 */ \ - rev64 v1.16b, v6.16b; \ - add v2.2d, v6.2d, v16.2d; \ - add v3.2d, v6.2d, v3.2d; \ - add v4.2d, v6.2d, v4.2d; \ - add v0.2d, v6.2d, v0.2d; \ - rev64 v2.16b, v2.16b; \ - rev64 v3.16b, v3.16b; \ - rev64 v0.16b, v0.16b; \ - rev64 v4.16b, v4.16b; \ + add v2.16b, v0.16b, v16.16b; \ + add v3.8h, v0.8h, v9.8h; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.2d, v0.2d, v11.2d; \ \ - .Lctr_enc_loop4_##bits##_store_ctr: \ - \ - st1 {v0.16b}, [x3]; \ + .Lctr_enc_loop4_##bits##_carry_done: \ cmp x4, #4; \ - ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v12.16b-v15.16b}, [x1], #64; /* store plaintext */ \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + b.hs .Lctr_enc_loop4_##bits; \ \ - eor v1.16b, v1.16b, 
v5.16b; \ - ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v5.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + ld1 {v12.16b-v15.16b}, [x8]; /* restore callee saved registers */ \ + \ + .Lctr_enc_done4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v5, v6, v7, v8, v1, v2, v3, v4, v5, v6, v7, v8); \ + \ + st1 {v5.16b-v8.16b}, [x1], #64; /* store plaintext */ \ \ - b.hs .Lctr_enc_loop4_##bits; \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ @@ -641,19 +714,48 @@ _gcry_aes_ctr_enc_armv8_ce: adds x10, x10, #1; \ mov v1.16b, v0.16b; \ adc x9, x9, xzr; \ - mov v0.D[1], x10; \ - mov v0.D[0], x9; \ + dup v0.2d, x10; \ sub x4, x4, #1; \ + ins v0.D[0], x9; \ ld1 {v2.16b}, [x2], #16; /* load ciphertext */ \ rev64 v0.16b, v0.16b; \ \ - do_aes_one##bits(e, mc, v1, v1); \ + do_aes_one_part1(e, mc, v1, vk0); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v1, __, __); \ \ - eor v1.16b, v2.16b, v1.16b; \ + eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr_enc_loop_##bits; \ - b .Lctr_enc_done; + b .Lctr_enc_done; \ + \ + .Lctr_enc_carry4_##bits: \ + \ + adds x13, x10, #1; \ + mov v1.16b, v0.16b; \ + adc x14, x9, xzr; \ + dup v2.2d, x13; \ + adds x13, x10, #2; \ + ins v2.D[0], x14; \ + adc x14, x9, xzr; \ + rev64 v2.16b, v2.16b; \ + dup v3.2d, x13; \ + adds x13, x10, #3; \ + ins v3.D[0], x14; \ + adc x14, x9, xzr; \ + rev64 v3.16b, v3.16b; \ + dup v4.2d, x13; \ + adds x10, x10, #4; \ + ins v4.D[0], x14; \ + adc x9, x9, xzr; \ + rev64 v4.16b, v4.16b; \ + dup v0.2d, x10; \ + ins v0.D[0], x9; \ + rev64 v0.16b, v0.16b; \ + \ + cbz x7, .Lctr_enc_loop4_##bits##_carry_done; \ + b .Lctr_enc_entry4_##bits##_carry_done; CTR_ENC(128) CTR_ENC(192) @@ -669,6 +771,10 @@ _gcry_aes_ctr_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); .Lctr_enc_skip: ret @@ -700,6 +806,10 @@ _gcry_aes_ctr32le_enc_armv8_ce: cbz x4, .Lctr32le_enc_skip + add x8, sp, #-64 + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + mov w6, #1 movi v16.16b, #0 mov v16.S[0], w6 @@ -712,38 +822,66 @@ _gcry_aes_ctr32le_enc_armv8_ce: b.eq .Lctr32le_enc_entry_192 b.hi .Lctr32le_enc_entry_256 -#define CTR_ENC(bits) \ +#define CTR32LE_ENC(bits) \ .Lctr32le_enc_entry_##bits: \ cmp x4, #4; \ b.lo .Lctr32le_enc_loop_##bits; \ \ - .Lctr32le_enc_loop4_##bits: \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + add v9.4s, v16.4s, v16.4s; /* 2 */ \ + cmp x4, #8; \ + add v10.4s, v9.4s, v16.4s; /* 3 */ \ sub x4, x4, #4; \ + add v11.4s, v9.4s, v9.4s; /* 4 */ \ + \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ \ - add v3.4s, v16.4s, v16.4s; /* 2 */ \ mov v1.16b, v0.16b; \ add v2.4s, v0.4s, v16.4s; \ - add v4.4s, v3.4s, v16.4s; /* 3 */ \ - add v6.4s, v3.4s, v3.4s; /* 4 */ \ - add v3.4s, v0.4s, v3.4s; \ - add v4.4s, v0.4s, v4.4s; \ - add v0.4s, v0.4s, v6.4s; \ + add v3.4s, v0.4s, v9.4s; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.4s, v0.4s, v11.4s; \ \ - cmp x4, #4; \ - ld1 {v5.16b-v7.16b}, [x2], #48; /* preload ciphertext */ \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lctr32le_enc_done4_##bits; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); 
\ + st1 {v12.16b-v15.16b}, [x8]; /* store callee saved registers */ \ \ - eor v1.16b, v1.16b, v5.16b; \ - ld1 {v5.16b}, [x2], #16; /* load ciphertext */ \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v5.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + .Lctr32le_enc_loop4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); \ + ld1 {v5.16b-v8.16b}, [x2], #64; /* preload ciphertext */ \ + \ + cmp x4, #8; \ + sub x4, x4, #4; \ + \ + mov v1.16b, v0.16b; \ + add v2.4s, v0.4s, v16.4s; \ + add v3.4s, v0.4s, v9.4s; \ + add v4.4s, v0.4s, v10.4s; \ + add v0.4s, v0.4s, v11.4s; \ + \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v12.16b-v15.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lctr32le_enc_loop4_##bits; \ + \ + ld1 {v12.16b-v15.16b}, [x8]; /* restore callee saved registers */ \ + \ + .Lctr32le_enc_done4_##bits: \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + eor v8.16b, v8.16b, vklast.16b; \ + do_aes_4_part2_##bits(e, mc, v5, v6, v7, v8, v1, v2, v3, v4, v5, v6, v7, v8); \ + \ + st1 {v5.16b-v8.16b}, [x1], #64; /* store plaintext */ \ CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ @@ -756,19 +894,21 @@ _gcry_aes_ctr32le_enc_armv8_ce: sub x4, x4, #1; \ add v0.4s, v0.4s, v16.4s; \ \ - do_aes_one##bits(e, mc, v1, v1); \ + do_aes_one_part1(e, mc, v1, vk0); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v1, __, __); \ \ - eor v1.16b, v2.16b, v1.16b; \ + eor v1.16b, v1.16b, v2.16b; \ st1 {v1.16b}, [x1], #16; /* store plaintext */ \ \ cbnz x4, .Lctr32le_enc_loop_##bits; \ b .Lctr32le_enc_done; - CTR_ENC(128) - CTR_ENC(192) - CTR_ENC(256) + CTR32LE_ENC(128) + CTR32LE_ENC(192) + CTR32LE_ENC(256) -#undef CTR_ENC +#undef CTR32LE_ENC .Lctr32le_enc_done: aes_clear_keys(w5) @@ -778,6 +918,10 @@ _gcry_aes_ctr32le_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); .Lctr32le_enc_skip: ret @@ -813,21 +957,34 @@ _gcry_aes_cfb_enc_armv8_ce: aes_preload_keys(x0, w5); + ld1 {v1.16b}, [x2], #16; /* load plaintext */ + eor v3.16b, vk0.16b, vklast.16b; + eor v0.16b, v0.16b, vklast.16b; + sub x4, x4, #1; + mov v4.16b, v3.16b; + do_aes_one_part1(e, mc, v0, v4); + b.eq .Lcfb_enc_entry_192 b.hi .Lcfb_enc_entry_256 #define CFB_ENC(bits) \ .Lcfb_enc_entry_##bits: \ + cbz x4, .Lcfb_enc_done_##bits; \ + \ .Lcfb_enc_loop_##bits: \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, \ + _(eor v4.16b, v3.16b, v1.16b), \ + _(ld1 {v1.16b}, [x2], #16 /* load plaintext */)); \ sub x4, x4, #1; \ - \ - do_aes_one##bits(e, mc, v0, v0); \ - \ - eor v0.16b, v1.16b, v0.16b; \ - st1 {v0.16b}, [x1], #16; /* store ciphertext */ \ - \ + eor v2.16b, v2.16b, v0.16b; \ + do_aes_one_part1(e, mc, v0, v4); \ + st1 {v2.16b}, [x1], #16; /* store ciphertext */ \ cbnz x4, .Lcfb_enc_loop_##bits; \ + \ + .Lcfb_enc_done_##bits: \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, __, __); \ b .Lcfb_enc_done; CFB_ENC(128) @@ -837,12 +994,16 @@ _gcry_aes_cfb_enc_armv8_ce: #undef CFB_ENC .Lcfb_enc_done: + eor v2.16b, v2.16b, v0.16b; + st1 {v2.16b}, 
[x1]; /* store ciphertext */ aes_clear_keys(w5) - - st1 {v0.16b}, [x3] /* store IV */ + st1 {v2.16b}, [x3] /* store IV */ CLEAR_REG(v0) CLEAR_REG(v1) + CLEAR_REG(v2) + CLEAR_REG(v3) + CLEAR_REG(v4) .Lcfb_enc_skip: ret @@ -873,6 +1034,9 @@ _gcry_aes_cfb_dec_armv8_ce: cbz x4, .Lcfb_dec_skip + add sp, sp, #-64; + CFI_ADJUST_CFA_OFFSET(64); + /* load IV */ ld1 {v0.16b}, [x3] @@ -886,42 +1050,60 @@ _gcry_aes_cfb_dec_armv8_ce: cmp x4, #4; \ b.lo .Lcfb_dec_loop_##bits; \ \ - .Lcfb_dec_loop4_##bits: \ - \ - ld1 {v2.16b-v4.16b}, [x2], #48; /* load ciphertext */ \ + ld1 {v2.16b-v5.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ mov v1.16b, v0.16b; \ sub x4, x4, #4; \ - cmp x4, #4; \ - mov v5.16b, v2.16b; \ - mov v6.16b, v3.16b; \ - mov v7.16b, v4.16b; \ - ld1 {v0.16b}, [x2], #16; /* load next IV / ciphertext */ \ + eor v6.16b, v2.16b, vklast.16b; \ + eor v7.16b, v3.16b, vklast.16b; \ + eor v16.16b, v4.16b, vklast.16b; \ + mov v0.16b, v5.16b; /* next IV */ \ + eor v5.16b, v5.16b, vklast.16b; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + b.lo .Lcfb_dec_done4_##bits; \ \ - eor v1.16b, v1.16b, v5.16b; \ - eor v2.16b, v2.16b, v6.16b; \ - eor v3.16b, v3.16b, v7.16b; \ - eor v4.16b, v4.16b, v0.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ \ + \ + .Lcfb_dec_loop4_##bits: \ + do_aes_4_part2_##bits(e, mc, v8, v9, v10, v11, v1, v2, v3, v4, v6, v7, v16, v5); \ + ld1 {v2.16b-v5.16b}, [x2], #64; /* load ciphertext */ \ + cmp x4, #8; \ + mov v1.16b, v0.16b; \ + sub x4, x4, #4; \ + eor v6.16b, v2.16b, vklast.16b; \ + eor v7.16b, v3.16b, vklast.16b; \ + eor v16.16b, v4.16b, vklast.16b; \ + mov v0.16b, v5.16b; /* next IV */ \ + eor v5.16b, v5.16b, vklast.16b; \ + \ + do_aes_4_part1(e, mc, v1, v2, v3, v4, vk0); \ + st1 {v8.16b-v11.16b}, [x1], #64; /* store plaintext */ \ \ b.hs .Lcfb_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ + \ + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ \ + \ + .Lcfb_dec_done4_##bits: \ + do_aes_4_part2_##bits(e, mc, v1, v2, v3, v4, v1, v2, v3, v4, v6, v7, v16, v5); \ + \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ + st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ cbz x4, .Lcfb_dec_done; \ \ .Lcfb_dec_loop_##bits: \ - \ ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ - \ sub x4, x4, #1; \ \ - do_aes_one##bits(e, mc, v0, v0); \ + do_aes_one_part1(e, mc, v0, vk0); \ + eor v2.16b, v1.16b, vklast.16b; \ + do_aes_one_part2_##bits(e, mc, v0, __, __); \ + eor v2.16b, v2.16b, v0.16b; \ \ - eor v2.16b, v1.16b, v0.16b; \ mov v0.16b, v1.16b; \ st1 {v2.16b}, [x1], #16; /* store plaintext */ \ \ @@ -942,6 +1124,10 @@ _gcry_aes_cfb_dec_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v16) + + add sp, sp, #64; + CFI_ADJUST_CFA_OFFSET(-64); .Lcfb_dec_skip: ret @@ -972,7 +1158,7 @@ _gcry_aes_ocb_enc_armv8_ce: * x3: offset * x4: checksum * x5: Ltable - * x6: nblocks (0 < nblocks <= 32) + * x6: nblocks (0 < nblocks) * w7: nrounds * %st+0: blkn => w12 */ @@ -982,110 +1168,203 @@ _gcry_aes_ocb_enc_armv8_ce: ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + aes_preload_keys(x0, w7); - b.eq .Locb_enc_entry_192 - b.hi .Locb_enc_entry_256 + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ -#define OCB_ENC(bits, ...) 
\ - .Locb_enc_entry_##bits: \ - cmp x6, #4; \ - add x12, x12, #1; \ - b.lo .Locb_enc_loop_##bits; \ + eor v0.16b, v0.16b, vk0.16b; /* offset ^ first key */ + eor v9.16b, vk0.16b, vklast.16b; /* first key ^ last key */ + + b.eq .Locb_ecry_entry_192 + b.hi .Locb_ecry_entry_256 + +#define OCB_CRYPT(bits, ed, mcimc) \ + .Locb_##ed##cry_entry_##bits: \ + /* Get number of blocks to align nblk to 4. */ \ + neg x13, x12; \ + add x12, x12, #1; /* Pre-increment nblk for ntz calculation */ \ + and x13, x13, #(4-1); \ + cmp x13, x6; \ + csel x13, x6, x13, hi; \ + cbz x13, .Locb_##ed##cry_alignment_ok_##bits; \ + \ + /* Number of blocks after alignment. */ \ + sub x14, x6, x13; \ \ - .Locb_enc_loop4_##bits: \ + /* If number after alignment is less than 4, skip aligned handling \ + * completely. */ \ + cmp x14, #4; \ + csel x13, x6, x13, lo; \ + \ + .Locb_##ed##cry_unaligned_entry_##bits: \ + cmp x13, #4; \ + \ + .Locb_##ed##cry_loop1_##bits: \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - add w9, w12, #1; \ - add w10, w12, #2; \ - add w11, w12, #3; \ - rbit w8, w12; \ - add w12, w12, #4; \ - rbit w9, w9; \ - rbit w10, w10; \ - rbit w11, w11; \ - clz w8, w8; /* ntz(i+0) */ \ - clz w9, w9; /* ntz(i+1) */ \ - clz w10, w10; /* ntz(i+2) */ \ - clz w11, w11; /* ntz(i+3) */ \ + rbit x8, x12; \ + add x12, x12, #1; \ + clz x8, x8; /* ntz(i) */ \ add x8, x5, x8, lsl #4; \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ - add x9, x5, x9, lsl #4; \ - add x10, x5, x10, lsl #4; \ - add x11, x5, x11, lsl #4; \ \ - sub x6, x6, #4; \ + ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ + ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ + eor v0.16b, v0.16b, v2.16b; \ + sub x13, x13, #1; \ + ENC(eor v16.16b, v16.16b, v1.16b); \ + sub x6, x6, #1; \ \ - ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ - eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ - ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ - eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ - ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ - eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ - eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ - ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ - eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ - eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ - eor v1.16b, v1.16b, v5.16b; /* P_i+0 xor Offset_i+0 */ \ - eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ - eor v2.16b, v2.16b, v6.16b; /* P_i+1 xor Offset_i+1 */ \ - eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ - cmp x6, #4; \ - eor v3.16b, v3.16b, v7.16b; /* P_i+2 xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* P_i+3 xor Offset_i+3 */ \ + do_aes_one_part1(ed, mcimc, v1, v0); \ + eor v2.16b, v0.16b, v9.16b; \ + do_aes_one_part2_##bits(ed, mcimc, v1, __, __); \ + eor v1.16b, v1.16b, v2.16b; \ + st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + DEC(eor v16.16b, v16.16b, v1.16b); \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + cbnz x13, .Locb_##ed##cry_loop1_##bits; \ \ - eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ - eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ - eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ - st1 {v1.16b-v4.16b}, [x1], #64; \ + cbz x6, .Locb_##ed##cry_done; \ \ - b.hs .Locb_enc_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x6, .Locb_enc_done; \ + /* nblk is now aligned and we have 4 or more blocks. 
So jump directly to \ + * aligned processing. */ \ + b .Locb_##ed##cry_aligned_entry_##bits; \ \ - .Locb_enc_loop_##bits: \ + .Locb_##ed##cry_alignment_ok_##bits: \ + cbz x6, .Locb_##ed##cry_done; \ + \ + /* Short buffers do not benefit from L-array optimization. */ \ + cmp x6, #4; \ + mov x13, x6; \ + b.lo .Locb_##ed##cry_unaligned_entry_##bits; \ + \ + .Locb_##ed##cry_aligned_entry_##bits: \ + /* Prepare L-array optimization. \ + * Since nblk is aligned to 4, offsets will have following construction: \ + * - block1 = ntz{0} = offset ^ L[0] \ + * - block2 = ntz{1} = offset ^ L[0] ^ L[1] \ + * - block3 = ntz{0} = offset ^ L[1] \ + * - block4 = ntz{x} = offset ^ L[1] ^ L[ntz{x}] \ + */ \ + ld1 {v10.16b-v11.16b}, [x5]; /* preload L[0] && L[1] */ \ + mov x15, #4; \ + \ + st1 {v12.16b-v15.16b}, [x16]; /* store callee saved registers */ \ \ /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ /* Checksum_i = Checksum_{i-1} xor P_i */ \ /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - rbit x8, x12; \ - add x12, x12, #1; \ - clz x8, x8; /* ntz(i) */ \ - add x8, x5, x8, lsl #4; \ + add x11, x12, #3; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ + rbit x11, x11; \ + eor v6.16b, v10.16b, v11.16b; /* L[0] ^ L[1] */ \ + ENC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + add x12, x12, #4; \ + clz x11, x11; /* ntz(i+3) */ \ + add x15, x15, #4; \ + add x11, x5, x11, lsl #4; \ \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ - ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ - sub x6, x6, #1; \ - eor v0.16b, v0.16b, v2.16b; \ - eor v16.16b, v16.16b, v1.16b; \ - eor v1.16b, v1.16b, v0.16b; \ + eor v5.16b, v0.16b, v10.16b; /* Offset_i+0 */ \ + ENC(eor v16.16b, v16.16b, v2.16b); /* Checksum_i+1 */ \ + ld1 {v8.16b}, [x11]; /* load L_{ntz(i+3)} */ \ + ENC(eor v16.16b, v16.16b, v3.16b); /* Checksum_i+2 */ \ + eor v6.16b, v0.16b, v6.16b; /* Offset_i+1 */ \ + ENC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+3 */ \ + eor v7.16b, v0.16b, v11.16b; /* Offset_i+2 */ \ + eor v8.16b, v8.16b, v11.16b; /* L[1] ^ L[ntz{x}] */ \ + cmp x15, x13; \ + eor v0.16b, v0.16b, v8.16b; /* Offset_i+3 */ \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v5, v6, v7, v0); /* P_i+j xor Offset_i+j */ \ + b.hi .Locb_##ed##cry_aligned_done4_##bits; \ + \ + .Locb_##ed##cry_aligned_loop4_##bits: \ + add x11, x12, #3; \ + eor v5.16b, v5.16b, v9.16b; \ + eor v6.16b, v6.16b, v9.16b; \ + rbit x11, x11; \ + eor v7.16b, v7.16b, v9.16b; \ + eor v8.16b, v0.16b, v9.16b; \ + clz x11, x11; /* ntz(i+3) */ \ + do_aes_4_part2_##bits(ed, mcimc, v12, v13, v14, v15, v1, v2, v3, v4, v5, v6, v7, v8); /* xor Offset_i+j */ \ \ - do_aes_one##bits(e, mc, v1, v1); \ + /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ + /* Checksum_i = Checksum_{i-1} xor P_i */ \ + /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i) */ \ \ - eor v1.16b, v1.16b, v0.16b; \ - st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ + add x12, x12, #4; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load P_i+<0-3> */ \ + eor v6.16b, v10.16b, v11.16b; /* L[0] ^ L[1] */ \ + add x15, x15, #4; \ + DEC(eor v16.16b, v16.16b, v12.16b); /* Checksum_i+0 */ \ + ENC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + add x11, x5, x11, lsl #4; \ + \ + eor v5.16b, v0.16b, v10.16b; /* Offset_i+0 */ \ + ENC(eor v16.16b, v16.16b, v2.16b); /* Checksum_i+1 */ \ + DEC(eor v16.16b, v16.16b, v13.16b); /* Checksum_1+2 */ \ + ld1 {v8.16b}, [x11]; /* load L_{ntz(i+3)} */ \ + ENC(eor v16.16b, v16.16b, v3.16b); /* Checksum_i+2 */ \ + DEC(eor v16.16b, v16.16b, v14.16b); /* Checksum_i+0+3 */ \ + eor 
v6.16b, v0.16b, v6.16b; /* Offset_i+1 */ \ + ENC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+3 */ \ + DEC(eor v16.16b, v16.16b, v15.16b); /* Checksum_i+0+1+2 */ \ + eor v7.16b, v0.16b, v11.16b; /* Offset_i+2 */ \ + eor v8.16b, v8.16b, v11.16b; /* L[1] ^ L[ntz{x}] */ \ + cmp x15, x13; \ + eor v0.16b, v0.16b, v8.16b; /* Offset_i+3 */ \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v5, v6, v7, v0); /* P_i+j xor Offset_i+j */ \ + st1 {v12.16b-v15.16b}, [x1], #64; \ + \ + b.ls .Locb_##ed##cry_aligned_loop4_##bits; \ + \ + .Locb_##ed##cry_aligned_done4_##bits: \ + eor v5.16b, v5.16b, v9.16b; \ + eor v6.16b, v6.16b, v9.16b; \ + eor v7.16b, v7.16b, v9.16b; \ + eor v8.16b, v0.16b, v9.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v1, v2, v3, v4, v1, v2, v3, v4, v5, v6, v7, v8); /* xor Offset_i+j */ \ + DEC(eor v16.16b, v16.16b, v1.16b); /* Checksum_i+0 */ \ + DEC(eor v5.16b, v2.16b, v3.16b); /* Checksum_1+2 */ \ + DEC(eor v16.16b, v16.16b, v4.16b); /* Checksum_i+0+3 */ \ + st1 {v1.16b-v4.16b}, [x1], #64; \ + DEC(eor v16.16b, v16.16b, v5.16b); /* Checksum_i+0+1+2 */ \ \ - cbnz x6, .Locb_enc_loop_##bits; \ - b .Locb_enc_done; + sub x15, x15, #4; \ + CLEAR_REG(v3); \ + CLEAR_REG(v4); \ + ld1 {v12.16b-v15.16b}, [x16]; /* restore callee saved registers */ \ + sub x13, x13, x15; \ + sub x6, x6, x15; \ + CLEAR_REG(v5); \ + CLEAR_REG(v6); \ + \ + /* Handle tailing 1?3 blocks in unaligned loop. */ \ + mov x13, x6; \ + cbnz x6, .Locb_##ed##cry_unaligned_entry_##bits; \ + \ + b .Locb_##ed##cry_done; - OCB_ENC(128) - OCB_ENC(192) - OCB_ENC(256) +#define ENC(...) __VA_ARGS__ +#define DEC(...) /*_*/ + OCB_CRYPT(128, e, mc) + OCB_CRYPT(192, e, mc) + OCB_CRYPT(256, e, mc) +#undef ENC +#undef DEC -#undef OCB_ENC +.Locb_ecry_done: + eor v0.16b, v0.16b, vk0.16b; /* restore offset */ -.Locb_enc_done: + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ @@ -1094,8 +1373,12 @@ _gcry_aes_ocb_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v7) CLEAR_REG(v16) + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); + ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1124,7 +1407,7 @@ _gcry_aes_ocb_dec_armv8_ce: * x3: offset * x4: checksum * x5: Ltable - * x6: nblocks (0 < nblocks <= 32) + * x6: nblocks (0 < nblocks) * w7: nrounds * %st+0: blkn => w12 */ @@ -1134,110 +1417,34 @@ _gcry_aes_ocb_dec_armv8_ce: ld1 {v0.16b}, [x3] /* load offset */ ld1 {v16.16b}, [x4] /* load checksum */ + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + aes_preload_keys(x0, w7); - b.eq .Locb_dec_entry_192 - b.hi .Locb_dec_entry_256 + st1 {v8.16b-v11.16b}, [sp]; /* store callee saved registers */ -#define OCB_DEC(bits) \ - .Locb_dec_entry_##bits: \ - cmp x6, #4; \ - add w12, w12, #1; \ - b.lo .Locb_dec_loop_##bits; \ - \ - .Locb_dec_loop4_##bits: \ - \ - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ - /* Checksum_i = Checksum_{i-1} xor P_i */ \ - \ - add w9, w12, #1; \ - add w10, w12, #2; \ - add w11, w12, #3; \ - rbit w8, w12; \ - add w12, w12, #4; \ - rbit w9, w9; \ - rbit w10, w10; \ - rbit w11, w11; \ - clz w8, w8; /* ntz(i+0) */ \ - clz w9, w9; /* ntz(i+1) */ \ - clz w10, w10; /* ntz(i+2) */ \ - clz w11, w11; /* ntz(i+3) */ \ - add x8, x5, x8, lsl #4; \ - ld1 {v1.16b-v4.16b}, [x2], #64; /* load C_i+<0-3> */ \ - add x9, x5, x9, lsl #4; \ - add x10, x5, x10, lsl #4; \ - add x11, x5, x11, lsl #4; \ - \ - sub x6, x6, #4; \ - \ - 
ld1 {v5.16b}, [x8]; /* load L_{ntz(i+0)} */ \ - ld1 {v6.16b}, [x9]; /* load L_{ntz(i+1)} */ \ - ld1 {v7.16b}, [x10]; /* load L_{ntz(i+2)} */ \ - eor v5.16b, v5.16b, v0.16b; /* Offset_i+0 */ \ - ld1 {v0.16b}, [x11]; /* load L_{ntz(i+3)} */ \ - eor v6.16b, v6.16b, v5.16b; /* Offset_i+1 */ \ - eor v1.16b, v1.16b, v5.16b; /* C_i+0 xor Offset_i+0 */ \ - eor v7.16b, v7.16b, v6.16b; /* Offset_i+2 */ \ - eor v2.16b, v2.16b, v6.16b; /* C_i+1 xor Offset_i+1 */ \ - eor v0.16b, v0.16b, v7.16b; /* Offset_i+3 */ \ - cmp x6, #4; \ - eor v3.16b, v3.16b, v7.16b; /* C_i+2 xor Offset_i+2 */ \ - eor v4.16b, v4.16b, v0.16b; /* C_i+3 xor Offset_i+3 */ \ - \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ - \ - eor v1.16b, v1.16b, v5.16b; /* xor Offset_i+0 */ \ - eor v2.16b, v2.16b, v6.16b; /* xor Offset_i+1 */ \ - eor v16.16b, v16.16b, v1.16b; /* Checksum_i+0 */ \ - eor v3.16b, v3.16b, v7.16b; /* xor Offset_i+2 */ \ - eor v16.16b, v16.16b, v2.16b; /* Checksum_i+1 */ \ - eor v4.16b, v4.16b, v0.16b; /* xor Offset_i+3 */ \ - eor v16.16b, v16.16b, v3.16b; /* Checksum_i+2 */ \ - eor v16.16b, v16.16b, v4.16b; /* Checksum_i+3 */ \ - st1 {v1.16b-v4.16b}, [x1], #64; \ - \ - b.hs .Locb_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x6, .Locb_dec_done; \ - \ - .Locb_dec_loop_##bits: \ - \ - /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */ \ - /* P_i = Offset_i xor DECIPHER(K, C_i xor Offset_i) */ \ - /* Checksum_i = Checksum_{i-1} xor P_i */ \ - \ - rbit w8, w12; \ - add w12, w12, #1; \ - clz w8, w8; /* ntz(i) */ \ - add x8, x5, x8, lsl #4; \ - \ - ld1 {v1.16b}, [x2], #16; /* load ciphertext */ \ - ld1 {v2.16b}, [x8]; /* load L_{ntz(i)} */ \ - sub x6, x6, #1; \ - eor v0.16b, v0.16b, v2.16b; \ - eor v1.16b, v1.16b, v0.16b; \ - \ - do_aes_one##bits(d, imc, v1, v1) \ - \ - eor v1.16b, v1.16b, v0.16b; \ - st1 {v1.16b}, [x1], #16; /* store plaintext */ \ - eor v16.16b, v16.16b, v1.16b; \ - \ - cbnz x6, .Locb_dec_loop_##bits; \ - b .Locb_dec_done; + eor v0.16b, v0.16b, vk0.16b; /* offset ^ first key */ + eor v9.16b, vk0.16b, vklast.16b; /* first key ^ last key */ + + b.eq .Locb_dcry_entry_192 + b.hi .Locb_dcry_entry_256 + +#define ENC(...) /*_*/ +#define DEC(...) 
__VA_ARGS__ + OCB_CRYPT(128, d, imc) + OCB_CRYPT(192, d, imc) + OCB_CRYPT(256, d, imc) +#undef ENC +#undef DEC - OCB_DEC(128) - OCB_DEC(192) - OCB_DEC(256) +#undef OCB_CRYPT -#undef OCB_DEC +.Locb_dcry_done: + eor v0.16b, v0.16b, vk0.16b; /* restore offset */ -.Locb_dec_done: + ld1 {v8.16b-v11.16b}, [sp]; /* restore callee saved registers */ aes_clear_keys(w7) st1 {v16.16b}, [x4] /* store checksum */ @@ -1248,6 +1455,9 @@ _gcry_aes_ocb_dec_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) + add sp, sp, #128; + CFI_ADJUST_CFA_OFFSET(-128); + ret CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1371,7 +1581,7 @@ _gcry_aes_ocb_auth_armv8_ce: eor v0.16b, v0.16b, v2.16b; \ eor v1.16b, v1.16b, v0.16b; \ \ - do_aes_one##bits(e, mc, v1, v1) \ + do_aes_one##bits(e, mc, v1, v1, vk0) \ \ eor v16.16b, v16.16b, v1.16b; \ \ @@ -1425,6 +1635,10 @@ _gcry_aes_xts_enc_armv8_ce: cbz x4, .Lxts_enc_skip + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + /* load tweak */ ld1 {v0.16b}, [x3] @@ -1435,18 +1649,66 @@ _gcry_aes_xts_enc_armv8_ce: mov v16.D[1], x7 aes_preload_keys(x0, w5); + eor vklast.16b, vklast.16b, vk0.16b; - b.eq .Lxts_enc_entry_192 - b.hi .Lxts_enc_entry_256 + b.eq .Lxts_ecry_entry_192 + b.hi .Lxts_ecry_entry_256 -#define XTS_ENC(bits) \ - .Lxts_enc_entry_##bits: \ +#define XTS_CRYPT(bits, ed, mcimc) \ + .Lxts_##ed##cry_entry_##bits: \ cmp x4, #4; \ - b.lo .Lxts_enc_loop_##bits; \ + b.lo .Lxts_##ed##cry_loop_##bits; \ \ - .Lxts_enc_loop4_##bits: \ + st1 {v8.16b}, [sp]; /* store callee saved registers */ \ + ext v4.16b, v0.16b, v0.16b, #8; \ + mov v8.16b, v0.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v5.2d, v0.2d, v0.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v5.16b, v5.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v6.2d, v5.2d, v5.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v6.16b, v6.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v7.2d, v6.2d, v6.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v7.16b, v7.16b, v2.16b; \ + \ + sshr v2.2d, v4.2d, #63; \ + add v3.2d, v7.2d, v7.2d; \ + and v2.16b, v2.16b, v16.16b; \ + add v4.2d, v4.2d, v4.2d; \ + eor v0.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load plaintext */ \ + cmp x4, #8; \ + sub x4, x4, #4; \ + \ + eor v8.16b, v8.16b, vk0.16b; \ + eor v5.16b, v5.16b, vk0.16b; \ + eor v6.16b, v6.16b, vk0.16b; \ + eor v7.16b, v7.16b, vk0.16b; \ + \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v8, v5, v6, v7); \ + b.lo .Lxts_##ed##cry_done4_##bits; \ + \ + st1 {v9.16b-v12.16b}, [x16]; /* store callee saved registers */ \ + \ + .Lxts_##ed##cry_loop4_##bits: \ + eor v8.16b, v8.16b, vklast.16b; \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v9, v10, v11, v12, v1, v2, v3, v4, v8, v5, v6, v7); \ \ ext v4.16b, v0.16b, v0.16b, #8; \ + mov v8.16b, v0.16b; \ \ sshr v2.2d, v4.2d, #63; \ add v5.2d, v0.2d, v0.2d; \ @@ -1470,62 +1732,66 @@ _gcry_aes_xts_enc_armv8_ce: add v3.2d, v7.2d, v7.2d; \ and v2.16b, v2.16b, v16.16b; \ add v4.2d, v4.2d, v4.2d; \ - eor v3.16b, v3.16b, v2.16b; \ - ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ - st1 {v3.16b}, [x3]; \ + eor v0.16b, v3.16b, v2.16b; \ + ld1 {v1.16b-v4.16b}, [x2], #64; /* load plaintext */ \ + cmp x4, #8; \ sub x4, x4, #4; \ - eor v1.16b, v1.16b, v0.16b; \ \ - ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ - cmp x4, #4; \ - eor v2.16b, 
v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ + eor v8.16b, v8.16b, vk0.16b; \ + eor v5.16b, v5.16b, vk0.16b; \ + eor v6.16b, v6.16b, vk0.16b; \ + eor v7.16b, v7.16b, vk0.16b; \ \ - do_aes_4_##bits(e, mc, v1, v2, v3, v4); \ + do_aes_4_part1_multikey(ed, mcimc, v1, v2, v3, v4, v8, v5, v6, v7); \ + \ + st1 {v9.16b-v12.16b}, [x1], #64; /* store plaintext */ \ + \ + b.hs .Lxts_##ed##cry_loop4_##bits; \ + \ + ld1 {v9.16b-v12.16b}, [x16]; /* restore callee saved registers */ \ + \ + .Lxts_##ed##cry_done4_##bits: \ + eor v8.16b, v8.16b, vklast.16b; \ + eor v5.16b, v5.16b, vklast.16b; \ + eor v6.16b, v6.16b, vklast.16b; \ + eor v7.16b, v7.16b, vklast.16b; \ + do_aes_4_part2_##bits(ed, mcimc, v1, v2, v3, v4, v1, v2, v3, v4, v8, v5, v6, v7); \ \ - eor v1.16b, v1.16b, v0.16b; \ - ld1 {v0.16b}, [x3]; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ \ - b.hs .Lxts_enc_loop4_##bits; \ - CLEAR_REG(v3); \ CLEAR_REG(v4); \ + ld1 {v8.16b}, [sp]; /* restore callee saved registers */ \ CLEAR_REG(v5); \ CLEAR_REG(v6); \ CLEAR_REG(v7); \ - cbz x4, .Lxts_enc_done; \ + cbz x4, .Lxts_##ed##cry_done; \ \ - .Lxts_enc_loop_##bits: \ + .Lxts_##ed##cry_loop_##bits: \ \ ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ ext v3.16b, v0.16b, v0.16b, #8; \ - mov v2.16b, v0.16b; \ + eor v2.16b, v0.16b, vk0.16b; \ sshr v3.2d, v3.2d, #63; \ add v0.2d, v0.2d, v0.2d; \ and v3.16b, v3.16b, v16.16b; \ - eor v1.16b, v1.16b, v2.16b; \ - eor v0.16b, v0.16b, v3.16b; \ sub x4, x4, #1; \ + eor v0.16b, v0.16b, v3.16b; \ \ - do_aes_one##bits(e, mc, v1, v1); \ - \ + do_aes_one_part1(ed, mcimc, v1, v2); \ + eor v2.16b, v2.16b, vklast.16b; \ + do_aes_one_part2_##bits(ed, mcimc, v1, __, __); \ eor v1.16b, v1.16b, v2.16b; \ + \ st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ \ - cbnz x4, .Lxts_enc_loop_##bits; \ - b .Lxts_enc_done; + cbnz x4, .Lxts_##ed##cry_loop_##bits; \ + b .Lxts_##ed##cry_done; - XTS_ENC(128) - XTS_ENC(192) - XTS_ENC(256) + XTS_CRYPT(128, e, mc) + XTS_CRYPT(192, e, mc) + XTS_CRYPT(256, e, mc) -#undef XTS_ENC - -.Lxts_enc_done: +.Lxts_ecry_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ @@ -1533,6 +1799,11 @@ _gcry_aes_xts_enc_armv8_ce: CLEAR_REG(v0) CLEAR_REG(v1) CLEAR_REG(v2) + CLEAR_REG(v3) + CLEAR_REG(v16) + + add sp, sp, 128; + CFI_ADJUST_CFA_OFFSET(-128); .Lxts_enc_skip: ret @@ -1565,6 +1836,10 @@ _gcry_aes_xts_dec_armv8_ce: cbz x4, .Lxts_dec_skip + add x16, sp, #-64; + add sp, sp, #-128; + CFI_ADJUST_CFA_OFFSET(128); + /* load tweak */ ld1 {v0.16b}, [x3] @@ -1575,97 +1850,18 @@ _gcry_aes_xts_dec_armv8_ce: mov v16.D[1], x7 aes_preload_keys(x0, w5); + eor vklast.16b, vklast.16b, vk0.16b; - b.eq .Lxts_dec_entry_192 - b.hi .Lxts_dec_entry_256 + b.eq .Lxts_dcry_entry_192 + b.hi .Lxts_dcry_entry_256 -#define XTS_DEC(bits) \ - .Lxts_dec_entry_##bits: \ - cmp x4, #4; \ - b.lo .Lxts_dec_loop_##bits; \ - \ - .Lxts_dec_loop4_##bits: \ - \ - ext v4.16b, v0.16b, v0.16b, #8; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v5.2d, v0.2d, v0.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v5.16b, v5.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v6.2d, v5.2d, v5.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v6.16b, v6.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v7.2d, v6.2d, v6.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v7.16b, v7.16b, v2.16b; \ - \ - sshr v2.2d, v4.2d, #63; \ - add v3.2d, 
v7.2d, v7.2d; \ - and v2.16b, v2.16b, v16.16b; \ - add v4.2d, v4.2d, v4.2d; \ - eor v3.16b, v3.16b, v2.16b; \ - ld1 {v1.16b-v2.16b}, [x2], #32; /* load plaintext */ \ - st1 {v3.16b}, [x3]; \ - sub x4, x4, #4; \ - eor v1.16b, v1.16b, v0.16b; \ - \ - ld1 {v3.16b-v4.16b}, [x2], #32; /* load plaintext */ \ - cmp x4, #4; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - \ - do_aes_4_##bits(d, imc, v1, v2, v3, v4); \ - \ - eor v1.16b, v1.16b, v0.16b; \ - ld1 {v0.16b}, [x3]; \ - eor v2.16b, v2.16b, v5.16b; \ - eor v3.16b, v3.16b, v6.16b; \ - eor v4.16b, v4.16b, v7.16b; \ - st1 {v1.16b-v4.16b}, [x1], #64; /* store plaintext */ \ - \ - b.hs .Lxts_dec_loop4_##bits; \ - CLEAR_REG(v3); \ - CLEAR_REG(v4); \ - CLEAR_REG(v5); \ - CLEAR_REG(v6); \ - CLEAR_REG(v7); \ - cbz x4, .Lxts_dec_done; \ - \ - .Lxts_dec_loop_##bits: \ - \ - ld1 {v1.16b}, [x2], #16; /* load plaintext */ \ - ext v3.16b, v0.16b, v0.16b, #8; \ - mov v2.16b, v0.16b; \ - sshr v3.2d, v3.2d, #63; \ - add v0.2d, v0.2d, v0.2d; \ - and v3.16b, v3.16b, v16.16b; \ - eor v1.16b, v1.16b, v2.16b; \ - eor v0.16b, v0.16b, v3.16b; \ - sub x4, x4, #1; \ - \ - do_aes_one##bits(d, imc, v1, v1); \ - \ - eor v1.16b, v1.16b, v2.16b; \ - st1 {v1.16b}, [x1], #16; /* store ciphertext */ \ - \ - cbnz x4, .Lxts_dec_loop_##bits; \ - b .Lxts_dec_done; - - XTS_DEC(128) - XTS_DEC(192) - XTS_DEC(256) + XTS_CRYPT(128, d, imc) + XTS_CRYPT(192, d, imc) + XTS_CRYPT(256, d, imc) -#undef XTS_DEC +#undef XTS_CRYPT -.Lxts_dec_done: +.Lxts_dcry_done: aes_clear_keys(w5) st1 {v0.16b}, [x3] /* store tweak */ @@ -1674,6 +1870,9 @@ _gcry_aes_xts_dec_armv8_ce: CLEAR_REG(v1) CLEAR_REG(v2) + add sp, sp, 128; + CFI_ADJUST_CFA_OFFSET(-128); + .Lxts_dec_skip: ret CFI_ENDPROC(); -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:37 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:37 +0200 Subject: [PATCH 2/4] Add straight-line speculation hardening for aarch64 assembly In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-2-jussi.kivilinna@iki.fi> * cipher/asm-common-aarch64.h (ret_spec_stop): New. * cipher/asm-poly1305-aarch64.h: Use 'ret_spec_stop' for 'ret' instruction. * cipher/camellia-aarch64.S: Likewise. * cipher/chacha20-aarch64.S: Likewise. * cipher/cipher-gcm-armv8-aarch64-ce.S: Likewise. * cipher/crc-armv8-aarch64-ce.S: Likewise. * cipher/rijndael-aarch64.S: Likewise. * cipher/rijndael-armv8-aarch64-ce.S: Likewise. * cipher/sha1-armv8-aarch64-ce.S: Likewise. * cipher/sha256-armv8-aarch64-ce.S: Likewise. * cipher/sm3-aarch64.S: Likewise. * cipher/twofish-aarch64.S: Likewise. * mpi/aarch64/mpih-add1.S: Likewise. * mpi/aarch64/mpih-mul1.S: Likewise. * mpi/aarch64/mpih-mul2.S: Likewise. * mpi/aarch64/mpih-mul3.S: Likewise. * mpi/aarch64/mpih-sub1.S: Likewise. 
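
Background on the mitigation: some cores may speculatively execute the
instructions that sit after an unconditional control-flow change such as
'ret' before the return target resolves. The 'ret_spec_stop' macro added
to asm-common-aarch64.h below therefore pads every return with a
'b .; dsb sy; isb' sequence that is never reached architecturally, so any
speculation that falls straight through the 'ret' cannot run ahead into
unrelated code. As a rough sketch only (the function name here is made up
for illustration), an epilogue now expands to something like:

	example_fn:
		mov	x0, #0
		ret			/* architectural return */
		b	.		/* never reached; stops straight-line speculation */
		dsb	sy		/* full barrier for anything that gets past the branch */
		isb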
-- Signed-off-by: Jussi Kivilinna --- cipher/asm-common-aarch64.h | 4 ++++ cipher/asm-poly1305-aarch64.h | 2 +- cipher/camellia-aarch64.S | 6 +++--- cipher/chacha20-aarch64.S | 4 ++-- cipher/cipher-gcm-armv8-aarch64-ce.S | 6 +++--- cipher/crc-armv8-aarch64-ce.S | 8 ++++---- cipher/rijndael-aarch64.S | 4 ++-- cipher/rijndael-armv8-aarch64-ce.S | 30 ++++++++++++++-------------- cipher/sha1-armv8-aarch64-ce.S | 2 +- cipher/sha256-armv8-aarch64-ce.S | 2 +- cipher/sm3-aarch64.S | 2 +- cipher/twofish-aarch64.S | 4 ++-- mpi/aarch64/mpih-add1.S | 2 +- mpi/aarch64/mpih-mul1.S | 2 +- mpi/aarch64/mpih-mul2.S | 2 +- mpi/aarch64/mpih-mul3.S | 4 ++-- mpi/aarch64/mpih-sub1.S | 2 +- 17 files changed, 45 insertions(+), 41 deletions(-) diff --git a/cipher/asm-common-aarch64.h b/cipher/asm-common-aarch64.h index cf0afe1f..6ce773f2 100644 --- a/cipher/asm-common-aarch64.h +++ b/cipher/asm-common-aarch64.h @@ -101,4 +101,8 @@ # define CFI_REG_ON_STACK(reg,rsp_offs) #endif +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; b .; dsb sy; isb; + #endif /* GCRY_ASM_COMMON_AARCH64_H */ diff --git a/cipher/asm-poly1305-aarch64.h b/cipher/asm-poly1305-aarch64.h index 90092709..2f05aae2 100644 --- a/cipher/asm-poly1305-aarch64.h +++ b/cipher/asm-poly1305-aarch64.h @@ -237,7 +237,7 @@ _gcry_poly1305_aarch64_blocks1: mov x0, #0; POLY1305_POP_REGS(); - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size _gcry_poly1305_aarch64_blocks1, .-_gcry_poly1305_aarch64_blocks1;) #endif diff --git a/cipher/camellia-aarch64.S b/cipher/camellia-aarch64.S index f4980862..30b568d3 100644 --- a/cipher/camellia-aarch64.S +++ b/cipher/camellia-aarch64.S @@ -238,7 +238,7 @@ _gcry_camellia_arm_encrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_RESTORE_STATE() .ltorg @@ -252,7 +252,7 @@ _gcry_camellia_arm_encrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_ENDPROC() .ltorg ELF(.size _gcry_camellia_arm_encrypt_block,.-_gcry_camellia_arm_encrypt_block;) @@ -299,7 +299,7 @@ _gcry_camellia_arm_decrypt_block: CFI_ADJUST_CFA_OFFSET(-16) CFI_RESTORE(x19) CFI_RESTORE(x30) - ret; + ret_spec_stop; CFI_RESTORE_STATE() .ltorg diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S index 4f76834b..2a980b95 100644 --- a/cipher/chacha20-aarch64.S +++ b/cipher/chacha20-aarch64.S @@ -356,7 +356,7 @@ _gcry_chacha20_aarch64_blocks4: clear(X15); eor x0, x0, x0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_chacha20_aarch64_blocks4, .-_gcry_chacha20_aarch64_blocks4;) @@ -641,7 +641,7 @@ _gcry_chacha20_poly1305_aarch64_blocks4: eor x0, x0, x0 POLY1305_POP_REGS() - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_chacha20_poly1305_aarch64_blocks4, .-_gcry_chacha20_poly1305_aarch64_blocks4;) diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S index 2c619f9b..e6714249 100644 --- a/cipher/cipher-gcm-armv8-aarch64-ce.S +++ b/cipher/cipher-gcm-armv8-aarch64-ce.S @@ -365,7 +365,7 @@ _gcry_ghash_armv8_ce_pmull: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;) @@ -593,7 +593,7 @@ _gcry_polyval_armv8_ce_pmull: .Lpolyval_do_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_polyval_armv8_ce_pmull,.-_gcry_polyval_armv8_ce_pmull;) @@ -645,7 +645,7 @@ _gcry_ghash_setup_armv8_ce_pmull: st1 {rh2.16b-rh4.16b}, [x1], #(3*16) st1 {rh5.16b-rh6.16b}, [x1] - ret + ret_spec_stop 
CFI_ENDPROC() ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;) diff --git a/cipher/crc-armv8-aarch64-ce.S b/cipher/crc-armv8-aarch64-ce.S index 060abdfe..7ac884af 100644 --- a/cipher/crc-armv8-aarch64-ce.S +++ b/cipher/crc-armv8-aarch64-ce.S @@ -227,7 +227,7 @@ _gcry_crc32r_armv8_ce_bulk: /* store CRC */ st1 {v0.s}[2], [x0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;) @@ -260,7 +260,7 @@ _gcry_crc32r_armv8_ce_reduction_4: mov w0, v0.s[1] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;) @@ -457,7 +457,7 @@ _gcry_crc32_armv8_ce_bulk: rev32 v0.8b, v0.8b /* byte swap */ st1 {v0.s}[0], [x0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;) @@ -490,7 +490,7 @@ _gcry_crc32_armv8_ce_reduction_4: rev32 v0.8b, v0.8b /* Return in input endian */ mov w0, v0.s[0] - ret + ret_spec_stop CFI_ENDPROC() ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;) diff --git a/cipher/rijndael-aarch64.S b/cipher/rijndael-aarch64.S index e77dd4e0..184fcd20 100644 --- a/cipher/rijndael-aarch64.S +++ b/cipher/rijndael-aarch64.S @@ -263,7 +263,7 @@ _gcry_aes_arm_encrypt_block: stp RC, RD, [RDST, #8]; mov x0, #(0); - ret; + ret_spec_stop; .ltorg .Lenc_not_128: @@ -486,7 +486,7 @@ _gcry_aes_arm_decrypt_block: stp RC, RD, [RDST, #8]; mov x0, #(0); - ret; + ret_spec_stop; .ltorg .Ldec_256: diff --git a/cipher/rijndael-armv8-aarch64-ce.S b/cipher/rijndael-armv8-aarch64-ce.S index 9f8d9d49..4fef0345 100644 --- a/cipher/rijndael-armv8-aarch64-ce.S +++ b/cipher/rijndael-armv8-aarch64-ce.S @@ -301,7 +301,7 @@ _gcry_aes_enc_armv8_ce: CLEAR_REG(v0) mov x0, #0 - ret + ret_spec_stop .Lenc1_192: do_aes_one192(e, mc, v0, v0, vk0); @@ -365,7 +365,7 @@ _gcry_aes_dec_armv8_ce: CLEAR_REG(v0) mov x0, #0 - ret + ret_spec_stop .Ldec1_192: do_aes_one192(d, imc, v0, v0, vk0); @@ -463,7 +463,7 @@ _gcry_aes_cbc_enc_armv8_ce: CLEAR_REG(v0) .Lcbc_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_enc_armv8_ce,.-_gcry_aes_cbc_enc_armv8_ce;) @@ -584,7 +584,7 @@ _gcry_aes_cbc_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-64); .Lcbc_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cbc_dec_armv8_ce,.-_gcry_aes_cbc_dec_armv8_ce;) @@ -777,7 +777,7 @@ _gcry_aes_ctr_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lctr_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ctr_enc_armv8_ce,.-_gcry_aes_ctr_enc_armv8_ce;) @@ -924,7 +924,7 @@ _gcry_aes_ctr32le_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lctr32le_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ctr32le_enc_armv8_ce,.-_gcry_aes_ctr32le_enc_armv8_ce;) @@ -1006,7 +1006,7 @@ _gcry_aes_cfb_enc_armv8_ce: CLEAR_REG(v4) .Lcfb_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_enc_armv8_ce,.-_gcry_aes_cfb_enc_armv8_ce;) @@ -1130,7 +1130,7 @@ _gcry_aes_cfb_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-64); .Lcfb_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_cfb_dec_armv8_ce,.-_gcry_aes_cfb_dec_armv8_ce;) @@ -1379,7 +1379,7 @@ _gcry_aes_ocb_enc_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_enc_armv8_ce,.-_gcry_aes_ocb_enc_armv8_ce;) @@ -1458,7 +1458,7 @@ _gcry_aes_ocb_dec_armv8_ce: add sp, sp, #128; CFI_ADJUST_CFA_OFFSET(-128); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size 
_gcry_aes_ocb_dec_armv8_ce,.-_gcry_aes_ocb_dec_armv8_ce;) @@ -1605,7 +1605,7 @@ _gcry_aes_ocb_auth_armv8_ce: CLEAR_REG(v2) CLEAR_REG(v16) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ocb_auth_armv8_ce,.-_gcry_aes_ocb_auth_armv8_ce;) @@ -1806,7 +1806,7 @@ _gcry_aes_xts_enc_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lxts_enc_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_xts_enc_armv8_ce,.-_gcry_aes_xts_enc_armv8_ce;) @@ -1874,7 +1874,7 @@ _gcry_aes_xts_dec_armv8_ce: CFI_ADJUST_CFA_OFFSET(-128); .Lxts_dec_skip: - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_xts_dec_armv8_ce,.-_gcry_aes_xts_dec_armv8_ce;) @@ -1897,7 +1897,7 @@ _gcry_aes_sbox4_armv8_ce: addv s0, v0.4s mov w0, v0.S[0] CLEAR_REG(v0) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_sbox4_armv8_ce,.-_gcry_aes_sbox4_armv8_ce;) @@ -1914,7 +1914,7 @@ _gcry_aes_invmixcol_armv8_ce: aesimc v0.16b, v0.16b st1 {v0.16b}, [x0] CLEAR_REG(v0) - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_invmixcol_armv8_ce,.-_gcry_aes_invmixcol_armv8_ce;) diff --git a/cipher/sha1-armv8-aarch64-ce.S b/cipher/sha1-armv8-aarch64-ce.S index 8ea1486b..ea26564b 100644 --- a/cipher/sha1-armv8-aarch64-ce.S +++ b/cipher/sha1-armv8-aarch64-ce.S @@ -194,7 +194,7 @@ _gcry_sha1_transform_armv8_ce: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_armv8_ce,.-_gcry_sha1_transform_armv8_ce;) diff --git a/cipher/sha256-armv8-aarch64-ce.S b/cipher/sha256-armv8-aarch64-ce.S index 5c39e83e..d0fa6285 100644 --- a/cipher/sha256-armv8-aarch64-ce.S +++ b/cipher/sha256-armv8-aarch64-ce.S @@ -208,7 +208,7 @@ _gcry_sha256_transform_armv8_ce: .Ldo_nothing: mov x0, #0 - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;) diff --git a/cipher/sm3-aarch64.S b/cipher/sm3-aarch64.S index 77dba2ba..3fb89006 100644 --- a/cipher/sm3-aarch64.S +++ b/cipher/sm3-aarch64.S @@ -650,7 +650,7 @@ _gcry_sm3_transform_aarch64: CFI_ADJUST_CFA_OFFSET(-16); CFI_RESTORE(x28); CFI_RESTORE(x29); - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_sm3_transform_aarch64, .-_gcry_sm3_transform_aarch64;) diff --git a/cipher/twofish-aarch64.S b/cipher/twofish-aarch64.S index 9f35b5cd..7941fe3a 100644 --- a/cipher/twofish-aarch64.S +++ b/cipher/twofish-aarch64.S @@ -262,7 +262,7 @@ _gcry_twofish_arm_encrypt_block: str_output_le(RDST, RC, RD, RA, RB, RT0, RT1); - ret; + ret_spec_stop; CFI_ENDPROC(); .ltorg ELF(.size _gcry_twofish_arm_encrypt_block,.-_gcry_twofish_arm_encrypt_block;) @@ -313,7 +313,7 @@ _gcry_twofish_arm_decrypt_block: str_output_le(RDST, RA, RB, RC, RD, RT0, RT1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_arm_decrypt_block,.-_gcry_twofish_arm_decrypt_block;) diff --git a/mpi/aarch64/mpih-add1.S b/mpi/aarch64/mpih-add1.S index cc356bce..24859b17 100644 --- a/mpi/aarch64/mpih-add1.S +++ b/mpi/aarch64/mpih-add1.S @@ -69,6 +69,6 @@ C_SYMBOL_NAME(_gcry_mpih_add_n): .Lend: adc x0, xzr, xzr; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_add_n),.-C_SYMBOL_NAME(_gcry_mpih_add_n);) diff --git a/mpi/aarch64/mpih-mul1.S b/mpi/aarch64/mpih-mul1.S index 0db54444..f34c13c5 100644 --- a/mpi/aarch64/mpih-mul1.S +++ b/mpi/aarch64/mpih-mul1.S @@ -94,6 +94,6 @@ C_SYMBOL_NAME(_gcry_mpih_mul_1): .Lend: mov x0, x4; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_mul_1),.-C_SYMBOL_NAME(_gcry_mpih_mul_1);) diff --git a/mpi/aarch64/mpih-mul2.S b/mpi/aarch64/mpih-mul2.S index b4cc6eeb..1880999d 
100644 --- a/mpi/aarch64/mpih-mul2.S +++ b/mpi/aarch64/mpih-mul2.S @@ -106,6 +106,6 @@ C_SYMBOL_NAME(_gcry_mpih_addmul_1): .Lend: mov x0, x6; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_addmul_1),.-C_SYMBOL_NAME(_gcry_mpih_addmul_1);) diff --git a/mpi/aarch64/mpih-mul3.S b/mpi/aarch64/mpih-mul3.S index 47a189b6..e5faeddc 100644 --- a/mpi/aarch64/mpih-mul3.S +++ b/mpi/aarch64/mpih-mul3.S @@ -115,10 +115,10 @@ C_SYMBOL_NAME(_gcry_mpih_submul_1): cbnz w2, .Large_loop; mov x0, x7; - ret; + ret_spec_stop; .Loop_end: cinc x0, x7, cc; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_submul_1),.-C_SYMBOL_NAME(_gcry_mpih_submul_1);) diff --git a/mpi/aarch64/mpih-sub1.S b/mpi/aarch64/mpih-sub1.S index 16b6c004..46908286 100644 --- a/mpi/aarch64/mpih-sub1.S +++ b/mpi/aarch64/mpih-sub1.S @@ -69,6 +69,6 @@ C_SYMBOL_NAME(_gcry_mpih_sub_n): .Lend: cset x0, cc; - ret; + ret_spec_stop; CFI_ENDPROC() ELF(.size C_SYMBOL_NAME(_gcry_mpih_sub_n),.-C_SYMBOL_NAME(_gcry_mpih_sub_n);) -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:39 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:39 +0200 Subject: [PATCH 4/4] mpi/config.links: merge i586 targets with rest i*86 targets In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-4-jussi.kivilinna@iki.fi> * mpi/config.links: Merge i586 targets with rest i[3467]86 targets. -- Signed-off-by: Jussi Kivilinna --- mpi/config.links | 64 ++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/mpi/config.links b/mpi/config.links index deb98bf0..8cd6657e 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -50,26 +50,14 @@ case "${host}" in path="" mpi_cpu_arch="x86" ;; - i[3467]86*-*-openbsd* | \ - i[3467]86*-*-freebsd*-elf | \ - i[3467]86*-*-freebsd[3-9]* | \ - i[3467]86*-*-freebsd[12][0-9]*| \ - i[3467]86*-*-freebsdelf* | \ - i[3467]86*-*-netbsd* | \ - i[3467]86*-*-k*bsd*) - echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-openbsd* | \ - i586*-*-freebsd*-elf | \ - i586*-*-freebsd[3-9]* | \ - i586*-*-freebsd[12][0-9]*| \ - i586*-*-freebsdelf* | \ - i586*-*-netbsd* | \ - i586*-*-k*bsd* | \ - pentium-*-netbsd* | \ + i[34567]86*-*-openbsd* | \ + i[34567]86*-*-freebsd*-elf | \ + i[34567]86*-*-freebsd[3-9]* | \ + i[34567]86*-*-freebsd[12][0-9]*| \ + i[34567]86*-*-freebsdelf* | \ + i[34567]86*-*-netbsd* | \ + i[34567]86*-*-k*bsd* | \ + pentium-*-netbsd* | \ pentiumpro-*-netbsd*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h @@ -82,46 +70,24 @@ case "${host}" in path="i386" mpi_cpu_arch="x86" ;; - i[3467]86*-*-linuxaout* | \ - i[3467]86*-*-linuxoldld* | \ - i[3467]86*-*-*bsd*) - echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h - echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-linuxaout* | \ - i586*-*-linuxoldld* | \ - i586*-*-*bsd*) + i[34567]86*-*-linuxaout* | \ + i[34567]86*-*-linuxoldld* | \ + i[34567]86*-*-*bsd*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" mpi_cpu_arch="x86" ;; - i[3467]86*-msdosdjgpp* | \ - i[34]86*-apple-darwin*) + 
i[34567]86*-msdosdjgpp* | \ + i[34567]86*-apple-darwin*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h path="i386" mpi_cpu_arch="x86" ;; - i586*-msdosdjgpp* | \ - i[567]86*-apple-darwin*) - echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i[3467]86*-*-*) - echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h - cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i386" - mpi_cpu_arch="x86" - ;; - i586*-*-* | \ - pentium-*-* | \ + i[34567]86*-*-* | \ + pentium-*-* | \ pentiumpro-*-*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:36 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:36 +0200 Subject: [PATCH 1/4] Add straight-line speculation hardening for amd64 and i386 assembly Message-ID: <20220108201339.360118-1-jussi.kivilinna@iki.fi> * cipher/asm-common-amd64.h (ret_spec_stop): New. * cipher/arcfour-amd64.S: Use 'ret_spec_stop' for 'ret' instruction. * cipher/blake2b-amd64-avx2.S: Likewise. * cipher/blake2s-amd64-avx.S: Likewise. * cipher/blowfish-amd64.S: Likewise. * cipher/camellia-aesni-avx-amd64.S: Likewise. * cipher/camellia-aesni-avx2-amd64.h: Likewise. * cipher/cast5-amd64.S: Likewise. * cipher/chacha20-amd64-avx2.S: Likewise. * cipher/chacha20-amd64-ssse3.S: Likewise. * cipher/des-amd64.S: Likewise. * cipher/rijndael-aarch64.S: Likewise. * cipher/rijndael-amd64.S: Likewise. * cipher/rijndael-ssse3-amd64-asm.S: Likewise. * cipher/rijndael-vaes-avx2-amd64.S: Likewise. * cipher/salsa20-amd64.S: Likewise. * cipher/serpent-avx2-amd64.S: Likewise. * cipher/serpent-sse2-amd64.S: Likewise. * cipher/sha1-avx-amd64.S: Likewise. * cipher/sha1-avx-bmi2-amd64.S: Likewise. * cipher/sha1-avx2-bmi2-amd64.S: Likewise. * cipher/sha1-ssse3-amd64.S: Likewise. * cipher/sha256-avx-amd64.S: Likewise. * cipher/sha256-avx2-bmi2-amd64.S: Likewise. * cipher/sha256-ssse3-amd64.S: Likewise. * cipher/sha512-avx-amd64.S: Likewise. * cipher/sha512-avx2-bmi2-amd64.S: Likewise. * cipher/sha512-ssse3-amd64.S: Likewise. * cipher/sm3-avx-bmi2-amd64.S: Likewise. * cipher/sm4-aesni-avx-amd64.S: Likewise. * cipher/sm4-aesni-avx2-amd64.S: Likewise. * cipher/twofish-amd64.S: Likewise. * cipher/twofish-avx2-amd64.S: Likewise. * cipher/whirlpool-sse2-amd64.S: Likewise. * mpi/amd64/func_abi.h (CFI_*): Remove, include from "asm-common-amd64.h" instead. (FUNC_EXIT): Use 'ret_spec_stop' for 'ret' instruction. * mpi/asm-common-amd64.h: New. * mpi/i386/mpih-add1.S: Use 'ret_spec_stop' for 'ret' instruction. * mpi/i386/mpih-lshift.S: Likewise. * mpi/i386/mpih-mul1.S: Likewise. * mpi/i386/mpih-mul2.S: Likewise. * mpi/i386/mpih-mul3.S: Likewise. * mpi/i386/mpih-rshift.S: Likewise. * mpi/i386/mpih-sub1.S: Likewise. * mpi/i386/syntax.h (ret_spec_stop): New. 
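
Same idea as in the aarch64 patch, adapted to x86: the 'ret_spec_stop'
macro added to asm-common-amd64.h below follows every 'ret' with a
'jmp .; int3' pair that normal execution never reaches (mpi/i386/syntax.h
gains an equivalent for the i386 code), so straight-line speculation past
a return either spins on the self-branch or hits the trap instead of
running into whatever code happens to follow. A minimal sketch, again
with a made-up function name, of an epilogue after the change:

	example_fn:
		xorl	%eax, %eax
		ret			/* architectural return */
		jmp	.		/* unreachable; confines straight-line speculation */
		int3			/* ...or traps it outright */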
-- Signed-off-by: Jussi Kivilinna --- cipher/arcfour-amd64.S | 2 +- cipher/asm-common-amd64.h | 4 ++++ cipher/blake2b-amd64-avx2.S | 2 +- cipher/blake2s-amd64-avx.S | 2 +- cipher/blowfish-amd64.S | 18 +++++++++--------- cipher/camellia-aesni-avx-amd64.S | 20 ++++++++++---------- cipher/camellia-aesni-avx2-amd64.h | 16 ++++++++-------- cipher/cast5-amd64.S | 14 +++++++------- cipher/chacha20-amd64-avx2.S | 4 ++-- cipher/chacha20-amd64-ssse3.S | 8 ++++---- cipher/des-amd64.S | 10 +++++----- cipher/rijndael-amd64.S | 4 ++-- cipher/rijndael-ssse3-amd64-asm.S | 18 +++++++++--------- cipher/rijndael-vaes-avx2-amd64.S | 14 +++++++------- cipher/salsa20-amd64.S | 6 +++--- cipher/serpent-avx2-amd64.S | 16 ++++++++-------- cipher/serpent-sse2-amd64.S | 16 ++++++++-------- cipher/sha1-avx-amd64.S | 2 +- cipher/sha1-avx-bmi2-amd64.S | 2 +- cipher/sha1-avx2-bmi2-amd64.S | 2 +- cipher/sha1-ssse3-amd64.S | 2 +- cipher/sha256-avx-amd64.S | 2 +- cipher/sha256-avx2-bmi2-amd64.S | 2 +- cipher/sha256-ssse3-amd64.S | 2 +- cipher/sha512-avx-amd64.S | 2 +- cipher/sha512-avx2-bmi2-amd64.S | 2 +- cipher/sha512-ssse3-amd64.S | 2 +- cipher/sm3-avx-bmi2-amd64.S | 2 +- cipher/sm4-aesni-avx-amd64.S | 20 ++++++++++---------- cipher/sm4-aesni-avx2-amd64.S | 14 +++++++------- cipher/twofish-amd64.S | 20 ++++++++++---------- cipher/twofish-avx2-amd64.S | 16 ++++++++-------- cipher/whirlpool-sse2-amd64.S | 2 +- mpi/amd64/func_abi.h | 28 +++------------------------- mpi/asm-common-amd64.h | 26 ++++++++++++++++++++++++++ mpi/i386/mpih-add1.S | 2 +- mpi/i386/mpih-lshift.S | 4 ++-- mpi/i386/mpih-mul1.S | 2 +- mpi/i386/mpih-mul2.S | 2 +- mpi/i386/mpih-mul3.S | 2 +- mpi/i386/mpih-rshift.S | 4 ++-- mpi/i386/mpih-sub1.S | 2 +- mpi/i386/syntax.h | 6 ++++++ 43 files changed, 180 insertions(+), 166 deletions(-) create mode 100644 mpi/asm-common-amd64.h diff --git a/cipher/arcfour-amd64.S b/cipher/arcfour-amd64.S index 221dfeff..2abd90a7 100644 --- a/cipher/arcfour-amd64.S +++ b/cipher/arcfour-amd64.S @@ -99,7 +99,7 @@ _gcry_arcfour_amd64: pop %rbp CFI_POP(%rbp) EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC() .L__gcry_arcfour_amd64_end: ELF(.size _gcry_arcfour_amd64,.L__gcry_arcfour_amd64_end-_gcry_arcfour_amd64) diff --git a/cipher/asm-common-amd64.h b/cipher/asm-common-amd64.h index 9d4a028a..8ee9d9e7 100644 --- a/cipher/asm-common-amd64.h +++ b/cipher/asm-common-amd64.h @@ -186,4 +186,8 @@ # define EXIT_SYSV_FUNC #endif +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; jmp .; int3; + #endif /* GCRY_ASM_COMMON_AMD64_H */ diff --git a/cipher/blake2b-amd64-avx2.S b/cipher/blake2b-amd64-avx2.S index 357e8a51..3601b65f 100644 --- a/cipher/blake2b-amd64-avx2.S +++ b/cipher/blake2b-amd64-avx2.S @@ -291,7 +291,7 @@ _gcry_blake2b_transform_amd64_avx2: xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2b_transform_amd64_avx2, .-_gcry_blake2b_transform_amd64_avx2;) diff --git a/cipher/blake2s-amd64-avx.S b/cipher/blake2s-amd64-avx.S index 5b936758..5094b4c1 100644 --- a/cipher/blake2s-amd64-avx.S +++ b/cipher/blake2s-amd64-avx.S @@ -269,7 +269,7 @@ _gcry_blake2s_transform_amd64_avx: xor %eax, %eax; vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blake2s_transform_amd64_avx, .-_gcry_blake2s_transform_amd64_avx;) diff --git a/cipher/blowfish-amd64.S b/cipher/blowfish-amd64.S index bdb361d7..2b4ffa1a 100644 --- a/cipher/blowfish-amd64.S +++ b/cipher/blowfish-amd64.S @@ -151,7 +151,7 @@ __blowfish_enc_blk1: movq %r11, %rbp; 
CFI_RESTORE(%rbp) - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk1,.-__blowfish_enc_blk1;) @@ -182,7 +182,7 @@ _gcry_blowfish_amd64_do_encrypt: movl RX0d, (RX2); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_do_encrypt,.-_gcry_blowfish_amd64_do_encrypt;) @@ -210,7 +210,7 @@ _gcry_blowfish_amd64_encrypt_block: write_block(); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_encrypt_block,.-_gcry_blowfish_amd64_encrypt_block;) @@ -253,7 +253,7 @@ _gcry_blowfish_amd64_decrypt_block: CFI_RESTORE(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_decrypt_block,.-_gcry_blowfish_amd64_decrypt_block;) @@ -367,7 +367,7 @@ __blowfish_enc_blk4: outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_enc_blk4,.-__blowfish_enc_blk4;) @@ -398,7 +398,7 @@ __blowfish_dec_blk4: outbswap_block4(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __blowfish_dec_blk4,.-__blowfish_dec_blk4;) @@ -468,7 +468,7 @@ _gcry_blowfish_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_ctr_enc,.-_gcry_blowfish_amd64_ctr_enc;) @@ -529,7 +529,7 @@ _gcry_blowfish_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cbc_dec,.-_gcry_blowfish_amd64_cbc_dec;) @@ -593,7 +593,7 @@ _gcry_blowfish_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_blowfish_amd64_cfb_dec,.-_gcry_blowfish_amd64_cfb_dec;) diff --git a/cipher/camellia-aesni-avx-amd64.S b/cipher/camellia-aesni-avx-amd64.S index 64cabaa5..5c304e57 100644 --- a/cipher/camellia-aesni-avx-amd64.S +++ b/cipher/camellia-aesni-avx-amd64.S @@ -822,7 +822,7 @@ __camellia_enc_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk16,.-__camellia_enc_blk16;) @@ -887,7 +887,7 @@ __camellia_dec_blk16: %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, (key_table)(CTX), (%rax), 1 * 16(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk16,.-__camellia_dec_blk16;) @@ -1021,7 +1021,7 @@ _gcry_camellia_aesni_avx_ctr_enc: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ctr_enc,.-_gcry_camellia_aesni_avx_ctr_enc;) @@ -1094,7 +1094,7 @@ _gcry_camellia_aesni_avx_cbc_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cbc_dec,.-_gcry_camellia_aesni_avx_cbc_dec;) @@ -1176,7 +1176,7 @@ _gcry_camellia_aesni_avx_cfb_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_cfb_dec,.-_gcry_camellia_aesni_avx_cfb_dec;) @@ -1328,7 +1328,7 @@ _gcry_camellia_aesni_avx_ocb_enc: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_enc,.-_gcry_camellia_aesni_avx_ocb_enc;) @@ -1499,7 +1499,7 @@ _gcry_camellia_aesni_avx_ocb_dec: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_dec,.-_gcry_camellia_aesni_avx_ocb_dec;) @@ -1647,7 +1647,7 @@ _gcry_camellia_aesni_avx_ocb_auth: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_camellia_aesni_avx_ocb_auth,.-_gcry_camellia_aesni_avx_ocb_auth;) @@ -2096,7 +2096,7 @@ __camellia_avx_setup128: vzeroall; - ret; + ret_spec_stop; 
CFI_ENDPROC(); ELF(.size __camellia_avx_setup128,.-__camellia_avx_setup128;) @@ -2576,7 +2576,7 @@ __camellia_avx_setup256: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_avx_setup256,.-__camellia_avx_setup256;) diff --git a/cipher/camellia-aesni-avx2-amd64.h b/cipher/camellia-aesni-avx2-amd64.h index be7bb0aa..e93c40b8 100644 --- a/cipher/camellia-aesni-avx2-amd64.h +++ b/cipher/camellia-aesni-avx2-amd64.h @@ -815,7 +815,7 @@ __camellia_enc_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, ((key_table) + 8 * 8)(%r8), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_enc_blk32,.-__camellia_enc_blk32;) @@ -880,7 +880,7 @@ __camellia_dec_blk32: %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15, (key_table)(CTX), (%rax), 1 * 32(%rax)); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __camellia_dec_blk32,.-__camellia_dec_blk32;) @@ -1084,7 +1084,7 @@ FUNC_NAME(ctr_enc): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ctr_enc),.-FUNC_NAME(ctr_enc);) @@ -1161,7 +1161,7 @@ FUNC_NAME(cbc_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cbc_dec),.-FUNC_NAME(cbc_dec);) @@ -1245,7 +1245,7 @@ FUNC_NAME(cfb_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(cfb_dec),.-FUNC_NAME(cfb_dec);) @@ -1419,7 +1419,7 @@ FUNC_NAME(ocb_enc): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_enc),.-FUNC_NAME(ocb_enc);) @@ -1616,7 +1616,7 @@ FUNC_NAME(ocb_dec): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_dec),.-FUNC_NAME(ocb_dec);) @@ -1787,7 +1787,7 @@ FUNC_NAME(ocb_auth): leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size FUNC_NAME(ocb_auth),.-FUNC_NAME(ocb_auth);) diff --git a/cipher/cast5-amd64.S b/cipher/cast5-amd64.S index 82f67890..a804654c 100644 --- a/cipher/cast5-amd64.S +++ b/cipher/cast5-amd64.S @@ -219,7 +219,7 @@ _gcry_cast5_amd64_encrypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_encrypt_block,.-_gcry_cast5_amd64_encrypt_block;) @@ -269,7 +269,7 @@ _gcry_cast5_amd64_decrypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_decrypt_block,.-_gcry_cast5_amd64_decrypt_block;) @@ -399,7 +399,7 @@ __cast5_enc_blk4: round_enc_last4(14, F4_3, F4_1); outbswap_block4(RLR0, RLR1, RLR2, RLR3); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __cast5_enc_blk4,.-__cast5_enc_blk4;) @@ -432,7 +432,7 @@ __cast5_dec_blk4: outbswap_block4(RLR0, RLR1, RLR2, RLR3); CFI_ENDPROC(); - ret; + ret_spec_stop; ELF(.size __cast5_dec_blk4,.-__cast5_dec_blk4;) .align 8 @@ -508,7 +508,7 @@ _gcry_cast5_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_ctr_enc,.-_gcry_cast5_amd64_ctr_enc;) @@ -582,7 +582,7 @@ _gcry_cast5_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cbc_dec,.-_gcry_cast5_amd64_cbc_dec;) @@ -655,7 +655,7 @@ _gcry_cast5_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_cast5_amd64_cfb_dec,.-_gcry_cast5_amd64_cfb_dec;) diff --git a/cipher/chacha20-amd64-avx2.S b/cipher/chacha20-amd64-avx2.S index 51e107be..9f2a036a 100644 --- a/cipher/chacha20-amd64-avx2.S +++ b/cipher/chacha20-amd64-avx2.S @@ -322,7 +322,7 @@ _gcry_chacha20_amd64_avx2_blocks8: /* eax zeroed by 
round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_avx2_blocks8, .-_gcry_chacha20_amd64_avx2_blocks8;) @@ -592,7 +592,7 @@ _gcry_chacha20_poly1305_amd64_avx2_blocks8: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_avx2_blocks8, .-_gcry_chacha20_poly1305_amd64_avx2_blocks8;) diff --git a/cipher/chacha20-amd64-ssse3.S b/cipher/chacha20-amd64-ssse3.S index 9cdb69ae..6c737978 100644 --- a/cipher/chacha20-amd64-ssse3.S +++ b/cipher/chacha20-amd64-ssse3.S @@ -333,7 +333,7 @@ _gcry_chacha20_amd64_ssse3_blocks4: /* eax zeroed by round loop. */ leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks4, .-_gcry_chacha20_amd64_ssse3_blocks4;) @@ -502,7 +502,7 @@ _gcry_chacha20_amd64_ssse3_blocks1: clear(X13); /* eax zeroed by round loop. */ - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_amd64_ssse3_blocks1, .-_gcry_chacha20_amd64_ssse3_blocks1;) @@ -772,7 +772,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks4: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks4, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks4;) @@ -1003,7 +1003,7 @@ _gcry_chacha20_poly1305_amd64_ssse3_blocks1: xorl %eax, %eax; leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_chacha20_poly1305_amd64_ssse3_blocks1, .-_gcry_chacha20_poly1305_amd64_ssse3_blocks1;) diff --git a/cipher/des-amd64.S b/cipher/des-amd64.S index a211dac3..c1bf9f29 100644 --- a/cipher/des-amd64.S +++ b/cipher/des-amd64.S @@ -285,7 +285,7 @@ _gcry_3des_amd64_crypt_block: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_block,.-_gcry_3des_amd64_crypt_block;) @@ -544,7 +544,7 @@ _gcry_3des_amd64_crypt_blk3: final_permutation3(RR, RL); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_crypt_blk3,.-_gcry_3des_amd64_crypt_blk3;) @@ -642,7 +642,7 @@ _gcry_3des_amd64_cbc_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -740,7 +740,7 @@ _gcry_3des_amd64_ctr_enc: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cbc_dec,.-_gcry_3des_amd64_cbc_dec;) @@ -837,7 +837,7 @@ _gcry_3des_amd64_cfb_dec: CFI_POP(%rbp); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_3des_amd64_cfb_dec,.-_gcry_3des_amd64_cfb_dec;) diff --git a/cipher/rijndael-amd64.S b/cipher/rijndael-amd64.S index 3dcaa856..6e3cc819 100644 --- a/cipher/rijndael-amd64.S +++ b/cipher/rijndael-amd64.S @@ -270,7 +270,7 @@ _gcry_aes_amd64_encrypt_block: movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 @@ -448,7 +448,7 @@ _gcry_aes_amd64_decrypt_block: movl $(6 * 8), %eax; EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_RESTORE_STATE(); .align 4 diff --git a/cipher/rijndael-ssse3-amd64-asm.S b/cipher/rijndael-ssse3-amd64-asm.S index 8124eb21..b98dca26 100644 --- a/cipher/rijndael-ssse3-amd64-asm.S +++ b/cipher/rijndael-ssse3-amd64-asm.S @@ -61,7 +61,7 @@ _gcry_aes_ssse3_enc_preload: movdqa .Lk_sb2 (%rax), %xmm15 # sb2u movdqa .Lk_sb2+16(%rax), %xmm14 # sb2t EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_enc_preload,.-_gcry_aes_ssse3_enc_preload) @@ -83,7 +83,7 @@ _gcry_aes_ssse3_dec_preload: movdqa .Lk_dsbb (%rax), %xmm14 # sbbu movdqa .Lk_dsbe 
(%rax), %xmm8 # sbeu EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_dec_preload,.-_gcry_aes_ssse3_dec_preload) @@ -194,7 +194,7 @@ _aes_encrypt_core: pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_encrypt_core,.-_aes_encrypt_core) @@ -303,7 +303,7 @@ _aes_decrypt_core: pxor %xmm4, %xmm0 # 0 = A pshufb .Lk_sr(%rsi,%rcx), %xmm0 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _aes_decrypt_core,.-_aes_decrypt_core) @@ -439,7 +439,7 @@ _aes_schedule_core: pxor %xmm6, %xmm0 # -> b+c+d b+c b a pshufd $0x0E, %xmm0, %xmm6 pslldq $8, %xmm6 # clobber low side with zeros - ret + ret_spec_stop ## ## .Laes_schedule_256 @@ -546,7 +546,7 @@ _aes_schedule_core: # add in smeared stuff pxor %xmm7, %xmm0 movdqa %xmm0, %xmm7 - ret + ret_spec_stop ## ## .Laes_schedule_transform @@ -567,7 +567,7 @@ _aes_schedule_core: movdqa 16(%r11), %xmm0 # hi pshufb %xmm1, %xmm0 pxor %xmm2, %xmm0 - ret + ret_spec_stop ## ## .Laes_schedule_mangle @@ -639,7 +639,7 @@ _aes_schedule_core: add $-16, %r8 and $48, %r8 movdqa %xmm3, (%rdx) - ret + ret_spec_stop ## ## .Laes_schedule_mangle_last @@ -679,7 +679,7 @@ _aes_schedule_core: pxor %xmm7, %xmm7 pxor %xmm8, %xmm8 EXIT_SYSV_FUNC - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_aes_ssse3_schedule_core,.-_gcry_aes_ssse3_schedule_core) diff --git a/cipher/rijndael-vaes-avx2-amd64.S b/cipher/rijndael-vaes-avx2-amd64.S index d4ecf59f..f94b58db 100644 --- a/cipher/rijndael-vaes-avx2-amd64.S +++ b/cipher/rijndael-vaes-avx2-amd64.S @@ -383,7 +383,7 @@ _gcry_vaes_avx2_cbc_dec_amd64: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cbc_dec_amd64,.-_gcry_vaes_avx2_cbc_dec_amd64) @@ -691,7 +691,7 @@ _gcry_vaes_avx2_cfb_dec_amd64: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_cfb_dec_amd64,.-_gcry_vaes_avx2_cfb_dec_amd64) @@ -1103,7 +1103,7 @@ _gcry_vaes_avx2_ctr_enc_amd64: vzeroall; xorl %r10d, %r10d; xorl %r11d, %r11d; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr_enc_amd64,.-_gcry_vaes_avx2_ctr_enc_amd64) @@ -1387,7 +1387,7 @@ _gcry_vaes_avx2_ctr32le_enc_amd64: .Ldone_ctr32le_enc: vmovdqu %xmm15, (%rsi); vzeroall; - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ctr32le_enc_amd64,.-_gcry_vaes_avx2_ctr32le_enc_amd64) @@ -1535,7 +1535,7 @@ _gcry_vaes_avx2_ocb_checksum: .Locb_checksum_done: vpxor (%rax), %xmm0, %xmm0; vmovdqu %xmm0, (%rax); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_ocb_checksum,.-_gcry_vaes_avx2_ocb_checksum) @@ -2398,7 +2398,7 @@ _gcry_vaes_avx2_ocb_crypt_amd64: leave; CFI_LEAVE(); - ret + ret_spec_stop #undef STACK_REGS_POS #undef STACK_ALLOC @@ -2919,7 +2919,7 @@ _gcry_vaes_avx2_xts_crypt_amd64: vzeroall; xorl %eax, %eax - ret + ret_spec_stop CFI_ENDPROC(); ELF(.size _gcry_vaes_avx2_xts_crypt_amd64,.-_gcry_vaes_avx2_xts_crypt_amd64) diff --git a/cipher/salsa20-amd64.S b/cipher/salsa20-amd64.S index ae8f2715..64626063 100644 --- a/cipher/salsa20-amd64.S +++ b/cipher/salsa20-amd64.S @@ -83,7 +83,7 @@ _gcry_salsa20_amd64_keysetup: movl %ecx,8(%rdi) movl %r8d,12(%rdi) .L_keysetupdone: - ret + ret_spec_stop CFI_ENDPROC(); .align 8 @@ -99,7 +99,7 @@ _gcry_salsa20_amd64_ivsetup: movl %esi,44(%rdi) movl %r9d,32(%rdi) movl %eax,52(%rdi) - ret + ret_spec_stop CFI_ENDPROC(); .align 8 @@ -926,7 +926,7 @@ _gcry_salsa20_amd64_encrypt_blocks: CFI_DEF_CFA_REGISTER(%rsp) pop %rbx CFI_POP(%rbx) - ret + 
ret_spec_stop CFI_RESTORE_STATE(); .L_bytes_are_128_or_192: sub $64,%rdx diff --git a/cipher/serpent-avx2-amd64.S b/cipher/serpent-avx2-amd64.S index dcee9b62..d3515a21 100644 --- a/cipher/serpent-avx2-amd64.S +++ b/cipher/serpent-avx2-amd64.S @@ -487,7 +487,7 @@ __serpent_enc_blk16: transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk16,.-__serpent_enc_blk16;) @@ -579,7 +579,7 @@ __serpent_dec_blk16: transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk16,.-__serpent_dec_blk16;) @@ -697,7 +697,7 @@ _gcry_serpent_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ctr_enc,.-_gcry_serpent_avx2_ctr_enc;) @@ -750,7 +750,7 @@ _gcry_serpent_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cbc_dec,.-_gcry_serpent_avx2_cbc_dec;) @@ -805,7 +805,7 @@ _gcry_serpent_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_cfb_dec,.-_gcry_serpent_avx2_cfb_dec;) @@ -919,7 +919,7 @@ _gcry_serpent_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_enc,.-_gcry_serpent_avx2_ocb_enc;) @@ -1043,7 +1043,7 @@ _gcry_serpent_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_dec,.-_gcry_serpent_avx2_ocb_dec;) @@ -1146,7 +1146,7 @@ _gcry_serpent_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_avx2_ocb_auth,.-_gcry_serpent_avx2_ocb_auth;) diff --git a/cipher/serpent-sse2-amd64.S b/cipher/serpent-sse2-amd64.S index 39cba002..b5935095 100644 --- a/cipher/serpent-sse2-amd64.S +++ b/cipher/serpent-sse2-amd64.S @@ -509,7 +509,7 @@ __serpent_enc_blk8: transpose_4x4(RA4, RA1, RA2, RA0, RA3, RTMP0, RTMP1); transpose_4x4(RB4, RB1, RB2, RB0, RB3, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_enc_blk8,.-__serpent_enc_blk8;) @@ -601,7 +601,7 @@ __serpent_dec_blk8: transpose_4x4(RA0, RA1, RA2, RA3, RA4, RTMP0, RTMP1); transpose_4x4(RB0, RB1, RB2, RB3, RB4, RTMP0, RTMP1); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __serpent_dec_blk8,.-__serpent_dec_blk8;) @@ -733,7 +733,7 @@ _gcry_serpent_sse2_ctr_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ctr_enc,.-_gcry_serpent_sse2_ctr_enc;) @@ -796,7 +796,7 @@ _gcry_serpent_sse2_cbc_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cbc_dec,.-_gcry_serpent_sse2_cbc_dec;) @@ -862,7 +862,7 @@ _gcry_serpent_sse2_cfb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_cfb_dec,.-_gcry_serpent_sse2_cfb_dec;) @@ -976,7 +976,7 @@ _gcry_serpent_sse2_ocb_enc: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_enc,.-_gcry_serpent_sse2_ocb_enc;) @@ -1100,7 +1100,7 @@ _gcry_serpent_sse2_ocb_dec: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_dec,.-_gcry_serpent_sse2_ocb_dec;) @@ -1203,7 +1203,7 @@ _gcry_serpent_sse2_ocb_auth: pxor RTMP2, RTMP2; pxor RNOT, RNOT; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_serpent_sse2_ocb_auth,.-_gcry_serpent_sse2_ocb_auth;) diff --git 
a/cipher/sha1-avx-amd64.S b/cipher/sha1-avx-amd64.S index 85876ad4..acada960 100644 --- a/cipher/sha1-avx-amd64.S +++ b/cipher/sha1-avx-amd64.S @@ -420,7 +420,7 @@ _gcry_sha1_transform_amd64_avx: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx, .-_gcry_sha1_transform_amd64_avx;) diff --git a/cipher/sha1-avx-bmi2-amd64.S b/cipher/sha1-avx-bmi2-amd64.S index 5dfcdca9..5f4b9e69 100644 --- a/cipher/sha1-avx-bmi2-amd64.S +++ b/cipher/sha1-avx-bmi2-amd64.S @@ -432,7 +432,7 @@ _gcry_sha1_transform_amd64_avx_bmi2: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx_bmi2, .-_gcry_sha1_transform_amd64_avx_bmi2;) diff --git a/cipher/sha1-avx2-bmi2-amd64.S b/cipher/sha1-avx2-bmi2-amd64.S index 93863230..ed52761b 100644 --- a/cipher/sha1-avx2-bmi2-amd64.S +++ b/cipher/sha1-avx2-bmi2-amd64.S @@ -564,7 +564,7 @@ _gcry_sha1_transform_amd64_avx2_bmi2: /* stack already burned */ xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_avx2_bmi2, .-_gcry_sha1_transform_amd64_avx2_bmi2;) diff --git a/cipher/sha1-ssse3-amd64.S b/cipher/sha1-ssse3-amd64.S index db62928a..f09b1de1 100644 --- a/cipher/sha1-ssse3-amd64.S +++ b/cipher/sha1-ssse3-amd64.S @@ -428,7 +428,7 @@ _gcry_sha1_transform_amd64_ssse3: xorl %eax, %eax; .Lret: - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sha1_transform_amd64_ssse3, .-_gcry_sha1_transform_amd64_ssse3;) diff --git a/cipher/sha256-avx-amd64.S b/cipher/sha256-avx-amd64.S index ec945f84..be8a799d 100644 --- a/cipher/sha256-avx-amd64.S +++ b/cipher/sha256-avx-amd64.S @@ -471,7 +471,7 @@ _gcry_sha256_transform_amd64_avx: pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/cipher/sha256-avx2-bmi2-amd64.S b/cipher/sha256-avx2-bmi2-amd64.S index d130dd4a..60ad442c 100644 --- a/cipher/sha256-avx2-bmi2-amd64.S +++ b/cipher/sha256-avx2-bmi2-amd64.S @@ -474,7 +474,7 @@ _gcry_sha256_transform_amd64_avx2: CFI_POP(rbx) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() .align 64 diff --git a/cipher/sha256-ssse3-amd64.S b/cipher/sha256-ssse3-amd64.S index 098b0eb6..401ff6f4 100644 --- a/cipher/sha256-ssse3-amd64.S +++ b/cipher/sha256-ssse3-amd64.S @@ -493,7 +493,7 @@ _gcry_sha256_transform_amd64_ssse3: pop rbx CFI_POP(rbx) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/cipher/sha512-avx-amd64.S b/cipher/sha512-avx-amd64.S index 75f7b070..bfc4435d 100644 --- a/cipher/sha512-avx-amd64.S +++ b/cipher/sha512-avx-amd64.S @@ -400,7 +400,7 @@ _gcry_sha512_transform_amd64_avx: CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* diff --git a/cipher/sha512-avx2-bmi2-amd64.S b/cipher/sha512-avx2-bmi2-amd64.S index 7f119e6c..a431e196 100644 --- a/cipher/sha512-avx2-bmi2-amd64.S +++ b/cipher/sha512-avx2-bmi2-amd64.S @@ -439,7 +439,7 @@ _gcry_sha512_transform_amd64_avx2: CFI_DEF_CFA_REGISTER(rsp) .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /*;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; */ diff --git a/cipher/sha512-ssse3-amd64.S b/cipher/sha512-ssse3-amd64.S index 6a1328a6..9cc30892 100644 --- a/cipher/sha512-ssse3-amd64.S +++ b/cipher/sha512-ssse3-amd64.S @@ -406,7 +406,7 @@ _gcry_sha512_transform_amd64_ssse3: CFI_ADJUST_CFA_OFFSET(-frame_size); .Lnowork: - ret + ret_spec_stop CFI_ENDPROC() /* diff --git a/cipher/sm3-avx-bmi2-amd64.S b/cipher/sm3-avx-bmi2-amd64.S index 46226ae6..d9b6206a 100644 --- a/cipher/sm3-avx-bmi2-amd64.S +++ b/cipher/sm3-avx-bmi2-amd64.S @@ -544,7 +544,7 @@ 
_gcry_sm3_transform_amd64_avx_bmi2: leave; CFI_LEAVE(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm3_transform_amd64_avx_bmi2, .-_gcry_sm3_transform_amd64_avx_bmi2;) diff --git a/cipher/sm4-aesni-avx-amd64.S b/cipher/sm4-aesni-avx-amd64.S index 3610b98c..7a99e070 100644 --- a/cipher/sm4-aesni-avx-amd64.S +++ b/cipher/sm4-aesni-avx-amd64.S @@ -240,7 +240,7 @@ _gcry_sm4_aesni_avx_expand_key: #undef ROUND vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_expand_key,.-_gcry_sm4_aesni_avx_expand_key;) @@ -345,7 +345,7 @@ sm4_aesni_avx_crypt_blk1_4: .Lblk4_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size sm4_aesni_avx_crypt_blk1_4,.-sm4_aesni_avx_crypt_blk1_4;) @@ -454,7 +454,7 @@ __sm4_crypt_blk8: vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk8,.-__sm4_crypt_blk8;) @@ -508,7 +508,7 @@ _gcry_sm4_aesni_avx_crypt_blk1_8: .Lblk8_store_output_done: vzeroall; xorl %eax, %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_crypt_blk1_8,.-_gcry_sm4_aesni_avx_crypt_blk1_8;) @@ -582,7 +582,7 @@ _gcry_sm4_aesni_avx_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ctr_enc,.-_gcry_sm4_aesni_avx_ctr_enc;) @@ -631,7 +631,7 @@ _gcry_sm4_aesni_avx_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cbc_dec,.-_gcry_sm4_aesni_avx_cbc_dec;) @@ -683,7 +683,7 @@ _gcry_sm4_aesni_avx_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_cfb_dec,.-_gcry_sm4_aesni_avx_cfb_dec;) @@ -782,7 +782,7 @@ _gcry_sm4_aesni_avx_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_enc,.-_gcry_sm4_aesni_avx_ocb_enc;) @@ -891,7 +891,7 @@ _gcry_sm4_aesni_avx_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_dec,.-_gcry_sm4_aesni_avx_ocb_dec;) @@ -979,7 +979,7 @@ _gcry_sm4_aesni_avx_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx_ocb_auth,.-_gcry_sm4_aesni_avx_ocb_auth;) diff --git a/cipher/sm4-aesni-avx2-amd64.S b/cipher/sm4-aesni-avx2-amd64.S index 6e46c0dc..7a8b9558 100644 --- a/cipher/sm4-aesni-avx2-amd64.S +++ b/cipher/sm4-aesni-avx2-amd64.S @@ -276,7 +276,7 @@ __sm4_crypt_blk16: vpshufb RTMP2, RB2, RB2; vpshufb RTMP2, RB3, RB3; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __sm4_crypt_blk16,.-__sm4_crypt_blk16;) @@ -394,7 +394,7 @@ _gcry_sm4_aesni_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ctr_enc,.-_gcry_sm4_aesni_avx2_ctr_enc;) @@ -447,7 +447,7 @@ _gcry_sm4_aesni_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cbc_dec,.-_gcry_sm4_aesni_avx2_cbc_dec;) @@ -502,7 +502,7 @@ _gcry_sm4_aesni_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_cfb_dec,.-_gcry_sm4_aesni_avx2_cfb_dec;) @@ -616,7 +616,7 @@ _gcry_sm4_aesni_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_enc,.-_gcry_sm4_aesni_avx2_ocb_enc;) @@ -740,7 +740,7 @@ _gcry_sm4_aesni_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_dec,.-_gcry_sm4_aesni_avx2_ocb_dec;) @@ -843,7 +843,7 @@ _gcry_sm4_aesni_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_sm4_aesni_avx2_ocb_auth,.-_gcry_sm4_aesni_avx2_ocb_auth;) 
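The hunks in this patch all apply the same mechanical change: every function return in the
hand-written assembly goes through the ret_spec_stop macro instead of a bare ret, so a CPU
that speculates straight past the return executes a harmless speculation stop rather than
whatever bytes happen to follow the function. As a minimal illustrative sketch only, reusing
the i386 expansion added in the mpi/i386/syntax.h hunk later in this patch and a hypothetical
function name (the amd64 code takes its ret_spec_stop definition from cipher/asm-common-amd64.h,
whose expansion is not shown in these hunks):

    /* Expansion copied from the mpi/i386/syntax.h hunk later in this patch. */
    #define ret_spec_stop \
            ret; \
            jmp .; \
            int3;

            .text
            .globl  example_return          /* hypothetical function, for illustration only */
    example_return:
            xorl    %eax, %eax              /* return 0 */
            ret_spec_stop                   /* was: ret; the jmp-to-self and int3 are never
                                               reached architecturally, they only catch
                                               straight-line speculation past the ret */
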
diff --git a/cipher/twofish-amd64.S b/cipher/twofish-amd64.S index 3cb73431..a7a60553 100644 --- a/cipher/twofish-amd64.S +++ b/cipher/twofish-amd64.S @@ -211,7 +211,7 @@ _gcry_twofish_amd64_encrypt_block: CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -265,7 +265,7 @@ _gcry_twofish_amd64_decrypt_block: CFI_ADJUST_CFA_OFFSET(-3 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_encrypt_block,.-_gcry_twofish_amd64_encrypt_block;) @@ -511,7 +511,7 @@ __twofish_enc_blk3: outunpack_enc3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk3,.-__twofish_enc_blk3;) @@ -540,7 +540,7 @@ __twofish_dec_blk3: outunpack_dec3(); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk3,.-__twofish_dec_blk3;) @@ -641,7 +641,7 @@ _gcry_twofish_amd64_ctr_enc: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ctr_enc,.-_gcry_twofish_amd64_ctr_enc;) @@ -726,7 +726,7 @@ _gcry_twofish_amd64_cbc_dec: CFI_ADJUST_CFA_OFFSET(-9 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cbc_dec,.-_gcry_twofish_amd64_cbc_dec;) @@ -811,7 +811,7 @@ _gcry_twofish_amd64_cfb_dec: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_cfb_dec,.-_gcry_twofish_amd64_cfb_dec;) @@ -937,7 +937,7 @@ _gcry_twofish_amd64_ocb_enc: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_enc,.-_gcry_twofish_amd64_ocb_enc;) @@ -1071,7 +1071,7 @@ _gcry_twofish_amd64_ocb_dec: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_dec,.-_gcry_twofish_amd64_ocb_dec;) @@ -1176,7 +1176,7 @@ _gcry_twofish_amd64_ocb_auth: CFI_ADJUST_CFA_OFFSET(-8 * 8); EXIT_SYSV_FUNC - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_amd64_ocb_auth,.-_gcry_twofish_amd64_ocb_auth;) diff --git a/cipher/twofish-avx2-amd64.S b/cipher/twofish-avx2-amd64.S index 74cad355..930ac792 100644 --- a/cipher/twofish-avx2-amd64.S +++ b/cipher/twofish-avx2-amd64.S @@ -431,7 +431,7 @@ __twofish_enc_blk16: outunpack_enc16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_enc_blk16,.-__twofish_enc_blk16;) @@ -464,7 +464,7 @@ __twofish_dec_blk16: outunpack_dec16(RA, RB, RC, RD); transpose4x4_16(RA, RB, RC, RD); - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size __twofish_dec_blk16,.-__twofish_dec_blk16;) @@ -582,7 +582,7 @@ _gcry_twofish_avx2_ctr_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ctr_enc,.-_gcry_twofish_avx2_ctr_enc;) @@ -635,7 +635,7 @@ _gcry_twofish_avx2_cbc_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cbc_dec,.-_gcry_twofish_avx2_cbc_dec;) @@ -690,7 +690,7 @@ _gcry_twofish_avx2_cfb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_cfb_dec,.-_gcry_twofish_avx2_cfb_dec;) @@ -804,7 +804,7 @@ _gcry_twofish_avx2_ocb_enc: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_enc,.-_gcry_twofish_avx2_ocb_enc;) @@ -929,7 +929,7 @@ _gcry_twofish_avx2_ocb_dec: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_dec,.-_gcry_twofish_avx2_ocb_dec;) @@ -1032,7 +1032,7 @@ 
_gcry_twofish_avx2_ocb_auth: vzeroall; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_twofish_avx2_ocb_auth,.-_gcry_twofish_avx2_ocb_auth;) diff --git a/cipher/whirlpool-sse2-amd64.S b/cipher/whirlpool-sse2-amd64.S index 5631dc56..37648faa 100644 --- a/cipher/whirlpool-sse2-amd64.S +++ b/cipher/whirlpool-sse2-amd64.S @@ -340,7 +340,7 @@ _gcry_whirlpool_transform_amd64: CFI_ADJUST_CFA_OFFSET(-STACK_MAX); .Lskip: movl $(STACK_MAX + 8), %eax; - ret; + ret_spec_stop; CFI_ENDPROC(); ELF(.size _gcry_whirlpool_transform_amd64,.-_gcry_whirlpool_transform_amd64;) diff --git a/mpi/amd64/func_abi.h b/mpi/amd64/func_abi.h index a60363e4..c3f2d026 100644 --- a/mpi/amd64/func_abi.h +++ b/mpi/amd64/func_abi.h @@ -1,28 +1,6 @@ #include -#ifdef __x86_64__ -#ifdef HAVE_GCC_ASM_CFI_DIRECTIVES -# define CFI_STARTPROC() .cfi_startproc -# define CFI_ENDPROC() .cfi_endproc -# define CFI_ADJUST_CFA_OFFSET(off) .cfi_adjust_cfa_offset off -# define CFI_REL_OFFSET(reg,off) .cfi_rel_offset reg, off -# define CFI_RESTORE(reg) .cfi_restore reg - -# define CFI_PUSH(reg) \ - CFI_ADJUST_CFA_OFFSET(8); CFI_REL_OFFSET(reg, 0) -# define CFI_POP(reg) \ - CFI_ADJUST_CFA_OFFSET(-8); CFI_RESTORE(reg) -#else -# define CFI_STARTPROC() -# define CFI_ENDPROC() -# define CFI_ADJUST_CFA_OFFSET(off) -# define CFI_REL_OFFSET(reg,off) -# define CFI_RESTORE(reg) - -# define CFI_PUSH(reg) -# define CFI_POP(reg) -#endif -#endif +#include "asm-common-amd64.h" #ifdef USE_MS_ABI /* Store registers and move four first input arguments from MS ABI to @@ -44,13 +22,13 @@ CFI_POP(%rdi); \ popq %rsi; \ CFI_POP(%rsi); \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #else #define FUNC_ENTRY() \ CFI_STARTPROC(); #define FUNC_EXIT() \ - ret; \ + ret_spec_stop; \ CFI_ENDPROC(); #endif diff --git a/mpi/asm-common-amd64.h b/mpi/asm-common-amd64.h new file mode 100644 index 00000000..ad0e8e62 --- /dev/null +++ b/mpi/asm-common-amd64.h @@ -0,0 +1,26 @@ +/* asm-common-amd64.h - Common macros for AMD64 assembly + * + * Copyright (C) 2022 Jussi Kivilinna + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . 
+ */ + +#ifndef MPI_ASM_COMMON_AMD64_H +#define MPI_ASM_COMMON_AMD64_H + +#include "../cipher/asm-common-amd64.h" + +#endif /* MPI_ASM_COMMON_AMD64_H */ diff --git a/mpi/i386/mpih-add1.S b/mpi/i386/mpih-add1.S index de78a0cb..95a75890 100644 --- a/mpi/i386/mpih-add1.S +++ b/mpi/i386/mpih-add1.S @@ -156,6 +156,6 @@ Loop: movl (%esi),%eax CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-lshift.S b/mpi/i386/mpih-lshift.S index 55da0678..3404cf55 100644 --- a/mpi/i386/mpih-lshift.S +++ b/mpi/i386/mpih-lshift.S @@ -86,7 +86,7 @@ L1: movl (%esi,%edx,4),%eax popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend: shll %cl,%ebx /* compute least significant limb */ movl %ebx,(%edi) /* store it */ @@ -97,6 +97,6 @@ Lend: shll %cl,%ebx /* compute least significant limb */ CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul1.S b/mpi/i386/mpih-mul1.S index 9679ea62..a672d052 100644 --- a/mpi/i386/mpih-mul1.S +++ b/mpi/i386/mpih-mul1.S @@ -89,6 +89,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul2.S b/mpi/i386/mpih-mul2.S index fe4129c4..e09c3f7c 100644 --- a/mpi/i386/mpih-mul2.S +++ b/mpi/i386/mpih-mul2.S @@ -91,6 +91,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-mul3.S b/mpi/i386/mpih-mul3.S index 87577d54..4112c699 100644 --- a/mpi/i386/mpih-mul3.S +++ b/mpi/i386/mpih-mul3.S @@ -91,6 +91,6 @@ Loop: CFI_POP(%esi) INSN1(pop,l ,R(edi)) CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-rshift.S b/mpi/i386/mpih-rshift.S index 35a8201f..5d34696c 100644 --- a/mpi/i386/mpih-rshift.S +++ b/mpi/i386/mpih-rshift.S @@ -89,7 +89,7 @@ L2: movl (%esi,%edx,4),%eax popl %ebx popl %esi popl %edi - ret + ret_spec_stop Lend2: shrl %cl,%ebx /* compute most significant limb */ movl %ebx,(%edi) /* store it */ @@ -100,6 +100,6 @@ Lend2: shrl %cl,%ebx /* compute most significant limb */ CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/mpih-sub1.S b/mpi/i386/mpih-sub1.S index 2bdc1438..49477ae3 100644 --- a/mpi/i386/mpih-sub1.S +++ b/mpi/i386/mpih-sub1.S @@ -157,6 +157,6 @@ Loop: movl (%esi),%eax CFI_POP(%esi) popl %edi CFI_POP(%edi) - ret + ret_spec_stop CFI_ENDPROC() diff --git a/mpi/i386/syntax.h b/mpi/i386/syntax.h index dd300319..bab2d4a6 100644 --- a/mpi/i386/syntax.h +++ b/mpi/i386/syntax.h @@ -92,3 +92,9 @@ #undef ALIGN #define ALIGN(log) .align log,0x90 #endif + +/* 'ret' instruction replacement for straight-line speculation mitigation */ +#define ret_spec_stop \ + ret; \ + jmp .; \ + int3; -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 21:13:38 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 22:13:38 +0200 Subject: [PATCH 3/4] mpi: remove unused i586 and pentium4 assembly In-Reply-To: <20220108201339.360118-1-jussi.kivilinna@iki.fi> References: <20220108201339.360118-1-jussi.kivilinna@iki.fi> Message-ID: <20220108201339.360118-3-jussi.kivilinna@iki.fi> * mpi/config.links: Remove 'i586' from paths. * mpi/i586*: Remove. * mpi/pentium4/*: Remove. -- Current x86 targets (i686) have been defaulting on mpi/i386 assembly for quite some time now. Remove mpi/i586 as it is no longer used. While at it, remove mpi/pentium4 assembly also as obsolete. 
Signed-off-by: Jussi Kivilinna --- mpi/config.links | 8 +- mpi/i586/README | 26 -- mpi/i586/distfiles | 9 - mpi/i586/mpih-add1.S | 135 ---------- mpi/i586/mpih-lshift.S | 229 ----------------- mpi/i586/mpih-mul1.S | 89 ------- mpi/i586/mpih-mul2.S | 93 ------- mpi/i586/mpih-mul3.S | 93 ------- mpi/i586/mpih-rshift.S | 228 ---------------- mpi/i586/mpih-sub1.S | 142 ---------- mpi/pentium4/README | 115 --------- mpi/pentium4/distfiles | 3 - mpi/pentium4/mmx/distfiles | 2 - mpi/pentium4/mmx/mpih-lshift.S | 457 --------------------------------- mpi/pentium4/mmx/mpih-rshift.S | 453 -------------------------------- mpi/pentium4/sse2/distfiles | 5 - mpi/pentium4/sse2/mpih-add1.S | 91 ------- mpi/pentium4/sse2/mpih-mul1.S | 96 ------- mpi/pentium4/sse2/mpih-mul2.S | 136 ---------- mpi/pentium4/sse2/mpih-mul3.S | 127 --------- mpi/pentium4/sse2/mpih-sub1.S | 112 -------- 21 files changed, 4 insertions(+), 2645 deletions(-) delete mode 100644 mpi/i586/README delete mode 100644 mpi/i586/distfiles delete mode 100644 mpi/i586/mpih-add1.S delete mode 100644 mpi/i586/mpih-lshift.S delete mode 100644 mpi/i586/mpih-mul1.S delete mode 100644 mpi/i586/mpih-mul2.S delete mode 100644 mpi/i586/mpih-mul3.S delete mode 100644 mpi/i586/mpih-rshift.S delete mode 100644 mpi/i586/mpih-sub1.S delete mode 100644 mpi/pentium4/README delete mode 100644 mpi/pentium4/distfiles delete mode 100644 mpi/pentium4/mmx/distfiles delete mode 100644 mpi/pentium4/mmx/mpih-lshift.S delete mode 100644 mpi/pentium4/mmx/mpih-rshift.S delete mode 100644 mpi/pentium4/sse2/distfiles delete mode 100644 mpi/pentium4/sse2/mpih-add1.S delete mode 100644 mpi/pentium4/sse2/mpih-mul1.S delete mode 100644 mpi/pentium4/sse2/mpih-mul2.S delete mode 100644 mpi/pentium4/sse2/mpih-mul3.S delete mode 100644 mpi/pentium4/sse2/mpih-sub1.S diff --git a/mpi/config.links b/mpi/config.links index e4fc4fc4..deb98bf0 100644 --- a/mpi/config.links +++ b/mpi/config.links @@ -73,7 +73,7 @@ case "${host}" in pentiumpro-*-netbsd*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[34]86*-*-bsdi4*) @@ -97,7 +97,7 @@ case "${host}" in echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h echo '#define X86_BROKEN_ALIGN' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[3467]86*-msdosdjgpp* | \ @@ -111,7 +111,7 @@ case "${host}" in i[567]86*-apple-darwin*) echo '#define BSD_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; i[3467]86*-*-*) @@ -125,7 +125,7 @@ case "${host}" in pentiumpro-*-*) echo '#define ELF_SYNTAX' >>./mpi/asm-syntax.h cat $srcdir/mpi/i386/syntax.h >>./mpi/asm-syntax.h - path="i586 i386" + path="i386" mpi_cpu_arch="x86" ;; x86_64-apple-darwin*) diff --git a/mpi/i586/README b/mpi/i586/README deleted file mode 100644 index d73b0826..00000000 --- a/mpi/i586/README +++ /dev/null @@ -1,26 +0,0 @@ -This directory contains mpn functions optimized for Intel Pentium -processors. - -RELEVANT OPTIMIZATION ISSUES - -1. Pentium doesn't allocate cache lines on writes, unlike most other modern -processors. Since the functions in the mpn class do array writes, we have to -handle allocating the destination cache lines by reading a word from it in the -loops, to achieve the best performance. - -2. Pairing of memory operations requires that the two issued operations refer -to different cache banks. 
The simplest way to insure this is to read/write -two words from the same object. If we make operations on different objects, -they might or might not be to the same cache bank. - -STATUS - -1. mpn_lshift and mpn_rshift run at about 6 cycles/limb, but the Pentium -documentation indicates that they should take only 43/8 = 5.375 cycles/limb, -or 5 cycles/limb asymptotically. - -2. mpn_add_n and mpn_sub_n run at asymptotically 2 cycles/limb. Due to loop -overhead and other delays (cache refill?), they run at or near 2.5 cycles/limb. - -3. mpn_mul_1, mpn_addmul_1, mpn_submul_1 all run 1 cycle faster than they -should... diff --git a/mpi/i586/distfiles b/mpi/i586/distfiles deleted file mode 100644 index 8f821fbf..00000000 --- a/mpi/i586/distfiles +++ /dev/null @@ -1,9 +0,0 @@ -mpih-add1.S -mpih-mul1.S -mpih-mul2.S -mpih-mul3.S -mpih-lshift.S -mpih-rshift.S -mpih-sub1.S -README - diff --git a/mpi/i586/mpih-add1.S b/mpi/i586/mpih-add1.S deleted file mode 100644 index 7436d592..00000000 --- a/mpi/i586/mpih-add1.S +++ /dev/null @@ -1,135 +0,0 @@ -/* i80586 add_n -- Add two limb vectors of the same length > 0 and store - * sum in a third limb vector. - * - * Copyright (C) 1992, 1994, 1995, 1996, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. 
- * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_add_n) -C_SYMBOL_NAME(_gcry_mpih_add_n:) - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s1_ptr */ - movl 28(%esp),%ebp /* s2_ptr */ - movl 32(%esp),%ecx /* size */ - - movl (%ebp),%ebx - - decl %ecx - movl %ecx,%edx - shrl $3,%ecx - andl $7,%edx - testl %ecx,%ecx /* zero carry flag */ - jz Lend - pushl %edx - - ALIGN (3) -Loop: movl 28(%edi),%eax /* fetch destination cache line */ - leal 32(%edi),%edi - -L1: movl (%esi),%eax - movl 4(%esi),%edx - adcl %ebx,%eax - movl 4(%ebp),%ebx - adcl %ebx,%edx - movl 8(%ebp),%ebx - movl %eax,-32(%edi) - movl %edx,-28(%edi) - -L2: movl 8(%esi),%eax - movl 12(%esi),%edx - adcl %ebx,%eax - movl 12(%ebp),%ebx - adcl %ebx,%edx - movl 16(%ebp),%ebx - movl %eax,-24(%edi) - movl %edx,-20(%edi) - -L3: movl 16(%esi),%eax - movl 20(%esi),%edx - adcl %ebx,%eax - movl 20(%ebp),%ebx - adcl %ebx,%edx - movl 24(%ebp),%ebx - movl %eax,-16(%edi) - movl %edx,-12(%edi) - -L4: movl 24(%esi),%eax - movl 28(%esi),%edx - adcl %ebx,%eax - movl 28(%ebp),%ebx - adcl %ebx,%edx - movl 32(%ebp),%ebx - movl %eax,-8(%edi) - movl %edx,-4(%edi) - - leal 32(%esi),%esi - leal 32(%ebp),%ebp - decl %ecx - jnz Loop - - popl %edx -Lend: - decl %edx /* test %edx w/o clobbering carry */ - js Lend2 - incl %edx -Loop2: - leal 4(%edi),%edi - movl (%esi),%eax - adcl %ebx,%eax - movl 4(%ebp),%ebx - movl %eax,-4(%edi) - leal 4(%esi),%esi - leal 4(%ebp),%ebp - decl %edx - jnz Loop2 -Lend2: - movl (%esi),%eax - adcl %ebx,%eax - movl %eax,(%edi) - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - - diff --git a/mpi/i586/mpih-lshift.S b/mpi/i586/mpih-lshift.S deleted file mode 100644 index 9d25fe9d..00000000 --- a/mpi/i586/mpih-lshift.S +++ /dev/null @@ -1,229 +0,0 @@ -/* i80586 lshift - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_lshift) -C_SYMBOL_NAME(_gcry_mpih_lshift:) - - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s_ptr */ - movl 28(%esp),%ebp /* size */ - movl 32(%esp),%ecx /* cnt */ - -/* We can use faster code for shift-by-1 under certain conditions. */ - cmp $1,%ecx - jne Lnormal - leal 4(%esi),%eax - cmpl %edi,%eax - jnc Lspecial /* jump if s_ptr + 1 >= res_ptr */ - leal (%esi,%ebp,4),%eax - cmpl %eax,%edi - jnc Lspecial /* jump if res_ptr >= s_ptr + size */ - -Lnormal: - leal -4(%edi,%ebp,4),%edi - leal -4(%esi,%ebp,4),%esi - - movl (%esi),%edx - subl $4,%esi - xorl %eax,%eax - shldl %cl,%edx,%eax /* compute carry limb */ - pushl %eax /* push carry limb onto stack */ - - decl %ebp - pushl %ebp - shrl $3,%ebp - jz Lend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -Loop: movl -28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl -4(%esi),%edx - shldl %cl,%eax,%ebx - shldl %cl,%edx,%eax - movl %ebx,(%edi) - movl %eax,-4(%edi) - - movl -8(%esi),%ebx - movl -12(%esi),%eax - shldl %cl,%ebx,%edx - shldl %cl,%eax,%ebx - movl %edx,-8(%edi) - movl %ebx,-12(%edi) - - movl -16(%esi),%edx - movl -20(%esi),%ebx - shldl %cl,%edx,%eax - shldl %cl,%ebx,%edx - movl %eax,-16(%edi) - movl %edx,-20(%edi) - - movl -24(%esi),%eax - movl -28(%esi),%edx - shldl %cl,%eax,%ebx - shldl %cl,%edx,%eax - movl %ebx,-24(%edi) - movl %eax,-28(%edi) - - subl $32,%esi - subl $32,%edi - decl %ebp - jnz Loop - -Lend: popl %ebp - andl $7,%ebp - jz Lend2 -Loop2: movl (%esi),%eax - shldl %cl,%eax,%edx - movl %edx,(%edi) - movl %eax,%edx - subl $4,%esi - subl $4,%edi - decl %ebp - jnz Loop2 - -Lend2: shll %cl,%edx /* compute least significant limb */ - movl %edx,(%edi) /* store it */ - - popl %eax /* pop carry limb */ - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - -/* We loop from least significant end of the arrays, which is only - permissable if the source and destination don't overlap, since the - function is documented to work for overlapping source and destination. 
-*/ - -Lspecial: - movl (%esi),%edx - addl $4,%esi - - decl %ebp - pushl %ebp - shrl $3,%ebp - - addl %edx,%edx - incl %ebp - decl %ebp - jz LLend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -LLoop: movl 28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl 4(%esi),%edx - adcl %eax,%eax - movl %ebx,(%edi) - adcl %edx,%edx - movl %eax,4(%edi) - - movl 8(%esi),%ebx - movl 12(%esi),%eax - adcl %ebx,%ebx - movl %edx,8(%edi) - adcl %eax,%eax - movl %ebx,12(%edi) - - movl 16(%esi),%edx - movl 20(%esi),%ebx - adcl %edx,%edx - movl %eax,16(%edi) - adcl %ebx,%ebx - movl %edx,20(%edi) - - movl 24(%esi),%eax - movl 28(%esi),%edx - adcl %eax,%eax - movl %ebx,24(%edi) - adcl %edx,%edx - movl %eax,28(%edi) - - leal 32(%esi),%esi /* use leal not to clobber carry */ - leal 32(%edi),%edi - decl %ebp - jnz LLoop - -LLend: popl %ebp - sbbl %eax,%eax /* save carry in %eax */ - andl $7,%ebp - jz LLend2 - addl %eax,%eax /* restore carry from eax */ -LLoop2: movl %edx,%ebx - movl (%esi),%edx - adcl %edx,%edx - movl %ebx,(%edi) - - leal 4(%esi),%esi /* use leal not to clobber carry */ - leal 4(%edi),%edi - decl %ebp - jnz LLoop2 - - jmp LL1 -LLend2: addl %eax,%eax /* restore carry from eax */ -LL1: movl %edx,(%edi) /* store last limb */ - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - - diff --git a/mpi/i586/mpih-mul1.S b/mpi/i586/mpih-mul1.S deleted file mode 100644 index 3601d968..00000000 --- a/mpi/i586/mpih-mul1.S +++ /dev/null @@ -1,89 +0,0 @@ -/* i80586 mul_1 -- Multiply a limb vector with a limb and store - * the result in a second limb vector. - * - * Copyright (C) 1992, 1994, 1996, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) -C_SYMBOL_NAME(_gcry_mpih_mul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-mul2.S b/mpi/i586/mpih-mul2.S deleted file mode 100644 index f32d363a..00000000 --- a/mpi/i586/mpih-mul2.S +++ /dev/null @@ -1,93 +0,0 @@ -/* i80586 addmul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) -C_SYMBOL_NAME(_gcry_mpih_addmul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(eax),R(ebx)) - INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) - - INSN2(adc,l ,R(edx),$0) - INSN2(add,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-mul3.S b/mpi/i586/mpih-mul3.S deleted file mode 100644 index fa27d4e1..00000000 --- a/mpi/i586/mpih-mul3.S +++ /dev/null @@ -1,93 +0,0 @@ -/* i80586 submul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - */ - -#define res_ptr edi -#define s1_ptr esi -#define size ecx -#define s2_limb ebp - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) -C_SYMBOL_NAME(_gcry_mpih_submul_1:) - - INSN1(push,l ,R(edi)) - INSN1(push,l ,R(esi)) - INSN1(push,l ,R(ebx)) - INSN1(push,l ,R(ebp)) - - INSN2(mov,l ,R(res_ptr),MEM_DISP(esp,20)) - INSN2(mov,l ,R(s1_ptr),MEM_DISP(esp,24)) - INSN2(mov,l ,R(size),MEM_DISP(esp,28)) - INSN2(mov,l ,R(s2_limb),MEM_DISP(esp,32)) - - INSN2(lea,l ,R(res_ptr),MEM_INDEX(res_ptr,size,4)) - INSN2(lea,l ,R(s1_ptr),MEM_INDEX(s1_ptr,size,4)) - INSN1(neg,l ,R(size)) - INSN2(xor,l ,R(ebx),R(ebx)) - ALIGN (3) - -Loop: INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),MEM_INDEX(s1_ptr,size,4)) - - INSN1(mul,l ,R(s2_limb)) - - INSN2(add,l ,R(eax),R(ebx)) - INSN2(mov,l ,R(ebx),MEM_INDEX(res_ptr,size,4)) - - INSN2(adc,l ,R(edx),$0) - INSN2(sub,l ,R(ebx),R(eax)) - - INSN2(mov,l ,MEM_INDEX(res_ptr,size,4),R(ebx)) - INSN1(inc,l ,R(size)) - - INSN2(mov,l ,R(ebx),R(edx)) - INSN1(jnz, ,Loop) - - INSN2(adc,l ,R(ebx),$0) - INSN2(mov,l ,R(eax),R(ebx)) - INSN1(pop,l ,R(ebp)) - INSN1(pop,l ,R(ebx)) - INSN1(pop,l ,R(esi)) - INSN1(pop,l ,R(edi)) - ret - diff --git a/mpi/i586/mpih-rshift.S b/mpi/i586/mpih-rshift.S deleted file mode 100644 index c661e3d3..00000000 --- a/mpi/i586/mpih-rshift.S +++ /dev/null @@ -1,228 +0,0 @@ -/* i80586 rshift - * - * Copyright (C) 1992, 1994, 1998, - * 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - - -/******************* - * mpi_limb_t - * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_rshift) -C_SYMBOL_NAME(_gcry_mpih_rshift:) - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s_ptr */ - movl 28(%esp),%ebp /* size */ - movl 32(%esp),%ecx /* cnt */ - -/* We can use faster code for shift-by-1 under certain conditions. 
*/ - cmp $1,%ecx - jne Rnormal - leal 4(%edi),%eax - cmpl %esi,%eax - jnc Rspecial /* jump if res_ptr + 1 >= s_ptr */ - leal (%edi,%ebp,4),%eax - cmpl %eax,%esi - jnc Rspecial /* jump if s_ptr >= res_ptr + size */ - -Rnormal: - movl (%esi),%edx - addl $4,%esi - xorl %eax,%eax - shrdl %cl,%edx,%eax /* compute carry limb */ - pushl %eax /* push carry limb onto stack */ - - decl %ebp - pushl %ebp - shrl $3,%ebp - jz Rend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -Roop: movl 28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl 4(%esi),%edx - shrdl %cl,%eax,%ebx - shrdl %cl,%edx,%eax - movl %ebx,(%edi) - movl %eax,4(%edi) - - movl 8(%esi),%ebx - movl 12(%esi),%eax - shrdl %cl,%ebx,%edx - shrdl %cl,%eax,%ebx - movl %edx,8(%edi) - movl %ebx,12(%edi) - - movl 16(%esi),%edx - movl 20(%esi),%ebx - shrdl %cl,%edx,%eax - shrdl %cl,%ebx,%edx - movl %eax,16(%edi) - movl %edx,20(%edi) - - movl 24(%esi),%eax - movl 28(%esi),%edx - shrdl %cl,%eax,%ebx - shrdl %cl,%edx,%eax - movl %ebx,24(%edi) - movl %eax,28(%edi) - - addl $32,%esi - addl $32,%edi - decl %ebp - jnz Roop - -Rend: popl %ebp - andl $7,%ebp - jz Rend2 -Roop2: movl (%esi),%eax - shrdl %cl,%eax,%edx /* compute result limb */ - movl %edx,(%edi) - movl %eax,%edx - addl $4,%esi - addl $4,%edi - decl %ebp - jnz Roop2 - -Rend2: shrl %cl,%edx /* compute most significant limb */ - movl %edx,(%edi) /* store it */ - - popl %eax /* pop carry limb */ - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - -/* We loop from least significant end of the arrays, which is only - permissable if the source and destination don't overlap, since the - function is documented to work for overlapping source and destination. -*/ - -Rspecial: - leal -4(%edi,%ebp,4),%edi - leal -4(%esi,%ebp,4),%esi - - movl (%esi),%edx - subl $4,%esi - - decl %ebp - pushl %ebp - shrl $3,%ebp - - shrl $1,%edx - incl %ebp - decl %ebp - jz RLend - - movl (%edi),%eax /* fetch destination cache line */ - - ALIGN (2) -RLoop: movl -28(%edi),%eax /* fetch destination cache line */ - movl %edx,%ebx - - movl (%esi),%eax - movl -4(%esi),%edx - rcrl $1,%eax - movl %ebx,(%edi) - rcrl $1,%edx - movl %eax,-4(%edi) - - movl -8(%esi),%ebx - movl -12(%esi),%eax - rcrl $1,%ebx - movl %edx,-8(%edi) - rcrl $1,%eax - movl %ebx,-12(%edi) - - movl -16(%esi),%edx - movl -20(%esi),%ebx - rcrl $1,%edx - movl %eax,-16(%edi) - rcrl $1,%ebx - movl %edx,-20(%edi) - - movl -24(%esi),%eax - movl -28(%esi),%edx - rcrl $1,%eax - movl %ebx,-24(%edi) - rcrl $1,%edx - movl %eax,-28(%edi) - - leal -32(%esi),%esi /* use leal not to clobber carry */ - leal -32(%edi),%edi - decl %ebp - jnz RLoop - -RLend: popl %ebp - sbbl %eax,%eax /* save carry in %eax */ - andl $7,%ebp - jz RLend2 - addl %eax,%eax /* restore carry from eax */ -RLoop2: movl %edx,%ebx - movl (%esi),%edx - rcrl $1,%edx - movl %ebx,(%edi) - - leal -4(%esi),%esi /* use leal not to clobber carry */ - leal -4(%edi),%edi - decl %ebp - jnz RLoop2 - - jmp RL1 -RLend2: addl %eax,%eax /* restore carry from eax */ -RL1: movl %edx,(%edi) /* store last limb */ - - movl $0,%eax - rcrl $1,%eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - diff --git a/mpi/i586/mpih-sub1.S b/mpi/i586/mpih-sub1.S deleted file mode 100644 index ef2d5807..00000000 --- a/mpi/i586/mpih-sub1.S +++ /dev/null @@ -1,142 +0,0 @@ -/* i80586 sub_n -- Sub two limb vectors of the same length > 0 and store - * sum in a third limb vector. - * - * Copyright (C) 1992, 1994, 1995, 1998, - * 2001, 2002 Free Software Foundation, Inc. 
- * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - */ - - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) -C_SYMBOL_NAME(_gcry_mpih_sub_n:) - - pushl %edi - pushl %esi - pushl %ebx - pushl %ebp - - movl 20(%esp),%edi /* res_ptr */ - movl 24(%esp),%esi /* s1_ptr */ - movl 28(%esp),%ebp /* s2_ptr */ - movl 32(%esp),%ecx /* size */ - - movl (%ebp),%ebx - - decl %ecx - movl %ecx,%edx - shrl $3,%ecx - andl $7,%edx - testl %ecx,%ecx /* zero carry flag */ - jz Lend - pushl %edx - - ALIGN (3) -Loop: movl 28(%edi),%eax /* fetch destination cache line */ - leal 32(%edi),%edi - -L1: movl (%esi),%eax - movl 4(%esi),%edx - sbbl %ebx,%eax - movl 4(%ebp),%ebx - sbbl %ebx,%edx - movl 8(%ebp),%ebx - movl %eax,-32(%edi) - movl %edx,-28(%edi) - -L2: movl 8(%esi),%eax - movl 12(%esi),%edx - sbbl %ebx,%eax - movl 12(%ebp),%ebx - sbbl %ebx,%edx - movl 16(%ebp),%ebx - movl %eax,-24(%edi) - movl %edx,-20(%edi) - -L3: movl 16(%esi),%eax - movl 20(%esi),%edx - sbbl %ebx,%eax - movl 20(%ebp),%ebx - sbbl %ebx,%edx - movl 24(%ebp),%ebx - movl %eax,-16(%edi) - movl %edx,-12(%edi) - -L4: movl 24(%esi),%eax - movl 28(%esi),%edx - sbbl %ebx,%eax - movl 28(%ebp),%ebx - sbbl %ebx,%edx - movl 32(%ebp),%ebx - movl %eax,-8(%edi) - movl %edx,-4(%edi) - - leal 32(%esi),%esi - leal 32(%ebp),%ebp - decl %ecx - jnz Loop - - popl %edx -Lend: - decl %edx /* test %edx w/o clobbering carry */ - js Lend2 - incl %edx -Loop2: - leal 4(%edi),%edi - movl (%esi),%eax - sbbl %ebx,%eax - movl 4(%ebp),%ebx - movl %eax,-4(%edi) - leal 4(%esi),%esi - leal 4(%ebp),%ebp - decl %edx - jnz Loop2 -Lend2: - movl (%esi),%eax - sbbl %ebx,%eax - movl %eax,(%edi) - - sbbl %eax,%eax - negl %eax - - popl %ebp - popl %ebx - popl %esi - popl %edi - ret - diff --git a/mpi/pentium4/README b/mpi/pentium4/README deleted file mode 100644 index 215fc7f8..00000000 --- a/mpi/pentium4/README +++ /dev/null @@ -1,115 +0,0 @@ -Copyright 2001 Free Software Foundation, Inc. - -This file is part of the GNU MP Library. - -The GNU MP Library is free software; you can redistribute it and/or modify -it under the terms of the GNU Lesser General Public License as published by -the Free Software Foundation; either version 2.1 of the License, or (at your -option) any later version. 
- -The GNU MP Library is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public -License for more details. - -You should have received a copy of the GNU Lesser General Public License -along with the GNU MP Library; see the file COPYING.LIB. If not, write to -the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA -02110-1301, USA. - - - - - INTEL PENTIUM-4 MPN SUBROUTINES - - -This directory contains mpn functions optimized for Intel Pentium-4. - -The mmx subdirectory has routines using MMX instructions, the sse2 -subdirectory has routines using SSE2 instructions. All P4s have these, the -separate directories are just so configure can omit that code if the -assembler doesn't support it. - - -STATUS - - cycles/limb - - mpn_add_n/sub_n 4 normal, 6 in-place - - mpn_mul_1 4 normal, 6 in-place - mpn_addmul_1 6 - mpn_submul_1 7 - - mpn_mul_basecase 6 cycles/crossproduct (approx) - - mpn_sqr_basecase 3.5 cycles/crossproduct (approx) - or 7.0 cycles/triangleproduct (approx) - - mpn_l/rshift 1.75 - - - -The shifts ought to be able to go at 1.5 c/l, but not much effort has been -applied to them yet. - -In-place operations, and all addmul, submul, mul_basecase and sqr_basecase -calls, suffer from pipeline anomalies associated with write combining and -movd reads and writes to the same or nearby locations. The movq -instructions do not trigger the same hardware problems. Unfortunately, -using movq and splitting/combining seems to require too many extra -instructions to help. Perhaps future chip steppings will be better. - - - -NOTES - -The Pentium-4 pipeline "Netburst", provides for quite a number of surprises. -Many traditional x86 instructions run very slowly, requiring use of -alterative instructions for acceptable performance. - -adcl and sbbl are quite slow at 8 cycles for reg->reg. paddq of 32-bits -within a 64-bit mmx register seems better, though the combination -paddq/psrlq when propagating a carry is still a 4 cycle latency. - -incl and decl should be avoided, instead use add $1 and sub $1. Apparently -the carry flag is not separately renamed, so incl and decl depend on all -previous flags-setting instructions. - -shll and shrl have a 4 cycle latency, or 8 times the latency of the fastest -integer instructions (addl, subl, orl, andl, and some more). shldl and -shrdl seem to have 13 and 15 cycles latency, respectively. Bizarre. - -movq mmx -> mmx does have 6 cycle latency, as noted in the documentation. -pxor/por or similar combination at 2 cycles latency can be used instead. -The movq however executes in the float unit, thereby saving MMX execution -resources. With the right juggling, data moves shouldn't be on a dependent -chain. - -L1 is write-through, but the write-combining sounds like it does enough to -not require explicit destination prefetching. - -xmm registers so far haven't found a use, but not much effort has been -expended. A configure test for whether the operating system knows -fxsave/fxrestor will be needed if they're used. - - - -REFERENCES - -Intel Pentium-4 processor manuals, - - http://developer.intel.com/design/pentium4/manuals - -"Intel Pentium 4 Processor Optimization Reference Manual", Intel, 2001, -order number 248966. 
Available on-line: - - http://developer.intel.com/design/pentium4/manuals/248966.htm - - - ----------------- -Local variables: -mode: text -fill-column: 76 -End: diff --git a/mpi/pentium4/distfiles b/mpi/pentium4/distfiles deleted file mode 100644 index b419f85a..00000000 --- a/mpi/pentium4/distfiles +++ /dev/null @@ -1,3 +0,0 @@ -README - - diff --git a/mpi/pentium4/mmx/distfiles b/mpi/pentium4/mmx/distfiles deleted file mode 100644 index 8f0ea426..00000000 --- a/mpi/pentium4/mmx/distfiles +++ /dev/null @@ -1,2 +0,0 @@ -mpih-lshift.S -mpih-rshift.S diff --git a/mpi/pentium4/mmx/mpih-lshift.S b/mpi/pentium4/mmx/mpih-lshift.S deleted file mode 100644 index e2dd184b..00000000 --- a/mpi/pentium4/mmx/mpih-lshift.S +++ /dev/null @@ -1,457 +0,0 @@ -/* Intel Pentium-4 mpn_lshift -- left shift. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. 
- */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_lshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - * - * P4 Willamette, Northwood: 1.75 cycles/limb - * P4 Prescott: 2.0 cycles/limb - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_lshift) -C_SYMBOL_NAME(_gcry_mpih_lshift:) - - - pushl %ebx - pushl %edi - - - movl 20(%esp), %eax - movl 12(%esp), %edx - - movl 16(%esp), %ebx - movl 24(%esp), %ecx - - cmp $5, %eax - jae .Lunroll - - movl -4(%ebx,%eax,4), %edi - decl %eax - - jnz .Lsimple - - shldl %cl, %edi, %eax - - shll %cl, %edi - - movl %edi, (%edx) - popl %edi - - popl %ebx - - ret - - - - - -.Lsimple: - - - - - - - - - - movd (%ebx,%eax,4), %mm5 - - movd %ecx, %mm6 - negl %ecx - - psllq %mm6, %mm5 - addl $32, %ecx - - movd %ecx, %mm7 - psrlq $32, %mm5 - - -.Lsimple_top: - - - - - - - - - - - - - movq -4(%ebx,%eax,4), %mm0 - decl %eax - - psrlq %mm7, %mm0 - - - - movd %mm0, 4(%edx,%eax,4) - jnz .Lsimple_top - - - movd (%ebx), %mm0 - - movd %mm5, %eax - psllq %mm6, %mm0 - - popl %edi - popl %ebx - - movd %mm0, (%edx) - - emms - - ret - - - - - - .align 8, 0x90 -.Lunroll: - - - - - - - - - - movd -4(%ebx,%eax,4), %mm5 - leal (%ebx,%eax,4), %edi - - movd %ecx, %mm6 - andl $4, %edi - - psllq %mm6, %mm5 - jz .Lstart_src_aligned - - - - - - - - - - - - - - - - - - - - movq -8(%ebx,%eax,4), %mm0 - - psllq %mm6, %mm0 - decl %eax - - psrlq $32, %mm0 - - - - movd %mm0, (%edx,%eax,4) -.Lstart_src_aligned: - - movq -8(%ebx,%eax,4), %mm1 - leal (%edx,%eax,4), %edi - - andl $4, %edi - psrlq $32, %mm5 - - movq -16(%ebx,%eax,4), %mm3 - jz .Lstart_dst_aligned - - - - - - - - - - - - - - - - - - - - - movq %mm1, %mm0 - addl $32, %ecx - - psllq %mm6, %mm0 - - movd %ecx, %mm6 - psrlq $32, %mm0 - - - - movd %mm0, -4(%edx,%eax,4) - subl $4, %edx -.Lstart_dst_aligned: - - - psllq %mm6, %mm1 - negl %ecx - - addl $64, %ecx - movq %mm3, %mm2 - - movd %ecx, %mm7 - subl $8, %eax - - psrlq %mm7, %mm3 - - por %mm1, %mm3 - jc .Lfinish - - - - - .align 8, 0x90 -.Lunroll_loop: - - - - - - - - - - - - - - - - - movq 8(%ebx,%eax,4), %mm0 - psllq %mm6, %mm2 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - movq %mm3, 24(%edx,%eax,4) - por %mm2, %mm0 - - movq (%ebx,%eax,4), %mm3 - psllq %mm6, %mm1 - - movq %mm0, 16(%edx,%eax,4) - movq %mm3, %mm2 - - psrlq %mm7, %mm3 - subl $4, %eax - - por %mm1, %mm3 - jnc .Lunroll_loop - - - -.Lfinish: - - - testb $2, %al - - jz .Lfinish_no_two - - movq 8(%ebx,%eax,4), %mm0 - psllq %mm6, %mm2 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - movq %mm3, 24(%edx,%eax,4) - por %mm2, %mm0 - - movq %mm1, %mm2 - movq %mm0, %mm3 - - subl $2, %eax -.Lfinish_no_two: - - - - - - - - testb $1, %al - movd %mm5, %eax - - popl %edi - jz .Lfinish_zero - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movd (%ebx), %mm0 - psllq %mm6, %mm2 - - movq %mm3, 12(%edx) - psllq $32, %mm0 - - movq %mm0, %mm1 - psrlq %mm7, %mm0 - - por %mm2, %mm0 - psllq %mm6, %mm1 - - movq %mm0, 4(%edx) - psrlq $32, %mm1 - - andl $32, %ecx - popl %ebx - - jz .Lfinish_one_unaligned - - movd %mm1, (%edx) -.Lfinish_one_unaligned: - - emms - - ret - - - - -.Lfinish_zero: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movq %mm3, 8(%edx) - andl $32, %ecx - - psllq %mm6, %mm2 - jz .Lfinish_zero_unaligned - - movq %mm2, (%edx) -.Lfinish_zero_unaligned: - - psrlq $32, %mm2 - popl %ebx - - movd %mm5, %eax - - movd %mm2, 4(%edx) - - emms - - ret diff --git 
a/mpi/pentium4/mmx/mpih-rshift.S b/mpi/pentium4/mmx/mpih-rshift.S deleted file mode 100644 index e3374e3b..00000000 --- a/mpi/pentium4/mmx/mpih-rshift.S +++ /dev/null @@ -1,453 +0,0 @@ -/* Intel Pentium-4 mpn_rshift -- right shift. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_rshift( mpi_ptr_t wp, (sp + 4) - * mpi_ptr_t up, (sp + 8) - * mpi_size_t usize, (sp + 12) - * unsigned cnt) (sp + 16) - * - * P4 Willamette, Northwood: 1.75 cycles/limb - * P4 Prescott: 2.0 cycles/limb - */ - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_rshift) -C_SYMBOL_NAME(_gcry_mpih_rshift:) - pushl %ebx - pushl %edi - - - movl 20(%esp), %eax - movl 12(%esp), %edx - - movl 16(%esp), %ebx - movl 24(%esp), %ecx - - cmp $5, %eax - jae .Lunroll - - decl %eax - movl (%ebx), %edi - - jnz .Lsimple - - shrdl %cl, %edi, %eax - - shrl %cl, %edi - - movl %edi, (%edx) - popl %edi - - popl %ebx - - ret - - - - - - .align 8, 0x90 -.Lsimple: - - - - - - - - - - movd (%ebx), %mm5 - leal (%ebx,%eax,4), %ebx - - movd %ecx, %mm6 - leal -4(%edx,%eax,4), %edx - - psllq $32, %mm5 - negl %eax - - - - - - - -.Lsimple_top: - - - - - - - - - - movq (%ebx,%eax,4), %mm0 - incl %eax - - psrlq %mm6, %mm0 - - movd %mm0, (%edx,%eax,4) - jnz .Lsimple_top - - - movd (%ebx), %mm0 - psrlq %mm6, %mm5 - - psrlq %mm6, %mm0 - popl %edi - - movd %mm5, %eax - popl %ebx - - movd %mm0, 4(%edx) - - emms - - ret - - - - - - .align 8, 0x90 -.Lunroll: - - - - - - - - - - movd (%ebx), %mm5 - movl $4, %edi - - movd %ecx, %mm6 - testl %edi, %ebx - - psllq $32, %mm5 - jz .Lstart_src_aligned - - - - - - - - - - - - - - - - - movq (%ebx), %mm0 - - psrlq %mm6, %mm0 - addl $4, %ebx - - decl %eax - - movd %mm0, (%edx) - addl $4, %edx -.Lstart_src_aligned: - - - movq (%ebx), %mm1 - testl %edi, %edx - - psrlq %mm6, %mm5 - jz .Lstart_dst_aligned - - - - - - - - - - - - - - - - - - movq %mm1, %mm0 - addl $32, %ecx - - psrlq %mm6, %mm0 - - movd %ecx, %mm6 - - movd %mm0, (%edx) - addl $4, %edx -.Lstart_dst_aligned: - - - movq 8(%ebx), %mm3 - negl %ecx - - movq %mm3, %mm2 - addl $64, %ecx - - movd %ecx, %mm7 - psrlq %mm6, %mm1 - - leal -12(%ebx,%eax,4), %ebx - leal -20(%edx,%eax,4), %edx - - psllq %mm7, %mm3 - subl $7, %eax - - por %mm1, %mm3 - negl %eax - - jns .Lfinish - - - - - - - - - - - - - - - - .align 8, 0x90 -.Lunroll_loop: - - - - - - - - - - - - - - - - - 
movq (%ebx,%eax,4), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, -8(%edx,%eax,4) - por %mm2, %mm0 - - movq 8(%ebx,%eax,4), %mm3 - psrlq %mm6, %mm1 - - movq %mm0, (%edx,%eax,4) - movq %mm3, %mm2 - - psllq %mm7, %mm3 - addl $4, %eax - - por %mm1, %mm3 - js .Lunroll_loop - - -.Lfinish: - - - testb $2, %al - - jnz .Lfinish_no_two - - movq (%ebx,%eax,4), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, -8(%edx,%eax,4) - por %mm2, %mm0 - - movq %mm1, %mm2 - movq %mm0, %mm3 - - addl $2, %eax -.Lfinish_no_two: - - - - - - - - testb $1, %al - popl %edi - - movd %mm5, %eax - jnz .Lfinish_zero - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movd 8(%ebx), %mm0 - psrlq %mm6, %mm2 - - movq %mm0, %mm1 - psllq %mm7, %mm0 - - movq %mm3, (%edx) - por %mm2, %mm0 - - psrlq %mm6, %mm1 - andl $32, %ecx - - popl %ebx - jz .Lfinish_one_unaligned - - - movd %mm1, 16(%edx) -.Lfinish_one_unaligned: - - movq %mm0, 8(%edx) - - emms - - ret - - - - -.Lfinish_zero: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - movq %mm3, 4(%edx) - psrlq %mm6, %mm2 - - movd %mm2, 12(%edx) - andl $32, %ecx - - popl %ebx - jz .Lfinish_zero_unaligned - - movq %mm2, 12(%edx) -.Lfinish_zero_unaligned: - - emms - - ret diff --git a/mpi/pentium4/sse2/distfiles b/mpi/pentium4/sse2/distfiles deleted file mode 100644 index 7252cd7e..00000000 --- a/mpi/pentium4/sse2/distfiles +++ /dev/null @@ -1,5 +0,0 @@ -mpih-add1.S -mpih-mul1.S -mpih-mul2.S -mpih-mul3.S -mpih-sub1.S diff --git a/mpi/pentium4/sse2/mpih-add1.S b/mpi/pentium4/sse2/mpih-add1.S deleted file mode 100644 index 55ed6630..00000000 --- a/mpi/pentium4/sse2/mpih-add1.S +++ /dev/null @@ -1,91 +0,0 @@ -/* Intel Pentium-4 mpn_add_n -- mpn addition. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - - /******************* - * mpi_limb_t - * _gcry_mpih_add_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - * - * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 - * 6.0 cycles/limb if dst==src1 or dst==src2 - * P4 Prescott: >= 5 cycles/limb - * - * The 4 c/l achieved here isn't particularly good, but is better than 9 c/l - * for a basic adc loop. 
- */ - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_add_n) -C_SYMBOL_NAME(_gcry_mpih_add_n:) - - pxor %mm0, %mm0 - - movl 8(%esp), %eax /* s1_ptr */ - movl %ebx, 8(%esp) /* re-use parameter space */ - movl 12(%esp), %ebx /* res_ptr */ - movl 4(%esp), %edx /* s2_ptr */ - movl 16(%esp), %ecx /* size */ - - leal (%eax,%ecx,4), %eax /* src1 end */ - leal (%ebx,%ecx,4), %ebx /* src2 end */ - leal (%edx,%ecx,4), %edx /* dst end */ - negl %ecx /* -size */ - -Ltop: -/* - C eax src1 end - C ebx src2 end - C ecx counter, limbs, negative - C edx dst end - C mm0 carry bit -*/ - - movd (%eax,%ecx,4), %mm1 - movd (%ebx,%ecx,4), %mm2 - paddq %mm2, %mm1 - - paddq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $32, %mm0 - - addl $1, %ecx - jnz Ltop - - - movd %mm0, %eax - movl 8(%esp), %ebx /* restore saved EBX */ - emms - ret diff --git a/mpi/pentium4/sse2/mpih-mul1.S b/mpi/pentium4/sse2/mpih-mul1.S deleted file mode 100644 index a0c98fb4..00000000 --- a/mpi/pentium4/sse2/mpih-mul1.S +++ /dev/null @@ -1,96 +0,0 @@ -/* Intel Pentium-4 mpn_mul_1 -- Multiply a limb vector with a limb and store - * the result in a second limb vector. - * - * Copyright 2001, 2002, 2003, 2005 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_mul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * src != dst src == dst - * P6 model 9 (Banias) ?.? - * P6 model 13 (Dothan) 4.75 4.75 - * P4 model 0 (Willamette) 4.0 6.0 - * P4 model 1 (?) 4.0 6.0 - * P4 model 2 (Northwood) 4.0 6.0 - * P4 model 3 (Prescott) ?.? ?.? - * P4 model 4 (Nocona) ?.? ?.? - * Unfortunately when src==dst the write-combining described in - * pentium4/README takes us up to 6 c/l. 
- * - */ - - TEXT - ALIGN (3) - GLOBL C_SYMBOL_NAME(_gcry_mpih_mul_1) -C_SYMBOL_NAME(_gcry_mpih_mul_1:); - - pxor %mm0, %mm0 - -.Lstart_1c: - movl 8(%esp), %eax - movd 16(%esp), %mm7 - movl 4(%esp), %edx - movl 12(%esp), %ecx - -.Ltop: - -/* - C eax src, incrementing - C ebx - C ecx counter, size iterations - C edx dst, incrementing - C - C mm0 carry limb - C mm7 multiplier -*/ - - movd (%eax), %mm1 - addl $4, %eax - pmuludq %mm7, %mm1 - - paddq %mm1, %mm0 - movd %mm0, (%edx) - addl $4, %edx - - psrlq $32, %mm0 - - subl $1, %ecx - jnz .Ltop - - - movd %mm0, %eax - emms - ret - diff --git a/mpi/pentium4/sse2/mpih-mul2.S b/mpi/pentium4/sse2/mpih-mul2.S deleted file mode 100644 index f975adfc..00000000 --- a/mpi/pentium4/sse2/mpih-mul2.S +++ /dev/null @@ -1,136 +0,0 @@ -/* Intel Pentium-4 mpn_addmul_1 -- Multiply a limb vector with a limb and add - * the result to a second limb vector. - * - * Copyright 2001, 2002, 2004, 2005 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_addmul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * P3 model 9 (Banias) ?.? - * P3 model 13 (Dothan) 5.8 - * P4 model 0 (Willamette) 5.5 - * P4 model 1 (?) 5.5 - * P4 model 2 (Northwood) 5.5 - * P4 model 3 (Prescott) 6.0 - * P4 model 4 (Nocona) - * - * Only the carry limb propagation is on the dependent chain, but some other - * Pentium4 pipeline magic brings down performance to 6 cycles/l from the - * ideal 4 cycles/l. 
- */ - - - TEXT - ALIGN (4) - GLOBL C_SYMBOL_NAME(_gcry_mpih_addmul_1) -C_SYMBOL_NAME(_gcry_mpih_addmul_1:) - - pxor %mm4, %mm4 -.Lstart_1c: - movl 8(%esp), %eax - movl 12(%esp), %ecx - movl 4(%esp), %edx - movd 16(%esp), %mm7 - -/* - C eax src, incrementing ; 5B - C ecx loop counter, decrementing - C edx dst, incrementing - C - C mm4 carry, low 32-bits - C mm7 multiplier -*/ - - movd (%eax), %mm2 - pmuludq %mm7, %mm2 - - shrl $1, %ecx - jnc .Leven - - leal 4(%eax), %eax - movd (%edx), %mm1 - paddq %mm2, %mm1 - paddq %mm1, %mm4 - movd %mm4, (%edx) - psrlq $32, %mm4 - - testl %ecx, %ecx - jz .Lrtn - leal 4(%edx), %edx - - movd (%eax), %mm2 - pmuludq %mm7, %mm2 -.Leven: - movd 4(%eax), %mm0 - movd (%edx), %mm1 - pmuludq %mm7, %mm0 - - subl $1, %ecx - jz .Lend -.Lloop: - paddq %mm2, %mm1 - movd 8(%eax), %mm2 - paddq %mm1, %mm4 - movd 4(%edx), %mm3 - pmuludq %mm7, %mm2 - movd %mm4, (%edx) - psrlq $32, %mm4 - - paddq %mm0, %mm3 - movd 12(%eax), %mm0 - paddq %mm3, %mm4 - movd 8(%edx), %mm1 - pmuludq %mm7, %mm0 - movd %mm4, 4(%edx) - psrlq $32, %mm4 - - leal 8(%eax), %eax - leal 8(%edx), %edx - subl $1, %ecx - jnz .Lloop -.Lend: - paddq %mm2, %mm1 - paddq %mm1, %mm4 - movd 4(%edx), %mm3 - movd %mm4, (%edx) - psrlq $32, %mm4 - paddq %mm0, %mm3 - paddq %mm3, %mm4 - movd %mm4, 4(%edx) - psrlq $32, %mm4 -.Lrtn: - movd %mm4, %eax - emms - ret diff --git a/mpi/pentium4/sse2/mpih-mul3.S b/mpi/pentium4/sse2/mpih-mul3.S deleted file mode 100644 index ebcd2a68..00000000 --- a/mpi/pentium4/sse2/mpih-mul3.S +++ /dev/null @@ -1,127 +0,0 @@ -/* Intel Pentium-4 mpn_submul_1 -- Multiply a limb vector with a limb and - * subtract the result from a second limb vector. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. - * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_submul_1( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_size_t s1_size, (sp + 12) - * mpi_limb_t s2_limb) (sp + 16) - * - * P4: 7 cycles/limb, unstable timing, at least on early Pentium4 silicon - * (stepping 10). - * - * This code is not particularly good at 7 c/l. The dependent chain is only - * 4 c/l and there's only 4 MMX unit instructions, so it's not clear why that - * speed isn't achieved. - * - * The arrangements made here to get a two instruction dependent chain are - * slightly subtle. 
In the loop the carry (or borrow rather) is a negative - * so that a paddq can be used to give a low limb ready to store, and a high - * limb ready to become the new carry after a psrlq. - * - * If the carry was a simple twos complement negative then the psrlq shift - * would need to bring in 0 bits or 1 bits according to whether the high was - * zero or non-zero, since a non-zero value would represent a negative - * needing sign extension. That wouldn't be particularly easy to arrange and - * certainly would add an instruction to the dependent chain, so instead an - * offset is applied so that the high limb will be 0xFFFFFFFF+c. With c in - * the range -0xFFFFFFFF to 0, the value 0xFFFFFFFF+c is in the range 0 to - * 0xFFFFFFFF and is therefore always positive and can always have 0 bits - * shifted in, which is what psrlq does. - * - * The extra 0xFFFFFFFF must be subtracted before c is used, but that can be - * done off the dependent chain. The total adjustment then is to add - * 0xFFFFFFFF00000000 to offset the new carry, and subtract - * 0x00000000FFFFFFFF to remove the offset from the current carry, for a net - * add of 0xFFFFFFFE00000001. In the code this is applied to the destination - * limb when fetched. - * - * It's also possible to view the 0xFFFFFFFF adjustment as a ones-complement - * negative, which is how it's undone for the return value, but that doesn't - * seem as clear. -*/ - - TEXT - ALIGN (4) - GLOBL C_SYMBOL_NAME(_gcry_mpih_submul_1) -C_SYMBOL_NAME(_gcry_mpih_submul_1:) - - pxor %mm1, %mm1 - -.Lstart_1c: - movl 8(%esp), %eax - pcmpeqd %mm0, %mm0 - - movd 16(%esp), %mm7 - pcmpeqd %mm6, %mm6 - - movl 4(%esp), %edx - psrlq $32, %mm0 - - movl 12(%esp), %ecx - psllq $32, %mm6 - - psubq %mm0, %mm6 - - psubq %mm1, %mm0 - -/* - C eax src, incrementing - C ebx - C ecx loop counter, decrementing - C edx dst, incrementing - C - C mm0 0xFFFFFFFF - borrow - C mm6 0xFFFFFFFE00000001 - C mm7 multiplier -*/ - -.Lloop: - movd (%eax), %mm1 - leal 4(%eax), %eax - movd (%edx), %mm2 - paddq %mm6, %mm2 - pmuludq %mm7, %mm1 - psubq %mm1, %mm2 - paddq %mm2, %mm0 - subl $1, %ecx - movd %mm0, (%edx) - psrlq $32, %mm0 - leal 4(%edx), %edx - jnz .Lloop - - movd %mm0, %eax - notl %eax - emms - ret diff --git a/mpi/pentium4/sse2/mpih-sub1.S b/mpi/pentium4/sse2/mpih-sub1.S deleted file mode 100644 index 33900c74..00000000 --- a/mpi/pentium4/sse2/mpih-sub1.S +++ /dev/null @@ -1,112 +0,0 @@ -/* Intel Pentium-4 mpn_sub_n -- mpn subtraction. - * - * Copyright 2001, 2002 Free Software Foundation, Inc. - * - * This file is part of Libgcrypt. - * - * Libgcrypt is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation; either version 2.1 of - * the License, or (at your option) any later version. - * - * Libgcrypt is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA - * - * Note: This code is heavily based on the GNU MP Library. 
- * Actually it's the same code with only minor changes in the - * way the data is stored; this is to support the abstraction - * of an optional secure memory allocation which may be used - * to avoid revealing of sensitive data due to paging etc. - */ - - -#include "sysdep.h" -#include "asm-syntax.h" - - -/******************* - * mpi_limb_t - * _gcry_mpih_sub_n( mpi_ptr_t res_ptr, (sp + 4) - * mpi_ptr_t s1_ptr, (sp + 8) - * mpi_ptr_t s2_ptr, (sp + 12) - * mpi_size_t size) (sp + 16) - * - * P4 Willamette, Northwood: 4.0 cycles/limb if dst!=src1 and dst!=src2 - * 6.0 cycles/limb if dst==src1 or dst==src2 - * P4 Prescott: >= 5 cycles/limb - * - * The main loop code is 2x unrolled so that the carry bit can alternate - * between mm0 and mm1. - */ - - -.text - ALIGN (3) - .globl C_SYMBOL_NAME(_gcry_mpih_sub_n) -C_SYMBOL_NAME(_gcry_mpih_sub_n:) - - pxor %mm0, %mm0 -.Lstart_nc: - movl 8(%esp), %eax - movl %ebx, 8(%esp) - movl 12(%esp), %ebx - movl 4(%esp), %edx - movl 16(%esp), %ecx - - leal (%eax,%ecx,4), %eax - leal (%ebx,%ecx,4), %ebx - leal (%edx,%ecx,4), %edx - negl %ecx - -.Ltop: -/* - C eax src1 end - C ebx src2 end - C ecx counter, limbs, negative - C edx dst end - C mm0 carry bit -*/ - - movd (%eax,%ecx,4), %mm1 - movd (%ebx,%ecx,4), %mm2 - psubq %mm2, %mm1 - - psubq %mm0, %mm1 - movd %mm1, (%edx,%ecx,4) - - psrlq $63, %mm1 - - addl $1, %ecx - jz .Ldone_mm1 - - movd (%eax,%ecx,4), %mm0 - movd (%ebx,%ecx,4), %mm2 - psubq %mm2, %mm0 - - psubq %mm1, %mm0 - movd %mm0, (%edx,%ecx,4) - - psrlq $63, %mm0 - - addl $1, %ecx - jnz .Ltop - - - movd %mm0, %eax - movl 8(%esp), %ebx - emms - ret - - - -.Ldone_mm1: - movd %mm1, %eax - movl 8(%esp), %ebx - emms - ret -- 2.32.0 From jussi.kivilinna at iki.fi Sat Jan 8 22:43:43 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Sat, 8 Jan 2022 23:43:43 +0200 Subject: [PATCH] mpi/amd64: remove extra 'ret' from assembly functions Message-ID: <20220108214343.486437-1-jussi.kivilinna@iki.fi> * mpi/amd64/mpih-add1.S: Remove 'ret' as it is already included by FUNC_EXIT macro. * mpi/amd64/mpih-lshift.S: Likewise. * mpi/amd64/mpih-mul1.S: Likewise. * mpi/amd64/mpih-mul2.S: Likewise. * mpi/amd64/mpih-mul3.S: Likewise. * mpi/amd64/mpih-rshift.S: Likewise. * mpi/amd64/mpih-sub1.S: Likewise. 
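To make the redundancy concrete: each of these functions ends with the
FUNC_EXIT() epilogue macro, whose expansion already finishes with a return,
so the trailing instruction is unreachable. A tiny stand-alone illustration
(the '#define' below is a hypothetical stand-in, not libgcrypt's actual
macro, whose exact expansion is not shown here):

    /* demo.S -- assemble with 'gcc -c demo.S' on x86-64 */
    #define FUNC_EXIT()  ret

        .text
        .globl  demo_return_42
    demo_return_42:
        movl    $42, %eax
        FUNC_EXIT()          /* the function already returns here */
        ret                  /* dead code -- the pattern this patch removes */
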
-- Signed-off-by: Jussi Kivilinna --- mpi/amd64/mpih-add1.S | 1 - mpi/amd64/mpih-lshift.S | 1 - mpi/amd64/mpih-mul1.S | 1 - mpi/amd64/mpih-mul2.S | 1 - mpi/amd64/mpih-mul3.S | 1 - mpi/amd64/mpih-rshift.S | 1 - mpi/amd64/mpih-sub1.S | 1 - 7 files changed, 7 deletions(-) diff --git a/mpi/amd64/mpih-add1.S b/mpi/amd64/mpih-add1.S index 157e5f1e..39c00c52 100644 --- a/mpi/amd64/mpih-add1.S +++ b/mpi/amd64/mpih-add1.S @@ -61,4 +61,3 @@ C_SYMBOL_NAME(_gcry_mpih_add_n:) movq %rcx, %rax /* zero %rax */ adcq %rax, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-lshift.S b/mpi/amd64/mpih-lshift.S index 76e9408f..a9c7d7e1 100644 --- a/mpi/amd64/mpih-lshift.S +++ b/mpi/amd64/mpih-lshift.S @@ -76,4 +76,3 @@ C_SYMBOL_NAME(_gcry_mpih_lshift:) .Lende: psllq %xmm1, %xmm2 movq %xmm2, (%rdi) FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul1.S b/mpi/amd64/mpih-mul1.S index 67ab47ea..dacb9d87 100644 --- a/mpi/amd64/mpih-mul1.S +++ b/mpi/amd64/mpih-mul1.S @@ -64,4 +64,3 @@ C_SYMBOL_NAME(_gcry_mpih_mul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul2.S b/mpi/amd64/mpih-mul2.S index 1aa4fa0a..07913586 100644 --- a/mpi/amd64/mpih-mul2.S +++ b/mpi/amd64/mpih-mul2.S @@ -63,4 +63,3 @@ C_SYMBOL_NAME(_gcry_mpih_addmul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-mul3.S b/mpi/amd64/mpih-mul3.S index bc41c4eb..f8889eb2 100644 --- a/mpi/amd64/mpih-mul3.S +++ b/mpi/amd64/mpih-mul3.S @@ -64,4 +64,3 @@ C_SYMBOL_NAME(_gcry_mpih_submul_1:) movq %r8, %rax FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-rshift.S b/mpi/amd64/mpih-rshift.S index d5e27974..8ecf155f 100644 --- a/mpi/amd64/mpih-rshift.S +++ b/mpi/amd64/mpih-rshift.S @@ -79,4 +79,3 @@ C_SYMBOL_NAME(_gcry_mpih_rshift:) .Lende: psrlq %xmm1, %xmm2 movq %xmm2, -8(%rdi) FUNC_EXIT() - ret diff --git a/mpi/amd64/mpih-sub1.S b/mpi/amd64/mpih-sub1.S index ccf64963..d60b58a5 100644 --- a/mpi/amd64/mpih-sub1.S +++ b/mpi/amd64/mpih-sub1.S @@ -60,4 +60,3 @@ C_SYMBOL_NAME(_gcry_mpih_sub_n:) movq %rcx, %rax /* zero %rax */ adcq %rax, %rax FUNC_EXIT() - ret -- 2.32.0 From jussi.kivilinna at iki.fi Tue Jan 11 20:00:08 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Tue, 11 Jan 2022 21:00:08 +0200 Subject: [PATCH] rijndael-aesni: small optimization for cbc-enc and cfb-enc Message-ID: <20220111190008.1220151-1-jussi.kivilinna@iki.fi> * cipher/rijndael-aesni.c (_gcry_aes_aesni_cfb_enc) (_gcry_aes_aesni_cbc_enc): Copy contents of 'do_aesni_enc' here and merge input/output and first/last round key xoring to shorten critical path. 
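The algebra behind the merge, for review: AESENCLAST only XORs its key
operand into the (SubBytes/ShiftRows-transformed) state, so the last round
key, the plaintext XOR of the mode and the key[0] whitening of the *next*
block can all be folded into one value prepared off the critical path. A
rough stand-alone sketch of the CFB-enc case with C intrinsics (a sketch
only, not the patch, which stays in inline assembly; 'cfb_enc_sketch' and
'rk' are made-up names, rk[0..rounds] being a caller-provided expanded key
schedule; build with AES-NI enabled, e.g. gcc -maes):

    #include <stddef.h>
    #include <wmmintrin.h>   /* AES-NI intrinsics */

    static void
    cfb_enc_sketch (const __m128i *rk, int rounds, __m128i *iv,
                    const __m128i *in, __m128i *out, size_t nblocks)
    {
      __m128i k0_klast = _mm_xor_si128 (rk[0], rk[rounds]);        /* key[0]^key[last] */
      __m128i state = _mm_xor_si128 (_mm_loadu_si128 (iv), rk[0]); /* IV ^ key[0] */
      size_t i;
      int r;

      for (i = 0; i < nblocks; i++)
        {
          /* Prepared outside the AES dependency chain:
             last = pt ^ key[0] ^ key[last].  */
          __m128i last = _mm_xor_si128 (_mm_loadu_si128 (&in[i]), k0_klast);

          for (r = 1; r < rounds; r++)
            state = _mm_aesenc_si128 (state, rk[r]);

          /* aesenclast(state, pt^key[0]^key[last]) = E(IV) ^ pt ^ key[0]
             = ct ^ key[0], which is already the whitened state for the next
             block, so no separate "IV ^ key[0]" step is left on the
             critical path.  */
          state = _mm_aesenclast_si128 (state, last);
          _mm_storeu_si128 (&out[i], _mm_xor_si128 (state, rk[0]));  /* ct */
        }

      _mm_storeu_si128 (iv, _mm_xor_si128 (state, rk[0]));  /* new IV = last ct */
    }

The CBC-enc path in the patch uses the same folding, there with the following
block's plaintext, so that aesenclast directly yields the whitened AES input
of the next block.
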
-- Benchmark on AMD Ryzen 7 5800X: Before: AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 0.541 ns/B 1762 MiB/s 2.62 c/B 4850 CFB enc | 0.541 ns/B 1762 MiB/s 2.63 c/B 4850 After (5% faster): AES | nanosecs/byte mebibytes/sec cycles/byte auto Mhz CBC enc | 0.515 ns/B 1850 MiB/s 2.50 c/B 4850 CFB enc | 0.515 ns/B 1851 MiB/s 2.50 c/B 4850 Signed-off-by: Jussi Kivilinna --- cipher/rijndael-aesni.c | 201 +++++++++++++++++++++++++++++++++------- 1 file changed, 165 insertions(+), 36 deletions(-) diff --git a/cipher/rijndael-aesni.c b/cipher/rijndael-aesni.c index 34a4a447..ff6b0b26 100644 --- a/cipher/rijndael-aesni.c +++ b/cipher/rijndael-aesni.c @@ -1723,34 +1723,97 @@ _gcry_aes_aesni_cfb_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks) { + unsigned int rounds = ctx->rounds; + aesni_prepare_2_7_variable; + aesni_prepare (); + aesni_prepare_2_7(); asm volatile ("movdqu %[iv], %%xmm0\n\t" - : /* No output */ - : [iv] "m" (*iv) - : "memory" ); + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); + + asm volatile ("movdqa %[key0], %%xmm2\n\t" /* xmm2 = key[0] */ + "movdqa %[keylast], %%xmm4\n\t" /* xmm4 = key[last] */ + "movdqa %%xmm0, %%xmm3\n" + "pxor %%xmm2, %%xmm4\n\t" /* xmm4 = key[0] ^ key[last] */ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 = IV ^ key[0] */ + : /* No output */ + : [key0] "m" (ctx->keyschenc[0][0][0]), + [keylast] "m" (ctx->keyschenc[rounds][0][0]) + : "memory" ); for ( ;nblocks; nblocks-- ) { - do_aesni_enc (ctx); + asm volatile ("movdqu %[inbuf], %%xmm5\n\t" + "movdqa %%xmm2, %%xmm3\n\t" + "pxor %%xmm4, %%xmm5\n\t" /* xmm5 = input ^ key[last] ^ key[0] */ + : + : [inbuf] "m" (*inbuf) + : "memory" ); - asm volatile ("movdqu %[inbuf], %%xmm1\n\t" - "pxor %%xmm1, %%xmm0\n\t" - "movdqu %%xmm0, %[outbuf]\n\t" - : [outbuf] "=m" (*outbuf) - : [inbuf] "m" (*inbuf) - : "memory" ); +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc1\n\t" +#define aesenclast_xmm5_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc5\n\t" + asm volatile ("movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + + ".Lenclast%=:\n\t" + aesenclast_xmm5_xmm0 + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm1_xmm0 +#undef aesenclast_xmm5_xmm0 + + asm volatile ("pxor %%xmm0, %%xmm3\n\t" + "movdqu %%xmm3, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : [inbuf] "m" (*inbuf) + : "memory" ); outbuf += BLOCKSIZE; inbuf += BLOCKSIZE; } - asm volatile ("movdqu %%xmm0, %[iv]\n\t" - : [iv] "=m" (*iv) - : - : "memory" ); + asm volatile ("movdqu %%xmm3, %[iv]\n\t" + : [iv] "=m" (*iv) + : + : "memory" ); aesni_cleanup (); + aesni_cleanup_2_7 (); } @@ 
-1759,41 +1822,107 @@ _gcry_aes_aesni_cbc_enc (RIJNDAEL_context *ctx, unsigned char *iv, unsigned char *outbuf, const unsigned char *inbuf, size_t nblocks, int cbc_mac) { + unsigned int rounds = ctx->rounds; aesni_prepare_2_7_variable; + if (nblocks == 0) /* CMAC may call with nblocks 0. */ + return; + aesni_prepare (); aesni_prepare_2_7(); - asm volatile ("movdqu %[iv], %%xmm5\n\t" - : /* No output */ - : [iv] "m" (*iv) - : "memory" ); + asm volatile ("movdqu %[iv], %%xmm0\n\t" + : /* No output */ + : [iv] "m" (*iv) + : "memory" ); - for ( ;nblocks; nblocks-- ) + asm volatile ("movdqa %[key0], %%xmm2\n\t" /* xmm2 = key[0] */ + "movdqa %[keylast], %%xmm3\n\t" /* xmm3 = key[last] */ + "pxor %%xmm2, %%xmm0\n\t" /* xmm0 = IV ^ key[0] */ + "pxor %%xmm3, %%xmm2\n\t" /* xmm2 = key[0] ^ key[last] */ + : /* No output */ + : [key0] "m" (ctx->keyschenc[0][0][0]), + [keylast] "m" (ctx->keyschenc[rounds][0][0]) + : "memory" ); + + asm volatile ("movdqu %[inbuf], %%xmm4\n\t" + "pxor %%xmm4, %%xmm0\n\t" /* xmm0 = IV ^ key[0] ^ input */ + : + : [inbuf] "m" (*inbuf) + : "memory" ); + inbuf += BLOCKSIZE; + + for ( ;nblocks; ) { - asm volatile ("movdqu %[inbuf], %%xmm0\n\t" - "pxor %%xmm5, %%xmm0\n\t" - : /* No output */ - : [inbuf] "m" (*inbuf) - : "memory" ); + if (--nblocks) + { + asm volatile ("movdqu %[inbuf], %%xmm4\n\t" + /* xmm4 = IV ^ key[0] ^ key[last] ^ input: */ + "pxor %%xmm2, %%xmm4\n\t" + : + : [inbuf] "m" (*inbuf) + : "memory" ); + inbuf += BLOCKSIZE; + } - do_aesni_enc (ctx); +#define aesenc_xmm1_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdc, 0xc1\n\t" +#define aesenclast_xmm4_xmm0 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xc4\n\t" +#define aesenclast_xmm3_xmm5 ".byte 0x66, 0x0f, 0x38, 0xdd, 0xeb\n\t" + asm volatile ("movdqa 0x10(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x20(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x30(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x40(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x50(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x60(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x70(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x80(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0x90(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $10, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xa0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xb0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "cmpl $12, %[rounds]\n\t" + "jz .Lenclast%=\n\t" + "movdqa 0xc0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + "movdqa 0xd0(%[key]), %%xmm1\n\t" + aesenc_xmm1_xmm0 + + ".Lenclast%=:\n\t" + "movdqa %%xmm0, %%xmm5\n" + aesenclast_xmm4_xmm0 /* xmm0 = IV ^ key[0] */ + aesenclast_xmm3_xmm5 /* xmm5 = IV */ + : + : [key] "r" (ctx->keyschenc), + [rounds] "r" (rounds) + : "cc", "memory"); +#undef aesenc_xmm1_xmm0 +#undef aesenclast_xmm4_xmm0 +#undef aesenclast_xmm3_xmm5 - asm volatile ("movdqa %%xmm0, %%xmm5\n\t" - "movdqu %%xmm0, %[outbuf]\n\t" - : [outbuf] "=m" (*outbuf) - : - : "memory" ); + asm volatile ("movdqu %%xmm5, %[outbuf]\n\t" + : [outbuf] "=m" (*outbuf) + : + : "memory" ); - inbuf += BLOCKSIZE; - if (!cbc_mac) - outbuf += BLOCKSIZE; + outbuf += -(!cbc_mac) & BLOCKSIZE; } asm volatile ("movdqu %%xmm5, %[iv]\n\t" - : [iv] "=m" (*iv) - : - : "memory" ); + : [iv] "=m" (*iv) + : + : "memory" ); aesni_cleanup (); aesni_cleanup_2_7 (); -- 2.32.0 From sam at gentoo.org Tue Jan 18 16:44:15 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 15:44:15 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: 
<20220118154415.953374-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From sam at gentoo.org Tue Jan 18 13:52:52 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 12:52:52 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: <20220118125252.809049-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From sam at gentoo.org Tue Jan 18 16:38:51 2022 From: sam at gentoo.org (Sam James) Date: Tue, 18 Jan 2022 15:38:51 +0000 Subject: [PATCH] cipher/cipher-gcm: fix build failure on ARM NEON Message-ID: <20220118153851.950749-1-sam@gentoo.org> 'features' is only defined when relevant CPU features are found, but one of the uses below its definition checked for GCM_USE_ARM_NEON which wasn't in the guard above it. i.e. We used to only define 'features' when: - GCM_USE_INTEL_PCLMUL - GCM_USE_ARM_PMULL - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM - GCM_USE_S390X_CRYPTO - GCM_USE_PPC_VPMSUM is set. We were missing GCM_USE_ARM_NEON so when we check for GCM_USE_ARM_NEON below, it'd fail as features wasn't defined. 
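For anyone reproducing it without an ARM-only configuration, here is a
stripped-down stand-in showing the same guard mismatch (FEATURE_A/B/C,
get_hw_features and setup_demo are hypothetical names, not the real
cipher-gcm.c symbols):

    /* Compile with only -DFEATURE_C to reproduce the error; with
       -DFEATURE_A, or after adding FEATURE_C to the first guard
       (as this patch does for GCM_USE_ARM_NEON), it builds.  */
    static unsigned int
    get_hw_features (void)
    {
      return 1;
    }

    static void
    setup_demo (void)
    {
    #if defined(FEATURE_A) || defined(FEATURE_B)   /* FEATURE_C missing here */
      unsigned int features = get_hw_features ();
    #endif

    #ifdef FEATURE_C
      if (features & 1)      /* error: 'features' undeclared */
        {
          /* NEON-style setup would go here */
        }
    #endif
    }
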
Bug: https://bugs.gentoo.org/831397 --- cipher/cipher-gcm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index a039c5e9..22834f35 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -583,7 +583,8 @@ static void setupM (gcry_cipher_hd_t c) { #if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ - defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) + defined(GCM_USE_ARM_NEON) || defined(GCM_USE_S390X_CRYPTO) || \ + defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif -- 2.34.1 From gniibe at fsij.org Thu Jan 20 07:40:15 2022 From: gniibe at fsij.org (NIIBE Yutaka) Date: Thu, 20 Jan 2022 15:40:15 +0900 Subject: Balloon hashing (was: Argon2) In-Reply-To: References: <87v91yiu99.fsf@akagi.fsij.org> <584e3784-609d-b56a-1a00-ed8f244e0b42@iki.fi> <87v91pngwb.fsf@wheatstone.g10code.de> Message-ID: <87k0euoqpc.fsf@akagi.fsij.org> Hello, Last October, I wrote about possible addition of Argon2 to libgcrypt. Today, I am considering adding Balloon instead (or as well as Argon2). Background: These days, we try to prepare FIPS mode for coming libgcrypt 1.10. In this context of FIPS compliant things, I'm afraid Argon2 won't be approved algo by FIPS (in future). This week, I read this document of NIST: https://pages.nist.gov/800-63-3/sp800-63b.html#sec5 and it addresses Balloon [0] as one of examples. And I found that Balloon is more FIPS friendly, as it can use FIPS approved hash function. Just like Argon2, it has three parameters (parallelism, space cost, and time cost). Thus, it has same problem with the gcry_kdf_derive API (which only has "iterations"). For parallelism, Balloon approach is straight forward: tweaking salt for each worker thread, parallel computation by threads, and merging results by XOR. (It's a bit simpler than Argon2 where its H0 includes parallelism parameter.) So, I think that we could only offer single-thread version of Balloon by libgcrypt and assume use of parallelism by an application. This way, we may avoid introducing thread dependency in libgcrypt. [0] https://crypto.stanford.edu/balloon/ -- From bad at bsd.de Mon Jan 24 18:38:55 2022 From: bad at bsd.de (Christoph Badura) Date: Mon, 24 Jan 2022 18:38:55 +0100 Subject: PATCH random/rndgetentropy.c: fix build failure on macOS Message-ID: <20220124173855.GD23126@irregular-apocalypse.k.bsd.de> Before the weekend I did a speedo.mk build of gnupg off the master branches on an Intel MacBook running Big Sur with Xcode 13.2.1 using the MacOS SDK 12.1. libgcrypt fails in rndgetentropy.c because the prototype for getentropy() is missing. The prototype is provided by sys/random.h per the man pages. The following patch fixes this for me. --chris 1 file changed, 3 insertions(+) random/rndgetentropy.c | 3 +++ modified random/rndgetentropy.c @@ -23,6 +23,9 @@ #include #include #include +#ifdef __APPLE__ +#include /* getentropy(2) lives here */ +#endif #include #include #include From jussi.kivilinna at iki.fi Fri Jan 28 20:06:13 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:13 +0200 Subject: [PATCH 1/4] tests/t-kdf: few changes to pthread example and fix win32/win64 builds Message-ID: <20220128190616.884237-1-jussi.kivilinna@iki.fi> * src/gcrypt.h.in (gcry_kdf_thread_ops_t): New based on 'struct gcry_kdf_thread_ops'. (gcry_kdf_compute): Use 'gcry_kdf_thread_ops_t' instead of 'struct gcry_kdf_thread_ops'. 
* tests/Makefile.am: Define 't_kdf_LDADD' and 't_kdf_CFLAGS' on win32/win64 target too. * tests/t-kdf.c (pthread_jobs_launch_job): Set 'oldest_thread_idx' on first thread creation. (wait_all_jobs_completion): Reset 'oldest_thread_idx' to -1. (my_kdf_derive): Merge HAVE_PTHREAD ifdefs; Initialize 'oldest_thread_idx' to -1. -- Windows build was not working because of missing HAVE_PTHREAD in 't-kdf.c' and LDADD/CFLAGS issue in 'Makefile.am'. Signed-off-by: Jussi Kivilinna --- src/gcrypt.h.in | 7 ++++--- tests/Makefile.am | 2 ++ tests/t-kdf.c | 46 +++++++++++++++++++++++----------------------- 3 files changed, 29 insertions(+), 26 deletions(-) diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 5e016932..680f634f 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1620,11 +1620,12 @@ typedef int (*gcry_kdf_lauch_job_t) (void *jobs_context, typedef int (*gcry_kdf_wait_all_jobs_completion_t) (void *jobs_context); /* Exposed structure for KDF computation to decouple thread functionality. */ -struct gcry_kdf_thread_ops { +typedef struct gcry_kdf_thread_ops +{ void *jobs_context; gcry_kdf_lauch_job_t launch_job; gcry_kdf_wait_all_jobs_completion_t wait_all_jobs_completion; -}; +} gcry_kdf_thread_ops_t; gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, const unsigned long *param, unsigned int paramlen, @@ -1633,7 +1634,7 @@ gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, const void *key, size_t keylen, const void *ad, size_t adlen); gcry_error_t gcry_kdf_compute (gcry_kdf_hd_t h, - const struct gcry_kdf_thread_ops *ops); + const gcry_kdf_thread_ops_t *ops); gcry_error_t gcry_kdf_final (gcry_kdf_hd_t h, size_t resultlen, void *result); void gcry_kdf_close (gcry_kdf_hd_t h); diff --git a/tests/Makefile.am b/tests/Makefile.am index b42156f0..e6953fd3 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -89,6 +89,8 @@ if HAVE_W32_SYSTEM xtestsuite_libs = ../src/.libs/libgcrypt-20.dll \ $(prefix)/bin/libgpg-error*-0.dll xtestsuite_driver = .libs/testdrv.exe +t_kdf_LDADD = $(standard_ldadd) $(GPG_ERROR_LIBS) @LDADD_FOR_TESTS_KLUDGE@ +t_kdf_CFLAGS = $(GPG_ERROR_CFLAGS) else xtestsuite_libs = ../src/.libs/libgcrypt.so* xtestsuite_driver = testdrv diff --git a/tests/t-kdf.c b/tests/t-kdf.c index 59559a4c..d61159e3 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1255,7 +1255,8 @@ struct user_defined_threads_ctx int num_threads_running; pthread_attr_t attr; pthread_t thread[MAX_THREADS]; - struct job_thread_param { + struct job_thread_param + { void (*job) (void *work_priv); void *priv; } work[MAX_THREADS]; @@ -1275,8 +1276,7 @@ pthread_jobs_launch_job (void *jobs_context, { struct user_defined_threads_ctx *ctx = jobs_context; - if (ctx->num_threads_running - && ctx->next_thread_idx == ctx->oldest_thread_idx) + if (ctx->next_thread_idx == ctx->oldest_thread_idx) { assert (ctx->num_threads_running == MAX_THREADS); /* thread limit reached, join a thread */ @@ -1289,6 +1289,8 @@ pthread_jobs_launch_job (void *jobs_context, ctx->work[ctx->next_thread_idx].priv = work_priv; pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, job_thread, &ctx->work[ctx->next_thread_idx]); + if (ctx->oldest_thread_idx < 0) + ctx->oldest_thread_idx = ctx->next_thread_idx; ctx->next_thread_idx = (ctx->next_thread_idx + 1) % MAX_THREADS; ctx->num_threads_running++; return 0; @@ -1308,7 +1310,7 @@ wait_all_jobs_completion (void *jobs_context) /* reset context for next round of parallel work */ ctx->num_threads_running = 0; - ctx->oldest_thread_idx = 0; + 
ctx->oldest_thread_idx = -1; ctx->next_thread_idx = 0; return 0; @@ -1327,9 +1329,8 @@ my_kdf_derive (int parallel, { gcry_error_t err; gcry_kdf_hd_t hd; -#ifdef HAVE_PTHREAD - struct user_defined_threads_ctx jobs_context; -#endif + + (void)parallel; err = gcry_kdf_open (&hd, algo, subalgo, params, paramslen, pass, passlen, salt, saltlen, key, keylen, @@ -1340,7 +1341,16 @@ my_kdf_derive (int parallel, #ifdef HAVE_PTHREAD if (parallel) { + struct user_defined_threads_ctx jobs_context; + const gcry_kdf_thread_ops_t ops = + { + &jobs_context, + pthread_jobs_launch_job, + wait_all_jobs_completion + }; + memset (&jobs_context, 0, sizeof (struct user_defined_threads_ctx)); + jobs_context.oldest_thread_idx = -1; if (pthread_attr_init (&jobs_context.attr)) { @@ -1357,26 +1367,16 @@ my_kdf_derive (int parallel, gcry_kdf_close (hd); return err; } - } -#endif - - if (!parallel) - err = gcry_kdf_compute (hd, NULL); - else - { - struct gcry_kdf_thread_ops ops = { - &jobs_context, - pthread_jobs_launch_job, - wait_all_jobs_completion - }; err = gcry_kdf_compute (hd, &ops); - } -#ifdef HAVE_PTHREAD - if (parallel) - pthread_attr_destroy (&jobs_context. attr); + pthread_attr_destroy (&jobs_context. attr); + } + else #endif + { + err = gcry_kdf_compute (hd, NULL); + } if (!err) err = gcry_kdf_final (hd, outlen, out); -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:14 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:14 +0200 Subject: [PATCH 2/4] Rename KDF job functions and function types In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-2-jussi.kivilinna@iki.fi> * src/gcrypt.h.in (gcry_kdf_job_fn_t): New. (gcry_kdf_dispatch_job_fn_t): Renamed from 'gcry_kdf_lauch_job_t'; Use 'gcry_kdf_job_fn_t' for function pointer parameter. (gcry_kdf_wait_all_jobs_fn_t): Renamed from 'gcry_kdf_wait_all_jobs_completion_t'. (gcry_kdf_thread_ops_t): Rename functions to 'dispatch_job' and 'wait_all_jobs'. * cipher/kdf.c (argon2_compute): Change to use 'dispatch_job' and 'wait_all_jobs'. * tests/t-kdf.c (job_thread_param, pthread_jobs_launch_job): Use 'gcry_kdf_job_fn_t' type for 'job'. -- Rename 'launch_job' to 'dispatch_job', dispatch feels better word to describe the action here. Also remove '_completion' from wait_all function name as it makes name unnecessary long. Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 6 +++--- src/gcrypt.h.in | 14 +++++++------- tests/t-kdf.c | 8 ++++---- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index 94cd064f..d426b608 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -719,14 +719,14 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) thread_data->lane = l; if (ops) - ops->launch_job (ops->jobs_context, - argon2_compute_segment, thread_data); + ops->dispatch_job (ops->jobs_context, + argon2_compute_segment, thread_data); else argon2_compute_segment (thread_data); } if (ops) - ops->wait_all_jobs_completion (ops->jobs_context); + ops->wait_all_jobs (ops->jobs_context); } return 0; diff --git a/src/gcrypt.h.in b/src/gcrypt.h.in index 680f634f..2fd47292 100644 --- a/src/gcrypt.h.in +++ b/src/gcrypt.h.in @@ -1613,18 +1613,18 @@ gpg_error_t gcry_kdf_derive (const void *passphrase, size_t passphraselen, /* Another API to derive a key from a passphrase. 
*/ typedef struct gcry_kdf_handle *gcry_kdf_hd_t; - -typedef int (*gcry_kdf_lauch_job_t) (void *jobs_context, - void (*job) (void *work_priv), - void *work_priv); -typedef int (*gcry_kdf_wait_all_jobs_completion_t) (void *jobs_context); +typedef void (*gcry_kdf_job_fn_t) (void *priv); +typedef int (*gcry_kdf_dispatch_job_fn_t) (void *jobs_context, + gcry_kdf_job_fn_t job_fn, + void *job_priv); +typedef int (*gcry_kdf_wait_all_jobs_fn_t) (void *jobs_context); /* Exposed structure for KDF computation to decouple thread functionality. */ typedef struct gcry_kdf_thread_ops { void *jobs_context; - gcry_kdf_lauch_job_t launch_job; - gcry_kdf_wait_all_jobs_completion_t wait_all_jobs_completion; + gcry_kdf_dispatch_job_fn_t dispatch_job; + gcry_kdf_wait_all_jobs_fn_t wait_all_jobs; } gcry_kdf_thread_ops_t; gcry_error_t gcry_kdf_open (gcry_kdf_hd_t *hd, int algo, int subalgo, diff --git a/tests/t-kdf.c b/tests/t-kdf.c index d61159e3..8844e111 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1257,7 +1257,7 @@ struct user_defined_threads_ctx pthread_t thread[MAX_THREADS]; struct job_thread_param { - void (*job) (void *work_priv); + gcry_kdf_job_fn_t job; void *priv; } work[MAX_THREADS]; }; @@ -1271,8 +1271,8 @@ job_thread (void *p) } static int -pthread_jobs_launch_job (void *jobs_context, - void (*job) (void *work_priv), void *work_priv) +pthread_jobs_launch_job (void *jobs_context, gcry_kdf_job_fn_t job, + void *job_priv) { struct user_defined_threads_ctx *ctx = jobs_context; @@ -1286,7 +1286,7 @@ pthread_jobs_launch_job (void *jobs_context, } ctx->work[ctx->next_thread_idx].job = job; - ctx->work[ctx->next_thread_idx].priv = work_priv; + ctx->work[ctx->next_thread_idx].priv = job_priv; pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, job_thread, &ctx->work[ctx->next_thread_idx]); if (ctx->oldest_thread_idx < 0) -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:15 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:15 +0200 Subject: [PATCH 3/4] kdf/argon2: use BLAKE2b hash_buffers function instead of _gcry_md_* In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-3-jussi.kivilinna@iki.fi> * cipher/kdf.c (argon2_fill_first_blocks): Convert to use iov hash_buffers API instead of _gcry_md_*. -- More direct use of BLAKE2b avoids overhead from md object creation and cleanup. Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 97 ++++++++++++++++++++++++++++------------------------ 1 file changed, 52 insertions(+), 45 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index d426b608..74c5b753 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -367,59 +367,66 @@ xor_block (u64 *dst, const u64 *src) static gpg_err_code_t argon2_fill_first_blocks (argon2_ctx_t a) { - gpg_err_code_t ec; unsigned char h0_01_i[72]; - const unsigned char *digest; - unsigned char buf[4]; + unsigned char buf[10][4]; + gcry_buffer_t iov[8]; + unsigned int iov_count = 0; int i; - gcry_md_hd_t hd; - - ec = _gcry_md_open (&hd, GCRY_MD_BLAKE2B_512, 0); - if (ec) - return ec; /* Generate H0. 
*/ - buf_put_le32 (buf, a->lanes); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->outlen); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->m_cost); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->passes); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, ARGON2_VERSION); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->hash_type); - _gcry_md_write (hd, buf, 4); - - buf_put_le32 (buf, a->passwordlen); - _gcry_md_write (hd, buf, 4); - _gcry_md_write (hd, a->password, a->passwordlen); - - buf_put_le32 (buf, a->saltlen); - _gcry_md_write (hd, buf, 4); - _gcry_md_write (hd, a->salt, a->saltlen); - - buf_put_le32 (buf, a->keylen); - _gcry_md_write (hd, buf, 4); + buf_put_le32 (buf[0], a->lanes); + buf_put_le32 (buf[1], a->outlen); + buf_put_le32 (buf[2], a->m_cost); + buf_put_le32 (buf[3], a->passes); + buf_put_le32 (buf[4], ARGON2_VERSION); + buf_put_le32 (buf[5], a->hash_type); + buf_put_le32 (buf[6], a->passwordlen); + iov[iov_count].data = buf[0]; + iov[iov_count].len = 4 * 7; + iov[iov_count].off = 0; + iov_count++; + iov[iov_count].data = (void *)a->password; + iov[iov_count].len = a->passwordlen; + iov[iov_count].off = 0; + iov_count++; + + buf_put_le32 (buf[7], a->saltlen); + iov[iov_count].data = buf[7]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; + iov[iov_count].data = (void *)a->salt; + iov[iov_count].len = a->saltlen; + iov[iov_count].off = 0; + iov_count++; + + buf_put_le32 (buf[8], a->keylen); + iov[iov_count].data = buf[8]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; if (a->key) - _gcry_md_write (hd, a->key, a->keylen); + { + iov[iov_count].data = (void *)a->key; + iov[iov_count].len = a->keylen; + iov[iov_count].off = 0; + iov_count++; + } - buf_put_le32 (buf, a->adlen); - _gcry_md_write (hd, buf, 4); + buf_put_le32 (buf[9], a->adlen); + iov[iov_count].data = buf[9]; + iov[iov_count].len = 4; + iov[iov_count].off = 0; + iov_count++; if (a->ad) - _gcry_md_write (hd, a->ad, a->adlen); - - digest = _gcry_md_read (hd, GCRY_MD_BLAKE2B_512); - - memcpy (h0_01_i, digest, 64); + { + iov[iov_count].data = (void *)a->ad; + iov[iov_count].len = a->adlen; + iov[iov_count].off = 0; + iov_count++; + } - _gcry_md_close (hd); + _gcry_digest_spec_blake2b_512.hash_buffers (h0_01_i, 64, iov, iov_count); for (i = 0; i < a->lanes; i++) { -- 2.32.0 From jussi.kivilinna at iki.fi Fri Jan 28 20:06:16 2022 From: jussi.kivilinna at iki.fi (Jussi Kivilinna) Date: Fri, 28 Jan 2022 21:06:16 +0200 Subject: [PATCH 4/4] kdf: handle errors from thread dispatch/wait functions In-Reply-To: <20220128190616.884237-1-jussi.kivilinna@iki.fi> References: <20220128190616.884237-1-jussi.kivilinna@iki.fi> Message-ID: <20220128190616.884237-4-jussi.kivilinna@iki.fi> * cipher/kdf.c (argon2_compute): Handle failed job dispatch/wait. * tests/t-kdf.c (pthread_jobs_launch_job) (wait_all_jobs_completion): Handle errors returned from pthread functions. -- This allows thread helpers to return error code, which causes KDF processing to stop. 
Signed-off-by: Jussi Kivilinna --- cipher/kdf.c | 15 ++++++++++++--- tests/t-kdf.c | 24 ++++++++++++++++++++---- 2 files changed, 32 insertions(+), 7 deletions(-) diff --git a/cipher/kdf.c b/cipher/kdf.c index 74c5b753..79dc6cd8 100644 --- a/cipher/kdf.c +++ b/cipher/kdf.c @@ -706,6 +706,7 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) unsigned int r; unsigned int s; unsigned int l; + int ret; ec = argon2_fill_first_blocks (a); if (ec) @@ -726,14 +727,22 @@ argon2_compute (argon2_ctx_t a, const struct gcry_kdf_thread_ops *ops) thread_data->lane = l; if (ops) - ops->dispatch_job (ops->jobs_context, - argon2_compute_segment, thread_data); + { + ret = ops->dispatch_job (ops->jobs_context, + argon2_compute_segment, thread_data); + if (ret < 0) + return GPG_ERR_CANCELED; + } else argon2_compute_segment (thread_data); } if (ops) - ops->wait_all_jobs (ops->jobs_context); + { + ret = ops->wait_all_jobs (ops->jobs_context); + if (ret < 0) + return GPG_ERR_CANCELED; + } } return 0; diff --git a/tests/t-kdf.c b/tests/t-kdf.c index 8844e111..4c82fed8 100644 --- a/tests/t-kdf.c +++ b/tests/t-kdf.c @@ -1270,25 +1270,38 @@ job_thread (void *p) pthread_exit (NULL); } +static int +wait_all_jobs_completion (void *jobs_context); + static int pthread_jobs_launch_job (void *jobs_context, gcry_kdf_job_fn_t job, void *job_priv) { struct user_defined_threads_ctx *ctx = jobs_context; + int ret; if (ctx->next_thread_idx == ctx->oldest_thread_idx) { assert (ctx->num_threads_running == MAX_THREADS); /* thread limit reached, join a thread */ - pthread_join (ctx->thread[ctx->oldest_thread_idx], NULL); + ret = pthread_join (ctx->thread[ctx->oldest_thread_idx], NULL); + if (ret) + return -1; ctx->oldest_thread_idx = (ctx->oldest_thread_idx + 1) % MAX_THREADS; ctx->num_threads_running--; } ctx->work[ctx->next_thread_idx].job = job; ctx->work[ctx->next_thread_idx].priv = job_priv; - pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, - job_thread, &ctx->work[ctx->next_thread_idx]); + ret = pthread_create (&ctx->thread[ctx->next_thread_idx], &ctx->attr, + job_thread, &ctx->work[ctx->next_thread_idx]); + if (ret) + { + /* could not create new thread. */ + (void)wait_all_jobs_completion (jobs_context); + return -1; + } + if (ctx->oldest_thread_idx < 0) ctx->oldest_thread_idx = ctx->next_thread_idx; ctx->next_thread_idx = (ctx->next_thread_idx + 1) % MAX_THREADS; @@ -1301,11 +1314,14 @@ wait_all_jobs_completion (void *jobs_context) { struct user_defined_threads_ctx *ctx = jobs_context; int i, idx; + int ret; for (i = 0; i < ctx->num_threads_running; i++) { idx = (ctx->oldest_thread_idx + i) % MAX_THREADS; - pthread_join (ctx->thread[idx], NULL); + ret = pthread_join (ctx->thread[idx], NULL); + if (ret) + return -1; } /* reset context for next round of parallel work */ -- 2.32.0