[PATCH 2/4] Optimize Keccak 64-bit absorb functions
Jussi Kivilinna
jussi.kivilinna at iki.fi
Sun Nov 1 20:06:14 CET 2015
* cipher/keccak.c [USE_64BIT] [__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
* cipher/keccak.c [USE_64BIT] [!__x86_64__] (absorb_lanes64_8)
(absorb_lanes64_4, absorb_lanes64_2, absorb_lanes64_1): New.
[USE_64BIT] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT] (keccak_absorb_lanes64): Remove.
[USE_64BIT_SHLD] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_SHLD] (keccak_absorb_lanes64_shld): Remove.
[USE_64BIT_BMI2] (KECCAK_F1600_ABSORB_FUNC_NAME): New.
[USE_64BIT_BMI2] (keccak_absorb_lanes64_bmi2): Remove.
* cipher/keccak_permute_64.h (KECCAK_F1600_ABSORB_FUNC_NAME): New.
--
Optimize the 64-bit absorb functions for a small speed-up. After this
change, the 64-bit BMI2 implementation matches the speed of the fastest
SUPERCOP results for Intel Haswell CPUs (long messages).
Benchmark on Intel Haswell @ 3.2 GHz:
Before:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.32 ns/B 411.7 MiB/s 7.41 c/B
SHAKE256 | 2.84 ns/B 336.2 MiB/s 9.08 c/B
SHA3-224 | 2.69 ns/B 354.9 MiB/s 8.60 c/B
SHA3-256 | 2.84 ns/B 336.0 MiB/s 9.08 c/B
SHA3-384 | 3.69 ns/B 258.4 MiB/s 11.81 c/B
SHA3-512 | 5.30 ns/B 179.9 MiB/s 16.97 c/B
After:
| nanosecs/byte mebibytes/sec cycles/byte
SHAKE128 | 2.27 ns/B 420.6 MiB/s 7.26 c/B
SHAKE256 | 2.79 ns/B 341.4 MiB/s 8.94 c/B
SHA3-224 | 2.64 ns/B 361.7 MiB/s 8.44 c/B
SHA3-256 | 2.79 ns/B 341.5 MiB/s 8.94 c/B
SHA3-384 | 3.65 ns/B 261.4 MiB/s 11.68 c/B
SHA3-512 | 5.27 ns/B 181.0 MiB/s 16.87 c/B
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
cipher/keccak.c | 159 ++++++++++++++++++++++++++------------------
cipher/keccak_permute_64.h | 99 +++++++++++++++++++++++++++
2 files changed, 192 insertions(+), 66 deletions(-)
diff --git a/cipher/keccak.c b/cipher/keccak.c
index f4f0ef3..ce57860 100644
--- a/cipher/keccak.c
+++ b/cipher/keccak.c
@@ -223,38 +223,105 @@ keccak_absorb_lane32bi(u32 *lane, u32 x0, u32 x1)
/* Construct generic 64-bit implementation. */
#ifdef USE_64BIT
+#if __GNUC__ >= 4 && defined(__x86_64__)
+/* XOR the 64 bytes (eight 64-bit lanes) at IN into DST using unaligned 128-bit loads/stores. */
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{ /* bytes are XORed as-is; on little-endian x86-64 this equals XORing little-endian lane values */
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" /* xmm0-xmm3: the four 16-byte pieces of dst */
+ "movdqu 0*16(%[in]), %%xmm4\n\t" /* xmm4/xmm5: input pieces, reused for the second half */
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "movdqu 2*16(%[dst]), %%xmm2\n\t"
+ "movdqu 3*16(%[dst]), %%xmm3\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu 2*16(%[in]), %%xmm4\n\t"
+ "movdqu 3*16(%[in]), %%xmm5\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "pxor %%xmm4, %%xmm2\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ "pxor %%xmm5, %%xmm3\n\t"
+ "movdqu %%xmm2, 2*16(%[dst])\n\t"
+ "movdqu %%xmm3, 3*16(%[dst])\n\t"
+ : /* no output operands; dst is updated through memory */
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "memory"); /* "memory": the asm reads and writes memory not named by any operand */
+}
+/* XOR the 32 bytes (four 64-bit lanes) at IN into DST using unaligned 128-bit loads/stores. */
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" /* xmm0/xmm1: the two 16-byte pieces of dst */
+ "movdqu 0*16(%[in]), %%xmm4\n\t" /* xmm4/xmm5: the matching input pieces */
+ "movdqu 1*16(%[dst]), %%xmm1\n\t"
+ "movdqu 1*16(%[in]), %%xmm5\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "pxor %%xmm5, %%xmm1\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ "movdqu %%xmm1, 1*16(%[dst])\n\t"
+ : /* no output operands; dst is updated through memory */
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm1", "xmm4", "xmm5", "memory"); /* "memory": the asm reads and writes memory not named by any operand */
+}
+/* XOR the 16 bytes (two 64-bit lanes) at IN into DST using one unaligned 128-bit load/store pair. */
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{
+ asm ("movdqu 0*16(%[dst]), %%xmm0\n\t" /* xmm0: state, xmm4: input */
+ "movdqu 0*16(%[in]), %%xmm4\n\t"
+ "pxor %%xmm4, %%xmm0\n\t"
+ "movdqu %%xmm0, 0*16(%[dst])\n\t"
+ : /* no output operands; dst is updated through memory */
+ : [dst] "r" (dst), [in] "r" (in)
+ : "xmm0", "xmm4", "memory"); /* "memory": the asm reads and writes memory not named by any operand */
+}
+
+#else /* __x86_64__ */
+/* Portable variant: XOR eight 64-bit values read from IN with buf_get_le64 into DST[0..7]. */
+static inline void absorb_lanes64_8(u64 *dst, const byte *in)
+{ /* buf_get_le64 is defined elsewhere; presumably a little-endian 64-bit load -- confirm */
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+ dst[4] ^= buf_get_le64(in + 8 * 4);
+ dst[5] ^= buf_get_le64(in + 8 * 5);
+ dst[6] ^= buf_get_le64(in + 8 * 6);
+ dst[7] ^= buf_get_le64(in + 8 * 7);
+}
+/* Portable variant: XOR four 64-bit values read from IN with buf_get_le64 into DST[0..3]. */
+static inline void absorb_lanes64_4(u64 *dst, const byte *in)
+{ /* each lane occupies 8 consecutive input bytes */
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+ dst[2] ^= buf_get_le64(in + 8 * 2);
+ dst[3] ^= buf_get_le64(in + 8 * 3);
+}
+/* Portable variant: XOR two 64-bit values read from IN with buf_get_le64 into DST[0..1]. */
+static inline void absorb_lanes64_2(u64 *dst, const byte *in)
+{ /* each lane occupies 8 consecutive input bytes */
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+ dst[1] ^= buf_get_le64(in + 8 * 1);
+}
+
+#endif /* !__x86_64__ */
+/* XOR one 64-bit value read from IN with buf_get_le64 into DST[0]; defined outside the x86-64 conditional, so both builds use it. */
+static inline void absorb_lanes64_1(u64 *dst, const byte *in)
+{
+ dst[0] ^= buf_get_le64(in + 8 * 0);
+}
+
+
# define ANDN64(x, y) (~(x) & (y))
# define ROL64(x, n) (((x) << ((unsigned int)n & 63)) | \
((x) >> ((64 - (unsigned int)(n)) & 63)))
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_generic64_ops =
{
@@ -279,33 +346,13 @@ static const keccak_ops_t keccak_generic64_ops =
tmp; })
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_shld
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_shld
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_shld(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64_shld(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_shld_64_ops =
{
@@ -335,33 +382,13 @@ static const keccak_ops_t keccak_shld_64_ops =
tmp; })
# define KECCAK_F1600_PERMUTE_FUNC_NAME keccak_f1600_state_permute64_bmi2
+# define KECCAK_F1600_ABSORB_FUNC_NAME keccak_absorb_lanes64_bmi2
# include "keccak_permute_64.h"
# undef ANDN64
# undef ROL64
# undef KECCAK_F1600_PERMUTE_FUNC_NAME
-
-static unsigned int
-keccak_absorb_lanes64_bmi2(KECCAK_STATE *hd, int pos, const byte *lanes,
- unsigned int nlanes, int blocklanes)
-{
- unsigned int burn = 0;
-
- while (nlanes)
- {
- hd->u.state64[pos] ^= buf_get_le64(lanes);
- lanes += 8;
- nlanes--;
-
- if (++pos == blocklanes)
- {
- burn = keccak_f1600_state_permute64_bmi2(hd);
- pos = 0;
- }
- }
-
- return burn;
-}
+# undef KECCAK_F1600_ABSORB_FUNC_NAME
static const keccak_ops_t keccak_bmi2_64_ops =
{
diff --git a/cipher/keccak_permute_64.h b/cipher/keccak_permute_64.h
index 1264f19..6f24217 100644
--- a/cipher/keccak_permute_64.h
+++ b/cipher/keccak_permute_64.h
@@ -288,3 +288,102 @@ KECCAK_F1600_PERMUTE_FUNC_NAME(KECCAK_STATE *hd)
return sizeof(void *) * 4 + sizeof(u64) * 12 * 5;
}
+/* Absorb NLANES 64-bit lanes from LANES into HD, starting at lane POS; permute after every BLOCKLANES lanes. */
+static unsigned int
+KECCAK_F1600_ABSORB_FUNC_NAME(KECCAK_STATE *hd, int pos, const byte *lanes,
+ unsigned int nlanes, int blocklanes)
+{
+ unsigned int burn = 0;
+ /* Alternate between the unrolled whole-block path and the lane-by-lane path until the input is consumed. */
+ while (nlanes)
+ {
+ switch (blocklanes) /* no default: other block sizes use only the lane-by-lane loop below */
+ {
+ case 21:
+ /* SHAKE128: 21 lanes (168 bytes) per block, absorbed as 8 + 4 + 8 + 1 lanes */
+ while (pos == 0 && nlanes >= 21) /* only at a block boundary with a whole block of input left */
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_8(&hd->u.state64[12], lanes + 8 * 12);
+ absorb_lanes64_1(&hd->u.state64[20], lanes + 8 * 20);
+ lanes += 8 * 21;
+ nlanes -= 21;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 18:
+ /* SHA3-224: 18 lanes (144 bytes) per block, absorbed as 8 + 2 + 8 lanes */
+ while (pos == 0 && nlanes >= 18) /* only at a block boundary with a whole block of input left */
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_2(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_8(&hd->u.state64[10], lanes + 8 * 10);
+ lanes += 8 * 18;
+ nlanes -= 18;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 17:
+ /* SHA3-256 & SHAKE256: 17 lanes (136 bytes) per block, absorbed as 8 + 8 + 1 lanes */
+ while (pos == 0 && nlanes >= 17) /* only at a block boundary with a whole block of input left */
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_8(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_1(&hd->u.state64[16], lanes + 8 * 16);
+ lanes += 8 * 17;
+ nlanes -= 17;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 13:
+ /* SHA3-384: 13 lanes (104 bytes) per block, absorbed as 8 + 4 + 1 lanes */
+ while (pos == 0 && nlanes >= 13) /* only at a block boundary with a whole block of input left */
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_4(&hd->u.state64[8], lanes + 8 * 8);
+ absorb_lanes64_1(&hd->u.state64[12], lanes + 8 * 12);
+ lanes += 8 * 13;
+ nlanes -= 13;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+
+ case 9:
+ /* SHA3-512: 9 lanes (72 bytes) per block, absorbed as 8 + 1 lanes */
+ while (pos == 0 && nlanes >= 9) /* only at a block boundary with a whole block of input left */
+ {
+ absorb_lanes64_8(&hd->u.state64[0], lanes + 8 * 0);
+ absorb_lanes64_1(&hd->u.state64[8], lanes + 8 * 8);
+ lanes += 8 * 9;
+ nlanes -= 9;
+
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ }
+ break;
+ }
+ /* Lane-by-lane path: handles a start in mid-block, a trailing partial block, and unlisted block sizes. */
+ while (nlanes)
+ {
+ hd->u.state64[pos] ^= buf_get_le64(lanes);
+ lanes += 8;
+ nlanes--;
+ /* Block complete: permute and restart at lane 0. */
+ if (++pos == blocklanes)
+ {
+ burn = KECCAK_F1600_PERMUTE_FUNC_NAME(hd);
+ pos = 0;
+ break; /* back to the outer loop so the whole-block path can take over */
+ }
+ }
+ }
+ /* Result of the last permutation call, or 0 if none ran; presumably a stack-burn depth -- confirm against callers. */
+ return burn;
+}
More information about the Gcrypt-devel
mailing list