From chris at chatsecure.org  Wed Oct  5 02:09:15 2016
From: chris at chatsecure.org (Chris Ballinger)
Date: Tue, 4 Oct 2016 17:09:15 -0700
Subject: gen-posix-lock-obj headers for iOS simulator
Message-ID: 

I hit some issues with our OTRKit build scripts when upgrading to libgcrypt
1.7.3 from 1.6.4, due to missing syscfg headers when cross-compiling for
the iOS simulator. I generated the new headers below:

-------------- next part --------------
A non-text attachment was scrubbed...
Name: lock-obj-pub.i386-apple-darwin.h
Type: text/x-chdr
Size: 667 bytes
-------------- next part --------------
A non-text attachment was scrubbed...
Name: lock-obj-pub.x86_64-apple-darwin.h
Type: text/x-chdr
Size: 553 bytes

From wk at gnupg.org  Fri Oct  7 16:26:18 2016
From: wk at gnupg.org (Werner Koch)
Date: Fri, 07 Oct 2016 16:26:18 +0200
Subject: gen-posix-lock-obj headers for iOS simulator
In-Reply-To: (Chris Ballinger's message of "Tue, 4 Oct 2016 17:09:15 -0700")
Message-ID: <87y4206xrp.fsf@wheatstone.g10code.de>

> I hit some issues with our OTRKit build scripts when upgrading to
> libgcrypt 1.7.3 from 1.6.4, due to missing syscfg headers when
> cross-compiling for the iOS simulator. I generated the new headers
> below:

Thanks. Will go into the next release.


Shalom-Salam,

   Werner

-- 
Die Gedanken sind frei.  Ausnahmen regelt ein Bundesgesetz.
("Thoughts are free.  Exceptions are regulated by a federal law.")

From jussi.kivilinna at iki.fi  Sun Oct  9 17:13:56 2016
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 09 Oct 2016 18:13:56 +0300
Subject: [PATCH 1/2] GCM: Add bulk processing for ARMv8/AArch32 implementation
Message-ID: <147602603697.12134.11212163233767134875.stgit@localhost6.localdomain6>

* cipher/cipher-gcm-armv8-aarch32-ce.S: Add 4 blocks bulk processing.
* tests/basic.c (check_digests): Print correct data length for "?"
tests.
(check_one_mac): Add large 1000000 bytes tests, when input is "!" or
"?".
(check_mac): Add "?" test vectors for HMAC, CMAC, GMAC and POLY1305.
--
Benchmark on Cortex-A53 (1152 MHz):

Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES           |     0.924 ns/B    1032.2 MiB/s      1.06 c/B

After (1.21x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES           |     0.764 ns/B    1248.2 MiB/s     0.880 c/B

Signed-off-by: Jussi Kivilinna
---
 0 files changed

diff --git a/cipher/cipher-gcm-armv8-aarch32-ce.S b/cipher/cipher-gcm-armv8-aarch32-ce.S
index b879fb2..b61a787 100644
--- a/cipher/cipher-gcm-armv8-aarch32-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch32-ce.S
@@ -57,69 +57,125 @@ gcry_gcm_reduction_constant:
 #define rhash_l d0
 #define rhash_h d1
 
-#define rbuf q1
-#define rbuf_l d2
-#define rbuf_h d3
+#define rh1 q1
+#define rh1_l d2
+#define rh1_h d3
 
-#define rh0 q2
-#define rh0_l d4
-#define rh0_h d5
+#define rbuf q2
+#define rbuf_l d4
+#define rbuf_h d5
 
-#define rt0 q3
-#define rt0_l d6
-#define rt0_h d7
+#define rbuf1 q3
+#define rbuf1_l d6
+#define rbuf1_h d7
 
-#define rr0 q8
-#define rr0_l d16
-#define rr0_h d17
+#define rbuf2 q4
+#define rbuf2_l d8
+#define rbuf2_h d9
 
-#define rr1 q9
-#define rr1_l d18
-#define rr1_h d19
+#define rbuf3 q5
+#define rbuf3_l d10
+#define rbuf3_h d11
+
+#define rh2 q6
+#define rh2_l d12
+#define rh2_h d13
+
+#define rh3 q7
+#define rh3_l d14
+#define rh3_h d15
+
+#define rh4 q8
+#define rh4_l d16
+#define rh4_h d17
+
+#define rr2 q9
+#define rr2_l d18
+#define rr2_h d19
+
+#define rr3 q10
+#define rr3_l d20
+#define rr3_h d21
+
+#define rr0 q11
+#define rr0_l d22
+#define rr0_h d23
+
+#define rr1 q12
+#define rr1_l d24
+#define rr1_h d25
+
+#define rt0 q13
+#define rt0_l d26
+#define rt0_h d27
+
+#define rt1 q14
+#define rt1_l d28
+#define rt1_h d29
 
 #define rrconst q15
 #define rrconst_l d30
 #define rrconst_h d31
 
-#define ia rbuf_h
-#define ib rbuf_l
-#define oa rh0_l
-#define ob rh0_h
-#define co rrconst_l
-#define ma rrconst_h
-
 /* GHASH macros */
 
 /* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
  * Cryptology - CT-RSA 2015" for details.
  */
 
-/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */
+/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
+ * Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
+ */
 #define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
         veor t##_h, b##_l, b##_h; \
         veor t##_l, a##_l, a##_h; \
         vmull.p64 r0, a##_l, b##_l; \
         vmull.p64 r1, a##_h, b##_h; \
         vmull.p64 t, t##_h, t##_l; \
-        interleave_op(); \
+        interleave_op; \
         veor t, r0; \
         veor t, r1; \
         veor r0##_h, t##_l; \
        veor r1##_l, t##_h;
 
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
+ */
+#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
+        veor tA##_h, bA##_l, bA##_h; \
+        veor tA##_l, aA##_l, aA##_h; \
+        veor tB##_h, bB##_l, bB##_h; \
+        veor tB##_l, aB##_l, aB##_h; \
+        vmull.p64 r0A, aA##_l, bA##_l; \
+        vmull.p64 r1A, aA##_h, bA##_h; \
+        vmull.p64 tA, tA##_h, tA##_l; \
+        vmull.p64 r0B, aB##_l, bB##_l; \
+        vmull.p64 r1B, aB##_h, bB##_h; \
+        vmull.p64 tB, tB##_h, tB##_l; \
+        interleave_op; \
+        veor tA, r0A; \
+        veor tA, r1A; \
+        veor tB, r0B; \
+        veor tB, r1B; \
+        veor r0A##_h, tA##_l; \
+        veor r1A##_l, tA##_h; \
+        veor r0B##_h, tB##_l; \
+        veor r1B##_l, tB##_h; \
 
 /* Input: 'r0:r1', Output: 'a' */
 #define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
         vmull.p64 t, r0##_l, rconst; \
         veor r0##_h, t##_l; \
         veor r1##_l, t##_h; \
-        interleave_op(); \
+        interleave_op; \
         vmull.p64 t, r0##_h, rconst; \
         veor r1, t; \
         veor a, r0, r1;
 
-#define _(...) /*_*/
-#define vrev_rbuf() vrev64.8 rbuf, rbuf;
-#define vext_rbuf() vext.8 rbuf, rbuf, rbuf, #8;
+#define _(...) __VA_ARGS__
+#define __ _()
 
 
 /* Other functional macros */
@@ -142,22 +198,128 @@ _gcry_ghash_armv8_ce_pmull:
  *    r3: nblocks
  *    %st+0: gcm_table
  */
-        push {r4, lr}
+        push {r4-r6, lr}
 
         cmp r3, #0
         beq .Ldo_nothing
 
-        GET_DATA_POINTER(lr, .Lrconst64, r4)
+        GET_DATA_POINTER(r4, .Lrconst64, lr)
 
-        subs r3, r3, #1
         vld1.64 {rhash}, [r1]
-        vld1.64 {rh0}, [r0]
+        vld1.64 {rh1}, [r0]
 
         vrev64.8 rhash, rhash /* byte-swap */
-        vld1.64 {rrconst_h}, [lr]
+        vld1.64 {rrconst_h}, [r4]
         vext.8 rhash, rhash, rhash, #8
 
+        cmp r3, #4
+        blo .Less_than_4
+
+        /* Bulk processing of 4 blocks per loop iteration. */
+
+        ldr r5, [sp, #(4*4)];
+        add r6, r5, #32
+
+        vpush {q4-q7}
+
+        vld1.64 {rh2-rh3}, [r5]
+        vld1.64 {rh4}, [r6]
+
+        vld1.64 {rbuf-rbuf1}, [r2]!
+        sub r3, r3, #4
+        vld1.64 {rbuf2-rbuf3}, [r2]!
+
+        cmp r3, #4
+        vrev64.8 rbuf, rbuf /* byte-swap */
+        vrev64.8 rbuf1, rbuf1 /* byte-swap */
+        vrev64.8 rbuf2, rbuf2 /* byte-swap */
+        vrev64.8 rbuf3, rbuf3 /* byte-swap */
+
+        vext.8 rbuf, rbuf, rbuf, #8
+        vext.8 rbuf1, rbuf1, rbuf1, #8
+        vext.8 rbuf2, rbuf2, rbuf2, #8
+        vext.8 rbuf3, rbuf3, rbuf3, #8
+        veor rhash, rhash, rbuf /* in0 ^ hash */
+
+        blo .Lend_4
+
+.Loop_4:
+        /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+        /* (in1) * H³ => rr0:rr1 */
+        PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+        vld1.64 {rbuf-rbuf1}, [r2]!
+        sub r3, r3, #4
+        veor rr0, rr0, rr2
+        veor rr1, rr1, rr3
+
+        /* (in2) * H² => rr2:rr3 */
+        /* (in3) * H¹ => rhash:rbuf3 */
+        PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
+                       _(vrev64.8 rbuf, rbuf))
+
+        vld1.64 {rbuf2}, [r2]!
+
+        vrev64.8 rbuf1, rbuf1
+        veor rr0, rr0, rr2
+        veor rr1, rr1, rr3
+
+        cmp r3, #4
+        vext.8 rbuf, rbuf, rbuf, #8
+        vext.8 rbuf1, rbuf1, rbuf1, #8
+
+        veor rr0, rr0, rhash
+        veor rr1, rr1, rbuf3
+
+        vld1.64 {rbuf3}, [r2]!
+
+        REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+                  _(vrev64.8 rbuf2, rbuf2;
+                    vrev64.8 rbuf3, rbuf3))
+
+        vext.8 rbuf2, rbuf2, rbuf2, #8
+        vext.8 rbuf3, rbuf3, rbuf3, #8
+        veor rhash, rhash, rbuf /* in0 ^ hash */
+
+        bhs .Loop_4
+
+.Lend_4:
+        /* (in0 ^ hash) * H⁴ => rr2:rr3 */
+        /* (in1) * H³ => rr0:rr1 */
+        PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)
+
+        /* (in2) * H² => rhash:rbuf */
+        /* (in3) * H¹ => rbuf1:rbuf2 */
+        PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
+                       _(veor rr0, rr0, rr2;
+                         veor rr1, rr1, rr3))
+
+        veor rr0, rr0, rhash
+        veor rr1, rr1, rbuf
+
+        veor rr0, rr0, rbuf1
+        veor rr1, rr1, rbuf2
+
+        REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
+                  _(CLEAR_REG(rr2);
+                    CLEAR_REG(rr3);
+                    CLEAR_REG(rbuf1);
+                    CLEAR_REG(rbuf2);
+                    CLEAR_REG(rbuf3);
+                    CLEAR_REG(rh2);
+                    CLEAR_REG(rh3);
+                    CLEAR_REG(rh4)))
+
+        vpop {q4-q7}
+
+        cmp r3, #0
+        beq .Ldone
+
+.Less_than_4:
+        /* Handle remaining blocks. */
+
+        vld1.64 {rbuf}, [r2]!
+        subs r3, r3, #1
 
         vrev64.8 rbuf, rbuf /* byte-swap */
         vext.8 rbuf, rbuf, rbuf, #8
@@ -169,30 +331,29 @@ _gcry_ghash_armv8_ce_pmull:
 
 .Loop:
         vld1.64 {rbuf}, [r2]!
         subs r3, r3, #1
-        PMUL_128x128(rr0, rr1, rh0, rhash, rt0, vrev_rbuf)
-        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, vext_rbuf)
+        PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
+        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
         veor rhash, rhash, rbuf
         bne .Loop
 
.Lend:
-        PMUL_128x128(rr0, rr1, rh0, rhash, rt0, _)
-        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _)
+        PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
+        REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))
 
+.Ldone:
         CLEAR_REG(rr1)
-        CLEAR_REG(rr0)
         vrev64.8 rhash, rhash /* byte-swap */
-        CLEAR_REG(rbuf)
         CLEAR_REG(rt0)
+        CLEAR_REG(rr0)
         vext.8 rhash, rhash, rhash, #8
-        CLEAR_REG(rh0)
-
+        CLEAR_REG(rt1)
         vst1.64 {rhash}, [r1]
         CLEAR_REG(rhash)
 
.Ldo_nothing:
         mov r0, #0
-        pop {r4, pc}
+        pop {r4-r6, pc}
.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;
 
@@ -208,28 +369,64 @@ _gcry_ghash_setup_armv8_ce_pmull:
  *    r1: gcm_table
  */
 
-        push {r4, lr}
+        vpush {q4-q7}
 
-        GET_DATA_POINTER(r4, .Lrconst64, lr)
+        GET_DATA_POINTER(r2, .Lrconst64, r3)
+
+        vld1.64 {rrconst_h}, [r2]
+
+#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
+        /* H <<< 1 */ \
+        vshr.s64 ma, ib, #63; \
+        vshr.u64 oa, ib, #63; \
+        vshr.u64 ob, ia, #63; \
+        vand ma, const_d; \
+        vshl.u64 ib, ib, #1; \
+        vshl.u64 ia, ia, #1; \
+        vorr ob, ib; \
+        vorr oa, ia; \
+        veor ob, ma; \
+        vst1.64 {oa, ob}, [r_out]
+
+        vld1.64 {rhash}, [r0]
+        vrev64.8 rhash, rhash /* byte-swap */
+        vext.8 rhash, rhash, rhash, #8
+
+        vmov rbuf1, rhash
+        GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */
 
-        /* H <<< 1 */
-        vld1.64 {ib,ia}, [r0]
-        vld1.64 {co}, [r4]
-        vrev64.8 ib, ib;
-        vrev64.8 ia, ia;
-        vshr.s64 ma, ib, #63
-        vshr.u64 oa, ib, #63
-        vshr.u64 ob, ia, #63
-        vand ma, co
-        vshl.u64 ib, ib, #1
-        vshl.u64 ia, ia, #1
-        vorr ob, ib
-        vorr oa, ia
-        veor ob, ma
-
-        vst1.64 {oa, ob}, [r0]
-
-        pop {r4, pc}
+        /* H² */
+        PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
+        REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
+        vmov rhash, rh2
+        GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
+        add r1, r1, #16
+
+        /* H³ */
+        PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
+        REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)
+
+        /* H⁴ */
+        PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
+        REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)
+
+        GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
+        add r1, r1, #16
+        GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */
+
+        CLEAR_REG(rt0)
+        CLEAR_REG(rt1)
+        CLEAR_REG(rr1)
+        CLEAR_REG(rr0)
+        CLEAR_REG(rh1)
+        CLEAR_REG(rh2)
+        CLEAR_REG(rh3)
+        CLEAR_REG(rh4)
+        CLEAR_REG(rhash)
+        CLEAR_REG(rbuf1)
+        CLEAR_REG(rrconst)
+        vpop {q4-q7}
+        bx lr
.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
 
 #endif
diff --git a/tests/basic.c b/tests/basic.c
index 96fb4cb..e5a325b 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -6902,7 +6902,7 @@ check_digests (void)
         fprintf (stderr, "  checking %s [%i] for length %d\n",
                  gcry_md_algo_name (algos[i].md),
                  algos[i].md,
-                 !strcmp (algos[i].data, "!")?
+                 (!strcmp (algos[i].data, "!") || !strcmp (algos[i].data, "?"))?
                  1000000 : (int)strlen(algos[i].data));
 
       check_one_md (algos[i].md, algos[i].data,
@@ -7359,6 +7359,15 @@ check_one_mac (int algo, const char *data, int datalen,
   int i;
   gcry_error_t err = 0;
 
+  if (test_buffering)
+    {
+      if ((*data == '!' && !data[1]) ||
+          (*data == '?' && !data[1]))
+        {
+          return; /* Skip. */
+        }
+    }
+
   err = gcry_mac_open (&hd, algo, 0, NULL);
   if (err)
     {
@@ -7416,7 +7425,60 @@ check_one_mac (int algo, const char *data, int datalen,
     }
   else
     {
-      err = gcry_mac_write (hd, data, datalen);
+      if ((*data == '!' && !data[1]) || /* hash one million times a "a" */
+          (*data == '?' && !data[1]))   /* hash million byte data-set with byte pattern 0x00,0x01,0x02,... */
+        {
+          char aaa[1000];
+          size_t left = 1000 * 1000;
+          size_t startlen = 1;
+          size_t piecelen = startlen;
+
+          if (*data == '!')
+            memset (aaa, 'a', 1000);
+
+          /* Write in chunks with all sizes 1 to 1000 (500500 bytes) */
+          for (i = 1; i <= 1000 && left > 0; i++)
+            {
+              piecelen = i;
+              if (piecelen > sizeof(aaa))
+                piecelen = sizeof(aaa);
+              if (piecelen > left)
+                piecelen = left;
+
+              if (*data == '?')
+                fillbuf_count(aaa, piecelen, 1000 * 1000 - left);
+
+              gcry_mac_write (hd, aaa, piecelen);
+
+              left -= piecelen;
+            }
+
+          /* Write in odd size chunks so that we test the buffering. */
+          while (left > 0)
+            {
+              if (piecelen > sizeof(aaa))
+                piecelen = sizeof(aaa);
+              if (piecelen > left)
+                piecelen = left;
+
+              if (*data == '?')
+                fillbuf_count(aaa, piecelen, 1000 * 1000 - left);
+
+              gcry_mac_write (hd, aaa, piecelen);
+
+              left -= piecelen;
+
+              if (piecelen == sizeof(aaa))
+                piecelen = ++startlen;
+              else
+                piecelen = piecelen * 2 - ((piecelen != startlen) ?
startlen : 0); + } + } + else + { + err = gcry_mac_write (hd, data, datalen); + } + if (err) fail("algo %d, mac gcry_mac_write failed: %s\n", algo, gpg_strerror (err)); if (err) @@ -7426,8 +7488,6 @@ check_one_mac (int algo, const char *data, int datalen, err = gcry_mac_verify (hd, expect, maclen); if (err) fail("algo %d, mac gcry_mac_verify failed: %s\n", algo, gpg_strerror (err)); - if (err) - goto out; macoutlen = maclen; err = gcry_mac_read (hd, p, &macoutlen); @@ -7511,6 +7571,8 @@ check_mac (void) "\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa\xaa" "\xaa\xaa\xaa\xaa\xaa", "\x6f\x63\x0f\xad\x67\xcd\xa0\xee\x1f\xb1\xf5\x62\xdb\x3a\xa5\x3e", }, + { GCRY_MAC_HMAC_MD5, "?", "????????????????", + "\x7e\x28\xf8\x8e\xf4\x6c\x48\x30\xa2\x0c\xe3\xe1\x42\xd4\xb5\x6b" }, { GCRY_MAC_HMAC_SHA256, "what do ya want for nothing?", "Jefe", "\x5b\xdc\xc1\x46\xbf\x60\x75\x4e\x6a\x04\x24\x26\x08\x95\x75\xc7\x5a" "\x00\x3f\x08\x9d\x27\x39\x83\x9d\xec\x58\xb9\x64\xec\x38\x43" }, @@ -7564,6 +7626,9 @@ check_mac (void) "\xaa\xaa\xaa", "\x9b\x09\xff\xa7\x1b\x94\x2f\xcb\x27\x63\x5f\xbc\xd5\xb0\xe9\x44" "\xbf\xdc\x63\x64\x4f\x07\x13\x93\x8a\x7f\x51\x53\x5c\x3a\x35\xe2" }, + { GCRY_MAC_HMAC_SHA256, "?", "????????????????", + "\x1c\x0e\x57\xad\x4a\x02\xd2\x30\xce\x7e\xf8\x08\x23\x25\x71\x5e" + "\x16\x9b\x30\xca\xc3\xf4\x99\xc5\x1d\x4c\x25\x32\xa9\xf2\x15\x28" }, { GCRY_MAC_HMAC_SHA224, "what do ya want for nothing?", "Jefe", "\xa3\x0e\x01\x09\x8b\xc6\xdb\xbf\x45\x69\x0f\x3a\x7e\x9e\x6d\x0f" "\x8b\xbe\xa2\xa3\x9e\x61\x48\x00\x8f\xd0\x5e\x44" }, @@ -7617,6 +7682,9 @@ check_mac (void) "\xaa\xaa\xaa", "\x3a\x85\x41\x66\xac\x5d\x9f\x02\x3f\x54\xd5\x17\xd0\xb3\x9d\xbd" "\x94\x67\x70\xdb\x9c\x2b\x95\xc9\xf6\xf5\x65\xd1" }, + { GCRY_MAC_HMAC_SHA224, "?", "????????????????", + "\xc1\x88\xaf\xcf\xce\x51\xa2\x14\x3d\xc1\xaf\x93\xcc\x2b\xe9\x4d" + "\x39\x55\x90\x4c\x46\x70\xfc\xc2\x04\xcf\xab\xfa" }, { GCRY_MAC_HMAC_SHA384, "what do ya want for nothing?", "Jefe", "\xaf\x45\xd2\xe3\x76\x48\x40\x31\x61\x7f\x78\xd2\xb5\x8a\x6b\x1b" "\x9c\x7e\xf4\x64\xf5\xa0\x1b\x47\xe4\x2e\xc3\x73\x63\x22\x44\x5e" @@ -7676,6 +7744,10 @@ check_mac (void) "\x66\x17\x17\x8e\x94\x1f\x02\x0d\x35\x1e\x2f\x25\x4e\x8f\xd3\x2c" "\x60\x24\x20\xfe\xb0\xb8\xfb\x9a\xdc\xce\xbb\x82\x46\x1e\x99\xc5" "\xa6\x78\xcc\x31\xe7\x99\x17\x6d\x38\x60\xe6\x11\x0c\x46\x52\x3e" }, + { GCRY_MAC_HMAC_SHA384, "?", "????????????????", + "\xe7\x96\x29\xa3\x40\x5f\x1e\x6e\x92\xa5\xdb\xa5\xc6\xe9\x60\xa8" + "\xf5\xd1\x6d\xcb\x10\xec\x30\x2f\x6b\x9c\x37\xe0\xea\xf1\x53\x28" + "\x08\x01\x9b\xe3\x4a\x43\xc6\xc2\x2b\x0c\xd9\x43\x64\x35\x25\x78" }, { GCRY_MAC_HMAC_SHA512, "what do ya want for nothing?", "Jefe", "\x16\x4b\x7a\x7b\xfc\xf8\x19\xe2\xe3\x95\xfb\xe7\x3b\x56\xe0\xa3" "\x87\xbd\x64\x22\x2e\x83\x1f\xd6\x10\x27\x0c\xd7\xea\x25\x05\x54" @@ -7741,6 +7813,11 @@ check_mac (void) "\xde\xbd\x71\xf8\x86\x72\x89\x86\x5d\xf5\xa3\x2d\x20\xcd\xc9\x44" "\xb6\x02\x2c\xac\x3c\x49\x82\xb1\x0d\x5e\xeb\x55\xc3\xe4\xde\x15" "\x13\x46\x76\xfb\x6d\xe0\x44\x60\x65\xc9\x74\x40\xfa\x8c\x6a\x58" }, + { GCRY_MAC_HMAC_SHA512, "?", "????????????????", + "\xd4\x43\x61\xfa\x3d\x3d\x57\xd6\xac\xc3\x9f\x1c\x3d\xd9\x26\x84" + "\x1f\xfc\x4d\xf2\xbf\x78\x87\x72\x5e\x6c\x3e\x00\x6d\x39\x5f\xfa" + "\xd7\x3a\xf7\x83\xb7\xb5\x61\xbd\xfb\x33\xe0\x03\x97\xa7\x72\x79" + "\x66\x66\xbf\xbd\x44\xfa\x04\x01\x1b\xc1\x48\x1d\x9e\xde\x5b\x8e" }, /* HMAC-SHA3 test vectors from * http://wolfgang-ehrhardt.de/hmac-sha3-testvectors.html */ { GCRY_MAC_HMAC_SHA3_224, @@ -7904,6 +7981,21 @@ check_mac (void) 
"\x1f\x3e\x6c\xf0\x48\x60\xc6\xbb\xd7\xfa\x48\x86\x74\x78\x2b\x46" "\x59\xfd\xbd\xf3\xfd\x87\x78\x52\x88\x5c\xfe\x6e\x22\x18\x5f\xe7" "\xb2\xee\x95\x20\x43\x62\x9b\xc9\xd5\xf3\x29\x8a\x41\xd0\x2c\x66" }, + { GCRY_MAC_HMAC_SHA3_224, "?", "????????????????", + "\x80\x2b\x3c\x84\xfe\x3e\x01\x22\x14\xf8\xba\x74\x79\xfd\xb5\x02" + "\xea\x0c\x06\xa4\x7e\x01\xe3\x2c\xc7\x24\x89\xc3" }, + { GCRY_MAC_HMAC_SHA3_256, "?", "????????????????", + "\x6c\x7c\x96\x5b\x19\xba\xcd\x61\x69\x8a\x2c\x7a\x2b\x96\xa1\xc3" + "\x33\xa0\x3c\x5d\x54\x87\x37\x60\xc8\x2f\xa2\xa6\x12\x38\x8d\x1b" }, + { GCRY_MAC_HMAC_SHA3_384, "?", "????????????????", + "\xc0\x20\xd0\x9b\xa7\xb9\xd5\xb8\xa6\xa4\xba\x20\x55\xd9\x0b\x35" + "\x8b\xe0\xb7\xec\x1e\x9f\xe6\xb9\xbd\xd5\xe9\x9b\xfc\x0a\x11\x3a" + "\x15\x41\xed\xfd\xef\x30\x8d\x03\xb8\xca\x3a\xa8\xc7\x2d\x89\x32" }, + { GCRY_MAC_HMAC_SHA3_512, "?", "????????????????", + "\xb4\xef\x24\xd2\x07\xa7\x01\xb3\xe1\x81\x11\x22\x93\x83\x64\xe0" + "\x5e\xad\x03\xb7\x43\x4f\x87\xa1\x14\x8e\x17\x8f\x2a\x97\x7d\xe8" + "\xbd\xb0\x37\x3b\x67\xb9\x97\x36\xa5\x82\x9b\xdc\x0d\xe4\x5a\x8c" + "\x5e\xda\xb5\xca\xea\xa9\xb4\x6e\xba\xca\x25\xc8\xbf\xa1\x0e\xb0" }, /* CMAC AES and DES test vectors from http://web.archive.org/web/20130930212819/http://csrc.nist.gov/publica\ tions/nistpubs/800-38B/Updated_CMAC_Examples.pdf */ @@ -7978,6 +8070,8 @@ check_mac (void) "\x60\x3d\xeb\x10\x15\xca\x71\xbe\x2b\x73\xae\xf0\x85\x7d\x77\x81" "\x1f\x35\x2c\x07\x3b\x61\x08\xd7\x2d\x98\x10\xa3\x09\x14\xdf\xf4", "\xe1\x99\x21\x90\x54\x9f\x6e\xd5\x69\x6a\x2c\x05\x6c\x31\x54\x10" }, + { GCRY_MAC_CMAC_AES, "?", "????????????????????????????????", + "\x9f\x72\x73\x68\xb0\x49\x2e\xb1\x35\xa0\x1d\xf9\xa8\x0a\xf6\xee" }, { GCRY_MAC_CMAC_3DES, "", "\x8a\xa8\x3b\xf8\xcb\xda\x10\x62\x0b\xc1\xbf\x19\xfb\xb6\xcd\x58" @@ -8022,6 +8116,8 @@ check_mac (void) "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5\x8a\x3d\x10\xba\x80\x57\x0d\x38" "\x4c\xf1\x51\x34\xa2\x85\x0d\xd5", "\x31\xb1\xe4\x31\xda\xbc\x4e\xb8" }, + { GCRY_MAC_CMAC_3DES, "?", "????????????????????????", + "\xc1\x38\x13\xb2\x31\x8f\x3a\xdf" }, /* CMAC Camellia test vectors from http://tools.ietf.org/html/draft-kato-ipsec-camellia-cmac96and128-05 */ { GCRY_MAC_CMAC_CAMELLIA, @@ -8045,6 +8141,8 @@ check_mac (void) "\xf6\x9f\x24\x45\xdf\x4f\x9b\x17\xad\x2b\x41\x7b\xe6\x6c\x37\x10", "\x2b\x7e\x15\x16\x28\xae\xd2\xa6\xab\xf7\x15\x88\x09\xcf\x4f\x3c", "\xc2\x69\x9a\x6e\xba\x55\xce\x9d\x93\x9a\x8a\x4e\x19\x46\x6e\xe9" }, + { GCRY_MAC_CMAC_CAMELLIA, "?", "????????????????????????????????", + "\xba\x8a\x5a\x8d\xa7\x54\x26\x83\x3e\xb1\x20\xb5\x45\xd0\x9f\x4e" }, /* http://csrc.nist.gov/groups/STM/cavp/documents/mac/gcmtestvectors.zip */ { GCRY_MAC_GMAC_AES, "", @@ -8081,6 +8179,8 @@ check_mac (void) "\xc9\xfc\xa7\x29\xab\x60\xad\xa0", "\x20\x4b\xdb\x1b\xd6\x21\x54\xbf\x08\x92\x2a\xaa\x54\xee\xd7\x05", "\x05\xad\x13\xa5\xe2\xc2\xab\x66\x7e\x1a\x6f\xbc" }, + { GCRY_MAC_GMAC_AES, "?", "????????????????????????????????", + "\x84\x37\xc3\x42\xae\xf5\xd0\x40\xd3\x73\x90\xa9\x36\xed\x8a\x12" }, /* from NaCl */ { GCRY_MAC_POLY1305, "\x8e\x99\x3b\x9f\x48\x68\x12\x73\xc2\x96\x50\xba\x32\xfc\x76\xce" @@ -8250,6 +8350,8 @@ check_mac (void) "\x12\x97\x6a\x08\xc4\x42\x6d\x0c\xe8\xa8\x24\x07\xc4\xf4\x82\x07" "\x80\xf8\xc2\x0a\xa7\x12\x02\xd1\xe2\x91\x79\xcb\xcb\x55\x5a\x57", "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b" }, + { GCRY_MAC_POLY1305, "?", "????????????????????????????????", + "\xc3\x88\xce\x8a\x52\xd6\xe7\x21\x86\xfa\xaa\x5d\x2d\x16\xf9\xa3" }, /* from 
http://cr.yp.to/mac/poly1305-20050329.pdf */
     { GCRY_MAC_POLY1305_AES,
       "\xf3\xf6",
@@ -8283,6 +8385,10 @@ check_mac (void)
       "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b",
       "\x9a\xe8\x31\xe7\x43\x97\x8d\x3a\x23\x52\x7c\x71\x28\x14\x9e\x3a",
       0, 32 },
+    { GCRY_MAC_POLY1305_AES, "?", "????????????????????????????????",
+      "\x9d\xeb\xb0\xcd\x24\x90\xd3\x9b\x47\x78\x37\x0a\x81\xf2\x83\x2a",
+      "\x61\xee\x09\x21\x8d\x29\xb0\xaa\xed\x7e\x15\x4a\x2c\x55\x09\xcc",
+      0, 32 },
     { 0 },
   };
   int i;
@@ -8310,8 +8416,9 @@ check_mac (void)
       fprintf (stderr,
                "  checking %s [%i] for %d byte key and %d byte data\n",
                gcry_mac_algo_name (algos[i].algo),
-               algos[i].algo,
-               (int)strlen(algos[i].key), (int)strlen(algos[i].data));
+               algos[i].algo, (int)strlen(algos[i].key),
+               (!strcmp(algos[i].data, "!") || !strcmp(algos[i].data, "?"))
+                 ? 1000000 : (int)strlen(algos[i].data));
 
     klen = algos[i].klen ? algos[i].klen : strlen(algos[i].key);
     dlen = algos[i].dlen ? algos[i].dlen : strlen (algos[i].data);

From jussi.kivilinna at iki.fi  Sun Oct  9 17:14:02 2016
From: jussi.kivilinna at iki.fi (Jussi Kivilinna)
Date: Sun, 09 Oct 2016 18:14:02 +0300
Subject: [PATCH 2/2] GCM: Add bulk processing for ARMv8/AArch64 implementation
In-Reply-To: <147602603697.12134.11212163233767134875.stgit@localhost6.localdomain6>
References: <147602603697.12134.11212163233767134875.stgit@localhost6.localdomain6>
Message-ID: <147602604202.12134.18337008145069449063.stgit@localhost6.localdomain6>

* cipher/cipher-gcm-armv8-aarch64-ce.S: Add 6 blocks bulk processing.
--
Benchmark on Cortex-A53 (1152 MHz):

Before:
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES           |      1.30 ns/B     731.6 MiB/s      1.50 c/B

After (1.49x faster):
                    |  nanosecs/byte   mebibytes/sec   cycles/byte
 GMAC_AES           |     0.873 ns/B    1092.1 MiB/s      1.01 c/B

Signed-off-by: Jussi Kivilinna
---
 0 files changed

diff --git a/cipher/cipher-gcm-armv8-aarch64-ce.S b/cipher/cipher-gcm-armv8-aarch64-ce.S
index 51d67b7..4830b61 100644
--- a/cipher/cipher-gcm-armv8-aarch64-ce.S
+++ b/cipher/cipher-gcm-armv8-aarch64-ce.S
@@ -43,14 +43,37 @@ gcry_gcm_reduction_constant:
 /* Register macros */
 
 #define rhash v0
-#define rbuf v1
-#define rh0 v2
-#define rr0 v3
-#define rr1 v4
-#define rrconst v5
-#define vT0 v16
-#define vT1 v17
-#define vZZ v18
+#define rr0 v1
+#define rr1 v2
+#define rbuf v3
+#define rbuf1 v4
+#define rbuf2 v5
+#define rbuf3 v6
+#define rbuf4 v7
+#define rbuf5 v8
+#define rr2 v9
+#define rr3 v10
+#define rr4 v11
+#define rr5 v12
+#define rr6 v13
+#define rr7 v14
+#define rr8 v15
+#define rr9 v16
+
+#define rrconst v18
+#define rh1 v19
+#define rh2 v20
+#define rh3 v21
+#define rh4 v22
+#define rh5 v23
+#define rh6 v24
+#define t0 v25
+#define t1 v26
+#define t2 v27
+#define t3 v28
+#define t4 v29
+#define t5 v30
+#define vZZ v31
 
 /* GHASH macros */
 
@@ -59,38 +82,90 @@ gcry_gcm_reduction_constant:
  */
 
 /* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1) */
-#define PMUL_128x128(r0, r1, a, b, interleave_op) \
-        ext vT0.16b, b.16b, b.16b, #8; \
+#define PMUL_128x128(r0, r1, a, b, T0, T1, interleave_op) \
+        ext T0.16b, b.16b, b.16b, #8; \
         pmull r0.1q, a.1d, b.1d; \
         pmull2 r1.1q, a.2d, b.2d; \
-        pmull vT1.1q, a.1d, vT0.1d; \
-        pmull2 vT0.1q, a.2d, vT0.2d; \
-        interleave_op(); \
-        eor vT0.16b, vT0.16b, vT1.16b; \
-        ext vT1.16b, vZZ.16b, vT0.16b, #8; \
-        ext vT0.16b, vT0.16b, vZZ.16b, #8; \
-        eor r0.16b, r0.16b, vT1.16b; \
-        eor r1.16b, r1.16b, vT0.16b;
+        pmull T1.1q, a.1d, T0.1d; \
+        pmull2 T0.1q, a.2d, T0.2d; \
+        interleave_op; \
+        eor T0.16b, T0.16b, T1.16b; \
+        ext T1.16b, vZZ.16b, T0.16b, #8; \
+        ext T0.16b, T0.16b, vZZ.16b, #8; \
+        eor r0.16b, r0.16b, T1.16b; \
+        eor r1.16b, r1.16b, T0.16b;
+
+/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
+ * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
+ * Input: 'aC' and 'bC', Output: 'r0C:r1C' (low 128-bits in r0C, high in r1C)
+ */
+#define PMUL_128x128_3(r0A, r1A, aA, bA, t0A, t1A, \
+                       r0B, r1B, aB, bB, t0B, t1B, \
+                       r0C, r1C, aC, bC, t0C, t1C, interleave_op) \
+        ext t0A.16b, bA.16b, bA.16b, #8; \
+        pmull r0A.1q, aA.1d, bA.1d; \
+        pmull2 r1A.1q, aA.2d, bA.2d; \
+        ext t0B.16b, bB.16b, bB.16b, #8; \
+        pmull r0B.1q, aB.1d, bB.1d; \
+        pmull2 r1B.1q, aB.2d, bB.2d; \
+        ext t0C.16b, bC.16b, bC.16b, #8; \
+        pmull r0C.1q, aC.1d, bC.1d; \
+        pmull2 r1C.1q, aC.2d, bC.2d; \
+        pmull t1A.1q, aA.1d, t0A.1d; \
+        pmull2 t0A.1q, aA.2d, t0A.2d; \
+        pmull t1B.1q, aB.1d, t0B.1d; \
+        pmull2 t0B.1q, aB.2d, t0B.2d; \
+        pmull t1C.1q, aC.1d, t0C.1d; \
+        pmull2 t0C.1q, aC.2d, t0C.2d; \
+        eor t0A.16b, t0A.16b, t1A.16b; \
+        eor t0B.16b, t0B.16b, t1B.16b; \
+        eor t0C.16b, t0C.16b, t1C.16b; \
+        interleave_op; \
+        ext t1A.16b, vZZ.16b, t0A.16b, #8; \
+        ext t0A.16b, t0A.16b, vZZ.16b, #8; \
+        ext t1B.16b, vZZ.16b, t0B.16b, #8; \
+        ext t0B.16b, t0B.16b, vZZ.16b, #8; \
+        ext t1C.16b, vZZ.16b, t0C.16b, #8; \
+        ext t0C.16b, t0C.16b, vZZ.16b, #8; \
+        eor r0A.16b, r0A.16b, t1A.16b; \
+        eor r1A.16b, r1A.16b, t0A.16b; \
+        eor r0B.16b, r0B.16b, t1B.16b; \
+        eor r1B.16b, r1B.16b, t0B.16b; \
+        eor r0C.16b, r0C.16b, t1C.16b; \
+        eor r1C.16b, r1C.16b, t0C.16b; \
 
 /* Input: 'r0:r1', Output: 'a' */
-#define REDUCTION(a, r0, r1, rconst, interleave_op) \
-        pmull2 vT0.1q, r1.2d, rconst.2d; \
-        interleave_op(); \
-        ext vT1.16b, vT0.16b, vZZ.16b, #8; \
-        ext vT0.16b, vZZ.16b, vT0.16b, #8; \
-        eor r1.16b, r1.16b, vT1.16b; \
-        eor r0.16b, r0.16b, vT0.16b; \
-        pmull vT0.1q, r1.1d, rconst.1d; \
-        eor a.16b, r0.16b, vT0.16b;
-
-#define _(...) /*_*/
-#define ld1_rbuf() ld1 {rbuf.16b}, [x2], #16;
-#define rbit_rbuf() rbit rbuf.16b, rbuf.16b;
+#define REDUCTION(a, r0, r1, rconst, T0, T1, interleave_op1, interleave_op2, \
+                  interleave_op3) \
+        pmull2 T0.1q, r1.2d, rconst.2d; \
+        interleave_op1; \
+        ext T1.16b, T0.16b, vZZ.16b, #8; \
+        ext T0.16b, vZZ.16b, T0.16b, #8; \
+        interleave_op2; \
+        eor r1.16b, r1.16b, T1.16b; \
+        eor r0.16b, r0.16b, T0.16b; \
+        pmull T0.1q, r1.1d, rconst.1d; \
+        interleave_op3; \
+        eor a.16b, r0.16b, T0.16b;
 
 /* Other functional macros */
 
+#define _(...) __VA_ARGS__
+#define __ _()
+
 #define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;
 
+#define VPUSH_ABI \
+        stp d8, d9, [sp, #-16]!; \
+        stp d10, d11, [sp, #-16]!; \
+        stp d12, d13, [sp, #-16]!; \
+        stp d14, d15, [sp, #-16]!;
+
+#define VPOP_ABI \
+        ldp d14, d15, [sp], #16; \
+        ldp d12, d13, [sp], #16; \
+        ldp d10, d11, [sp], #16; \
+        ldp d8, d9, [sp], #16;
 
 /*
  * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
@@ -112,16 +187,145 @@ _gcry_ghash_armv8_ce_pmull:
 
   GET_DATA_POINTER(x5, .Lrconst)
 
-  sub x3, x3, #1
-
   eor vZZ.16b, vZZ.16b, vZZ.16b
   ld1 {rhash.16b}, [x1]
-  ld1 {rh0.16b}, [x0]
+  ld1 {rh1.16b}, [x0]
 
   rbit rhash.16b, rhash.16b /* bit-swap */
   ld1r {rrconst.2d}, [x5]
 
+  cmp x3, #6
+  b.lo .Less_than_6
+
+  add x6, x4, #64
+  VPUSH_ABI
+
+  ld1 {rh2.16b-rh5.16b}, [x4]
+  ld1 {rh6.16b}, [x6]
+
+  sub x3, x3, #6
+
+  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
+  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)
+
+  rbit rbuf.16b, rbuf.16b /* bit-swap */
+  rbit rbuf1.16b, rbuf1.16b /* bit-swap */
+  rbit rbuf2.16b, rbuf2.16b /* bit-swap */
+  rbit rbuf3.16b, rbuf3.16b /* bit-swap */
+  rbit rbuf4.16b, rbuf4.16b /* bit-swap */
+  rbit rbuf5.16b, rbuf5.16b /* bit-swap */
+  eor rhash.16b, rhash.16b, rbuf.16b
+
+  cmp x3, #6
+  b.lo .Lend_6
+
+.Loop_6:
+
+  /* (in1) * H⁵ => rr0:rr1 */
+  /* (in2) * H⁴ => rr2:rr3 */
+  /* (in0 ^ hash) * H⁶ => rr4:rr5 */
+  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
+                 rr2, rr3, rbuf2, rh4, t2, t3,
+                 rr4, rr5, rhash, rh6, t4, t5,
+                 _(sub x3, x3, #6))
+
+  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
+  cmp x3, #6
+
+  eor rr0.16b, rr0.16b, rr2.16b
+  eor rr1.16b, rr1.16b, rr3.16b
+
+  /* (in3) * H³ => rr2:rr3 */
+  /* (in4) * H² => rr6:rr7 */
+  /* (in5) * H¹ => rr8:rr9 */
+  PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1,
+                 rr6, rr7, rbuf4, rh2, t2, t3,
+                 rr8, rr9, rbuf5, rh1, t4, t5,
+                 _(eor rr0.16b, rr0.16b, rr4.16b;
+                   eor rr1.16b, rr1.16b, rr5.16b))
+
+  eor rr0.16b, rr0.16b, rr2.16b
+  eor rr1.16b, rr1.16b, rr3.16b
+  rbit rbuf.16b, rbuf.16b
+  eor rr0.16b, rr0.16b, rr6.16b
+  eor rr1.16b, rr1.16b, rr7.16b
+  rbit rbuf1.16b, rbuf1.16b
+  eor rr0.16b, rr0.16b, rr8.16b
+  eor rr1.16b, rr1.16b, rr9.16b
+  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)
+
+  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+            _(rbit rbuf2.16b, rbuf2.16b),
+            _(rbit rbuf3.16b, rbuf3.16b),
+            _(rbit rbuf4.16b, rbuf4.16b))
+
+  rbit rbuf5.16b, rbuf5.16b
+  eor rhash.16b, rhash.16b, rbuf.16b
+
+  b.hs .Loop_6
+
+.Lend_6:
+
+  /* (in1) * H⁵ => rr0:rr1 */
+  /* (in0 ^ hash) * H⁶ => rr2:rr3 */
+  /* (in2) * H⁴ => rr4:rr5 */
+  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
+                 rr2, rr3, rhash, rh6, t2, t3,
+                 rr4, rr5, rbuf2, rh4, t4, t5,
+                 __)
+  eor rr0.16b, rr0.16b, rr2.16b
+  eor rr1.16b, rr1.16b, rr3.16b
+  eor rr0.16b, rr0.16b, rr4.16b
+  eor rr1.16b, rr1.16b, rr5.16b
+
+  /* (in3) * H³ => rhash:rbuf */
+  /* (in4) * H² => rr6:rr7 */
+  /* (in5) * H¹ => rr8:rr9 */
+  PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1,
+                 rr6, rr7, rbuf4, rh2, t2, t3,
+                 rr8, rr9, rbuf5, rh1, t4, t5,
+                 _(CLEAR_REG(rh4);
+                   CLEAR_REG(rh5);
+                   CLEAR_REG(rh6)))
+  eor rr0.16b, rr0.16b, rhash.16b
+  eor rr1.16b, rr1.16b, rbuf.16b
+  eor rr0.16b, rr0.16b, rr6.16b
+  eor rr1.16b, rr1.16b, rr7.16b
+  eor rr0.16b, rr0.16b, rr8.16b
+  eor rr1.16b, rr1.16b, rr9.16b
+
+  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+            _(CLEAR_REG(rh2);
+              CLEAR_REG(rh3);
+              CLEAR_REG(rr2);
+              CLEAR_REG(rbuf2);
+              CLEAR_REG(rbuf3)),
+            _(CLEAR_REG(rr3);
+              CLEAR_REG(rr4);
+              CLEAR_REG(rr5);
+              CLEAR_REG(rr6);
+              CLEAR_REG(rr7)),
+            _(CLEAR_REG(rr8);
+              CLEAR_REG(rr9);
+              CLEAR_REG(rbuf1);
+              CLEAR_REG(rbuf2)))
+
+  CLEAR_REG(rbuf4)
+  CLEAR_REG(rbuf5)
+  CLEAR_REG(t2)
+  CLEAR_REG(t3)
+  CLEAR_REG(t4)
+  CLEAR_REG(t5)
+
+  VPOP_ABI
+
+  cbz x3, .Ldone
+
+.Less_than_6:
+  /* Handle remaining blocks. */
+
+  ld1 {rbuf.16b}, [x2], #16
+  sub x3, x3, #1
 
   rbit rbuf.16b, rbuf.16b /* bit-swap */
 
@@ -130,24 +334,25 @@ _gcry_ghash_armv8_ce_pmull:
 
   cbz x3, .Lend
 
.Loop:
-  PMUL_128x128(rr0, rr1, rh0, rhash, ld1_rbuf)
-  sub x3, x3, #1
-  REDUCTION(rhash, rr0, rr1, rrconst, rbit_rbuf)
+  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16))
+  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
+            _(sub x3, x3, #1),
+            _(rbit rbuf.16b, rbuf.16b),
+            __)
   eor rhash.16b, rhash.16b, rbuf.16b
   cbnz x3, .Loop
 
.Lend:
-  PMUL_128x128(rr0, rr1, rh0, rhash, _)
-  REDUCTION(rhash, rr0, rr1, rrconst, _)
+  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf)))
+  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __)
 
+.Ldone:
   CLEAR_REG(rr1)
   CLEAR_REG(rr0)
   rbit rhash.16b, rhash.16b /* bit-swap */
-  CLEAR_REG(rbuf)
-  CLEAR_REG(vT0)
-  CLEAR_REG(vT1)
-  CLEAR_REG(rh0)
+  CLEAR_REG(t0)
+  CLEAR_REG(t1)
 
   st1 {rhash.2d}, [x1]
   CLEAR_REG(rhash)
@@ -170,9 +375,37 @@ _gcry_ghash_setup_armv8_ce_pmull:
  *     x1: gcm_table
  */
 
-  ld1 {vT0.16b}, [x0]
-  rbit vT0.16b, vT0.16b
-  st1 {vT0.16b}, [x0]
+  GET_DATA_POINTER(x2, .Lrconst)
+
+  /* H¹ */
+  ld1 {rh1.16b}, [x0]
+  rbit rh1.16b, rh1.16b
+  st1 {rh1.16b}, [x0]
+
+  ld1r {rrconst.2d}, [x2]
+
+  /* H² */
+  PMUL_128x128(rr0, rr1, rh1, rh1, t0, t1, __)
+  REDUCTION(rh2, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+  /* H³ */
+  PMUL_128x128(rr0, rr1, rh2, rh1, t0, t1, __)
+  REDUCTION(rh3, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+  /* H⁴ */
+  PMUL_128x128(rr0, rr1, rh2, rh2, t0, t1, __)
+  REDUCTION(rh4, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+  /* H⁵ */
+  PMUL_128x128(rr0, rr1, rh2, rh3, t0, t1, __)
+  REDUCTION(rh5, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+  /* H⁶ */
+  PMUL_128x128(rr0, rr1, rh3, rh3, t0, t1, __)
+  REDUCTION(rh6, rr0, rr1, rrconst, t0, t1, __, __, __)
+
+  st1 {rh2.16b-rh4.16b}, [x1], #(3*16)
+  st1 {rh5.16b-rh6.16b}, [x1]
 
   ret
.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;
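Both patches implement the same aggregated GHASH scheme: the setup routine
precomputes powers of the hash key into gcm_table, each block in a group is
multiplied by the matching power, the unreduced products are XORed together,
and a single reduction is done per group. A rough C model of the 4-block
variant used on AArch32 (the u128/u256 types and the clmul128/xor/reduce
helpers are hypothetical stand-ins for the vmull.p64/veor sequences, not
libgcrypt API):

    #include <stdint.h>

    typedef struct { uint64_t w[2]; } u128;   /* one 16-byte block */
    typedef struct { u128 lo, hi; } u256;     /* unreduced 256-bit product */

    u256 clmul128 (u128 a, u128 b);  /* carry-less 128x128 -> 256 multiply */
    u128 xor128 (u128 a, u128 b);    /* bitwise XOR */
    u256 xor256 (u256 a, u256 b);
    u128 reduce (u256 r);            /* reduction mod the GHASH polynomial */

    /* One 4-block update; H[i] holds the i-th power of the hash key, as
       written to gcm_table by the setup routine. */
    static u128
    ghash_4blocks (u128 hash, const u128 in[4], const u128 H[5])
    {
      u256 acc = clmul128 (xor128 (in[0], hash), H[4]); /* (in0^hash)*H^4 */
      acc = xor256 (acc, clmul128 (in[1], H[3]));       /* in1 * H^3 */
      acc = xor256 (acc, clmul128 (in[2], H[2]));       /* in2 * H^2 */
      acc = xor256 (acc, clmul128 (in[3], H[1]));       /* in3 * H^1 */
      return reduce (acc);  /* one REDUCTION per four blocks */
    }

The AArch64 version does the same with six blocks and H¹ through H⁶, which
is why its setup function now computes and stores the extra key powers.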
From cvs at cvs.gnupg.org  Wed Oct 12 21:17:23 2016
From: cvs at cvs.gnupg.org (by Jussi Kivilinna)
Date: Wed, 12 Oct 2016 21:17:23 +0200
Subject: [git] GCRYPT - branch, master, updated. libgcrypt-1.7.3-10-gbfd732f
Message-ID: 

This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  bfd732f53a9b5dfe14217a68a0fa289bf6913ec0 (commit)
       via  27747921cb1dfced83c5666cd1c474764724c52b (commit)
      from  5418d9ca4c0e087fd6872ad350a996fe74880d86 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions below. (Their diffs are identical to the two patches
posted above.)

- Log -----------------------------------------------------------------
commit bfd732f53a9b5dfe14217a68a0fa289bf6913ec0
Author: Jussi Kivilinna
Date:   Sun Oct 9 12:53:48 2016 +0300

    GCM: Add bulk processing for ARMv8/AArch64 implementation

    * cipher/cipher-gcm-armv8-aarch64-ce.S: Add 6 blocks bulk processing.
    --
    Benchmark on Cortex-A53 (1152 MHz):

    Before:
                        |  nanosecs/byte   mebibytes/sec   cycles/byte
     GMAC_AES           |      1.30 ns/B     731.6 MiB/s      1.50 c/B

    After (1.49x faster):
                        |  nanosecs/byte   mebibytes/sec   cycles/byte
     GMAC_AES           |     0.873 ns/B    1092.1 MiB/s      1.01 c/B

    Signed-off-by: Jussi Kivilinna

commit 27747921cb1dfced83c5666cd1c474764724c52b
Author: Jussi Kivilinna
Date:   Sun Oct 9 12:52:55 2016 +0300

    GCM: Add bulk processing for ARMv8/AArch32 implementation

    * cipher/cipher-gcm-armv8-aarch32-ce.S: Add 4 blocks bulk processing.
    * tests/basic.c (check_digests): Print correct data length for "?"
    tests.
    (check_one_mac): Add large 1000000 bytes tests, when input is "!" or
    "?".
    (check_mac): Add "?" test vectors for HMAC, CMAC, GMAC and POLY1305.
    --
    Benchmark on Cortex-A53 (1152 MHz):

    Before:
                        |  nanosecs/byte   mebibytes/sec   cycles/byte
     GMAC_AES           |     0.924 ns/B    1032.2 MiB/s      1.06 c/B

    After (1.21x faster):
                        |  nanosecs/byte   mebibytes/sec   cycles/byte
     GMAC_AES           |     0.764 ns/B    1248.2 MiB/s     0.880 c/B

    Signed-off-by: Jussi Kivilinna
http://cr.yp.to/mac/poly1305-20050329.pdf */
     { GCRY_MAC_POLY1305_AES,
       "\xf3\xf6",
@@ -8283,6 +8385,10 @@ check_mac (void)
       "\x51\x54\xad\x0d\x2c\xb2\x6e\x01\x27\x4f\xc5\x11\x48\x49\x1f\x1b",
       "\x9a\xe8\x31\xe7\x43\x97\x8d\x3a\x23\x52\x7c\x71\x28\x14\x9e\x3a",
       0, 32 },
+    { GCRY_MAC_POLY1305_AES, "?", "????????????????????????????????",
+      "\x9d\xeb\xb0\xcd\x24\x90\xd3\x9b\x47\x78\x37\x0a\x81\xf2\x83\x2a",
+      "\x61\xee\x09\x21\x8d\x29\xb0\xaa\xed\x7e\x15\x4a\x2c\x55\x09\xcc",
+      0, 32 },
     { 0 },
   };
   int i;
@@ -8310,8 +8416,9 @@ check_mac (void)
       fprintf (stderr,
                " checking %s [%i] for %d byte key and %d byte data\n",
               gcry_mac_algo_name (algos[i].algo),
-              algos[i].algo,
-              (int)strlen(algos[i].key), (int)strlen(algos[i].data));
+              algos[i].algo, (int)strlen(algos[i].key),
+              (!strcmp(algos[i].data, "!") || !strcmp(algos[i].data, "?"))
+                  ? 1000000 : (int)strlen(algos[i].data));

       klen = algos[i].klen ? algos[i].klen : strlen(algos[i].key);
       dlen = algos[i].dlen ? algos[i].dlen : strlen (algos[i].data);

-----------------------------------------------------------------------

Summary of changes:
 cipher/cipher-gcm-armv8-aarch32-ce.S | 321 +++++++++++++++++++++++++++-------
 cipher/cipher-gcm-armv8-aarch64-ce.S | 325 ++++++++++++++++++++++++++++++-----
 tests/basic.c                        | 119 ++++++++++++-
 3 files changed, 651 insertions(+), 114 deletions(-)

hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org

_______________________________________________
Gnupg-commits mailing list
Gnupg-commits at gnupg.org
http://lists.gnupg.org/mailman/listinfo/gnupg-commits

From d.brentjes at gmail.com  Mon Oct 24 15:34:23 2016
From: d.brentjes at gmail.com (D. Brentjes)
Date: Mon, 24 Oct 2016 15:34:23 +0200
Subject: Using elgamal encryption with a specific (multiplicative) group.
Message-ID: <1be0a764-e1c4-37f3-adf4-6e520449534e@gmail.com>

Hi all,

I've been looking at the library and am trying to use it for Elgamal
encryption and decryption. The first step is to generate a key pair:

size_t parse_error_offset;
gcry_error_t error;
gcry_sexp_t key_generator;
error = gcry_sexp_build(&key_generator, &parse_error_offset,
                        "(genkey (%s (nbits %s)))", "elg", "2048");
check(error);
gcry_sexp_t key_pair;
error = gcry_pk_genkey(&key_pair, key_generator);
check(error);

I took a quick peek at the code: this finds its own prime and generator
to use for encryption and decryption, but I want to supply my own. I
found that you can pass an S-expression of the form
"(genkey (algo (domain (p ...) (q ...) (g ...))))" to gcry_pk_genkey,
but that only works for DSA.

Now I have successfully gcry_mpi_scan'ed my desired p, q and g (from
RFC 5114), but I have no idea how to securely generate a private key for
this group using the gcrypt facilities. And can I use the MPI facility
after that to calculate the public key (as long as my private key is
stored in secure memory)?

Kind regards,

Dennis.
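
A minimal sketch of one way this could be done with the MPI facility --
an illustration, not a vetted recipe. It assumes p, q and g already hold
the gcry_mpi_scan'ed RFC 5114 values and that libgcrypt has been
initialized with secure memory enabled; the helper name make_keypair and
the rejection-sampling policy are illustrative choices, not libgcrypt
API:

#include <gcrypt.h>

/* Sketch: build a key pair over caller-supplied domain parameters.
   make_keypair is a hypothetical helper, not part of libgcrypt.
   Returns the secret exponent x (kept in secure memory) and stores
   the public value y = g^x mod p in *r_y. */
static gcry_mpi_t
make_keypair (gcry_mpi_t p, gcry_mpi_t q, gcry_mpi_t g, gcry_mpi_t *r_y)
{
  unsigned int qbits = gcry_mpi_get_nbits (q);
  gcry_mpi_t x = gcry_mpi_snew (qbits);   /* allocated in secure memory */
  gcry_mpi_t qm1 = gcry_mpi_new (qbits);

  gcry_mpi_sub_ui (qm1, q, 1);            /* upper bound q - 1 */

  /* Rejection-sample until 1 <= x <= q - 1; this avoids the modulo
     bias that reducing a random value mod q would introduce. */
  do
    gcry_mpi_randomize (x, qbits, GCRY_VERY_STRONG_RANDOM);
  while (gcry_mpi_cmp_ui (x, 1) < 0 || gcry_mpi_cmp (x, qm1) > 0);

  /* Only x is secret, so the public key can use plain MPI calls
     and normal memory. */
  *r_y = gcry_mpi_new (gcry_mpi_get_nbits (p));
  gcry_mpi_powm (*r_y, g, x, p);          /* y = g^x mod p */

  gcry_mpi_release (qm1);
  return x;                               /* caller releases x and *r_y */
}

The resulting x and y could then be used directly with gcry_mpi_powm for
the group operations; whether they may also be wrapped back into an
(elg ...) S-expression for use with gcry_pk_encrypt is a separate
question for the list.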