[PATCH 3/5] Add optimized wipememory for ARM
Jussi Kivilinna
jussi.kivilinna at iki.fi
Mon Aug 19 11:16:06 CEST 2013
src/g10lib.h [__arm__] (fast_wipememory2_unaligned_head)
(fast_wipememory2): New functions.
--
The previous patch that removed the _gcry_burn_stack optimization causes burn_stack
to take over 30% of CPU usage when looping 'benchmark cipher blowfish' on
ARM/Cortex-A8. Optimizing wipememory2 for ARM helps the situation a lot.
Old vs new (Cortex-A8):
ECB/Stream CBC CFB OFB CTR
--------------- --------------- --------------- --------------- ---------------
IDEA 1.20x 1.18x 1.16x 1.15x 1.16x 1.18x 1.18x 1.16x 1.16x 1.17x
3DES 1.14x 1.14x 1.12x 1.13x 1.12x 1.13x 1.12x 1.13x 1.13x 1.15x
CAST5 1.66x 1.67x 1.43x 1.00x 1.48x 1.00x 1.44x 1.44x 1.04x 0.96x
BLOWFISH 1.56x 1.66x 1.47x 1.00x 1.54x 1.05x 1.44x 1.47x 1.00x 1.00x
AES 1.52x 1.42x 1.04x 1.00x 1.00x 1.00x 1.38x 1.37x 1.00x 1.00x
AES192 1.36x 1.36x 1.00x 1.00x 1.00x 1.04x 1.26x 1.22x 1.00x 1.04x
AES256 1.32x 1.31x 1.03x 1.00x 1.00x 1.00x 1.24x 1.30x 1.03x 0.97x
TWOFISH 1.31x 1.26x 1.23x 1.00x 1.25x 1.00x 1.24x 1.23x 1.00x 1.03x
ARCFOUR 1.05x 0.96x
DES 1.31x 1.33x 1.26x 1.29x 1.28x 1.29x 1.26x 1.29x 1.27x 1.29x
TWOFISH128 1.27x 1.24x 1.23x 1.00x 1.28x 1.00x 1.21x 1.26x 0.97x 1.06x
SERPENT128 1.19x 1.19x 1.15x 1.00x 1.14x 1.00x 1.17x 1.17x 0.98x 1.00x
SERPENT192 1.19x 1.24x 1.17x 1.00x 1.14x 1.00x 1.15x 1.17x 1.00x 1.00x
SERPENT256 1.16x 1.19x 1.17x 1.00x 1.14x 1.00x 1.15x 1.15x 1.00x 1.00x
RFC2268_40 1.00x 0.99x 1.00x 1.01x 1.00x 1.00x 1.03x 1.00x 1.01x 1.00x
SEED 1.20x 1.20x 1.18x 1.17x 1.17x 1.19x 1.18x 1.16x 1.19x 1.19x
CAMELLIA128 1.38x 1.34x 1.31x 1.00x 1.31x 1.00x 1.29x 1.32x 1.00x 1.00x
CAMELLIA192 1.27x 1.27x 1.23x 1.00x 1.25x 1.03x 1.20x 1.23x 1.00x 1.00x
CAMELLIA256 1.27x 1.27x 1.26x 1.00x 1.25x 1.03x 1.20x 1.23x 1.00x 1.00x
SALSA20 1.04x 1.00x
note: bulk encryption/decryption do burn_stack after full buffer processing,
instead of after each block.
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
src/g10lib.h | 34 +++++++++++++++++++++++++++++++++-
1 file changed, 33 insertions(+), 1 deletion(-)
diff --git a/src/g10lib.h b/src/g10lib.h
index e6d20e9..198ab38 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -248,7 +248,7 @@ void _gcry_burn_stack (int bytes);
#define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
-/* Optimized fast_wipememory2 for i386 and x86-64 architechtures. Maybe leave
+/* Optimized fast_wipememory2 for i386, x86-64 and arm architectures. May leave
tail bytes unhandled, in which case tail bytes are handled by wipememory2.
*/
#if defined(__x86_64__) && __GNUC__ >= 4
@@ -283,6 +283,38 @@ void _gcry_burn_stack (int bytes);
_vptr += 4; \
} while (_vlen >= 4); \
} while (0)
+#elif defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) && \
+ __GNUC__ >= 4
+
+#ifdef __ARM_FEATURE_UNALIGNED
+#define fast_wipememory2_unaligned_head(_ptr,_set,_len) /*do nothing*/
+#else
+#define fast_wipememory2_unaligned_head(_vptr,_vset,_vlen) do { \
+ while((size_t)(_vptr)&3 && _vlen) \
+ { *_vptr=(_vset); _vptr++; _vlen--; } \
+ } while(0)
+#endif
+
+#define fast_wipememory2(_vptr,_vset,_vlen) do { \
+ unsigned long _vset4 = _vset; \
+ fast_wipememory2_unaligned_head(_vptr,_vset,_vlen); \
+ if (_vlen < 8) \
+ break; \
+ _vset4 *= 0x01010101; \
+ asm volatile( \
+ "mov %%r4, %[set];\n\t" \
+ "mov %%r5, %[set];\n\t" \
+ "1:;\n\t" \
+ "stm %[ptr]!, {%%r4, %%r5};\n\t" \
+ "cmp %[end], %[ptr];\n\t" \
+ "bne 1b;\n\t" \
+ : [ptr] "=r" (_vptr) \
+ : [set] "r" (_vset4), \
+ [end] "r" (_vptr+(_vlen&(~0x7))), \
+ "0" (_vptr) \
+ : "memory", "r4", "r5", "cc"); \
+ _vlen &= 0x7; \
+ } while (0)
#else
#define fast_wipememory2(_ptr,_set,_len)
#endif
More information about the Gcrypt-devel
mailing list