[PATCH 3/5] Add optimized wipememory for ARM

Mon Aug 19 11:16:06 CEST 2013

src/g10lib.h [__arm__] (fast_wipememory2_unaligned_head)
(fast_wipememory2): New macros.
--

The previous patch, which removed the _gcry_burn_stack optimization, causes
burn_stack to take over 30% of CPU time when looping 'benchmark cipher blowfish'
on ARM/Cortex-A8. Optimizing wipememory2 for ARM helps the situation a lot.

Old vs new (Cortex-A8):
                  ECB/Stream         CBC             CFB             OFB             CTR
               --------------- --------------- --------------- --------------- ---------------
IDEA            1.20x   1.18x   1.16x   1.15x   1.16x   1.18x   1.18x   1.16x   1.16x   1.17x
3DES            1.14x   1.14x   1.12x   1.13x   1.12x   1.13x   1.12x   1.13x   1.13x   1.15x
CAST5           1.66x   1.67x   1.43x   1.00x   1.48x   1.00x   1.44x   1.44x   1.04x   0.96x
BLOWFISH        1.56x   1.66x   1.47x   1.00x   1.54x   1.05x   1.44x   1.47x   1.00x   1.00x
AES             1.52x   1.42x   1.04x   1.00x   1.00x   1.00x   1.38x   1.37x   1.00x   1.00x
AES192          1.36x   1.36x   1.00x   1.00x   1.00x   1.04x   1.26x   1.22x   1.00x   1.04x
AES256          1.32x   1.31x   1.03x   1.00x   1.00x   1.00x   1.24x   1.30x   1.03x   0.97x
TWOFISH         1.31x   1.26x   1.23x   1.00x   1.25x   1.00x   1.24x   1.23x   1.00x   1.03x
ARCFOUR         1.05x   0.96x
DES             1.31x   1.33x   1.26x   1.29x   1.28x   1.29x   1.26x   1.29x   1.27x   1.29x
TWOFISH128      1.27x   1.24x   1.23x   1.00x   1.28x   1.00x   1.21x   1.26x   0.97x   1.06x
SERPENT128      1.19x   1.19x   1.15x   1.00x   1.14x   1.00x   1.17x   1.17x   0.98x   1.00x
SERPENT192      1.19x   1.24x   1.17x   1.00x   1.14x   1.00x   1.15x   1.17x   1.00x   1.00x
SERPENT256      1.16x   1.19x   1.17x   1.00x   1.14x   1.00x   1.15x   1.15x   1.00x   1.00x
RFC2268_40      1.00x   0.99x   1.00x   1.01x   1.00x   1.00x   1.03x   1.00x   1.01x   1.00x
SEED            1.20x   1.20x   1.18x   1.17x   1.17x   1.19x   1.18x   1.16x   1.19x   1.19x
CAMELLIA128     1.38x   1.34x   1.31x   1.00x   1.31x   1.00x   1.29x   1.32x   1.00x   1.00x
CAMELLIA192     1.27x   1.27x   1.23x   1.00x   1.25x   1.03x   1.20x   1.23x   1.00x   1.00x
CAMELLIA256     1.27x   1.27x   1.26x   1.00x   1.25x   1.03x   1.20x   1.23x   1.00x   1.00x
SALSA20         1.04x   1.00x

Note: bulk encryption/decryption does burn_stack after the full buffer has been
processed, instead of after each block.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 src/g10lib.h |   34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/src/g10lib.h b/src/g10lib.h
index e6d20e9..198ab38 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -248,7 +248,7 @@ void _gcry_burn_stack (int bytes);
 #define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
 
 
-/* Optimized fast_wipememory2 for i386 and x86-64 architechtures.  Maybe leave
+/* Optimized fast_wipememory2 for i386, x86-64 and arm architectures.  May leave
    tail bytes unhandled, in which case tail bytes are handled by wipememory2.
  */
 #if defined(__x86_64__) && __GNUC__ >= 4
@@ -283,6 +283,38 @@ void _gcry_burn_stack (int bytes);
                 _vptr += 4; \
               } while (_vlen >= 4); \
                   } while (0)
+#elif defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) && \
+	__GNUC__ >= 4
+
+/* Byte-fill until _vptr is 4-byte aligned or _vlen reaches zero; updates
+   both in place.  This must run even when __ARM_FEATURE_UNALIGNED is
+   defined: that macro does not extend to multi-register stores, and the
+   STM used by fast_wipememory2 requires a word-aligned base address.  */
+#define fast_wipememory2_unaligned_head(_vptr,_vset,_vlen) do { \
+              while((size_t)(_vptr)&3 && _vlen) \
+	        { *_vptr=(_vset); _vptr++; _vlen--; } \
+                  } while(0)
+
+#define fast_wipememory2(_vptr,_vset,_vlen) do { /* fill 8 bytes per STM */ \
+              unsigned long _vset4 = _vset; \
+              fast_wipememory2_unaligned_head(_vptr,_vset,_vlen); \
+              if (_vlen < 8) /* not enough left for one 8-byte store */ \
+                break; \
+              _vset4 *= 0x01010101; /* replicate into each byte; assumes _vset is 0..255 */ \
+              asm volatile( \
+                "mov %%r4, %[set];\n\t" \
+                "mov %%r5, %[set];\n\t" \
+                "1:;\n\t" \
+                "stm %[ptr]!, {%%r4, %%r5};\n\t" /* store r4,r5 and advance ptr by 8 */ \
+                "cmp %[end], %[ptr];\n\t" \
+                "bne 1b;\n\t" \
+                : [ptr] "=r" (_vptr) \
+                : [set] "r" (_vset4), \
+                  [end] "r" (_vptr+(_vlen&(~0x7))), /* _vptr + _vlen rounded down to 8 */ \
+                  "0" (_vptr) /* initial pointer, tied to the [ptr] output operand */ \
+                : "memory", "r4", "r5", "cc"); \
+              _vlen &= 0x7; /* 0..7 tail bytes are left unhandled */ \
+                  } while (0)
 #else
 #define fast_wipememory2(_ptr,_set,_len)
 #endif