[PATCH] Fix unaligned accesses with ldm/stm in ChaCha20 and Poly1305 ARM/NEON
Jussi Kivilinna
jussi.kivilinna at iki.fi
Fri Jul 8 00:28:27 CEST 2016
* cipher/chacha20-armv7-neon.S (UNALIGNED_STMIA8)
(UNALIGNED_LDMIA4): New.
(_gcry_chacha20_armv7_neon_blocks): Use new helper macros instead of
ldm/stm instructions directly.
* cipher/poly1305-armv7-neon.S (UNALIGNED_LDMIA2)
(UNALIGNED_LDMIA4): New.
(_gcry_poly1305_armv7_neon_init_ext, _gcry_poly1305_armv7_neon_blocks)
(_gcry_poly1305_armv7_neon_finish_ext): Use new helper macros instead
of ldm instruction directly.
--
Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
0 files changed
diff --git a/cipher/chacha20-armv7-neon.S b/cipher/chacha20-armv7-neon.S
index 1a395ba..4d3340b 100644
--- a/cipher/chacha20-armv7-neon.S
+++ b/cipher/chacha20-armv7-neon.S
@@ -33,6 +33,40 @@
.fpu neon
.arm
+#define UNALIGNED_STMIA8(ptr, l0, l1, l2, l3, l4, l5, l6, l7) /* Store the eight words l0..l7 at ptr and advance ptr by 32 bytes; ptr may be unaligned. Overwrites the condition flags. */ \
+ tst ptr, #3; /* Z is set when the low two bits of ptr are clear */ \
+ beq 1f; /* 4-byte aligned: use the plain stmia at label 1 */ \
+ vpush {d0-d3}; /* save d0-d3 (s0-s7), borrowed as staging registers below */ \
+ vmov s0, l0; /* copy the eight core registers into s0-s7 */ \
+ vmov s1, l1; \
+ vmov s2, l2; \
+ vmov s3, l3; \
+ vmov s4, l4; \
+ vmov s5, l5; \
+ vmov s6, l6; \
+ vmov s7, l7; \
+ vst1.32 {d0-d3}, [ptr]; /* NEON store of the staged 32 bytes, issued without an alignment qualifier */ \
+ add ptr, #32; /* advance ptr as the stmia writeback on the other path does */ \
+ vpop {d0-d3}; /* restore the borrowed NEON registers */ \
+ b 2f; /* skip the aligned path */ \
+ 1: stmia ptr!, {l0-l7}; /* aligned path; this is the register range l0 through l7, so l1..l6 are expected to be the registers in between */ \
+ 2: ;
+
+#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) /* Load four words from ptr into l0..l3 and advance ptr by 16 bytes; ptr may be unaligned. Overwrites the condition flags, so callers must redo any test they still depend on. */ \
+ tst ptr, #3; /* Z is set when the low two bits of ptr are clear */ \
+ beq 1f; /* 4-byte aligned: use the plain ldmia at label 1 instead of the NEON detour */ \
+ vpush {d0-d1}; /* save d0-d1 (s0-s3), borrowed as staging registers below */ \
+ vld1.32 {d0-d1}, [ptr]; /* NEON load of 16 bytes, issued without an alignment qualifier */ \
+ add ptr, #16; /* advance ptr as the ldmia writeback on the other path does */ \
+ vmov l0, s0; /* move the four loaded words into the core registers */ \
+ vmov l1, s1; \
+ vmov l2, s2; \
+ vmov l3, s3; \
+ vpop {d0-d1}; /* restore the borrowed NEON registers */ \
+ b 2f; /* skip the aligned path */ \
+ 1: ldmia ptr!, {l0-l3}; /* aligned path; this is the register range l0 through l3, so l1 and l2 are expected to be the registers in between */ \
+ 2: ;
+
.text
.globl _gcry_chacha20_armv7_neon_blocks
@@ -352,7 +386,8 @@ _gcry_chacha20_armv7_neon_blocks:
add r7, r7, r11
vadd.i32 q11, q11, q14
beq .Lchacha_blocks_neon_nomessage11
- ldmia r12!, {r8-r11}
+ UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+ tst r12, r12
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
@@ -367,7 +402,8 @@ _gcry_chacha20_armv7_neon_blocks:
add r12, r12, #16
eor r7, r7, r11
.Lchacha_blocks_neon_nomessage11:
- stmia r14!, {r0-r7}
+ UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
+ tst r12, r12
ldm sp, {r0-r7}
ldr r8, [sp, #(64 +32)]
ldr r9, [sp, #(64 +36)]
@@ -391,7 +427,8 @@ _gcry_chacha20_armv7_neon_blocks:
tst r12, r12
str r9, [sp, #(64 +52)]
beq .Lchacha_blocks_neon_nomessage12
- ldmia r12!, {r8-r11}
+ UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+ tst r12, r12
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
@@ -406,7 +443,8 @@ _gcry_chacha20_armv7_neon_blocks:
add r12, r12, #16
eor r7, r7, r11
.Lchacha_blocks_neon_nomessage12:
- stmia r14!, {r0-r7}
+ UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
+ tst r12, r12
beq .Lchacha_blocks_neon_nomessage13
vld1.32 {q12,q13}, [r12]!
vld1.32 {q14,q15}, [r12]!
@@ -613,7 +651,8 @@ _gcry_chacha20_armv7_neon_blocks:
tst r12, r12
add r7, r7, r11
beq .Lchacha_blocks_neon_nomessage21
- ldmia r12!, {r8-r11}
+ UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+ tst r12, r12
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
@@ -628,7 +667,7 @@ _gcry_chacha20_armv7_neon_blocks:
add r12, r12, #16
eor r7, r7, r11
.Lchacha_blocks_neon_nomessage21:
- stmia r14!, {r0-r7}
+ UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
ldm sp, {r0-r7}
ldr r8, [sp, #(64 +32)]
ldr r9, [sp, #(64 +36)]
@@ -652,7 +691,8 @@ _gcry_chacha20_armv7_neon_blocks:
tst r12, r12
str r9, [sp, #(64 +52)]
beq .Lchacha_blocks_neon_nomessage22
- ldmia r12!, {r8-r11}
+ UNALIGNED_LDMIA4(r12, r8, r9, r10, r11)
+ tst r12, r12
eor r0, r0, r8
eor r1, r1, r9
eor r2, r2, r10
@@ -667,7 +707,7 @@ _gcry_chacha20_armv7_neon_blocks:
add r12, r12, #16
eor r7, r7, r11
.Lchacha_blocks_neon_nomessage22:
- stmia r14!, {r0-r7}
+ UNALIGNED_STMIA8(r14, r0, r1, r2, r3, r4, r5, r6, r7)
str r12, [sp, #48]
str r14, [sp, #40]
ldr r3, [sp, #52]
diff --git a/cipher/poly1305-armv7-neon.S b/cipher/poly1305-armv7-neon.S
index b1554ed..13cb4a5 100644
--- a/cipher/poly1305-armv7-neon.S
+++ b/cipher/poly1305-armv7-neon.S
@@ -46,6 +46,32 @@
# define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif
+#define UNALIGNED_LDMIA2(ptr, l0, l1) /* Load two words from ptr into l0 and l1 and advance ptr by 8 bytes; ptr may be unaligned. Overwrites the condition flags. */ \
+ tst ptr, #3; /* Z is set when the low two bits of ptr are clear */ \
+ beq 1f; /* 4-byte aligned: use the plain ldmia at label 1 */ \
+ vpush {d0}; /* save d0 (s0-s1), borrowed as a staging register below */ \
+ vld1.32 {d0}, [ptr]!; /* NEON load of 8 bytes without an alignment qualifier; the writeback advances ptr */ \
+ vmov l0, s0; /* move the two loaded words into the core registers */ \
+ vmov l1, s1; \
+ vpop {d0}; /* restore the borrowed NEON register */ \
+ b 2f; /* skip the aligned path */ \
+ 1: ldmia ptr!, {l0-l1}; /* aligned path: ordinary load-multiple with writeback */ \
+ 2: ;
+
+#define UNALIGNED_LDMIA4(ptr, l0, l1, l2, l3) /* Load four words from ptr into l0..l3 and advance ptr by 16 bytes; ptr may be unaligned. Overwrites the condition flags. */ \
+ tst ptr, #3; /* Z is set when the low two bits of ptr are clear */ \
+ beq 1f; /* 4-byte aligned: use the plain ldmia at label 1 */ \
+ vpush {d0-d1}; /* save d0-d1 (s0-s3), borrowed as staging registers below */ \
+ vld1.32 {d0-d1}, [ptr]!; /* NEON load of 16 bytes without an alignment qualifier; the writeback advances ptr */ \
+ vmov l0, s0; /* move the four loaded words into the core registers */ \
+ vmov l1, s1; \
+ vmov l2, s2; \
+ vmov l3, s3; \
+ vpop {d0-d1}; /* restore the borrowed NEON registers */ \
+ b 2f; /* skip the aligned path */ \
+ 1: ldmia ptr!, {l0-l3}; /* aligned path; this is the register range l0 through l3, so l1 and l2 are expected to be the registers in between */ \
+ 2: ;
+
.text
.p2align 2
@@ -64,7 +90,7 @@ _gcry_poly1305_armv7_neon_init_ext:
mov r14, r2
and r2, r2, r2
moveq r14, #-1
- ldmia r1!, {r2-r5}
+ UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
GET_DATA_POINTER(r7,.Lpoly1305_init_constants_neon,r8)
mov r6, r2
mov r8, r2, lsr #26
@@ -175,7 +201,7 @@ _gcry_poly1305_armv7_neon_init_ext:
eor r6, r6, r6
stmia r0!, {r2-r6}
stmia r0!, {r2-r6}
- ldmia r1!, {r2-r5}
+ UNALIGNED_LDMIA4(r1, r2, r3, r4, r5)
stmia r0, {r2-r6}
add sp, sp, #32
ldmfd sp!, {r4-r11, lr}
@@ -286,7 +312,7 @@ _gcry_poly1305_armv7_neon_blocks:
vmov d14, d12
vmul.i32 q6, q5, d0[0]
.Lpoly1305_blocks_neon_mainloop:
- ldmia r0!, {r2-r5}
+ UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
vmull.u32 q0, d25, d12[0]
mov r7, r2, lsr #26
vmlal.u32 q0, d24, d12[1]
@@ -302,7 +328,7 @@ _gcry_poly1305_armv7_neon_blocks:
orr r4, r8, r4, lsl #12
orr r5, r9, r5, lsl #18
vmlal.u32 q1, d24, d13[0]
- ldmia r0!, {r7-r10}
+ UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
vmlal.u32 q1, d23, d13[1]
mov r1, r7, lsr #26
vmlal.u32 q1, d22, d14[0]
@@ -344,7 +370,7 @@ _gcry_poly1305_armv7_neon_blocks:
vmlal.u32 q4, d21, d11[1]
vld1.64 {d21-d24}, [r14, :256]!
vld1.64 {d25}, [r14, :64]
- ldmia r0!, {r2-r5}
+ UNALIGNED_LDMIA4(r0, r2, r3, r4, r5)
vmlal.u32 q0, d25, d26
mov r7, r2, lsr #26
vmlal.u32 q0, d24, d27
@@ -360,7 +386,7 @@ _gcry_poly1305_armv7_neon_blocks:
orr r4, r8, r4, lsl #12
orr r5, r9, r5, lsl #18
vmlal.u32 q1, d24, d28
- ldmia r0!, {r7-r10}
+ UNALIGNED_LDMIA4(r0, r7, r8, r9, r10)
vmlal.u32 q1, d23, d29
mov r1, r7, lsr #26
vmlal.u32 q1, d22, d20
@@ -643,7 +669,7 @@ _gcry_poly1305_armv7_neon_finish_ext:
.Lpoly1305_finish_ext_neon_skip16:
tst r7, #8
beq .Lpoly1305_finish_ext_neon_skip8
- ldmia r1!, {r10-r11}
+ UNALIGNED_LDMIA2(r1, r10, r11)
stmia r9!, {r10-r11}
.Lpoly1305_finish_ext_neon_skip8:
tst r7, #4
More information about the Gcrypt-devel
mailing list