[PATCH 3/5] aarch64: Fix assembling chacha20-aarch64.S with clang/llvm
Martin Storsjö
martin at martin.st
Thu Mar 22 09:56:13 CET 2018
When referring to a specific lane of a vector register, one only needs
to specify the element size (e.g. `X15.s[0]`), not the total number of
lanes of the register (e.g. `X15.4s[0]`). GNU binutils accepts both
forms, while clang/llvm rejects the form with the redundant lane count.
Signed-off-by: Martin Storsjö <martin at martin.st>
---
cipher/chacha20-aarch64.S | 60 +++++++++++++++++++++++------------------------
1 file changed, 30 insertions(+), 30 deletions(-)
diff --git a/cipher/chacha20-aarch64.S b/cipher/chacha20-aarch64.S
index 739ddde..5990a08 100644
--- a/cipher/chacha20-aarch64.S
+++ b/cipher/chacha20-aarch64.S
@@ -170,27 +170,27 @@ _gcry_chacha20_aarch64_blocks4:
mov ROUND, #20;
ld1 {VTMP1.16b-VTMP3.16b}, [INPUT_POS];
- dup X12.4s, X15.4s[0];
- dup X13.4s, X15.4s[1];
+ dup X12.4s, X15.s[0];
+ dup X13.4s, X15.s[1];
ldr CTR, [INPUT_CTR];
add X12.4s, X12.4s, VCTR.4s;
- dup X0.4s, VTMP1.4s[0];
- dup X1.4s, VTMP1.4s[1];
- dup X2.4s, VTMP1.4s[2];
- dup X3.4s, VTMP1.4s[3];
- dup X14.4s, X15.4s[2];
+ dup X0.4s, VTMP1.s[0];
+ dup X1.4s, VTMP1.s[1];
+ dup X2.4s, VTMP1.s[2];
+ dup X3.4s, VTMP1.s[3];
+ dup X14.4s, X15.s[2];
cmhi VTMP0.4s, VCTR.4s, X12.4s;
- dup X15.4s, X15.4s[3];
+ dup X15.4s, X15.s[3];
add CTR, CTR, #4; /* Update counter */
- dup X4.4s, VTMP2.4s[0];
- dup X5.4s, VTMP2.4s[1];
- dup X6.4s, VTMP2.4s[2];
- dup X7.4s, VTMP2.4s[3];
+ dup X4.4s, VTMP2.s[0];
+ dup X5.4s, VTMP2.s[1];
+ dup X6.4s, VTMP2.s[2];
+ dup X7.4s, VTMP2.s[3];
sub X13.4s, X13.4s, VTMP0.4s;
- dup X8.4s, VTMP3.4s[0];
- dup X9.4s, VTMP3.4s[1];
- dup X10.4s, VTMP3.4s[2];
- dup X11.4s, VTMP3.4s[3];
+ dup X8.4s, VTMP3.s[0];
+ dup X9.4s, VTMP3.s[1];
+ dup X10.4s, VTMP3.s[2];
+ dup X11.4s, VTMP3.s[3];
mov X12_TMP.16b, X12.16b;
mov X13_TMP.16b, X13.16b;
str CTR, [INPUT_CTR];
@@ -208,19 +208,19 @@ _gcry_chacha20_aarch64_blocks4:
PLUS(X12, X12_TMP); /* INPUT + 12 * 4 + counter */
PLUS(X13, X13_TMP); /* INPUT + 13 * 4 + counter */
- dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 0 * 4 */
- dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 1 * 4 */
- dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 2 * 4 */
- dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 3 * 4 */
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 0 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 1 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 2 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 3 * 4 */
PLUS(X0, VTMP2);
PLUS(X1, VTMP3);
PLUS(X2, X12_TMP);
PLUS(X3, X13_TMP);
- dup VTMP2.4s, VTMP1.4s[0]; /* INPUT + 4 * 4 */
- dup VTMP3.4s, VTMP1.4s[1]; /* INPUT + 5 * 4 */
- dup X12_TMP.4s, VTMP1.4s[2]; /* INPUT + 6 * 4 */
- dup X13_TMP.4s, VTMP1.4s[3]; /* INPUT + 7 * 4 */
+ dup VTMP2.4s, VTMP1.s[0]; /* INPUT + 4 * 4 */
+ dup VTMP3.4s, VTMP1.s[1]; /* INPUT + 5 * 4 */
+ dup X12_TMP.4s, VTMP1.s[2]; /* INPUT + 6 * 4 */
+ dup X13_TMP.4s, VTMP1.s[3]; /* INPUT + 7 * 4 */
ld1 {VTMP0.16b, VTMP1.16b}, [INPUT_POS];
mov INPUT_POS, INPUT;
PLUS(X4, VTMP2);
@@ -228,12 +228,12 @@ _gcry_chacha20_aarch64_blocks4:
PLUS(X6, X12_TMP);
PLUS(X7, X13_TMP);
- dup VTMP2.4s, VTMP0.4s[0]; /* INPUT + 8 * 4 */
- dup VTMP3.4s, VTMP0.4s[1]; /* INPUT + 9 * 4 */
- dup X12_TMP.4s, VTMP0.4s[2]; /* INPUT + 10 * 4 */
- dup X13_TMP.4s, VTMP0.4s[3]; /* INPUT + 11 * 4 */
- dup VTMP0.4s, VTMP1.4s[2]; /* INPUT + 14 * 4 */
- dup VTMP1.4s, VTMP1.4s[3]; /* INPUT + 15 * 4 */
+ dup VTMP2.4s, VTMP0.s[0]; /* INPUT + 8 * 4 */
+ dup VTMP3.4s, VTMP0.s[1]; /* INPUT + 9 * 4 */
+ dup X12_TMP.4s, VTMP0.s[2]; /* INPUT + 10 * 4 */
+ dup X13_TMP.4s, VTMP0.s[3]; /* INPUT + 11 * 4 */
+ dup VTMP0.4s, VTMP1.s[2]; /* INPUT + 14 * 4 */
+ dup VTMP1.4s, VTMP1.s[3]; /* INPUT + 15 * 4 */
PLUS(X8, VTMP2);
PLUS(X9, VTMP3);
PLUS(X10, X12_TMP);
--
2.7.4
More information about the Gcrypt-devel mailing list