[PATCH 3/3] Optimize and cleanup 32-bit and 64-bit endianess transforms

Jussi Kivilinna jussi.kivilinna at iki.fi
Thu Sep 19 13:48:04 CEST 2013


* cipher/bithelp.h (bswap32, bswap64, le_bswap32, be_bswap32)
(le_bswap64, be_bswap64): New.
* cipher/bufhelp.h (buf_get_be32, buf_get_le32, buf_put_le32)
(buf_put_be32, buf_get_be64, buf_get_le64, buf_put_be64)
(buf_put_le64): New.
* cipher/blowfish.c (do_encrypt_block, do_decrypt_block): Use new
endian conversion helpers.
(do_bf_setkey): Turn endian specific code to generic.
* cipher/camellia.c (GETU32, PUTU32): Use new endian conversion
helpers.
* cipher/cast5.c (rol): Remove, use rol from bithelp.
(F1, F2, F3): Fix to use rol from bithelp.
(do_encrypt_block, do_decrypt_block, do_cast_setkey): Use new endian
conversion helpers.
* cipher/des.c (READ_64BIT_DATA, WRITE_64BIT_DATA): Ditto.
* cipher/md4.c (transform, md4_final): Ditto.
* cipher/md5.c (transform, md5_final): Ditto.
* cipher/rmd160.c (transform, rmd160_final): Ditto.
* cipher/salsa20.c (LE_SWAP32, LE_READ_UINT32): Ditto.
* cipher/scrypt.c (READ_UINT64, LE_READ_UINT64, LE_SWAP32): Ditto.
* cipher/seed.c (GETU32, PUTU32): Ditto.
* cipher/serpent.c (byte_swap_32): Remove.
(serpent_key_prepare, serpent_encrypt_internal)
(serpent_decrypt_internal): Use new endian conversion helpers.
* cipher/sha1.c (transform, sha1_final): Ditto.
* cipher/sha256.c (transform, sha256_final): Ditto.
* cipher/sha512.c (__transform, sha512_final): Ditto.
* cipher/stribog.c (transform, stribog_final): Ditto.
* cipher/tiger.c (transform, tiger_final): Ditto.
* cipher/twofish.c (INPACK, OUTUNPACK): Ditto.
* cipher/whirlpool.c (buffer_to_block, block_to_buffer): Ditto.
* configure.ac (gcry_cv_have_builtin_bswap32): Check for compiler
provided __builtin_bswap32.
(gcry_cv_have_builtin_bswap64): Check for compiler provided
__builtin_bswap64.
--

Patch add helper functions that provide conversions to/from integers and
buffers of different endianess. Benefits are code cleanup and optimization
for architectures that have byte-swaping instructions and/or can do fast
unaligned memory accesses.

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/bithelp.h   |   40 +++++++++++++++
 cipher/blowfish.c  |   43 ++++------------
 cipher/bufhelp.h   |  143 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 cipher/camellia.c  |   14 +----
 cipher/cast5.c     |   57 ++++++---------------
 cipher/des.c       |   11 ++--
 cipher/md4.c       |   37 +++----------
 cipher/md5.c       |   38 +++-----------
 cipher/rmd160.c    |   45 +++-------------
 cipher/salsa20.c   |   14 +----
 cipher/scrypt.c    |   34 ++----------
 cipher/seed.c      |    9 +--
 cipher/serpent.c   |   57 +++++++--------------
 cipher/sha1.c      |   70 ++++++++-----------------
 cipher/sha256.c    |   37 ++-----------
 cipher/sha512.c    |   51 ++-----------------
 cipher/stribog.c   |   35 +------------
 cipher/tiger.c     |   48 ++++-------------
 cipher/twofish.c   |    7 +--
 cipher/whirlpool.c |   23 +-------
 configure.ac       |   30 +++++++++++
 21 files changed, 352 insertions(+), 491 deletions(-)

diff --git a/cipher/bithelp.h b/cipher/bithelp.h
index 785701e..734dcbb 100644
--- a/cipher/bithelp.h
+++ b/cipher/bithelp.h
@@ -20,6 +20,8 @@
 #ifndef G10_BITHELP_H
 #define G10_BITHELP_H
 
+#include "types.h"
+
 
 /****************
  * Rotate the 32 bit unsigned integer X by N bits left/right
@@ -52,5 +54,43 @@ ror(u32 x, int n)
 #define ror(x,n) ( ((x) >> (n)) | ((x) << (32-(n))) )
 #endif
 
+/* Byte swap for 32-bit and 64-bit integers.  If available, use compiler
+   provided helpers.  */
+#ifdef HAVE_BUILTIN_BSWAP32
+# define bswap32 __builtin_bswap32
+#else
+static inline u32 bswap32(u32 x)
+{
+	return ((rol(x, 8) & 0x00ff00ffL) | (ror(x, 8) & 0xff00ff00L));
+}
+#endif
+
+#ifdef HAVE_U64_TYPEDEF
+# ifdef HAVE_BUILTIN_BSWAP64
+#  define bswap64 __builtin_bswap64
+# else
+static inline u64 bswap64(u64 x)
+{
+	return ((u64)bswap32(x) << 32) | (bswap32(x >> 32));
+}
+# endif
+#endif
+
+/* Endian dependent byte swap operations.  */
+#ifdef WORDS_BIGENDIAN
+# define le_bswap32(x) bswap32(x)
+# define be_bswap32(x) ((u32)(x))
+# ifdef HAVE_U64_TYPEDEF
+#  define le_bswap64(x) bswap64(x)
+#  define be_bswap64(x) ((u64)(x))
+# endif
+#else
+# define le_bswap32(x) ((u32)(x))
+# define be_bswap32(x) bswap32(x)
+# ifdef HAVE_U64_TYPEDEF
+#  define le_bswap64(x) ((u64)(x))
+#  define be_bswap64(x) bswap64(x)
+# endif
+#endif
 
 #endif /*G10_BITHELP_H*/
diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index 80e1ec7..61042ed 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -544,17 +544,11 @@ do_encrypt_block ( BLOWFISH_context *bc, byte *outbuf, const byte *inbuf )
 {
   u32 d1, d2;
 
-  d1 = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-  d2 = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+  d1 = buf_get_be32(inbuf);
+  d2 = buf_get_be32(inbuf + 4);
   do_encrypt( bc, &d1, &d2 );
-  outbuf[0] = (d1 >> 24) & 0xff;
-  outbuf[1] = (d1 >> 16) & 0xff;
-  outbuf[2] = (d1 >>	8) & 0xff;
-  outbuf[3] =  d1	   & 0xff;
-  outbuf[4] = (d2 >> 24) & 0xff;
-  outbuf[5] = (d2 >> 16) & 0xff;
-  outbuf[6] = (d2 >>	8) & 0xff;
-  outbuf[7] =  d2	   & 0xff;
+  buf_put_be32(outbuf, d1);
+  buf_put_be32(outbuf + 4, d2);
 }
 
 static unsigned int
@@ -571,17 +565,11 @@ do_decrypt_block (BLOWFISH_context *bc, byte *outbuf, const byte *inbuf)
 {
   u32 d1, d2;
 
-  d1 = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-  d2 = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+  d1 = buf_get_be32(inbuf);
+  d2 = buf_get_be32(inbuf + 4);
   decrypt( bc, &d1, &d2 );
-  outbuf[0] = (d1 >> 24) & 0xff;
-  outbuf[1] = (d1 >> 16) & 0xff;
-  outbuf[2] = (d1 >>	8) & 0xff;
-  outbuf[3] =  d1	   & 0xff;
-  outbuf[4] = (d2 >> 24) & 0xff;
-  outbuf[5] = (d2 >> 16) & 0xff;
-  outbuf[6] = (d2 >>	8) & 0xff;
-  outbuf[7] =  d2	   & 0xff;
+  buf_put_be32(outbuf, d1);
+  buf_put_be32(outbuf + 4, d2);
 }
 
 static unsigned int
@@ -903,17 +891,10 @@ do_bf_setkey (BLOWFISH_context *c, const byte *key, unsigned keylen)
 
   for(i=j=0; i < BLOWFISH_ROUNDS+2; i++ )
     {
-#ifdef WORDS_BIGENDIAN
-      ((byte*)&data)[0] = key[j];
-      ((byte*)&data)[1] = key[(j+1)%keylen];
-      ((byte*)&data)[2] = key[(j+2)%keylen];
-      ((byte*)&data)[3] = key[(j+3)%keylen];
-#else
-      ((byte*)&data)[3] = key[j];
-      ((byte*)&data)[2] = key[(j+1)%keylen];
-      ((byte*)&data)[1] = key[(j+2)%keylen];
-      ((byte*)&data)[0] = key[(j+3)%keylen];
-#endif
+      data = ((u32)key[j] << 24) |
+             ((u32)key[(j+1)%keylen] << 16) |
+             ((u32)key[(j+2)%keylen] << 8) |
+             ((u32)key[(j+3)%keylen]);
       c->p[i] ^= data;
       j = (j+4) % keylen;
     }
diff --git a/cipher/bufhelp.h b/cipher/bufhelp.h
index d829cf1..c637dac 100644
--- a/cipher/bufhelp.h
+++ b/cipher/bufhelp.h
@@ -20,6 +20,8 @@
 #ifndef G10_BUFHELP_H
 #define G10_BUFHELP_H
 
+#include <config.h>
+
 #ifdef HAVE_STDINT_H
 # include <stdint.h> /* uintptr_t */
 #elif defined(HAVE_INTTYPES_H)
@@ -28,6 +30,8 @@
 /* In this case, uintptr_t is provided by config.h. */
 #endif
 
+#include "bithelp.h"
+
 
 #if defined(__i386__) || defined(__x86_64__) || \
     (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED))
@@ -179,4 +183,143 @@ do_bytes:
     }
 }
 
+
+#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
+
+/* Functions for loading and storing unaligned u32 values of different
+   endianness.  */
+static inline u32 buf_get_be32(const void *_buf)
+{
+  const byte *in = _buf;
+  return ((u32)in[0] << 24) | ((u32)in[1] << 16) | \
+         ((u32)in[2] << 8) | (u32)in[3];
+}
+
+static inline u32 buf_get_le32(const void *_buf)
+{
+  const byte *in = _buf;
+  return ((u32)in[3] << 24) | ((u32)in[2] << 16) | \
+         ((u32)in[1] << 8) | (u32)in[0];
+}
+
+static inline void buf_put_be32(void *_buf, u32 val)
+{
+  byte *out = _buf;
+  out[0] = val >> 24;
+  out[1] = val >> 16;
+  out[2] = val >> 8;
+  out[3] = val;
+}
+
+static inline void buf_put_le32(void *_buf, u32 val)
+{
+  byte *out = _buf;
+  out[3] = val >> 24;
+  out[2] = val >> 16;
+  out[1] = val >> 8;
+  out[0] = val;
+}
+
+#ifdef HAVE_U64_TYPEDEF
+/* Functions for loading and storing unaligned u64 values of different
+   endianness.  */
+static inline u64 buf_get_be64(const void *_buf)
+{
+  const byte *in = _buf;
+  return ((u64)in[0] << 56) | ((u64)in[1] << 48) | \
+         ((u64)in[2] << 40) | ((u64)in[3] << 32) | \
+         ((u64)in[4] << 24) | ((u64)in[5] << 16) | \
+         ((u64)in[6] << 8) | (u64)in[7];
+}
+
+static inline u64 buf_get_le64(const void *_buf)
+{
+  const byte *in = _buf;
+  return ((u64)in[7] << 56) | ((u64)in[6] << 48) | \
+         ((u64)in[5] << 40) | ((u64)in[4] << 32) | \
+         ((u64)in[3] << 24) | ((u64)in[2] << 16) | \
+         ((u64)in[1] << 8) | (u64)in[0];
+}
+
+static inline void buf_put_be64(void *_buf, u64 val)
+{
+  byte *out = _buf;
+  out[0] = val >> 56;
+  out[1] = val >> 48;
+  out[2] = val >> 40;
+  out[3] = val >> 32;
+  out[4] = val >> 24;
+  out[5] = val >> 16;
+  out[6] = val >> 8;
+  out[7] = val;
+}
+
+static inline void buf_put_le64(void *_buf, u64 val)
+{
+  byte *out = _buf;
+  out[7] = val >> 56;
+  out[6] = val >> 48;
+  out[5] = val >> 40;
+  out[4] = val >> 32;
+  out[3] = val >> 24;
+  out[2] = val >> 16;
+  out[1] = val >> 8;
+  out[0] = val;
+}
+#endif /*HAVE_U64_TYPEDEF*/
+
+#else /*BUFHELP_FAST_UNALIGNED_ACCESS*/
+
+/* Functions for loading and storing unaligned u32 values of different
+   endianness.  */
+static inline u32 buf_get_be32(const void *_buf)
+{
+  return be_bswap32(*(const u32 *)_buf);
+}
+
+static inline u32 buf_get_le32(const void *_buf)
+{
+  return le_bswap32(*(const u32 *)_buf);
+}
+
+static inline void buf_put_be32(void *_buf, u32 val)
+{
+  u32 *out = _buf;
+  *out = be_bswap32(val);
+}
+
+static inline void buf_put_le32(void *_buf, u32 val)
+{
+  u32 *out = _buf;
+  *out = le_bswap32(val);
+}
+
+#ifdef HAVE_U64_TYPEDEF
+/* Functions for loading and storing unaligned u64 values of different
+   endianness.  */
+static inline u64 buf_get_be64(const void *_buf)
+{
+  return be_bswap64(*(const u64 *)_buf);
+}
+
+static inline u64 buf_get_le64(const void *_buf)
+{
+  return le_bswap64(*(const u64 *)_buf);
+}
+
+static inline void buf_put_be64(void *_buf, u64 val)
+{
+  u64 *out = _buf;
+  *out = be_bswap64(val);
+}
+
+static inline void buf_put_le64(void *_buf, u64 val)
+{
+  u64 *out = _buf;
+  *out = le_bswap64(val);
+}
+#endif /*HAVE_U64_TYPEDEF*/
+
+#endif /*BUFHELP_FAST_UNALIGNED_ACCESS*/
+
 #endif /*G10_BITHELP_H*/
diff --git a/cipher/camellia.c b/cipher/camellia.c
index 038d911..03510a3 100644
--- a/cipher/camellia.c
+++ b/cipher/camellia.c
@@ -25,6 +25,7 @@
 #include <string.h>
 #include <stdlib.h>
 
+#include "bufhelp.h"
 #include "camellia.h"
 
 /* u32 must be 32bit word */
@@ -59,17 +60,8 @@ typedef unsigned char u8;
 
 #else /* not MS-VC */
 
-# define GETU32(pt)				\
-    (((u32)(pt)[0] << 24)			\
-     ^ ((u32)(pt)[1] << 16)			\
-     ^ ((u32)(pt)[2] <<  8)			\
-     ^ ((u32)(pt)[3]))
-
-# define PUTU32(ct, st)  {			\
-	(ct)[0] = (u8)((st) >> 24);		\
-	(ct)[1] = (u8)((st) >> 16);		\
-	(ct)[2] = (u8)((st) >>  8);		\
-	(ct)[3] = (u8)(st); }
+# define GETU32(pt) buf_get_be32(pt)
+# define PUTU32(ct, st) buf_put_be32(ct, st)
 
 #endif
 
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 0cd5953..ae6b509 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -42,6 +42,7 @@
 #include "g10lib.h"
 #include "types.h"
 #include "cipher.h"
+#include "bithelp.h"
 #include "bufhelp.h"
 #include "cipher-selftest.h"
 
@@ -448,25 +449,11 @@ decrypt_block (void *context, byte *outbuf, const byte *inbuf)
 
 #else /*USE_ARMV6_ASM*/
 
-#if defined(__GNUC__) && defined(__i386__)
-static inline u32
-rol(int n, u32 x)
-{
-	__asm__("roll %%cl,%0"
-		:"=r" (x)
-		:"0" (x),"c" (n)
-		:"cc");
-	return x;
-}
-#else
-#define rol(n,x) ( ((x) << (n)) | ((x) >> (32-(n))) )
-#endif
-
-#define F1(D,m,r)  (  (I = ((m) + (D))), (I=rol((r),I)),   \
+#define F1(D,m,r)  (  (I = ((m) + (D))), (I=rol(I,(r))),   \
     (((s1[I >> 24] ^ s2[(I>>16)&0xff]) - s3[(I>>8)&0xff]) + s4[I&0xff]) )
-#define F2(D,m,r)  (  (I = ((m) ^ (D))), (I=rol((r),I)),   \
+#define F2(D,m,r)  (  (I = ((m) ^ (D))), (I=rol(I,(r))),   \
     (((s1[I >> 24] - s2[(I>>16)&0xff]) + s3[(I>>8)&0xff]) ^ s4[I&0xff]) )
-#define F3(D,m,r)  (  (I = ((m) - (D))), (I=rol((r),I)),   \
+#define F3(D,m,r)  (  (I = ((m) - (D))), (I=rol(I,(r))),   \
     (((s1[I >> 24] + s2[(I>>16)&0xff]) ^ s3[(I>>8)&0xff]) - s4[I&0xff]) )
 
 static void
@@ -483,8 +470,8 @@ do_encrypt_block( CAST5_context *c, byte *outbuf, const byte *inbuf )
     /* (L0,R0) <-- (m1...m64).	(Split the plaintext into left and
      * right 32-bit halves L0 = m1...m32 and R0 = m33...m64.)
      */
-    l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-    r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+    l = buf_get_be32(inbuf + 0);
+    r = buf_get_be32(inbuf + 4);
 
     /* (16 rounds) for i from 1 to 16, compute Li and Ri as follows:
      *	Li = Ri-1;
@@ -513,14 +500,8 @@ do_encrypt_block( CAST5_context *c, byte *outbuf, const byte *inbuf )
 
     /* c1...c64 <-- (R16,L16).	(Exchange final blocks L16, R16 and
      *	concatenate to form the ciphertext.) */
-    outbuf[0] = (r >> 24) & 0xff;
-    outbuf[1] = (r >> 16) & 0xff;
-    outbuf[2] = (r >>  8) & 0xff;
-    outbuf[3] =  r	  & 0xff;
-    outbuf[4] = (l >> 24) & 0xff;
-    outbuf[5] = (l >> 16) & 0xff;
-    outbuf[6] = (l >>  8) & 0xff;
-    outbuf[7] =  l	  & 0xff;
+    buf_put_be32(outbuf + 0, r);
+    buf_put_be32(outbuf + 4, l);
 }
 
 static unsigned int
@@ -543,8 +524,8 @@ do_decrypt_block (CAST5_context *c, byte *outbuf, const byte *inbuf )
     Km = c->Km;
     Kr = c->Kr;
 
-    l = inbuf[0] << 24 | inbuf[1] << 16 | inbuf[2] << 8 | inbuf[3];
-    r = inbuf[4] << 24 | inbuf[5] << 16 | inbuf[6] << 8 | inbuf[7];
+    l = buf_get_be32(inbuf + 0);
+    r = buf_get_be32(inbuf + 4);
 
     t = l; l = r; r = t ^ F1(r, Km[15], Kr[15]);
     t = l; l = r; r = t ^ F3(r, Km[14], Kr[14]);
@@ -563,14 +544,8 @@ do_decrypt_block (CAST5_context *c, byte *outbuf, const byte *inbuf )
     t = l; l = r; r = t ^ F2(r, Km[ 1], Kr[ 1]);
     t = l; l = r; r = t ^ F1(r, Km[ 0], Kr[ 0]);
 
-    outbuf[0] = (r >> 24) & 0xff;
-    outbuf[1] = (r >> 16) & 0xff;
-    outbuf[2] = (r >>  8) & 0xff;
-    outbuf[3] =  r	  & 0xff;
-    outbuf[4] = (l >> 24) & 0xff;
-    outbuf[5] = (l >> 16) & 0xff;
-    outbuf[6] = (l >>  8) & 0xff;
-    outbuf[7] =  l	  & 0xff;
+    buf_put_be32(outbuf + 0, r);
+    buf_put_be32(outbuf + 4, l);
 }
 
 static unsigned int
@@ -949,10 +924,10 @@ do_cast_setkey( CAST5_context *c, const byte *key, unsigned keylen )
   if( keylen != 16 )
     return GPG_ERR_INV_KEYLEN;
 
-  x[0] = key[0]  << 24 | key[1]  << 16 | key[2]  << 8 | key[3];
-  x[1] = key[4]  << 24 | key[5]  << 16 | key[6]  << 8 | key[7];
-  x[2] = key[8]  << 24 | key[9]  << 16 | key[10] << 8 | key[11];
-  x[3] = key[12] << 24 | key[13] << 16 | key[14] << 8 | key[15];
+  x[0] = buf_get_be32(key + 0);
+  x[1] = buf_get_be32(key + 4);
+  x[2] = buf_get_be32(key + 8);
+  x[3] = buf_get_be32(key + 12);
 
   key_schedule( x, z, k );
   for(i=0; i < 16; i++ )
diff --git a/cipher/des.c b/cipher/des.c
index 7db9e5d..f1550d1 100644
--- a/cipher/des.c
+++ b/cipher/des.c
@@ -118,6 +118,7 @@
 #include "types.h"             /* for byte and u32 typedefs */
 #include "g10lib.h"
 #include "cipher.h"
+#include "bufhelp.h"
 
 #if defined(__GNUC__) && defined(__GNU_LIBRARY__)
 #define working_memcmp memcmp
@@ -455,14 +456,12 @@ static unsigned char weak_keys_chksum[20] = {
  * Macros to convert 8 bytes from/to 32bit words.
  */
 #define READ_64BIT_DATA(data, left, right)				   \
-    left  = (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];  \
-    right = (data[4] << 24) | (data[5] << 16) | (data[6] << 8) | data[7];
+    left = buf_get_be32(data + 0);					   \
+    right = buf_get_be32(data + 4);
 
 #define WRITE_64BIT_DATA(data, left, right)				   \
-    data[0] = (left >> 24) &0xff; data[1] = (left >> 16) &0xff; 	   \
-    data[2] = (left >> 8) &0xff; data[3] = left &0xff;			   \
-    data[4] = (right >> 24) &0xff; data[5] = (right >> 16) &0xff;	   \
-    data[6] = (right >> 8) &0xff; data[7] = right &0xff;
+    buf_put_be32(data + 0, left);					   \
+    buf_put_be32(data + 4, right);
 
 /*
  * Handy macros for encryption and decryption of data
diff --git a/cipher/md4.c b/cipher/md4.c
index 2de530c..e2d096c 100644
--- a/cipher/md4.c
+++ b/cipher/md4.c
@@ -56,6 +56,7 @@
 #include "cipher.h"
 
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "hash-common.h"
 
 
@@ -100,23 +101,10 @@ transform ( void *c, const unsigned char *data )
   register u32 B = ctx->B;
   register u32 C = ctx->C;
   register u32 D = ctx->D;
+  int i;
 
-#ifdef WORDS_BIGENDIAN
-  {
-    int i;
-    byte *p2;
-    const byte *p1;
-    for(i=0, p1=data, p2=(byte*)in; i < 16; i++, p2 += 4 )
-      {
-	p2[3] = *p1++;
-	p2[2] = *p1++;
-	p2[1] = *p1++;
-	p2[0] = *p1++;
-      }
-  }
-#else
-  memcpy (in, data, 64);
-#endif
+  for ( i = 0; i < 16; i++ )
+    in[i] = buf_get_le32(data + i * 4);
 
   /* Round 1.  */
 #define function(a,b,c,d,k,s) a=rol(a+F(b,c,d)+in[k],s);
@@ -238,24 +226,13 @@ md4_final( void *context )
       memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = lsb	   ;
-  hd->bctx.buf[57] = lsb >>  8;
-  hd->bctx.buf[58] = lsb >> 16;
-  hd->bctx.buf[59] = lsb >> 24;
-  hd->bctx.buf[60] = msb	   ;
-  hd->bctx.buf[61] = msb >>  8;
-  hd->bctx.buf[62] = msb >> 16;
-  hd->bctx.buf[63] = msb >> 24;
+  buf_put_le32(hd->bctx.buf + 56, lsb);
+  buf_put_le32(hd->bctx.buf + 60, msb);
   burn = transform( hd, hd->bctx.buf );
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *p++ = hd->a      ; *p++ = hd->a >> 8;      \
-		  *p++ = hd->a >> 16; *p++ = hd->a >> 24; } while(0)
-#else /* little endian */
-#define X(a) do { *(u32*)p = (*hd).a ; p += 4; } while(0)
-#endif
+#define X(a) do { *(u32*)p = le_bswap32((*hd).a) ; p += 4; } while(0)
   X(A);
   X(B);
   X(C);
diff --git a/cipher/md5.c b/cipher/md5.c
index 88745a8..db0f315 100644
--- a/cipher/md5.c
+++ b/cipher/md5.c
@@ -40,6 +40,7 @@
 #include "cipher.h"
 
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "hash-common.h"
 
 
@@ -91,24 +92,10 @@ transform ( void *c, const unsigned char *data )
   register u32 C = ctx->C;
   register u32 D = ctx->D;
   u32 *cwp = correct_words;
+  int i;
 
-#ifdef WORDS_BIGENDIAN
-  {
-    int i;
-    byte *p2;
-    const byte *p1;
-    for(i=0, p1=data, p2=(byte*)correct_words; i < 16; i++, p2 += 4 )
-      {
-        p2[3] = *p1++;
-	p2[2] = *p1++;
-	p2[1] = *p1++;
-	p2[0] = *p1++;
-      }
-  }
-#else
-  memcpy( correct_words, data, 64 );
-#endif
-
+  for ( i = 0; i < 16; i++ )
+    correct_words[i] = buf_get_le32(data + i * 4);
 
 #define OP(a, b, c, d, s, T) \
   do			         	   \
@@ -263,24 +250,13 @@ md5_final( void *context)
       memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = lsb	   ;
-  hd->bctx.buf[57] = lsb >>  8;
-  hd->bctx.buf[58] = lsb >> 16;
-  hd->bctx.buf[59] = lsb >> 24;
-  hd->bctx.buf[60] = msb	   ;
-  hd->bctx.buf[61] = msb >>  8;
-  hd->bctx.buf[62] = msb >> 16;
-  hd->bctx.buf[63] = msb >> 24;
+  buf_put_le32(hd->bctx.buf + 56, lsb);
+  buf_put_le32(hd->bctx.buf + 60, msb);
   burn = transform( hd, hd->bctx.buf );
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *p++ = hd->a      ; *p++ = hd->a >> 8;      \
-	          *p++ = hd->a >> 16; *p++ = hd->a >> 24; } while(0)
-#else /* little endian */
-#define X(a) do { *(u32*)p = (*hd).a ; p += 4; } while(0)
-#endif
+#define X(a) do { *(u32*)p = le_bswap32((*hd).a) ; p += 4; } while(0)
   X(A);
   X(B);
   X(C);
diff --git a/cipher/rmd160.c b/cipher/rmd160.c
index 7f143df..d156e61 100644
--- a/cipher/rmd160.c
+++ b/cipher/rmd160.c
@@ -28,6 +28,7 @@
 #include "cipher.h" /* Only used for the rmd160_hash_buffer() prototype. */
 
 #include "bithelp.h"
+#include "bufhelp.h"
 
 /*********************************
  * RIPEMD-160 is not patented, see (as of 25.10.97)
@@ -170,32 +171,11 @@ transform ( void *ctx, const unsigned char *data )
   RMD160_CONTEXT *hd = ctx;
   register u32 a,b,c,d,e;
   u32 aa,bb,cc,dd,ee,t;
-#ifdef WORDS_BIGENDIAN
   u32 x[16];
-  {
-    int i;
-    byte *p2;
-    const byte *p1;
-    for (i=0, p1=data, p2=(byte*)x; i < 16; i++, p2 += 4 )
-      {
-        p2[3] = *p1++;
-        p2[2] = *p1++;
-        p2[1] = *p1++;
-        p2[0] = *p1++;
-      }
-  }
-#else
-  /* This version is better because it is always aligned;
-   * The performance penalty on a 586-100 is about 6% which
-   * is acceptable - because the data is more local it might
-   * also be possible that this is faster on some machines.
-   * This function (when compiled with -02 on gcc 2.7.2)
-   * executes on a 586-100 (39.73 bogomips) at about 1900kb/sec;
-   * [measured with a 4MB data and "gpgm --print-md rmd160"] */
-  u32 x[16];
-  memcpy( x, data, 64 );
-#endif
+  int i;
 
+  for ( i = 0; i < 16; i++ )
+    x[i] = buf_get_le32(data + i * 4);
 
 #define K0  0x00000000
 #define K1  0x5A827999
@@ -469,24 +449,13 @@ rmd160_final( void *context )
       memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = lsb	   ;
-  hd->bctx.buf[57] = lsb >>  8;
-  hd->bctx.buf[58] = lsb >> 16;
-  hd->bctx.buf[59] = lsb >> 24;
-  hd->bctx.buf[60] = msb	   ;
-  hd->bctx.buf[61] = msb >>  8;
-  hd->bctx.buf[62] = msb >> 16;
-  hd->bctx.buf[63] = msb >> 24;
+  buf_put_le32(hd->bctx.buf + 56, lsb);
+  buf_put_le32(hd->bctx.buf + 60, msb);
   burn = transform( hd, hd->bctx.buf );
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *p++ = hd->h##a	   ; *p++ = hd->h##a >> 8;	\
-	          *p++ = hd->h##a >> 16; *p++ = hd->h##a >> 24; } while(0)
-#else /* little endian */
-#define X(a) do { *(u32*)p = hd->h##a ; p += 4; } while(0)
-#endif
+#define X(a) do { *(u32*)p = le_bswap32(hd->h##a) ; p += 4; } while(0)
   X(0);
   X(1);
   X(2);
diff --git a/cipher/salsa20.c b/cipher/salsa20.c
index 37f2989..88f5372 100644
--- a/cipher/salsa20.c
+++ b/cipher/salsa20.c
@@ -75,19 +75,9 @@ typedef struct
 #define ROTL32(n,x) (((x)<<(n)) | ((x)>>((-(n)&31))))
 
 
-#ifdef WORDS_BIGENDIAN
-# define LE_SWAP32(v)              \
-  ( (ROTL32( 8, v) & 0x00FF00FFul) \
-   |(ROTL32(24, v) & 0xFF00FF00ul))
-#else
-# define LE_SWAP32(v) (v)
-#endif
+#define LE_SWAP32(v) le_bswap32(v)
 
-#define LE_READ_UINT32(p)                 \
-  (  (((u32)(p)[3]) << 24)                \
-   | (((u32)(p)[2]) << 16)                \
-   | (((u32)(p)[1]) << 8)                 \
-   |  ((u32)(p)[0]))
+#define LE_READ_UINT32(p) buf_get_le32(p)
 
 
 static void salsa20_setiv (void *context, const byte *iv, unsigned int ivlen);
diff --git a/cipher/scrypt.c b/cipher/scrypt.c
index 9e29288..6f6a7f1 100644
--- a/cipher/scrypt.c
+++ b/cipher/scrypt.c
@@ -58,38 +58,14 @@
 
 
 /* Reads a 64-bit integer, in network, big-endian, byte order */
-#define READ_UINT64(p)                          \
-(  (((u64) (p)[0]) << 56)                  \
- | (((u64) (p)[1]) << 48)                  \
- | (((u64) (p)[2]) << 40)                  \
- | (((u64) (p)[3]) << 32)                  \
- | (((u64) (p)[4]) << 24)                  \
- | (((u64) (p)[5]) << 16)                  \
- | (((u64) (p)[6]) << 8)                   \
- |  ((u64) (p)[7]))
-
+#define READ_UINT64(p) buf_get_be64(p)
 
 
 /* And the other, little-endian, byteorder */
-#define LE_READ_UINT64(p)                       \
-(  (((u64) (p)[7]) << 56)                  \
- | (((u64) (p)[6]) << 48)                  \
- | (((u64) (p)[5]) << 40)                  \
- | (((u64) (p)[4]) << 32)                  \
- | (((u64) (p)[3]) << 24)                  \
- | (((u64) (p)[2]) << 16)                  \
- | (((u64) (p)[1]) << 8)                   \
- |  ((u64) (p)[0]))
-
-
-
-#ifdef WORDS_BIGENDIAN
-#define LE_SWAP32(v)				\
-  ((ROTL32(8,  v) & 0x00FF00FFUL) |		\
-   (ROTL32(24, v) & 0xFF00FF00UL))
-#else
-#define LE_SWAP32(v) (v)
-#endif
+#define LE_READ_UINT64(p) buf_get_le64(p)
+
+#define LE_SWAP32(v) le_bswap32(v)
+
 
 #define QROUND(x0, x1, x2, x3) do { \
   x1 ^= ROTL32(7, x0 + x3);	    \
diff --git a/cipher/seed.c b/cipher/seed.c
index 1600c55..474ccba 100644
--- a/cipher/seed.c
+++ b/cipher/seed.c
@@ -29,15 +29,12 @@
 #include "types.h"  /* for byte and u32 typedefs */
 #include "g10lib.h"
 #include "cipher.h"
+#include "bufhelp.h"
 
 #define NUMKC	16
 
-#define GETU32(pt) (((u32)(pt)[0] << 24) ^ ((u32)(pt)[1] << 16) ^ \
-		    ((u32)(pt)[2] <<  8) ^ ((u32)(pt)[3]))
-#define PUTU32(ct, st) { (ct)[0] = (byte)((st) >> 24); \
-			 (ct)[1] = (byte)((st) >> 16); \
-			 (ct)[2] = (byte)((st) >>  8); \
-			 (ct)[3] = (byte)(st); }
+#define GETU32(pt) buf_get_be32(pt)
+#define PUTU32(ct, st) buf_put_be32(ct, st)
 
 union wordbuf
 {
diff --git a/cipher/serpent.c b/cipher/serpent.c
index 72895ed..4720b9c 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -119,11 +119,6 @@ extern void _gcry_serpent_avx2_cfb_dec(serpent_context_t *ctx,
 static const char *serpent_test (void);
 
 
-#define byte_swap_32(x) \
-  (0 \
-   | (((x) & 0xff000000) >> 24) | (((x) & 0x00ff0000) >>  8) \
-   | (((x) & 0x0000ff00) <<  8) | (((x) & 0x000000ff) << 24))
-
 /*
  * These are the S-Boxes of Serpent from following research paper.
  *
@@ -548,14 +543,10 @@ serpent_key_prepare (const byte *key, unsigned int key_length,
   int i;
 
   /* Copy key.  */
-  memcpy (key_prepared, key, key_length);
   key_length /= 4;
-#ifdef WORDS_BIGENDIAN
   for (i = 0; i < key_length; i++)
-    key_prepared[i] = byte_swap_32 (key_prepared[i]);
-#else
-  i = key_length;
-#endif
+    key_prepared[i] = buf_get_le32 (key + i * 4);
+
   if (i < 8)
     {
       /* Key must be padded according to the Serpent
@@ -683,13 +674,10 @@ serpent_encrypt_internal (serpent_context_t *context,
   serpent_block_t b, b_next;
   int round = 0;
 
-  memcpy (b, input, sizeof (b));
-#ifdef WORDS_BIGENDIAN
-  b[0] = byte_swap_32 (b[0]);
-  b[1] = byte_swap_32 (b[1]);
-  b[2] = byte_swap_32 (b[2]);
-  b[3] = byte_swap_32 (b[3]);
-#endif
+  b[0] = buf_get_le32 (input + 0);
+  b[1] = buf_get_le32 (input + 4);
+  b[2] = buf_get_le32 (input + 8);
+  b[3] = buf_get_le32 (input + 12);
 
   ROUND (0, context->keys, b, b_next);
   ROUND (1, context->keys, b, b_next);
@@ -725,13 +713,10 @@ serpent_encrypt_internal (serpent_context_t *context,
 
   ROUND_LAST (7, context->keys, b, b_next);
 
-#ifdef WORDS_BIGENDIAN
-  b_next[0] = byte_swap_32 (b_next[0]);
-  b_next[1] = byte_swap_32 (b_next[1]);
-  b_next[2] = byte_swap_32 (b_next[2]);
-  b_next[3] = byte_swap_32 (b_next[3]);
-#endif
-  memcpy (output, b_next, sizeof (b_next));
+  buf_put_le32 (output + 0, b_next[0]);
+  buf_put_le32 (output + 4, b_next[1]);
+  buf_put_le32 (output + 8, b_next[2]);
+  buf_put_le32 (output + 12, b_next[3]);
 }
 
 static void
@@ -741,13 +726,10 @@ serpent_decrypt_internal (serpent_context_t *context,
   serpent_block_t b, b_next;
   int round = ROUNDS;
 
-  memcpy (b_next, input, sizeof (b));
-#ifdef WORDS_BIGENDIAN
-  b_next[0] = byte_swap_32 (b_next[0]);
-  b_next[1] = byte_swap_32 (b_next[1]);
-  b_next[2] = byte_swap_32 (b_next[2]);
-  b_next[3] = byte_swap_32 (b_next[3]);
-#endif
+  b_next[0] = buf_get_le32 (input + 0);
+  b_next[1] = buf_get_le32 (input + 4);
+  b_next[2] = buf_get_le32 (input + 8);
+  b_next[3] = buf_get_le32 (input + 12);
 
   ROUND_FIRST_INVERSE (7, context->keys, b_next, b);
 
@@ -783,13 +765,10 @@ serpent_decrypt_internal (serpent_context_t *context,
   ROUND_INVERSE (1, context->keys, b, b_next);
   ROUND_INVERSE (0, context->keys, b, b_next);
 
-#ifdef WORDS_BIGENDIAN
-  b_next[0] = byte_swap_32 (b_next[0]);
-  b_next[1] = byte_swap_32 (b_next[1]);
-  b_next[2] = byte_swap_32 (b_next[2]);
-  b_next[3] = byte_swap_32 (b_next[3]);
-#endif
-  memcpy (output, b_next, sizeof (b_next));
+  buf_put_le32 (output + 0, b_next[0]);
+  buf_put_le32 (output + 4, b_next[1]);
+  buf_put_le32 (output + 8, b_next[2]);
+  buf_put_le32 (output + 12, b_next[3]);
 }
 
 static unsigned int
diff --git a/cipher/sha1.c b/cipher/sha1.c
index 382bce8..aef9f05 100644
--- a/cipher/sha1.c
+++ b/cipher/sha1.c
@@ -38,6 +38,7 @@
 
 #include "g10lib.h"
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "cipher.h"
 #include "hash-common.h"
 
@@ -108,27 +109,13 @@ static unsigned int
 transform (void *ctx, const unsigned char *data)
 {
   SHA1_CONTEXT *hd = ctx;
+  const u32 *idata = (const void *)data;
   register u32 a, b, c, d, e; /* Local copies of the chaining variables.  */
   register u32 tm;            /* Helper.  */
   u32 x[16];                  /* The array we work on. */
 
-#ifdef WORDS_BIGENDIAN
-      memcpy (x, data, 64);
-      data += 64;
-#else
-      {
-        int i;
-        unsigned char *p;
-
-        for(i=0, p=(unsigned char*)x; i < 16; i++, p += 4 )
-          {
-            p[3] = *data++;
-            p[2] = *data++;
-            p[1] = *data++;
-            p[0] = *data++;
-          }
-      }
-#endif
+#define I(i) (x[i] = buf_get_be32(idata + i))
+
       /* Get the values of the chaining variables. */
       a = hd->h0;
       b = hd->h1;
@@ -137,22 +124,22 @@ transform (void *ctx, const unsigned char *data)
       e = hd->h4;
 
       /* Transform. */
-      R( a, b, c, d, e, F1, K1, x[ 0] );
-      R( e, a, b, c, d, F1, K1, x[ 1] );
-      R( d, e, a, b, c, F1, K1, x[ 2] );
-      R( c, d, e, a, b, F1, K1, x[ 3] );
-      R( b, c, d, e, a, F1, K1, x[ 4] );
-      R( a, b, c, d, e, F1, K1, x[ 5] );
-      R( e, a, b, c, d, F1, K1, x[ 6] );
-      R( d, e, a, b, c, F1, K1, x[ 7] );
-      R( c, d, e, a, b, F1, K1, x[ 8] );
-      R( b, c, d, e, a, F1, K1, x[ 9] );
-      R( a, b, c, d, e, F1, K1, x[10] );
-      R( e, a, b, c, d, F1, K1, x[11] );
-      R( d, e, a, b, c, F1, K1, x[12] );
-      R( c, d, e, a, b, F1, K1, x[13] );
-      R( b, c, d, e, a, F1, K1, x[14] );
-      R( a, b, c, d, e, F1, K1, x[15] );
+      R( a, b, c, d, e, F1, K1, I( 0) );
+      R( e, a, b, c, d, F1, K1, I( 1) );
+      R( d, e, a, b, c, F1, K1, I( 2) );
+      R( c, d, e, a, b, F1, K1, I( 3) );
+      R( b, c, d, e, a, F1, K1, I( 4) );
+      R( a, b, c, d, e, F1, K1, I( 5) );
+      R( e, a, b, c, d, F1, K1, I( 6) );
+      R( d, e, a, b, c, F1, K1, I( 7) );
+      R( c, d, e, a, b, F1, K1, I( 8) );
+      R( b, c, d, e, a, F1, K1, I( 9) );
+      R( a, b, c, d, e, F1, K1, I(10) );
+      R( e, a, b, c, d, F1, K1, I(11) );
+      R( d, e, a, b, c, F1, K1, I(12) );
+      R( c, d, e, a, b, F1, K1, I(13) );
+      R( b, c, d, e, a, F1, K1, I(14) );
+      R( a, b, c, d, e, F1, K1, I(15) );
       R( e, a, b, c, d, F1, K1, M(16) );
       R( d, e, a, b, c, F1, K1, M(17) );
       R( c, d, e, a, b, F1, K1, M(18) );
@@ -275,24 +262,13 @@ sha1_final(void *context)
       memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = msb >> 24;
-  hd->bctx.buf[57] = msb >> 16;
-  hd->bctx.buf[58] = msb >>  8;
-  hd->bctx.buf[59] = msb	   ;
-  hd->bctx.buf[60] = lsb >> 24;
-  hd->bctx.buf[61] = lsb >> 16;
-  hd->bctx.buf[62] = lsb >>  8;
-  hd->bctx.buf[63] = lsb	   ;
+  buf_put_be32(hd->bctx.buf + 56, msb);
+  buf_put_be32(hd->bctx.buf + 60, lsb);
   burn = transform( hd, hd->bctx.buf );
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *(u32*)p = hd->h##a ; p += 4; } while(0)
-#else /* little endian */
-#define X(a) do { *p++ = hd->h##a >> 24; *p++ = hd->h##a >> 16;	 \
-                  *p++ = hd->h##a >> 8; *p++ = hd->h##a; } while(0)
-#endif
+#define X(a) do { *(u32*)p = be_bswap32(hd->h##a) ; p += 4; } while(0)
   X(0);
   X(1);
   X(2);
diff --git a/cipher/sha256.c b/cipher/sha256.c
index cf23f2f..ad08cc7 100644
--- a/cipher/sha256.c
+++ b/cipher/sha256.c
@@ -42,6 +42,7 @@
 
 #include "g10lib.h"
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "cipher.h"
 #include "hash-common.h"
 
@@ -168,7 +169,6 @@ transform (void *ctx, const unsigned char *data)
   };
 
   u32 a,b,c,d,e,f,g,h,t1,t2;
-  u32 x[16];
   u32 w[64];
   int i;
 
@@ -181,24 +181,8 @@ transform (void *ctx, const unsigned char *data)
   g = hd->h6;
   h = hd->h7;
 
-#ifdef WORDS_BIGENDIAN
-  memcpy (x, data, 64);
-#else
-  {
-    byte *p2;
-
-    for (i=0, p2=(byte*)x; i < 16; i++, p2 += 4 )
-      {
-        p2[3] = *data++;
-        p2[2] = *data++;
-        p2[1] = *data++;
-        p2[0] = *data++;
-      }
-  }
-#endif
-
   for (i=0; i < 16; i++)
-    w[i] = x[i];
+    w[i] = buf_get_be32(data + i * 4);
   for (; i < 64; i++)
     w[i] = S1(w[i-2]) + w[i-7] + S0(w[i-15]) + w[i-16];
 
@@ -312,24 +296,13 @@ sha256_final(void *context)
       memset (hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = msb >> 24;
-  hd->bctx.buf[57] = msb >> 16;
-  hd->bctx.buf[58] = msb >>  8;
-  hd->bctx.buf[59] = msb;
-  hd->bctx.buf[60] = lsb >> 24;
-  hd->bctx.buf[61] = lsb >> 16;
-  hd->bctx.buf[62] = lsb >>  8;
-  hd->bctx.buf[63] = lsb;
+  buf_put_be32(hd->bctx.buf + 56, msb);
+  buf_put_be32(hd->bctx.buf + 60, lsb);
   burn = transform (hd, hd->bctx.buf);
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *(u32*)p = hd->h##a ; p += 4; } while(0)
-#else /* little endian */
-#define X(a) do { *p++ = hd->h##a >> 24; *p++ = hd->h##a >> 16;	 \
-		  *p++ = hd->h##a >> 8; *p++ = hd->h##a; } while(0)
-#endif
+#define X(a) do { *(u32*)p = be_bswap32(hd->h##a); p += 4; } while(0)
   X(0);
   X(1);
   X(2);
diff --git a/cipher/sha512.c b/cipher/sha512.c
index 26cbe14..505a1e4 100644
--- a/cipher/sha512.c
+++ b/cipher/sha512.c
@@ -50,6 +50,7 @@
 #include <string.h>
 #include "g10lib.h"
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "cipher.h"
 #include "hash-common.h"
 
@@ -225,26 +226,8 @@ __transform (SHA512_STATE *hd, const unsigned char *data)
   g = hd->h6;
   h = hd->h7;
 
-#ifdef WORDS_BIGENDIAN
-  memcpy (w, data, 128);
-#else
-  {
-    int i;
-    byte *p2;
-
-    for (i = 0, p2 = (byte *) w; i < 16; i++, p2 += 8)
-      {
-	p2[7] = *data++;
-	p2[6] = *data++;
-	p2[5] = *data++;
-	p2[4] = *data++;
-	p2[3] = *data++;
-	p2[2] = *data++;
-	p2[1] = *data++;
-	p2[0] = *data++;
-      }
-  }
-#endif
+  for ( t = 0; t < 16; t++ )
+    w[t] = buf_get_be64(data + t * 8);
 
 #define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
 #define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@@ -566,35 +549,13 @@ sha512_final (void *context)
       memset (hd->bctx.buf, 0, 112);	/* fill next block with zeroes */
     }
   /* append the 128 bit count */
-  hd->bctx.buf[112] = msb >> 56;
-  hd->bctx.buf[113] = msb >> 48;
-  hd->bctx.buf[114] = msb >> 40;
-  hd->bctx.buf[115] = msb >> 32;
-  hd->bctx.buf[116] = msb >> 24;
-  hd->bctx.buf[117] = msb >> 16;
-  hd->bctx.buf[118] = msb >> 8;
-  hd->bctx.buf[119] = msb;
-
-  hd->bctx.buf[120] = lsb >> 56;
-  hd->bctx.buf[121] = lsb >> 48;
-  hd->bctx.buf[122] = lsb >> 40;
-  hd->bctx.buf[123] = lsb >> 32;
-  hd->bctx.buf[124] = lsb >> 24;
-  hd->bctx.buf[125] = lsb >> 16;
-  hd->bctx.buf[126] = lsb >> 8;
-  hd->bctx.buf[127] = lsb;
+  buf_put_be64(hd->bctx.buf + 112, msb);
+  buf_put_be64(hd->bctx.buf + 120, lsb);
   stack_burn_depth = transform (hd, hd->bctx.buf);
   _gcry_burn_stack (stack_burn_depth);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *(u64*)p = hd->state.h##a ; p += 8; } while (0)
-#else /* little endian */
-#define X(a) do { *p++ = hd->state.h##a >> 56; *p++ = hd->state.h##a >> 48;    \
-                  *p++ = hd->state.h##a >> 40; *p++ = hd->state.h##a >> 32;    \
-                  *p++ = hd->state.h##a >> 24; *p++ = hd->state.h##a >> 16;    \
-                  *p++ = hd->state.h##a >> 8;  *p++ = hd->state.h##a; } while(0)
-#endif
+#define X(a) do { *(u64*)p = be_bswap64(hd->state.h##a) ; p += 8; } while (0)
   X (0);
   X (1);
   X (2);
diff --git a/cipher/stribog.c b/cipher/stribog.c
index 1f79882..61aa222 100644
--- a/cipher/stribog.c
+++ b/cipher/stribog.c
@@ -27,6 +27,7 @@
 
 #include "g10lib.h"
 #include "bithelp.h"
+#include "bufhelp.h"
 #include "cipher.h"
 #include "hash-common.h"
 
@@ -1304,25 +1305,8 @@ transform (STRIBOG_CONTEXT *hd, const unsigned char *data, unsigned count)
   u64 l;
   int i;
 
-#ifndef WORDS_BIGENDIAN
-  memcpy (M, data, 64);
-#else
-  {
-    byte *p2;
-
-    for (i = 0, p2 = (byte *) M; i < 8; i++, p2 += 8)
-      {
-          p2[7] = *data++;
-          p2[6] = *data++;
-          p2[5] = *data++;
-          p2[4] = *data++;
-          p2[3] = *data++;
-          p2[2] = *data++;
-          p2[1] = *data++;
-          p2[0] = *data++;
-        }
-    }
-#endif
+  for (i = 0; i < 8; i++)
+    M[i] = buf_get_le64(data + i * 8);
 
   g (hd->h, M, hd->N);
   l = hd->N[0];
@@ -1379,19 +1363,8 @@ stribog_final (void *context)
   g (hd->h, hd->N, Z);
   g (hd->h, hd->Sigma, Z);
 
-#ifdef WORDS_BIGENDIAN
   for (i = 0; i < 8; i++)
-    {
-      u64 T = hd->h[i];
-      T = ((T & U64_C(0x00ff00ff00ff00ff)) << 8) |
-          ((T & U64_C(0xff00ff00ff00ff00)) >> 8);
-      T = ((T & U64_C(0x0000ffff0000ffff)) << 16) |
-          ((T & U64_C(0xffff0000ffff0000)) >> 16);
-      T = ((T & U64_C(0x00000000ffffffff)) << 32) |
-          ((T & U64_C(0xffffffff00000000)) >> 32);
-      hd->h[i] = T;
-    }
-#endif
+    hd->h[i] = le_bswap64(hd->h[i]);
 
   _gcry_burn_stack (768);
 }
diff --git a/cipher/tiger.c b/cipher/tiger.c
index 8f5959b..df16098 100644
--- a/cipher/tiger.c
+++ b/cipher/tiger.c
@@ -28,6 +28,8 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "hash-common.h"
+#include "bithelp.h"
+#include "bufhelp.h"
 
 /* We really need a 64 bit type for this code.  */
 #ifdef HAVE_U64_TYPEDEF
@@ -697,24 +699,10 @@ transform ( void *ctx, const unsigned char *data )
   TIGER_CONTEXT *hd = ctx;
   u64 a,b,c,aa,bb,cc;
   u64 x[8];
-#ifdef WORDS_BIGENDIAN
-#define MKWORD(d,n) \
-		(  ((u64)(d)[8*(n)+7]) << 56 | ((u64)(d)[8*(n)+6]) << 48  \
-		 | ((u64)(d)[8*(n)+5]) << 40 | ((u64)(d)[8*(n)+4]) << 32  \
-		 | ((u64)(d)[8*(n)+3]) << 24 | ((u64)(d)[8*(n)+2]) << 16  \
-		 | ((u64)(d)[8*(n)+1]) << 8  | ((u64)(d)[8*(n)	])	 )
-  x[0] = MKWORD(data, 0);
-  x[1] = MKWORD(data, 1);
-  x[2] = MKWORD(data, 2);
-  x[3] = MKWORD(data, 3);
-  x[4] = MKWORD(data, 4);
-  x[5] = MKWORD(data, 5);
-  x[6] = MKWORD(data, 6);
-  x[7] = MKWORD(data, 7);
-#undef MKWORD
-#else
-  memcpy( &x[0], data, 64 );
-#endif
+  int i;
+
+  for ( i = 0; i < 8; i++ )
+    x[i] = buf_get_le64(data + i * 8);
 
   /* save */
   a = aa = hd->a;
@@ -783,30 +771,14 @@ tiger_final( void *context )
       memset(hd->bctx.buf, 0, 56 ); /* fill next block with zeroes */
     }
   /* append the 64 bit count */
-  hd->bctx.buf[56] = lsb	   ;
-  hd->bctx.buf[57] = lsb >>  8;
-  hd->bctx.buf[58] = lsb >> 16;
-  hd->bctx.buf[59] = lsb >> 24;
-  hd->bctx.buf[60] = msb	   ;
-  hd->bctx.buf[61] = msb >>  8;
-  hd->bctx.buf[62] = msb >> 16;
-  hd->bctx.buf[63] = msb >> 24;
+  buf_put_le32(hd->bctx.buf + 56, lsb);
+  buf_put_le32(hd->bctx.buf + 60, msb);
   burn = transform( hd, hd->bctx.buf );
   _gcry_burn_stack (burn);
 
   p = hd->bctx.buf;
-#ifdef WORDS_BIGENDIAN
-#define X(a) do { *(u64*)p = hd->a ; p += 8; } while(0)
-#else /* little endian */
-#define X(a) do { *p++ = hd->a >> 56; *p++ = hd->a >> 48; \
-	          *p++ = hd->a >> 40; *p++ = hd->a >> 32; \
-	          *p++ = hd->a >> 24; *p++ = hd->a >> 16; \
-	          *p++ = hd->a >>  8; *p++ = hd->a;       } while(0)
-#endif
-#define Y(a) do { *p++ = hd->a      ; *p++ = hd->a >> 8;  \
-	          *p++ = hd->a >> 16; *p++ = hd->a >> 24; \
-	          *p++ = hd->a >> 32; *p++ = hd->a >> 40; \
-	          *p++ = hd->a >> 48; *p++ = hd->a >> 56; } while(0)
+#define X(a) do { *(u64*)p = be_bswap64(hd->a); p += 8; } while(0)
+#define Y(a) do { *(u64*)p = le_bswap64(hd->a); p += 8; } while(0)
   if (hd->variant == 0)
     {
       X(a);
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 70cdb47..17b3aa3 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -798,13 +798,12 @@ extern void _gcry_twofish_amd64_cfb_dec(const TWOFISH_context *c, byte *out,
  * whitening subkey number m. */
 
 #define INPACK(n, x, m) \
-   x = in[4 * (n)] ^ (in[4 * (n) + 1] << 8) \
-     ^ (in[4 * (n) + 2] << 16) ^ (in[4 * (n) + 3] << 24) ^ ctx->w[m]
+   x = buf_get_le32(in + (n) * 4); \
+   x ^= ctx->w[m]
 
 #define OUTUNPACK(n, x, m) \
    x ^= ctx->w[m]; \
-   out[4 * (n)] = x; out[4 * (n) + 1] = x >> 8; \
-   out[4 * (n) + 2] = x >> 16; out[4 * (n) + 3] = x >> 24
+   buf_put_le32(out + (n) * 4, x)
 
 #endif /*!USE_AMD64_ASM*/
 
diff --git a/cipher/whirlpool.c b/cipher/whirlpool.c
index 2c3beb7..954640a 100644
--- a/cipher/whirlpool.c
+++ b/cipher/whirlpool.c
@@ -37,7 +37,7 @@
 #include "g10lib.h"
 #include "cipher.h"
 
-#include "bithelp.h"
+#include "bufhelp.h"
 
 /* Size of a whirlpool block (in bytes).  */
 #define BLOCK_SIZE 64
@@ -65,30 +65,13 @@ typedef struct {
    counter.  */
 #define buffer_to_block(buffer, block, i) \
   for (i = 0; i < 8; i++) \
-    (block)[i] = ((u64) (0 \
-                         | (((u64) (buffer)[i * 8 + 0]) << 56) \
-                         | (((u64) (buffer)[i * 8 + 1]) << 48) \
-                         | (((u64) (buffer)[i * 8 + 2]) << 40) \
-                         | (((u64) (buffer)[i * 8 + 3]) << 32) \
-                         | (((u64) (buffer)[i * 8 + 4]) << 24) \
-                         | (((u64) (buffer)[i * 8 + 5]) << 16) \
-                         | (((u64) (buffer)[i * 8 + 6]) <<  8) \
-                         | (((u64) (buffer)[i * 8 + 7]) <<  0)));
+    (block)[i] = buf_get_be64((buffer) + i * 8);
 
 /* Convert the block BLOCK into a buffer BUFFER, using I as
    counter.  */
 #define block_to_buffer(buffer, block, i) \
   for (i = 0; i < 8; i++) \
-    { \
-      (buffer)[i * 8 + 0] = (block[i] >> 56) & 0xFF; \
-      (buffer)[i * 8 + 1] = (block[i] >> 48) & 0xFF; \
-      (buffer)[i * 8 + 2] = (block[i] >> 40) & 0xFF; \
-      (buffer)[i * 8 + 3] = (block[i] >> 32) & 0xFF; \
-      (buffer)[i * 8 + 4] = (block[i] >> 24) & 0xFF; \
-      (buffer)[i * 8 + 5] = (block[i] >> 16) & 0xFF; \
-      (buffer)[i * 8 + 6] = (block[i] >>  8) & 0xFF; \
-      (buffer)[i * 8 + 7] = (block[i] >>  0) & 0xFF; \
-    }
+    buf_put_be64((buffer) + i * 8, (block)[i]);
 
 /* Copy the block BLOCK_SRC to BLOCK_DST, using I as counter.  */
 #define block_copy(block_dst, block_src, i) \
diff --git a/configure.ac b/configure.ac
index 91bbf4b..2c92028 100644
--- a/configure.ac
+++ b/configure.ac
@@ -774,6 +774,36 @@ AC_SUBST(FALLBACK_SOCKLEN_T)
 
 
 #
+# Check for __builtin_bswap32 intrinsic.
+#
+AC_CACHE_CHECK(for __builtin_bswap32,
+       [gcry_cv_have_builtin_bswap32],
+       [gcry_cv_have_builtin_bswap32=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[int foo(int x) { return __builtin_bswap32(x); }]])],
+          [gcry_cv_have_builtin_bswap32=yes])])
+if test "$gcry_cv_have_builtin_bswap32" = "yes" ; then
+   AC_DEFINE(HAVE_BUILTIN_BSWAP32,1,
+             [Defined if compiler has '__builtin_bswap32' intrinsic])
+fi
+
+
+#
+# Check for __builtin_bswap64 intrinsic.
+#
+AC_CACHE_CHECK(for __builtin_bswap64,
+       [gcry_cv_have_builtin_bswap64],
+       [gcry_cv_have_builtin_bswap64=no
+        AC_COMPILE_IFELSE([AC_LANG_SOURCE(
+          [[int foo(int x) { return __builtin_bswap64(x); }]])],
+          [gcry_cv_have_builtin_bswap64=yes])])
+if test "$gcry_cv_have_builtin_bswap64" = "yes" ; then
+   AC_DEFINE(HAVE_BUILTIN_BSWAP64,1,
+             [Defined if compiler has '__builtin_bswap64' intrinsic])
+fi
+
+
+#
 # Check for VLA support (variable length arrays).
 #
 AC_CACHE_CHECK(whether the variable length arrays are supported,




More information about the Gcrypt-devel mailing list