[git] GCRYPT - branch, master, updated. libgcrypt-1.5.0-332-g54df6fc

by Jussi Kivilinna cvs at cvs.gnupg.org
Wed Oct 23 17:56:15 CEST 2013


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "The GNU crypto library".

The branch, master has been updated
       via  54df6fcd806f8c150cffe6cc09925bb8b638bb5b (commit)
       via  293e93672fdabc829e35cc624c397276342bafe4 (commit)
       via  2901a10dbf1264707debc8402546c07eeac60932 (commit)
      from  2fd83faa876d0be91ab7884b1a9eaa7793559eb9 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 54df6fcd806f8c150cffe6cc09925bb8b638bb5b
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Wed Oct 23 18:36:18 2013 +0300

    Replace architecture specific fast_wipememory2 with generic
    
    * src/g10lib.h (fast_wipememory2): Remove architecture specific
    implementations and add generic implementation.
    --
    
    Reduces code size, adds support for other architectures, and gcc appears to
    generate better code without the assembly parts.
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/src/g10lib.h b/src/g10lib.h
index 3b09448..80c73ee 100644
--- a/src/g10lib.h
+++ b/src/g10lib.h
@@ -275,77 +275,42 @@ void __gcry_burn_stack (unsigned int bytes);
                   } while(0)
 #define wipememory(_ptr,_len) wipememory2(_ptr,0,_len)
 
+#ifdef HAVE_U64_TYPEDEF
+  #define FASTWIPE_T u64
+  #define FASTWIPE_MULT (U64_C(0x0101010101010101))
+#else
+  #define FASTWIPE_T u32
+  #define FASTWIPE_MULT (0x01010101U)
+#endif
 
-/* Optimized fast_wipememory2 for i386, x86-64 and arm architectures.  May leave
-   tail bytes unhandled, in which case tail bytes are handled by wipememory2.
- */
-#if defined(__x86_64__) && __GNUC__ >= 4
-#define fast_wipememory2(_vptr,_vset,_vlen) do { \
-              unsigned long long int _vset8 = _vset; \
-              if (_vlen < 8) \
-                break; \
-              _vset8 *= 0x0101010101010101ULL; \
-              do { \
-                asm volatile("movq %[set], %[ptr]\n\t" \
-                             : /**/ \
-                             : [set] "Cr" (_vset8), \
-                               [ptr] "m" (*_vptr) \
-                             : "memory"); \
-                _vlen -= 8; \
-                _vptr += 8; \
-              } while (_vlen >= 8); \
-                  } while (0)
-#elif defined (__i386__) && SIZEOF_UNSIGNED_LONG == 4 && __GNUC__ >= 4
-#define fast_wipememory2(_ptr,_set,_len) do { \
-              unsigned long _vset4 = _vset; \
-              if (_vlen < 4) \
-                break; \
-              _vset4 *= 0x01010101; \
-              do { \
-                asm volatile("movl %[set], %[ptr]\n\t" \
-                             : /**/ \
-                             : [set] "Cr" (_vset4), \
-                               [ptr] "m" (*_vptr) \
-                             : "memory"); \
-                _vlen -= 4; \
-                _vptr += 4; \
-              } while (_vlen >= 4); \
-                  } while (0)
-#elif defined (__arm__) && (defined (__thumb2__) || !defined (__thumb__)) && \
-	__GNUC__ >= 4
-
-#ifdef __ARM_FEATURE_UNALIGNED
+/* Following architectures can handle unaligned accesses fast.  */
+#if defined(__i386__) || defined(__x86_64__) || \
+    defined(__powerpc__) || defined(__powerpc64__) || \
+    (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)) || \
+    defined(__aarch64__)
 #define fast_wipememory2_unaligned_head(_ptr,_set,_len) /*do nothing*/
 #else
 #define fast_wipememory2_unaligned_head(_vptr,_vset,_vlen) do { \
-              while((size_t)(_vptr)&3 && _vlen) \
-	        { *_vptr=(_vset); _vptr++; _vlen--; } \
+              while((size_t)(_vptr)&(sizeof(FASTWIPE_T)-1) && _vlen) \
+                { *_vptr=(_vset); _vptr++; _vlen--; } \
                   } while(0)
 #endif
 
+/* fast_wipememory2 may leave tail bytes unhandled, in which case tail bytes
+   are handled by wipememory2. */
 #define fast_wipememory2(_vptr,_vset,_vlen) do { \
-              unsigned long _vset4 = _vset; \
+              FASTWIPE_T _vset_long = _vset; \
               fast_wipememory2_unaligned_head(_vptr,_vset,_vlen); \
-              if (_vlen < 8) \
+              if (_vlen < sizeof(FASTWIPE_T)) \
                 break; \
-              _vset4 *= 0x01010101; \
-              asm volatile( \
-                "mov %%r4, %[set];\n\t" \
-                "mov %%r5, %[set];\n\t" \
-                "1:;\n\t" \
-                "stm %[ptr]!, {%%r4, %%r5};\n\t" \
-                "cmp %[end], %[ptr];\n\t" \
-                "bne 1b;\n\t" \
-                : [ptr] "=r" (_vptr) \
-                : [set] "r" (_vset4), \
-                  [end] "r" (_vptr+(_vlen&(~0x7))), \
-                  "0" (_vptr) \
-                : "memory", "r4", "r5", "cc"); \
-              _vlen &= 0x7; \
+              _vset_long *= FASTWIPE_MULT; \
+              do { \
+                volatile FASTWIPE_T *_vptr_long = (volatile void *)_vptr; \
+                *_vptr_long = _vset_long; \
+                _vlen -= sizeof(FASTWIPE_T); \
+                _vptr += sizeof(FASTWIPE_T); \
+              } while (_vlen >= sizeof(FASTWIPE_T)); \
                   } while (0)
-#else
-#define fast_wipememory2(_ptr,_set,_len)
-#endif
 
 
 /* Digit predicates.  */

commit 293e93672fdabc829e35cc624c397276342bafe4
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Wed Oct 23 18:36:18 2013 +0300

    Improve the speed of the cipher mode code
    
    * cipher/bufhelp.h (buf_cpy): New.
    (buf_xor, buf_xor_2dst): If buffers unaligned, always jump to per-byte
    processing.
    (buf_xor_n_copy_2): New.
    (buf_xor_n_copy): Use 'buf_xor_n_copy_2'.
    * cipher/blowfish.c (_gcry_blowfish_cbc_dec): Avoid extra memory copy
    and use new 'buf_xor_n_copy_2'.
    * cipher/camellia-glue.c (_gcry_camellia_cbc_dec): Ditto.
    * cipher/cast5.c (_gcry_cast5_cbc_dec): Ditto.
    * cipher/serpent.c (_gcry_serpent_cbc_dec): Ditto.
    * cipher/twofish.c (_gcry_twofish_cbc_dec): Ditto.
    * cipher/rijndael.c (_gcry_aes_cbc_dec): Ditto.
    (do_encrypt, do_decrypt): Use 'buf_cpy' instead of 'memcpy'.
    (_gcry_aes_cbc_enc): Avoid copying IV, use 'last_iv' pointer instead.
    * cipher/cipher-cbc.c (_gcry_cipher_cbc_encrypt): Avoid copying IV,
    update pointer to IV instead.
    (_gcry_cipher_cbc_decrypt): Avoid extra memory copy and use new
    'buf_xor_n_copy_2'.
    (_gcry_cipher_cbc_encrypt, _gcry_cipher_cbc_decrypt): Avoid extra
    accesses to c->spec, use 'buf_cpy' instead of memcpy.
    * cipher/cipher-ccm.c (do_cbc_mac): Ditto.
    * cipher/cipher-cfb.c (_gcry_cipher_cfb_encrypt)
    (_gcry_cipher_cfb_decrypt): Ditto.
    * cipher/cipher-ctr.c (_gcry_cipher_ctr_encrypt): Ditto.
    * cipher/cipher-ofb.c (_gcry_cipher_ofb_encrypt)
    (_gcry_cipher_ofb_decrypt): Ditto.
    * cipher/cipher.c (do_ecb_encrypt, do_ecb_decrypt): Ditto.
    --
    
    Patch improves the speed of the generic block cipher mode code. Especially on
    targets without faster unaligned memory accesses, the generic code was slower
    than the algorithm specific bulk versions. With this patch, this issue should
    be solved.
    
    Tests on Cortex-A8; compiled for ARMv4, without unaligned-accesses:
    
     Before:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           490ms   500ms   560ms   580ms   530ms   540ms   560ms   560ms   550ms   540ms  1080ms  1080ms
      TWOFISH        230ms   230ms   290ms   300ms   260ms   240ms   290ms   290ms   240ms   240ms   520ms   510ms
      DES            720ms   720ms   800ms   860ms   770ms   770ms   810ms   820ms   770ms   780ms       -       -
      CAST5          340ms   340ms   440ms   250ms   390ms   250ms   440ms   430ms   260ms   250ms       -       -
    
     After:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           500ms   490ms   520ms   520ms   530ms   520ms   530ms   540ms   500ms   520ms  1060ms  1070ms
      TWOFISH        230ms   220ms   250ms   230ms   260ms   230ms   260ms   260ms   230ms   230ms   500ms   490ms
      DES            720ms   720ms   750ms   760ms   740ms   750ms   770ms   770ms   760ms   760ms       -       -
      CAST5          340ms   340ms   370ms   250ms   370ms   250ms   380ms   390ms   250ms   250ms       -       -
    
    Tests on Cortex-A8; compiled for ARMv7-A, with unaligned-accesses:
    
     Before:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           430ms   440ms   480ms   530ms   470ms   460ms   490ms   480ms   470ms   460ms   930ms   940ms
      TWOFISH        220ms   220ms   250ms   230ms   240ms   230ms   270ms   250ms   230ms   240ms   480ms   470ms
      DES            550ms   540ms   620ms   690ms   570ms   540ms   630ms   650ms   590ms   580ms       -       -
      CAST5          300ms   300ms   380ms   230ms   330ms   230ms   380ms   370ms   230ms   230ms       -       -
    
     After:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           430ms   430ms   460ms   450ms   460ms   450ms   470ms   470ms   460ms   470ms   900ms   930ms
      TWOFISH        220ms   210ms   240ms   230ms   230ms   230ms   250ms   250ms   230ms   230ms   470ms   470ms
      DES            540ms   540ms   580ms   570ms   570ms   570ms   560ms   620ms   580ms   570ms       -       -
      CAST5          300ms   290ms   310ms   230ms   320ms   230ms   350ms   350ms   230ms   230ms       -       -
    
    Tests on Intel Atom N160 (i386):
    
     Before:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           380ms   380ms   410ms   420ms   400ms   400ms   410ms   410ms   390ms   400ms   820ms   800ms
      TWOFISH        340ms   340ms   370ms   350ms   360ms   340ms   370ms   370ms   330ms   340ms   710ms   700ms
      DES            660ms   650ms   710ms   740ms   680ms   700ms   700ms   710ms   680ms   680ms       -       -
      CAST5          340ms   340ms   380ms   330ms   360ms   330ms   390ms   390ms   320ms   330ms       -       -
    
     After:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           380ms   380ms   390ms   410ms   400ms   390ms   410ms   400ms   400ms   390ms   810ms   800ms
      TWOFISH        330ms   340ms   350ms   360ms   350ms   340ms   380ms   370ms   340ms   360ms   700ms   710ms
      DES            630ms   640ms   660ms   690ms   680ms   680ms   700ms   690ms   680ms   680ms       -       -
      CAST5          340ms   330ms   350ms   330ms   370ms   340ms   380ms   390ms   330ms   330ms       -       -
    
    Tests on Intel i5-4570 (x86-64):
    
      Before:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           560ms   560ms   600ms   590ms   600ms   570ms   570ms   570ms   580ms   590ms  1200ms  1180ms
      TWOFISH        240ms   240ms   270ms   160ms   260ms   160ms   250ms   250ms   160ms   160ms   430ms   430ms
      DES            570ms   570ms   640ms   590ms   630ms   580ms   600ms   600ms   610ms   620ms       -       -
      CAST5          410ms   410ms   470ms   150ms   470ms   150ms   450ms   450ms   150ms   160ms       -       -
    
      After:
                      ECB/Stream         CBC             CFB             OFB             CTR             CCM
                   --------------- --------------- --------------- --------------- --------------- ---------------
      SEED           560ms   560ms   590ms   570ms   580ms   570ms   570ms   570ms   590ms   590ms  1200ms  1200ms
      TWOFISH        240ms   240ms   260ms   160ms   250ms   170ms   250ms   250ms   160ms   160ms   430ms   430ms
      DES            570ms   570ms   620ms   580ms   630ms   570ms   600ms   590ms   620ms   620ms       -       -
      CAST5          410ms   410ms   460ms   150ms   460ms   160ms   450ms   450ms   150ms   150ms       -       -
    
    
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/blowfish.c b/cipher/blowfish.c
index ed4e901..3b6bf6b 100644
--- a/cipher/blowfish.c
+++ b/cipher/blowfish.c
@@ -701,14 +701,11 @@ _gcry_blowfish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, BLOWFISH_BLOCKSIZE);
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      do_decrypt_block (ctx, savebuf, inbuf);
 
-      do_decrypt_block (ctx, outbuf, inbuf);
-
-      buf_xor(outbuf, outbuf, iv, BLOWFISH_BLOCKSIZE);
-      memcpy(iv, savebuf, BLOWFISH_BLOCKSIZE);
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOWFISH_BLOCKSIZE);
       inbuf += BLOWFISH_BLOCKSIZE;
       outbuf += BLOWFISH_BLOCKSIZE;
     }
diff --git a/cipher/bufhelp.h b/cipher/bufhelp.h
index 198d286..dc39b46 100644
--- a/cipher/bufhelp.h
+++ b/cipher/bufhelp.h
@@ -44,6 +44,45 @@
 #endif
 
 
+/* Optimized function for small buffer copying */
+static inline void
+buf_cpy(void *_dst, const void *_src, size_t len)
+{
+#if __GNUC__ >= 4 && (defined(__x86_64__) || defined(__i386__))
+  /* For AMD64 and i386, memcpy is faster.  */
+  memcpy(_dst, _src, len);
+#else
+  byte *dst = _dst;
+  const byte *src = _src;
+  uintptr_t *ldst;
+  const uintptr_t *lsrc;
+#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
+  const unsigned int longmask = sizeof(uintptr_t) - 1;
+
+  /* Skip fast processing if buffers are unaligned.  */
+  if (((uintptr_t)dst | (uintptr_t)src) & longmask)
+    goto do_bytes;
+#endif
+
+  ldst = (uintptr_t *)(void *)dst;
+  lsrc = (const uintptr_t *)(const void *)src;
+
+  for (; len >= sizeof(uintptr_t); len -= sizeof(uintptr_t))
+    *ldst++ = *lsrc++;
+
+  dst = (byte *)ldst;
+  src = (const byte *)lsrc;
+
+#ifndef BUFHELP_FAST_UNALIGNED_ACCESS
+do_bytes:
+#endif
+  /* Handle tail.  */
+  for (; len; len--)
+    *dst++ = *src++;
+#endif /*__GNUC__ >= 4 && (__x86_64__ || __i386__)*/
+}
+
+
 /* Optimized function for buffer xoring */
 static inline void
 buf_xor(void *_dst, const void *_src1, const void *_src2, size_t len)
@@ -56,14 +95,9 @@ buf_xor(void *_dst, const void *_src1, const void *_src2, size_t len)
 #ifndef BUFHELP_FAST_UNALIGNED_ACCESS
   const unsigned int longmask = sizeof(uintptr_t) - 1;
 
-  /* Skip fast processing if alignment of buffers do not match.  */
-  if ((((uintptr_t)dst ^ (uintptr_t)src1) |
-       ((uintptr_t)dst ^ (uintptr_t)src2)) & longmask)
+  /* Skip fast processing if buffers are unaligned.  */
+  if (((uintptr_t)dst | (uintptr_t)src1 | (uintptr_t)src2) & longmask)
     goto do_bytes;
-
-  /* Handle unaligned head.  */
-  for (; len && ((uintptr_t)dst & longmask); len--)
-      *dst++ = *src1++ ^ *src2++;
 #endif
 
   ldst = (uintptr_t *)(void *)dst;
@@ -99,14 +133,9 @@ buf_xor_2dst(void *_dst1, void *_dst2, const void *_src, size_t len)
 #ifndef BUFHELP_FAST_UNALIGNED_ACCESS
   const unsigned int longmask = sizeof(uintptr_t) - 1;
 
-  /* Skip fast processing if alignment of buffers do not match.  */
-  if ((((uintptr_t)src ^ (uintptr_t)dst1) |
-       ((uintptr_t)src ^ (uintptr_t)dst2)) & longmask)
+  /* Skip fast processing if buffers are unaligned.  */
+  if (((uintptr_t)src | (uintptr_t)dst1 | (uintptr_t)dst2) & longmask)
     goto do_bytes;
-
-  /* Handle unaligned head.  */
-  for (; len && ((uintptr_t)src & longmask); len--)
-    *dst1++ = (*dst2++ ^= *src++);
 #endif
 
   ldst1 = (uintptr_t *)(void *)dst1;
@@ -130,48 +159,44 @@ do_bytes:
 
 
 /* Optimized function for combined buffer xoring and copying.  Used by mainly
-   CFB mode decryption.  */
+   CBC mode decryption.  */
 static inline void
-buf_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t len)
+buf_xor_n_copy_2(void *_dst_xor, const void *_src_xor, void *_srcdst_cpy,
+		 const void *_src_cpy, size_t len)
 {
   byte *dst_xor = _dst_xor;
   byte *srcdst_cpy = _srcdst_cpy;
+  const byte *src_xor = _src_xor;
+  const byte *src_cpy = _src_cpy;
   byte temp;
-  const byte *src = _src;
   uintptr_t *ldst_xor, *lsrcdst_cpy;
-  const uintptr_t *lsrc;
+  const uintptr_t *lsrc_cpy, *lsrc_xor;
   uintptr_t ltemp;
 #ifndef BUFHELP_FAST_UNALIGNED_ACCESS
   const unsigned int longmask = sizeof(uintptr_t) - 1;
 
-  /* Skip fast processing if alignment of buffers do not match.  */
-  if ((((uintptr_t)src ^ (uintptr_t)dst_xor) |
-       ((uintptr_t)src ^ (uintptr_t)srcdst_cpy)) & longmask)
+  /* Skip fast processing if buffers are unaligned.  */
+  if (((uintptr_t)src_cpy | (uintptr_t)src_xor | (uintptr_t)dst_xor |
+       (uintptr_t)srcdst_cpy) & longmask)
     goto do_bytes;
-
-  /* Handle unaligned head.  */
-  for (; len && ((uintptr_t)src & longmask); len--)
-    {
-      temp = *src++;
-      *dst_xor++ = *srcdst_cpy ^ temp;
-      *srcdst_cpy++ = temp;
-    }
 #endif
 
   ldst_xor = (uintptr_t *)(void *)dst_xor;
+  lsrc_xor = (const uintptr_t *)(void *)src_xor;
   lsrcdst_cpy = (uintptr_t *)(void *)srcdst_cpy;
-  lsrc = (const uintptr_t *)(const void *)src;
+  lsrc_cpy = (const uintptr_t *)(const void *)src_cpy;
 
   for (; len >= sizeof(uintptr_t); len -= sizeof(uintptr_t))
     {
-      ltemp = *lsrc++;
-      *ldst_xor++ = *lsrcdst_cpy ^ ltemp;
+      ltemp = *lsrc_cpy++;
+      *ldst_xor++ = *lsrcdst_cpy ^ *lsrc_xor++;
       *lsrcdst_cpy++ = ltemp;
     }
 
   dst_xor = (byte *)ldst_xor;
+  src_xor = (const byte *)lsrc_xor;
   srcdst_cpy = (byte *)lsrcdst_cpy;
-  src = (const byte *)lsrc;
+  src_cpy = (const byte *)lsrc_cpy;
 
 #ifndef BUFHELP_FAST_UNALIGNED_ACCESS
 do_bytes:
@@ -179,13 +204,22 @@ do_bytes:
   /* Handle tail.  */
   for (; len; len--)
     {
-      temp = *src++;
-      *dst_xor++ = *srcdst_cpy ^ temp;
+      temp = *src_cpy++;
+      *dst_xor++ = *srcdst_cpy ^ *src_xor++;
       *srcdst_cpy++ = temp;
     }
 }
 
 
+/* Optimized function for combined buffer xoring and copying.  Used by mainly
+   CFB mode decryption.  */
+static inline void
+buf_xor_n_copy(void *_dst_xor, void *_srcdst_cpy, const void *_src, size_t len)
+{
+  buf_xor_n_copy_2(_dst_xor, _src, _srcdst_cpy, _src, len);
+}
+
+
 #ifndef BUFHELP_FAST_UNALIGNED_ACCESS
 
 /* Functions for loading and storing unaligned u32 values of different
diff --git a/cipher/camellia-glue.c b/cipher/camellia-glue.c
index e6d4029..8c217a7 100644
--- a/cipher/camellia-glue.c
+++ b/cipher/camellia-glue.c
@@ -441,14 +441,11 @@ _gcry_camellia_cbc_dec(void *context, unsigned char *iv,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, CAMELLIA_BLOCK_SIZE);
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, savebuf);
 
-      Camellia_DecryptBlock(ctx->keybitlength, inbuf, ctx->keytable, outbuf);
-
-      buf_xor(outbuf, outbuf, iv, CAMELLIA_BLOCK_SIZE);
-      memcpy(iv, savebuf, CAMELLIA_BLOCK_SIZE);
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, CAMELLIA_BLOCK_SIZE);
       inbuf += CAMELLIA_BLOCK_SIZE;
       outbuf += CAMELLIA_BLOCK_SIZE;
     }
diff --git a/cipher/cast5.c b/cipher/cast5.c
index 8c016d7..0df7886 100644
--- a/cipher/cast5.c
+++ b/cipher/cast5.c
@@ -678,14 +678,11 @@ _gcry_cast5_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, CAST5_BLOCKSIZE);
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      do_decrypt_block (ctx, savebuf, inbuf);
 
-      do_decrypt_block (ctx, outbuf, inbuf);
-
-      buf_xor(outbuf, outbuf, iv, CAST5_BLOCKSIZE);
-      memcpy(iv, savebuf, CAST5_BLOCKSIZE);
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, CAST5_BLOCKSIZE);
       inbuf += CAST5_BLOCKSIZE;
       outbuf += CAST5_BLOCKSIZE;
     }
diff --git a/cipher/cipher-cbc.c b/cipher/cipher-cbc.c
index 523f5a6..4ad2ebd 100644
--- a/cipher/cipher-cbc.c
+++ b/cipher/cipher-cbc.c
@@ -41,14 +41,15 @@ _gcry_cipher_cbc_encrypt (gcry_cipher_hd_t c,
   unsigned char *ivp;
   int i;
   size_t blocksize = c->spec->blocksize;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   unsigned nblocks = inbuflen / blocksize;
   unsigned int burn, nburn;
 
   if (outbuflen < ((c->flags & GCRY_CIPHER_CBC_MAC)? blocksize : inbuflen))
     return GPG_ERR_BUFFER_TOO_SHORT;
 
-  if ((inbuflen % c->spec->blocksize)
-      && !(inbuflen > c->spec->blocksize
+  if ((inbuflen % blocksize)
+      && !(inbuflen > blocksize
            && (c->flags & GCRY_CIPHER_CBC_CTS)))
     return GPG_ERR_INV_LENGTH;
 
@@ -70,16 +71,21 @@ _gcry_cipher_cbc_encrypt (gcry_cipher_hd_t c,
     }
   else
     {
+      ivp = c->u_iv.iv;
+
       for (n=0; n < nblocks; n++ )
         {
-          buf_xor(outbuf, inbuf, c->u_iv.iv, blocksize);
-          nburn = c->spec->encrypt ( &c->context.c, outbuf, outbuf );
+          buf_xor (outbuf, inbuf, ivp, blocksize);
+          nburn = enc_fn ( &c->context.c, outbuf, outbuf );
           burn = nburn > burn ? nburn : burn;
-          memcpy (c->u_iv.iv, outbuf, blocksize );
+          ivp = outbuf;
           inbuf  += blocksize;
           if (!(c->flags & GCRY_CIPHER_CBC_MAC))
             outbuf += blocksize;
         }
+
+      if (ivp != c->u_iv.iv)
+        buf_cpy (c->u_iv.iv, ivp, blocksize );
     }
 
   if ((c->flags & GCRY_CIPHER_CBC_CTS) && inbuflen > blocksize)
@@ -104,9 +110,9 @@ _gcry_cipher_cbc_encrypt (gcry_cipher_hd_t c,
       for (; i < blocksize; i++)
         outbuf[i] = 0 ^ *ivp++;
 
-      nburn = c->spec->encrypt (&c->context.c, outbuf, outbuf);
+      nburn = enc_fn (&c->context.c, outbuf, outbuf);
       burn = nburn > burn ? nburn : burn;
-      memcpy (c->u_iv.iv, outbuf, blocksize);
+      buf_cpy (c->u_iv.iv, outbuf, blocksize);
     }
 
   if (burn > 0)
@@ -124,14 +130,15 @@ _gcry_cipher_cbc_decrypt (gcry_cipher_hd_t c,
   unsigned int n;
   int i;
   size_t blocksize = c->spec->blocksize;
+  gcry_cipher_decrypt_t dec_fn = c->spec->decrypt;
   unsigned int nblocks = inbuflen / blocksize;
   unsigned int burn, nburn;
 
   if (outbuflen < inbuflen)
     return GPG_ERR_BUFFER_TOO_SHORT;
 
-  if ((inbuflen % c->spec->blocksize)
-      && !(inbuflen > c->spec->blocksize
+  if ((inbuflen % blocksize)
+      && !(inbuflen > blocksize
            && (c->flags & GCRY_CIPHER_CBC_CTS)))
     return GPG_ERR_INV_LENGTH;
 
@@ -142,7 +149,7 @@ _gcry_cipher_cbc_decrypt (gcry_cipher_hd_t c,
       nblocks--;
       if ((inbuflen % blocksize) == 0)
 	nblocks--;
-      memcpy (c->lastiv, c->u_iv.iv, blocksize);
+      buf_cpy (c->lastiv, c->u_iv.iv, blocksize);
     }
 
   if (c->bulk.cbc_dec)
@@ -155,16 +162,14 @@ _gcry_cipher_cbc_decrypt (gcry_cipher_hd_t c,
     {
       for (n=0; n < nblocks; n++ )
         {
-          /* Because outbuf and inbuf might be the same, we have to
-           * save the original ciphertext block.  We use LASTIV for
-           * this here because it is not used otherwise. */
-          memcpy (c->lastiv, inbuf, blocksize);
-          nburn = c->spec->decrypt ( &c->context.c, outbuf, inbuf );
+          /* Because outbuf and inbuf might be the same, we must not overwrite
+             the original ciphertext block.  We use LASTIV as intermediate
+             storage here because it is not used otherwise.  */
+          nburn = dec_fn ( &c->context.c, c->lastiv, inbuf );
           burn = nburn > burn ? nburn : burn;
-          buf_xor(outbuf, outbuf, c->u_iv.iv, blocksize);
-          memcpy(c->u_iv.iv, c->lastiv, blocksize );
-          inbuf  += c->spec->blocksize;
-          outbuf += c->spec->blocksize;
+          buf_xor_n_copy_2(outbuf, c->lastiv, c->u_iv.iv, inbuf, blocksize);
+          inbuf  += blocksize;
+          outbuf += blocksize;
         }
     }
 
@@ -177,17 +182,17 @@ _gcry_cipher_cbc_decrypt (gcry_cipher_hd_t c,
       else
         restbytes = inbuflen % blocksize;
 
-      memcpy (c->lastiv, c->u_iv.iv, blocksize );         /* Save Cn-2. */
-      memcpy (c->u_iv.iv, inbuf + blocksize, restbytes ); /* Save Cn. */
+      buf_cpy (c->lastiv, c->u_iv.iv, blocksize );         /* Save Cn-2. */
+      buf_cpy (c->u_iv.iv, inbuf + blocksize, restbytes ); /* Save Cn. */
 
-      nburn = c->spec->decrypt ( &c->context.c, outbuf, inbuf );
+      nburn = dec_fn ( &c->context.c, outbuf, inbuf );
       burn = nburn > burn ? nburn : burn;
       buf_xor(outbuf, outbuf, c->u_iv.iv, restbytes);
 
-      memcpy(outbuf + blocksize, outbuf, restbytes);
+      buf_cpy (outbuf + blocksize, outbuf, restbytes);
       for(i=restbytes; i < blocksize; i++)
         c->u_iv.iv[i] = outbuf[i];
-      nburn = c->spec->decrypt (&c->context.c, outbuf, c->u_iv.iv);
+      nburn = dec_fn (&c->context.c, outbuf, c->u_iv.iv);
       burn = nburn > burn ? nburn : burn;
       buf_xor(outbuf, outbuf, c->lastiv, blocksize);
       /* c->lastiv is now really lastlastiv, does this matter? */
diff --git a/cipher/cipher-ccm.c b/cipher/cipher-ccm.c
index 38752d5..ebcbf1e 100644
--- a/cipher/cipher-ccm.c
+++ b/cipher/cipher-ccm.c
@@ -40,6 +40,7 @@ do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen,
             int do_padding)
 {
   const unsigned int blocksize = 16;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   unsigned char tmp[blocksize];
   unsigned int burn = 0;
   unsigned int unused = c->u_mode.ccm.mac_unused;
@@ -68,8 +69,7 @@ do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen,
         {
           /* Process one block from macbuf.  */
           buf_xor(c->u_iv.iv, c->u_iv.iv, c->u_mode.ccm.macbuf, blocksize);
-          set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_iv.iv,
-                                             c->u_iv.iv ));
+          set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv ));
 
           unused = 0;
         }
@@ -89,8 +89,7 @@ do_cbc_mac (gcry_cipher_hd_t c, const unsigned char *inbuf, size_t inlen,
             {
               buf_xor(c->u_iv.iv, c->u_iv.iv, inbuf, blocksize);
 
-              set_burn (burn, c->spec->encrypt ( &c->context.c, c->u_iv.iv,
-                                                 c->u_iv.iv ));
+              set_burn (burn, enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv ));
 
               inlen -= blocksize;
               inbuf += blocksize;
diff --git a/cipher/cipher-cfb.c b/cipher/cipher-cfb.c
index 244f5fd..610d006 100644
--- a/cipher/cipher-cfb.c
+++ b/cipher/cipher-cfb.c
@@ -37,6 +37,7 @@ _gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
                           const unsigned char *inbuf, unsigned int inbuflen)
 {
   unsigned char *ivp;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   size_t blocksize = c->spec->blocksize;
   size_t blocksize_x_2 = blocksize + blocksize;
   unsigned int burn, nburn;
@@ -48,7 +49,7 @@ _gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
     {
       /* Short enough to be encoded by the remaining XOR mask. */
       /* XOR the input with the IV and store input into IV. */
-      ivp = c->u_iv.iv + c->spec->blocksize - c->unused;
+      ivp = c->u_iv.iv + blocksize - c->unused;
       buf_xor_2dst(outbuf, ivp, inbuf, inbuflen);
       c->unused -= inbuflen;
       return 0;
@@ -83,7 +84,7 @@ _gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
       while ( inbuflen >= blocksize_x_2 )
         {
           /* Encrypt the IV. */
-          nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+          nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
           burn = nburn > burn ? nburn : burn;
           /* XOR the input with the IV and store input into IV.  */
           buf_xor_2dst(outbuf, c->u_iv.iv, inbuf, blocksize);
@@ -96,8 +97,8 @@ _gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
   if ( inbuflen >= blocksize )
     {
       /* Save the current IV and then encrypt the IV. */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       /* XOR the input with the IV and store input into IV */
       buf_xor_2dst(outbuf, c->u_iv.iv, inbuf, blocksize);
@@ -108,8 +109,8 @@ _gcry_cipher_cfb_encrypt (gcry_cipher_hd_t c,
   if ( inbuflen )
     {
       /* Save the current IV and then encrypt the IV. */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       c->unused = blocksize;
       /* Apply the XOR. */
@@ -133,6 +134,7 @@ _gcry_cipher_cfb_decrypt (gcry_cipher_hd_t c,
                           const unsigned char *inbuf, unsigned int inbuflen)
 {
   unsigned char *ivp;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   size_t blocksize = c->spec->blocksize;
   size_t blocksize_x_2 = blocksize + blocksize;
   unsigned int burn, nburn;
@@ -179,7 +181,7 @@ _gcry_cipher_cfb_decrypt (gcry_cipher_hd_t c,
       while (inbuflen >= blocksize_x_2 )
         {
           /* Encrypt the IV. */
-          nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+          nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
           burn = nburn > burn ? nburn : burn;
           /* XOR the input with the IV and store input into IV. */
           buf_xor_n_copy(outbuf, c->u_iv.iv, inbuf, blocksize);
@@ -192,8 +194,8 @@ _gcry_cipher_cfb_decrypt (gcry_cipher_hd_t c,
   if (inbuflen >= blocksize )
     {
       /* Save the current IV and then encrypt the IV. */
-      memcpy ( c->lastiv, c->u_iv.iv, blocksize);
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy ( c->lastiv, c->u_iv.iv, blocksize);
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       /* XOR the input with the IV and store input into IV */
       buf_xor_n_copy(outbuf, c->u_iv.iv, inbuf, blocksize);
@@ -205,8 +207,8 @@ _gcry_cipher_cfb_decrypt (gcry_cipher_hd_t c,
   if (inbuflen)
     {
       /* Save the current IV and then encrypt the IV. */
-      memcpy ( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy ( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       c->unused = blocksize;
       /* Apply the XOR. */
diff --git a/cipher/cipher-ctr.c b/cipher/cipher-ctr.c
index fbc898f..37a6a79 100644
--- a/cipher/cipher-ctr.c
+++ b/cipher/cipher-ctr.c
@@ -38,6 +38,7 @@ _gcry_cipher_ctr_encrypt (gcry_cipher_hd_t c,
 {
   unsigned int n;
   int i;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   unsigned int blocksize = c->spec->blocksize;
   unsigned int nblocks;
   unsigned int burn, nburn;
@@ -77,7 +78,7 @@ _gcry_cipher_ctr_encrypt (gcry_cipher_hd_t c,
       unsigned char tmp[MAX_BLOCKSIZE];
 
       do {
-        nburn = c->spec->encrypt (&c->context.c, tmp, c->u_ctr.ctr);
+        nburn = enc_fn (&c->context.c, tmp, c->u_ctr.ctr);
         burn = nburn > burn ? nburn : burn;
 
         for (i = blocksize; i > 0; i--)
@@ -98,7 +99,7 @@ _gcry_cipher_ctr_encrypt (gcry_cipher_hd_t c,
       /* Save the unused bytes of the counter.  */
       c->unused = blocksize - n;
       if (c->unused)
-        memcpy (c->lastiv+n, tmp+n, c->unused);
+        buf_cpy (c->lastiv+n, tmp+n, c->unused);
 
       wipememory (tmp, sizeof tmp);
     }
diff --git a/cipher/cipher-ofb.c b/cipher/cipher-ofb.c
index 3d9d54c..333a748 100644
--- a/cipher/cipher-ofb.c
+++ b/cipher/cipher-ofb.c
@@ -37,6 +37,7 @@ _gcry_cipher_ofb_encrypt (gcry_cipher_hd_t c,
                           const unsigned char *inbuf, unsigned int inbuflen)
 {
   unsigned char *ivp;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   size_t blocksize = c->spec->blocksize;
   unsigned int burn, nburn;
 
@@ -47,7 +48,7 @@ _gcry_cipher_ofb_encrypt (gcry_cipher_hd_t c,
     {
       /* Short enough to be encoded by the remaining XOR mask. */
       /* XOR the input with the IV */
-      ivp = c->u_iv.iv + c->spec->blocksize - c->unused;
+      ivp = c->u_iv.iv + blocksize - c->unused;
       buf_xor(outbuf, ivp, inbuf, inbuflen);
       c->unused -= inbuflen;
       return 0;
@@ -69,8 +70,8 @@ _gcry_cipher_ofb_encrypt (gcry_cipher_hd_t c,
   while ( inbuflen >= blocksize )
     {
       /* Encrypt the IV (and save the current one). */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       buf_xor(outbuf, c->u_iv.iv, inbuf, blocksize);
       outbuf += blocksize;
@@ -79,8 +80,8 @@ _gcry_cipher_ofb_encrypt (gcry_cipher_hd_t c,
     }
   if ( inbuflen )
     { /* process the remaining bytes */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       c->unused = blocksize;
       c->unused -= inbuflen;
@@ -103,6 +104,7 @@ _gcry_cipher_ofb_decrypt (gcry_cipher_hd_t c,
                           const unsigned char *inbuf, unsigned int inbuflen)
 {
   unsigned char *ivp;
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   size_t blocksize = c->spec->blocksize;
   unsigned int burn, nburn;
 
@@ -134,8 +136,8 @@ _gcry_cipher_ofb_decrypt (gcry_cipher_hd_t c,
   while ( inbuflen >= blocksize )
     {
       /* Encrypt the IV (and save the current one). */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       buf_xor(outbuf, c->u_iv.iv, inbuf, blocksize);
       outbuf += blocksize;
@@ -145,8 +147,8 @@ _gcry_cipher_ofb_decrypt (gcry_cipher_hd_t c,
   if ( inbuflen )
     { /* Process the remaining bytes. */
       /* Encrypt the IV (and save the current one). */
-      memcpy( c->lastiv, c->u_iv.iv, blocksize );
-      nburn = c->spec->encrypt ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
+      buf_cpy( c->lastiv, c->u_iv.iv, blocksize );
+      nburn = enc_fn ( &c->context.c, c->u_iv.iv, c->u_iv.iv );
       burn = nburn > burn ? nburn : burn;
       c->unused = blocksize;
       c->unused -= inbuflen;
diff --git a/cipher/cipher.c b/cipher/cipher.c
index 5214d26..c0d1d0b 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -631,6 +631,7 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, unsigned int outbuflen,
                 const unsigned char *inbuf, unsigned int inbuflen)
 {
+  gcry_cipher_encrypt_t enc_fn = c->spec->encrypt;
   unsigned int blocksize = c->spec->blocksize;
   unsigned int n, nblocks;
   unsigned int burn, nburn;
@@ -640,12 +641,12 @@ do_ecb_encrypt (gcry_cipher_hd_t c,
   if ((inbuflen % blocksize))
     return GPG_ERR_INV_LENGTH;
 
-  nblocks = inbuflen / c->spec->blocksize;
+  nblocks = inbuflen / blocksize;
   burn = 0;
 
   for (n=0; n < nblocks; n++ )
     {
-      nburn = c->spec->encrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
+      nburn = enc_fn (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
       burn = nburn > burn ? nburn : burn;
       inbuf  += blocksize;
       outbuf += blocksize;
@@ -662,6 +663,7 @@ do_ecb_decrypt (gcry_cipher_hd_t c,
                 unsigned char *outbuf, unsigned int outbuflen,
                 const unsigned char *inbuf, unsigned int inbuflen)
 {
+  gcry_cipher_decrypt_t dec_fn = c->spec->decrypt;
   unsigned int blocksize = c->spec->blocksize;
   unsigned int n, nblocks;
   unsigned int burn, nburn;
@@ -671,12 +673,12 @@ do_ecb_decrypt (gcry_cipher_hd_t c,
   if ((inbuflen % blocksize))
     return GPG_ERR_INV_LENGTH;
 
-  nblocks = inbuflen / c->spec->blocksize;
+  nblocks = inbuflen / blocksize;
   burn = 0;
 
   for (n=0; n < nblocks; n++ )
     {
-      nburn = c->spec->decrypt (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
+      nburn = dec_fn (&c->context.c, outbuf, (byte*)/*arggg*/inbuf);
       burn = nburn > burn ? nburn : burn;
       inbuf  += blocksize;
       outbuf += blocksize;
diff --git a/cipher/rijndael.c b/cipher/rijndael.c
index e9bb4f6..e8733c9 100644
--- a/cipher/rijndael.c
+++ b/cipher/rijndael.c
@@ -675,9 +675,9 @@ do_encrypt (const RIJNDAEL_context *ctx,
         byte b[16] ATTR_ALIGNED_16;
       } b;
 
-      memcpy (a.a, ax, 16);
+      buf_cpy (a.a, ax, 16);
       do_encrypt_aligned (ctx, b.b, a.a);
-      memcpy (bx, b.b, 16);
+      buf_cpy (bx, b.b, 16);
     }
   else
 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
@@ -1556,12 +1556,15 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
   RIJNDAEL_context *ctx = context;
   unsigned char *outbuf = outbuf_arg;
   const unsigned char *inbuf = inbuf_arg;
+  unsigned char *last_iv;
 
 #ifdef USE_AESNI
   if (ctx->use_aesni)
     aesni_prepare ();
 #endif /*USE_AESNI*/
 
+  last_iv = iv;
+
   for ( ;nblocks; nblocks-- )
     {
       if (0)
@@ -1576,24 +1579,17 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
                         "pxor %%xmm0, %%xmm1\n\t"
                         "movdqu %%xmm1, %[outbuf]\n\t"
                         : /* No output */
-                        : [iv] "m" (*iv),
+                        : [iv] "m" (*last_iv),
                           [inbuf] "m" (*inbuf),
                           [outbuf] "m" (*outbuf)
                         : "memory" );
 
           do_aesni (ctx, 0, outbuf, outbuf);
-
-          asm volatile ("movdqu %[outbuf], %%xmm0\n\t"
-                        "movdqu %%xmm0, %[iv]\n\t"
-                        : /* No output */
-                        : [outbuf] "m" (*outbuf),
-                          [iv] "m" (*iv)
-                        : "memory" );
         }
 #endif /*USE_AESNI*/
       else
         {
-          buf_xor(outbuf, inbuf, iv, BLOCKSIZE);
+          buf_xor(outbuf, inbuf, last_iv, BLOCKSIZE);
 
           if (0)
             ;
@@ -1603,18 +1599,34 @@ _gcry_aes_cbc_enc (void *context, unsigned char *iv,
 #endif /*USE_PADLOCK*/
           else
             do_encrypt (ctx, outbuf, outbuf );
-
-          memcpy (iv, outbuf, BLOCKSIZE);
         }
 
+      last_iv = outbuf;
       inbuf += BLOCKSIZE;
       if (!cbc_mac)
         outbuf += BLOCKSIZE;
     }
 
+  if (last_iv != iv)
+    {
+      if (0)
+        ;
+#ifdef USE_AESNI
+      else if (ctx->use_aesni)
+        asm volatile ("movdqu %[last], %%xmm0\n\t"
+                      "movdqu %%xmm0, %[iv]\n\t"
+                      : /* No output */
+                      : [last] "m" (*last_iv),
+                        [iv] "m" (*iv)
+                      : "memory" );
+#endif /*USE_AESNI*/
+      else
+        buf_cpy (iv, last_iv, BLOCKSIZE);
+    }
+
 #ifdef USE_AESNI
-  if (ctx->use_aesni)
-    aesni_cleanup ();
+   if (ctx->use_aesni)
+      aesni_cleanup ();
 #endif /*USE_AESNI*/
 
   _gcry_burn_stack (48 + 2*sizeof(int));
@@ -1810,9 +1822,9 @@ do_decrypt (RIJNDAEL_context *ctx, byte *bx, const byte *ax)
         byte b[16] ATTR_ALIGNED_16;
       } b;
 
-      memcpy (a.a, ax, 16);
+      buf_cpy (a.a, ax, 16);
       do_decrypt_aligned (ctx, b.b, a.a);
-      memcpy (bx, b.b, 16);
+      buf_cpy (bx, b.b, 16);
     }
   else
 #endif /*!USE_AMD64_ASM && !USE_ARM_ASM*/
@@ -2068,21 +2080,19 @@ _gcry_aes_cbc_dec (void *context, unsigned char *iv,
   else
     for ( ;nblocks; nblocks-- )
       {
-        /* We need to save INBUF away because it may be identical to
-           OUTBUF.  */
-        memcpy (savebuf, inbuf, BLOCKSIZE);
+        /* INBUF is needed later and it may be identical to OUTBUF, so store
+           the intermediate result to SAVEBUF.  */
 
         if (0)
           ;
 #ifdef USE_PADLOCK
         else if (ctx->use_padlock)
-          do_padlock (ctx, 1, outbuf, inbuf);
+          do_padlock (ctx, 1, savebuf, inbuf);
 #endif /*USE_PADLOCK*/
         else
-          do_decrypt (ctx, outbuf, inbuf);
+          do_decrypt (ctx, savebuf, inbuf);
 
-        buf_xor(outbuf, outbuf, iv, BLOCKSIZE);
-        memcpy (iv, savebuf, BLOCKSIZE);
+        buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, BLOCKSIZE);
         inbuf += BLOCKSIZE;
         outbuf += BLOCKSIZE;
       }
diff --git a/cipher/serpent.c b/cipher/serpent.c
index c0898dc..a8ee15f 100644
--- a/cipher/serpent.c
+++ b/cipher/serpent.c
@@ -950,14 +950,11 @@ _gcry_serpent_cbc_dec(void *context, unsigned char *iv,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, sizeof(serpent_block_t));
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      serpent_decrypt_internal (ctx, inbuf, savebuf);
 
-      serpent_decrypt_internal (ctx, inbuf, outbuf);
-
-      buf_xor(outbuf, outbuf, iv, sizeof(serpent_block_t));
-      memcpy(iv, savebuf, sizeof(serpent_block_t));
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, sizeof(serpent_block_t));
       inbuf += sizeof(serpent_block_t);
       outbuf += sizeof(serpent_block_t);
     }
diff --git a/cipher/twofish.c b/cipher/twofish.c
index 086df76..04081db 100644
--- a/cipher/twofish.c
+++ b/cipher/twofish.c
@@ -1055,16 +1055,13 @@ _gcry_twofish_cbc_dec(void *context, unsigned char *iv, void *outbuf_arg,
 
   for ( ;nblocks; nblocks-- )
     {
-      /* We need to save INBUF away because it may be identical to
-         OUTBUF.  */
-      memcpy(savebuf, inbuf, TWOFISH_BLOCKSIZE);
-
-      burn = twofish_decrypt (ctx, outbuf, inbuf);
+      /* INBUF is needed later and it may be identical to OUTBUF, so store
+         the intermediate result to SAVEBUF.  */
+      burn = twofish_decrypt (ctx, savebuf, inbuf);
       if (burn > burn_stack_depth)
         burn_stack_depth = burn;
 
-      buf_xor(outbuf, outbuf, iv, TWOFISH_BLOCKSIZE);
-      memcpy(iv, savebuf, TWOFISH_BLOCKSIZE);
+      buf_xor_n_copy_2(outbuf, savebuf, iv, inbuf, TWOFISH_BLOCKSIZE);
       inbuf += TWOFISH_BLOCKSIZE;
       outbuf += TWOFISH_BLOCKSIZE;
     }

commit 2901a10dbf1264707debc8402546c07eeac60932
Author: Jussi Kivilinna <jussi.kivilinna at iki.fi>
Date:   Wed Oct 23 18:36:18 2013 +0300

    bufhelp: enable unaligned memory accesses for AArch64 (64-bit ARM)
    
    * cipher/bufhelp.h [__aarch64__] (BUFHELP_FAST_UNALIGNED_ACCESS): Set
    macro on AArch64.
    --
    Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>

diff --git a/cipher/bufhelp.h b/cipher/bufhelp.h
index 21fac07..198d286 100644
--- a/cipher/bufhelp.h
+++ b/cipher/bufhelp.h
@@ -35,7 +35,8 @@
 
 #if defined(__i386__) || defined(__x86_64__) || \
     defined(__powerpc__) || defined(__powerpc64__) || \
-    (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED))
+    (defined(__arm__) && defined(__ARM_FEATURE_UNALIGNED)) || \
+    defined(__aarch64__)
 /* These architectures are able of unaligned memory accesses and can
    handle those fast.
  */

-----------------------------------------------------------------------

Summary of changes:
 cipher/blowfish.c      |   11 ++---
 cipher/bufhelp.h       |  107 ++++++++++++++++++++++++++++++++----------------
 cipher/camellia-glue.c |   11 ++---
 cipher/cast5.c         |   11 ++---
 cipher/cipher-cbc.c    |   53 +++++++++++++-----------
 cipher/cipher-ccm.c    |    7 ++--
 cipher/cipher-cfb.c    |   24 ++++++-----
 cipher/cipher-ctr.c    |    5 ++-
 cipher/cipher-ofb.c    |   20 +++++----
 cipher/cipher.c        |   10 +++--
 cipher/rijndael.c      |   58 +++++++++++++++-----------
 cipher/serpent.c       |   11 ++---
 cipher/twofish.c       |   11 ++---
 src/g10lib.h           |   85 +++++++++++---------------------------
 14 files changed, 215 insertions(+), 209 deletions(-)


hooks/post-receive
-- 
The GNU crypto library
http://git.gnupg.org




More information about the Gnupg-commits mailing list