[PATCH 1/3] Optimizations for SM4 cipher

Jussi Kivilinna jussi.kivilinna at iki.fi
Tue Jun 16 21:28:23 CEST 2020


* cipher/cipher.c (_gcry_cipher_open_internal): Add SM4 bulk
functions.
* cipher/sm4.c (ATTR_ALIGNED_64): New.
(sbox): Convert to ...
(sbox_table): ... this structure for sbox hardening as is done
for AES and GCM.
(prefetch_sbox_table): New.
(sm4_t_non_lin_sub): Make inline; Optimize sbox access pattern.
(sm4_key_lin_sub): Make inline.
(sm4_enc_lin_sub): Make inline; Tune slightly.
(sm4_key_sub, sm4_enc_sub): Make inline.
(sm4_round): Make inline; Take 'x' as separate parameters instead
of array.
(sm4_expand_key): Return void; Drop keylen; Unroll loops by 4;
Wipe sensitive variables at end; Move key-length check to
'sm4_setkey'.
(sm4_setkey): Add initial self-test step; Add key-length check;
Remove burn stack (as variables wiped in 'sm4_expand_key').
(sm4_do_crypt): Return burn stack depth; Unroll loops by 4.
(sm4_encrypt, sm4_decrypt): Prefetch sbox table; Return burn
stack depth from 'sm4_do_crypt', as this allows tail-call
optimization by the compiler.
(sm4_do_crypt_blks2): New two parallel block function for greater
instruction level parallelism.
(sm4_crypt_blocks, _gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
(_gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): New
bulk processing functions.
(selftest_ctr_128, selftest_cbc_128, selftest_cfb_128): New
bulk processing self-tests.
(sm4_selftest): Clear SM4 context before use; Use 'sm4_expand_key'
instead of 'sm4_setkey'; Call bulk processing self-tests.
* src/cipher.h (_gcry_sm4_ctr_enc, _gcry_sm4_cbc_dec)
(_gcry_sm4_cfb_dec, _gcry_sm4_ocb_crypt, _gcry_sm4_ocb_auth): New.
* tests/basic.c (check_ocb_cipher): Add SM4-OCB test vector.
--

Benchmark on AMD Ryzen 7 3700X (x86-64):

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     17.69 ns/B     53.92 MiB/s     76.50 c/B      4326
        ECB dec |     17.74 ns/B     53.77 MiB/s     76.72 c/B      4325
        CBC enc |     18.14 ns/B     52.56 MiB/s     78.47 c/B      4325
        CBC dec |     18.05 ns/B     52.83 MiB/s     78.09 c/B      4326
        CFB enc |     18.19 ns/B     52.44 MiB/s     78.67 c/B      4326
        CFB dec |     18.16 ns/B     52.53 MiB/s     78.53 c/B      4326
        OFB enc |     16.82 ns/B     56.70 MiB/s     72.96 c/B      4338
        OFB dec |     16.87 ns/B     56.53 MiB/s     72.96 c/B      4325
        CTR enc |     18.17 ns/B     52.47 MiB/s     78.62 c/B      4326
        CTR dec |     18.02 ns/B     52.94 MiB/s     77.92 c/B      4325
        XTS enc |     17.70 ns/B     53.87 MiB/s     76.11 c/B      4300
        XTS dec |     17.65 ns/B     54.04 MiB/s     76.28 c/B      4323±1
        CCM enc |     33.76 ns/B     28.25 MiB/s     146.9 c/B      4350
        CCM dec |     34.07 ns/B     27.99 MiB/s     147.4 c/B      4326
       CCM auth |     16.97 ns/B     56.19 MiB/s     73.41 c/B      4325
        EAX enc |     34.02 ns/B     28.03 MiB/s     147.1 c/B      4325
        EAX dec |     36.56 ns/B     26.08 MiB/s     159.1 c/B      4350
       EAX auth |     17.02 ns/B     56.03 MiB/s     73.62 c/B      4325
        GCM enc |     16.76 ns/B     56.90 MiB/s     72.50 c/B      4325
        GCM dec |     18.01 ns/B     52.94 MiB/s     78.37 c/B      4350
       GCM auth |     0.120 ns/B      7975 MiB/s     0.517 c/B      4325
        OCB enc |     18.19 ns/B     52.43 MiB/s     78.68 c/B      4325
        OCB dec |     18.15 ns/B     52.54 MiB/s     78.51 c/B      4325
       OCB auth |     16.87 ns/B     56.54 MiB/s     72.95 c/B      4325

After (non-parallelizable modes ~2.0x faster, parallel modes ~3.8x):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |      8.28 ns/B     115.1 MiB/s     35.84 c/B      4327±1
        ECB dec |      8.33 ns/B     114.4 MiB/s     36.13 c/B      4336±1
        CBC enc |      8.94 ns/B     106.7 MiB/s     38.66 c/B      4325
        CBC dec |      4.78 ns/B     199.7 MiB/s     20.42 c/B      4275
        CFB enc |      8.95 ns/B     106.5 MiB/s     38.72 c/B      4325
        CFB dec |      4.81 ns/B     198.2 MiB/s     20.57 c/B      4275
        OFB enc |      8.48 ns/B     112.5 MiB/s     36.66 c/B      4325
        OFB dec |      8.42 ns/B     113.3 MiB/s     36.41 c/B      4325
        CTR enc |      4.81 ns/B     198.2 MiB/s     20.69 c/B      4300
        CTR dec |      4.80 ns/B     198.8 MiB/s     20.63 c/B      4300
        XTS enc |      8.75 ns/B     109.0 MiB/s     37.83 c/B      4325
        XTS dec |      8.86 ns/B     107.7 MiB/s     38.30 c/B      4326
        CCM enc |     13.74 ns/B     69.42 MiB/s     59.42 c/B      4325
        CCM dec |     13.77 ns/B     69.25 MiB/s     59.57 c/B      4326
       CCM auth |      8.87 ns/B     107.5 MiB/s     38.36 c/B      4325
        EAX enc |     13.76 ns/B     69.29 MiB/s     59.54 c/B      4326
        EAX dec |     13.77 ns/B     69.25 MiB/s     59.57 c/B      4325
       EAX auth |      8.89 ns/B     107.3 MiB/s     38.44 c/B      4325
        GCM enc |      4.96 ns/B     192.3 MiB/s     21.20 c/B      4275
        GCM dec |      4.91 ns/B     194.4 MiB/s     21.10 c/B      4300
       GCM auth |     0.116 ns/B      8232 MiB/s     0.504 c/B      4351
        OCB enc |      4.88 ns/B     195.5 MiB/s     20.86 c/B      4275
        OCB dec |      4.85 ns/B     196.6 MiB/s     20.86 c/B      4301
       OCB auth |      4.80 ns/B     198.9 MiB/s     20.62 c/B      4301

Benchmark on ARM Cortex-A53 (aarch64):

Before:
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     84.08 ns/B     11.34 MiB/s     54.48 c/B     648.0
        ECB dec |     84.07 ns/B     11.34 MiB/s     54.47 c/B     648.0
        CBC enc |     84.90 ns/B     11.23 MiB/s     55.01 c/B     647.9
        CBC dec |     84.69 ns/B     11.26 MiB/s     54.87 c/B     648.0
        CFB enc |     84.55 ns/B     11.28 MiB/s     54.79 c/B     648.0
        CFB dec |     84.55 ns/B     11.28 MiB/s     54.78 c/B     648.0
        OFB enc |     84.45 ns/B     11.29 MiB/s     54.72 c/B     647.9
        OFB dec |     84.45 ns/B     11.29 MiB/s     54.72 c/B     648.0
        CTR enc |     85.42 ns/B     11.16 MiB/s     55.35 c/B     648.0
        CTR dec |     85.42 ns/B     11.16 MiB/s     55.35 c/B     648.0
        XTS enc |     88.72 ns/B     10.75 MiB/s     57.49 c/B     648.0
        XTS dec |     88.71 ns/B     10.75 MiB/s     57.48 c/B     648.0
        CCM enc |     170.2 ns/B      5.60 MiB/s     110.3 c/B     647.9
        CCM dec |     170.2 ns/B      5.60 MiB/s     110.3 c/B     648.0
       CCM auth |     84.27 ns/B     11.32 MiB/s     54.60 c/B     648.0
        EAX enc |     170.6 ns/B      5.59 MiB/s     110.5 c/B     648.0
        EAX dec |     170.6 ns/B      5.59 MiB/s     110.5 c/B     648.0
       EAX auth |     84.51 ns/B     11.29 MiB/s     54.76 c/B     648.0
        GCM enc |     86.99 ns/B     10.96 MiB/s     56.36 c/B     648.0
        GCM dec |     87.00 ns/B     10.96 MiB/s     56.37 c/B     648.0
       GCM auth |      1.56 ns/B     609.9 MiB/s      1.01 c/B     648.0
        OCB enc |     86.77 ns/B     10.99 MiB/s     56.22 c/B     648.0
        OCB dec |     86.77 ns/B     10.99 MiB/s     56.22 c/B     648.0
       OCB auth |     86.20 ns/B     11.06 MiB/s     55.85 c/B     648.0

After (non-parallelizable modes ~30% faster, parallel modes ~80%):
 SM4            |  nanosecs/byte   mebibytes/sec   cycles/byte  auto Mhz
        ECB enc |     64.85 ns/B     14.71 MiB/s     42.02 c/B     648.0
        ECB dec |     64.78 ns/B     14.72 MiB/s     41.98 c/B     648.0
        CBC enc |     64.53 ns/B     14.78 MiB/s     41.81 c/B     647.9
        CBC dec |     45.09 ns/B     21.15 MiB/s     29.21 c/B     648.0
        CFB enc |     64.56 ns/B     14.77 MiB/s     41.84 c/B     648.0
        CFB dec |     45.52 ns/B     20.95 MiB/s     29.49 c/B     647.9
        OFB enc |     64.14 ns/B     14.87 MiB/s     41.56 c/B     648.0
        OFB dec |     64.14 ns/B     14.87 MiB/s     41.56 c/B     648.0
        CTR enc |     45.54 ns/B     20.94 MiB/s     29.51 c/B     648.0
        CTR dec |     45.53 ns/B     20.95 MiB/s     29.50 c/B     648.0
        XTS enc |     67.88 ns/B     14.05 MiB/s     43.98 c/B     648.0
        XTS dec |     67.69 ns/B     14.09 MiB/s     43.86 c/B     648.0
        CCM enc |     110.6 ns/B      8.62 MiB/s     71.66 c/B     648.0
        CCM dec |     110.2 ns/B      8.65 MiB/s     71.42 c/B     648.0
       CCM auth |     64.87 ns/B     14.70 MiB/s     42.04 c/B     648.0
        EAX enc |     109.9 ns/B      8.68 MiB/s     71.22 c/B     648.0
        EAX dec |     109.9 ns/B      8.68 MiB/s     71.22 c/B     648.0
       EAX auth |     64.37 ns/B     14.81 MiB/s     41.71 c/B     648.0
        GCM enc |     47.07 ns/B     20.26 MiB/s     30.51 c/B     648.0
        GCM dec |     47.08 ns/B     20.26 MiB/s     30.51 c/B     648.0
       GCM auth |      1.55 ns/B     614.7 MiB/s      1.01 c/B     648.0
        OCB enc |     48.38 ns/B     19.71 MiB/s     31.35 c/B     648.0
        OCB dec |     48.11 ns/B     19.82 MiB/s     31.17 c/B     648.0
       OCB auth |     46.71 ns/B     20.42 MiB/s     30.27 c/B     648.0

Signed-off-by: Jussi Kivilinna <jussi.kivilinna at iki.fi>
---
 cipher/cipher.c |   9 +
 cipher/sm4.c    | 709 ++++++++++++++++++++++++++++++++++++++++++------
 src/cipher.h    |  16 ++
 tests/basic.c   |   2 +
 4 files changed, 648 insertions(+), 88 deletions(-)

diff --git a/cipher/cipher.c b/cipher/cipher.c
index dfb083a0..c77c9682 100644
--- a/cipher/cipher.c
+++ b/cipher/cipher.c
@@ -707,6 +707,15 @@ _gcry_cipher_open_internal (gcry_cipher_hd_t *handle,
               h->bulk.ocb_auth  = _gcry_serpent_ocb_auth;
               break;
 #endif /*USE_SERPENT*/
+#ifdef USE_SM4
+	    case GCRY_CIPHER_SM4:
+              h->bulk.cbc_dec = _gcry_sm4_cbc_dec;
+              h->bulk.cfb_dec = _gcry_sm4_cfb_dec;
+              h->bulk.ctr_enc = _gcry_sm4_ctr_enc;
+              h->bulk.ocb_crypt = _gcry_sm4_ocb_crypt;
+              h->bulk.ocb_auth  = _gcry_sm4_ocb_auth;
+              break;
+#endif /*USE_SM4*/
 #ifdef USE_TWOFISH
 	    case GCRY_CIPHER_TWOFISH:
 	    case GCRY_CIPHER_TWOFISH128:
diff --git a/cipher/sm4.c b/cipher/sm4.c
index 061ee26e..621532fa 100644
--- a/cipher/sm4.c
+++ b/cipher/sm4.c
@@ -1,6 +1,7 @@
 /* sm4.c  -  SM4 Cipher Algorithm
  * Copyright (C) 2020 Alibaba Group.
  * Copyright (C) 2020 Tianjia Zhang <tianjia.zhang at linux.alibaba.com>
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna at iki.fi>
  *
  * This file is part of Libgcrypt.
  *
@@ -27,6 +28,17 @@
 #include "g10lib.h"
 #include "cipher.h"
 #include "bufhelp.h"
+#include "cipher-internal.h"
+#include "cipher-selftest.h"
+
+/* Helper macro to force alignment to 64 bytes.  */
+#ifdef HAVE_GCC_ATTRIBUTE_ALIGNED
+# define ATTR_ALIGNED_64  __attribute__ ((aligned (64)))
+#else
+# define ATTR_ALIGNED_64
+#endif
+
+static const char *sm4_selftest (void);
 
 typedef struct
 {
@@ -34,46 +46,60 @@ typedef struct
   u32 rkey_dec[32];
 } SM4_context;
 
-static const u32 fk[4] = {
+static const u32 fk[4] =
+{
   0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
 };
 
-static const byte sbox[256] = {
-  0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
-  0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
-  0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
-  0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
-  0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
-  0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
-  0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
-  0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
-  0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
-  0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
-  0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
-  0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
-  0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
-  0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
-  0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
-  0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
-  0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
-  0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
-  0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
-  0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
-  0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
-  0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
-  0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
-  0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
-  0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
-  0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
-  0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
-  0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
-  0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
-  0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
-  0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
-  0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
-};
+static struct
+{
+  volatile u32 counter_head;
+  u32 cacheline_align[64 / 4 - 1];
+  byte S[256];
+  volatile u32 counter_tail;
+} sbox_table ATTR_ALIGNED_64 =
+  {
+    0,
+    { 0, },
+    {
+      0xd6, 0x90, 0xe9, 0xfe, 0xcc, 0xe1, 0x3d, 0xb7,
+      0x16, 0xb6, 0x14, 0xc2, 0x28, 0xfb, 0x2c, 0x05,
+      0x2b, 0x67, 0x9a, 0x76, 0x2a, 0xbe, 0x04, 0xc3,
+      0xaa, 0x44, 0x13, 0x26, 0x49, 0x86, 0x06, 0x99,
+      0x9c, 0x42, 0x50, 0xf4, 0x91, 0xef, 0x98, 0x7a,
+      0x33, 0x54, 0x0b, 0x43, 0xed, 0xcf, 0xac, 0x62,
+      0xe4, 0xb3, 0x1c, 0xa9, 0xc9, 0x08, 0xe8, 0x95,
+      0x80, 0xdf, 0x94, 0xfa, 0x75, 0x8f, 0x3f, 0xa6,
+      0x47, 0x07, 0xa7, 0xfc, 0xf3, 0x73, 0x17, 0xba,
+      0x83, 0x59, 0x3c, 0x19, 0xe6, 0x85, 0x4f, 0xa8,
+      0x68, 0x6b, 0x81, 0xb2, 0x71, 0x64, 0xda, 0x8b,
+      0xf8, 0xeb, 0x0f, 0x4b, 0x70, 0x56, 0x9d, 0x35,
+      0x1e, 0x24, 0x0e, 0x5e, 0x63, 0x58, 0xd1, 0xa2,
+      0x25, 0x22, 0x7c, 0x3b, 0x01, 0x21, 0x78, 0x87,
+      0xd4, 0x00, 0x46, 0x57, 0x9f, 0xd3, 0x27, 0x52,
+      0x4c, 0x36, 0x02, 0xe7, 0xa0, 0xc4, 0xc8, 0x9e,
+      0xea, 0xbf, 0x8a, 0xd2, 0x40, 0xc7, 0x38, 0xb5,
+      0xa3, 0xf7, 0xf2, 0xce, 0xf9, 0x61, 0x15, 0xa1,
+      0xe0, 0xae, 0x5d, 0xa4, 0x9b, 0x34, 0x1a, 0x55,
+      0xad, 0x93, 0x32, 0x30, 0xf5, 0x8c, 0xb1, 0xe3,
+      0x1d, 0xf6, 0xe2, 0x2e, 0x82, 0x66, 0xca, 0x60,
+      0xc0, 0x29, 0x23, 0xab, 0x0d, 0x53, 0x4e, 0x6f,
+      0xd5, 0xdb, 0x37, 0x45, 0xde, 0xfd, 0x8e, 0x2f,
+      0x03, 0xff, 0x6a, 0x72, 0x6d, 0x6c, 0x5b, 0x51,
+      0x8d, 0x1b, 0xaf, 0x92, 0xbb, 0xdd, 0xbc, 0x7f,
+      0x11, 0xd9, 0x5c, 0x41, 0x1f, 0x10, 0x5a, 0xd8,
+      0x0a, 0xc1, 0x31, 0x88, 0xa5, 0xcd, 0x7b, 0xbd,
+      0x2d, 0x74, 0xd0, 0x12, 0xb8, 0xe5, 0xb4, 0xb0,
+      0x89, 0x69, 0x97, 0x4a, 0x0c, 0x96, 0x77, 0x7e,
+      0x65, 0xb9, 0xf1, 0x09, 0xc5, 0x6e, 0xc6, 0x84,
+      0x18, 0xf0, 0x7d, 0xec, 0x3a, 0xdc, 0x4d, 0x20,
+      0x79, 0xee, 0x5f, 0x3e, 0xd7, 0xcb, 0x39, 0x48
+    },
+    0
+  };
 
-static const u32 ck[] = {
+static const u32 ck[] =
+{
   0x00070e15, 0x1c232a31, 0x383f464d, 0x545b6269,
   0x70777e85, 0x8c939aa1, 0xa8afb6bd, 0xc4cbd2d9,
   0xe0e7eef5, 0xfc030a11, 0x181f262d, 0x343b4249,
@@ -84,68 +110,96 @@ static const u32 ck[] = {
   0x10171e25, 0x2c333a41, 0x484f565d, 0x646b7279
 };
 
-static u32 sm4_t_non_lin_sub(u32 x)
+static inline void prefetch_sbox_table(void)
 {
-  int i;
-  byte *b = (byte *)&x;
+  const volatile byte *vtab = sbox_table.S;
+
+  /* Modify counters to trigger copy-on-write and unsharing if physical pages
+   * of look-up table are shared between processes.  Modifying counters also
+   * causes checksums for pages to change and hint same-page merging algorithm
+   * that these pages are frequently changing.  */
+  sbox_table.counter_head++;
+  sbox_table.counter_tail++;
+
+  /* Touch S itself (not the struct head) so that all of S gets cached.  */
+  (void)vtab[0 * 32];
+  (void)vtab[1 * 32];
+  (void)vtab[2 * 32];
+  (void)vtab[3 * 32];
+  (void)vtab[4 * 32];
+  (void)vtab[5 * 32];
+  (void)vtab[6 * 32];
+  (void)vtab[7 * 32];
+  (void)vtab[8 * 32 - 1];
+}
 
-  for (i = 0; i < 4; ++i)
-    b[i] = sbox[b[i]];
+static inline u32 sm4_t_non_lin_sub(u32 x)
+{
+  u32 out;
 
-  return x;
+  out  = (u32)sbox_table.S[(x >> 0) & 0xff] << 0;
+  out |= (u32)sbox_table.S[(x >> 8) & 0xff] << 8;
+  out |= (u32)sbox_table.S[(x >> 16) & 0xff] << 16;
+  out |= (u32)sbox_table.S[(x >> 24) & 0xff] << 24;
+
+  return out;
 }
 
-static u32 sm4_key_lin_sub(u32 x)
+static inline u32 sm4_key_lin_sub(u32 x)
 {
   return x ^ rol(x, 13) ^ rol(x, 23);
 }
 
-static u32 sm4_enc_lin_sub(u32 x)
+static inline u32 sm4_enc_lin_sub(u32 x)
 {
-  return x ^ rol(x, 2) ^ rol(x, 10) ^ rol(x, 18) ^ rol(x, 24);
+  u32 xrol2 = rol(x, 2);
+  return x ^ xrol2 ^ rol(xrol2, 8) ^ rol(xrol2, 16) ^ rol(x, 24);
 }
 
-static u32 sm4_key_sub(u32 x)
+static inline u32 sm4_key_sub(u32 x)
 {
   return sm4_key_lin_sub(sm4_t_non_lin_sub(x));
 }
 
-static u32 sm4_enc_sub(u32 x)
+static inline u32 sm4_enc_sub(u32 x)
 {
   return sm4_enc_lin_sub(sm4_t_non_lin_sub(x));
 }
 
-static u32 sm4_round(const u32 *x, const u32 rk)
+static inline u32
+sm4_round(const u32 x0, const u32 x1, const u32 x2, const u32 x3, const u32 rk)
 {
-  return x[0] ^ sm4_enc_sub(x[1] ^ x[2] ^ x[3] ^ rk);
+  return x0 ^ sm4_enc_sub(x1 ^ x2 ^ x3 ^ rk);
 }
 
-static gcry_err_code_t
-sm4_expand_key (SM4_context *ctx, const byte *key, const unsigned keylen)
+static void
+sm4_expand_key (SM4_context *ctx, const byte *key)
 {
-  u32 rk[4], t;
+  u32 rk[4];
   int i;
 
-  if (keylen != 16)
-    return GPG_ERR_INV_KEYLEN;
+  rk[0] = buf_get_be32(key + 4 * 0) ^ fk[0];
+  rk[1] = buf_get_be32(key + 4 * 1) ^ fk[1];
+  rk[2] = buf_get_be32(key + 4 * 2) ^ fk[2];
+  rk[3] = buf_get_be32(key + 4 * 3) ^ fk[3];
 
-  for (i = 0; i < 4; ++i)
-    rk[i] = buf_get_be32(&key[i*4]) ^ fk[i];
-
-  for (i = 0; i < 32; ++i)
+  for (i = 0; i < 32; i += 4)
     {
-      t = rk[0] ^ sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i]);
-      ctx->rkey_enc[i] = t;
-      rk[0] = rk[1];
-      rk[1] = rk[2];
-      rk[2] = rk[3];
-      rk[3] = t;
+      rk[0] = rk[0] ^ sm4_key_sub(rk[1] ^ rk[2] ^ rk[3] ^ ck[i + 0]);
+      rk[1] = rk[1] ^ sm4_key_sub(rk[2] ^ rk[3] ^ rk[0] ^ ck[i + 1]);
+      rk[2] = rk[2] ^ sm4_key_sub(rk[3] ^ rk[0] ^ rk[1] ^ ck[i + 2]);
+      rk[3] = rk[3] ^ sm4_key_sub(rk[0] ^ rk[1] ^ rk[2] ^ ck[i + 3]);
+      ctx->rkey_enc[i + 0] = rk[0];
+      ctx->rkey_enc[i + 1] = rk[1];
+      ctx->rkey_enc[i + 2] = rk[2];
+      ctx->rkey_enc[i + 3] = rk[3];
+      ctx->rkey_dec[31 - i - 0] = rk[0];
+      ctx->rkey_dec[31 - i - 1] = rk[1];
+      ctx->rkey_dec[31 - i - 2] = rk[2];
+      ctx->rkey_dec[31 - i - 3] = rk[3];
     }
 
-  for (i = 0; i < 32; ++i)
-    ctx->rkey_dec[i] = ctx->rkey_enc[31 - i];
-
-  return 0;
+  wipememory (rk, sizeof(rk));
 }
 
 static gcry_err_code_t
@@ -153,32 +207,53 @@ sm4_setkey (void *context, const byte *key, const unsigned keylen,
             gcry_cipher_hd_t hd)
 {
   SM4_context *ctx = context;
-  int rc = sm4_expand_key (ctx, key, keylen);
+  static int init = 0;
+  static const char *selftest_failed = NULL;
+
   (void)hd;
-  _gcry_burn_stack (4*5 + sizeof(int)*2);
-  return rc;
+
+  if (!init)
+    {
+      init = 1;
+      selftest_failed = sm4_selftest();
+      if (selftest_failed)
+	log_error("%s\n", selftest_failed);
+    }
+  if (selftest_failed)
+    return GPG_ERR_SELFTEST_FAILED;
+
+  if (keylen != 16)
+    return GPG_ERR_INV_KEYLEN;
+
+  sm4_expand_key (ctx, key);
+  return 0;
 }
 
-static void
+static unsigned int
 sm4_do_crypt (const u32 *rk, byte *out, const byte *in)
 {
-  u32 x[4], t;
+  u32 x[4];
   int i;
 
-  for (i = 0; i < 4; ++i)
-    x[i] = buf_get_be32(&in[i*4]);
+  x[0] = buf_get_be32(in + 0 * 4);
+  x[1] = buf_get_be32(in + 1 * 4);
+  x[2] = buf_get_be32(in + 2 * 4);
+  x[3] = buf_get_be32(in + 3 * 4);
 
-  for (i = 0; i < 32; ++i)
+  for (i = 0; i < 32; i += 4)
     {
-      t = sm4_round(x, rk[i]);
-      x[0] = x[1];
-      x[1] = x[2];
-      x[2] = x[3];
-      x[3] = t;
+      x[0] = sm4_round(x[0], x[1], x[2], x[3], rk[i + 0]);
+      x[1] = sm4_round(x[1], x[2], x[3], x[0], rk[i + 1]);
+      x[2] = sm4_round(x[2], x[3], x[0], x[1], rk[i + 2]);
+      x[3] = sm4_round(x[3], x[0], x[1], x[2], rk[i + 3]);
     }
 
-  for (i = 0; i < 4; ++i)
-    buf_put_be32(&out[i*4], x[3 - i]);
+  buf_put_be32(out + 0 * 4, x[3 - 0]);
+  buf_put_be32(out + 1 * 4, x[3 - 1]);
+  buf_put_be32(out + 2 * 4, x[3 - 2]);
+  buf_put_be32(out + 3 * 4, x[3 - 3]);
+
+  return /*burn_stack*/ 4*6+sizeof(void*)*4;
 }
 
 static unsigned int
@@ -186,8 +261,9 @@ sm4_encrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
-  sm4_do_crypt (ctx->rkey_enc, outbuf, inbuf);
-  return /*burn_stack*/ 4*6+sizeof(void*)*4;
+  prefetch_sbox_table ();
+
+  return sm4_do_crypt (ctx->rkey_enc, outbuf, inbuf);
 }
 
 static unsigned int
@@ -195,8 +271,453 @@ sm4_decrypt (void *context, byte *outbuf, const byte *inbuf)
 {
   SM4_context *ctx = context;
 
-  sm4_do_crypt (ctx->rkey_dec, outbuf, inbuf);
-  return /*burn_stack*/ 4*6+sizeof(void*)*4;
+  prefetch_sbox_table ();
+
+  return sm4_do_crypt (ctx->rkey_dec, outbuf, inbuf);
+}
+
+static unsigned int
+sm4_do_crypt_blks2 (const u32 *rk, byte *out, const byte *in)
+{
+  u32 x[4];
+  u32 y[4];
+  u32 k;
+  int i;
+
+  /* Encrypts/Decrypts two blocks for higher instruction level
+   * parallelism. */
+
+  x[0] = buf_get_be32(in + 0 * 4);
+  x[1] = buf_get_be32(in + 1 * 4);
+  x[2] = buf_get_be32(in + 2 * 4);
+  x[3] = buf_get_be32(in + 3 * 4);
+  y[0] = buf_get_be32(in + 4 * 4);
+  y[1] = buf_get_be32(in + 5 * 4);
+  y[2] = buf_get_be32(in + 6 * 4);
+  y[3] = buf_get_be32(in + 7 * 4);
+
+  for (i = 0; i < 32; i += 4)
+    {
+      k = rk[i + 0];
+      x[0] = sm4_round(x[0], x[1], x[2], x[3], k);
+      y[0] = sm4_round(y[0], y[1], y[2], y[3], k);
+      k = rk[i + 1];
+      x[1] = sm4_round(x[1], x[2], x[3], x[0], k);
+      y[1] = sm4_round(y[1], y[2], y[3], y[0], k);
+      k = rk[i + 2];
+      x[2] = sm4_round(x[2], x[3], x[0], x[1], k);
+      y[2] = sm4_round(y[2], y[3], y[0], y[1], k);
+      k = rk[i + 3];
+      x[3] = sm4_round(x[3], x[0], x[1], x[2], k);
+      y[3] = sm4_round(y[3], y[0], y[1], y[2], k);
+    }
+
+  buf_put_be32(out + 0 * 4, x[3 - 0]);
+  buf_put_be32(out + 1 * 4, x[3 - 1]);
+  buf_put_be32(out + 2 * 4, x[3 - 2]);
+  buf_put_be32(out + 3 * 4, x[3 - 3]);
+  buf_put_be32(out + 4 * 4, y[3 - 0]);
+  buf_put_be32(out + 5 * 4, y[3 - 1]);
+  buf_put_be32(out + 6 * 4, y[3 - 2]);
+  buf_put_be32(out + 7 * 4, y[3 - 3]);
+
+  return /*burn_stack*/ 4*10+sizeof(void*)*4;
+}
+
+static unsigned int
+sm4_crypt_blocks (const u32 *rk, byte *out, const byte *in,
+		  unsigned int num_blks)
+{
+  unsigned int burn_depth = 0;
+  unsigned int nburn;
+
+  while (num_blks >= 2)
+    {
+      nburn = sm4_do_crypt_blks2 (rk, out, in);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += 2 * 16;
+      in += 2 * 16;
+      num_blks -= 2;
+    }
+
+  while (num_blks)
+    {
+      nburn = sm4_do_crypt (rk, out, in);
+      burn_depth = nburn > burn_depth ? nburn : burn_depth;
+      out += 16;
+      in += 16;
+      num_blks--;
+    }
+
+  if (burn_depth)
+    burn_depth += sizeof(void *) * 5;
+  return burn_depth;
+}
+
+/* Bulk encryption of complete blocks in CTR mode.  This function is only
+   intended for the bulk encryption feature of cipher.c.  CTR is expected to be
+   of size 16. */
+void
+_gcry_sm4_ctr_enc(void *context, unsigned char *ctr,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  SM4_context *ctx = context;
+  byte *outbuf = outbuf_arg;
+  const byte *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks);
+      byte tmpbuf[16 * 8];
+      unsigned int tmp_used = 16;
+
+      if (0)
+	;
+      else
+	{
+	  prefetch_sbox_table ();
+	  crypt_blk1_8 = sm4_crypt_blocks;
+	}
+
+      /* Process remaining blocks. */
+      while (nblocks)
+	{
+	  size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+	  size_t i;
+
+	  if (curr_blks * 16 > tmp_used)
+	    tmp_used = curr_blks * 16;
+
+	  cipher_block_cpy (tmpbuf + 0 * 16, ctr, 16);
+	  for (i = 1; i < curr_blks; i++)
+	    {
+	      cipher_block_cpy (&tmpbuf[i * 16], ctr, 16);
+	      cipher_block_add (&tmpbuf[i * 16], i, 16);
+	    }
+	  cipher_block_add (ctr, curr_blks, 16);
+
+	  burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf,
+					   curr_blks);
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      cipher_block_xor (outbuf, &tmpbuf[i * 16], inbuf, 16);
+	      outbuf += 16;
+	      inbuf += 16;
+	    }
+
+	  nblocks -= curr_blks;
+	}
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CBC mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_sm4_cbc_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  SM4_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks);
+      unsigned char savebuf[16 * 8];
+      unsigned int tmp_used = 16;
+
+      if (0)
+	;
+      else
+	{
+	  prefetch_sbox_table ();
+	  crypt_blk1_8 = sm4_crypt_blocks;
+	}
+
+      /* Process remaining blocks. */
+      while (nblocks)
+	{
+	  size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+	  size_t i;
+
+	  if (curr_blks * 16 > tmp_used)
+	    tmp_used = curr_blks * 16;
+
+	  burn_stack_depth = crypt_blk1_8 (ctx->rkey_dec, savebuf, inbuf,
+					   curr_blks);
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      cipher_block_xor_n_copy_2(outbuf, &savebuf[i * 16], iv, inbuf,
+					16);
+	      outbuf += 16;
+	      inbuf += 16;
+	    }
+
+	  nblocks -= curr_blks;
+	}
+
+      wipememory(savebuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk decryption of complete blocks in CFB mode.  This function is only
+   intended for the bulk encryption feature of cipher.c. */
+void
+_gcry_sm4_cfb_dec(void *context, unsigned char *iv,
+                  void *outbuf_arg, const void *inbuf_arg,
+                  size_t nblocks)
+{
+  SM4_context *ctx = context;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  int burn_stack_depth = 0;
+
+  /* Process remaining blocks. */
+  if (nblocks)
+    {
+      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks);
+      unsigned char ivbuf[16 * 8];
+      unsigned int tmp_used = 16;
+
+      if (0)
+	;
+      else
+	{
+	  prefetch_sbox_table ();
+	  crypt_blk1_8 = sm4_crypt_blocks;
+	}
+
+      /* Process remaining blocks. */
+      while (nblocks)
+	{
+	  size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+	  size_t i;
+
+	  if (curr_blks * 16 > tmp_used)
+	    tmp_used = curr_blks * 16;
+
+	  cipher_block_cpy (&ivbuf[0 * 16], iv, 16);
+	  for (i = 1; i < curr_blks; i++)
+	    cipher_block_cpy (&ivbuf[i * 16], &inbuf[(i - 1) * 16], 16);
+	  cipher_block_cpy (iv, &inbuf[(i - 1) * 16], 16);
+
+	  burn_stack_depth = crypt_blk1_8 (ctx->rkey_enc, ivbuf, ivbuf,
+					   curr_blks);
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      cipher_block_xor (outbuf, inbuf, &ivbuf[i * 16], 16);
+	      outbuf += 16;
+	      inbuf += 16;
+	    }
+
+	  nblocks -= curr_blks;
+	}
+
+      wipememory(ivbuf, tmp_used);
+    }
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+}
+
+/* Bulk encryption/decryption of complete blocks in OCB mode; returns 0. */
+size_t
+_gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+		     const void *inbuf_arg, size_t nblocks, int encrypt)
+{
+  SM4_context *ctx = (void *)&c->context.c;
+  unsigned char *outbuf = outbuf_arg;
+  const unsigned char *inbuf = inbuf_arg;
+  u64 blkn = c->u_mode.ocb.data_nblocks;
+  int burn_stack_depth = 0;
+
+  if (nblocks)
+    {
+      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks);
+      const u32 *rk = encrypt ? ctx->rkey_enc : ctx->rkey_dec;
+      unsigned char tmpbuf[16 * 8];
+      unsigned int tmp_used = 16;
+
+      if (0)
+	;
+      else
+	{
+	  prefetch_sbox_table ();
+	  crypt_blk1_8 = sm4_crypt_blocks;
+	}
+
+      while (nblocks)
+	{
+	  size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+	  size_t i;
+
+	  if (curr_blks * 16 > tmp_used)
+	    tmp_used = curr_blks * 16;
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      const unsigned char *l = ocb_get_l(c, ++blkn);
+
+	      /* Checksum_i = Checksum_{i-1} xor P_i  */
+	      if (encrypt)
+		cipher_block_xor_1(c->u_ctr.ctr, &inbuf[i * 16], 16);
+
+	      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	      cipher_block_xor_2dst (&tmpbuf[i * 16], c->u_iv.iv, l, 16);
+	      cipher_block_xor (&outbuf[i * 16], &inbuf[i * 16],
+				c->u_iv.iv, 16);
+	    }
+
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  burn_stack_depth = crypt_blk1_8 (rk, outbuf, outbuf, curr_blks);
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      cipher_block_xor_1 (&outbuf[i * 16], &tmpbuf[i * 16], 16);
+
+	      /* Checksum_i = Checksum_{i-1} xor P_i  */
+	      if (!encrypt)
+		  cipher_block_xor_1(c->u_ctr.ctr, &outbuf[i * 16], 16);
+	    }
+
+	  outbuf += curr_blks * 16;
+	  inbuf  += curr_blks * 16;
+	  nblocks -= curr_blks;
+	}
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  c->u_mode.ocb.data_nblocks = blkn;
+
+  if (burn_stack_depth)
+    _gcry_burn_stack(burn_stack_depth);
+
+  return 0;
+}
+
+/* Bulk authentication of complete blocks in OCB mode. */
+size_t
+_gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg, size_t nblocks)
+{
+  SM4_context *ctx = (void *)&c->context.c;
+  const unsigned char *abuf = abuf_arg;
+  u64 blkn = c->u_mode.ocb.aad_nblocks;
+
+  if (nblocks)
+    {
+      unsigned int (*crypt_blk1_8)(const u32 *rk, byte *out, const byte *in,
+				   unsigned int num_blks);
+      unsigned char tmpbuf[16 * 8];
+      unsigned int tmp_used = 16;
+
+      if (0)
+	;
+      else
+	{
+	  prefetch_sbox_table ();
+	  crypt_blk1_8 = sm4_crypt_blocks;
+	}
+
+      while (nblocks)
+	{
+	  size_t curr_blks = nblocks > 8 ? 8 : nblocks;
+	  size_t i;
+
+	  if (curr_blks * 16 > tmp_used)
+	    tmp_used = curr_blks * 16;
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      const unsigned char *l = ocb_get_l(c, ++blkn);
+
+	      /* Offset_i = Offset_{i-1} xor L_{ntz(i)} */
+	      cipher_block_xor_2dst (&tmpbuf[i * 16],
+				     c->u_mode.ocb.aad_offset, l, 16);
+	      cipher_block_xor_1 (&tmpbuf[i * 16], &abuf[i * 16], 16);
+	    }
+
+	  /* C_i = Offset_i xor ENCIPHER(K, P_i xor Offset_i)  */
+	  crypt_blk1_8 (ctx->rkey_enc, tmpbuf, tmpbuf, curr_blks);
+
+	  for (i = 0; i < curr_blks; i++)
+	    {
+	      cipher_block_xor_1 (c->u_mode.ocb.aad_sum, &tmpbuf[i * 16], 16);
+	    }
+
+	  abuf += curr_blks * 16;
+	  nblocks -= curr_blks;
+	}
+
+      wipememory(tmpbuf, tmp_used);
+    }
+
+  c->u_mode.ocb.aad_nblocks = blkn;
+
+  return 0;
+}
+
+/* Run the self-tests for SM4-CTR, tests IV increment of bulk CTR
+   encryption.  Returns NULL on success. */
+static const char*
+selftest_ctr_128 (void)
+{
+  const int nblocks = 16 - 1;
+  const int blocksize = 16;
+  const int context_size = sizeof(SM4_context);
+
+  return _gcry_selftest_helper_ctr("SM4", &sm4_setkey,
+           &sm4_encrypt, &_gcry_sm4_ctr_enc, nblocks, blocksize,
+	   context_size);
+}
+
+/* Run the self-tests for SM4-CBC, tests bulk CBC decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cbc_128 (void)
+{
+  const int nblocks = 16 - 1;
+  const int blocksize = 16;
+  const int context_size = sizeof(SM4_context);
+
+  return _gcry_selftest_helper_cbc("SM4", &sm4_setkey,
+           &sm4_encrypt, &_gcry_sm4_cbc_dec, nblocks, blocksize,
+	   context_size);
+}
+
+/* Run the self-tests for SM4-CFB, tests bulk CFB decryption.
+   Returns NULL on success. */
+static const char*
+selftest_cfb_128 (void)
+{
+  const int nblocks = 16 - 1;
+  const int blocksize = 16;
+  const int context_size = sizeof(SM4_context);
+
+  return _gcry_selftest_helper_cfb("SM4", &sm4_setkey,
+           &sm4_encrypt, &_gcry_sm4_cfb_dec, nblocks, blocksize,
+	   context_size);
 }
 
 static const char *
@@ -204,6 +725,7 @@ sm4_selftest (void)
 {
   SM4_context ctx;
   byte scratch[16];
+  const char *r;
 
   static const byte plaintext[16] = {
     0x01, 0x23, 0x45, 0x67, 0x89, 0xAB, 0xCD, 0xEF,
@@ -218,7 +740,9 @@ sm4_selftest (void)
     0x86, 0xB3, 0xE9, 0x4F, 0x53, 0x6E, 0x42, 0x46
   };
 
-  sm4_setkey (&ctx, key, sizeof (key), NULL);
+  memset (&ctx, 0, sizeof(ctx));
+
+  sm4_expand_key (&ctx, key);
   sm4_encrypt (&ctx, scratch, plaintext);
   if (memcmp (scratch, ciphertext, sizeof (ciphertext)))
     return "SM4 test encryption failed.";
@@ -226,6 +750,15 @@ sm4_selftest (void)
   if (memcmp (scratch, plaintext, sizeof (plaintext)))
     return "SM4 test decryption failed.";
 
+  if ( (r = selftest_ctr_128 ()) )
+    return r;
+
+  if ( (r = selftest_cbc_128 ()) )
+    return r;
+
+  if ( (r = selftest_cfb_128 ()) )
+    return r;
+
   return NULL;
 }
 
diff --git a/src/cipher.h b/src/cipher.h
index c49bbda5..decdc4d1 100644
--- a/src/cipher.h
+++ b/src/cipher.h
@@ -241,6 +241,22 @@ size_t _gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
 size_t _gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
 			       size_t nblocks);
 
+/*-- sm4.c --*/
+void _gcry_sm4_ctr_enc (void *context, unsigned char *ctr,
+			void *outbuf_arg, const void *inbuf_arg,
+			size_t nblocks);
+void _gcry_sm4_cbc_dec (void *context, unsigned char *iv,
+                        void *outbuf_arg, const void *inbuf_arg,
+			size_t nblocks);
+void _gcry_sm4_cfb_dec (void *context, unsigned char *iv,
+			void *outbuf_arg, const void *inbuf_arg,
+			size_t nblocks);
+size_t _gcry_sm4_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
+			    const void *inbuf_arg, size_t nblocks,
+			    int encrypt);
+size_t _gcry_sm4_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
+			   size_t nblocks);
+
 /*-- twofish.c --*/
 void _gcry_twofish_ctr_enc (void *context, unsigned char *ctr,
                             void *outbuf_arg, const void *inbuf_arg,
diff --git a/tests/basic.c b/tests/basic.c
index 5acbab84..8ccb9c66 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -7035,6 +7035,8 @@ check_ocb_cipher (void)
     "\x99\xeb\x35\xb0\x62\x4e\x7b\xf1\x5e\x9f\xed\x32\x78\x90\x0b\xd0");
   check_ocb_cipher_largebuf(GCRY_CIPHER_SERPENT256, 32,
     "\x71\x66\x2f\x68\xbf\xdd\xcc\xb1\xbf\x81\x56\x5f\x01\x73\xeb\x44");
+  check_ocb_cipher_largebuf(GCRY_CIPHER_SM4, 16,
+    "\x2c\x0b\x31\x0b\xf4\x71\x9b\x01\xf4\x18\x5d\xf1\xe9\x3d\xed\x6b");
 
   /* Check that the AAD data is correctly buffered.  */
   check_ocb_cipher_splitaad ();
-- 
2.25.1




More information about the Gcrypt-devel mailing list