diff --git a/lib/Makefile b/lib/Makefile
index f4c61c3d6a49f560e8272e09866b95d8ba82bffd..f0d6a08fdfb8c303a381a27312893f7c0b6b6845 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -294,7 +294,9 @@ c_lib_objs := \
 	zuc_simd_no_aesni.o \
 	zuc_aarch64_top.o \
 	mb_mgr_zuc_submit_flush_aarch64.o \
-	mb_mgr_zuc_submit_flush_aarch64_no_aesni.o
+	mb_mgr_zuc_submit_flush_aarch64_no_aesni.o \
+	mb_mgr_snow3g_submit_flush_aarch64.o \
+	mb_mgr_snow3g_submit_flush_aarch64_no_aesni.o
 asm_generic_lib_objs := \
 	lookup_16x8bit_neon.o
 else
diff --git a/lib/aarch64/aesni_emu_aarch64.h b/lib/aarch64/aesni_emu_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..c222ea15e56e3d3b120fc881dd33abc0611116b6
--- /dev/null
+++ b/lib/aarch64/aesni_emu_aarch64.h
@@ -0,0 +1,170 @@
+/**********************************************************************
+  Copyright(c) 2022 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef AESNI_EMU_AARCH64_H
+#define AESNI_EMU_AARCH64_H
+
+#include "aesni_emu.h"
+
+#include "aarch64/constant_lookup_aarch64.h"
+#include <arm_neon.h>
+
+static const DECLARE_ALIGNED(uint8_t aes_sbox[16][16], 16) = {
+        { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
+          0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76 },
+        { 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0,
+          0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0 },
+        { 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc,
+          0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15 },
+        { 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a,
+          0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75 },
+        { 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0,
+          0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84 },
+        { 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b,
+          0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf },
+        { 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85,
+          0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8 },
+        { 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5,
+          0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2 },
+        { 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17,
+          0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73 },
+        { 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88,
+          0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb },
+        { 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c,
+          0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79 },
+        { 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9,
+          0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08 },
+        { 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6,
+          0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a },
+        { 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e,
+          0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e },
+        { 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94,
+          0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf },
+        { 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68,
+          0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16 }
+};
+
+static const DECLARE_ALIGNED(uint8_t aes_isbox[16][16], 16) = {
+        { 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38,
+          0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb },
+        { 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
+          0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb },
+        { 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d,
+          0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e },
+        { 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2,
+          0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25 },
+        { 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
+          0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92 },
+        { 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda,
+          0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84 },
+        { 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
+          0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06 },
+        { 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
+          0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b },
+        { 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea,
+          0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73 },
+        { 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85,
+          0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e },
+        { 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
+          0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b },
+        { 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20,
+          0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4 },
+        { 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31,
+          0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f },
+        { 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
+          0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef },
+        { 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0,
+          0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61 },
+        { 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26,
+          0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d }
+};
+
+/* ========================================================================== */
+/* Emulation API helper functions */
+/* ========================================================================== */
+
+static void xor_xmm(union xmm_reg *d,
+                    const union xmm_reg *s1,
+                    const union xmm_reg *s2)
+{
+        uint32_t i;
+
+        for (i = 0; i < MAX_QWORDS_PER_XMM; i++)
+                d->qword[i] = s1->qword[i] ^ s2->qword[i];
+}
+
+static void substitute_bytes(union xmm_reg *dst, const union xmm_reg *src)
+{
+        uint8x16_t vx = vld1q_u8((uint8_t const *) &src->byte[0]);
+
+        IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
+
+        vx = lookup_16x8bit_neon(vx, aes_sbox);
+        vst1q_u8((uint8_t *) &dst->byte[0], vx);
+}
+
+static uint8_t gfmul(const uint8_t x, const uint8_t y)
+{
+        uint32_t i;
+        uint8_t multiplier = y;
+        uint8_t out = 0;
+
+        for (i = 0; i < 7; i++) {
+                if (i >= 1) {
+                        /* GFMUL by 2. "xtimes" operation from FIPS document */
+                        uint8_t t = multiplier << 1; /* lop of the high bit */
+
+                        if (multiplier >> 7) /* look at the old high bit */
+                                multiplier = t ^ 0x1B; /* polynomial division */
+                        else
+                                multiplier = t;
+                }
+                if ((x >> i) & 1)
+                        out = out ^ multiplier;
+        }
+
+        return out;
+}
+
+static void mix_columns(union xmm_reg *dst, const union xmm_reg *src)
+{
+        uint32_t c;
+
+        for (c = 0; c < MAX_DWORDS_PER_XMM; c++) {
+                uint8_t s0c = src->byte[c*4+0];
+                uint8_t s1c = src->byte[c*4+1];
+                uint8_t s2c = src->byte[c*4+2];
+                uint8_t s3c = src->byte[c*4+3];
+
+                dst->byte[c*4+0] = gfmul(2, s0c) ^ gfmul(3, s1c) ^ s2c ^ s3c;
+                dst->byte[c*4+1] = s0c ^ gfmul(2, s1c) ^ gfmul(3, s2c) ^ s3c;
+                dst->byte[c*4+2] = s0c ^ s1c ^ gfmul(2, s2c) ^ gfmul(3, s3c);
+                dst->byte[c*4+3] = gfmul(3, s0c) ^ s1c ^ s2c ^ gfmul(2, s3c);
+        }
+}
+#endif /* AESNI_EMU_AARCH64_H */
diff --git a/lib/aarch64/alloc_aarch64.c b/lib/aarch64/alloc_aarch64.c
index 6677bce8e415cfcec05fe93adf8138654817be29..eb303b6c46921e1008de5d62e7dce31be7f903ab 100644
--- a/lib/aarch64/alloc_aarch64.c
+++ b/lib/aarch64/alloc_aarch64.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -56,6 +56,8 @@ const struct {
         OOO_INFO(zuc_eia3_ooo, MB_MGR_ZUC_OOO),
         OOO_INFO(zuc256_eea3_ooo, MB_MGR_ZUC_OOO),
         OOO_INFO(zuc256_eia3_ooo, MB_MGR_ZUC_OOO),
+        OOO_INFO(snow3g_uea2_ooo, MB_MGR_SNOW3G_OOO),
+        OOO_INFO(snow3g_uia2_ooo, MB_MGR_SNOW3G_OOO),
 };
 
 /**
diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c
index cd7da998a31be7d7ea046a7a96f783a684edcd2f..e1c19d74cedb31c642f82cf995e5df91d1a357dc 100644
--- a/lib/aarch64/mb_mgr_aarch64.c
+++ b/lib/aarch64/mb_mgr_aarch64.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -72,6 +72,13 @@ IMB_JOB *submit_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state,
                                                IMB_JOB *job);
 IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state);
 
+IMB_JOB *submit_job_snow3g_uea2_aarch64_common(IMB_MGR *state,
+                                               IMB_JOB *job);
+IMB_JOB *flush_job_snow3g_uea2_aarch64_common(IMB_MGR *state);
+
+IMB_JOB *submit_job_snow3g_uia2_aarch64_common(IMB_MGR *state,
+                                               IMB_JOB *job);
+IMB_JOB *flush_job_snow3g_uia2_aarch64_common(IMB_MGR *state);
 /* ====================================================================== */
 
 #define SUBMIT_JOB               submit_job_aarch64
@@ -96,6 +103,11 @@ IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state);
 #define FLUSH_JOB_ZUC256_EEA3    flush_job_zuc256_eea3_aarch64
 #define SUBMIT_JOB_ZUC256_EIA3   submit_job_zuc256_eia3_aarch64
 #define FLUSH_JOB_ZUC256_EIA3    flush_job_zuc256_eia3_aarch64
+#define SUBMIT_JOB_SNOW3G_UEA2   submit_job_snow3g_uea2_aarch64
+#define FLUSH_JOB_SNOW3G_UEA2    flush_job_snow3g_uea2_aarch64
+#define SUBMIT_JOB_SNOW3G_UIA2   submit_job_snow3g_uia2_aarch64
+#define FLUSH_JOB_SNOW3G_UIA2    flush_job_snow3g_uia2_aarch64
+
 
 static IMB_JOB *
 (*submit_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) =
@@ -129,6 +141,21 @@ static IMB_JOB *
 (*flush_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state) =
                                             flush_job_zuc256_eia3_aarch64_common;
 
+static IMB_JOB *
+(*submit_job_snow3g_uea2_aarch64)(IMB_MGR *state, IMB_JOB *job) =
+                                            submit_job_snow3g_uea2_aarch64_common;
+
+static IMB_JOB *
+(*flush_job_snow3g_uea2_aarch64)(IMB_MGR *state) =
+                                            flush_job_snow3g_uea2_aarch64_common;
+
+static IMB_JOB *
+(*submit_job_snow3g_uia2_aarch64)(IMB_MGR *state, IMB_JOB *job) =
+                                            submit_job_snow3g_uia2_aarch64_common;
+
+static IMB_JOB *
+(*flush_job_snow3g_uia2_aarch64)(IMB_MGR *state) =
+                                            flush_job_snow3g_uia2_aarch64_common;
 static void
 reset_ooo_mgrs(IMB_MGR *state)
 {
@@ -136,6 +163,8 @@ reset_ooo_mgrs(IMB_MGR *state)
         MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo;
         MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo;
         MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo;
+        MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo;
+        MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo;
 
         /* Init ZUC out-of-order fields */
         memset(zuc_eea3_ooo->lens, 0,
@@ -182,6 +211,34 @@ reset_ooo_mgrs(IMB_MGR *state)
         zuc256_eia3_ooo->init_not_done = 0;
         zuc256_eia3_ooo->unused_lane_bitmask = 0x0f;
 
+        /* Init SNOW3G out-of-order fields */
+        memset(snow3g_uea2_ooo->lens, 0,
+               sizeof(snow3g_uea2_ooo->lens));
+        memset(snow3g_uea2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uea2_ooo->job_in_lane));
+        memset(snow3g_uea2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uea2_ooo->bits_fixup));
+        snow3g_uea2_ooo->init_mask = 0;
+        snow3g_uea2_ooo->unused_lanes = 0xFF03020100;
+        snow3g_uea2_ooo->num_lanes_inuse = 0;
+        snow3g_uea2_ooo->init_done = 0;
+        memset(snow3g_uea2_ooo->ks, 0,
+               sizeof(snow3g_uea2_ooo->ks));
+        snow3g_uea2_ooo->road_block = 0;
+
+        memset(snow3g_uia2_ooo->lens, 0,
+               sizeof(snow3g_uia2_ooo->lens));
+        memset(snow3g_uia2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uia2_ooo->job_in_lane));
+        memset(snow3g_uia2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uia2_ooo->bits_fixup));
+        snow3g_uia2_ooo->init_mask = 0;
+        snow3g_uia2_ooo->unused_lanes = 0xFF03020100;
+        snow3g_uia2_ooo->num_lanes_inuse = 0;
+        snow3g_uia2_ooo->init_done = 0;
+        memset(snow3g_uia2_ooo->ks, 0,
+               sizeof(snow3g_uia2_ooo->ks));
+        snow3g_uia2_ooo->road_block = 0;
         return;
 }
 
@@ -247,6 +304,7 @@ init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs)
         state->snow3g_f8_4_buffer          = snow3g_f8_4_buffer_aarch64;
         state->snow3g_f8_8_buffer          = snow3g_f8_8_buffer_aarch64;
         state->snow3g_f8_n_buffer          = snow3g_f8_n_buffer_aarch64;
+        state->snow3g_f8_4_buffer_multikey = snow3g_f8_4_buffer_multikey_aarch64;
         state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_aarch64;
         state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_aarch64;
         state->snow3g_f9_1_buffer          = snow3g_f9_1_buffer_aarch64;
diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
index bcf6f45705f00be9c59fb05d7af5e8824f24f852..fedb481738feb8eb498295612a5433eba22675b6 100644
--- a/lib/aarch64/mb_mgr_aarch64_no_aesni.c
+++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
@@ -1,5 +1,5 @@
 /*********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -53,6 +53,14 @@ IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
 IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state,
                                                  IMB_JOB *job);
 IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
+
+IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state,
+                                               IMB_JOB *job);
+IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state);
+
+IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state,
+                                               IMB_JOB *job);
+IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state);
 /* ====================================================================== */
 
 #define SUBMIT_JOB         submit_job_aarch64_no_aesni
@@ -77,6 +85,11 @@ IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
 #define FLUSH_JOB_ZUC256_EEA3    flush_job_zuc256_eea3_aarch64_no_aesni
 #define SUBMIT_JOB_ZUC256_EIA3   submit_job_zuc256_eia3_aarch64_no_aesni
 #define FLUSH_JOB_ZUC256_EIA3    flush_job_zuc256_eia3_aarch64_no_aesni
+#define SUBMIT_JOB_SNOW3G_UEA2   submit_job_snow3g_uea2_aarch64_no_aesni
+#define FLUSH_JOB_SNOW3G_UEA2    flush_job_snow3g_uea2_aarch64_no_aesni
+#define SUBMIT_JOB_SNOW3G_UIA2   submit_job_snow3g_uia2_aarch64_no_aesni
+#define FLUSH_JOB_SNOW3G_UIA2    flush_job_snow3g_uia2_aarch64_no_aesni
+
 
 static void
 reset_ooo_mgrs(IMB_MGR *state)
@@ -85,6 +98,8 @@ reset_ooo_mgrs(IMB_MGR *state)
         MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo;
         MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo;
         MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo;
+        MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo;
+        MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo;
 
         /* Init ZUC out-of-order fields */
         memset(zuc_eea3_ooo->lens, 0,
@@ -131,6 +146,34 @@ reset_ooo_mgrs(IMB_MGR *state)
         zuc256_eia3_ooo->init_not_done = 0;
         zuc256_eia3_ooo->unused_lane_bitmask = 0x0f;
 
+        /* Init SNOW3G out-of-order fields */
+        memset(snow3g_uea2_ooo->lens, 0,
+               sizeof(snow3g_uea2_ooo->lens));
+        memset(snow3g_uea2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uea2_ooo->job_in_lane));
+        memset(snow3g_uea2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uea2_ooo->bits_fixup));
+        snow3g_uea2_ooo->init_mask = 0;
+        snow3g_uea2_ooo->unused_lanes = 0xFF03020100;
+        snow3g_uea2_ooo->num_lanes_inuse = 0;
+        snow3g_uea2_ooo->init_done = 0;
+        memset(snow3g_uea2_ooo->ks, 0,
+               sizeof(snow3g_uea2_ooo->ks));
+        snow3g_uea2_ooo->road_block = 0;
+
+        memset(snow3g_uia2_ooo->lens, 0,
+               sizeof(snow3g_uia2_ooo->lens));
+        memset(snow3g_uia2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uia2_ooo->job_in_lane));
+        memset(snow3g_uia2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uia2_ooo->bits_fixup));
+        snow3g_uia2_ooo->init_mask = 0;
+        snow3g_uia2_ooo->unused_lanes = 0xFF03020100;
+        snow3g_uia2_ooo->num_lanes_inuse = 0;
+        snow3g_uia2_ooo->init_done = 0;
+        memset(snow3g_uia2_ooo->ks, 0,
+               sizeof(snow3g_uia2_ooo->ks));
+        snow3g_uia2_ooo->road_block = 0;
         return;
 }
 
@@ -179,6 +222,8 @@ init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs)
         state->snow3g_f8_4_buffer  = snow3g_f8_4_buffer_aarch64_no_aesni;
         state->snow3g_f8_8_buffer  = snow3g_f8_8_buffer_aarch64_no_aesni;
         state->snow3g_f8_n_buffer  = snow3g_f8_n_buffer_aarch64_no_aesni;
+        state->snow3g_f8_4_buffer_multikey =
+                snow3g_f8_4_buffer_multikey_aarch64_no_aesni;
         state->snow3g_f8_8_buffer_multikey =
                 snow3g_f8_8_buffer_multikey_aarch64_no_aesni;
         state->snow3g_f8_n_buffer_multikey =
diff --git a/lib/aarch64/mb_mgr_code_aarch64.h b/lib/aarch64/mb_mgr_code_aarch64.h
index 45b4545e2d9702f8c60b6a746cadd16fa8365255..560adc58ebcec429719ccffc5a525bea38bf9144 100644
--- a/lib/aarch64/mb_mgr_code_aarch64.h
+++ b/lib/aarch64/mb_mgr_code_aarch64.h
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -128,7 +128,7 @@ SUBMIT_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
         MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo;
 
         if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
-                return submit_snow3g_uea2_job(state, job);
+                return SUBMIT_JOB_SNOW3G_UEA2(state, job);
         } else if (IMB_CIPHER_ZUC_EEA3 == job->cipher_mode) {
                 if (16 == job->key_len_in_bytes) {
                         return SUBMIT_JOB_ZUC_EEA3(zuc_eea3_ooo, job);
@@ -154,6 +154,8 @@ FLUSH_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
                 } else { /* assume 32 */
                         return FLUSH_JOB_ZUC256_EEA3(zuc256_eea3_ooo);
                 }
+        } else if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode){
+                return FLUSH_JOB_SNOW3G_UEA2(state);
         } else { /* assume IMB_CIPHER_NULL */
                 return NULL;
         }
@@ -167,7 +169,7 @@ SUBMIT_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
         MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo;
 
         if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
-                return submit_snow3g_uea2_job(state, job);
+                return SUBMIT_JOB_SNOW3G_UEA2(state, job);
         } else if(IMB_CIPHER_ZUC_EEA3 == job->cipher_mode) {
                 if (16 == job->key_len_in_bytes) {
                         return SUBMIT_JOB_ZUC_EEA3(zuc_eea3_ooo, job);
@@ -194,6 +196,8 @@ FLUSH_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
                 } else { /* assume 32 */
                         return FLUSH_JOB_ZUC256_EEA3(zuc256_eea3_ooo);
                 }
+        } else if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode){
+                return FLUSH_JOB_SNOW3G_UEA2(state);
         }
         (void) state;
         return NULL;
@@ -231,14 +235,7 @@ SUBMIT_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
 
         switch (job->hash_alg) {
         case IMB_AUTH_SNOW3G_UIA2_BITLEN:
-                IMB_SNOW3G_F9_1_BUFFER(state, (const snow3g_key_schedule_t *)
-                               job->u.SNOW3G_UIA2._key,
-                               job->u.SNOW3G_UIA2._iv,
-                               job->src + job->hash_start_src_offset_in_bytes,
-                               job->msg_len_to_hash_in_bits,
-                               job->auth_tag_output);
-                job->status |= IMB_STATUS_COMPLETED_AUTH;
-                return job;
+                return SUBMIT_JOB_SNOW3G_UIA2(state, job);
         case IMB_AUTH_ZUC_EIA3_BITLEN:
                 return SUBMIT_JOB_ZUC_EIA3(zuc_eia3_ooo, job);
         case IMB_AUTH_ZUC256_EIA3_BITLEN:
@@ -257,6 +254,8 @@ FLUSH_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
         MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo;
 
         switch (job->hash_alg) {
+        case IMB_AUTH_SNOW3G_UIA2_BITLEN:
+                return FLUSH_JOB_SNOW3G_UIA2(state);
         case IMB_AUTH_ZUC_EIA3_BITLEN:
                 return FLUSH_JOB_ZUC_EIA3(zuc_eia3_ooo);
         case IMB_AUTH_ZUC256_EIA3_BITLEN:
diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..ce55bbfbfe691cd93be64abc2484345e67d58874
--- /dev/null
+++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c
@@ -0,0 +1,42 @@
+/**********************************************************************
+  Copyright(c) 2022 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef SUBMIT_JOB_SNOW3G_UEA2
+#define SUBMIT_JOB_SNOW3G_UEA2    submit_job_snow3g_uea2_aarch64_common
+#define FLUSH_JOB_SNOW3G_UEA2     flush_job_snow3g_uea2_aarch64_common
+#define SUBMIT_JOB_SNOW3G_UIA2    submit_job_snow3g_uia2_aarch64_common
+#define FLUSH_JOB_SNOW3G_UIA2     flush_job_snow3g_uia2_aarch64_common
+#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64
+#define SNOW3G_F8_4_BUFFER_STREAM  snow3g_f8_4_buffer_stream_aarch64
+#define SNOW3G_F8_1_BUFFER_STREAM  snow3g_f8_1_buffer_stream_aarch64
+#define SNOW3G_F9_1_BUFFER_DIGEST  snow3g_f9_1_buffer_digest_aarch64
+#define SNOW3G_F9_4_BUFFER_KEYSTREAM  snow3g_f9_4_buffer_keystream_aarch64
+
+#endif
+
+#include "mb_mgr_snow3g_submit_flush_common_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c
new file mode 100644
index 0000000000000000000000000000000000000000..56eafb845a3cccd1c52f82b854b5eaf615e3fe8b
--- /dev/null
+++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c
@@ -0,0 +1,41 @@
+/**********************************************************************
+  Copyright(c) 2022 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef SUBMIT_JOB_SNOW3G_UEA2
+#define SUBMIT_JOB_SNOW3G_UEA2    submit_job_snow3g_uea2_aarch64_no_aesni
+#define FLUSH_JOB_SNOW3G_UEA2     flush_job_snow3g_uea2_aarch64_no_aesni
+#define SUBMIT_JOB_SNOW3G_UIA2    submit_job_snow3g_uia2_aarch64_no_aesni
+#define FLUSH_JOB_SNOW3G_UIA2     flush_job_snow3g_uia2_aarch64_no_aesni
+#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni
+#define SNOW3G_F8_4_BUFFER_STREAM  snow3g_f8_4_buffer_stream_aarch64_no_aesni
+#define SNOW3G_F8_1_BUFFER_STREAM  snow3g_f8_1_buffer_stream_aarch64_no_aesni
+#define SNOW3G_F9_1_BUFFER_DIGEST  snow3g_f9_1_buffer_digest_aarch64_no_aesni
+#define SNOW3G_F9_4_BUFFER_KEYSTREAM  snow3g_f9_4_buffer_keystream_aarch64_no_aesni
+#endif
+
+#include "mb_mgr_snow3g_submit_flush_common_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..0773c2c338af20743764871062bed0ea63a7d659
--- /dev/null
+++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h
@@ -0,0 +1,531 @@
+/**********************************************************************
+  Copyright(c) 2022 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H
+#define MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H
+
+#include "include/ipsec_ooo_mgr.h"
+#include "snow3g_internal.h"
+#include "snow3g.h"
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#define SNOW3G_MB_MAX_LANES_SIMD    4
+
+#define INIT_DONE_MASK  0x0F
+#define INIT_ALL_DONE   INIT_DONE_MASK
+#define JOB_IS_COMPLETED(state, i)  \
+        (((state->job_in_lane[i]) != NULL) && (state->args.byte_length[i] == 0))
+#define JOB_NOT_INITIALIZED(state, i) \
+        ((state->args.INITIALIZED[i] == 0))
+#define JOB_INITIALIZED(state, i) \
+        ((state->args.INITIALIZED[i] == 1))
+#define JOB_IS_NULL(state, i) \
+        (state->job_in_lane[i] == NULL)
+
+
+IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, IMB_JOB *job);
+IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state);
+IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job);
+IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state);
+
+static void snow3g_mb_mgr_insert_uea2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job)
+{
+    uint64_t used_lane_idx = state->unused_lanes & 0xff;
+    assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD);
+    state->unused_lanes =  state->unused_lanes >> 8;
+    state->args.iv[used_lane_idx] = job->iv;
+    state->args.keys[used_lane_idx] = job->enc_keys;
+    state->args.in[used_lane_idx] = job->src + job->cipher_start_src_offset_in_bytes;
+    state->args.out[used_lane_idx] = job->dst;
+    state->args.byte_length[used_lane_idx] = job->msg_len_to_cipher_in_bits / 8;
+    state->args.INITIALIZED[used_lane_idx] = 0;
+    state->lens[used_lane_idx] = job->msg_len_to_cipher_in_bits / 8;
+
+    state->job_in_lane[used_lane_idx] = job;
+}
+
+static void snow3g_mb_mgr_insert_uia2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job)
+{
+    uint64_t used_lane_idx = state->unused_lanes & 0xff;
+    assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD);
+    state->unused_lanes =  state->unused_lanes >> 8;
+    state->num_lanes_inuse++;
+    state->args.iv[used_lane_idx] = job->u.SNOW3G_UIA2._iv;
+    state->args.keys[used_lane_idx] = job->u.SNOW3G_UIA2._key;
+    state->args.in[used_lane_idx] = job->src + job->hash_start_src_offset_in_bytes;
+    state->args.out[used_lane_idx] = job->auth_tag_output;
+    state->args.INITIALIZED[used_lane_idx] = 0;
+    state->lens[used_lane_idx] = job->msg_len_to_hash_in_bits;
+    state->init_done = state->init_done & (~(1 << used_lane_idx) & 0xff);
+
+    state->job_in_lane[used_lane_idx] = job;
+}
+
+static IMB_JOB *snow3g_mb_mgr_free_uea2_job(MB_MGR_SNOW3G_OOO *state)
+{
+    IMB_JOB *ret = NULL;
+
+    for (int i = 0; i <= SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (JOB_IS_COMPLETED(state, i)) {
+            ret = state->job_in_lane[i];
+            ret->status |= IMB_STATUS_COMPLETED_CIPHER;
+            state->job_in_lane[i] = NULL;
+            state->unused_lanes = state->unused_lanes << 8;
+            state->unused_lanes |= i;
+            state->lens[i] = 0;
+            state->args.INITIALIZED[i] = 0;
+#ifdef SAFE_DATA
+            state->args.LFSR_0[i] = 0;
+            state->args.LFSR_1[i] = 0;
+            state->args.LFSR_2[i] = 0;
+            state->args.LFSR_3[i] = 0;
+            state->args.LFSR_4[i] = 0;
+            state->args.LFSR_5[i] = 0;
+            state->args.LFSR_6[i] = 0;
+            state->args.LFSR_7[i] = 0;
+            state->args.LFSR_8[i] = 0;
+            state->args.LFSR_9[i] = 0;
+            state->args.LFSR_10[i] = 0;
+            state->args.LFSR_11[i] = 0;
+            state->args.LFSR_12[i] = 0;
+            state->args.LFSR_13[i] = 0;
+            state->args.LFSR_14[i] = 0;
+            state->args.LFSR_15[i] = 0;
+            state->args.FSM_1[i] = 0;
+            state->args.FSM_2[i] = 0;
+            state->args.FSM_3[i] = 0;
+#endif
+            break;
+        }
+    }
+
+    return ret;
+}
+
+static IMB_JOB *snow3g_mb_mgr_free_uia2_job(MB_MGR_SNOW3G_OOO *state, int i)
+{
+    IMB_JOB *ret = NULL;
+    assert(!JOB_IS_NULL(state, i));
+    ret = state->job_in_lane[i];
+    ret->status |= IMB_STATUS_COMPLETED_AUTH;
+    state->job_in_lane[i] = NULL;
+    state->unused_lanes = state->unused_lanes << 8;
+    state->unused_lanes |= i;
+    state->num_lanes_inuse--;
+    state->lens[i] = 0;
+    state->args.INITIALIZED[i] = 0;
+    state->init_done = state->init_done & (~(1 << i) & 0xff);
+
+#ifdef SAFE_DATA
+    state->args.LFSR_0[i] = 0;
+    state->args.LFSR_1[i] = 0;
+    state->args.LFSR_2[i] = 0;
+    state->args.LFSR_3[i] = 0;
+    state->args.LFSR_4[i] = 0;
+    state->args.LFSR_5[i] = 0;
+    state->args.LFSR_6[i] = 0;
+    state->args.LFSR_7[i] = 0;
+    state->args.LFSR_8[i] = 0;
+    state->args.LFSR_9[i] = 0;
+    state->args.LFSR_10[i] = 0;
+    state->args.LFSR_11[i] = 0;
+    state->args.LFSR_12[i] = 0;
+    state->args.LFSR_13[i] = 0;
+    state->args.LFSR_14[i] = 0;
+    state->args.LFSR_15[i] = 0;
+    state->args.FSM_1[i] = 0;
+    state->args.FSM_2[i] = 0;
+    state->args.FSM_3[i] = 0;
+    for (int k = 0; k < 5; k++) {
+        state->ks[i * 5 + k] = 0;
+    }
+#endif
+
+    return ret;
+}
+
+__forceinline
+void cpy_snow3g_state_to_ctx_1(snow3gKeyState1_t* ctx, MB_MGR_SNOW3G_OOO* state, const int num_lane) {
+    SNOW3G_ARGS args = state->args;
+    ctx->LFSR_S[0] = args.LFSR_0[num_lane];
+    ctx->LFSR_S[1] = args.LFSR_1[num_lane];
+    ctx->LFSR_S[2] = args.LFSR_2[num_lane];
+    ctx->LFSR_S[3] = args.LFSR_3[num_lane];
+    ctx->LFSR_S[4] = args.LFSR_4[num_lane];
+    ctx->LFSR_S[5] = args.LFSR_5[num_lane];
+    ctx->LFSR_S[6] = args.LFSR_6[num_lane];
+    ctx->LFSR_S[7] = args.LFSR_7[num_lane];
+    ctx->LFSR_S[8] = args.LFSR_8[num_lane];
+    ctx->LFSR_S[9] = args.LFSR_9[num_lane];
+    ctx->LFSR_S[10] = args.LFSR_10[num_lane];
+    ctx->LFSR_S[11] = args.LFSR_11[num_lane];
+    ctx->LFSR_S[12] = args.LFSR_12[num_lane];
+    ctx->LFSR_S[13] = args.LFSR_13[num_lane];
+    ctx->LFSR_S[14] = args.LFSR_14[num_lane];
+    ctx->LFSR_S[15] = args.LFSR_15[num_lane];
+    ctx->FSM_R1 = args.FSM_1[num_lane];
+    ctx->FSM_R2 = args.FSM_2[num_lane];
+    ctx->FSM_R3 = args.FSM_3[num_lane];
+}
+
+__forceinline
+void cpy_snow3g_ctx_to_state_after_stream(MB_MGR_SNOW3G_OOO* state, snow3gKeyState4_t* ctx) {
+    SNOW3G_ARGS *args = &(state->args);
+    const uint32_t *pLFSR_0 = (const uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X];
+    const uint32_t *pLFSR_1 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15];
+    const uint32_t *pLFSR_2 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15];
+    const uint32_t *pLFSR_3 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15];
+    const uint32_t *pLFSR_4 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15];
+    const uint32_t *pLFSR_5 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15];
+    const uint32_t *pLFSR_6 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15];
+    const uint32_t *pLFSR_7 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15];
+    const uint32_t *pLFSR_8 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15];
+    const uint32_t *pLFSR_9 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15];
+    const uint32_t *pLFSR_10 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15];
+    const uint32_t *pLFSR_11 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15];
+    const uint32_t *pLFSR_12 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15];
+    const uint32_t *pLFSR_13 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) & 15];
+    const uint32_t *pLFSR_14 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15];
+    const uint32_t *pLFSR_15 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15];
+    const uint32_t *pFSM_1 = (const uint32_t *) &ctx->FSM_X[0];
+    const uint32_t *pFSM_2 = (const uint32_t *) &ctx->FSM_X[1];
+    const uint32_t *pFSM_3 = (const uint32_t *) &ctx->FSM_X[2];
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (!JOB_IS_COMPLETED(state, i)) {
+            args->LFSR_0[i] = pLFSR_0[i];
+            args->LFSR_1[i] = pLFSR_1[i];
+            args->LFSR_2[i] = pLFSR_2[i];
+            args->LFSR_3[i] = pLFSR_3[i];
+            args->LFSR_4[i] = pLFSR_4[i];
+            args->LFSR_5[i] = pLFSR_5[i];
+            args->LFSR_6[i] = pLFSR_6[i];
+            args->LFSR_7[i] = pLFSR_7[i];
+            args->LFSR_8[i] = pLFSR_8[i];
+            args->LFSR_9[i] = pLFSR_9[i];
+            args->LFSR_10[i] = pLFSR_10[i];
+            args->LFSR_11[i] = pLFSR_11[i];
+            args->LFSR_12[i] = pLFSR_12[i];
+            args->LFSR_13[i] = pLFSR_13[i];
+            args->LFSR_14[i] = pLFSR_14[i];
+            args->LFSR_15[i] = pLFSR_15[i];
+            args->FSM_1[i] = pFSM_1[i];
+            args->FSM_2[i] = pFSM_2[i];
+            args->FSM_3[i] = pFSM_3[i];
+        }
+    }
+}
+
+__forceinline
+void cpy_snow3g_state_to_ctx_after_initialize(snow3gKeyState4_t* ctx, MB_MGR_SNOW3G_OOO* state) {
+    SNOW3G_ARGS *args = &(state->args);
+    uint32_t *pLFSR_0 = (uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X];
+    uint32_t *pLFSR_1 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15];
+    uint32_t *pLFSR_2 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15];
+    uint32_t *pLFSR_3 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15];
+    uint32_t *pLFSR_4 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15];
+    uint32_t *pLFSR_5 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15];
+    uint32_t *pLFSR_6 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15];
+    uint32_t *pLFSR_7 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15];
+    uint32_t *pLFSR_8 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15];
+    uint32_t *pLFSR_9 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15];
+    uint32_t *pLFSR_10 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15];
+    uint32_t *pLFSR_11 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15];
+    uint32_t *pLFSR_12 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15];
+    uint32_t *pLFSR_13 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) & 15];
+    uint32_t *pLFSR_14 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15];
+    uint32_t *pLFSR_15 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15];
+    uint32_t *pFSM_1 = (uint32_t *) &ctx->FSM_X[0];
+    uint32_t *pFSM_2 = (uint32_t *) &ctx->FSM_X[1];
+    uint32_t *pFSM_3 = (uint32_t *) &ctx->FSM_X[2];
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (JOB_INITIALIZED(state, i)) {
+            pLFSR_0[i] = args->LFSR_0[i];
+            pLFSR_1[i] = args->LFSR_1[i];
+            pLFSR_2[i] = args->LFSR_2[i];
+            pLFSR_3[i] = args->LFSR_3[i];
+            pLFSR_4[i] = args->LFSR_4[i];
+            pLFSR_5[i] = args->LFSR_5[i];
+            pLFSR_6[i] = args->LFSR_6[i];
+            pLFSR_7[i] = args->LFSR_7[i];
+            pLFSR_8[i] = args->LFSR_8[i];
+            pLFSR_9[i] = args->LFSR_9[i];
+            pLFSR_10[i] = args->LFSR_10[i];
+            pLFSR_11[i] = args->LFSR_11[i];
+            pLFSR_12[i] = args->LFSR_12[i];
+            pLFSR_13[i] = args->LFSR_13[i];
+            pLFSR_14[i] = args->LFSR_14[i];
+            pLFSR_15[i] = args->LFSR_15[i];
+            pFSM_1[i] = args->FSM_1[i];
+            pFSM_2[i] = args->FSM_2[i];
+            pFSM_3[i] = args->FSM_3[i];
+        }
+    }
+}
+
+IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state,
+                                       IMB_JOB *job)
+{
+    MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uea2_ooo;
+    uint32_t msg_bitlen = job->msg_len_to_cipher_in_bits;
+    uint32_t msg_bitoff = job->cipher_start_src_offset_in_bits;
+
+    /* Use bit length API if
+     * - msg length is not a multiple of bytes
+     * - bit offset is not a multiple of bytes
+     */
+    if ((msg_bitlen & 0x07) || (msg_bitoff & 0x07)) {
+        IMB_SNOW3G_F8_1_BUFFER_BIT(state, job->enc_keys, job->iv, job->src,
+                                   job->dst, msg_bitlen, msg_bitoff);
+        job->status |= IMB_STATUS_COMPLETED_CIPHER;
+        return job;
+    }
+
+    IMB_JOB *ret = NULL;
+
+    snow3g_mb_mgr_insert_uea2_job(snow3g_state, job);
+
+    ret = snow3g_mb_mgr_free_uea2_job(snow3g_state);
+    if (ret != NULL)
+        return ret;
+
+    if (snow3g_state->unused_lanes != 0xff)
+        return NULL;
+
+    uint32_t min_word_len = UINT32_MAX;
+    snow3gKeyState4_t ctx;
+    SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, snow3g_state->args.keys[0], snow3g_state->args.keys[1],
+                                  snow3g_state->args.keys[2], snow3g_state->args.keys[3],
+                                  snow3g_state->args.iv[0],snow3g_state->args.iv[1],
+                                  snow3g_state->args.iv[2],snow3g_state->args.iv[3]);
+
+    cpy_snow3g_state_to_ctx_after_initialize(&ctx, snow3g_state);
+
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (JOB_NOT_INITIALIZED(snow3g_state, i)) {
+            snow3g_state->args.INITIALIZED[i] = 1;
+        }
+        min_word_len = (min_word_len < snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES) ?
+                       min_word_len : snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES;
+    }
+
+    SNOW3G_F8_4_BUFFER_STREAM(&ctx,
+                              snow3g_state->args.in[0],snow3g_state->args.out[0],
+                              snow3g_state->args.in[1],snow3g_state->args.out[1],
+                              snow3g_state->args.in[2],snow3g_state->args.out[2],
+                              snow3g_state->args.in[3],snow3g_state->args.out[3],
+                              min_word_len * SNOW3G_4_BYTES);
+
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        snow3g_state->args.in[i] = (uint8_t *)snow3g_state->args.in[i] +
+                                   min_word_len * SNOW3G_4_BYTES;
+        snow3g_state->args.out[i] = (uint8_t *)snow3g_state->args.out[i] +
+                                    min_word_len * SNOW3G_4_BYTES;
+        snow3g_state->args.byte_length[i] -= min_word_len * SNOW3G_4_BYTES;
+    }
+
+    cpy_snow3g_ctx_to_state_after_stream(snow3g_state, &ctx);
+
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        //if less than one word left, finish job here.
+        if (snow3g_state->args.byte_length[i] < SNOW3G_4_BYTES &&
+            snow3g_state->args.byte_length[i] != 0) {
+            snow3gKeyState1_t ctx_1;
+            cpy_snow3g_state_to_ctx_1(&ctx_1, snow3g_state, i);
+            SNOW3G_F8_1_BUFFER_STREAM(&ctx_1, snow3g_state->args.in[i],
+                                      snow3g_state->args.out[i],
+                                      snow3g_state->args.byte_length[i]);
+            snow3g_state->args.byte_length[i] = 0;
+        }
+    }
+
+    ret = snow3g_mb_mgr_free_uea2_job(snow3g_state);
+
+#ifdef SAFE_DATA
+    //data has been cleard in snow3g_mb_mgr_free_uea2_job.
+#endif
+
+    return ret;
+}
+
+IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state)
+{
+    IMB_JOB *ret = NULL;
+    MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uea2_ooo;
+    ret = snow3g_mb_mgr_free_uea2_job(snow3g_state);
+
+    if (ret != NULL) {
+        return ret;
+    }
+
+    for (int i = 0; i <= SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (snow3g_state->job_in_lane[i] != NULL && snow3g_state->lens[i] != 0) {
+            ret = snow3g_state->job_in_lane[i];
+
+            if (JOB_NOT_INITIALIZED(snow3g_state, i)) {
+                //if not initialized
+                IMB_SNOW3G_F8_1_BUFFER(state, snow3g_state->args.keys[i],
+                                       snow3g_state->args.iv[i],
+                                       snow3g_state->args.in[i],
+                                       snow3g_state->args.out[i],
+                                       snow3g_state->args.byte_length[i]);
+            } else {
+                snow3gKeyState1_t ctx;
+                cpy_snow3g_state_to_ctx_1(&ctx, snow3g_state, i);
+                SNOW3G_F8_1_BUFFER_STREAM(&ctx, snow3g_state->args.in[i],
+                                          snow3g_state->args.out[i],
+                                          snow3g_state->args.byte_length[i]);
+            }
+
+            ret->status |= IMB_STATUS_COMPLETED_CIPHER;
+            snow3g_state->lens[i] = 0;
+            snow3g_state->job_in_lane[i] = NULL;
+            snow3g_state->unused_lanes = snow3g_state->unused_lanes << 8;
+            snow3g_state->unused_lanes |= i;
+            snow3g_state->args.byte_length[i] = 0;
+            snow3g_state->args.INITIALIZED[i] = 0;
+#ifdef SAFE_DATA
+            snow3g_state->args.LFSR_0[i] = 0;
+            snow3g_state->args.LFSR_1[i] = 0;
+            snow3g_state->args.LFSR_2[i] = 0;
+            snow3g_state->args.LFSR_3[i] = 0;
+            snow3g_state->args.LFSR_4[i] = 0;
+            snow3g_state->args.LFSR_5[i] = 0;
+            snow3g_state->args.LFSR_6[i] = 0;
+            snow3g_state->args.LFSR_7[i] = 0;
+            snow3g_state->args.LFSR_8[i] = 0;
+            snow3g_state->args.LFSR_9[i] = 0;
+            snow3g_state->args.LFSR_10[i] = 0;
+            snow3g_state->args.LFSR_11[i] = 0;
+            snow3g_state->args.LFSR_12[i] = 0;
+            snow3g_state->args.LFSR_13[i] = 0;
+            snow3g_state->args.LFSR_14[i] = 0;
+            snow3g_state->args.LFSR_15[i] = 0;
+            snow3g_state->args.FSM_1[i] = 0;
+            snow3g_state->args.FSM_2[i] = 0;
+            snow3g_state->args.FSM_3[i] = 0;
+#endif
+            return ret;
+        }
+    }
+    return NULL;
+}
+
+IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state,
+                                       IMB_JOB *job)
+{
+    MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo;
+
+    IMB_JOB *ret = NULL;
+
+    snow3g_mb_mgr_insert_uia2_job(snow3g_state, job);
+
+    if (snow3g_state->unused_lanes != 0xff)
+        return NULL;
+
+    if (snow3g_state->init_done == 0) {
+        //all lanes are not initialized.
+        snow3gKeyState4_t ctx;
+        SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx,
+                                      snow3g_state->args.keys[0], snow3g_state->args.keys[1],
+                                      snow3g_state->args.keys[2], snow3g_state->args.keys[3],
+                                      snow3g_state->args.iv[0],snow3g_state->args.iv[1],
+                                      snow3g_state->args.iv[2],snow3g_state->args.iv[3]);
+        SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx,
+                                     &snow3g_state->ks[0*5],
+                                     &snow3g_state->ks[1*5],
+                                     &snow3g_state->ks[2*5],
+                                     &snow3g_state->ks[3*5]);
+        snow3g_state->init_done = INIT_ALL_DONE;
+    }
+
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (snow3g_state->init_done & (1 << i)) {
+            //pick a initialized lane
+            SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i],
+                                      snow3g_state->lens[i], snow3g_state->args.out[i]);
+            ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i);
+            break;
+        }
+    }
+    return ret;
+}
+
+IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state)
+{
+    IMB_JOB *ret = NULL;
+    MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo;
+
+    if (snow3g_state->num_lanes_inuse == 0) {
+        //empty
+        return NULL;
+    }
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (snow3g_state->init_done & (1<<i)) {
+            //pick a initialized lane
+            SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i],
+                                      snow3g_state->lens[i], snow3g_state->args.out[i]);
+            ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i);
+            return ret;
+        }
+    }
+    int lane_idx;
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        if (!JOB_IS_NULL(snow3g_state, i)) {
+            snow3g_state->init_done |= (1<<i);
+            lane_idx = i;
+        }
+    }
+    for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) {
+        //copy keys and ivs to empty lane
+        if (JOB_IS_NULL(snow3g_state, i)) {
+            snow3g_state->args.keys[i] = snow3g_state->args.keys[lane_idx];
+            snow3g_state->args.iv[i] = snow3g_state->args.iv[lane_idx];
+        }
+    }
+
+    snow3gKeyState4_t ctx;
+    SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx,
+                                  snow3g_state->args.keys[0], snow3g_state->args.keys[1],
+                                  snow3g_state->args.keys[2], snow3g_state->args.keys[3],
+                                  snow3g_state->args.iv[0],snow3g_state->args.iv[1],
+                                  snow3g_state->args.iv[2],snow3g_state->args.iv[3]);
+    SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx,
+                                 &snow3g_state->ks[0*5],
+                                 &snow3g_state->ks[1*5],
+                                 &snow3g_state->ks[2*5],
+                                 &snow3g_state->ks[3*5]);
+    //pick a initialized lane
+    SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[lane_idx*5], snow3g_state->args.in[lane_idx],
+                              snow3g_state->lens[lane_idx], snow3g_state->args.out[lane_idx]);
+    ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, lane_idx);
+    return ret;
+}
+
+#endif //MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H
diff --git a/lib/aarch64/snow3g_aarch64.c b/lib/aarch64/snow3g_aarch64.c
index 6ff912bb4460d5fa0f6925dff70549cf4cb9a385..4b4172fd6f2380a05fd592295c4ac8d8ef2fa986 100644
--- a/lib/aarch64/snow3g_aarch64.c
+++ b/lib/aarch64/snow3g_aarch64.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -32,10 +32,16 @@
 #define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64
 #define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64
 #define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64
+#define SNOW3G_F8_4_BUFFER_MULTIKEY snow3g_f8_4_buffer_multikey_aarch64
 #define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64
 #define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64
 #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64
 #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64
 #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64
+#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64
+#define SNOW3G_F8_4_BUFFER_STREAM  snow3g_f8_4_buffer_stream_aarch64
+#define SNOW3G_F8_1_BUFFER_STREAM  snow3g_f8_1_buffer_stream_aarch64
+#define SNOW3G_F9_1_BUFFER_DIGEST  snow3g_f9_1_buffer_digest_aarch64
+#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64
 
 #include "snow3g_common_aarch64.h"
diff --git a/lib/aarch64/snow3g_aarch64_no_aesni.c b/lib/aarch64/snow3g_aarch64_no_aesni.c
index fbc861b23eddfc3373e838a67b5c8c885f52c9b7..f5a9e589bd07fc3f240ac2894efcfa48d0a9768f 100644
--- a/lib/aarch64/snow3g_aarch64_no_aesni.c
+++ b/lib/aarch64/snow3g_aarch64_no_aesni.c
@@ -1,5 +1,5 @@
 /**********************************************************************
-  Copyright(c) 2021 Arm Corporation All rights reserved.
+  Copyright(c) 2021-2022 Arm Corporation All rights reserved.
 
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@@ -34,10 +34,16 @@
 #define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64_no_aesni
 #define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64_no_aesni
 #define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64_no_aesni
+#define SNOW3G_F8_4_BUFFER_MULTIKEY snow3g_f8_4_buffer_multikey_aarch64_no_aesni
 #define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64_no_aesni
 #define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64_no_aesni
 #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_no_aesni
 #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_no_aesni
 #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_no_aesni
+#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni
+#define SNOW3G_F8_4_BUFFER_STREAM  snow3g_f8_4_buffer_stream_aarch64_no_aesni
+#define SNOW3G_F8_1_BUFFER_STREAM  snow3g_f8_1_buffer_stream_aarch64_no_aesni
+#define SNOW3G_F9_1_BUFFER_DIGEST  snow3g_f9_1_buffer_digest_aarch64_no_aesni
+#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64_no_aesni
 
 #include "snow3g_common_aarch64.h"
diff --git a/lib/aarch64/snow3g_common_aarch64.h b/lib/aarch64/snow3g_common_aarch64.h
index d8da71edd1fbaf9bb3da8a11e9a9051e89a73eb1..f404924250285037b962e2d606de3cee31eb9939 100644
--- a/lib/aarch64/snow3g_common_aarch64.h
+++ b/lib/aarch64/snow3g_common_aarch64.h
@@ -34,13 +34,14 @@
 #include <stdint.h>
 
 #include "ipsec-mb.h"
+#include "snow3g_internal.h"
 #include "include/wireless_common.h"
 #include "snow3g.h"
 #include "snow3g_tables.h"
 #include "constant_lookup_aarch64.h"
 #include "clear_regs_mem_aarch64.h"
 #ifdef NO_AESNI
-#include "include/aesni_emu.h"
+#include "aesni_emu_aarch64.h"
 #endif
 #ifdef SAFE_PARAM
 #include "include/error.h"
@@ -49,258 +50,6 @@
 #define CLEAR_MEM clear_mem
 #define CLEAR_VAR clear_var
 
-#define MAX_KEY_LEN (16)
-#define SNOW3G_4_BYTES (4)
-#define SNOW3G_8_BYTES (8)
-#define SNOW3G_8_BITS (8)
-#define SNOW3G_16_BYTES (16)
-#define SNOW3G_16_BITS (16)
-
-#define SNOW3G_BLOCK_SIZE (8)
-
-#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */
-#define SNOW3G_IV_LEN_IN_BYTES (16)  /* 128b */
-
-#define SNOW3GCONSTANT (0x1b)
-
-/* Range of input data for SNOW3G is from 1 to 2^32 bits */
-#define SNOW3G_MIN_LEN 1
-#define SNOW3G_MAX_BITLEN (UINT32_MAX)
-#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8)
-
-typedef union SafeBuffer {
-        uint64_t b64;
-        uint32_t b32[2];
-        uint8_t b8[SNOW3G_8_BYTES];
-} SafeBuf;
-
-typedef struct snow3gKeyState1_s {
-        /* 16 LFSR stages */
-        uint32_t LFSR_S[16];
-        /* 3 FSM states */
-        uint32_t FSM_R3;
-        uint32_t FSM_R2;
-        uint32_t FSM_R1;
-} DECLARE_ALIGNED(snow3gKeyState1_t, 16);
-
-typedef struct snow3gKeyState4_s {
-        /* 16 LFSR stages */
-        uint32x4_t LFSR_X[16];
-        /* 3 FSM states */
-        uint32x4_t FSM_X[3];
-        uint32_t iLFSR_X;
-} snow3gKeyState4_t;
-
-/**
- * @brief Finds minimum 32-bit value in an array
- * @return Min 32-bit value
- */
-static inline uint32_t
-length_find_min(const uint32_t *out_array, const size_t dim_array)
-{
-        size_t i;
-        uint32_t min = 0;
-
-        if (dim_array > 0)
-                min  = out_array[0];
-
-        for (i = 1; i < dim_array; i++)
-                if (out_array[i] < min)
-                        min = out_array[i];
-
-        return min;
-}
-
-/**
- * @brief Subtracts \a subv from a vector of 32-bit words
- */
-static inline void
-length_sub(uint32_t *out_array, const size_t dim_array, const uint32_t subv)
-{
-        size_t i;
-
-        for (i = 0; i < dim_array; i++)
-                out_array[i] -= subv;
-}
-
-#ifdef SAFE_PARAM
-/**
- * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values
- * @retval 0 incorrect length value found
- * @retval 1 all OK
- */
-static inline uint32_t
-length_check(const uint32_t *out_array, const size_t dim_array)
-{
-        size_t i;
-
-        if (out_array == NULL) {
-                imb_set_errno(NULL, IMB_ERR_CIPH_LEN);
-                return 0;
-        }
-
-        for (i = 0; i < dim_array; i++) {
-                if ((out_array[i] == 0) ||
-                    (out_array[i] > SNOW3G_MAX_BYTELEN)) {
-                        imb_set_errno(NULL, IMB_ERR_CIPH_LEN);
-                        return 0;
-                    }
-        }
-
-        return 1;
-}
-#endif
-
-/**
- * @brief Copies 4 32-bit length values into an array
- */
-static inline void
-length_copy_4(uint32_t *out_array,
-              const uint32_t length1, const uint32_t length2,
-              const uint32_t length3, const uint32_t length4)
-{
-        out_array[0] = length1;
-        out_array[1] = length2;
-        out_array[2] = length3;
-        out_array[3] = length4;
-}
-
-/**
- * @brief Copies 8 32-bit length values into an array
- */
-static inline void
-length_copy_8(uint32_t *out_array,
-              const uint32_t length1, const uint32_t length2,
-              const uint32_t length3, const uint32_t length4,
-              const uint32_t length5, const uint32_t length6,
-              const uint32_t length7, const uint32_t length8)
-{
-        out_array[0] = length1;
-        out_array[1] = length2;
-        out_array[2] = length3;
-        out_array[3] = length4;
-        out_array[4] = length5;
-        out_array[5] = length6;
-        out_array[6] = length7;
-        out_array[7] = length8;
-}
-
-#ifdef SAFE_PARAM
-/**
- * @brief Checks vector of pointers against NULL
- * @retval 0 incorrect pointer found
- * @retval 1 all OK
- */
-static inline int
-ptr_check(void *out_array[], const size_t dim_array, const int errnum)
-{
-        size_t i;
-
-        if (out_array == NULL) {
-                imb_set_errno(NULL, errnum);
-                return 0;
-        }
-        for (i = 0; i < dim_array; i++)
-                if (out_array[i] == NULL) {
-                        imb_set_errno(NULL, errnum);
-                        return 0;
-                }
-        return 1;
-}
-#endif
-
-#ifdef SAFE_PARAM
-/**
- * @brief Checks vector of const pointers against NULL
- * @retval 0 incorrect pointer found
- * @retval 1 all OK
- */
-static inline int
-cptr_check(const void * const out_array[],
-           const size_t dim_array,
-           const int errnum)
-{
-        size_t i;
-
-        if (out_array == NULL) {
-                imb_set_errno(NULL, errnum);
-                return 0;
-        }
-        for (i = 0; i < dim_array; i++)
-                if (out_array[i] == NULL) {
-                        imb_set_errno(NULL, errnum);
-                        return 0;
-                }
-
-        return 1;
-}
-#endif
-
-/**
- * @brief Copies 4 pointers into an array
- */
-static inline void
-ptr_copy_4(void *out_array[],
-           void *ptr1, void *ptr2, void *ptr3, void *ptr4)
-{
-        out_array[0] = ptr1;
-        out_array[1] = ptr2;
-        out_array[2] = ptr3;
-        out_array[3] = ptr4;
-}
-
-/**
- * @brief Copies 4 const pointers into an array
- */
-static inline void
-cptr_copy_4(const void *out_array[],
-            const void *ptr1, const void *ptr2,
-            const void *ptr3, const void *ptr4)
-{
-        out_array[0] = ptr1;
-        out_array[1] = ptr2;
-        out_array[2] = ptr3;
-        out_array[3] = ptr4;
-}
-
-/**
- * @brief Copies 8 pointers into an array
- */
-static inline void
-ptr_copy_8(void *out_array[],
-           void *ptr1, void *ptr2, void *ptr3, void *ptr4,
-           void *ptr5, void *ptr6, void *ptr7, void *ptr8)
-{
-        out_array[0] = ptr1;
-        out_array[1] = ptr2;
-        out_array[2] = ptr3;
-        out_array[3] = ptr4;
-        out_array[4] = ptr5;
-        out_array[5] = ptr6;
-        out_array[6] = ptr7;
-        out_array[7] = ptr8;
-}
-
-/**
- * @brief Copies 8 const pointers into an array
- */
-static inline void
-cptr_copy_8(const void *out_array[],
-            const void *ptr1, const void *ptr2,
-            const void *ptr3, const void *ptr4,
-            const void *ptr5, const void *ptr6,
-            const void *ptr7, const void *ptr8)
-{
-        out_array[0] = ptr1;
-        out_array[1] = ptr2;
-        out_array[2] = ptr3;
-        out_array[3] = ptr4;
-        out_array[4] = ptr5;
-        out_array[5] = ptr6;
-        out_array[6] = ptr7;
-        out_array[7] = ptr8;
-}
-
 /**
  * @brief Wrapper for safe lookup of 16 indexes in 256x8-bit table
  * @param[in] indexes  vector of 16x8-bit indexes to be looked up
@@ -324,6 +73,27 @@ static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx)
                 pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2];
 }
 
+#ifdef NO_AESNI
+static inline void emulate_AESENC_WITHOUT_SHIFTROW(union xmm_reg *dst,
+                                                   const union xmm_reg *key)
+{
+        union xmm_reg tmp = *dst;
+
+        substitute_bytes(&tmp, &tmp);
+        mix_columns(&tmp, &tmp);
+        xor_xmm(dst, &tmp, key);
+}
+
+static inline void emulate_AESENCLAST_WITHOUT_SHIFTROW(union xmm_reg *dst,
+                                                       const union xmm_reg *src)
+{
+        union xmm_reg tmp = *dst;
+
+        substitute_bytes(&tmp, &tmp);
+        xor_xmm(dst, &tmp, src);
+}
+#endif
+
 /**
  * @brief SNOW3G S2 mix column correction function
  *
@@ -408,7 +178,7 @@ static inline uint32_t S1_box(const uint32_t x)
         v.dword[0] = v.dword[1] =
                 v.dword[2] = v.dword[3] = x;
 
-        emulate_AESENC(&v, &key);
+        emulate_AESENC_WITHOUT_SHIFTROW(&v, &key);
         return v.dword[0];
 #else
         uint32x4_t dup_x;
@@ -426,8 +196,8 @@ static inline uint32_t S1_box(const uint32_t x)
 /**
  * @brief Sbox S1 maps a 2x32bit input to a 2x32bit output
  *
- * @param[in] x1  32-bit word to be passed through S1 box
- * @param[in] x2  32-bit word to be passed through S1 box
+ * @param[in/out] x1  32-bit word to be passed through S1 box
+ * @param[in/out] x2  32-bit word to be passed through S1 box
  */
 static inline void S1_box_2(uint32_t *x1, uint32_t *x2)
 {
@@ -462,69 +232,27 @@ static inline void S1_box_2(uint32_t *x1, uint32_t *x2)
 static inline uint32x4_t S1_box_4(const uint32x4_t x)
 {
 #ifdef NO_AESNI
-        union xmm_reg key, v, vt;
+        union xmm_reg key, v;
 
+        v.dword[0] = vgetq_lane_u32(x, 0);
+        v.dword[1] = vgetq_lane_u32(x, 1);
+        v.dword[2] = vgetq_lane_u32(x, 2);
+        v.dword[3] = vgetq_lane_u32(x, 3);
         key.qword[0] = key.qword[1] = 0;
 
-        /*
-         * - Broadcast 32-bit word across XMM
-         * - Perform AES operations
-         */
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 0);
-        emulate_AESENC(&vt, &key);
-        v.dword[0] = vt.dword[0];
-
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 1);
-        emulate_AESENC(&vt, &key);
-        v.dword[1] = vt.dword[0];
-
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 2);
-        emulate_AESENC(&vt, &key);
-        v.dword[2] = vt.dword[0];
-
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 3);
-        emulate_AESENC(&vt, &key);
-        v.dword[3] = vt.dword[0];
-
+        emulate_AESENC_WITHOUT_SHIFTROW(&v, &key);
         return vld1q_u32(&v.dword[0]);
 #else
-        const uint8x16_t m_zero = vdupq_n_u8(0);
-        uint8x16_t m1, m2, m3, m4;
-        uint32x4_t r1, r2;
-
-        m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 0)));
-        m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 1)));
-        m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 2)));
-        m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 3)));
+        const uint8x16_t inv_aes_shift_row = {0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
+                                              0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03};
+        uint8x16_t new_x = vreinterpretq_u8_u32(x);
+        new_x = vqtbl1q_u8(new_x, inv_aes_shift_row);
 
-        m1 = vaeseq_u8(m1, m_zero);
-        m1 = vaesmcq_u8(m1);
-        m2 = vaeseq_u8(m2, m_zero);
-        m2 = vaesmcq_u8(m2);
-        m3 = vaeseq_u8(m3, m_zero);
-        m3 = vaesmcq_u8(m3);
-        m4 = vaeseq_u8(m4, m_zero);
-        m4 = vaesmcq_u8(m4);
-
-        /*
-         * Put results of AES operations back into
-         * two vectors of 32-bit words
-         *
-         * First step:
-         * r1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ]
-         * r2 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ]
-         */
-        r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2));
-        r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4));
-        r1 = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
-                                                vreinterpretq_u64_u32(r2)));
-        /*
-         * The last step:
-         * r1 = [ 0-63 m1 | 0-63 m3 ] =>
-         *      [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ]
-         */
-
-        return r1;
+        const uint8x16_t m_zero = vdupq_n_u8(0);
+        new_x = vaeseq_u8(new_x, m_zero);
+        new_x = vaesmcq_u8(new_x);
+        uint32x4_t ret = vreinterpretq_u32_u8(new_x);
+        return ret;
 #endif
 }
 
@@ -552,8 +280,8 @@ static inline uint32_t S2_box(const uint32_t x)
 
         v_fixup = v;
 
-        emulate_AESENC(&v, &key);
-        emulate_AESENCLAST(&v_fixup, &key);
+        emulate_AESENC_WITHOUT_SHIFTROW(&v, &key);
+        emulate_AESENCLAST_WITHOUT_SHIFTROW(&v_fixup, &key);
 
         const uint8x16_t ret_mixc = vreinterpretq_u8_u32(
                                 vld1q_u32(&v.dword[0]));
@@ -656,96 +384,33 @@ static inline uint32x4_t S2_box_4(const uint32x4_t x)
 
         /* use AESNI operations for the rest of the S2 box */
 #ifdef NO_AESNI
-        union xmm_reg key, v, f;
-        union xmm_reg vt, ft;
-
+        union xmm_reg key, vt, ft;
         key.qword[0] = key.qword[1] = 0;
 
-        /*
-         * - Broadcast 32-bit word across XMM and
-         *   perform AES operations
-         * - Save result 32-bit words in v and f vectors.
-         *   'f' is used for fix-up of mixed columns only
-         */
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] =
-                                                vgetq_lane_u32(new_x, 0);
-        ft = vt;
-        emulate_AESENC(&vt, &key);
-        emulate_AESENCLAST(&ft, &key);
-        v.dword[0] = vt.dword[0];
-        f.dword[0] = ft.dword[0];
-
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] =
-                                                vgetq_lane_u32(new_x, 1);
-        ft = vt;
-        emulate_AESENC(&vt, &key);
-        emulate_AESENCLAST(&ft, &key);
-        v.dword[1] = vt.dword[0];
-        f.dword[1] = ft.dword[0];
-
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] =
-                                                vgetq_lane_u32(new_x, 2);
+        vt.dword[0] = vgetq_lane_u32(new_x, 0);
+        vt.dword[1] = vgetq_lane_u32(new_x, 1);
+        vt.dword[2] = vgetq_lane_u32(new_x, 2);
+        vt.dword[3] = vgetq_lane_u32(new_x, 3);
         ft = vt;
-        emulate_AESENC(&vt, &key);
-        emulate_AESENCLAST(&ft, &key);
-        v.dword[2] = vt.dword[0];
-        f.dword[2] = ft.dword[0];
 
-        vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] =
-                                                vgetq_lane_u32(new_x, 3);
-        ft = vt;
-        emulate_AESENC(&vt, &key);
-        emulate_AESENCLAST(&ft, &key);
-        v.dword[3] = vt.dword[0];
-        f.dword[3] = ft.dword[0];
+        emulate_AESENC_WITHOUT_SHIFTROW(&vt, &key);
+        emulate_AESENCLAST_WITHOUT_SHIFTROW(&ft, &key);
 
-        return s2_mixc_fixup_4(vreinterpretq_u8_u32(vld1q_u32(&f.dword[0])),
-                                vreinterpretq_u8_u32(vld1q_u32(&v.dword[0])));
+        return s2_mixc_fixup_4(vreinterpretq_u8_u32(vld1q_u32(&ft.dword[0])),
+                                vreinterpretq_u8_u32(vld1q_u32(&vt.dword[0])));
 #else
-        const uint8x16_t zero = vdupq_n_u8(0);
-        uint8x16_t m1, m2, m3, m4, f1, f2, f3, f4, mixc, no_mixc;
-        uint32x4_t r1, r2;
+        const uint8x16_t inv_aes_shift_row = {0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b,
+                                              0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03};
+        uint8x16_t new_new_x = vreinterpretq_u8_u32(new_x);
+        new_new_x = vqtbl1q_u8(new_new_x, inv_aes_shift_row);
 
-        m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 0)));
-        m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 1)));
-        m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 2)));
-        m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 3)));
-
-        f1 = vaeseq_u8(m1, zero); // no_mixc
-        m1 = vaesmcq_u8(f1);
-        f2 = vaeseq_u8(m2, zero);
-        m2 = vaesmcq_u8(f2);
-        f3 = vaeseq_u8(m3, zero);
-        m3 = vaesmcq_u8(f3);
-        f4 = vaeseq_u8(m4, zero);
-        m4 = vaesmcq_u8(f4);
-
-        /*
-         * Put results of AES operations back into
-         * two vectors of 32-bit words
-         *
-         * First step:
-         * m1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ]
-         * m3 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ]
-         */
-        /*
-         * The last step:
-         * m1 = [ 0-63 m1 | 0-63 m3 ] =>
-         *      [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ]
-         * f1 = [ 0-63 f1 | 0-63 f3 ] =>
-         *      [ 0-31 f1 | 0-31 f2 | 0-31 f3 | 0-31 f4 ]
-         */
-        r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2));
-        r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4));
-        mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
-                                                vreinterpretq_u64_u32(r2)));
+        const uint8x16_t m_zero = vdupq_n_u8(0);
+        uint8x16_t ret_nomixc, ret_mixc;
 
-        r1 = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2));
-        r2 = vzip1q_u32(vreinterpretq_u32_u8(f3), vreinterpretq_u32_u8(f4));
-        no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
-                                                 vreinterpretq_u64_u32(r2)));
+        ret_nomixc = vaeseq_u8(new_new_x, m_zero);
+        ret_mixc = vaesmcq_u8(ret_nomixc);
 
-        return s2_mixc_fixup_4(no_mixc, mixc);
+        return s2_mixc_fixup_4(ret_nomixc, ret_mixc);
 #endif
 }
 
@@ -1234,8 +899,8 @@ snow3gStateInitialize_1(snow3gKeyState1_t *pCtx,
 /**
  * @brief Generates 5 words of key stream used in the initial stages of F9.
  *
- * @param[in]     pCtx        Context where the scheduled keys are stored
- * @param[in/out] pKeyStream  Pointer to the generated keystream
+ * @param[in/out] pCtx        Context where the scheduled keys are stored
+ * @param[out]    pKeyStream  Pointer to the generated keystream
  */
 static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx,
                                              uint32_t *pKeyStream)
@@ -1253,7 +918,7 @@ static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx,
 
 /**
  * @brief LFSR array shift by one (4 lanes)
- * @param[in]     pCtx       Context where the scheduled keys are stored
+ * @param[in/out]     pCtx       Context where the scheduled keys are stored
  */
 static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
 {
@@ -1331,7 +996,7 @@ static inline uint32x4_t C0_C11_4(const uint32x4_t L0, const uint32x4_t L11)
  *       ^ table_Alpha_mul[LFSR[0] >> 24]
  *       ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
  *
- * @param[in]     pCtx       Context where the scheduled keys are stored
+ * @param[in/out] pCtx       Context where the scheduled keys are stored
  */
 static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
 {
@@ -1357,7 +1022,7 @@ static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
  *
  * It operates on 4 packets/lanes at a time
  *
- * @param[in]     pCtx       Context where the scheduled keys are stored
+ * @param[in/out] pCtx       Context where the scheduled keys are stored
  * @return 4 x 4bytes of key stream
  */
 static inline uint32x4_t ClockFSM_4(snow3gKeyState4_t *pCtx)
@@ -1381,7 +1046,7 @@ static inline uint32x4_t ClockFSM_4(snow3gKeyState4_t *pCtx)
 /**
  * @brief Generates 4 bytes of key stream 1 buffer at a time
  *
- * @param[in]     pCtx       Context where the scheduled keys are stored
+ * @param[in/out] pCtx       Context where the scheduled keys are stored
  * @return 4 bytes of key stream
  */
 static inline uint32_t snow3g_keystream_1_4(snow3gKeyState1_t *pCtx)
@@ -1396,7 +1061,7 @@ static inline uint32_t snow3g_keystream_1_4(snow3gKeyState1_t *pCtx)
 /**
  * @brief Generates 8 bytes of key stream for 1 buffer at a time
  *
- * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in/out] pCtx Context where the scheduled keys are stored
  * @return 8 bytes of a key stream
  */
 static inline uint64_t snow3g_keystream_1_8(snow3gKeyState1_t *pCtx)
@@ -1472,8 +1137,7 @@ static inline uint64_t snow3g_keystream_1_8(snow3gKeyState1_t *pCtx)
 /**
  * @brief Generates 4 bytes of key stream 4 buffers at a time
  *
- * @param[in]      pCtx         Context where the scheduled keys are stored
- * @param[in/out]  pKeyStream   Pointer to generated key stream
+ * @param[in/out]  pCtx         Context where the scheduled keys are stored
  */
 static inline uint32x4_t snow3g_keystream_4_4(snow3gKeyState4_t *pCtx)
 {
@@ -1487,9 +1151,9 @@ static inline uint32x4_t snow3g_keystream_4_4(snow3gKeyState4_t *pCtx)
 /**
  * @brief Generates 8 bytes of key stream 4 buffers at a time
  *
- * @param[in]      pCtx         Context where the scheduled keys are stored
- * @param[in/out]  pKeyStreamLo Pointer to lower end of generated key stream
- * @param[in/out]  pKeyStreamHi Pointer to higher end of generated key stream
+ * @param[in/out]  pCtx         Context where the scheduled keys are stored
+ * @param[out]     pKeyStreamLo Pointer to lower end of generated key stream
+ * @param[out]     pKeyStreamHi Pointer to higher end of generated key stream
  */
 static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
                                         uint32x4_t *pKeyStreamLo,
@@ -1570,8 +1234,8 @@ static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
 /**
  * @brief Generates 16 bytes of key stream 4 buffers at a time
  *
- * @param[in]     pCtx         Context where the scheduled keys are stored
- * @param[in/out] pKeyStream   Pointer to store generated key stream
+ * @param[in/out] pCtx         Context where the scheduled keys are stored
+ * @param[out]    pKeyStream   Pointer to store generated key stream
  */
 static inline void snow3g_keystream_4_16(snow3gKeyState4_t *pCtx,
                                          uint32x4_t pKeyStream[4])
@@ -1612,7 +1276,7 @@ static inline void snow3g_keystream_4_16(snow3gKeyState4_t *pCtx,
 /**
  * @brief Initializes the key schedule for 4 buffers for SNOW3G f8/f9.
  *
- * @param [in]      pCtx        Context where the scheduled keys are stored
+ * @param [in/out]  pCtx        Context where the scheduled keys are stored
  * @param [in]      pKeySched   Key schedule
  * @param [in]      pIV1        IV for buffer 1
  * @param [in]      pIV2        IV for buffer 2
@@ -1633,7 +1297,7 @@ snow3gStateInitialize_4(snow3gKeyState4_t *pCtx,
 
         /* Load complete 128b IV into register */
         static const uint64_t sm[2] = {
-                0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL
+                0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL
         };
 
         R = vld1q_u32(pIV1);
@@ -1706,6 +1370,112 @@ snow3gStateInitialize_4(snow3gKeyState4_t *pCtx,
         }
 }
 
+/**
+ * @brief Initializes the four key schedule for 4 buffers for SNOW3G f8/f9.
+ *
+ * @param [in/out]  pCtx        Context where the scheduled keys are stored
+ * @param [in]      pKeySched1  Key1 schedule
+ * @param [in]      pKeySched2  Key2 schedule
+ * @param [in]      pKeySched3  Key3 schedule
+ * @param [in]      pKeySched4  Key4 schedule
+ * @param [in]      pIV1        IV for buffer 1
+ * @param [in]      pIV2        IV for buffer 2
+ * @param [in]      pIV3        IV for buffer 3
+ * @param [in]      pIV4        IV for buffer 4
+ */
+static inline void
+snow3gStateInitialize_4_multikey(snow3gKeyState4_t *pCtx,
+                                 const snow3g_key_schedule_t *pKeySched1,
+                                 const snow3g_key_schedule_t *pKeySched2,
+                                 const snow3g_key_schedule_t *pKeySched3,
+                                 const snow3g_key_schedule_t *pKeySched4,
+                                 const void *pIV1, const void *pIV2,
+                                 const void *pIV3, const void *pIV4)
+{
+        uint32x4_t R, S, T, U;
+        uint32x4_t T0, T1;
+        int i;
+
+        /* Initialize the LFSR table from constants, Keys, and IV */
+
+        /* Load complete 128b IV into register */
+        static const uint64_t sm[2] = {
+                0x0405060700010203ULL, 0x0c0d0e0f08090a0bULL
+        };
+
+        R = vld1q_u32(pIV1);
+        S = vld1q_u32(pIV2);
+        T = vld1q_u32(pIV3);
+        U = vld1q_u32(pIV4);
+
+        uint32x4_t VK[4];
+        uint32x4_t VL[4];
+        /* initialize the array block */
+        for (i = 0; i < 4; i++) {
+                VK[i] = vsetq_lane_u32(pKeySched1->k[i], VK[i], 0);
+                VK[i] = vsetq_lane_u32(pKeySched2->k[i], VK[i], 1);
+                VK[i] = vsetq_lane_u32(pKeySched3->k[i], VK[i], 2);
+                VK[i] = vsetq_lane_u32(pKeySched4->k[i], VK[i], 3);
+                VL[i] = ~VK[i];
+
+                pCtx->LFSR_X[i + 4] =
+                        pCtx->LFSR_X[i + 12] = VK[i];
+                pCtx->LFSR_X[i + 0] =
+                        pCtx->LFSR_X[i + 8] = VL[i];
+        }
+
+        /* Update the schedule structure with IVs */
+        /* Store the 4 IVs in LFSR by a column/row matrix swap
+         * after endianness correction */
+
+        /* endianness swap */
+        const uint64x2_t swapMask = vld1q_u64(sm);
+
+        R = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(R),
+                                        vreinterpretq_u8_u64(swapMask)));
+        S = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(S),
+                                        vreinterpretq_u8_u64(swapMask)));
+        T = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(T),
+                                        vreinterpretq_u8_u64(swapMask)));
+        U = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(U),
+                                        vreinterpretq_u8_u64(swapMask)));
+
+        /* row/column dword inversion */
+        T0 = vzip1q_u32(R, S);
+        R = vzip2q_u32(R, S);
+        T1 = vzip1q_u32(T, U);
+        T = vzip2q_u32(T, U);
+
+        /* row/column qword inversion */
+        U = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(R),
+                                        vreinterpretq_u64_u32(T)));
+        T = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(R),
+                                        vreinterpretq_u64_u32(T)));
+        S = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(T0),
+                                        vreinterpretq_u64_u32(T1)));
+        R = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(T0),
+                                        vreinterpretq_u64_u32(T1)));
+
+        /* IV ^ LFSR */
+        pCtx->LFSR_X[15] = pCtx->LFSR_X[15] ^ U;
+        pCtx->LFSR_X[12] = pCtx->LFSR_X[12] ^ T;
+        pCtx->LFSR_X[10] = pCtx->LFSR_X[10] ^ S;
+        pCtx->LFSR_X[9] = pCtx->LFSR_X[9] ^ R;
+        pCtx->iLFSR_X = 0;
+
+        /* FSM initialization */
+        pCtx->FSM_X[0] = pCtx->FSM_X[1] =
+                pCtx->FSM_X[2] = vdupq_n_u32(0);
+
+        /* Initialisation rounds */
+        for (i = 0; i < 32; i++) {
+                T1 = ClockFSM_4(pCtx);
+
+                ClockLFSR_4(pCtx);
+                pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] =
+                        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] ^ T1;
+        }
+}
 
 static inline void
 preserve_bits(uint64_t *KS,
@@ -1736,7 +1506,7 @@ preserve_bits(uint64_t *KS,
 /**
  * @brief Core SNOW3G F8 bit algorithm for the 3GPP confidentiality algorithm
  *
- * @param[in]    pCtx          Context where the scheduled keys are stored
+ * @param[in/out]pCtx          Context where the scheduled keys are stored
  * @param[in]    pIn           Input buffer
  * @param[out]   pOut          Output buffer
  * @param[in]    lengthInBits  length in bits of the data to be encrypted
@@ -1938,7 +1708,7 @@ static inline void f8_snow3g(snow3gKeyState1_t *pCtx,
  * @brief Extracts one state from a 4 buffer state structure.
  *
  * @param[in]  pSrcState   Pointer to the source state
- * @param[in]  pDstState   Pointer to the destination state
+ * @param[out] pDstState   Pointer to the destination state
  * @param[in]  NumBuffer   Buffer number
  */
 static inline void snow3gStateConvert_4(const snow3gKeyState4_t *pSrcState,
@@ -2148,10 +1918,10 @@ void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
  * @param[in] pIV1         pointer to IV
  * @param[in] pIV2         pointer to IV
  * @param[in] pBufIn1      pointer to an input buffer
- * @param[in] pBufOut1     pointer to an output buffer
+ * @param[out]pBufOut1     pointer to an output buffer
  * @param[in] lenInBytes1  message size in bytes
  * @param[in] pBufIn2      pointer to an input buffer
- * @param[in] pBufOut2     pointer to an output buffer
+ * @param[out]pBufOut2     pointer to an output buffer
  * @param[in] lenInBytes2  message size in bytes
  */
 void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
@@ -2224,6 +1994,166 @@ void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
 
 }
 
+
+/**
+ * @brief Four buffer F8 encrypt/decrypt with different key schedule
+ *
+ * Four packets enc/dec with different key schedule.
+ * The 4 keys and IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
+ *
+ * @param[in]     pHandle1        pointer to precomputed key1 schedule
+ * @param[in]     pHandle2        pointer to precomputed key2 schedule
+ * @param[in]     pHandle3        pointer to precomputed key3 schedule
+ * @param[in]     pHandle4        pointer to precomputed key4 schedule
+ * @param[in]     pIV1            pointer to IV1
+ * @param[in]     pIV2            pointer to IV2
+ * @param[in]     pIV3            pointer to IV3
+ * @param[in]     pIV4            pointer to IV4
+ * @param[in]     pBufferIn1      pointer to an input buffer
+ * @param[out]    pBufferOut1     pointer to an output buffer
+ * @param[in]     lengthInBytes1  message size in bytes
+ * @param[in]     pBufferIn2      pointer to an input buffer
+ * @param[out]    pBufferOut2     pointer to an output buffer
+ * @param[in]     lengthInBytes2  message size in bytes
+ * @param[in]     pBufferIn3      pointer to an input buffer
+ * @param[out]    pBufferOut3     pointer to an output buffer
+ * @param[in]     lengthInBytes3  message size in bytes
+ * @param[in]     pBufferIn4      pointer to an input buffer
+ * @param[out]    pBufferOut4     pointer to an output buffer
+ * @param[in]     lengthInBytes4  message size in bytes
+ */
+void SNOW3G_F8_4_BUFFER_MULTIKEY(const snow3g_key_schedule_t *pHandle1,
+                                 const snow3g_key_schedule_t *pHandle2,
+                                 const snow3g_key_schedule_t *pHandle3,
+                                 const snow3g_key_schedule_t *pHandle4,
+                                 const void *pIV1,
+                                 const void *pIV2,
+                                 const void *pIV3,
+                                 const void *pIV4,
+                                 const void *pBufferIn1,
+                                 void *pBufferOut1,
+                                 const uint32_t lengthInBytes1,
+                                 const void *pBufferIn2,
+                                 void *pBufferOut2,
+                                 const uint32_t lengthInBytes2,
+                                 const void *pBufferIn3,
+                                 void *pBufferOut3,
+                                 const uint32_t lengthInBytes3,
+                                 const void *pBufferIn4,
+                                 void *pBufferOut4,
+                                 const uint32_t lengthInBytes4)
+{
+        const size_t num_lanes = 4;
+        snow3gKeyState4_t ctx;
+        uint32_t lenInBytes[4];
+        uint8_t *pBufferOut[4];
+        const uint8_t *pBufferIn[4];
+        uint32_t bytes, qwords, i;
+
+        length_copy_4(lenInBytes, lengthInBytes1, lengthInBytes2,
+                      lengthInBytes3, lengthInBytes4);
+
+        cptr_copy_4((const void **)pBufferIn,
+                    pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4);
+
+        ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2,
+                   pBufferOut3, pBufferOut4);
+
+#ifdef SAFE_PARAM
+        /* reset error status */
+        imb_set_errno(NULL, 0);
+        if ((pHandle1 == NULL) || (pHandle2 == NULL) ||
+            (pHandle3 == NULL) || (pHandle4 == NULL)) {
+                imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY);
+                return;
+        }
+        if ((pIV1 == NULL) || pIV2 == NULL ||
+            (pIV3 == NULL) || (pIV4 == NULL)) {
+                imb_set_errno(NULL, IMB_ERR_NULL_IV);
+                return;
+        }
+        if (!cptr_check((const void * const *)pBufferIn, num_lanes,
+                        IMB_ERR_NULL_SRC))
+                return;
+        if (!ptr_check((void **)pBufferOut, num_lanes, IMB_ERR_NULL_DST))
+                return;
+        if (!length_check(lenInBytes, num_lanes))
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        /* find min common length */
+        bytes = length_find_min(lenInBytes, num_lanes);
+        qwords = bytes / SNOW3G_8_BYTES;
+
+        /* subtract min common length from all buffers */
+        length_sub(lenInBytes, num_lanes, qwords * SNOW3G_8_BYTES);
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4_multikey(&ctx, pHandle1, pHandle2, pHandle3, pHandle4, pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_4_4(&ctx);
+
+        /* generates 8 bytes at a time on all streams */
+        while (qwords >= 2) {
+                uint32x4_t ks[4];
+
+                snow3g_keystream_4_16(&ctx, ks);
+
+                for (i = 0; i < num_lanes; i++) {
+                        const uint32x4_t in = vld1q_u32((const uint32_t *)pBufferIn[i]);
+
+                        vst1q_u32((uint32_t *)pBufferOut[i], in ^ ks[i]);
+
+                        pBufferOut[i] += (2 * SNOW3G_8_BYTES);
+                        pBufferIn[i] += (2 * SNOW3G_8_BYTES);
+                }
+
+                qwords = qwords - 2;
+        }
+
+        while (qwords--) {
+                uint32x4_t H, L; /* 4 bytes of key stream */
+
+                snow3g_keystream_4_8(&ctx, &L, &H);
+
+                pBufferIn[0] = xor_keystrm_rev(pBufferOut[0], pBufferIn[0],
+                                vgetq_lane_u64(vreinterpretq_u64_u32(L), 0));
+                pBufferIn[1] = xor_keystrm_rev(pBufferOut[1], pBufferIn[1],
+                                vgetq_lane_u64(vreinterpretq_u64_u32(L), 1));
+                pBufferIn[2] = xor_keystrm_rev(pBufferOut[2], pBufferIn[2],
+                                vgetq_lane_u64(vreinterpretq_u64_u32(H), 0));
+                pBufferIn[3] = xor_keystrm_rev(pBufferOut[3], pBufferIn[3],
+                                vgetq_lane_u64(vreinterpretq_u64_u32(H), 1));
+
+                for (i = 0; i < num_lanes; i++)
+                        pBufferOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remaining of each buffer
+         *  - extract the LFSR and FSM structures
+         *  - Continue process 1 buffer
+         */
+        for (i = 0; i < num_lanes; i++) {
+                snow3gKeyState1_t ctx_t;
+
+                if (lenInBytes[i] == 0)
+                        continue;
+                snow3gStateConvert_4(&ctx, &ctx_t, i);
+                f8_snow3g(&ctx_t, pBufferIn[i], pBufferOut[i], lenInBytes[i]);
+        }
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
 /**
  * @brief Four buffer F8 encrypt/decrypt with the same key schedule
  *
@@ -2237,16 +2167,16 @@ void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle,
  * @param[in] pIV3            pointer to IV
  * @param[in] pIV4            pointer to IV
  * @param[in] pBufferIn1      pointer to an input buffer
- * @param[in] pBufferOut1     pointer to an output buffer
+ * @param[out]pBufferOut1     pointer to an output buffer
  * @param[in] lengthInBytes1  message size in bytes
  * @param[in] pBufferIn2      pointer to an input buffer
- * @param[in] pBufferOut2     pointer to an output buffer
+ * @param[out]pBufferOut2     pointer to an output buffer
  * @param[in] lengthInBytes2  message size in bytes
  * @param[in] pBufferIn3      pointer to an input buffer
- * @param[in] pBufferOut3     pointer to an output buffer
+ * @param[out]pBufferOut3     pointer to an output buffer
  * @param[in] lengthInBytes3  message size in bytes
  * @param[in] pBufferIn4      pointer to an input buffer
- * @param[in] pBufferOut4     pointer to an output buffer
+ * @param[out]pBufferOut4     pointer to an output buffer
  * @param[in] lengthInBytes4  message size in bytes
  */
 void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
@@ -2322,7 +2252,7 @@ void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
         /* Clock FSM and LFSR once, ignore the key stream */
         (void) snow3g_keystream_4_4(&ctx);
 
-        /* generates 8 bytes at a time on all streams */
+        /* generates 16 bytes at a time on all streams */
         while (qwords >= 2) {
                 uint32x4_t ks[4];
 
@@ -2419,9 +2349,19 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
         if (!length_check(lengthInBytes, num_lanes))
                 return;
 #endif
-        for (uint32_t i = 0; i < num_lanes; i++)
-                SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i],
-                                   lengthInBytes[i]);
+        SNOW3G_F8_4_BUFFER_MULTIKEY(pKey[0], pKey[1], pKey[2], pKey[3],
+                                    IV[0], IV[1], IV[2], IV[3],
+                                    BufferIn[0], BufferOut[0], lengthInBytes[0],
+                                    BufferIn[1], BufferOut[1], lengthInBytes[1],
+                                    BufferIn[2], BufferOut[2], lengthInBytes[2],
+                                    BufferIn[3], BufferOut[3], lengthInBytes[3]);
+
+        SNOW3G_F8_4_BUFFER_MULTIKEY(pKey[4], pKey[5], pKey[6], pKey[7],
+                                    IV[4], IV[5], IV[6], IV[7],
+                                    BufferIn[4], BufferOut[4], lengthInBytes[4],
+                                    BufferIn[5], BufferOut[5], lengthInBytes[5],
+                                    BufferIn[6], BufferOut[6], lengthInBytes[6],
+                                    BufferIn[7], BufferOut[7], lengthInBytes[7]);
 #ifdef SAFE_DATA
         CLEAR_SCRATCH_GPS();
         CLEAR_SCRATCH_SIMD_REGS();
@@ -2446,28 +2386,28 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
  * @param[in] pIV7            pointer to IV
  * @param[in] pIV8            pointer to IV
  * @param[in] pBufIn1         pointer to an input buffer
- * @param[in] pBufOut1        pointer to an output buffer
+ * @param[out]pBufOut1        pointer to an output buffer
  * @param[in] lenInBytes1     message size in bytes
  * @param[in] pBufIn2         pointer to an input buffer
- * @param[in] pBufOut2        pointer to an output buffer
+ * @param[out]pBufOut2        pointer to an output buffer
  * @param[in] lenInBytes2     message size in bytes
  * @param[in] pBufIn3         pointer to an input buffer
- * @param[in] pBufOut3        pointer to an output buffer
+ * @param[out]pBufOut3        pointer to an output buffer
  * @param[in] lenInBytes3     message size in bytes
  * @param[in] pBufIn4         pointer to an input buffer
- * @param[in] pBufOut4        pointer to an output buffer
+ * @param[out]pBufOut4        pointer to an output buffer
  * @param[in] lenInBytes4     message size in bytes
  * @param[in] pBufIn5         pointer to an input buffer
- * @param[in] pBufOut5        pointer to an output buffer
+ * @param[out]pBufOut5        pointer to an output buffer
  * @param[in] lenInBytes5     message size in bytes
  * @param[in] pBufIn6         pointer to an input buffer
- * @param[in] pBufOut6        pointer to an output buffer
+ * @param[out]pBufOut6        pointer to an output buffer
  * @param[in] lenInBytes6     message size in bytes
  * @param[in] pBufIn7         pointer to an input buffer
- * @param[in] pBufOut7        pointer to an output buffer
+ * @param[out]pBufOut7        pointer to an output buffer
  * @param[in] lenInBytes7     message size in bytes
  * @param[in] pBufIn8         pointer to an input buffer
- * @param[in] pBufOut8        pointer to an output buffer
+ * @param[out]pBufOut8        pointer to an output buffer
  * @param[in] lenInBytes8     message size in bytes
  */
 void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle,
@@ -2865,6 +2805,147 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[],
         }
 }
 
+/**
+ * @brief Initializes the four keys for SNOW3G f8/f9.
+ *
+ * @param [in/out]  pCtx        Pointer to snow3g state
+ * @param [in]      pKeySched1  Key1 schedule
+ * @param [in]      pKeySched2  Key2 schedule
+ * @param [in]      pKeySched3  Key3 schedule
+ * @param [in]      pKeySched4  Key4 schedule
+ * @param [in]      pIV1        IV for buffer 1
+ * @param [in]      pIV2        IV for buffer 2
+ * @param [in]      pIV3        IV for buffer 3
+ * @param [in]      pIV4        IV for buffer 4
+ */
+void
+SNOW3G_F8_4_BUFFER_INITIALIZE(void *pCtx,
+                              const snow3g_key_schedule_t *pKeySched1,
+                              const snow3g_key_schedule_t *pKeySched2,
+                              const snow3g_key_schedule_t *pKeySched3,
+                              const snow3g_key_schedule_t *pKeySched4,
+                              const void *pIV1, const void *pIV2,
+                              const void *pIV3, const void *pIV4)
+{
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4_multikey((snow3gKeyState4_t *)pCtx,
+                                         pKeySched1, pKeySched2,
+                                         pKeySched3, pKeySched4,
+                                         pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_4_4((snow3gKeyState4_t *)pCtx);
+
+        return;
+}
+
+/**
+ * @brief Four buffer F8 encrypt/decrypt after initialize.
+ *
+ * @param[in/out] pCtx            pointer to snow3g state
+ * @param[in]     pBufferIn1      pointer to an input buffer
+ * @param[out]    pBufferOut1     pointer to an output buffer
+ * @param[in]     pBufferIn2      pointer to an input buffer
+ * @param[out]    pBufferOut2     pointer to an output buffer
+ * @param[in]     pBufferIn3      pointer to an input buffer
+ * @param[out]    pBufferOut3     pointer to an output buffer
+ * @param[in]     pBufferIn4      pointer to an input buffer
+ * @param[out]    pBufferOut4     pointer to an output buffer
+ */
+void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx,
+                               const void *pBufferIn1,
+                               void *pBufferOut1,
+                               const void *pBufferIn2,
+                               void *pBufferOut2,
+                               const void *pBufferIn3,
+                               void *pBufferOut3,
+                               const void *pBufferIn4,
+                               void *pBufferOut4,
+                               const uint32_t lengthInBytes)
+{
+        const uint32_t num_lanes = 4;
+        snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx;
+        uint32_t words;
+        uint8_t *pBufferOut[4];
+        const uint8_t *pBufferIn[4];
+
+        cptr_copy_4((const void **)pBufferIn,
+                    pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4);
+
+        ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2,
+                   pBufferOut3, pBufferOut4);
+#ifdef SAFE_PARAM
+        /* reset error status */
+        imb_set_errno(NULL, 0);
+        if (!cptr_check((const void * const *)pCtx, num_lanes,
+            IMB_ERR_NULL_EXP_KEY))
+                return;
+        if (!cptr_check((const void * const *)pBufferIn,
+                        num_lanes,
+                        IMB_ERR_NULL_SRC))
+                return;
+        if (!ptr_check((void **)pBufferOut, num_lanes, IMB_ERR_NULL_DST))
+                return;
+        if ((lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) {
+                imb_set_errno(NULL, IMB_ERR_CIPH_LEN);
+                return;
+        }
+        if (pCtx == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_CTX);
+                return;
+        }
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        words = lengthInBytes / SNOW3G_4_BYTES;
+
+        while (words--) {
+                uint32x4_t ks4 = snow3g_keystream_4_4(ctx);
+                xor_keystream_reverse_32(pBufferOut[0], pBufferIn[0],
+                                         vgetq_lane_u32(ks4, 0));
+                xor_keystream_reverse_32(pBufferOut[1], pBufferIn[1],
+                                         vgetq_lane_u32(ks4, 1));
+                xor_keystream_reverse_32(pBufferOut[2], pBufferIn[2],
+                                         vgetq_lane_u32(ks4, 2));
+                xor_keystream_reverse_32(pBufferOut[3], pBufferIn[3],
+                                         vgetq_lane_u32(ks4, 3));
+                for (uint32_t i = 0; i < num_lanes; i++) {
+                        pBufferIn[i] += SNOW3G_4_BYTES;
+                        pBufferOut[i] += SNOW3G_4_BYTES;
+                }
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief One buffer F8 encrypt/decrypt after initialize.
+ *
+ * One packet enc/dec after initialize.
+ *
+ * @param[in/out] pCtx            pointer to snow3g state
+ * @param[in]     pBufferIn       pointer to an input buffer
+ * @param[out]    pBufferOut      pointer to an output buffer
+ * @param[in]     lengthInBytes   length in bytes
+ */
+void SNOW3G_F8_1_BUFFER_STREAM(void *pCtx,
+                               const void *pBufferIn,
+                               void *pBufferOut,
+                               const uint32_t lengthInBytes)
+{
+        f8_snow3g((snow3gKeyState1_t *)pCtx, pBufferIn, pBufferOut, lengthInBytes);
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
 /**
  * @brief Single buffer bit-length F9 function
  *
@@ -2883,6 +2964,8 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
                         void *pDigest)
 {
 #ifdef SAFE_PARAM
+        /* reset error status */
+        imb_set_errno(NULL, 0);
         if (pHandle == NULL) {
                 imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY);
                 return;
@@ -2910,11 +2993,6 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
 
         snow3gKeyState1_t ctx;
         uint32_t z[5];
-        uint64_t lengthInQwords, E, P;
-        uint64_t i;
-        const uint64_t *inputBuffer;
-
-        inputBuffer = (const uint64_t *)pBufferIn;
 
         /* Initialize the SNOW3G key schedule */
         snow3gStateInitialize_1(&ctx, pHandle, pIV);
@@ -2922,7 +3000,48 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
         /*Generate 5 key stream words*/
         snow3g_f9_keystream_words(&ctx, &z[0]);
 
+        SNOW3G_F9_1_BUFFER_DIGEST(z, pBufferIn, lengthInBits, pDigest);
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&z, sizeof(z));
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief Single buffer bit-length F9 function
+ *
+ * Single buffer digest with generated keystream.
+ *
+ * @param[in]  z            pointer to pre-generated keystream
+ * @param[in]  pBufferIn    pointer to an input buffer
+ * @param[in]  lengthInBits message length in bits
+ * @param[out] pDigest      pointer to store the F9 digest
+ */
+void SNOW3G_F9_1_BUFFER_DIGEST(const uint32_t z[5],
+                               const void *pBufferIn,
+                               const uint64_t lengthInBits,
+                               void *pDigest)
+{
+#ifdef SAFE_PARAM
+        if ((z == NULL) || (pBufferIn == NULL) || (pDigest == NULL) ||
+            (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+                return;
+#endif
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        uint64_t lengthInQwords, E, P;
+        uint64_t i;
+        const uint64_t *inputBuffer;
+
+        inputBuffer = (const uint64_t *)pBufferIn;
+
         P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+        const uint64x2_t nP = vdupq_n_u64(P);
 
         lengthInQwords = lengthInBits / 64;
 
@@ -2934,6 +3053,7 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
                 const uint64_t P2 = multiply_and_reduce64(P, P);
                 const uint64_t P3 = multiply_and_reduce64(P2, P);
                 const uint64_t P4 = multiply_and_reduce64(P3, P);
+                const uint64x2_t nP3 = vdupq_n_u64(P3);
                 const uint64_t bs[2] = {0x0001020304050607ULL,
                                         0x08090a0b0c0d0e0fULL};
                 const uint8x16_t bswap2x64 = vreinterpretq_u8_u64(vld1q_u64(bs));
@@ -2943,7 +3063,6 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
                 poly64x2_t EV = vdupq_n_p64(0);
 
                 for (; (i + 3) < lengthInQwords; i+= 4) {
-                        uint64_t m0, m1, m2, m3;
                         uint64x2_t M1_t, M2_t;
                         poly64x2_t t1, t2, t3;
                         /* load 2 x 128-bits and byte swap 64-bit words */
@@ -2955,23 +3074,20 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
                                        vreinterpretq_u8_u64(M1_t), bswap2x64));
                         M2_t = vreinterpretq_u64_u8(vqtbl1q_u8(
                                        vreinterpretq_u8_u64(M2_t), bswap2x64));
-                        m0 = vgetq_lane_u64(M1_t, 0);
-                        m1 = vgetq_lane_u64(M1_t, 1);
-                        m2 = vgetq_lane_u64(M2_t, 0);
-                        m3 = vgetq_lane_u64(M2_t, 1);
 
                         /* add current EV to the first word of the message */
-                        m0 = m0 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 0);
-                        m1 = m1 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 1);
+                        M1_t = M1_t ^ EV;
 
-                        /* t1 = (M0 x P4) + (M1 x P3) + (M2 x P2) + (M3 x P1) */
-                        t1 = (poly64x2_t)vmull_p64(m2, P2);
-                        t2 = (poly64x2_t)vmull_p64(m3, P);
+                        /* t1 = (M1.0 x P4) + (M1.1 x P3) + (M2.0 x P2) + (M2.1 x P1) */
+                        t1 = (poly64x2_t)vmull_p64(vgetq_lane_u64(M2_t, 0), P2);
+                        t2 = (poly64x2_t)vmull_high_p64(vreinterpretq_p64_u64(M2_t),
+                                                        vreinterpretq_p64_u64(nP));
 
                         t1 = t1 ^ t2;
 
-                        t2 = (poly64x2_t)vmull_p64(m0, P4);
-                        t3 = (poly64x2_t)vmull_p64(m1, P3);
+                        t2 = (poly64x2_t)vmull_p64(vgetq_lane_u64(M1_t, 0), P4);
+                        t3 = (poly64x2_t)vmull_high_p64(vreinterpretq_p64_u64(M1_t),
+                                                        vreinterpretq_p64_u64(nP3));
 
                         t2 = t2 ^ t3;
                         t1 = t2 ^ t1;
@@ -2996,12 +3112,10 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
                         /* add current EV to the first word of the message */
                         M_t = M_t ^ EV;
 
-                        poly64_t M0 = vgetq_lane_u64(M_t, 0);
-                        poly64_t M1 = vgetq_lane_u64(M_t, 1);
-
-                        /* t1 = (M0 x P2) + (M1 x P1) */
-                        t1 = (poly64x2_t)vmull_p64(M0, P2);
-                        t2 = (poly64x2_t)vmull_p64(M1, P);
+                        /* t1 = (M.0 x P2) + (M.1 x P1) */
+                        t1 = (poly64x2_t)vmull_p64(vgetq_lane_u64(M_t, 0), P2);
+                        t2 = (poly64x2_t)vmull_high_p64(vreinterpretq_p64_u64(M_t),
+                                                        vreinterpretq_p64_u64(nP));
                         t1 = t1 ^ t2;
 
                         /* reduce 128-bit product */
@@ -3051,10 +3165,51 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
 #ifdef SAFE_DATA
         CLEAR_VAR(&E, sizeof(E));
         CLEAR_VAR(&P, sizeof(P));
-        CLEAR_MEM(&z, sizeof(z));
-        CLEAR_MEM(&ctx, sizeof(ctx));
         CLEAR_SCRATCH_GPS();
         CLEAR_SCRATCH_SIMD_REGS();
 #endif /* SAFE_DATA */
 }
+
+/**
+ * @brief Four buffer F9 keystream generation.
+ *
+ * @param[in/out] pCtx    pointer to snow3g state
+ * @param[out]    ks1     pointer to output keystream1
+ * @param[out]    ks2     pointer to output keystream2
+ * @param[out]    ks3     pointer to output keystream3
+ * @param[out]    ks4     pointer to output keystream4
+ */
+void SNOW3G_F9_4_BUFFER_KEYSTREAM(void *pCtx,
+                                  uint32_t ks1[5],
+                                  uint32_t ks2[5],
+                                  uint32_t ks3[5],
+                                  uint32_t ks4[5])
+{
+        snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx;
+
+#ifdef SAFE_PARAM
+        if (pCtx == NULL)
+                return;
+        if (ks1 ==  NULL || ks2 ==  NULL || ks3 ==  NULL || ks4 ==  NULL)
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        for (int i = 0; i < 5; i++) {
+            uint32x4_t ks = snow3g_keystream_4_4(ctx);
+            ks1[i] = vgetq_lane_u32(ks, 0);
+            ks2[i] = vgetq_lane_u32(ks, 1);
+            ks3[i] = vgetq_lane_u32(ks, 2);
+            ks4[i] = vgetq_lane_u32(ks, 3);
+        }
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
 #endif /* SNOW3G_COMMON_H */
diff --git a/lib/aarch64/snow3g_internal.h b/lib/aarch64/snow3g_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..8b7e79224a5ceee3e24418ba862b92bd28d6cb82
--- /dev/null
+++ b/lib/aarch64/snow3g_internal.h
@@ -0,0 +1,289 @@
+/**********************************************************************
+  Copyright(c) 2022 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef SNOW3G_INTERNAL_H
+#define SNOW3G_INTERNAL_H
+
+#include <arm_neon.h>
+#ifdef SAFE_PARAM
+#include "include/error.h"
+#endif
+
+#define MAX_KEY_LEN (16)
+#define SNOW3G_4_BYTES (4)
+#define SNOW3G_8_BYTES (8)
+#define SNOW3G_8_BITS (8)
+#define SNOW3G_16_BYTES (16)
+#define SNOW3G_16_BITS (16)
+
+#define SNOW3G_BLOCK_SIZE (8)
+
+#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */
+#define SNOW3G_IV_LEN_IN_BYTES (16)  /* 128b */
+
+#define SNOW3GCONSTANT (0x1b)
+
+/* Range of input data for SNOW3G is from 1 to 2^32 bits */
+#define SNOW3G_MIN_LEN 1
+#define SNOW3G_MAX_BITLEN (UINT32_MAX)
+#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8)
+
+typedef union SafeBuffer {
+        uint64_t b64;
+        uint32_t b32[2];
+        uint8_t b8[SNOW3G_8_BYTES];
+} SafeBuf;
+
+typedef struct snow3gKeyState1_s {
+        /* 16 LFSR stages */
+        uint32_t LFSR_S[16];
+        /* 3 FSM states */
+        uint32_t FSM_R3;
+        uint32_t FSM_R2;
+        uint32_t FSM_R1;
+} DECLARE_ALIGNED(snow3gKeyState1_t, 16);
+
+typedef struct snow3gKeyState4_s {
+        /* 16 LFSR stages */
+        uint32x4_t LFSR_X[16];
+        /* 3 FSM states */
+        uint32x4_t FSM_X[3];
+        uint32_t iLFSR_X;
+} snow3gKeyState4_t;
+
+/**
+ * @brief Finds minimum 32-bit value in an array
+ * @return Min 32-bit value
+ */
+static inline uint32_t
+length_find_min(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+        uint32_t min = 0;
+
+        if (dim_array > 0)
+                min  = out_array[0];
+
+        for (i = 1; i < dim_array; i++)
+                if (out_array[i] < min)
+                        min = out_array[i];
+
+        return min;
+}
+
+/**
+ * @brief Subtracts \a subv from a vector of 32-bit words
+ */
+static inline void
+length_sub(uint32_t *out_array, const size_t dim_array, const uint32_t subv)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                out_array[i] -= subv;
+}
+
+#ifdef SAFE_PARAM
+/**
+ * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values
+ * @retval 0 incorrect length value found
+ * @retval 1 all OK
+ */
+static inline uint32_t
+length_check(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+
+        if (out_array == NULL) {
+                imb_set_errno(NULL, IMB_ERR_CIPH_LEN);
+                return 0;
+        }
+
+        for (i = 0; i < dim_array; i++) {
+                if ((out_array[i] == 0) ||
+                    (out_array[i] > SNOW3G_MAX_BYTELEN)) {
+                        imb_set_errno(NULL, IMB_ERR_CIPH_LEN);
+                        return 0;
+                    }
+        }
+
+        return 1;
+}
+#endif
+
+/**
+ * @brief Copies 4 32-bit length values into an array
+ */
+static inline void
+length_copy_4(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+}
+
+/**
+ * @brief Copies 8 32-bit length values into an array
+ */
+static inline void
+length_copy_8(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4,
+              const uint32_t length5, const uint32_t length6,
+              const uint32_t length7, const uint32_t length8)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+        out_array[4] = length5;
+        out_array[5] = length6;
+        out_array[6] = length7;
+        out_array[7] = length8;
+}
+
+#ifdef SAFE_PARAM
+/**
+ * @brief Checks vector of pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+ptr_check(void *out_array[], const size_t dim_array, const int errnum)
+{
+        size_t i;
+
+        if (out_array == NULL) {
+                imb_set_errno(NULL, errnum);
+                return 0;
+        }
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL) {
+                        imb_set_errno(NULL, errnum);
+                        return 0;
+                }
+        return 1;
+}
+#endif
+
+#ifdef SAFE_PARAM
+/**
+ * @brief Checks vector of const pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+cptr_check(const void * const out_array[],
+           const size_t dim_array,
+           const int errnum)
+{
+        size_t i;
+
+        if (out_array == NULL) {
+                imb_set_errno(NULL, errnum);
+                return 0;
+        }
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL) {
+                        imb_set_errno(NULL, errnum);
+                        return 0;
+                }
+
+        return 1;
+}
+#endif
+
+/**
+ * @brief Copies 4 pointers into an array
+ */
+static inline void
+ptr_copy_4(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 4 const pointers into an array
+ */
+static inline void
+cptr_copy_4(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 8 pointers into an array
+ */
+static inline void
+ptr_copy_8(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4,
+           void *ptr5, void *ptr6, void *ptr7, void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+/**
+ * @brief Copies 8 const pointers into an array
+ */
+static inline void
+cptr_copy_8(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4,
+            const void *ptr5, const void *ptr6,
+            const void *ptr7, const void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+#endif /* SNOW3G_INTERNAL_H */
diff --git a/lib/include/snow3g.h b/lib/include/snow3g.h
index 13bdbb156d9414eab0c7a5bfcea5be16b0967b7d..9bf40ae85f1cec766b1037793dfbd9cde5eee188 100644
--- a/lib/include/snow3g.h
+++ b/lib/include/snow3g.h
@@ -702,6 +702,29 @@ snow3g_f8_4_buffer_aarch64(const snow3g_key_schedule_t *pCtx,
                            const void *pBufferIn4,
                            void *pBufferOut4,
                            const uint32_t lengthInBytes4);
+
+void
+snow3g_f8_4_buffer_multikey_aarch64(const snow3g_key_schedule_t *pCtx1,
+                                    const snow3g_key_schedule_t *pCtx2,
+                                    const snow3g_key_schedule_t *pCtx3,
+                                    const snow3g_key_schedule_t *pCtx4,
+                                    const void *pIV1,
+                                    const void *pIV2,
+                                    const void *pIV3,
+                                    const void *pIV4,
+                                    const void *pBufferIn1,
+                                    void *pBufferOut1,
+                                    const uint32_t lengthInBytes1,
+                                    const void *pBufferIn2,
+                                    void *pBufferOut2,
+                                    const uint32_t lengthInBytes2,
+                                    const void *pBufferIn3,
+                                    void *pBufferOut3,
+                                    const uint32_t lengthInBytes3,
+                                    const void *pBufferIn4,
+                                    void *pBufferOut4,
+                                    const uint32_t lengthInBytes4);
+
 void
 snow3g_f8_8_buffer_aarch64(const snow3g_key_schedule_t *pCtx,
                            const void *pIV1,
@@ -759,6 +782,32 @@ snow3g_f8_n_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[],
                                     void *pBufferOut[],
                                     const uint32_t bufferLenInBytes[],
                                     const uint32_t bufferCount);
+void
+snow3g_f8_4_buffer_initialize_aarch64(void *pCtx,
+                                      const snow3g_key_schedule_t *pKeySched1,
+                                      const snow3g_key_schedule_t *pKeySched2,
+                                      const snow3g_key_schedule_t *pKeySched3,
+                                      const snow3g_key_schedule_t *pKeySched4,
+                                      const void *pIV1, const void *pIV2,
+                                      const void *pIV3, const void *pIV4);
+
+void
+snow3g_f8_1_buffer_stream_aarch64(void *pCtx,
+                                  const void *pBufferIn,
+                                  void *pBufferOut,
+                                  const uint32_t lengthInBytes);
+
+void
+snow3g_f8_4_buffer_stream_aarch64(void *pCtx,
+                                  const void *pBufferIn1,
+                                  void *pBufferOut1,
+                                  const void *pBufferIn2,
+                                  void *pBufferOut2,
+                                  const void *pBufferIn3,
+                                  void *pBufferOut3,
+                                  const void *pBufferIn4,
+                                  void *pBufferOut4,
+                                  const uint32_t lengthInBytes);
 
 void
 snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx,
@@ -767,6 +816,19 @@ snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx,
                            const uint64_t lengthInBits,
                            void *pDigest);
 
+void
+snow3g_f9_1_buffer_digest_aarch64(const uint32_t z[5],
+                                  const void *pBufferIn,
+                                  const uint64_t lengthInBits,
+                                  void *pDigest);
+
+void
+snow3g_f9_4_buffer_keystream_aarch64(void *pCtx,
+                                     uint32_t ks1[5],
+                                     uint32_t ks2[5],
+                                     uint32_t ks3[5],
+                                     uint32_t ks4[5]);
+
 size_t
 snow3g_key_sched_size_aarch64(void);
 
@@ -821,6 +883,28 @@ snow3g_f8_4_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx,
                                     void *pBufferOut4,
                                     const uint32_t lengthInBytes4);
 
+void
+snow3g_f8_4_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx1,
+                                             const snow3g_key_schedule_t *pCtx2,
+                                             const snow3g_key_schedule_t *pCtx3,
+                                             const snow3g_key_schedule_t *pCtx4,
+                                             const void *pIV1,
+                                             const void *pIV2,
+                                             const void *pIV3,
+                                             const void *pIV4,
+                                             const void *pBufferIn1,
+                                             void *pBufferOut1,
+                                             const uint32_t lengthInBytes1,
+                                             const void *pBufferIn2,
+                                             void *pBufferOut2,
+                                             const uint32_t lengthInBytes2,
+                                             const void *pBufferIn3,
+                                             void *pBufferOut3,
+                                             const uint32_t lengthInBytes3,
+                                             const void *pBufferIn4,
+                                             void *pBufferOut4,
+                                             const uint32_t lengthInBytes4);
+
 void
 snow3g_f8_8_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx,
                                     const void *pIV1,
@@ -881,6 +965,33 @@ snow3g_f8_n_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * const
                                              const uint32_t bufferLenInBytes[],
                                              const uint32_t bufferCount);
 
+void
+snow3g_f8_4_buffer_initialize_aarch64_no_aesni(void *pCtx,
+                                               const snow3g_key_schedule_t *pKeySched1,
+                                               const snow3g_key_schedule_t *pKeySched2,
+                                               const snow3g_key_schedule_t *pKeySched3,
+                                               const snow3g_key_schedule_t *pKeySched4,
+                                               const void *pIV1, const void *pIV2,
+                                               const void *pIV3, const void *pIV4);
+
+void
+snow3g_f8_1_buffer_stream_aarch64_no_aesni(void *pCtx,
+                                           const void *pBufferIn,
+                                           void *pBufferOut,
+                                           const uint32_t lengthInBytes);
+
+void
+snow3g_f8_4_buffer_stream_aarch64_no_aesni(void *pCtx,
+                                           const void *pBufferIn1,
+                                           void *pBufferOut1,
+                                           const void *pBufferIn2,
+                                           void *pBufferOut2,
+                                           const void *pBufferIn3,
+                                           void *pBufferOut3,
+                                           const void *pBufferIn4,
+                                           void *pBufferOut4,
+                                           const uint32_t lengthInBytes);
+
 void
 snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx,
                                     const void *pIV,
@@ -888,10 +999,24 @@ snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx,
                                     const uint64_t lengthInBits,
                                     void *pDigest);
 
+void
+snow3g_f9_1_buffer_digest_aarch64_no_aesni(const uint32_t z[5],
+                                           const void *pBufferIn,
+                                           const uint64_t lengthInBits,
+                                           void *pDigest);
+
+void
+snow3g_f9_4_buffer_keystream_aarch64_no_aesni(void *pCtx,
+                                              uint32_t ks1[5],
+                                              uint32_t ks2[5],
+                                              uint32_t ks3[5],
+                                              uint32_t ks4[5]);
+
 size_t
 snow3g_key_sched_size_aarch64_no_aesni(void);
 
 int
 snow3g_init_key_sched_aarch64_no_aesni(const void *pKey,
                                        snow3g_key_schedule_t *pCtx);
+
 #endif /* _SNOW3G_H_ */
diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h
index e1b8130d47c981e7b71332a926a2703463efdf90..4de506b435f128f12b886f0abb394e193df64891 100644
--- a/lib/ipsec-mb.h
+++ b/lib/ipsec-mb.h
@@ -957,6 +957,19 @@ typedef void (*snow3g_f8_4_buffer_t)(const snow3g_key_schedule_t *,
                                      const uint32_t, const void *, void *,
                                      const uint32_t);
 
+#ifdef __aarch64__
+typedef void (*snow3g_f8_4_buffer_multikey_t)(const snow3g_key_schedule_t *,
+                                              const snow3g_key_schedule_t *,
+                                              const snow3g_key_schedule_t *,
+                                              const snow3g_key_schedule_t *,
+                                              const void *, const void *, const void *,
+                                              const void *, const void *, void *,
+                                              const uint32_t, const void *, void *,
+                                              const uint32_t, const void *, void *,
+                                              const uint32_t, const void *, void *,
+                                              const uint32_t);
+#endif //__aarch64__
+
 typedef void (*snow3g_f8_8_buffer_t)(const snow3g_key_schedule_t *,
                                      const void *, const void *, const void *,
                                      const void *, const void *, const void *,
@@ -1164,6 +1177,9 @@ typedef struct IMB_MGR {
         snow3g_f8_4_buffer_t snow3g_f8_4_buffer;
         snow3g_f8_8_buffer_t snow3g_f8_8_buffer;
         snow3g_f8_n_buffer_t snow3g_f8_n_buffer;
+#ifdef __aarch64__
+        snow3g_f8_4_buffer_multikey_t snow3g_f8_4_buffer_multikey;
+#endif //__aarch64__
         snow3g_f8_8_buffer_multikey_t snow3g_f8_8_buffer_multikey;
         snow3g_f8_n_buffer_multikey_t snow3g_f8_n_buffer_multikey;
         snow3g_f9_1_buffer_t snow3g_f9_1_buffer;
@@ -2425,6 +2441,51 @@ IMB_DLL_EXPORT void init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch);
                                     (_src3), (_dst3), (_len3),           \
                                     (_src4), (_dst4), (_len4)))
 
+#ifdef __aarch64__
+/**
+ *******************************************************************************
+ * This function performs snow3g f8 operation on four buffers. They will
+ * be processed with different keys, which has already been scheduled with
+ * snow3g_init_key_sched().
+ *
+ * @param[in]  _mgr           Pointer to multi-buffer structure
+ * @param[in]  _exp_key1      Context where the scheduled key1 are stored
+ * @param[in]  _exp_key2      Context where the scheduled key2 are stored
+ * @param[in]  _exp_key3      Context where the scheduled key3 are stored
+ * @param[in]  _exp_key4      Context where the scheduled key4 are stored
+ * @param[in]  _iv1           IV to use for buffer pBufferIn1
+ * @param[in]  _iv2           IV to use for buffer pBufferIn2
+ * @param[in]  _iv3           IV to use for buffer pBufferIn3
+ * @param[in]  _iv4           IV to use for buffer pBufferIn4
+ * @param[in]  _src1          Input buffer 1
+ * @param[out] _dst1          Output buffer 1
+ * @param[in]  _len1          Length in bytes of input buffer 1
+ * @param[in]  _src2          Input buffer 2
+ * @param[out] _dst2          Output buffer 2
+ * @param[in]  _len2          Length in bytes of input buffer 2
+ * @param[in]  _src3          Input buffer 3
+ * @param[out] _dst3          Output buffer 3
+ * @param[in]  _len3          Length in bytes of input buffer 3
+ * @param[in]  _src4          Input buffer 4
+ * @param[out] _dst4          Output buffer 4
+ * @param[in]  _len4          Length in bytes of input buffer 4
+ */
+#define IMB_SNOW3G_F8_4_BUFFER_MULTIKEY(_mgr, \
+                                        _exp_key1, _exp_key2, _exp_key3, _exp_key4, \
+                                        _iv1, _iv2, _iv3, _iv4,                     \
+                                        _src1, _dst1, _len1,                        \
+                                        _src2, _dst2, _len2,                        \
+                                        _src3, _dst3, _len3,                        \
+                                        _src4, _dst4, _len4)                        \
+        ((_mgr)->snow3g_f8_4_buffer_multikey((_exp_key1), (_exp_key2),              \
+                                             (_exp_key3), (_exp_key4),              \
+                                             (_iv1), (_iv2), (_iv3), (_iv4),        \
+                                             (_src1), (_dst1), (_len1),             \
+                                             (_src2), (_dst2), (_len2),             \
+                                             (_src3), (_dst3), (_len3),             \
+                                             (_src4), (_dst4), (_len4)))
+#endif //__aarch64__
+
 /**
  *******************************************************************************
  * This function performs snow3g f8 operation on eight buffers. They will
diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c
index e0b1a6111e5a99577f04ab1c27d78d0e25a8a981..d3e96a386602c608385d9bc9ab1140697aefb250 100644
--- a/perf/ipsec_perf.c
+++ b/perf/ipsec_perf.c
@@ -4150,10 +4150,18 @@ int main(int argc, char *argv[])
                                 arch_str_map[arch_id].name);
                 }
         }
-
+#ifdef __aarch64__
+        /* The scale maybe less than 0.01 on AARCH64, so precision of .3f
+           is not enough. Use .6f instead of .3f */
+        if (tsc_detect)
+                fprintf(stderr, "TSC scaling to core cycles: %.6f\n",
+                        get_tsc_to_core_scale(turbo_enabled));
+#else
         if (tsc_detect)
                 fprintf(stderr, "TSC scaling to core cycles: %.3f\n",
                         get_tsc_to_core_scale(turbo_enabled));
+#endif
+
 #ifdef __aarch64__
         fprintf(stderr, "CNT frequency: %ld\n", read_cntfreq());
 #endif