diff --git a/lib/Makefile b/lib/Makefile index 327a171d8b70fb1e74d3679bab1bc9cb2a4c7279..9797c54d3cfb406d8a2270179cbad07da5c0a79e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2012-2022, Intel Corporation +# Copyright (c) 2012-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -198,6 +198,7 @@ endif # x86_64 ifeq ($(ARCH),aarch64) OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_SVE := -march=armv8-a+sve+crypto+aes OPT_NOAESNI := -march=armv8-a endif # aarch64 @@ -207,7 +208,14 @@ GCC_VERSION = $(shell $(CC) -dumpversion | cut -d. -f1) GCC_GE_V5 = $(shell [ $(GCC_VERSION) -ge 5 ] && echo true) ifeq ($(GCC_GE_V5),true) ifeq ($(ARCH),aarch64) +GCC_GE_V11 = $(shell [ $(GCC_VERSION) -ge 11 ] && echo true) +#arm sve requires gcc-11 or newer. +ifneq ($(GCC_GE_V11),true) +$(warning "GCC version found: $(GCC_VERSION)") +$(error "Minimum required: 11") +endif # GCC_GE_V11 OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_SVE := -march=armv8-a+sve+crypto+aes OPT_NOAESNI := -march=armv8-a else OPT_SSE := -march=nehalem -maes -mpclmul @@ -282,6 +290,7 @@ SAFE_OPTIONS_MSG2="All safe options enabled by default." 
ifeq ($(ARCH),aarch64) c_lib_objs := \ mb_mgr_aarch64.o \ + mb_mgr_aarch64_sve256.o \ mb_mgr_aarch64_no_aesni.o \ mb_mgr_auto_aarch64.o \ alloc_aarch64.o \ @@ -302,7 +311,10 @@ c_lib_objs := \ mb_mgr_zuc_submit_flush_aarch64.o \ mb_mgr_zuc_submit_flush_aarch64_no_aesni.o \ mb_mgr_snow3g_submit_flush_aarch64.o \ - mb_mgr_snow3g_submit_flush_aarch64_no_aesni.o + mb_mgr_snow3g_submit_flush_aarch64_no_aesni.o \ + mb_mgr_snow3g_submit_flush_aarch64_sve256.o \ + snow3g_aarch64_sve256.o \ + snow3g_impl_aarch64_sve256.o asm_generic_lib_objs := \ lookup_16x8bit_neon.o else @@ -863,6 +875,8 @@ $(dep_target_files): | $(OBJ_DIR) # ifeq ($(ARCH),aarch64) +$(OBJ_DIR)/cpu_features_aarch64.o:aarch64/cpu_features_aarch64.c + $(CC) -MMD $(OPT_SVE) -c $(CFLAGS) $< -o $@ $(OBJ_DIR)/%.o:aarch64/%.c $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ $(OBJ_DIR)/%.o:x86_64/%.c diff --git a/lib/aarch64/cpu_features_aarch64.c b/lib/aarch64/cpu_features_aarch64.c index a34e2cb400cdff1d226908af56f727dcc9d95301..1f1191d6c82cdde371aa608548b175823a850d0e 100644 --- a/lib/aarch64/cpu_features_aarch64.c +++ b/lib/aarch64/cpu_features_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -29,6 +29,7 @@ #include "cpu_feature.h" #include <sys/auxv.h> #include <asm/hwcap.h> +#include <arm_sve.h> static uint32_t detect_asimd(void) { @@ -45,6 +46,11 @@ static uint32_t detect_pmull(void) return getauxval(AT_HWCAP) & HWCAP_PMULL; } +static uint32_t detect_sve(void) +{ + return getauxval(AT_HWCAP) & HWCAP_SVE; +} + uint64_t cpu_feature_detect(void) { uint64_t features = 0; @@ -58,6 +64,12 @@ uint64_t cpu_feature_detect(void) if (detect_pmull()) features |= IMB_FEATURE_PMULL; } + if (detect_sve()) { + volatile uint64_t sve_width = svcntw(); + if (sve_width >= (256 / 32)) { + features |= IMB_FEATURE_SVE256; + } + } #ifdef SAFE_DATA features |= IMB_FEATURE_SAFE_DATA; diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c index e1c19d74cedb31c642f82cf995e5df91d1a357dc..808998768508bd507e4060756c3aa034ecdf75b8 100644 --- a/lib/aarch64/mb_mgr_aarch64.c +++ b/lib/aarch64/mb_mgr_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -79,6 +79,14 @@ IMB_JOB *flush_job_snow3g_uea2_aarch64_common(IMB_MGR *state); IMB_JOB *submit_job_snow3g_uia2_aarch64_common(IMB_MGR *state, IMB_JOB *job); IMB_JOB *flush_job_snow3g_uia2_aarch64_common(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); /* ====================================================================== */ #define SUBMIT_JOB submit_job_aarch64 @@ -218,8 +226,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uea2_ooo->job_in_lane)); memset(snow3g_uea2_ooo->bits_fixup, 0, sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); snow3g_uea2_ooo->init_mask = 0; - snow3g_uea2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uea2_ooo->num_lanes_inuse = 0; snow3g_uea2_ooo->init_done = 0; memset(snow3g_uea2_ooo->ks, 0, @@ -232,8 +243,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uia2_ooo->job_in_lane)); memset(snow3g_uia2_ooo->bits_fixup, 0, sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); snow3g_uia2_ooo->init_mask = 0; - snow3g_uia2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uia2_ooo->num_lanes_inuse = 0; snow3g_uia2_ooo->init_done = 0; memset(snow3g_uia2_ooo->ks, 0, @@ -271,6 +285,10 @@ init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs) flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; 
submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; + submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; + flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; + submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; + flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; return; } @@ -317,5 +335,4 @@ init_mb_mgr_aarch64(IMB_MGR *state) { init_mb_mgr_aarch64_internal(state, 1); } - #include "mb_mgr_code_aarch64.h" diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c index fedb481738feb8eb498295612a5433eba22675b6..a3c4a9d9b096a25771dff4257a84b2886cf0c9c8 100644 --- a/lib/aarch64/mb_mgr_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************* - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -153,8 +153,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uea2_ooo->job_in_lane)); memset(snow3g_uea2_ooo->bits_fixup, 0, sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); snow3g_uea2_ooo->init_mask = 0; - snow3g_uea2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uea2_ooo->num_lanes_inuse = 0; snow3g_uea2_ooo->init_done = 0; memset(snow3g_uea2_ooo->ks, 0, @@ -167,8 +170,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uia2_ooo->job_in_lane)); memset(snow3g_uia2_ooo->bits_fixup, 0, sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); snow3g_uia2_ooo->init_mask = 0; - snow3g_uia2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uia2_ooo->num_lanes_inuse = 0; snow3g_uia2_ooo->init_done = 0; memset(snow3g_uia2_ooo->ks, 0, diff --git a/lib/aarch64/mb_mgr_aarch64_sve256.c b/lib/aarch64/mb_mgr_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..d3d2bf4d1150fa804366a081d25fe392bfd40a66 --- /dev/null +++ b/lib/aarch64/mb_mgr_aarch64_sve256.c @@ -0,0 +1,340 @@ +/********************************************************************** + Copyright(c) 2021-2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "ipsec-mb.h" +#include "include/snow3g.h" +#include "include/zuc_internal.h" + +#include "include/cpu_feature.h" +#include "include/error.h" +#include "clear_regs_mem_aarch64.h" +#include "include/noaesni.h" +#include "include/ipsec_ooo_mgr.h" + +IMB_JOB *submit_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state); + +IMB_JOB
*submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); +/* ====================================================================== */ + +#define SUBMIT_JOB submit_job_aarch64_sve256 +#define FLUSH_JOB flush_job_aarch64_sve256 +#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_sve256 +#define GET_NEXT_JOB get_next_job_aarch64_sve256 +#define GET_COMPLETED_JOB get_completed_job_aarch64_sve256 + +#define QUEUE_SIZE queue_size_aarch64_sve256 + +/* ====================================================================== */ + +#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64 +#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64 + +/* ====================================================================== */ +#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64 +#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64 +#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64 +#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64 +#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64 +#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64 +#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64 +#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64 +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64 +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64 +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64 +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64 + + +static IMB_JOB * +(*submit_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc_eea3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc_eea3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + 
submit_job_zuc_eia3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc_eia3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc256_eea3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc256_eea3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc256_eia3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc256_eia3_aarch64_common; + +static IMB_JOB * +(*submit_job_snow3g_uea2_aarch64)(IMB_MGR *state, IMB_JOB *job) = + submit_job_snow3g_uea2_aarch64_sve256; + +static IMB_JOB * +(*flush_job_snow3g_uea2_aarch64)(IMB_MGR *state) = + flush_job_snow3g_uea2_aarch64_sve256; + +static IMB_JOB * +(*submit_job_snow3g_uia2_aarch64)(IMB_MGR *state, IMB_JOB *job) = + submit_job_snow3g_uia2_aarch64_sve256; + +static IMB_JOB * +(*flush_job_snow3g_uia2_aarch64)(IMB_MGR *state) = + flush_job_snow3g_uia2_aarch64_sve256; +static void +reset_ooo_mgrs(IMB_MGR *state) +{ + MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; + MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; + MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; + MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; + + /* Init ZUC out-of-order fields */ + memset(zuc_eea3_ooo->lens, 0, + sizeof(zuc_eea3_ooo->lens)); + memset(zuc_eea3_ooo->job_in_lane, 0, + sizeof(zuc_eea3_ooo->job_in_lane)); + zuc_eea3_ooo->unused_lanes = 0xFF03020100; + zuc_eea3_ooo->num_lanes_inuse = 0; + memset(&zuc_eea3_ooo->state, 0, + sizeof(zuc_eea3_ooo->state)); + zuc_eea3_ooo->init_not_done = 0; + zuc_eea3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc_eia3_ooo->lens, 0xFF, + 
sizeof(zuc_eia3_ooo->lens)); + memset(zuc_eia3_ooo->job_in_lane, 0, + sizeof(zuc_eia3_ooo->job_in_lane)); + zuc_eia3_ooo->unused_lanes = 0xFF03020100; + zuc_eia3_ooo->num_lanes_inuse = 0; + memset(&zuc_eia3_ooo->state, 0, + sizeof(zuc_eia3_ooo->state)); + zuc_eia3_ooo->init_not_done = 0; + zuc_eia3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc256_eea3_ooo->lens, 0, + sizeof(zuc256_eea3_ooo->lens)); + memset(zuc256_eea3_ooo->job_in_lane, 0, + sizeof(zuc256_eea3_ooo->job_in_lane)); + zuc256_eea3_ooo->unused_lanes = 0xFF03020100; + zuc256_eea3_ooo->num_lanes_inuse = 0; + memset(&zuc256_eea3_ooo->state, 0, + sizeof(zuc256_eea3_ooo->state)); + zuc256_eea3_ooo->init_not_done = 0; + zuc256_eea3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc256_eia3_ooo->lens, 0xFF, + sizeof(zuc256_eia3_ooo->lens)); + memset(zuc256_eia3_ooo->job_in_lane, 0, + sizeof(zuc256_eia3_ooo->job_in_lane)); + zuc256_eia3_ooo->unused_lanes = 0xFF03020100; + zuc256_eia3_ooo->num_lanes_inuse = 0; + memset(&zuc256_eia3_ooo->state, 0, + sizeof(zuc256_eia3_ooo->state)); + zuc256_eia3_ooo->init_not_done = 0; + zuc256_eia3_ooo->unused_lane_bitmask = 0x0f; + + /* Init SNOW3G out-of-order fields */ + memset(snow3g_uea2_ooo->lens, 0, + sizeof(snow3g_uea2_ooo->lens)); + memset(snow3g_uea2_ooo->job_in_lane, 0, + sizeof(snow3g_uea2_ooo->job_in_lane)); + memset(snow3g_uea2_ooo->bits_fixup, 0, + sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); + snow3g_uea2_ooo->init_mask = 0; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; + snow3g_uea2_ooo->num_lanes_inuse = 0; + snow3g_uea2_ooo->init_done = 0; + memset(snow3g_uea2_ooo->ks, 0, + sizeof(snow3g_uea2_ooo->ks)); + snow3g_uea2_ooo->road_block = 0; + + memset(snow3g_uia2_ooo->lens, 0, + sizeof(snow3g_uia2_ooo->lens)); + memset(snow3g_uia2_ooo->job_in_lane, 0, + sizeof(snow3g_uia2_ooo->job_in_lane)); + memset(snow3g_uia2_ooo->bits_fixup, 0, + 
sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); + snow3g_uia2_ooo->init_mask = 0; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; + snow3g_uia2_ooo->num_lanes_inuse = 0; + snow3g_uia2_ooo->init_done = 0; + memset(snow3g_uia2_ooo->ks, 0, + sizeof(snow3g_uia2_ooo->ks)); + snow3g_uia2_ooo->road_block = 0; + return; +} + +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs) +{ +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return; + } +#endif + + /* reset error status */ + imb_set_errno(state, 0); + + state->features = cpu_feature_adjust(state->flags, + cpu_feature_detect()); + + /* Set architecture for future checks */ + state->used_arch = (uint32_t) IMB_ARCH_SVE256; + + if (!(state->features & IMB_FEATURE_AESNI)) { + init_mb_mgr_aarch64_no_aesni(state); + submit_job_zuc_eea3_aarch64 = submit_job_zuc_eea3_aarch64_no_aesni; + flush_job_zuc_eea3_aarch64 = flush_job_zuc_eea3_aarch64_no_aesni; + submit_job_zuc_eia3_aarch64 = submit_job_zuc_eia3_aarch64_no_aesni; + flush_job_zuc_eia3_aarch64 = flush_job_zuc_eia3_aarch64_no_aesni; + submit_job_zuc256_eea3_aarch64 = submit_job_zuc256_eea3_aarch64_no_aesni; + flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; + submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; + flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; + submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; + flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; + submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; + flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; + return; + } + + if (reset_mgrs) { + reset_ooo_mgrs(state); + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + } + + /* 
set AARCH64 handlers */ + state->get_next_job = get_next_job_aarch64_sve256; + state->submit_job = submit_job_aarch64_sve256; + state->submit_job_nocheck = submit_job_nocheck_aarch64_sve256; + state->get_completed_job = get_completed_job_aarch64_sve256; + state->flush_job = flush_job_aarch64_sve256; + state->queue_size = queue_size_aarch64_sve256; + + state->eea3_1_buffer = zuc_eea3_1_buffer_aarch64; + state->eea3_4_buffer = zuc_eea3_4_buffer_aarch64; + state->eea3_n_buffer = zuc_eea3_n_buffer_aarch64; + state->zuc256_eea3_1_buffer = zuc256_eea3_1_buffer_aarch64; + state->eia3_1_buffer = zuc_eia3_1_buffer_aarch64; + state->eia3_n_buffer = zuc_eia3_n_buffer_aarch64; + state->zuc256_eia3_1_buffer = zuc256_eia3_1_buffer_aarch64; + + state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_sve256; + state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_sve256; + state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64_sve256; + state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64_sve256; + state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64_sve256; + state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64_sve256; + state->snow3g_f8_4_buffer_multikey = snow3g_f8_4_buffer_multikey_aarch64_sve256; + state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_aarch64_sve256; + state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_aarch64_sve256; + state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64_sve256; + state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64_sve256; + state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64_sve256; +} + +void +init_mb_mgr_aarch64_sve256(IMB_MGR *state) +{ + IMB_ASSERT(state->features & IMB_FEATURE_SVE256); + init_mb_mgr_aarch64_sve256_internal(state, 1); +} +#include "mb_mgr_code_aarch64.h" diff --git a/lib/aarch64/mb_mgr_auto_aarch64.c b/lib/aarch64/mb_mgr_auto_aarch64.c index b4c0797e1eac1f0852135e62e991f74b6cd4a1a4..0bf93fe23b1d20af21a7be8baa3e8e8fbbc3f860 100644 --- 
a/lib/aarch64/mb_mgr_auto_aarch64.c +++ b/lib/aarch64/mb_mgr_auto_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************/ +#include #include "ipsec-mb.h" #include "cpu_feature.h" #include "noaesni.h" @@ -44,6 +45,7 @@ init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch) IMB_ARCH arch_detected = IMB_ARCH_NONE; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; /* reset error status */ imb_set_errno(state, 0); @@ -54,6 +56,11 @@ init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch) return; } #endif + if ((state->features & detect_sve256) == detect_sve256) { + init_mb_mgr_aarch64_sve256(state); + arch_detected = IMB_ARCH_SVE256; + goto init_mb_mgr_auto_ret; + } if ((state->features & detect_aarch64) == detect_aarch64) { init_mb_mgr_aarch64(state); arch_detected = IMB_ARCH_AARCH64; diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c index ce55bbfbfe691cd93be64abc2484345e67d58874..adb33784b0168d2ebe28501ad694e19019cdf414 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -31,12 +31,14 @@ #define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_common #define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_common #define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_common -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64 -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64 -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64 -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64 -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64 - +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64 +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64 +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64 +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64 #endif -#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" +#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define snow3gKeyStateMulti_t snow3gKeyState4_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" \ No newline at end of file diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c index 56eafb845a3cccd1c52f82b854b5eaf615e3fe8b..ebe852c3384785d2713a0a2654248f81725eb3b4 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -31,11 +31,14 @@ #define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_no_aesni #define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_no_aesni #define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64_no_aesni -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64_no_aesni -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64_no_aesni -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64_no_aesni +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_no_aesni #endif -#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" +#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define snow3gKeyStateMulti_t snow3gKeyState4_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" \ No newline at end of file diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..3392457e23c84ce9d4d15c398e9773958054350e --- /dev/null +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c @@ -0,0 +1,44 @@ +/********************************************************************** + Copyright(c) 2023 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#ifndef SUBMIT_JOB_SNOW3G_UEA2 +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_sve256 +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_sve256 +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_8_buffer_initialize_aarch64_sve256_asm +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_8_buffer_stream_aarch64_sve256_asm +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_8_buffer_keystream_aarch64_sve256_asm +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_sve256 +#endif + +#define SNOW3G_MB_MAX_LANES_SIMD 8 +#define snow3gKeyStateMulti_t snow3gKeyState8_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h index 0773c2c338af20743764871062bed0ea63a7d659..e55d39789a029fa2fc9cf73be1a8579a90761e63 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -35,11 +35,20 @@ #include #include #include +#ifdef SAFE_PARAM +#include "error.h" +#endif -#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define UNUSED_LANE_MASK_BITS 4 +#define UNUSED_LANE_MASK 0xF +#if SNOW3G_MB_MAX_LANES_SIMD == 4 #define INIT_DONE_MASK 0x0F +#elif SNOW3G_MB_MAX_LANES_SIMD == 8 +#define INIT_DONE_MASK 0xFF +#endif #define INIT_ALL_DONE INIT_DONE_MASK + #define JOB_IS_COMPLETED(state, i) \ (((state->job_in_lane[i]) != NULL) && (state->args.byte_length[i] == 0)) #define JOB_NOT_INITIALIZED(state, i) \ @@ -55,11 +64,34 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state); IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job); IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state); +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); + +void SNOW3G_F8_MULTI_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *ks); + +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + static void snow3g_mb_mgr_insert_uea2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job) { - uint64_t used_lane_idx = state->unused_lanes & 0xff; + uint64_t used_lane_idx = state->unused_lanes & UNUSED_LANE_MASK; assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD); - state->unused_lanes = state->unused_lanes >> 8; + state->unused_lanes = state->unused_lanes >> UNUSED_LANE_MASK_BITS; + state->num_lanes_inuse++; state->args.iv[used_lane_idx] = job->iv; state->args.keys[used_lane_idx] = job->enc_keys; state->args.in[used_lane_idx] = job->src + job->cipher_start_src_offset_in_bytes; 
@@ -73,9 +105,9 @@ static void snow3g_mb_mgr_insert_uea2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job static void snow3g_mb_mgr_insert_uia2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job) { - uint64_t used_lane_idx = state->unused_lanes & 0xff; + uint64_t used_lane_idx = state->unused_lanes & UNUSED_LANE_MASK; assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD); - state->unused_lanes = state->unused_lanes >> 8; + state->unused_lanes = state->unused_lanes >> UNUSED_LANE_MASK_BITS; state->num_lanes_inuse++; state->args.iv[used_lane_idx] = job->u.SNOW3G_UIA2._iv; state->args.keys[used_lane_idx] = job->u.SNOW3G_UIA2._key; @@ -83,7 +115,7 @@ static void snow3g_mb_mgr_insert_uia2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job state->args.out[used_lane_idx] = job->auth_tag_output; state->args.INITIALIZED[used_lane_idx] = 0; state->lens[used_lane_idx] = job->msg_len_to_hash_in_bits; - state->init_done = state->init_done & (~(1 << used_lane_idx) & 0xff); + state->init_done = state->init_done & (~(1 << used_lane_idx) & INIT_DONE_MASK); state->job_in_lane[used_lane_idx] = job; } @@ -97,30 +129,16 @@ static IMB_JOB *snow3g_mb_mgr_free_uea2_job(MB_MGR_SNOW3G_OOO *state) ret = state->job_in_lane[i]; ret->status |= IMB_STATUS_COMPLETED_CIPHER; state->job_in_lane[i] = NULL; - state->unused_lanes = state->unused_lanes << 8; + state->unused_lanes = state->unused_lanes << UNUSED_LANE_MASK_BITS; state->unused_lanes |= i; + state->num_lanes_inuse--; state->lens[i] = 0; state->args.INITIALIZED[i] = 0; #ifdef SAFE_DATA - state->args.LFSR_0[i] = 0; - state->args.LFSR_1[i] = 0; - state->args.LFSR_2[i] = 0; - state->args.LFSR_3[i] = 0; - state->args.LFSR_4[i] = 0; - state->args.LFSR_5[i] = 0; - state->args.LFSR_6[i] = 0; - state->args.LFSR_7[i] = 0; - state->args.LFSR_8[i] = 0; - state->args.LFSR_9[i] = 0; - state->args.LFSR_10[i] = 0; - state->args.LFSR_11[i] = 0; - state->args.LFSR_12[i] = 0; - state->args.LFSR_13[i] = 0; - state->args.LFSR_14[i] = 0; - state->args.LFSR_15[i] = 0; - 
state->args.FSM_1[i] = 0; - state->args.FSM_2[i] = 0; - state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } #endif break; } @@ -136,33 +154,18 @@ static IMB_JOB *snow3g_mb_mgr_free_uia2_job(MB_MGR_SNOW3G_OOO *state, int i) ret = state->job_in_lane[i]; ret->status |= IMB_STATUS_COMPLETED_AUTH; state->job_in_lane[i] = NULL; - state->unused_lanes = state->unused_lanes << 8; + state->unused_lanes = state->unused_lanes << UNUSED_LANE_MASK_BITS; state->unused_lanes |= i; state->num_lanes_inuse--; state->lens[i] = 0; state->args.INITIALIZED[i] = 0; - state->init_done = state->init_done & (~(1 << i) & 0xff); + state->init_done = state->init_done & (~(1 << i) & INIT_DONE_MASK); #ifdef SAFE_DATA - state->args.LFSR_0[i] = 0; - state->args.LFSR_1[i] = 0; - state->args.LFSR_2[i] = 0; - state->args.LFSR_3[i] = 0; - state->args.LFSR_4[i] = 0; - state->args.LFSR_5[i] = 0; - state->args.LFSR_6[i] = 0; - state->args.LFSR_7[i] = 0; - state->args.LFSR_8[i] = 0; - state->args.LFSR_9[i] = 0; - state->args.LFSR_10[i] = 0; - state->args.LFSR_11[i] = 0; - state->args.LFSR_12[i] = 0; - state->args.LFSR_13[i] = 0; - state->args.LFSR_14[i] = 0; - state->args.LFSR_15[i] = 0; - state->args.FSM_1[i] = 0; - state->args.FSM_2[i] = 0; - state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } for (int k = 0; k < 5; k++) { state->ks[i * 5 + k] = 0; } @@ -172,119 +175,35 @@ static IMB_JOB *snow3g_mb_mgr_free_uia2_job(MB_MGR_SNOW3G_OOO *state, int i) } __forceinline -void cpy_snow3g_state_to_ctx_1(snow3gKeyState1_t* ctx, MB_MGR_SNOW3G_OOO* state, const int num_lane) { - SNOW3G_ARGS args = state->args; - ctx->LFSR_S[0] = args.LFSR_0[num_lane]; - ctx->LFSR_S[1] = args.LFSR_1[num_lane]; - ctx->LFSR_S[2] = args.LFSR_2[num_lane]; - ctx->LFSR_S[3] = 
args.LFSR_3[num_lane]; - ctx->LFSR_S[4] = args.LFSR_4[num_lane]; - ctx->LFSR_S[5] = args.LFSR_5[num_lane]; - ctx->LFSR_S[6] = args.LFSR_6[num_lane]; - ctx->LFSR_S[7] = args.LFSR_7[num_lane]; - ctx->LFSR_S[8] = args.LFSR_8[num_lane]; - ctx->LFSR_S[9] = args.LFSR_9[num_lane]; - ctx->LFSR_S[10] = args.LFSR_10[num_lane]; - ctx->LFSR_S[11] = args.LFSR_11[num_lane]; - ctx->LFSR_S[12] = args.LFSR_12[num_lane]; - ctx->LFSR_S[13] = args.LFSR_13[num_lane]; - ctx->LFSR_S[14] = args.LFSR_14[num_lane]; - ctx->LFSR_S[15] = args.LFSR_15[num_lane]; - ctx->FSM_R1 = args.FSM_1[num_lane]; - ctx->FSM_R2 = args.FSM_2[num_lane]; - ctx->FSM_R3 = args.FSM_3[num_lane]; -} - -__forceinline -void cpy_snow3g_ctx_to_state_after_stream(MB_MGR_SNOW3G_OOO* state, snow3gKeyState4_t* ctx) { - SNOW3G_ARGS *args = &(state->args); - const uint32_t *pLFSR_0 = (const uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X]; - const uint32_t *pLFSR_1 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15]; - const uint32_t *pLFSR_2 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15]; - const uint32_t *pLFSR_3 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15]; - const uint32_t *pLFSR_4 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15]; - const uint32_t *pLFSR_5 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15]; - const uint32_t *pLFSR_6 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15]; - const uint32_t *pLFSR_7 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15]; - const uint32_t *pLFSR_8 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15]; - const uint32_t *pLFSR_9 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15]; - const uint32_t *pLFSR_10 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15]; - const uint32_t *pLFSR_11 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15]; - const uint32_t *pLFSR_12 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15]; - const uint32_t *pLFSR_13 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) 
& 15]; - const uint32_t *pLFSR_14 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15]; - const uint32_t *pLFSR_15 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15]; - const uint32_t *pFSM_1 = (const uint32_t *) &ctx->FSM_X[0]; - const uint32_t *pFSM_2 = (const uint32_t *) &ctx->FSM_X[1]; - const uint32_t *pFSM_3 = (const uint32_t *) &ctx->FSM_X[2]; - for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (!JOB_IS_COMPLETED(state, i)) { - args->LFSR_0[i] = pLFSR_0[i]; - args->LFSR_1[i] = pLFSR_1[i]; - args->LFSR_2[i] = pLFSR_2[i]; - args->LFSR_3[i] = pLFSR_3[i]; - args->LFSR_4[i] = pLFSR_4[i]; - args->LFSR_5[i] = pLFSR_5[i]; - args->LFSR_6[i] = pLFSR_6[i]; - args->LFSR_7[i] = pLFSR_7[i]; - args->LFSR_8[i] = pLFSR_8[i]; - args->LFSR_9[i] = pLFSR_9[i]; - args->LFSR_10[i] = pLFSR_10[i]; - args->LFSR_11[i] = pLFSR_11[i]; - args->LFSR_12[i] = pLFSR_12[i]; - args->LFSR_13[i] = pLFSR_13[i]; - args->LFSR_14[i] = pLFSR_14[i]; - args->LFSR_15[i] = pLFSR_15[i]; - args->FSM_1[i] = pFSM_1[i]; - args->FSM_2[i] = pFSM_2[i]; - args->FSM_3[i] = pFSM_3[i]; - } +void cpy_state_to_ctx1(snow3gKeyStateMulti_t* state, snow3gKeyState1_t* ctx, const int num_lane) { + uint32_t iLFSR_X = state->iLFSR_X; + uint32_t *src = (uint32_t *)&(state->LFSR_X[0]); + uint32_t *dst = (uint32_t *)&(ctx->LFSR_S[0]); + for (int i = 0; i < 16; i++) { + dst[i] = src[((i + iLFSR_X) % 16) * SNOW3G_MB_MAX_LANES_SIMD + num_lane]; + } + for (int i = 16; i < 19; i++) { + dst[i] = src[i * SNOW3G_MB_MAX_LANES_SIMD + num_lane]; } } __forceinline -void cpy_snow3g_state_to_ctx_after_initialize(snow3gKeyState4_t* ctx, MB_MGR_SNOW3G_OOO* state) { - SNOW3G_ARGS *args = &(state->args); - uint32_t *pLFSR_0 = (uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X]; - uint32_t *pLFSR_1 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15]; - uint32_t *pLFSR_2 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15]; - uint32_t *pLFSR_3 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15]; - uint32_t *pLFSR_4 = (uint32_t *) 
&ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15]; - uint32_t *pLFSR_5 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15]; - uint32_t *pLFSR_6 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15]; - uint32_t *pLFSR_7 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15]; - uint32_t *pLFSR_8 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15]; - uint32_t *pLFSR_9 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15]; - uint32_t *pLFSR_10 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15]; - uint32_t *pLFSR_11 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15]; - uint32_t *pLFSR_12 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15]; - uint32_t *pLFSR_13 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) & 15]; - uint32_t *pLFSR_14 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15]; - uint32_t *pLFSR_15 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15]; - uint32_t *pFSM_1 = (uint32_t *) &ctx->FSM_X[0]; - uint32_t *pFSM_2 = (uint32_t *) &ctx->FSM_X[1]; - uint32_t *pFSM_3 = (uint32_t *) &ctx->FSM_X[2]; +void cpy_newly_intialized_ctx_to_state(snow3gKeyStateMulti_t* new, MB_MGR_SNOW3G_OOO* state) { + snow3gKeyStateMulti_t* ctx = (snow3gKeyStateMulti_t *)&(state->args.LFSR_0[0]); + uint32_t* dst = (uint32_t *)&(ctx->LFSR_X[0]); + uint32_t* src = (uint32_t *)&(new->LFSR_X[0]); + uint32_t dst_iLFSR = ctx->iLFSR_X; + uint32_t src_iLFSR = new->iLFSR_X; for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (JOB_INITIALIZED(state, i)) { - pLFSR_0[i] = args->LFSR_0[i]; - pLFSR_1[i] = args->LFSR_1[i]; - pLFSR_2[i] = args->LFSR_2[i]; - pLFSR_3[i] = args->LFSR_3[i]; - pLFSR_4[i] = args->LFSR_4[i]; - pLFSR_5[i] = args->LFSR_5[i]; - pLFSR_6[i] = args->LFSR_6[i]; - pLFSR_7[i] = args->LFSR_7[i]; - pLFSR_8[i] = args->LFSR_8[i]; - pLFSR_9[i] = args->LFSR_9[i]; - pLFSR_10[i] = args->LFSR_10[i]; - pLFSR_11[i] = args->LFSR_11[i]; - pLFSR_12[i] = args->LFSR_12[i]; - pLFSR_13[i] = args->LFSR_13[i]; - pLFSR_14[i] = args->LFSR_14[i]; - pLFSR_15[i] = args->LFSR_15[i]; - pFSM_1[i] = 
args->FSM_1[i]; - pFSM_2[i] = args->FSM_2[i]; - pFSM_3[i] = args->FSM_3[i]; + if (JOB_NOT_INITIALIZED(state, i)) { + for (int j = 0; j < (16 + 3); j++) { + dst[((j + dst_iLFSR) % 16) * SNOW3G_MB_MAX_LANES_SIMD + i] = + src[((j + src_iLFSR) % 16) * SNOW3G_MB_MAX_LANES_SIMD + i]; + } + for (int j = 16; j < 19; j++) { + dst[j * SNOW3G_MB_MAX_LANES_SIMD + i] = src[j * SNOW3G_MB_MAX_LANES_SIMD + i]; + } + state->args.INITIALIZED[i] = 1; } } } @@ -292,6 +211,35 @@ void cpy_snow3g_state_to_ctx_after_initialize(snow3gKeyState4_t* ctx, MB_MGR_SNO IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, IMB_JOB *job) { +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->enc_keys == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + if (job->iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->dst == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return NULL; + } + if ((job->msg_len_to_cipher_in_bits == 0) || + (job->msg_len_to_cipher_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return NULL; + } +#endif + MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uea2_ooo; uint32_t msg_bitlen = job->msg_len_to_cipher_in_bits; uint32_t msg_bitoff = job->cipher_start_src_offset_in_bits; @@ -315,52 +263,41 @@ IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, if (ret != NULL) return ret; - if (snow3g_state->unused_lanes != 0xff) + if(snow3g_state->num_lanes_inuse < SNOW3G_MB_MAX_LANES_SIMD) return NULL; uint32_t min_word_len = UINT32_MAX; - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); + snow3gKeyStateMulti_t *pCtx = (snow3gKeyStateMulti_t 
*)&(snow3g_state->args.LFSR_0[0]); + snow3gKeyStateMulti_t tmp_ctx; - cpy_snow3g_state_to_ctx_after_initialize(&ctx, snow3g_state); + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&tmp_ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void**)snow3g_state->args.iv); + cpy_newly_intialized_ctx_to_state(&tmp_ctx, snow3g_state); for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (JOB_NOT_INITIALIZED(snow3g_state, i)) { - snow3g_state->args.INITIALIZED[i] = 1; - } min_word_len = (min_word_len < snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES) ? min_word_len : snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES; } - SNOW3G_F8_4_BUFFER_STREAM(&ctx, - snow3g_state->args.in[0],snow3g_state->args.out[0], - snow3g_state->args.in[1],snow3g_state->args.out[1], - snow3g_state->args.in[2],snow3g_state->args.out[2], - snow3g_state->args.in[3],snow3g_state->args.out[3], - min_word_len * SNOW3G_4_BYTES); + SNOW3G_F8_MULTI_BUFFER_STREAM_JOB(pCtx, + (const void **)snow3g_state->args.in, + (void **)snow3g_state->args.out, + min_word_len * SNOW3G_4_BYTES); for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - snow3g_state->args.in[i] = (uint8_t *)snow3g_state->args.in[i] + - min_word_len * SNOW3G_4_BYTES; - snow3g_state->args.out[i] = (uint8_t *)snow3g_state->args.out[i] + - min_word_len * SNOW3G_4_BYTES; snow3g_state->args.byte_length[i] -= min_word_len * SNOW3G_4_BYTES; } - cpy_snow3g_ctx_to_state_after_stream(snow3g_state, &ctx); - for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - //if less than one word left, finish job here. + // if less than one word left, finish job here. 
if (snow3g_state->args.byte_length[i] < SNOW3G_4_BYTES && snow3g_state->args.byte_length[i] != 0) { snow3gKeyState1_t ctx_1; - cpy_snow3g_state_to_ctx_1(&ctx_1, snow3g_state, i); - SNOW3G_F8_1_BUFFER_STREAM(&ctx_1, snow3g_state->args.in[i], - snow3g_state->args.out[i], - snow3g_state->args.byte_length[i]); + cpy_state_to_ctx1(pCtx, &ctx_1, i); + SNOW3G_F8_1_BUFFER_STREAM_JOB(&ctx_1, snow3g_state->args.in[i], + snow3g_state->args.out[i], + snow3g_state->args.byte_length[i]); snow3g_state->args.byte_length[i] = 0; } } @@ -368,7 +305,7 @@ IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, ret = snow3g_mb_mgr_free_uea2_job(snow3g_state); #ifdef SAFE_DATA - //data has been cleard in snow3g_mb_mgr_free_uea2_job. + // data has been cleared in snow3g_mb_mgr_free_uea2_job. #endif return ret; @@ -389,7 +326,7 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) ret = snow3g_state->job_in_lane[i]; if (JOB_NOT_INITIALIZED(snow3g_state, i)) { - //if not initialized + // if not initialized IMB_SNOW3G_F8_1_BUFFER(state, snow3g_state->args.keys[i], snow3g_state->args.iv[i], snow3g_state->args.in[i], @@ -397,39 +334,26 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) snow3g_state->args.byte_length[i]); } else { snow3gKeyState1_t ctx; - cpy_snow3g_state_to_ctx_1(&ctx, snow3g_state, i); - SNOW3G_F8_1_BUFFER_STREAM(&ctx, snow3g_state->args.in[i], - snow3g_state->args.out[i], - snow3g_state->args.byte_length[i]); + snow3gKeyStateMulti_t* state = (snow3gKeyStateMulti_t*)&(snow3g_state->args.LFSR_0[0]); + cpy_state_to_ctx1(state, &ctx, i); + SNOW3G_F8_1_BUFFER_STREAM_JOB(&ctx, snow3g_state->args.in[i], + snow3g_state->args.out[i], + snow3g_state->args.byte_length[i]); } ret->status |= IMB_STATUS_COMPLETED_CIPHER; snow3g_state->lens[i] = 0; snow3g_state->job_in_lane[i] = NULL; - snow3g_state->unused_lanes = snow3g_state->unused_lanes << 8; + snow3g_state->unused_lanes = snow3g_state->unused_lanes << UNUSED_LANE_MASK_BITS; snow3g_state->unused_lanes |= i; + snow3g_state->num_lanes_inuse--;
snow3g_state->args.byte_length[i] = 0; snow3g_state->args.INITIALIZED[i] = 0; #ifdef SAFE_DATA - snow3g_state->args.LFSR_0[i] = 0; - snow3g_state->args.LFSR_1[i] = 0; - snow3g_state->args.LFSR_2[i] = 0; - snow3g_state->args.LFSR_3[i] = 0; - snow3g_state->args.LFSR_4[i] = 0; - snow3g_state->args.LFSR_5[i] = 0; - snow3g_state->args.LFSR_6[i] = 0; - snow3g_state->args.LFSR_7[i] = 0; - snow3g_state->args.LFSR_8[i] = 0; - snow3g_state->args.LFSR_9[i] = 0; - snow3g_state->args.LFSR_10[i] = 0; - snow3g_state->args.LFSR_11[i] = 0; - snow3g_state->args.LFSR_12[i] = 0; - snow3g_state->args.LFSR_13[i] = 0; - snow3g_state->args.LFSR_14[i] = 0; - snow3g_state->args.LFSR_15[i] = 0; - snow3g_state->args.FSM_1[i] = 0; - snow3g_state->args.FSM_2[i] = 0; - snow3g_state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(snow3g_state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } #endif return ret; } @@ -440,36 +364,59 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job) { +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->u.SNOW3G_UIA2._key == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + if (job->u.SNOW3G_UIA2._iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->auth_tag_output == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return NULL; + } + if ((job->msg_len_to_hash_in_bits == 0) || + (job->msg_len_to_hash_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return NULL; + } +#endif MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo; IMB_JOB *ret = NULL; snow3g_mb_mgr_insert_uia2_job(snow3g_state, job); - if (snow3g_state->unused_lanes != 0xff) + if (snow3g_state->num_lanes_inuse < SNOW3G_MB_MAX_LANES_SIMD) return 
NULL; if (snow3g_state->init_done == 0) { - //all lanes are not initialized. - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, - snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); - SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx, - &snow3g_state->ks[0*5], - &snow3g_state->ks[1*5], - &snow3g_state->ks[2*5], - &snow3g_state->ks[3*5]); + // all lanes are not initialized. + snow3gKeyStateMulti_t ctx; + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void**)snow3g_state->args.iv); + SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(&ctx, + snow3g_state->ks); snow3g_state->init_done = INIT_ALL_DONE; } for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { if (snow3g_state->init_done & (1 << i)) { - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i], - snow3g_state->lens[i], snow3g_state->args.out[i]); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[i*5], snow3g_state->args.in[i], + snow3g_state->lens[i], snow3g_state->args.out[i]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i); break; } @@ -483,14 +430,14 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state) MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo; if (snow3g_state->num_lanes_inuse == 0) { - //empty + // empty return NULL; } for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { if (snow3g_state->init_done & (1<<i)) { - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i], - snow3g_state->lens[i], snow3g_state->args.out[i]); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[i*5], snow3g_state->args.in[i], + snow3g_state->lens[i], snow3g_state->args.out[i]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i); return ret; } @@ -503,29 +450,24 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state) } } for (int i = 
0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - //copy keys and ivs to empty lane + // copy keys and ivs to empty lane if (JOB_IS_NULL(snow3g_state, i)) { snow3g_state->args.keys[i] = snow3g_state->args.keys[lane_idx]; snow3g_state->args.iv[i] = snow3g_state->args.iv[lane_idx]; } } - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, - snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); - SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx, - &snow3g_state->ks[0*5], - &snow3g_state->ks[1*5], - &snow3g_state->ks[2*5], - &snow3g_state->ks[3*5]); - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[lane_idx*5], snow3g_state->args.in[lane_idx], - snow3g_state->lens[lane_idx], snow3g_state->args.out[lane_idx]); + snow3gKeyStateMulti_t ctx; + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void **)snow3g_state->args.iv); + SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(&ctx, + snow3g_state->ks); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[lane_idx*5], snow3g_state->args.in[lane_idx], + snow3g_state->lens[lane_idx], snow3g_state->args.out[lane_idx]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, lane_idx); return ret; } -#endif //MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H +#endif // MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H diff --git a/lib/aarch64/snow3g_aarch64.c b/lib/aarch64/snow3g_aarch64.c index 4b4172fd6f2380a05fd592295c4ac8d8ef2fa986..deccbb380d46bb68ae3cfab0d7f840df3d0ce0a6 100644 --- a/lib/aarch64/snow3g_aarch64.c +++ b/lib/aarch64/snow3g_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -38,10 +38,9 @@ #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64 #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64 #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64 -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64 -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64 -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64 -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64 -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64 - +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64 +#define SNOW3G_F8_4_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64 +#define SNOW3G_F8_4_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64 +#define SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64 #include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_no_aesni.c b/lib/aarch64/snow3g_aarch64_no_aesni.c index f5a9e589bd07fc3f240ac2894efcfa48d0a9768f..995c0202fc2b5403aa629419974c61144f77af39 100644 --- a/lib/aarch64/snow3g_aarch64_no_aesni.c +++ b/lib/aarch64/snow3g_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -40,10 +40,10 @@ #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_no_aesni #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_no_aesni #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64_no_aesni -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64_no_aesni -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64_no_aesni -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_no_aesni +#define SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64_no_aesni #include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_sve256.c b/lib/aarch64/snow3g_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..ef71297353e9d82e86a44713ba79dbc2956f5f29 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64_sve256.c @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define AARCH64_SVE256 +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64_sve256 +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64_sve256 +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64_sve256 +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64_sve256 +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER_MULTIKEY snow3g_f8_4_buffer_multikey_aarch64_sve256 +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64_sve256 +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_sve256 +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_sve256 +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_sve256 +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER_ASM snow3g_f8_4_buffer_aarch64_neon_asm +#define SNOW3G_F8_8_BUFFER_ASM snow3g_f8_8_buffer_aarch64_sve256_asm +#define SNOW3G_F8_8_BUFFER_MULTIKEY_ASM snow3g_f8_8_buffer_multikey_aarch64_sve256_asm + + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_common_aarch64.h b/lib/aarch64/snow3g_common_aarch64.h index c4c60c21add0eaaf8b733a8dd011319a276deac0..e565a55ac5a87a3f33f68772447dbbeae5b3dd31 100644 --- a/lib/aarch64/snow3g_common_aarch64.h +++ b/lib/aarch64/snow3g_common_aarch64.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -50,6 +50,40 @@ #define CLEAR_MEM clear_mem #define CLEAR_VAR clear_var +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F8_4_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); + +void SNOW3G_F8_4_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +void SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *keystream); + +void SNOW3G_F8_8_BUFFER_ASM(const snow3g_key_schedule_t *key, + const void **iv, + const void **in, + void **out, + uint32_t lengthInBytes[]); + +void SNOW3G_F8_8_BUFFER_MULTIKEY_ASM(const snow3g_key_schedule_t **key, + const void **iv, + const void **in, + void **out, + uint32_t lengthInBytes[]); + /** * @brief Wrapper for safe lookup of 16 indexes in 256x8-bit table * @param[in] indexes vector of 16x8-bit indexes to be looked up @@ -1735,6 +1769,79 @@ static inline void snow3gStateConvert_4(const snow3gKeyState4_t *pSrcState, pDstState->FSM_R3 = pFSM_X2[NumBuffer]; } +static inline void sortLanesByLength(const snow3g_key_schedule_t** pCtxBuf, + const void** pIV, + const void** pSrcBuf, + void** pDstBuf, + uint32_t* lensBuf, + const uint32_t packet_count) +{ + uint32_t packet_index, inner_index; + uint32_t sortNeeded = 0, tempLen = 0; + const void *srctempbuff; + void *dsttempbuff; + const void *ivtempbuff; + const snow3g_key_schedule_t *tempCtx; + + packet_index = packet_count; + while (packet_index--) { + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly 
sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + + packet_index = packet_count; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + + if (pCtxBuf != NULL) { + tempCtx = pCtxBuf[packet_index]; + pCtxBuf[packet_index] = + pCtxBuf[inner_index]; + pCtxBuf[inner_index] = tempCtx; + } + + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ +} + /** * @brief Provides size of key schedule structure * @return Key schedule structure in bytes @@ -2204,8 +2311,8 @@ void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle, const size_t num_lanes = 4; snow3gKeyState4_t ctx; uint32_t lenInBytes[4]; - uint8_t *pBufferOut[4]; - const uint8_t *pBufferIn[4]; + void *pBufferOut[4]; + const void * pBufferIn[4]; uint32_t bytes, qwords, i; length_copy_4(lenInBytes, lengthInBytes1, lengthInBytes2, @@ -2355,6 +2462,27 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], if (!length_check(lengthInBytes, num_lanes)) return; #endif +#ifdef 
AARCH64_SVE256 + const void *pSrcBuf[NUM_PACKETS_8] = {NULL}; + void *pDstBuf[NUM_PACKETS_8] = {NULL}; + const void *pIV[NUM_PACKETS_8] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_8] = {0}; + const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_8] = {NULL}; + + memcpy((void *)lensBuf, lengthInBytes, NUM_PACKETS_8 * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, BufferIn, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pDstBuf, BufferOut, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pIV, IV, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pCtxBuf, pKey, NUM_PACKETS_8 * sizeof(void *)); + + sortLanesByLength(pCtxBuf, pIV, pSrcBuf, pDstBuf, lensBuf, NUM_PACKETS_8); + SNOW3G_F8_8_BUFFER_MULTIKEY_ASM(pCtxBuf, + pIV, + pSrcBuf, + pDstBuf, + lensBuf); + +#else SNOW3G_F8_4_BUFFER_MULTIKEY(pKey[0], pKey[1], pKey[2], pKey[3], IV[0], IV[1], IV[2], IV[3], BufferIn[0], BufferOut[0], lengthInBytes[0], @@ -2368,6 +2496,7 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], BufferIn[5], BufferOut[5], lengthInBytes[5], BufferIn[6], BufferOut[6], lengthInBytes[6], BufferIn[7], BufferOut[7], lengthInBytes[7]); +#endif #ifdef SAFE_DATA CLEAR_SCRATCH_GPS(); CLEAR_SCRATCH_SIMD_REGS(); @@ -2451,9 +2580,9 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, const uint32_t lenInBytes8) { uint32_t lengthInBytes[8]; - const uint8_t *pBufferIn[8]; + const void *pBufferIn[8]; const void *pIV[8]; - uint8_t *pBufferOut[8]; + void *pBufferOut[8]; length_copy_8(lengthInBytes, lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4, @@ -2491,7 +2620,26 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, if (!length_check(lengthInBytes, num_lanes)) return; #endif +#ifdef AARCH64_SVE256 + const void *pSrcBuf[NUM_PACKETS_8] = {NULL}; + void *pDstBuf[NUM_PACKETS_8] = {NULL}; + const void *tmpIV[NUM_PACKETS_8] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_8] = {0}; + + memcpy((void *)lensBuf, lengthInBytes, NUM_PACKETS_8 * sizeof(uint32_t)); + memcpy((void 
*)pSrcBuf, pBufferIn, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)tmpIV, pIV, NUM_PACKETS_8 * sizeof(void *)); + sortLanesByLength(NULL, tmpIV, pSrcBuf, pDstBuf, lensBuf, NUM_PACKETS_8); + + SNOW3G_F8_8_BUFFER_ASM(pHandle, + tmpIV, + pSrcBuf, + pDstBuf, + lensBuf); + +#else SNOW3G_F8_4_BUFFER(pHandle, pIV[0], pIV[1], pIV[2], pIV[3], pBufferIn[0], pBufferOut[0], lengthInBytes[0], @@ -2505,6 +2653,7 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, pBufferIn[5], pBufferOut[5], lengthInBytes[5], pBufferIn[6], pBufferOut[6], lengthInBytes[6], pBufferIn[7], pBufferOut[7], lengthInBytes[7]); +#endif } /** @@ -2559,14 +2708,10 @@ void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, return; } - uint32_t packet_index, inner_index, pktCnt = packetCount; - int sortNeeded = 0, tempLen = 0; - uint8_t *srctempbuff; - uint8_t *dsttempbuff; - uint8_t *ivtempbuff; - uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t packet_index, pktCnt = packetCount; + const void *pSrcBuf[NUM_PACKETS_16] = {NULL}; + void *pDstBuf[NUM_PACKETS_16] = {NULL}; + const void *pIV[NUM_PACKETS_16] = {NULL}; uint32_t lensBuf[NUM_PACKETS_16] = {0}; memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); @@ -2574,63 +2719,48 @@ void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); memcpy((void *)pIV, IV, packetCount * sizeof(void *)); - packet_index = packetCount; - - while (packet_index--) { - - /* check if all packets are sorted by decreasing length */ - if (packet_index > 0 && lensBuf[packet_index - 1] < - lensBuf[packet_index]) { - /* this packet array is not correctly sorted */ - sortNeeded = 1; - } - } - - if (sortNeeded) { - - /* sort packets in decreasing buffer size from [0] to - [n]th packet, ** where buffer[0] will contain 
longest - buffer and buffer[n] will contain the shortest buffer. - 4 arrays are swapped : - - pointers to input buffers - - pointers to output buffers - - pointers to input IV's - - input buffer lengths */ - packet_index = packetCount; - while (packet_index--) { - - inner_index = packet_index; - while (inner_index--) { - - if (lensBuf[packet_index] > - lensBuf[inner_index]) { - - /* swap buffers to arrange in - descending order from [0]. */ - srctempbuff = pSrcBuf[packet_index]; - dsttempbuff = pDstBuf[packet_index]; - ivtempbuff = pIV[packet_index]; - tempLen = lensBuf[packet_index]; - - pSrcBuf[packet_index] = - pSrcBuf[inner_index]; - pDstBuf[packet_index] = - pDstBuf[inner_index]; - pIV[packet_index] = pIV[inner_index]; - lensBuf[packet_index] = - lensBuf[inner_index]; - - pSrcBuf[inner_index] = srctempbuff; - pDstBuf[inner_index] = dsttempbuff; - pIV[inner_index] = ivtempbuff; - lensBuf[inner_index] = tempLen; - } - } /* for inner packet index (inner bubble-sort) */ - } /* for outer packet index (outer bubble-sort) */ - } /* if sortNeeded */ + sortLanesByLength(NULL, pIV, pSrcBuf, pDstBuf, lensBuf, packetCount); packet_index = 0; +#ifdef AARCH64_SVE256 /* process 8 buffers at-a-time */ + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pIV[packet_index + 4], + pIV[packet_index + 5], + pIV[packet_index + 6], + pIV[packet_index + 7], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3], + pSrcBuf[packet_index + 4], + pDstBuf[packet_index + 4], + lensBuf[packet_index + 4], + pSrcBuf[packet_index + 5], + pDstBuf[packet_index + 5], + lensBuf[packet_index + 5], + 
pSrcBuf[packet_index + 6], + pDstBuf[packet_index + 6], + lensBuf[packet_index + 6], + pSrcBuf[packet_index + 7], + pDstBuf[packet_index + 7], + lensBuf[packet_index + 7]); + packet_index += 8; + } +#endif /* process 4 buffers at-a-time */ while (pktCnt >= 4) { pktCnt -= 4; @@ -2726,17 +2856,12 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], return; } - uint32_t packet_index, inner_index, pktCnt = packetCount; - int sortNeeded = 0, tempLen = 0; - uint8_t *srctempbuff; - uint8_t *dsttempbuff; - uint8_t *ivtempbuff; - snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t packet_index, pktCnt = packetCount; + const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + const void *pSrcBuf[NUM_PACKETS_16] = {NULL}; + void *pDstBuf[NUM_PACKETS_16] = {NULL}; + const void *pIV[NUM_PACKETS_16] = {NULL}; uint32_t lensBuf[NUM_PACKETS_16] = {0}; - snow3g_key_schedule_t *tempCtx; memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); @@ -2744,60 +2869,7 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); memcpy((void *)pIV, IV, packetCount * sizeof(void *)); - packet_index = packetCount; - - while (packet_index--) { - - /* check if all packets are sorted by decreasing length */ - if (packet_index > 0 && lensBuf[packet_index - 1] < - lensBuf[packet_index]) { - /* this packet array is not correctly sorted */ - sortNeeded = 1; - } - } - - if (sortNeeded) { - /* sort packets in decreasing buffer size from [0] to [n]th - packet, where buffer[0] will contain longest buffer and - buffer[n] will contain the shortest buffer. 
- 4 arrays are swapped : - - pointers to input buffers - - pointers to output buffers - - pointers to input IV's - - input buffer lengths */ - packet_index = packetCount; - while (packet_index--) { - inner_index = packet_index; - while (inner_index--) { - if (lensBuf[packet_index] > - lensBuf[inner_index]) { - /* swap buffers to arrange in - descending order from [0]. */ - srctempbuff = pSrcBuf[packet_index]; - dsttempbuff = pDstBuf[packet_index]; - ivtempbuff = pIV[packet_index]; - tempLen = lensBuf[packet_index]; - tempCtx = pCtxBuf[packet_index]; - - pSrcBuf[packet_index] = - pSrcBuf[inner_index]; - pDstBuf[packet_index] = - pDstBuf[inner_index]; - pIV[packet_index] = pIV[inner_index]; - lensBuf[packet_index] = - lensBuf[inner_index]; - pCtxBuf[packet_index] = - pCtxBuf[inner_index]; - - pSrcBuf[inner_index] = srctempbuff; - pDstBuf[inner_index] = dsttempbuff; - pIV[inner_index] = ivtempbuff; - lensBuf[inner_index] = tempLen; - pCtxBuf[inner_index] = tempCtx; - } - } /* for inner packet index (inner bubble-sort) */ - } /* for outer packet index (outer bubble-sort) */ - } /* if sortNeeded */ + sortLanesByLength(pCtxBuf, pIV, pSrcBuf, pDstBuf, lensBuf, packetCount); packet_index = 0; /* process 8 buffers at-a-time */ @@ -2814,52 +2886,25 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], } } +#ifndef AARCH64_SVE256 /** * @brief Initializes the four keys for SNOW3G f8/f9. + * Only called by JOB API. 
* * @param [in/out] pCtx Pointer to snow3g state - * @param [in] pKeySched1 Key1 schedule - * @param [in] pKeySched2 Key2 schedule - * @param [in] pKeySched3 Key3 schedule - * @param [in] pKeySched4 Key4 schedule - * @param [in] pIV1 IV for buffer 1 - * @param [in] pIV2 IV for buffer 2 - * @param [in] pIV3 IV for buffer 3 - * @param [in] pIV4 IV for buffer 4 + * @param [in] pKeySched pointer to key schedule + * @param [in] pIV pointer to IV */ void -SNOW3G_F8_4_BUFFER_INITIALIZE(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4) +SNOW3G_F8_4_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV) { -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - } - if (pKeySched1 == NULL || pKeySched2 == NULL || - pKeySched3 == NULL || pKeySched4 == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); - return; - } - if ((pIV1 == NULL) || pIV2 == NULL || - (pIV3 == NULL) || (pIV4 == NULL)) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } -#endif /* Initialize the schedule from the IV */ snow3gStateInitialize_4_multikey((snow3gKeyState4_t *)pCtx, - pKeySched1, pKeySched2, - pKeySched3, pKeySched4, - pIV1, pIV2, pIV3, pIV4); + pKeySched[0], pKeySched[1], + pKeySched[2], pKeySched[3], + pIV[0], pIV[1], pIV[2], pIV[3]); /* Clock FSM and LFSR once, ignore the key stream */ (void) snow3g_keystream_4_4((snow3gKeyState4_t *)pCtx); @@ -2869,58 +2914,21 @@ SNOW3G_F8_4_BUFFER_INITIALIZE(void *pCtx, /** * @brief Four buffer F8 encrypt/decrypt after initialize. + * Only called by JOB API. 
* * @param[in/out] pCtx pointer to snow3g state - * @param[in] pBufferIn1 pointer to an input buffer - * @param[out] pBufferOut1 pointer to an output buffer - * @param[in] pBufferIn2 pointer to an input buffer - * @param[out] pBufferOut2 pointer to an output buffer - * @param[in] pBufferIn3 pointer to an input buffer - * @param[out] pBufferOut3 pointer to an output buffer - * @param[in] pBufferIn4 pointer to an input buffer - * @param[out] pBufferOut4 pointer to an output buffer + * @param[in] pBufferIn pointer to an input buffer array + * @param[out] pBufferOut pointer to an output buffer array + * @param[in] lengthInBytes message length in bytes */ -void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, - const void *pBufferIn1, - void *pBufferOut1, - const void *pBufferIn2, - void *pBufferOut2, - const void *pBufferIn3, - void *pBufferOut3, - const void *pBufferIn4, - void *pBufferOut4, - const uint32_t lengthInBytes) +void SNOW3G_F8_4_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes) { const uint32_t num_lanes = 4; snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx; uint32_t words; - uint8_t *pBufferOut[4]; - const uint8_t *pBufferIn[4]; - - cptr_copy_4((const void **)pBufferIn, - pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4); - - ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2, - pBufferOut3, pBufferOut4); -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (!cptr_check((const void * const *)pBufferIn, - num_lanes, - IMB_ERR_NULL_SRC)) - return; - if (!ptr_check((void **)pBufferOut, num_lanes, IMB_ERR_NULL_DST)) - return; - if ((lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } -#endif #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); @@ -2949,9 +2957,11 @@ void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, 
CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ } +#endif /** * @brief One buffer F8 encrypt/decrypt after initialize. + * Only called by JOB API. * * One packet enc/dec after initialize. * @@ -2960,32 +2970,11 @@ void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, * @param[out] pBufferOut pointer to an output buffer * @param[in] lengthInBytes length in bytes */ -void SNOW3G_F8_1_BUFFER_STREAM(void *pCtx, - const void *pBufferIn, - void *pBufferOut, - const uint32_t lengthInBytes) +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes) { -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - if ((lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } -#endif f8_snow3g((snow3gKeyState1_t *)pCtx, pBufferIn, pBufferOut, lengthInBytes); #ifdef SAFE_DATA CLEAR_SCRATCH_GPS(); @@ -3048,7 +3037,7 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, /*Generate 5 key stream words*/ snow3g_f9_keystream_words(&ctx, &z[0]); - SNOW3G_F9_1_BUFFER_DIGEST(z, pBufferIn, lengthInBits, pDigest); + SNOW3G_F9_1_BUFFER_DIGEST_JOB(z, pBufferIn, lengthInBits, pDigest); #ifdef SAFE_DATA CLEAR_MEM(&z, sizeof(z)); @@ -3068,33 +3057,11 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, * @param[in] lengthInBits message length in bits * @param[out] pDigest pointer to store the F9 digest */ -void SNOW3G_F9_1_BUFFER_DIGEST(const uint32_t z[5], - const void *pBufferIn, - const uint64_t lengthInBits, - void *pDigest) +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest) { -#ifdef SAFE_PARAM - /* 
reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (z == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH_KEY); - return; - } - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - if (pDigest == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - if ((lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN)) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } -#endif - #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ @@ -3235,37 +3202,25 @@ void SNOW3G_F9_1_BUFFER_DIGEST(const uint32_t z[5], #endif /* SAFE_DATA */ } +#ifndef AARCH64_SVE256 /** * @brief Four buffer F9 keystream generation. + * Only called by JOB API. * - * @param[in/out] pCtx pointer to snow3g state - * @param[out] ks1 pointer to output keystream1 - * @param[out] ks2 pointer to output keystream2 - * @param[out] ks3 pointer to output keystream3 - * @param[out] ks4 pointer to output keystream4 + * @param[in/out] pCtx pointer to snow3g state + * @param[out] keystream pointer to output keystream */ -void SNOW3G_F9_4_BUFFER_KEYSTREAM(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]) + +void SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *keystream) { snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx; -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } - if (ks1 == NULL || ks2 == NULL || ks3 == NULL || ks4 == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH_KEY); - return; - } -#endif + uint32_t *ks1, *ks2, *ks3, *ks4; + ks1 = keystream; + ks2 = keystream + 5; + ks3 = keystream + 10; + ks4 = keystream + 15; #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); @@ -3284,5 +3239,6 @@ void SNOW3G_F9_4_BUFFER_KEYSTREAM(void *pCtx, CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ } +#endif #endif /* SNOW3G_COMMON_H */ diff --git 
a/lib/aarch64/snow3g_impl_aarch64_sve256.S b/lib/aarch64/snow3g_impl_aarch64_sve256.S new file mode 100644 index 0000000000000000000000000000000000000000..42934f3848b88876558b95fd57a8e48398617db3 --- /dev/null +++ b/lib/aarch64/snow3g_impl_aarch64_sve256.S @@ -0,0 +1,1532 @@ +/******************************************************************************* + Copyright (c) 2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +.arch armv8-a+sve+crypto + +#define VECTOR_LEN 32 + +.section .data + +.align 8 +.type snow3g_invSR_SQ, %object +snow3g_invSR_SQ: +.byte 0xC2, 0xA6, 0x8F, 0x0A, 0x0D, 0xBE, 0xA7, 0x08 +.byte 0x1D, 0x99, 0x45, 0x59, 0x13, 0xD2, 0x11, 0x9F +.byte 0xAE, 0xE6, 0xD4, 0xA4, 0x92, 0x8D, 0x58, 0xC1 +.byte 0xD0, 0x97, 0xC8, 0x84, 0x9D, 0x4F, 0xBC, 0x3B +.byte 0x2D, 0xEB, 0x27, 0x53, 0x72, 0x4E, 0xE3, 0xEE +.byte 0xDA, 0x7F, 0xAA, 0x4D, 0x5C, 0x2F, 0x44, 0xDB +.byte 0x3E, 0x3A, 0x67, 0xC5, 0xC3, 0x6A, 0x16, 0x4C +.byte 0x38, 0xCC, 0xD7, 0xDD, 0x70, 0x62, 0xF2, 0x19 +.byte 0x10, 0x09, 0x98, 0x4B, 0x61, 0xC9, 0x86, 0x03 +.byte 0xA8, 0x6B, 0x5A, 0x33, 0x6E, 0x54, 0x5D, 0x8C +.byte 0x41, 0x1A, 0xF7, 0xF6, 0x82, 0xC6, 0xF8, 0x80 +.byte 0xC0, 0xC7, 0xFE, 0xB3, 0x65, 0x2C, 0x7B, 0xBA +.byte 0xB4, 0xFC, 0x2A, 0x22, 0x0C, 0x73, 0xF5, 0x5F +.byte 0x64, 0x68, 0x2E, 0x94, 0xB2, 0x24, 0x35, 0x14 +.byte 0x78, 0xFB, 0xBF, 0x48, 0xDE, 0xED, 0x43, 0x07 +.byte 0xB6, 0x32, 0xE4, 0xBD, 0x74, 0x7D, 0x57, 0x46 +.byte 0x3C, 0x37, 0xC4, 0xB7, 0x51, 0x8A, 0xF3, 0x55 +.byte 0x6C, 0xCF, 0x79, 0xAB, 0x77, 0xA3, 0xE1, 0x93 +.byte 0xD5, 0x6D, 0x81, 0x5B, 0x2B, 0x9A, 0x7E, 0x8B +.byte 0x04, 0xB5, 0x85, 0xD3, 0x91, 0xA1, 0x47, 0x52 +.byte 0xA5, 0xEC, 0xD6, 0xBB, 0x20, 0x87, 0x26, 0xF0 +.byte 0xAF, 0x4A, 0x89, 0xF4, 0xCE, 0x25, 0xCB, 0x50 +.byte 0x00, 0x3F, 0xD9, 0x42, 0x90, 0x21, 0x3D, 0xA9 +.byte 0xE7, 0x29, 0x01, 0xF1, 0x36, 0x5E, 0xFA, 0xCD +.byte 0xE5, 0x31, 0x1B, 0x05, 0xFD, 0x9E, 0xA0, 0x76 +.byte 0x30, 0xB1, 0x75, 0xB0, 0x9B, 0x56, 0xEA, 0x1C +.byte 0xEF, 0x06, 0x69, 0x7A, 0x95, 0x88, 0x15, 0xFF +.byte 0xCA, 0xAC, 0x0E, 0x23, 0xD8, 0x0F, 0x28, 0x0B +.byte 0x18, 0xF9, 0x63, 0x1E, 0x83, 0x66, 0x39, 0x9C +.byte 0xE2, 0x49, 0x1F, 0xE8, 0xD1, 0x34, 0x7C, 0xA2 +.byte 0xB9, 0xE0, 0x02, 0x12, 0xE9, 0xDF, 0xAD, 0x71 +.byte 0x96, 0x8E, 0x6F, 0xB8, 0x40, 0x60, 0x17, 0xDC +.size snow3g_invSR_SQ,.-snow3g_invSR_SQ + +.align 8 
+.type snow3g_MULa, %object +snow3g_MULa: +.byte 0x00, 0x13, 0x26, 0x35, 0x4C, 0x5F, 0x6A, 0x79 +.byte 0x98, 0x8B, 0xBE, 0xAD, 0xD4, 0xC7, 0xF2, 0xE1 +.byte 0x00, 0xCF, 0x37, 0xF8, 0x6E, 0xA1, 0x59, 0x96 +.byte 0xDC, 0x13, 0xEB, 0x24, 0xB2, 0x7D, 0x85, 0x4A +.byte 0x00, 0x9F, 0x97, 0x08, 0x87, 0x18, 0x10, 0x8F +.byte 0xA7, 0x38, 0x30, 0xAF, 0x20, 0xBF, 0xB7, 0x28 +.byte 0x00, 0xE1, 0x6B, 0x8A, 0xD6, 0x37, 0xBD, 0x5C +.byte 0x05, 0xE4, 0x6E, 0x8F, 0xD3, 0x32, 0xB8, 0x59 +.byte 0x00, 0x99, 0x9B, 0x02, 0x9F, 0x06, 0x04, 0x9D +.byte 0x97, 0x0E, 0x0C, 0x95, 0x08, 0x91, 0x93, 0x0A +.byte 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 +.byte 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF +.byte 0x00, 0xE7, 0x67, 0x80, 0xCE, 0x29, 0xA9, 0x4E +.byte 0x35, 0xD2, 0x52, 0xB5, 0xFB, 0x1C, 0x9C, 0x7B +.byte 0x00, 0x0A, 0x14, 0x1E, 0x28, 0x22, 0x3C, 0x36 +.byte 0x50, 0x5A, 0x44, 0x4E, 0x78, 0x72, 0x6C, 0x66 +.size snow3g_MULa,.-snow3g_MULa + +.align 8 +.type snow3g_DIVa, %object +snow3g_DIVa: +.byte 0x00, 0xCD, 0x33, 0xFE, 0x66, 0xAB, 0x55, 0x98 +.byte 0xCC, 0x01, 0xFF, 0x32, 0xAA, 0x67, 0x99, 0x54 +.byte 0x00, 0x40, 0x80, 0xC0, 0xA9, 0xE9, 0x29, 0x69 +.byte 0xFB, 0xBB, 0x7B, 0x3B, 0x52, 0x12, 0xD2, 0x92 +.byte 0x00, 0x0F, 0x1E, 0x11, 0x3C, 0x33, 0x22, 0x2D +.byte 0x78, 0x77, 0x66, 0x69, 0x44, 0x4B, 0x5A, 0x55 +.byte 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48 +.byte 0xC0, 0xD8, 0xF0, 0xE8, 0xA0, 0xB8, 0x90, 0x88 +.byte 0x00, 0x31, 0x62, 0x53, 0xC4, 0xF5, 0xA6, 0x97 +.byte 0x21, 0x10, 0x43, 0x72, 0xE5, 0xD4, 0x87, 0xB6 +.byte 0x00, 0x5F, 0xBE, 0xE1, 0xD5, 0x8A, 0x6B, 0x34 +.byte 0x03, 0x5C, 0xBD, 0xE2, 0xD6, 0x89, 0x68, 0x37 +.byte 0x00, 0xF0, 0x49, 0xB9, 0x92, 0x62, 0xDB, 0x2B +.byte 0x8D, 0x7D, 0xC4, 0x34, 0x1F, 0xEF, 0x56, 0xA6 +.byte 0x00, 0x29, 0x52, 0x7B, 0xA4, 0x8D, 0xF6, 0xDF +.byte 0xE1, 0xC8, 0xB3, 0x9A, 0x45, 0x6C, 0x17, 0x3E +.size snow3g_DIVa,.-snow3g_DIVa + +.align 6 +.type n_inv_aes_shift_row, %object +n_inv_aes_shift_row: +.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 
0x01, 0x0e, 0x0b +.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.byte 0x10, 0x1d, 0x1a, 0x17, 0x14, 0x11, 0x1e, 0x1b +.byte 0x18, 0x15, 0x12, 0x1f, 0x1c, 0x19, 0x16, 0x13 +.size n_inv_aes_shift_row,.-n_inv_aes_shift_row + +.align 6 +.type ror8, %object +ror8: +.word 0x00030201, 0x04070605, 0x080b0a09, 0x0c0f0e0d +.word 0x10131211, 0x14171615, 0x181b1a19, 0x1c1f1e1d +.size ror8,.-ror8 + +.align 6 +.type gather_clear_mask_mul, %object +gather_clear_mask_mul: +.byte 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.size gather_clear_mask_mul,.-gather_clear_mask_mul + +.align 6 +.type gather_clear_mask_div, %object +gather_clear_mask_div: +.byte 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.size gather_clear_mask_div,.-gather_clear_mask_div + +.align 6 +.type iv_swap_mask, %object +iv_swap_mask: +.quad 0x0405060700010203, 0x0c0d0e0f08090a0b +.size iv_swap_mask,.-iv_swap_mask + +.section .text + +#define START_FUNC(fn) .globl fn; \ + .type fn, %function; \ + .align 6; \ + fn: + +#define END_FUNC(fn) .size fn,.-fn + +.macro declare_register name:req, reg:req +.ifdef def_\name + .unreq \name +.endif + .set def_\name, 0 + \name .req \reg +.endm + +declare_register LFSR_S0, z12 +declare_register LFSR_S1, z13 +declare_register LFSR_S2, z14 +declare_register LFSR_S3, z15 +declare_register LFSR_S4, z16 +declare_register LFSR_S5, z17 +declare_register LFSR_S6, z18 +declare_register LFSR_S7, z19 +declare_register LFSR_S8, z20 +declare_register LFSR_S9, z21 +declare_register LFSR_S10, z22 +declare_register LFSR_S11, z23 +declare_register LFSR_S12, z24 +declare_register LFSR_S13, z25 +declare_register LFSR_S14, z26 +declare_register LFSR_S15, z27 
+declare_register FSM_R1, z28 +declare_register FSM_R2, z29 +declare_register FSM_R3, z30 +declare_register zINV_SHIFT_ROW, z31 +declare_register zTMP0, z0 +declare_register zTMP1, z1 +declare_register zTMP2, z2 +declare_register zTMP3, z3 +declare_register zTMP4, z4 +declare_register zTMP5, z5 +declare_register zTMP6, z6 +declare_register zTMP7, z7 +declare_register zTMP8, z8 +declare_register zTMP9, z9 +declare_register zTMP10, z10 +declare_register zTMP11, z11 +declare_register vTMP0, v0 +declare_register vTMP1, v1 +declare_register vTMP2, v2 +declare_register vTMP3, v3 +declare_register vTMP4, v4 +declare_register vTMP5, v5 +declare_register vTMP6, v6 +declare_register vTMP7, v7 +declare_register vTMP8, v8 +declare_register vTMP9, v9 +declare_register vTMP10, v10 +declare_register vTMP11, v11 +declare_register xTMP0, x13 +declare_register xTMP1, x14 +declare_register xTMP2, x15 +declare_register xTMP3, x16 +declare_register xTMP4, x17 +declare_register xTMP5, x18 +declare_register xTMP6, x19 +declare_register xTMP7, x20 +declare_register xTMP8, x21 +declare_register xTMP9, x22 +declare_register xTMP10, x23 +declare_register xTMP11, x24 +declare_register xTMP12, x25 +declare_register xTMP13, x26 +declare_register xTMP14, x27 +declare_register xTMP15, x28 +declare_register xTMP16, x9 +declare_register xTMP17, x10 +declare_register xTMP18, x11 +declare_register xTMP19, x12 + +declare_register wTMP15, w28 +declare_register wTMP16, w9 +declare_register wTMP17, w10 +declare_register wTMP18, w11 +declare_register wTMP19, w12 +declare_register PRED8, p3 +declare_register PRED32, p4 +declare_register PRED32_HALF1, p5 +declare_register PRED32_HALF2, p6 +declare_register pTMP0, p2 + +.macro FUNC_SCALAR_SAVE + stp x19, x20, [sp, -80]! 
+ stp x21, x22, [sp, 16]
+ stp x23, x24, [sp, 32]
+ stp x25, x26, [sp, 48]
+ stp x27, x28, [sp, 64]
+.endm
+
+.macro FUNC_SCALAR_RESTORE
+ ldp x21, x22, [sp, 16]
+ ldp x23, x24, [sp, 32]
+ ldp x25, x26, [sp, 48]
+ ldp x27, x28, [sp, 64]
+ ldp x19, x20, [sp], 80
+.endm
+
+.macro FUNC_VECTOR_SAVE
+ stp d8, d9, [sp, -64]!
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+.endm
+
+.macro FUNC_VECTOR_RESTORE
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 64
+.endm
+
+/*
+ * S1_BOX_8_SVE256()
+ *
+ * params
+ * \x - input value
+ * \rslt - return value
+ * uses
+ * zTMP0-2
+ */
+.macro S1_BOX_8_SVE256 x, rslt
+ tbl zTMP0.B, \x\().B, zINV_SHIFT_ROW.B
+ compact zTMP1.S, PRED32_HALF2, zTMP0.S
+ movi vTMP2.16B, #0
+ aese vTMP0.16B, vTMP2.16B
+ aesmc vTMP0.16B, vTMP0.16B
+ aese VTMP1.16B, vTMP2.16B
+ aesmc vTMP1.16B, vTMP1.16B
+ insr zTMP1.D, X0
+ insr zTMP1.D, X0
+ mov \rslt\().S, PRED32_HALF1/M, zTMP0.S
+ mov \rslt\().S, PRED32_HALF2/M, zTMP1.S
+.endm
+
+/*
+ * LOOKUP_32X8BIT_SVE256()
+ *
+ * params
+ * \index - input value
+ * \lookup - lookup table
+ * \rslt - return value
+ * uses
+ * zTMP0-2
+ */
+.macro LOOKUP_32X8BIT_SVE256 index, lookup, rslt
+ mov zTMP0.B, PRED8/Z, #32
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #0, MUL VL]
+ tbl \rslt\().B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #1, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #2, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #3, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #4, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #5, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #6, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #7, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D +.endm + +/* + * S2_MIXC_FIXUP_8_SVE256() + * + * params + * \no_mixc - input value + * \mixc - lookup table + * \rslt - return value + * uses + * xTMP0, zTMP0-2 + */ +.macro S2_MIXC_FIXUP_8_SVE256 no_mixc, mixc, rslt + // PAT = CMLT(NO_MIXC); + cmplt pTMP0.B, PRED8/Z, \no_mixc\().B, #0 + mov zTMP1.B, pTMP0/Z, #0xFF + // PAT_SHUF = TBL(PAT, ROR8); + adrp xTMP0, ror8 + add xTMP0, xTMP0, #:lo12:ror8 + ld1b {zTMP0.B}, PRED8/Z, [xTMP0] + tbl zTMP2.B, {zTMP1.B}, zTMP0.B + // RSLT = MIXC ^ (0X72 AND (PAT ^ PAT_SHUF)) + eor zTMP1.D, zTMP1.D, zTMP2.D + mov zTMP2.B, PRED8/Z, #0x72 + and zTMP0.D, zTMP2.D, zTMP1.D + eor \rslt\().D, zTMP0.D, \mixc\().D +.endm + +/* + * S2_BOX_8_SVE256() + * + * params + * \x - input value + * \rslt - return value + * uses + * xTMP0, zTMP0-4 + */ +.macro S2_BOX_8_SVE256 x, rslt + // NEW_X = TBL(LOOKUP(X, snow3g_invSR_SQ), inv_aes_shift_row); + adrp xTMP0, snow3g_invSR_SQ + add xTMP0, xTMP0, #:lo12:snow3g_invSR_SQ + LOOKUP_32X8BIT_SVE256 \x\(), xTMP0, \rslt\() + tbl zTMP3.B, \rslt\().B, zINV_SHIFT_ROW.B + compact zTMP1.S, PRED32_HALF2, zTMP3.S + // NOMIXC = AESE(NEW_X, 0) + movi vTMP2.16B, #0 + aese vTMP3.16B, vTMP2.16B + aese vTMP1.16B, vTMP2.16B + // MIXC = AESMC(NOMIXC) + aesmc vTMP4.16B, vTMP3.16B + aesmc vTMP0.16B, vTMP1.16B + insr zTMP1.D, X0 + insr zTMP1.D, X0 + insr zTMP0.D, X0 + insr zTMP0.D, X0 + mov zTMP3.S, PRED32_HALF2/M, zTMP1.S + mov zTMP4.S, 
PRED32_HALF2/M, zTMP0.S + // S2_MIXC_FIXUP(NOMIXC, MIXC) + S2_MIXC_FIXUP_8_SVE256 zTMP3, zTMP4, \rslt\() +.endm + +/* + * MUL_DIV_A_8_SVE256() + * + * params + * \S - input value, S0 or S11 + * \rslt - return value + * uses + * xTMP0, zTMP0-4 + */ +.macro MUL_DIV_A_8_SVE256 MUL_OR_DIV S, rslt + // L = S0,3 & 0x0F + mov zTMP0.B, PRED8/Z, #0x0F +.ifc \MUL_OR_DIV, MUL + adrp xTMP0, gather_clear_mask_mul + add xTMP0, xTMP0, #:lo12:gather_clear_mask_mul +.else + adrp xTMP0, gather_clear_mask_div + add xTMP0, xTMP0, #:lo12:gather_clear_mask_div +.endif + ld1b {zTMP1.B}, PRED8/Z, [xTMP0] + + // TL = TBL8(MULa_B0, L) || TBL8(MULa_B1, L) || TBL8(MULa_B2, L) || TBL8(MULa_B3, L) + tbl zTMP1.B, \S\().B, zTMP1.B + and zTMP0.D, zTMP1.D, zTMP0.D +.ifc \MUL_OR_DIV, MUL + adrp xTMP0, snow3g_MULa + add xTMP0, xTMP0, #:lo12:snow3g_MULa +.else + adrp xTMP0, snow3g_DIVa + add xTMP0, xTMP0, #:lo12:snow3g_DIVa +.endif + ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32 + tbl zTMP2.B, zTMP2.B, zTMP0.B + tbl zTMP3.B, zTMP3.B, zTMP0.B + zip1 zTMP2.B, zTMP2.B, zTMP3.B + ld1 {vTMP3.16b,vTMP4.16b},[xTMP0],#32 + tbl zTMP3.B, zTMP3.B, zTMP0.B + tbl zTMP4.B, zTMP4.B, zTMP0.B + zip1 zTMP3.B, zTMP3.B, zTMP4.B + zip1 zTMP0.H, zTMP2.H, zTMP3.H + + // H = S0,3 & 0xF0 + lsr zTMP1.B, PRED8/M, zTMP1.B, #4 + + // TH = TBL8(MULa_B4, H) || TBL8(MULa_B5, H) || TBL8(MULa_B6, H) || TBL8(MULa_B7, H) + ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32 + tbl zTMP2.B, zTMP2.B, zTMP1.B + tbl zTMP3.B, zTMP3.B, zTMP1.B + zip1 zTMP2.B, zTMP2.B, zTMP3.B + ld1 {vTMP3.16b,vTMP4.16b},[xTMP0] + tbl zTMP3.B, zTMP3.B, zTMP1.B + tbl zTMP4.B, zTMP4.B, zTMP1.B + zip1 zTMP3.B, zTMP3.B, zTMP4.B + zip1 zTMP1.H, zTMP2.H, zTMP3.H + + // RSLT = TL ^ TH + eor \rslt\().D, zTMP1.D, zTMP0.D +.endm + +/* + * CLOCK_FSM_8_SVE256() + * + * params + * \F - input value + * uses + * xTMP0, zTMP0-5 + */ +.macro CLOCK_FSM_8_SVE256 F + // F = (S15 + R1) ^ R2 + // R = R2 + (R3 ^ S5) + add \F\().S, LFSR_S15.S, FSM_R1.S + eor zTMP5.D, LFSR_S5.D, FSM_R3.D + eor \F\().D, 
\F\().D, FSM_R2.D + add zTMP5.S, zTMP5.S, FSM_R2.S + // R3 = S2(R2); + S2_BOX_8_SVE256 FSM_R2, FSM_R3 + // R2 = S1(R1); + S1_BOX_8_SVE256 FSM_R1, FSM_R2 + // R1 = R; + mov FSM_R1.D, zTMP5.D +.endm + +/* + * SHIFT_LFSR_8_SVE256() + * + * uses + * zTMP0-2 + */ +.macro SHIFT_LFSR_8_SVE256 S15 + mov zTMP0.D, LFSR_S4.D + mov zTMP1.D, LFSR_S8.D + mov zTMP2.D, LFSR_S12.D + + mov LFSR_S0.D, LFSR_S1.D + mov LFSR_S4.D, LFSR_S5.D + mov LFSR_S8.D, LFSR_S9.D + mov LFSR_S12.D, LFSR_S13.D + + mov LFSR_S1.D, LFSR_S2.D + mov LFSR_S5.D, LFSR_S6.D + mov LFSR_S9.D, LFSR_S10.D + mov LFSR_S13.D, LFSR_S14.D + + mov LFSR_S2.D, LFSR_S3.D + mov LFSR_S6.D, LFSR_S7.D + mov LFSR_S10.D, LFSR_S11.D + mov LFSR_S14.D, LFSR_S15.D + + mov LFSR_S3.D, zTMP0.D + mov LFSR_S7.D, zTMP1.D + mov LFSR_S11.D, zTMP2.D + mov LFSR_S15.D, \S15\().D +.endm + +/* + * CLOCK_LFSR_8_SVE256() + * + * uses + * xTMP0, zTMP0-6 + */ +.macro CLOCK_LFSR_8_SVE256 + // V = (S0 << 8) ^ MULa(S0) ^ S2 ^ (S11 >> 8) ^ DIVa(S11) + MUL_DIV_A_8_SVE256 MUL LFSR_S0, zTMP5 + MUL_DIV_A_8_SVE256 DIV LFSR_S11, zTMP6 + eor zTMP5.D, zTMP5.D, zTMP6.D + lsl zTMP3.S, LFSR_S0.S, #8 + lsr zTMP1.S, LFSR_S11.S, #8 + eor zTMP3.D, zTMP3.D, zTMP1.D + eor zTMP5.D, zTMP5.D, LFSR_S2.D + eor zTMP3.D, zTMP3.D, zTMP5.D + SHIFT_LFSR_8_SVE256 zTMP3 +.endm + +/* + * SNOW3G_KEYSTREAM_8_4_SVE256() + * + * params + * \KEY - output keystream + * uses + * xTMP0, zTMP0-6 + */ +.macro SNOW3G_KEYSTREAM_8_4_SVE256 KEY + CLOCK_FSM_8_SVE256 \KEY\() + eor \KEY\().D, \KEY\().D, LFSR_S0.D + CLOCK_LFSR_8_SVE256 +.endm + +/* + * INTERLEAVE_IV_KEY_8() + * + * uses + * xTMP0, zTMP0-3 when SWAP == 0 + * xTMP0, zTMP0-4 when SWAP == 1 + */ +.macro INTERLEAVE_IV_KEY_8 SWAP RSLT0, RSLT1, RSLT2, RSLT3, \ + ADDR1, ADDR2, ADDR3, ADDR4, \ + ADDR5, ADDR6, ADDR7, ADDR8 + ld1 {v\RSLT0\().4S}, [\ADDR1\()] + ld1 {vTMP0.4S}, [\ADDR2\()] + ld1 {v\RSLT1\().4S}, [\ADDR3\()] + ld1 {vTMP1.4S}, [\ADDR4\()] + ld1 {v\RSLT2\().4S}, [\ADDR5\()] + ld1 {vTMP2.4S}, [\ADDR6\()] + ld1 {v\RSLT3\().4S}, 
[\ADDR7\()] + ld1 {vTMP3.4S}, [\ADDR8\()] +.if \SWAP == 1 + adrp xTMP0, iv_swap_mask + add xTMP0, xTMP0, #:lo12:iv_swap_mask + ld1 {vTMP4.4S}, [xTMP0] + tbl vTMP0.16B, {vTMP0.16B}, vTMP4.16B + tbl vTMP1.16B, {vTMP1.16B}, vTMP4.16B + tbl vTMP2.16B, {vTMP2.16B}, vTMP4.16B + tbl vTMP3.16B, {vTMP3.16B}, vTMP4.16B + tbl v\RSLT0\().16B, {v\RSLT0\().16B}, vTMP4.16B + tbl v\RSLT1\().16B, {v\RSLT1\().16B}, vTMP4.16B + tbl v\RSLT2\().16B, {v\RSLT2\().16B}, vTMP4.16B + tbl v\RSLT3\().16B, {v\RSLT3\().16B}, vTMP4.16B +.endif + zip1 z\RSLT0\().S, z\RSLT0\().S, zTMP0.S + zip1 z\RSLT1\().S, z\RSLT1\().S, zTMP1.S + zip1 z\RSLT2\().S, z\RSLT2\().S, zTMP2.S + zip1 z\RSLT3\().S, z\RSLT3\().S, zTMP3.S + + zip1 zTMP0.D, z\RSLT0\().D, z\RSLT1\().D + zip2 zTMP1.D, z\RSLT0\().D, z\RSLT1\().D + zip1 zTMP2.D, z\RSLT2\().D, z\RSLT3\().D + zip2 zTMP3.D, z\RSLT2\().D, z\RSLT3\().D + + compact z\RSLT1\().S, PRED32_HALF2, zTMP0.S + mov z\RSLT1\().S, PRED32_HALF2/M, zTMP2.S + insr zTMP2.D, x0 + insr zTMP2.D, x0 + sel z\RSLT0\().S, PRED32_HALF1, zTMP0.S, zTMP2.S + + compact z\RSLT3\().S, PRED32_HALF2, zTMP1.S + mov z\RSLT3\().S, PRED32_HALF2/M, zTMP3.S + insr zTMP3.D, x0 + insr zTMP3.D, x0 + sel z\RSLT2\().S, PRED32_HALF1, zTMP1.S, zTMP3.S +.endm + +/* + * SNOW3G_INITIALIZE_8_SVE256_FIRST() + * + * uses + * zTMP0-8 + */ +.macro SNOW3G_INITIALIZE_8_SVE256_FIRST KEYADDR1 KEYADDR2 KEYADDR3 KEYADDR4 \ + KEYADDR5 KEYADDR6 KEYADDR7 KEYADDR8 \ + IVADDR1 IVADDR2 IVADDR3 IVADDR4 \ + IVADDR5 IVADDR6 IVADDR7 IVADDR8 + INTERLEAVE_IV_KEY_8 0, 4, 5, 6, 7, \ + \KEYADDR1\(), \KEYADDR2\(), \KEYADDR3\(), \KEYADDR4\(), \ + \KEYADDR5\(), \KEYADDR6\(), \KEYADDR7\(), \KEYADDR8\() + mov LFSR_S4.D, zTMP4.D + mov LFSR_S5.D, zTMP5.D + mov LFSR_S6.D, zTMP6.D + mov LFSR_S7.D, zTMP7.D + mov LFSR_S12.D, zTMP4.D + mov LFSR_S13.D, zTMP5.D + mov LFSR_S14.D, zTMP6.D + mov LFSR_S15.D, zTMP7.D + not LFSR_S0.S, PRED32/M, zTMP4.S + not LFSR_S1.S, PRED32/M, zTMP5.S + not LFSR_S2.S, PRED32/M, zTMP6.S + not LFSR_S3.S, PRED32/M, zTMP7.S + 
mov LFSR_S8.D, LFSR_S0.D + mov LFSR_S9.D, LFSR_S1.D + mov LFSR_S10.D, LFSR_S2.D + mov LFSR_S11.D, LFSR_S3.D + + INTERLEAVE_IV_KEY_8 1, 5, 6, 7, 8, \ + \IVADDR1\(), \IVADDR2\(), \IVADDR3\(), \IVADDR4\(), \ + \IVADDR5\(), \IVADDR6\(), \IVADDR7\(), \IVADDR8\() + + eor LFSR_S15.D, LFSR_S15.D, zTMP8.D + eor LFSR_S12.D, LFSR_S12.D, zTMP7.D + eor LFSR_S10.D, LFSR_S10.D, zTMP6.D + eor LFSR_S9.D, LFSR_S9.D, zTMP5.D + + mov FSM_R1.B, PRED8/Z, #0 + mov FSM_R2.B, PRED8/Z, #0 + mov FSM_R3.B, PRED8/Z, #0 +.endm + +/* + * SNOW3G_INITIALIZE_8_SVE256_SECOND() + * + * uses + * xTMP0, zTMP0-7 + */ +.macro SNOW3G_INITIALIZE_8_SVE256_SECOND +.rept 32 + CLOCK_FSM_8_SVE256 zTMP7 + CLOCK_LFSR_8_SVE256 + eor LFSR_S15.D, LFSR_S15.D, zTMP7.D +.endr + CLOCK_FSM_8_SVE256 zTMP7 + CLOCK_LFSR_8_SVE256 +.endm + +/* + * SNOW3G_LOAD_CTX_8_SVE256() + * + */ +.macro SNOW3G_LOAD_CTX_8_SVE256 ctx_addr + ld1b {LFSR_S0.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {LFSR_S1.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b {LFSR_S2.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + ld1b {LFSR_S3.B}, PRED8/Z, [\ctx_addr\(), #3, MUL VL] + ld1b {LFSR_S4.B}, PRED8/Z, [\ctx_addr\(), #4, MUL VL] + ld1b {LFSR_S5.B}, PRED8/Z, [\ctx_addr\(), #5, MUL VL] + ld1b {LFSR_S6.B}, PRED8/Z, [\ctx_addr\(), #6, MUL VL] + ld1b {LFSR_S7.B}, PRED8/Z, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + ld1b {LFSR_S8.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {LFSR_S9.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b {LFSR_S10.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + ld1b {LFSR_S11.B}, PRED8/Z, [\ctx_addr\(), #3, MUL VL] + ld1b {LFSR_S12.B}, PRED8/Z, [\ctx_addr\(), #4, MUL VL] + ld1b {LFSR_S13.B}, PRED8/Z, [\ctx_addr\(), #5, MUL VL] + ld1b {LFSR_S14.B}, PRED8/Z, [\ctx_addr\(), #6, MUL VL] + ld1b {LFSR_S15.B}, PRED8/Z, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + ld1b {FSM_R1.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {FSM_R2.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b 
{FSM_R3.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + sub \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*16) +.endm + +/* + * SNOW3G_STORE_CTX_8_SVE256() + * + */ +.macro SNOW3G_STORE_CTX_8_SVE256 ctx_addr + st1b {LFSR_S0.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {LFSR_S1.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {LFSR_S2.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + st1b {LFSR_S3.B}, PRED8, [\ctx_addr\(), #3, MUL VL] + st1b {LFSR_S4.B}, PRED8, [\ctx_addr\(), #4, MUL VL] + st1b {LFSR_S5.B}, PRED8, [\ctx_addr\(), #5, MUL VL] + st1b {LFSR_S6.B}, PRED8, [\ctx_addr\(), #6, MUL VL] + st1b {LFSR_S7.B}, PRED8, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + st1b {LFSR_S8.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {LFSR_S9.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {LFSR_S10.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + st1b {LFSR_S11.B}, PRED8, [\ctx_addr\(), #3, MUL VL] + st1b {LFSR_S12.B}, PRED8, [\ctx_addr\(), #4, MUL VL] + st1b {LFSR_S13.B}, PRED8, [\ctx_addr\(), #5, MUL VL] + st1b {LFSR_S14.B}, PRED8, [\ctx_addr\(), #6, MUL VL] + st1b {LFSR_S15.B}, PRED8, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + st1b {FSM_R1.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {FSM_R2.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {FSM_R3.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*3) + str wzr, [\ctx_addr\()] + sub \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*19) +.endm + +/* + * CLEAR_VECTORS_SVE256() + * + */ +.macro CLEAR_VECTORS_SVE256 + eor zTMP0.D, zTMP0.D, zTMP0.D + eor zTMP1.D, zTMP1.D, zTMP1.D + eor zTMP2.D, zTMP2.D, zTMP2.D + eor zTMP3.D, zTMP3.D, zTMP3.D + eor zTMP4.D, zTMP4.D, zTMP4.D + eor zTMP5.D, zTMP5.D, zTMP5.D + eor zTMP6.D, zTMP6.D, zTMP6.D + eor zTMP7.D, zTMP7.D, zTMP7.D + eor zTMP8.D, zTMP8.D, zTMP8.D + eor zTMP9.D, zTMP9.D, zTMP9.D + eor zTMP10.D, zTMP10.D, zTMP10.D + eor zTMP11.D, zTMP11.D, zTMP11.D + eor LFSR_S0.D, LFSR_S0.D, LFSR_S0.D + eor LFSR_S1.D, LFSR_S1.D, LFSR_S1.D + 
eor LFSR_S2.D, LFSR_S2.D, LFSR_S2.D + eor LFSR_S3.D, LFSR_S3.D, LFSR_S3.D + eor LFSR_S4.D, LFSR_S4.D, LFSR_S4.D + eor LFSR_S5.D, LFSR_S5.D, LFSR_S5.D + eor LFSR_S6.D, LFSR_S6.D, LFSR_S6.D + eor LFSR_S7.D, LFSR_S7.D, LFSR_S7.D + eor LFSR_S8.D, LFSR_S8.D, LFSR_S8.D + eor LFSR_S9.D, LFSR_S9.D, LFSR_S9.D + eor LFSR_S10.D, LFSR_S10.D, LFSR_S10.D + eor LFSR_S11.D, LFSR_S11.D, LFSR_S11.D + eor LFSR_S12.D, LFSR_S12.D, LFSR_S12.D + eor LFSR_S13.D, LFSR_S13.D, LFSR_S13.D + eor LFSR_S14.D, LFSR_S14.D, LFSR_S14.D + eor LFSR_S15.D, LFSR_S15.D, LFSR_S15.D + eor FSM_R1.D, FSM_R1.D, FSM_R1.D + eor FSM_R2.D, FSM_R2.D, FSM_R2.D + eor FSM_R3.D, FSM_R3.D, FSM_R3.D +.endm +/* + * snow3g_f8_8_buffer_initialize_aarch64_sve256_asm( + * void *ctx, + * snow3g_key_schedule_t **pKeySched, + * void **pIV) + */ +START_FUNC(snow3g_f8_8_buffer_initialize_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + mov xTMP17, x1 + mov xTMP18, x2 + + ldp xTMP0, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP0 xTMP1 xTMP2 xTMP3 xTMP4 xTMP5 xTMP6 xTMP7\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + SNOW3G_STORE_CTX_8_SVE256 x0 + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_initialize_aarch64_sve256_asm) + +#ifndef GATHER_SCATTER_IMPL +/* + * snow3g_f8_8_buffer_stream_aarch64_sve256_asm(void *ctx, + * void **in, + * void **out, + * uint32_t lengthInBytes) + * + */ +START_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + 
FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + mov xTMP17, x1 + mov xTMP18, x2 + mov xTMP19, x3 + + SNOW3G_LOAD_CTX_8_SVE256 x0 + ldp xTMP16, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + cmp xTMP19, #16 + b.lt GEN8 + +GEN16_LOOP: + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + ld1 {vTMP0.4S}, [xTMP16], #16 + ld1 {vTMP4.4S}, [xTMP1], #16 + ld1 {vTMP1.4S}, [xTMP2], #16 + ld1 {vTMP5.4S}, [xTMP3], #16 + ld1 {vTMP2.4S}, [xTMP4], #16 + ld1 {vTMP6.4S}, [xTMP5], #16 + ld1 {vTMP3.4S}, [xTMP6], #16 + ld1 {vTMP7.4S}, [xTMP7], #16 + insr zTMP4.D, x0 + insr zTMP5.D, x0 + insr zTMP6.D, x0 + insr zTMP7.D, x0 + insr zTMP4.D, x0 + insr zTMP5.D, x0 + insr zTMP6.D, x0 + insr zTMP7.D, x0 + mov zTMP0.S, PRED32_HALF2/M, zTMP4.S + mov zTMP1.S, PRED32_HALF2/M, zTMP5.S + mov zTMP2.S, PRED32_HALF2/M, zTMP6.S + mov zTMP3.S, PRED32_HALF2/M, zTMP7.S + eor zTMP0.D, zTMP0.D, zTMP8.D + eor zTMP1.D, zTMP1.D, zTMP9.D + eor zTMP2.D, zTMP2.D, zTMP10.D + eor zTMP3.D, zTMP3.D, zTMP11.D + + compact zTMP4.S, PRED32_HALF2, zTMP0.S + compact zTMP5.S, 
PRED32_HALF2, zTMP1.S + compact zTMP6.S, PRED32_HALF2, zTMP2.S + compact zTMP7.S, PRED32_HALF2, zTMP3.S + + st1 {vTMP0.4S}, [xTMP8], #16 + st1 {vTMP4.4S}, [xTMP9], #16 + st1 {vTMP1.4S}, [xTMP10], #16 + st1 {vTMP5.4S}, [xTMP11], #16 + st1 {vTMP2.4S}, [xTMP12], #16 + st1 {vTMP6.4S}, [xTMP13], #16 + st1 {vTMP3.4S}, [xTMP14], #16 + st1 {vTMP7.4S}, [xTMP15], #16 + + sub xTMP19, xTMP19, #16 + cmp xTMP19, #16 + b.ge GEN16_LOOP + +GEN8: + cmp xTMP19, #8 + b.lt GEN4 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + zip1 zTMP10.S, zTMP8.S, zTMP9.S + zip2 zTMP11.S, zTMP8.S, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + ld1 {vTMP0.D}[0], [xTMP16], #8 + ld1 {vTMP0.D}[1], [xTMP1], #8 + ld1 {vTMP1.D}[0], [xTMP2], #8 + ld1 {vTMP1.D}[1], [xTMP3], #8 + ld1 {vTMP2.D}[0], [xTMP4], #8 + ld1 {vTMP2.D}[1], [xTMP5], #8 + ld1 {vTMP3.D}[0], [xTMP6], #8 + ld1 {vTMP3.D}[1], [xTMP7], #8 + + compact zTMP4.S, PRED32_HALF2, zTMP10.S + compact zTMP5.S, PRED32_HALF2, zTMP11.S + + eor vTMP0.16B, vTMP0.16B, vTMP10.16B + eor vTMP1.16B, vTMP1.16B, vTMP4.16B + eor vTMP2.16B, vTMP2.16B, vTMP11.16B + eor vTMP3.16B, vTMP3.16B, vTMP5.16B + + st1 {vTMP0.D}[0], [xTMP8], #8 + st1 {vTMP0.D}[1], [xTMP9], #8 + st1 {vTMP1.D}[0], [xTMP10], #8 + st1 {vTMP1.D}[1], [xTMP11], #8 + st1 {vTMP2.D}[0], [xTMP12], #8 + st1 {vTMP2.D}[1], [xTMP13], #8 + st1 {vTMP3.D}[0], [xTMP14], #8 + st1 {vTMP3.D}[1], [xTMP15], #8 + + sub xTMP19, xTMP19, #8 + +GEN4: + cmp xTMP19, #4 + b.lt FINISH + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + revb zTMP8.S, PRED32/M, zTMP8.S + + ld1 {vTMP0.S}[0], [xTMP16], #4 + ld1 {vTMP0.S}[1], [xTMP1], #4 + ld1 {vTMP0.S}[2], [xTMP2], #4 + ld1 {vTMP0.S}[3], [xTMP3], #4 + ld1 {vTMP1.S}[0], [xTMP4], #4 + ld1 {vTMP1.S}[1], [xTMP5], #4 + ld1 {vTMP1.S}[2], [xTMP6], #4 + ld1 {vTMP1.S}[3], [xTMP7], #4 + + compact zTMP4.S, PRED32_HALF2, zTMP8.S + + eor vTMP0.16B, vTMP0.16B, vTMP8.16B + eor vTMP1.16B, vTMP1.16B, vTMP4.16B + + st1 {vTMP0.S}[0], [xTMP8], #4 + st1 
{vTMP0.S}[1], [xTMP9], #4 + st1 {vTMP0.S}[2], [xTMP10], #4 + st1 {vTMP0.S}[3], [xTMP11], #4 + st1 {vTMP1.S}[0], [xTMP12], #4 + st1 {vTMP1.S}[1], [xTMP13], #4 + st1 {vTMP1.S}[2], [xTMP14], #4 + st1 {vTMP1.S}[3], [xTMP15], #4 + +FINISH: + SNOW3G_STORE_CTX_8_SVE256 x0 + mov xTMP17, x1 + mov xTMP18, x2 + stp xTMP16, xTMP1, [xTMP17], #16 + stp xTMP2, xTMP3, [xTMP17], #16 + stp xTMP4, xTMP5, [xTMP17], #16 + stp xTMP6, xTMP7, [xTMP17] + stp xTMP8, xTMP9, [xTMP18], #16 + stp xTMP10, xTMP11, [xTMP18], #16 + stp xTMP12, xTMP13, [xTMP18], #16 + stp xTMP14, xTMP15, [xTMP18] + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + +#else + +/* + * snow3g_f8_8_buffer_stream_aarch64_sve256_asm(void *ctx, + * void **in, + * void **out, + * uint32_t lengthInBytes) + * + * NOTE: This implementation uses SVE gather load and scatter store, + * but the performance is 10% worse than implementation using + * contiguous load and store. + */ +START_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + cbz x3, FINISH_GS + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + SNOW3G_LOAD_CTX_8_SVE256 x0 + ld1d {zTMP10.D}, PRED32/Z, [x1] + ld1d {zTMP0.D}, PRED32/Z, [x1, #1, MUL VL] + uzp1 zTMP10.S, zTMP10.S, zTMP0.S + ld1d {zTMP11.D}, PRED32/Z, [x2] + ld1d {zTMP0.D}, PRED32/Z, [x2, #1, MUL VL] + uzp1 zTMP11.S, zTMP11.S, zTMP0.S + + ldr xTMP18, [x1] + ldr xTMP19, [x2] + bfm xTMP18, XZR, #0, #31 + bfm xTMP19, XZR, #0, #31 + mov xTMP17, #0 + +GEN4_LOOP: + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + revb zTMP8.S, PRED32/M, zTMP8.S + + ld1w {zTMP9.S}, PRED32/Z, [xTMP18, zTMP10.S, UXTW] + eor zTMP9.D, zTMP9.D, zTMP8.D + st1w {zTMP9.S}, PRED32, [xTMP19, zTMP11.S, UXTW] + add xTMP18, xTMP18, #4 + add xTMP19, xTMP19, #4 + add xTMP17, 
xTMP17, #4 + cmp xTMP17, x3 + b.lt GEN4_LOOP + + SNOW3G_STORE_CTX_8_SVE256 x0 + + cpy zTMP9.D, PRED32/M, x3 + ld1d {zTMP10.D}, PRED32/Z, [x1] + ld1d {zTMP11.D}, PRED32/Z, [x1, #1, MUL VL] + add zTMP10.D, zTMP10.D, zTMP9.D + add zTMP11.D, zTMP11.D, zTMP9.D + st1d {zTMP10.D}, PRED32, [x1] + st1d {zTMP11.D}, PRED32, [x1, #1, MUL VL] + + ld1d {zTMP10.D}, PRED32/Z, [x2] + ld1d {zTMP11.D}, PRED32/Z, [x2, #1, MUL VL] + add zTMP10.D, zTMP10.D, zTMP9.D + add zTMP11.D, zTMP11.D, zTMP9.D + st1d {zTMP10.D}, PRED32, [x2] + st1d {zTMP11.D}, PRED32, [x2, #1, MUL VL] + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE +FINISH_GS: + ret +END_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + +#endif + +/* OUT = IN XOR OUT + * use this macro to generate output when LEN is less than 16 + * use: vTMP0 */ +.macro X_BYTE_STREAM IN, OUT, KEY, LEN + cmp \LEN\(), #8 + b.lt 4f + ld1 {vTMP0.D}[0], [\IN\()], #8 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.D}[0], [\OUT\()], #8 + mov \KEY\().D[0], \KEY\().D[1] + sub \LEN\(), \LEN\(), #8 +4: + cmp \LEN\(), #4 + b.lt 2f + ld1 {vTMP0.S}[0], [\IN\()], #4 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.S}[0], [\OUT\()], #4 + mov \KEY\().S[0], \KEY\().S[1] + sub \LEN\(), \LEN\(), #4 +2: + cmp \LEN\(), #2 + b.lt 1f + ld1 {vTMP0.H}[0], [\IN\()], #2 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.H}[0], [\OUT\()], #2 + mov \KEY\().H[0], \KEY\().H[1] + sub \LEN\(), \LEN\(), #2 +1: + cmp \LEN\(), #1 + b.lt 0f + ld1 {vTMP0.B}[0], [\IN\()], #1 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.B}[0], [\OUT\()], #1 +0: +.endm + +.macro GEN_1_TO_8_LANES LANE_NR SUFFIX + CURR_LEN .req wTMP17 + LEFT_LEN .req wTMP17 + LEN .req wTMP18 + IN1 .req xTMP1 + IN2 .req xTMP2 + IN3 .req xTMP3 + IN4 .req xTMP4 + IN5 .req xTMP5 + IN6 .req xTMP6 + IN7 .req xTMP7 + IN8 .req xTMP8 + OUT1 .req xTMP9 + OUT2 .req xTMP10 + OUT3 .req xTMP11 + OUT4 .req xTMP12 + OUT5 .req xTMP13 + OUT6 .req xTMP14 + OUT7 .req xTMP15 + OUT8 .req xTMP16 + // lanes are sorted by length 
decrease + // lane1.len >= lane2.len >= .... lane8.len + // load length + ldr CURR_LEN, [x4, #(4*(\LANE_NR\()-1))] +GEN_\LANE_NR\()LANES\SUFFIX\(): + cmp CURR_LEN, LEN + b.lt FINISH_\LANE_NR\()TH\SUFFIX\() + + // load 16byte x LANE_NR input + ld1 {vTMP0.4S}, [IN1], #16 +.ifge \LANE_NR - 2 + ld1 {vTMP4.4S}, [IN2], #16 +.ifge \LANE_NR - 3 + ld1 {vTMP1.4S}, [IN3], #16 +.ifge \LANE_NR - 4 + ld1 {vTMP5.4S}, [IN4], #16 +.ifge \LANE_NR - 5 + ld1 {vTMP2.4S}, [IN5], #16 +.ifge \LANE_NR - 6 + ld1 {vTMP6.4S}, [IN6], #16 +.ifge \LANE_NR - 7 + ld1 {vTMP3.4S}, [IN7], #16 +.ifge \LANE_NR - 8 + ld1 {vTMP7.4S}, [IN8], #16 +.endif +.endif +.endif +.endif +.endif +.endif +.endif + + // merge 16byte x LANE_NR input into at most 4 SVE registers +.rept 2 +.ifge \LANE_NR - 2 + insr zTMP4.D, x0 +.ifge \LANE_NR - 4 + insr zTMP5.D, x0 +.ifge \LANE_NR - 6 + insr zTMP6.D, x0 +.ifge \LANE_NR - 8 + insr zTMP7.D, x0 +.endif +.endif +.endif +.endif +.endr +.ifge \LANE_NR - 2 + mov zTMP0.S, PRED32_HALF2/M, zTMP4.S +.ifge \LANE_NR - 4 + mov zTMP1.S, PRED32_HALF2/M, zTMP5.S +.ifge \LANE_NR - 6 + mov zTMP2.S, PRED32_HALF2/M, zTMP6.S +.ifge \LANE_NR - 8 + mov zTMP3.S, PRED32_HALF2/M, zTMP7.S +.endif +.endif +.endif +.endif + + // XOR with generated keystream + eor zTMP0.D, zTMP0.D, zTMP8.D +.ifge \LANE_NR - 3 + eor zTMP1.D, zTMP1.D, zTMP9.D +.ifge \LANE_NR - 5 + eor zTMP2.D, zTMP2.D, zTMP10.D +.ifge \LANE_NR - 7 + eor zTMP3.D, zTMP3.D, zTMP11.D +.endif +.endif +.endif + + // compact SVE register into NEON register for store +.ifge \LANE_NR - 2 + compact zTMP4.S, PRED32_HALF2, zTMP0.S +.ifge \LANE_NR - 4 + compact zTMP5.S, PRED32_HALF2, zTMP1.S +.ifge \LANE_NR - 6 + compact zTMP6.S, PRED32_HALF2, zTMP2.S +.ifge \LANE_NR - 8 + compact zTMP7.S, PRED32_HALF2, zTMP3.S +.endif +.endif +.endif +.endif + + // store to 16byte x LANE_NR output + st1 {vTMP0.4S}, [OUT1], #16 +.ifge \LANE_NR - 2 + st1 {vTMP4.4S}, [OUT2], #16 +.ifge \LANE_NR - 3 + st1 {vTMP1.4S}, [OUT3], #16 +.ifge \LANE_NR - 4 + st1 {vTMP5.4S}, 
[OUT4], #16 +.ifge \LANE_NR - 5 + st1 {vTMP2.4S}, [OUT5], #16 +.ifge \LANE_NR - 6 + st1 {vTMP6.4S}, [OUT6], #16 +.ifge \LANE_NR - 7 + st1 {vTMP3.4S}, [OUT7], #16 +.ifge \LANE_NR - 8 + st1 {vTMP7.4S}, [OUT8], #16 +.endif +.endif +.endif +.endif +.endif +.endif +.endif + + // update number of generated output + add LEN, LEN, #16 + + // generate 16byte x 8lanes keystream + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D +.ifge \LANE_NR - 3 + zip2 zTMP9.D, zTMP0.D, zTMP2.D +.ifge \LANE_NR - 5 + zip1 zTMP10.D, zTMP1.D, zTMP3.D +.ifge \LANE_NR - 7 + zip2 zTMP11.D, zTMP1.D, zTMP3.D +.endif +.endif +.endif + + revb zTMP8.S, PRED32/M, zTMP8.S +.ifge \LANE_NR - 3 + revb zTMP9.S, PRED32/M, zTMP9.S +.ifge \LANE_NR - 5 + revb zTMP10.S, PRED32/M, zTMP10.S +.ifge \LANE_NR - 7 + revb zTMP11.S, PRED32/M, zTMP11.S +.endif +.endif +.endif + b GEN_\LANE_NR\()LANES\SUFFIX\() + +FINISH_\LANE_NR\()TH\SUFFIX\(): + add CURR_LEN, CURR_LEN, 16 + sub LEFT_LEN, CURR_LEN, LEN +.if \LANE_NR == 8 + compact zTMP1.S, PRED32_HALF2, zTMP11.S + X_BYTE_STREAM IN8, OUT8, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 7 + X_BYTE_STREAM IN7, OUT7, vTMP11, LEFT_LEN +.endif +.if \LANE_NR == 6 + compact zTMP1.S, PRED32_HALF2, zTMP10.S + X_BYTE_STREAM IN6, OUT6, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 5 + X_BYTE_STREAM IN5, OUT5, vTMP10, LEFT_LEN +.endif +.if \LANE_NR == 4 + compact zTMP1.S, PRED32_HALF2, zTMP9.S + X_BYTE_STREAM IN4, OUT4, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 3 + X_BYTE_STREAM IN3, OUT3, vTMP9, LEFT_LEN +.endif +.if \LANE_NR == 2 + compact zTMP1.S, PRED32_HALF2, zTMP8.S + X_BYTE_STREAM IN2, OUT2, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 1 + X_BYTE_STREAM IN1, OUT1, vTMP8, LEFT_LEN +.endif +.endm + +/* + * 
snow3g_f8_8_buffer_aarch64_sve256_asm(void *key, + * void **iv, + * void **in, + * void **out, + * uint32_t lengthInBytes[]) + * + */ +START_FUNC(snow3g_f8_8_buffer_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + // key + mov xTMP17, x0 + // iv + mov xTMP18, x1 + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + + mov xTMP17, x2 + mov xTMP18, x3 + + // in + ldp xTMP1, xTMP2, [xTMP17], #16 + ldp xTMP3, xTMP4, [xTMP17], #16 + ldp xTMP5, xTMP6, [xTMP17], #16 + ldp xTMP7, xTMP8, [xTMP17] + // out + ldp xTMP9, xTMP10, [xTMP18], #16 + ldp xTMP11, xTMP12, [xTMP18], #16 + ldp xTMP13, xTMP14, [xTMP18], #16 + ldp xTMP15, xTMP16, [xTMP18] + + mov wTMP18, #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + GEN_1_TO_8_LANES 8 _SINGLE_KEY + GEN_1_TO_8_LANES 7 _SINGLE_KEY + GEN_1_TO_8_LANES 6 _SINGLE_KEY + GEN_1_TO_8_LANES 5 _SINGLE_KEY + GEN_1_TO_8_LANES 4 _SINGLE_KEY + GEN_1_TO_8_LANES 3 _SINGLE_KEY + GEN_1_TO_8_LANES 2 _SINGLE_KEY + GEN_1_TO_8_LANES 1 
_SINGLE_KEY + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_aarch64_sve256_asm) + +/* + * snow3g_f8_8_buffer_multikey_aarch64_sve256_asm(void **key, + * void **iv, + * void **in, + * void **out, + * uint32_t lengthInBytes[]) + * + */ +START_FUNC(snow3g_f8_8_buffer_multikey_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + // key + mov xTMP17, x0 + // iv + mov xTMP18, x1 + + // key + ldp xTMP0, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + // iv + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP0 xTMP1 xTMP2 xTMP3 xTMP4 xTMP5 xTMP6 xTMP7\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + + mov xTMP17, x2 + mov xTMP18, x3 + + // in + ldp xTMP1, xTMP2, [xTMP17], #16 + ldp xTMP3, xTMP4, [xTMP17], #16 + ldp xTMP5, xTMP6, [xTMP17], #16 + ldp xTMP7, xTMP8, [xTMP17] + // out + ldp xTMP9, xTMP10, [xTMP18], #16 + ldp xTMP11, xTMP12, [xTMP18], #16 + ldp xTMP13, xTMP14, [xTMP18], #16 + ldp xTMP15, xTMP16, [xTMP18] + + mov wTMP18, #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb 
zTMP11.S, PRED32/M, zTMP11.S + + GEN_1_TO_8_LANES 8 _MULTI_KEY + GEN_1_TO_8_LANES 7 _MULTI_KEY + GEN_1_TO_8_LANES 6 _MULTI_KEY + GEN_1_TO_8_LANES 5 _MULTI_KEY + GEN_1_TO_8_LANES 4 _MULTI_KEY + GEN_1_TO_8_LANES 3 _MULTI_KEY + GEN_1_TO_8_LANES 2 _MULTI_KEY + GEN_1_TO_8_LANES 1 _MULTI_KEY + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_multikey_aarch64_sve256_asm) + +/* + * snow3g_f9_8_buffer_keystream_aarch64_sve256_asm(void *pCtx, + * uint32_t* ks) + * + */ +START_FUNC(snow3g_f9_8_buffer_keystream_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + mov xTMP8, x1 + add xTMP9, xTMP8, #20 + add xTMP10, xTMP9, #20 + add xTMP11, xTMP10, #20 + add xTMP12, xTMP11, #20 + add xTMP13, xTMP12, #20 + add xTMP14, xTMP13, #20 + add xTMP15, xTMP14, #20 + + SNOW3G_LOAD_CTX_8_SVE256 x0 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + compact zTMP4.S, PRED32_HALF2, zTMP8.S + compact zTMP5.S, PRED32_HALF2, zTMP9.S + compact zTMP6.S, PRED32_HALF2, zTMP10.S + compact zTMP7.S, PRED32_HALF2, zTMP11.S + + st1 {vTMP8.4S}, [xTMP8], #16 + st1 {vTMP4.4S}, [xTMP9], #16 + st1 {vTMP9.4S}, [xTMP10], #16 + st1 {vTMP5.4S}, [xTMP11], #16 + st1 {vTMP10.4S}, [xTMP12], #16 + st1 {vTMP6.4S}, [xTMP13], #16 + st1 {vTMP11.4S}, [xTMP14], #16 + st1 {vTMP7.4S}, [xTMP15], #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + compact zTMP4.S, PRED32_HALF2, zTMP8.S + + st1 
{vTMP8.S}[0], [xTMP8], #4 + st1 {vTMP8.S}[1], [xTMP9], #4 + st1 {vTMP8.S}[2], [xTMP10], #4 + st1 {vTMP8.S}[3], [xTMP11], #4 + st1 {vTMP4.S}[0], [xTMP12], #4 + st1 {vTMP4.S}[1], [xTMP13], #4 + st1 {vTMP4.S}[2], [xTMP14], #4 + st1 {vTMP4.S}[3], [xTMP15], #4 + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f9_8_buffer_keystream_aarch64_sve256_asm) \ No newline at end of file diff --git a/lib/aarch64/snow3g_internal.h b/lib/aarch64/snow3g_internal.h index 8b7e79224a5ceee3e24418ba862b92bd28d6cb82..dfe8adaec8ea297e169dbddf10b9940bd9f36c12 100644 --- a/lib/aarch64/snow3g_internal.h +++ b/lib/aarch64/snow3g_internal.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -63,9 +63,9 @@ typedef struct snow3gKeyState1_s { /* 16 LFSR stages */ uint32_t LFSR_S[16]; /* 3 FSM states */ - uint32_t FSM_R3; - uint32_t FSM_R2; uint32_t FSM_R1; + uint32_t FSM_R2; + uint32_t FSM_R3; } DECLARE_ALIGNED(snow3gKeyState1_t, 16); typedef struct snow3gKeyState4_s { @@ -76,6 +76,14 @@ typedef struct snow3gKeyState4_s { uint32_t iLFSR_X; } snow3gKeyState4_t; +typedef struct snow3gKeyState8_s { + /* 16 LFSR stages */ + uint32x4x2_t LFSR_X[16]; + /* 3 FSM states */ + uint32x4x2_t FSM_X[3]; + uint32_t iLFSR_X; +} snow3gKeyState8_t; + /** * @brief Finds minimum 32-bit value in an array * @return Min 32-bit value @@ -134,6 +142,32 @@ length_check(const uint32_t *out_array, const size_t dim_array) return 1; } + +/** + * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values + * @retval 0 incorrect length value found + * @retval 1 all OK + */ +static inline uint32_t +length64_check(const uint64_t *out_array, const size_t dim_array) +{ + size_t i; + + if (out_array == NULL) 
{ + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return 0; + } + + for (i = 0; i < dim_array; i++) { + if ((out_array[i] == 0) || + (out_array[i] > SNOW3G_MAX_BYTELEN)) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return 0; + } + } + + return 1; +} #endif /** diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 982aaee8dc8706ff035027be1c8fead67fcffeff..67f6230c76988146a6381001d70938fca61d68fb 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -785,5 +785,7 @@ IMB_DLL_LOCAL void init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs); #endif /* IMB_IPSEC_MB_INTERNAL_H */ diff --git a/lib/include/snow3g.h b/lib/include/snow3g.h index 9bf40ae85f1cec766b1037793dfbd9cde5eee188..12326b0b2ebf00d54a335bcf58473fc699ee7835 100644 --- a/lib/include/snow3g.h +++ b/lib/include/snow3g.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2009-2022, Intel Corporation + Copyright (c) 2009-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -782,58 +782,172 @@ snow3g_f8_n_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], void *pBufferOut[], const uint32_t bufferLenInBytes[], const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + 
const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64(void); + +int +snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * SVE + ******************************************************************************/ void -snow3g_f8_4_buffer_initialize_aarch64(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4); +snow3g_f8_1_buffer_bit_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); void -snow3g_f8_1_buffer_stream_aarch64(void *pCtx, +snow3g_f8_1_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, const void *pBufferIn, void *pBufferOut, const uint32_t lengthInBytes); void -snow3g_f8_4_buffer_stream_aarch64(void *pCtx, +snow3g_f8_2_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, const void *pBufferIn4, void *pBufferOut4, - const uint32_t lengthInBytes); - -void -snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, - const void *pIV, - const void *pBufferIn, - const uint64_t 
lengthInBits, - void *pDigest); + const uint32_t lengthInBytes4); void -snow3g_f9_1_buffer_digest_aarch64(const uint32_t z[5], +snow3g_f8_4_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t *pCtx1, + const snow3g_key_schedule_t *pCtx2, + const snow3g_key_schedule_t *pCtx3, + const snow3g_key_schedule_t *pCtx4, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t 
bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, const void *pBufferIn, const uint64_t lengthInBits, void *pDigest); -void -snow3g_f9_4_buffer_keystream_aarch64(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]); - size_t -snow3g_key_sched_size_aarch64(void); +snow3g_key_sched_size_aarch64_sve256(void); int -snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); +snow3g_init_key_sched_aarch64_sve256(const void *pKey, + snow3g_key_schedule_t *pCtx); + + +void +snow3g_f8_8_buffer_initialize_aarch64_sve256(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); +void +snow3g_f8_8_buffer_stream_aarch64_sve256(void *pCtx, + const void **in, + void **out, + uint32_t lengthInBytes); /******************************************************************************* * AARCH64 NO-AESNI @@ -965,33 +1079,6 @@ snow3g_f8_n_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * const const uint32_t bufferLenInBytes[], const uint32_t bufferCount); -void -snow3g_f8_4_buffer_initialize_aarch64_no_aesni(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4); - -void -snow3g_f8_1_buffer_stream_aarch64_no_aesni(void *pCtx, - const void *pBufferIn, - void *pBufferOut, - const uint32_t lengthInBytes); - -void -snow3g_f8_4_buffer_stream_aarch64_no_aesni(void *pCtx, - const void *pBufferIn1, - void *pBufferOut1, - const void *pBufferIn2, - void *pBufferOut2, - 
const void *pBufferIn3, - void *pBufferOut3, - const void *pBufferIn4, - void *pBufferOut4, - const uint32_t lengthInBytes); - void snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, const void *pIV, @@ -999,19 +1086,6 @@ snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, const uint64_t lengthInBits, void *pDigest); -void -snow3g_f9_1_buffer_digest_aarch64_no_aesni(const uint32_t z[5], - const void *pBufferIn, - const uint64_t lengthInBits, - void *pDigest); - -void -snow3g_f9_4_buffer_keystream_aarch64_no_aesni(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]); - size_t snow3g_key_sched_size_aarch64_no_aesni(void); diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 4de506b435f128f12b886f0abb394e193df64891..04b61e1ddafcb743fd45814a433a6a7716175df1 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -124,6 +124,7 @@ typedef enum { IMB_ARCH_AVX2, IMB_ARCH_AVX512, IMB_ARCH_AARCH64, + IMB_ARCH_SVE256, IMB_ARCH_NUM, } IMB_ARCH; @@ -1077,6 +1078,7 @@ typedef uint32_t (*crc32_fn_t)(const void *, const uint64_t); #define IMB_FEATURE_AARCH64 (1ULL << 32) #define IMB_FEATURE_ASIMD (1ULL << 33) #define IMB_FEATURE_PMULL (1ULL << 34) +#define IMB_FEATURE_SVE256 (1ULL << 35) /* TOP LEVEL (IMB_MGR) Data structure fields */ @@ -1556,6 +1558,13 @@ IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64(IMB_MGR *state); IMB_DLL_EXPORT uint32_t queue_size_aarch64(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT void init_mb_mgr_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB 
*submit_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64_sve256(IMB_MGR *state); /** * @brief Automatically initialize most performant diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index d3e96a386602c608385d9bc9ab1140697aefb250..55d392aeb7b602a55ed03b4295f1f199e8d5320c 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2017-2022, Intel Corporation All rights reserved. + Copyright(c) 2017-2023, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -123,6 +123,7 @@ enum arch_type_e { ARCH_AVX2, ARCH_AVX512, ARCH_AARCH64, + ARCH_SVE256, NUM_ARCHS }; @@ -242,6 +243,7 @@ const struct str_value_mapping arch_str_map[] = { {.name = "AVX2", .values.arch_type = ARCH_AVX2 }, {.name = "AVX512", .values.arch_type = ARCH_AVX512 }, {.name = "AARCH64",.values.arch_type = ARCH_AARCH64 }, + {.name = "SVE256", .values.arch_type = ARCH_SVE256 }, }; const struct str_value_mapping cipher_algo_str_map[] = { @@ -947,7 +949,7 @@ struct custom_job_params custom_job_params = { .cipher_dir = IMB_DIR_ENCRYPT }; -uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1}; /* uses all function sets */ +uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1, 1}; /* uses all function sets */ int use_job_api = 0; int use_gcm_sgl_api = 0; int use_unhalted_cycles = 0; /* read unhalted cycles instead of tsc */ @@ -2994,7 +2996,7 @@ print_times(struct variant_s *variant_list, struct params_s *params, if (plot_output_option == 0) { const char *func_names[NUM_ARCHS] = { - "SSE", "AVX", 
"AVX2", "AVX512", "AARCH64" + "SSE", "AVX", "AVX2", "AVX512", "AARCH64", "SVE256" }; const char *c_mode_names[TEST_NUM_CIPHER_TESTS - 1] = { "CBC", "CNTR", "CNTR+8", "CNTR_BITLEN", "CNTR_BITLEN4", @@ -3234,6 +3236,9 @@ run_tests(void *arg) case ARCH_AARCH64: init_mb_mgr_aarch64(p_mgr); break; + case ARCH_SVE256: + init_mb_mgr_aarch64_sve256(p_mgr); + break; #endif /* __aarch64__ */ default: fprintf(stderr, "Invalid architecture: %d\n", arch); @@ -3305,7 +3310,7 @@ static void usage(void) "-h: print this message\n" "-c: Use cold cache, it uses warm as default\n" "-w: Use warm cache\n" - "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64)\n" + "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64/SVE)\n" "--arch-best: detect available architectures and run only on the best one\n" "--cipher-dir: Select cipher direction to run on the custom test " "(encrypt/decrypt) (default = encrypt)\n" @@ -3420,6 +3425,7 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; IMB_MGR *p_mgr = NULL; enum arch_type_e arch_id; @@ -3452,6 +3458,9 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) if ((p_mgr->features & detect_aarch64) != detect_aarch64) arch_support[ARCH_AARCH64] = 0; + if ((p_mgr->features & detect_sve256) != detect_sve256) + arch_support[ARCH_SVE256] = 0; + free_mb_mgr(p_mgr); return 0; @@ -3654,6 +3663,7 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; IMB_MGR *p_mgr = 
NULL; uint64_t detected_features = 0; @@ -3699,6 +3709,10 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) return 0; } + if ((detected_features & detect_sve256) == detect_sve256) { + arch_support[ARCH_SVE256] = 1; + return 0; + } fprintf(stderr, "Arch detection: no architecture available!\n"); return -1; } diff --git a/test/main.c b/test/main.c index 8dd4033495596b93acac9e8076a580d0316229c1..bc61efc718a39c86ad1ba75d9d080e2751b5cdb2 100644 --- a/test/main.c +++ b/test/main.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -316,6 +316,7 @@ print_hw_features(void) { IMB_FEATURE_AVX512_IFMA, "AVX512-IFMA" }, { IMB_FEATURE_BMI2, "BMI2" }, { IMB_FEATURE_AARCH64, "AARCH64" }, + { IMB_FEATURE_SVE256, "SVE256" }, }; IMB_MGR *p_mgr = NULL; unsigned i; @@ -471,6 +472,9 @@ main(int argc, char **argv) case IMB_ARCH_AARCH64: init_mb_mgr_aarch64(p_mgr); break; + case IMB_ARCH_SVE256: + init_mb_mgr_aarch64_sve256(p_mgr); + break; #endif #ifdef __x86_64__ diff --git a/test/utils.c b/test/utils.c index a894d81f58949af64fcbfd5f6b9ebf8bc9a25991..44e8b36da35f1cba42960b175a8a6646b2056ea1 100644 --- a/test/utils.c +++ b/test/utils.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2018-2022, Intel Corporation + Copyright (c) 2018-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -220,6 +220,7 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = 
IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; #ifdef __x86_64__ const uint64_t detect_noaesni = IMB_FEATURE_SSE4_2 | IMB_FEATURE_CMOV; @@ -264,6 +265,10 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) if ((p_mgr->features & detect_aarch64) != detect_aarch64) arch_support[IMB_ARCH_AARCH64] = 0; + if ((p_mgr->features & detect_sve256) != detect_sve256) { + arch_support[IMB_ARCH_SVE256] = 0; + } + free_mb_mgr(p_mgr); if (arch_support[IMB_ARCH_NOAESNI] == 0 && @@ -271,7 +276,8 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) arch_support[IMB_ARCH_AVX] == 0 && arch_support[IMB_ARCH_AVX2] == 0 && arch_support[IMB_ARCH_AVX512] == 0 && - arch_support[IMB_ARCH_AARCH64] == 0) { + arch_support[IMB_ARCH_AARCH64] == 0 && + arch_support[IMB_ARCH_SVE256] == 0) { fprintf(stderr, "No available architecture detected!\n"); return -1; } @@ -289,7 +295,7 @@ void print_tested_arch(const uint64_t features, const IMB_ARCH arch) { static const char *arch_str_tab[IMB_ARCH_NUM] = { - "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64" + "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64", "SVE256" }; const char *feat = ""; @@ -298,6 +304,7 @@ print_tested_arch(const uint64_t features, const IMB_ARCH arch) case IMB_ARCH_AVX2: case IMB_ARCH_AVX: case IMB_ARCH_AARCH64: + case IMB_ARCH_SVE256: break; case IMB_ARCH_SSE: if (features & IMB_FEATURE_SHANI) {