diff --git a/README.md b/README.md index d683ff03fbe4bb0c1585c7b38a956c2bc00810f2..056812213331933d714a7a9b440daa797f9433aa 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Table 1. List of supported cipher algorithms and their implementations. | KASUMI-F8 | Y | N | N | N | N | N | N | | ZUC-EEA3 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | | ZUC-EEA3-256 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | -| SNOW3G-UEA2 | N | Y | Y | Y | Y x16 | Y x16 | N | +| SNOW3G-UEA2 | N | Y | Y | Y | Y x16 | Y x16 | Y | | AES128-CBCS(9) | N | Y(1) | Y(3) | N | N | Y(6) | N | | Chacha20 | N | Y | Y | Y | Y | N | N | | Chacha20 AEAD | N | Y | Y | Y | Y | N | N | @@ -133,7 +133,7 @@ Table 2. List of supported integrity algorithms and their implementations. | KASUMI-F9 | Y | N | N | N | N | N | N | | ZUC-EIA3 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | | ZUC-EIA3-256 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | -| SNOW3G-UIA2(8) | N | Y by4 | Y by4 | N | Y by32 | Y by32 | N | +| SNOW3G-UIA2(8) | N | Y by4 | Y by4 | N | Y by32 | Y by32 | Y | | DOCSIS-CRC32(4) | N | Y | Y | N | Y | Y | N | | HEC | N | Y | Y | N | N | N | N | | POLY1305 | Y | N | N | N | Y | Y | N | diff --git a/lib/Makefile b/lib/Makefile index 14cc191af5c98e4d7a7fdf8d4e991d9cba385d66..195fc76f78cd72afed226aeb0a7a2190f99e06cc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # - LIB = libIPSec_MB SHARED ?= y IMB_HDR = ipsec-mb.h @@ -35,6 +34,8 @@ ifeq ($(IMB_VERSION),) $(error "Failed to detect library version!") endif +ARCH = $(shell uname -m) + VERSION = $(shell echo $(IMB_VERSION) | cut -d. -f1-3) SO_VERSION = $(shell echo $(VERSION) | cut -d. -f1) @@ -46,6 +47,7 @@ MAN1 = libipsec-mb.7 MAN2 = libipsec-mb-dev.7 NOLDCONFIG ?= n +ifeq ($(ARCH),x86_64) USE_YASM ?= n YASM ?= yasm NASM ?= nasm @@ -66,12 +68,14 @@ NASM_GE_MINOR = $(shell [ $(NASM_MINOR_VER) -ge $(NASM_MINOR_REQ) ] && echo true ifneq ($(NASM_GE_MAJOR),true) $(warning "NASM version found: $(NASM_VERSION)") $(error "Minimum required: $(NASM_MAJOR_REQ).$(NASM_MINOR_REQ)") -endif +endif # NASM_GE_MAJOR ifneq ($(NASM_GE_MINOR),true) $(warning "NASM version found: $(NASM_VERSION)") $(error "Minimum required: $(NASM_MAJOR_REQ).$(NASM_MINOR_REQ)") -endif -endif +endif # NASM_GE_MINOR +endif # NASM_VERSION + +endif # x86_64 OBJ_DIR ?= obj LIB_DIR ?= . 
@@ -85,11 +89,13 @@ MINGW ?= $(shell $(CC) -dM -E - < /dev/null | grep -i mingw | wc -l | sed 's/^ * # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported # "-fcf-protection" needs to be checked separately +ifeq ($(ARCH),x86_64) ifeq ($(MINGW),0) CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \ $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l)) CET_LDFLAGS=-r -z ibt -z shstk -endif +endif # MINGW +endif # x86_64 CFLAGS := -DNO_COMPAT_IMB_API_053 $(EXTRA_CFLAGS) $(INCLUDES) \ -W -Wall -Wextra -Wmissing-declarations -Wpointer-arith \ -Wcast-qual -Wundef -Wwrite-strings \ @@ -116,7 +122,7 @@ NASM_FLAGS := -Werror -fwin64 -Xvc -gcv8 -DWIN_ABI $(NASM_INCLUDES) else YASM_FLAGS := -f x64 -f elf64 -X gnu -g dwarf2 -DLINUX -D__linux__ $(YASM_INCLUDES) NASM_FLAGS := -Werror -felf64 -Xgnu -gdwarf -DLINUX -D__linux__ $(NASM_INCLUDES) -endif +endif # MINGW DEBUG_OPT ?= -O0 ifeq ($(DEBUG),y) @@ -130,8 +136,8 @@ CFLAGS += -fstack-protector -D_FORTIFY_SOURCE=2 else OPT = -O2 LDFLAGS += -s -endif -endif +endif # MINGW +endif # DEBUG ifeq ($(SAFE_OPTIONS), n) SAFE_DATA = n @@ -168,40 +174,56 @@ CFLAGS_NO_SIMD = $(CFLAGS) -O1 CFLAGS += $(OPT) # Set generic architectural optimizations +ifeq ($(ARCH),x86_64) OPT_X86 := -msse4.2 OPT_SSE := -msse4.2 -maes -mpclmul OPT_AVX := -mavx -maes -mpclmul OPT_AVX2 := -mavx2 -maes -mpclmul OPT_AVX512 := -mavx2 -maes -mpclmul # -mavx512f is not available until gcc 4.9 OPT_NOAESNI := -msse4.2 -mno-aes -mno-pclmul +endif # x86_64 + +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_NOAESNI := -march=armv8-a +endif # aarch64 # Set architectural optimizations for GCC/CC ifeq ($(CC),$(filter $(CC),gcc cc)) GCC_VERSION = $(shell $(CC) -dumpversion | cut -d. -f1) GCC_GE_V5 = $(shell [ $(GCC_VERSION) -ge 5 ] && echo true) ifeq ($(GCC_GE_V5),true) +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_NOAESNI := -march=armv8-a +else OPT_SSE := -march=nehalem -maes -mpclmul OPT_AVX := -march=sandybridge -maes -mpclmul OPT_AVX2 := -march=haswell -maes -mpclmul OPT_AVX512 := -march=broadwell -maes -mpclmul OPT_NOAESNI := -march=nehalem -mno-pclmul -endif -endif +endif # AARCH64 +endif # GCC +endif # CC # Set architectural optimizations for clang ifeq ($(CC),$(filter $(CC),clang)) CLANG_VERSION = $(shell $(CC) --version | head -n 1 | cut -d ' ' -f 3) CLANG_GE_V381 = $(shell test "$(CLANG_VERSION)" \> "3.8.0" && echo true) ifeq ($(CLANG_GE_V381),true) +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +else OPT_SSE := -march=nehalem -maes -mpclmul OPT_AVX := -march=sandybridge -maes -mpclmul OPT_AVX2 := -march=haswell -maes -mpclmul OPT_AVX512 := -march=broadwell -maes -mpclmul -endif +endif # AARCH64 +endif # CLANG # remove CFLAGS that clang warns about CFLAGS := $(subst -fno-delete-null-pointer-checks,,$(CFLAGS)) CFLAGS := $(subst -fno-strict-overflow,,$(CFLAGS)) -endif +endif # CC # so or static build ifeq ($(SHARED),y) @@ -211,7 +233,7 @@ LIBNAME = $(LIB).dll else LIBNAME = $(LIB).so.$(VERSION) LDFLAGS += -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now -endif +endif # MINGW LIBPERM = 0755 ifeq ($(CC_HAS_CET),1) LDFLAGS += -fcf-protection=full -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error @@ -224,7 +246,7 @@ LDFLAGS += -g ifeq ($(CC_HAS_CET),1) LDFLAGS += -fcf-protection=full endif -endif +endif # shared # warning messages SAFE_PARAM_MSG1="SAFE_PARAM option not set." 
@@ -242,6 +264,24 @@ SAFE_OPTIONS_MSG2="All safe options enabled by default." # # List of C modules (any origin) # +ifeq ($(ARCH),aarch64) +c_lib_objs := \ + mb_mgr_aarch64.o \ + mb_mgr_auto_aarch64.o \ + mb_mgr_aarch64_no_aesni.o \ + alloc_aarch64.o \ + clear_mem_aarch64.o \ + cpu_features_aarch64.o \ + version.o \ + aesni_emu.o \ + snow3g_aarch64.o \ + snow3g_aarch64_no_aesni.o \ + snow3g_tables.o \ + snow3g_iv.o \ + error.o +asm_generic_lib_objs := \ + lookup_16x8bit_neon.o +else c_lib_objs := \ mb_mgr_avx.o \ mb_mgr_avx2.o \ @@ -300,7 +340,6 @@ asm_generic_lib_objs := \ crc32_const.o \ poly1305.o \ chacha20_poly1305.o - # # List of ASM modules (no-aesni directory) # @@ -604,8 +643,8 @@ asm_avx512_lib_objs := \ mb_mgr_zuc_submit_flush_avx512.o \ mb_mgr_zuc_submit_flush_gfni_avx512.o \ chacha20_avx512.o \ - poly_avx512.o \ - poly_fma_avx512.o \ + poly_avx512.o \ + poly_fma_avx512.o \ ethernet_fcs_avx512.o \ crc16_x25_avx512.o \ crc32_refl_by16_vclmul_avx512.o \ @@ -649,9 +688,15 @@ asm_avx512_gcm_objs := \ aes128_gmac_by48_api_vaes_avx512.o aes192_gmac_by48_api_vaes_avx512.o aes256_gmac_by48_api_vaes_avx512.o \ aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o +endif # AARCH64 + # # build object files lists # +ifeq ($(ARCH),aarch64) +asm_obj_files := $(asm_generic_lib_objs) +c_obj_files := $(c_lib_objs) +else asm_obj_files := $(asm_generic_lib_objs) \ $(asm_sse_lib_objs) $(asm_sse_gcm_objs) \ $(asm_avx_lib_objs) $(asm_avx_gcm_objs) \ @@ -662,6 +707,7 @@ ifeq ($(AESNI_EMU), y) asm_obj_files := $(asm_obj_files) $(asm_noaesni_lib_objs) $(asm_noaesni_gcm_objs) endif c_obj_files := $(c_lib_objs) $(c_gcm_objs) +endif # AARCH64 # # aggregate all objects files together and prefix with OBJDIR @@ -705,7 +751,7 @@ else endif else $(AR) -qcs $@ $^ -endif +endif # SHARED ifeq ($(SAFE_PARAM), n) @echo "NOTE:" $(SAFE_PARAM_MSG1) $(SAFE_PARAM_MSG2) endif @@ -764,6 +810,16 @@ $(dep_target_files): | $(OBJ_DIR) # - dependency file construction is part of the compilation # +ifeq ($(ARCH),aarch64) +$(OBJ_DIR)/%.o:aarch64/%.c + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:x86_64/%.c + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:aarch64/%.S + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:no-aesni/%.c + $(CC) -MMD $(OPT_NOAESNI) -c $(CFLAGS) $< -o $@ +else $(OBJ_DIR)/%.o:x86_64/%.c $(CC) -MMD $(OPT_X86) -c $(CFLAGS) $< -o $@ @@ -846,7 +902,8 @@ endif ifeq ($(CC_HAS_CET),1) $(LD) $(CET_LDFLAGS) -o $@.tmp $@ mv $@.tmp $@ -endif +endif # CC_HAS_CET +endif # AARCH64 $(OBJ_DIR): mkdir $(OBJ_DIR) @@ -905,7 +962,7 @@ help: @echo "SAFE_OPTIONS=n" @echo " - Disable all safe options (enabled by default)" - +ifneq ($(ARCH), aarch64) CHECKPATCH ?= checkpatch.pl # checkpatch ignore settings: # SPACING - produces false positives with typedefs and * @@ -935,6 +992,7 @@ CHECKPATCH_FLAGS = --no-tree --no-signoff --emacs --no-color --ignore CODE_INDEN SOURCES_DIRS := . 
sse avx avx2 avx512 include no-aesni
 SOURCES := $(foreach dir,$(SOURCES_DIRS),$(wildcard $(dir)/*.[ch]) $(wildcard $(dir)/*.asm) $(wildcard $(dir)/*.inc))
 SOURCES_STYLE := $(foreach infile,$(SOURCES),$(infile)_style_check)
+endif # AARCH64
 
 .PHONY: style
 style: $(SOURCES_STYLE)
diff --git a/lib/aarch64/alloc_aarch64.c b/lib/aarch64/alloc_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc9e980ab89a29fc453b8535bf6b8db3b3eb7867
--- /dev/null
+++ b/lib/aarch64/alloc_aarch64.c
@@ -0,0 +1,245 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+
+#include <stdlib.h> /* posix_memalign() and free() */
+
+#include <string.h>
+#include "ipsec-mb.h"
+#include "ipsec_ooo_mgr.h"
+#include "cpu_feature.h"
+#include "error.h"
+
+#define IMB_OOO_ROAD_BLOCK 0xDEADCAFEDEADCAFEULL
+
+#define ALIGNMENT 64
+#define ALIGN(x, y) ((x + (y - 1)) & (~(y - 1)))
+
+#define OOO_INFO(imb_mgr_ooo_ptr_name__, ooo_mgr_type__) \
+        { offsetof(IMB_MGR, imb_mgr_ooo_ptr_name__), \
+          ALIGN(sizeof(ooo_mgr_type__), ALIGNMENT), \
+          offsetof(ooo_mgr_type__, road_block) }
+
+const struct {
+        size_t ooo_ptr_offset;
+        size_t ooo_aligned_size;
+        size_t road_block_offset;
+} ooo_mgr_table[] = {
+};
+
+/**
+ * @brief Calculates necessary memory size for IMB_MGR.
+ *
+ * @return Size for IMB_MGR (aligned to 64 bytes)
+ */
+size_t imb_get_mb_mgr_size(void)
+{
+        size_t ooo_total_size = 0;
+        unsigned i;
+
+        for (i = 0; i < IMB_DIM(ooo_mgr_table); i++)
+                ooo_total_size += ooo_mgr_table[i].ooo_aligned_size;
+        /*
+         * Add 64 bytes into the maximum size calculation to
+         * make sure there is enough room to align the OOO managers.
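+         * (With ALIGNMENT equal to 64 this covers the worst case, where
+         * the first manager needs up to 63 bytes of padding.)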
+ */ + return (sizeof(IMB_MGR) + ooo_total_size + ALIGNMENT); +} + +static uint8_t *get_ooo_ptr(IMB_MGR *mgr, const size_t offset) +{ + uint8_t *mgr_offset = &((uint8_t *) mgr)[offset]; + uint8_t **ptr = (uint8_t **) mgr_offset; + + return *ptr; +} + +static void set_ooo_ptr(IMB_MGR *mgr, const size_t offset, uint8_t *new_ptr) +{ + uint8_t *mgr_offset = &((uint8_t *) mgr)[offset]; + uint8_t **ptr = (uint8_t **) mgr_offset; + + *ptr = new_ptr; +} + +static void set_road_block(uint8_t *ooo_ptr, const size_t offset) +{ + uint64_t *p_road_block = (uint64_t *) &ooo_ptr[offset]; + + *p_road_block = IMB_OOO_ROAD_BLOCK; +} + +/* + * Set last 8 bytes of OOO mgrs to predefined pattern + * + * This is to assist in searching for sensitive data remaining + * in the heap after algorithmic code completes + */ +static void set_ooo_mgr_road_block(IMB_MGR *mgr) +{ + unsigned n; + + for (n = 0; n < IMB_DIM(ooo_mgr_table); n++) + set_road_block(get_ooo_ptr(mgr, + ooo_mgr_table[n].ooo_ptr_offset), + ooo_mgr_table[n].road_block_offset); +} + + +/** + * @brief Initializes IMB_MGR pointers to out-of-order managers with + * use of externally allocated memory. + * + * imb_get_mb_mgr_size() should be called to know how much memory + * should be allocated externally. + * + * init_mb_mgr_XXX() must be called after this function call, + * whereas XXX is the desired architecture (including "auto"), + * only if reset_mgr is set to 0. + * + * @param mem_ptr a pointer to allocated memory + * @param flags multi-buffer manager flags + * IMB_FLAG_SHANI_OFF - disable use (and detection) of SHA extensions. + * IMB_FLAG_AESNI_OFF - disable use (and detection) of AES extensions. + * + * @param reset_mgr if 0, IMB_MGR structure is not cleared, else it is. + * + * @return Pointer to IMB_MGR structure + */ +IMB_MGR *imb_set_pointers_mb_mgr(void *mem_ptr, const uint64_t flags, + const unsigned reset_mgr) +{ + if (mem_ptr == NULL) { + imb_set_errno(mem_ptr, ENOMEM); + return NULL; + } + + IMB_MGR *ptr = (IMB_MGR *) mem_ptr; + uint8_t *ptr8 = (uint8_t *) ptr; + uint8_t *free_mem = &ptr8[ALIGN(sizeof(IMB_MGR), ALIGNMENT)]; + const size_t mem_size = imb_get_mb_mgr_size(); + unsigned i; + + if (reset_mgr) { + /* Zero out MB_MGR memory */ + memset(mem_ptr, 0, mem_size); + } else { + IMB_ARCH used_arch = (IMB_ARCH) ptr->used_arch; + + /* Reset function pointers from previously used architecture */ + switch (used_arch) { + case IMB_ARCH_NOAESNI: + init_mb_mgr_aarch64_no_aesni_internal(ptr, 0); + break; + case IMB_ARCH_AARCH64: + init_mb_mgr_aarch64_internal(ptr, 0); + break; + default: + break; + } + } + + imb_set_errno(ptr, 0); + ptr->flags = flags; /* save the flags for future use in init */ + ptr->features = cpu_feature_adjust(flags, cpu_feature_detect()); + + /* Set OOO pointers */ + for (i = 0; i < IMB_DIM(ooo_mgr_table); i++) { + set_ooo_ptr(ptr, ooo_mgr_table[i].ooo_ptr_offset, free_mem); + free_mem = &free_mem[ooo_mgr_table[i].ooo_aligned_size]; + IMB_ASSERT((uintptr_t)(free_mem - ptr8) <= mem_size); + } + set_ooo_mgr_road_block(ptr); + + return ptr; +} + + +static void * +alloc_aligned_mem(const size_t size) +{ + void *ptr; + + const size_t alignment = 64; + if (posix_memalign((void **)&ptr, alignment, size)) + return NULL; + + IMB_ASSERT(ptr != NULL); + + memset(ptr, 0, size); + + return ptr; +} + +static void +free_mem(void *ptr) +{ + free(ptr); +} + +/** + * @brief Allocates memory for multi-buffer manager instance + * + * For binary compatibility between library versions + * it is recommended to use this API. 
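+ *
+ * A minimal usage sketch (illustrative only; error handling trimmed):
+ * @code
+ * IMB_MGR *mgr = alloc_mb_mgr(0);
+ *
+ * if (mgr != NULL) {
+ *         init_mb_mgr_auto(mgr, NULL);
+ *         // ... submit and flush jobs here ...
+ *         free_mb_mgr(mgr);
+ * }
+ * @endcode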
+ *
+ * @return Pointer to allocated memory for MB_MGR structure
+ * @retval NULL on allocation error
+ */
+IMB_MGR *alloc_mb_mgr(uint64_t flags)
+{
+        IMB_MGR *ptr = NULL;
+
+        ptr = alloc_aligned_mem(sizeof(IMB_MGR));
+        IMB_ASSERT(ptr != NULL);
+        if (ptr != NULL) {
+                imb_set_errno(ptr, 0);
+                ptr->flags = flags; /* save the flags for future use in init */
+                ptr->features = cpu_feature_adjust(flags,
+                                                   cpu_feature_detect());
+        } else {
+                imb_set_errno(ptr, ENOMEM);
+                return NULL;
+        }
+
+        return ptr;
+}
+
+/**
+ * @brief Frees memory allocated previously by alloc_mb_mgr()
+ *
+ * @param ptr a pointer to allocated MB_MGR structure
+ *
+ */
+void free_mb_mgr(IMB_MGR *ptr)
+{
+        IMB_ASSERT(ptr != NULL);
+
+        /* Free IMB_MGR */
+        free_mem(ptr);
+}
diff --git a/lib/aarch64/clear_mem_aarch64.c b/lib/aarch64/clear_mem_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf50dfe3a97bb1ab86118e7c2e61fba2c9d311bc
--- /dev/null
+++ b/lib/aarch64/clear_mem_aarch64.c
@@ -0,0 +1,36 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "ipsec-mb.h"
+#include <string.h>
+
+void imb_clear_mem(void *mem, const size_t size)
+{
+        if (mem == NULL)
+                return;
+        memset(mem, 0, size);
+}
diff --git a/lib/aarch64/clear_regs_mem_aarch64.h b/lib/aarch64/clear_regs_mem_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b9587d12965828bada0b7af0a74aeecf2fcd398
--- /dev/null
+++ b/lib/aarch64/clear_regs_mem_aarch64.h
@@ -0,0 +1,109 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef CLEAR_REGS_H
+#define CLEAR_REGS_H
+
+#include <string.h>
+
+#define GPR_EOR_SELF(reg) "eor " #reg "," #reg "," #reg ";"
+
+#define CLEAR_SCRATCH_GPS() \
+do { \
+        asm volatile( \
+                GPR_EOR_SELF(x0) \
+                GPR_EOR_SELF(x1) \
+                GPR_EOR_SELF(x2) \
+                GPR_EOR_SELF(x3) \
+                GPR_EOR_SELF(x4) \
+                GPR_EOR_SELF(x5) \
+                GPR_EOR_SELF(x6) \
+                GPR_EOR_SELF(x7) \
+                GPR_EOR_SELF(x8) \
+                GPR_EOR_SELF(x9) \
+                GPR_EOR_SELF(x10) \
+                GPR_EOR_SELF(x11) \
+                GPR_EOR_SELF(x12) \
+                GPR_EOR_SELF(x13) \
+                GPR_EOR_SELF(x14) \
+                GPR_EOR_SELF(x15) \
+                GPR_EOR_SELF(x16) \
+                GPR_EOR_SELF(x17) \
+                GPR_EOR_SELF(x18) \
+                :::"x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11", \
+                "x12","x13","x14","x15","x16","x17","x18","x19","x20","x21","x22", \
+                "x23","x24","x25","x26","x27","x28"); \
+} while (0)
+
+#define SIMD_EOR_SELF(reg) "eor " #reg ".16b," #reg ".16b," #reg ".16b;"
+
+#define CLEAR_SCRATCH_SIMD_REGS() \
+do { \
+        asm volatile( \
+                SIMD_EOR_SELF(v0) \
+                SIMD_EOR_SELF(v1) \
+                SIMD_EOR_SELF(v2) \
+                SIMD_EOR_SELF(v3) \
+                SIMD_EOR_SELF(v4) \
+                SIMD_EOR_SELF(v5) \
+                SIMD_EOR_SELF(v6) \
+                SIMD_EOR_SELF(v7) \
+                SIMD_EOR_SELF(v16) \
+                SIMD_EOR_SELF(v17) \
+                SIMD_EOR_SELF(v18) \
+                SIMD_EOR_SELF(v19) \
+                SIMD_EOR_SELF(v20) \
+                SIMD_EOR_SELF(v21) \
+                SIMD_EOR_SELF(v22) \
+                SIMD_EOR_SELF(v23) \
+                SIMD_EOR_SELF(v24) \
+                SIMD_EOR_SELF(v25) \
+                SIMD_EOR_SELF(v26) \
+                SIMD_EOR_SELF(v27) \
+                SIMD_EOR_SELF(v28) \
+                SIMD_EOR_SELF(v29) \
+                SIMD_EOR_SELF(v30) \
+                SIMD_EOR_SELF(v31) \
+                :::"v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18", \
+                "v19","v20","v21","v22","v23","v24","v25","v26","v27","v28", \
+                "v29","v30","v31"); \
+} while (0)
+
+static inline void
+clear_mem(void *mem, const size_t size)
+{
+        memset(mem, 0, size);
+}
+
+static inline void
+clear_var(void *var, const size_t size)
+{
+        memset(var, 0, size);
+}
+
+#endif /* CLEAR_REGS_H */
diff --git a/lib/aarch64/constant_lookup_aarch64.h b/lib/aarch64/constant_lookup_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..d55d341312b2eeec1b7dc5cdb5b3f990c03e3f56
--- /dev/null
+++ b/lib/aarch64/constant_lookup_aarch64.h
@@ -0,0 +1,48 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef CONSTANT_LOOKUP_H
+#define CONSTANT_LOOKUP_H
+
+#include "ipsec-mb.h"
+
+#include <arm_neon.h>
+
+/**
+ * @brief Constant time and parallel NEON lookup function on table of
+ *        256 elements of 8-bit values.
+ *
+ * @param[in] indexes vector with 16 8-bit indexes
+ * @param[in] table   pointer to 256 element table
+ *
+ * @return Vector with 16 8-bit values corresponding to the indexes
+ */
+IMB_DLL_LOCAL uint8x16_t
+lookup_16x8bit_neon(const uint8x16_t indexes, const void *table);
+
+#endif /* CONSTANT_LOOKUP_H */
diff --git a/lib/aarch64/cpu_features_aarch64.c b/lib/aarch64/cpu_features_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..2aae71a25a8e16b23c6f5d789e0b20a535cbf3c2
--- /dev/null
+++ b/lib/aarch64/cpu_features_aarch64.c
@@ -0,0 +1,78 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "cpu_feature.h"
+#include <sys/auxv.h>  /* getauxval() and AT_HWCAP */
+#include <asm/hwcap.h> /* HWCAP_ASIMD and HWCAP_AES */
+
+static uint32_t detect_asimd(void)
+{
+        return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+}
+
+static uint32_t detect_aes(void)
+{
+        return getauxval(AT_HWCAP) & HWCAP_AES;
+}
+
+uint64_t cpu_feature_detect(void)
+{
+        uint64_t features = 0;
+#ifdef __aarch64__
+        features |= IMB_FEATURE_AARCH64;
+#endif
+        if (detect_asimd()) {
+                features |= IMB_FEATURE_ASIMD;
+                if (detect_aes())
+                        features |= IMB_FEATURE_AESNI;
+        }
+
+#ifdef SAFE_DATA
+        features |= IMB_FEATURE_SAFE_DATA;
+#endif
+#ifdef SAFE_PARAM
+        features |= IMB_FEATURE_SAFE_PARAM;
+#endif
+
+        return features;
+}
+
+uint64_t cpu_feature_adjust(const uint64_t flags, uint64_t features)
+{
+        if (flags & IMB_FLAG_AESNI_OFF)
+                features &= ~IMB_FEATURE_AESNI;
+
+        return features;
+}
+
+/* External function to retrieve feature flags */
+uint64_t imb_get_feature_flags(void)
+{
+        return cpu_feature_detect();
+}
+
diff --git a/lib/aarch64/lookup_16x8bit_neon.S b/lib/aarch64/lookup_16x8bit_neon.S
new file mode 100644
index 0000000000000000000000000000000000000000..1d55641674cf7a84291a0ab6f1aa52ccae9d8323
--- /dev/null
+++ b/lib/aarch64/lookup_16x8bit_neon.S
@@ -0,0 +1,57 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/ +.globl lookup_16x8bit_neon +.type lookup_16x8bit_neon,%function +.align 5 +lookup_16x8bit_neon: +/* param[in]: x0 table pointer + * v0 indexes + */ + // table: v16-v31 + mov v1.16b,v0.16b + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0] + + movi v2.16b,#64 + + eor v3.16b,v3.16b,v3.16b + tbx v3.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v1.16b + + mov v0.16b,v3.16b + + ret diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c new file mode 100644 index 0000000000000000000000000000000000000000..c7dad7ade82ee3b50d2679fab82b35eab1467bfe --- /dev/null +++ b/lib/aarch64/mb_mgr_aarch64.c @@ -0,0 +1,123 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "ipsec-mb.h"
+#include "include/snow3g.h"
+
+#include "include/cpu_feature.h"
+#include "include/error.h"
+#include "clear_regs_mem_aarch64.h"
+#include "include/noaesni.h"
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_aarch64
+#define FLUSH_JOB flush_job_aarch64
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64
+#define GET_NEXT_JOB get_next_job_aarch64
+#define GET_COMPLETED_JOB get_completed_job_aarch64
+
+#define QUEUE_SIZE queue_size_aarch64
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64
+
+/* ====================================================================== */
+
+static void
+reset_ooo_mgrs(IMB_MGR *state)
+{
+        (void) state; /* no out-of-order managers on this architecture yet */
+        return;
+}
+
+IMB_DLL_LOCAL void
+init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs)
+{
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+
+        /* reset error status */
+        imb_set_errno(state, 0);
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+
+        /* Set architecture for future checks */
+        state->used_arch = (uint32_t) IMB_ARCH_AARCH64;
+
+        if (!(state->features & IMB_FEATURE_AESNI)) {
+                init_mb_mgr_aarch64_no_aesni(state);
+                return;
+        }
+
+        if (reset_mgrs) {
+                reset_ooo_mgrs(state);
+
+                /* Init "in order" components */
+                state->next_job = 0;
+                state->earliest_job = -1;
+        }
+
+        /* set AARCH64 handlers */
+        state->get_next_job = get_next_job_aarch64;
+        state->submit_job = submit_job_aarch64;
+        state->submit_job_nocheck = submit_job_nocheck_aarch64;
+        state->get_completed_job = get_completed_job_aarch64;
+        state->flush_job = flush_job_aarch64;
+        state->queue_size = queue_size_aarch64;
+
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64;
+        state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64;
+        state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64;
+        state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64;
+        state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64;
+        state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64;
+        state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_aarch64;
+        state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_aarch64;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64;
+}
+
+void
+init_mb_mgr_aarch64(IMB_MGR *state)
+{
+        init_mb_mgr_aarch64_internal(state, 1);
+}
+
+#include "mb_mgr_code_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
new file mode 100644
index 0000000000000000000000000000000000000000..be858f54e6dc03c977eb59917e793a9cc1fb0b09
--- /dev/null
+++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
@@ -0,0 +1,112 @@
+/*********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "ipsec-mb.h"
+#include "include/snow3g.h"
+
+#include "include/noaesni.h"
+#include "include/error.h"
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_aarch64_no_aesni
+#define FLUSH_JOB flush_job_aarch64_no_aesni
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_no_aesni
+#define GET_NEXT_JOB get_next_job_aarch64_no_aesni
+#define GET_COMPLETED_JOB get_completed_job_aarch64_no_aesni
+
+#define QUEUE_SIZE queue_size_aarch64_no_aesni
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64
+
+/* ====================================================================== */
+static void
+reset_ooo_mgrs(IMB_MGR *state)
+{
+        (void) state; /* no out-of-order managers on this architecture yet */
+        return;
+}
+
+IMB_DLL_LOCAL void
+init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs)
+{
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+        imb_set_errno(state, 0);
+
+        /* Set architecture for future checks */
+        state->used_arch = (uint32_t) IMB_ARCH_NOAESNI;
+
+        if (reset_mgrs) {
+                reset_ooo_mgrs(state);
+
+                /* Init "in order" components */
+                state->next_job = 0;
+                state->earliest_job = -1;
+        }
+
+        /* set AARCH64 NO AESNI handlers */
+        state->get_next_job = get_next_job_aarch64_no_aesni;
+        state->submit_job = submit_job_aarch64_no_aesni;
+        state->submit_job_nocheck = submit_job_nocheck_aarch64_no_aesni;
+        state->get_completed_job = get_completed_job_aarch64_no_aesni;
+        state->flush_job = flush_job_aarch64_no_aesni;
+        state->queue_size = queue_size_aarch64_no_aesni;
+
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_no_aesni;
+        state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_no_aesni;
+        state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64_no_aesni;
+        state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64_no_aesni;
+        state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64_no_aesni;
+        state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64_no_aesni;
+        state->snow3g_f8_8_buffer_multikey =
+                snow3g_f8_8_buffer_multikey_aarch64_no_aesni;
+        state->snow3g_f8_n_buffer_multikey =
+                snow3g_f8_n_buffer_multikey_aarch64_no_aesni;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64_no_aesni;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64_no_aesni;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64_no_aesni;
+}
+
+void
+init_mb_mgr_aarch64_no_aesni(IMB_MGR *state)
+{
+        init_mb_mgr_aarch64_no_aesni_internal(state, 1);
+}
+
+#include "mb_mgr_code_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_auto_aarch64.c b/lib/aarch64/mb_mgr_auto_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4c0797e1eac1f0852135e62e991f74b6cd4a1a4
--- /dev/null
+++ b/lib/aarch64/mb_mgr_auto_aarch64.c
@@ -0,0 +1,73 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "ipsec-mb.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+#include "error.h"
+
+/**
+ * @brief Automatically initializes the most performant
+ *        multi-buffer manager based on CPU features
+ *
+ * @param [in] state Pointer to MB_MGR struct
+ * @param [out] arch Pointer to arch enum to be set (can be NULL)
+ */
+void
+init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch)
+{
+        IMB_ARCH arch_detected = IMB_ARCH_NONE;
+        const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI;
+        const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD;
+
+        /* reset error status */
+        imb_set_errno(state, 0);
+
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+        if ((state->features & detect_aarch64) == detect_aarch64) {
+                init_mb_mgr_aarch64(state);
+                arch_detected = IMB_ARCH_AARCH64;
+                goto init_mb_mgr_auto_ret;
+        }
+        if ((state->features & detect_noaesni) == detect_noaesni) {
+                init_mb_mgr_aarch64_no_aesni(state);
+                arch_detected = IMB_ARCH_NOAESNI;
+                goto init_mb_mgr_auto_ret;
+        }
+
+        imb_set_errno(state, ENODEV);
+
+init_mb_mgr_auto_ret:
+        if (arch != NULL)
+                *arch = arch_detected;
+}
diff --git a/lib/aarch64/mb_mgr_code_aarch64.h b/lib/aarch64/mb_mgr_code_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..97343d12894bf33102c757b17625898f35b2333f
--- /dev/null
+++ b/lib/aarch64/mb_mgr_code_aarch64.h
@@ -0,0 +1,540 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef MB_MGR_CODE_H
+#define MB_MGR_CODE_H
+
+/*
+ * This contains the bulk of the mb_mgr code, with #define's to build
+ * an AARCH64 version (see mb_mgr_aarch64.c).
+ *
+ * get_next_job() returns a job object. This must be filled in and returned
+ * via submit_job() before get_next_job() is called again.
+ *
+ * submit_job() and flush_job() return a job object. This job object ceases
+ * to be usable at the next call to get_next_job().
+ */
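+
+/*
+ * Typical caller loop (an illustrative sketch only; IMB_GET_NEXT_JOB(),
+ * IMB_SUBMIT_JOB() and IMB_GET_COMPLETED_JOB() are the public wrapper
+ * macros from ipsec-mb.h that dispatch to the functions below):
+ *
+ *     IMB_JOB *job = IMB_GET_NEXT_JOB(mgr);
+ *
+ *     // ... fill in the job fields ...
+ *     job = IMB_SUBMIT_JOB(mgr);
+ *     while (job != NULL) {
+ *             // ... check job->status ...
+ *             job = IMB_GET_COMPLETED_JOB(mgr);
+ *     }
+ */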
+
+#include <string.h> /* memcpy(), memset() */
+
+#include "clear_regs_mem_aarch64.h"
+#include "ipsec-mb.h"
+#include "error.h"
+
+#define BSWAP64 __builtin_bswap64
+
+/*
+ * JOBS() and ADV_JOBS() moved into mb_mgr_code.h
+ * get_next_job() and get_completed_job() API's are no longer inlines.
+ * For binary compatibility they have been made proper symbols.
+ */
+__forceinline
+IMB_JOB *JOBS(IMB_MGR *state, const int offset)
+{
+        char *cp = (char *)state->jobs;
+
+        return (IMB_JOB *)(cp + offset);
+}
+
+__forceinline
+void ADV_JOBS(int *ptr)
+{
+        *ptr += sizeof(IMB_JOB);
+        if (*ptr >= (int) (IMB_MAX_JOBS * sizeof(IMB_JOB)))
+                *ptr = 0;
+}
+
+__forceinline
+IMB_JOB *
+submit_snow3g_uea2_job(IMB_MGR *state, IMB_JOB *job)
+{
+        const snow3g_key_schedule_t *key = job->enc_keys;
+        const uint32_t msg_bitlen =
+                        (const uint32_t)job->msg_len_to_cipher_in_bits;
+        const uint32_t msg_bitoff =
+                        (const uint32_t)job->cipher_start_src_offset_in_bits;
+
+        /* Use bit length API if
+         * - msg length is not a multiple of bytes
+         * - bit offset is not a multiple of bytes
+         */
+        if ((msg_bitlen & 0x07) || (msg_bitoff & 0x07)) {
+                IMB_SNOW3G_F8_1_BUFFER_BIT(state, key, job->iv, job->src,
+                                           job->dst, msg_bitlen, msg_bitoff);
+        } else {
+                const uint32_t msg_bytelen = msg_bitlen >> 3;
+                const uint32_t msg_byteoff = msg_bitoff >> 3;
+                const void *src = job->src + msg_byteoff;
+                void *dst = job->dst + msg_byteoff;
+
+                IMB_SNOW3G_F8_1_BUFFER(state, key, job->iv, src,
+                                       dst, msg_bytelen);
+        }
+
+        job->status |= IMB_STATUS_COMPLETED_CIPHER;
+        return job;
+}
+
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
+{
+        if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
+                return submit_snow3g_uea2_job(state, job);
+        } else { /* assume IMB_CIPHER_NULL */
+                job->status |= IMB_STATUS_COMPLETED_CIPHER;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+        (void) job;
+
+        return NULL;
+}
+
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
+{
+        if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
+                return submit_snow3g_uea2_job(state, job);
+        } else {
+                /* assume IMB_CIPHER_NULL */
+                job->status |= IMB_STATUS_COMPLETED_CIPHER;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+        (void) job;
+
+        return NULL;
+}
+
+/* ========================================================================= */
+/* Hash submit & flush functions */
+/* ========================================================================= */
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
+{
+        switch (job->hash_alg) {
+        case IMB_AUTH_SNOW3G_UIA2_BITLEN:
+                IMB_SNOW3G_F9_1_BUFFER(state, (const snow3g_key_schedule_t *)
+                                job->u.SNOW3G_UIA2._key,
+                                job->u.SNOW3G_UIA2._iv,
+                                job->src + job->hash_start_src_offset_in_bytes,
+                                job->msg_len_to_hash_in_bits,
+                                job->auth_tag_output);
+                job->status |= IMB_STATUS_COMPLETED_AUTH;
+                return job;
+        default:
+                job->status |= IMB_STATUS_COMPLETED_AUTH;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+
+        switch (job->hash_alg) {
+        default:
+                if (!(job->status & IMB_STATUS_COMPLETED_AUTH)) {
+                        job->status |= IMB_STATUS_COMPLETED_AUTH;
+                        return job;
+                }
+                return NULL;
+        }
+}
+
+
+/* ========================================================================= */
+/* Job submit & flush functions
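+ *
+ * Note: only SNOW3G UEA2/UIA2 and the NULL cipher/hash are wired up
+ * here; is_job_invalid() below rejects all other algorithms.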
*/ +/* ========================================================================= */ + +#define SNOW3G_MAX_BITLEN (UINT32_MAX) +#define MB_MAX_LEN16 ((1 << 16) - 2) + +__forceinline int +is_job_invalid(IMB_MGR *state, const IMB_JOB *job) +{ + switch (job->cipher_mode) { + case IMB_CIPHER_NULL: + /* + * No checks required for this mode + * @note NULL cipher doesn't perform memory copy operation + * from source to destination + */ + break; + case IMB_CIPHER_SNOW3G_UEA2_BITLEN: + if (job->src == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_SRC); + return 1; + } + if (job->dst == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_DST); + return 1; + } + if (job->iv == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_IV); + return 1; + } + if (job->enc_keys == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_KEY); + return 1; + } + if (job->key_len_in_bytes != UINT64_C(16)) { + imb_set_errno(state, IMB_ERR_JOB_KEY_LEN); + return 1; + } + if (job->msg_len_to_cipher_in_bits == 0 || + job->msg_len_to_cipher_in_bits > SNOW3G_MAX_BITLEN) { + imb_set_errno(state, IMB_ERR_JOB_CIPH_LEN); + return 1; + } + if (job->iv_len_in_bytes != UINT64_C(16)) { + imb_set_errno(state, IMB_ERR_JOB_IV_LEN); + return 1; + } + break; + default: + imb_set_errno(state, IMB_ERR_CIPH_MODE); + return 1; + } + + switch (job->hash_alg) { + case IMB_AUTH_NULL: + break; + case IMB_AUTH_SNOW3G_UIA2_BITLEN: + if (job->src == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_SRC); + return 1; + } + if ((job->msg_len_to_hash_in_bits == 0) || + (job->msg_len_to_hash_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(state, IMB_ERR_JOB_AUTH_LEN); + return 1; + } + if (job->u.SNOW3G_UIA2._key == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_KEY); + return 1; + } + if (job->u.SNOW3G_UIA2._iv == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_IV); + return 1; + } + if (job->auth_tag_output_len_in_bytes != UINT64_C(4)) { + imb_set_errno(state, IMB_ERR_JOB_AUTH_TAG_LEN); + return 1; + } + if (job->auth_tag_output == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_AUTH); + return 1; + } + break; + default: + imb_set_errno(state, IMB_ERR_HASH_ALGO); + return 1; + } + return 0; +} + +__forceinline +IMB_JOB *SUBMIT_JOB_AES(IMB_MGR *state, IMB_JOB *job) +{ + if (job->cipher_direction == IMB_DIR_ENCRYPT) + job = SUBMIT_JOB_AES_ENC(state, job); + else + job = SUBMIT_JOB_AES_DEC(state, job); + + return job; +} + +__forceinline +IMB_JOB *FLUSH_JOB_AES(IMB_MGR *state, IMB_JOB *job) +{ + if (job->cipher_direction == IMB_DIR_ENCRYPT) + job = FLUSH_JOB_AES_ENC(state, job); + else + job = FLUSH_JOB_AES_DEC(state, job); + + return job; +} + +/* submit a half-completed job, based on the status */ +__forceinline +IMB_JOB *RESUBMIT_JOB(IMB_MGR *state, IMB_JOB *job) +{ + while (job != NULL && job->status < IMB_STATUS_COMPLETED) { + if (job->status == IMB_STATUS_COMPLETED_AUTH) + job = SUBMIT_JOB_AES(state, job); + else /* assumed job->status = IMB_STATUS_COMPLETED_CIPHER */ + job = SUBMIT_JOB_HASH(state, job); + } + + return job; +} + +__forceinline +IMB_JOB *submit_new_job(IMB_MGR *state, IMB_JOB *job) +{ + if (job->chain_order == IMB_ORDER_CIPHER_HASH) + job = SUBMIT_JOB_AES(state, job); + else + job = SUBMIT_JOB_HASH(state, job); + + job = RESUBMIT_JOB(state, job); + return job; +} + +__forceinline +void complete_job(IMB_MGR *state, IMB_JOB *job) +{ + if (job->chain_order == IMB_ORDER_CIPHER_HASH) { + /* while() loop optimized for cipher_hash order */ + while (job->status < IMB_STATUS_COMPLETED) { + IMB_JOB *tmp = FLUSH_JOB_AES(state, job); + + if (tmp == NULL) 
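+                                /* no cipher job flushed; try the hash side */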
+ tmp = FLUSH_JOB_HASH(state, job); + + (void) RESUBMIT_JOB(state, tmp); + } + } else { + /* while() loop optimized for hash_cipher order */ + while (job->status < IMB_STATUS_COMPLETED) { + IMB_JOB *tmp = FLUSH_JOB_HASH(state, job); + + if (tmp == NULL) + tmp = FLUSH_JOB_AES(state, job); + + (void) RESUBMIT_JOB(state, tmp); + } + } +} + +__forceinline +IMB_JOB * +submit_job_and_check(IMB_MGR *state, const int run_check) +{ + IMB_JOB *job = NULL; + + job = JOBS(state, state->next_job); + + if (run_check) { + if (is_job_invalid(state, job)) { + job->status = IMB_STATUS_INVALID_ARGS; + } else { + job->status = IMB_STATUS_BEING_PROCESSED; + job = submit_new_job(state, job); + } + } else { + job->status = IMB_STATUS_BEING_PROCESSED; + job = submit_new_job(state, job); + } + + if (state->earliest_job < 0) { + /* state was previously empty */ + if (job == NULL) + state->earliest_job = state->next_job; + ADV_JOBS(&state->next_job); + goto exit; + } + + ADV_JOBS(&state->next_job); + + if (state->earliest_job == state->next_job) { + /* Full */ + job = JOBS(state, state->earliest_job); + complete_job(state, job); + ADV_JOBS(&state->earliest_job); + goto exit; + } + + /* not full */ + job = JOBS(state, state->earliest_job); + if (job->status < IMB_STATUS_COMPLETED) { + job = NULL; + goto exit; + } + + ADV_JOBS(&state->earliest_job); +exit: +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + return job; +} + +IMB_JOB * +SUBMIT_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return submit_job_and_check(state, 1); +} + +IMB_JOB * +SUBMIT_JOB_NOCHECK(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return submit_job_and_check(state, 0); +} + +IMB_JOB * +FLUSH_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + IMB_JOB *job; + if (state->earliest_job < 0) + return NULL; /* empty */ + + job = JOBS(state, state->earliest_job); + complete_job(state, job); + + ADV_JOBS(&state->earliest_job); + + if (state->earliest_job == state->next_job) + state->earliest_job = -1; /* becomes empty */ + +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + return job; +} + +/* ========================================================================= */ + +uint32_t +QUEUE_SIZE(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return 0; + } +#endif + int a, b; + + if (state->earliest_job < 0) + return 0; + a = state->next_job / sizeof(IMB_JOB); + b = state->earliest_job / sizeof(IMB_JOB); + return ((a-b) & (IMB_MAX_JOBS-1)); +} + +IMB_JOB * +GET_COMPLETED_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + IMB_JOB *job; + + if (state->earliest_job < 0) + return NULL; + + job = JOBS(state, state->earliest_job); + if (job->status < IMB_STATUS_COMPLETED) + return NULL; + + ADV_JOBS(&state->earliest_job); + + if (state->earliest_job == state->next_job) + state->earliest_job 
= -1; + + return job; +} + +IMB_JOB * +GET_NEXT_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return JOBS(state, state->next_job); +} + +#endif /* MB_MGR_CODE_H */ diff --git a/lib/aarch64/snow3g_aarch64.c b/lib/aarch64/snow3g_aarch64.c new file mode 100644 index 0000000000000000000000000000000000000000..6ff912bb4460d5fa0f6925dff70549cf4cb9a385 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64 +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64 +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64 +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64 +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64 +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64 +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64 +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64 +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64 +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64 +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64 + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_no_aesni.c b/lib/aarch64/snow3g_aarch64_no_aesni.c new file mode 100644 index 0000000000000000000000000000000000000000..fbc861b23eddfc3373e838a67b5c8c885f52c9b7 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64_no_aesni.c @@ -0,0 +1,43 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define NO_AESNI +#define AESNI_EMU +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64_no_aesni +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64_no_aesni +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64_no_aesni +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64_no_aesni +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64_no_aesni +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_no_aesni +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_no_aesni +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_no_aesni + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_common_aarch64.h b/lib/aarch64/snow3g_common_aarch64.h new file mode 100644 index 0000000000000000000000000000000000000000..3b1651212112f0141dbd0715a89396031c8fb266 --- /dev/null +++ b/lib/aarch64/snow3g_common_aarch64.h @@ -0,0 +1,2905 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef SNOW3G_COMMON_H
+#define SNOW3G_COMMON_H
+
+#include <stdio.h>  /* printf() */
+#include <string.h> /* memset(), memcpy() */
+#include <arm_neon.h>
+
+#include "ipsec-mb.h"
+#include "include/wireless_common.h"
+#include "snow3g.h"
+#include "snow3g_tables.h"
+#include "constant_lookup_aarch64.h"
+#include "clear_regs_mem_aarch64.h"
+#ifdef NO_AESNI
+#include "include/aesni_emu.h"
+#endif
+
+#define CLEAR_MEM clear_mem
+#define CLEAR_VAR clear_var
+
+#define MAX_KEY_LEN (16)
+#define SNOW3G_4_BYTES (4)
+#define SNOW3G_8_BYTES (8)
+#define SNOW3G_8_BITS (8)
+#define SNOW3G_16_BYTES (16)
+#define SNOW3G_16_BITS (16)
+
+#define SNOW3G_BLOCK_SIZE (8)
+
+#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */
+#define SNOW3G_IV_LEN_IN_BYTES (16)  /* 128b */
+
+#define SNOW3GCONSTANT (0x1b)
+
+/* Range of input data for SNOW3G is from 1 to 2^32 bits */
+#define SNOW3G_MIN_LEN 1
+#define SNOW3G_MAX_BITLEN (UINT32_MAX)
+#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8)
+
+typedef union SafeBuffer {
+        uint64_t b64;
+        uint32_t b32[2];
+        uint8_t b8[SNOW3G_8_BYTES];
+} SafeBuf;
+
+typedef struct snow3gKeyState1_s {
+        /* 16 LFSR stages */
+        uint32_t LFSR_S[16];
+        /* 3 FSM states */
+        uint32_t FSM_R3;
+        uint32_t FSM_R2;
+        uint32_t FSM_R1;
+} DECLARE_ALIGNED(snow3gKeyState1_t, 16);
+
+typedef struct snow3gKeyState4_s {
+        /* 16 LFSR stages */
+        uint32x4_t LFSR_X[16];
+        /* 3 FSM states */
+        uint32x4_t FSM_X[3];
+        uint32_t iLFSR_X;
+} snow3gKeyState4_t;
+
+/**
+ * @brief Finds minimum 32-bit value in an array
+ * @return Min 32-bit value
+ */
+static inline uint32_t
+length_find_min(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+        uint32_t min = 0;
+
+        if (dim_array > 0)
+                min = out_array[0];
+
+        for (i = 1; i < dim_array; i++)
+                if (out_array[i] < min)
+                        min = out_array[i];
+
+        return min;
+}
+
+/**
+ * @brief Subtracts \a subv from a vector of 32-bit words
+ */
+static inline void
+length_sub(uint32_t *out_array, const size_t dim_array, const uint32_t subv)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                out_array[i] -= subv;
+}
+
+/**
+ * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values
+ * @retval 0 incorrect length value found
+ * @retval 1 all OK
+ */
+static inline uint32_t
+length_check(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++) {
+                if ((out_array[i] == 0) ||
+                    (out_array[i] > SNOW3G_MAX_BYTELEN))
+                        return 0;
+        }
+
+        return 1;
+}
+
+/**
+ * @brief Copies 4 32-bit length values into an array
+ */
+static inline void
+length_copy_4(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+}
+
+/**
+ * @brief Copies 8 32-bit length values into an array
+ */
+static inline void
+length_copy_8(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4,
+              const uint32_t length5, const uint32_t length6,
+              const uint32_t length7, const uint32_t length8)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+        out_array[4] = length5;
+        out_array[5] = length6;
+        out_array[6] = length7;
+        out_array[7] = length8;
+}
+
+/**
+ * @brief Checks vector of pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+ptr_check(void *out_array[], const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL)
+                        return 0;
+
+        return 1;
+}
+
+/**
+ * @brief Checks vector of const pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+cptr_check(const void * const out_array[], const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL)
+                        return 0;
+
+        return 1;
+}
+
+/**
+ * @brief Copies 4 pointers into an array
+ */
+static inline void
+ptr_copy_4(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 4 const pointers into an array
+ */
+static inline void
+cptr_copy_4(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 8 pointers into an array
+ */
+static inline void
+ptr_copy_8(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4,
+           void *ptr5, void *ptr6, void *ptr7, void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+/**
+ * @brief Copies 8 const pointers into an array
+ */
+static inline void
+cptr_copy_8(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4,
+            const void *ptr5, const void *ptr6,
+            const void *ptr7, const void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+/**
+ * @brief Wrapper for safe lookup of 16 indexes in 256x8-bit table
+ * @param[in] indexes vector of 16x8-bit indexes to be looked up
+ * @param[in] lut pointer to a 256x8-bit table
+ * @return 16x8-bit values looked up in \a lut using 16x8-bit \a indexes
+ */
+static inline uint8x16_t lut16x8b_256(const uint8x16_t indexes, const void *lut)
+{
+        return lookup_16x8bit_neon(indexes, lut);
+}
+
+/**
+ * @brief LFSR array shift by 2 positions
+ * @param[in/out] pCtx key state context structure
+ */
+static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx)
+{
+        int i;
+
+        for (i = 0; i < 14; i++)
+                pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2];
+}
+
+/**
+ * @brief SNOW3G S2 mix column correction function
+ *
+ * Mix column AES GF() reduction poly is 0x1B and SNOW3G reduction poly is 0x69.
+ * The fix-up value is 0x1B ^ 0x69 = 0x72 and needs to be applied on selected
+ * bytes of the 32-bit word.
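A minimal scalar sketch of this fix-up logic (editor's illustration, not part of the patch; the test word 0x80000000 is arbitrary) checks the constant and evaluates the four mask functions described in this comment:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* AES MixColumns poly ^ SNOW3G poly gives the fix-up constant */
        assert((0x1b ^ 0x69) == 0x72);

        /* sign (bit 7) of each byte of the 'no_mixc' word */
        const uint32_t no_mixc = 0x80000000; /* a = 1, b = 0, c = 0, d = 0 */
        const int a = (no_mixc >> 31) & 1;
        const int b = (no_mixc >> 23) & 1;
        const int c = (no_mixc >> 15) & 1;
        const int d = (no_mixc >> 7) & 1;

        /* mask functions from the comment: XOR of adjacent sign bits */
        const uint8_t mask0 = (c ^ d) ? 0x72 : 0x00;
        const uint8_t mask1 = (b ^ c) ? 0x72 : 0x00;
        const uint8_t mask2 = (a ^ b) ? 0x72 : 0x00;
        const uint8_t mask3 = (d ^ a) ? 0x72 : 0x00;

        /* bytes 3..0 of the correction word XORed into 'mixc' */
        printf("fixup bytes: %02x %02x %02x %02x\n",
               mask3, mask2, mask1, mask0);
        return 0;
}
```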
+ *
+ * The 'aese' operation does not perform the mix column operation, which
+ * makes it possible to determine the fix-up value to be applied to the
+ * result of 'aese + aesmc' in order to produce the correct result for SNOW3G.
+ *
+ * This function implements a more scalable SIMD method to apply the fix-up
+ * value to multiple streams at the same time.
+ *
+ * a = \a no_mixc bit-31
+ * b = \a no_mixc bit-23
+ * c = \a no_mixc bit-15
+ * d = \a no_mixc bit-7
+ *
+ * mask0_f(), mask1_f(), mask2_f() and mask3_f() functions
+ * specify if corresponding byte of \a mixc word, i.e. 0, 1, 2 or 3
+ * respectively, should be corrected.
+ * Definition of the functions:
+ * mask0_f(a, b, c, d) = c'd + cd' => c xor d
+ * mask1_f(a, b, c, d) = b'c + bc' => b xor c
+ * mask2_f(a, b, c, d) = a'b + ab' => a xor b
+ * mask3_f(a, b, c, d) = a'd + ad' => d xor a
+ * The above are resolved through SIMD instructions: and, cmlt and
+ * xor. As a result, a mask is obtained with a 0xff byte value at the
+ * positions that require the 0x72 fix-up value to be applied.
+ *
+ * @param no_mixc result of 'aese' operation, 4 x 32-bit words
+ * @param mixc result of 'aese + aesmc' operation, 4 x 32-bit words
+ *
+ * @return corrected \a mixc for SNOW3G S2, 4 x 32-bit words
+ */
+static inline uint32x4_t s2_mixc_fixup_4(const uint8x16_t no_mixc, const uint8x16_t mixc)
+{
+        const uint32_t ror8[4] = {0x00030201, 0x04070605, 0x080b0a09, 0x0c0f0e0d};
+        uint8x16_t pattern, pattern_shuf, idx, mask, fixup;
+
+        pattern = vcltzq_s8(vreinterpretq_s8_u8(no_mixc));
+        idx = vreinterpretq_u8_u32(vld1q_u32(ror8));
+        pattern_shuf = vqtbl1q_u8(pattern, idx);
+
+        mask = vdupq_n_u8(0x72);
+        pattern = pattern ^ pattern_shuf;
+
+        fixup = mask & pattern;
+        return vreinterpretq_u32_u8(veorq_u8(fixup, mixc));
+}
+
+/**
+ * @brief SNOW3G S2 mix column correction function
+ *
+ * @param no_mixc result of 'aese' operation, 32-bit word index 0 only
+ * @param mixc result of 'aese + aesmc' operation, 32-bit word index 0 only
+ *
+ * @return corrected \a mixc 32-bit word for SNOW3G S2
+ */
+static inline uint32_t
+s2_mixc_fixup_scalar(const uint8x16_t no_mixc, const uint8x16_t mixc)
+{
+        return vgetq_lane_u32(s2_mixc_fixup_4(no_mixc, mixc), 0);
+}
+
+/**
+ * @brief Sbox S1 maps a 32bit input to a 32bit output
+ *
+ * @param[in] x 32-bit word to be passed through S1 box
+ *
+ * @return \a x transformed through S1 box
+ */
+static inline uint32_t S1_box(const uint32_t x)
+{
+#ifdef NO_AESNI
+        union xmm_reg key, v;
+
+        key.qword[0] = key.qword[1] = 0;
+
+        v.dword[0] = v.dword[1] =
+                v.dword[2] = v.dword[3] = x;
+
+        emulate_AESENC(&v, &key);
+        return v.dword[0];
+#else
+        uint32x4_t dup_x;
+        uint8x16_t new_x, key, tmp;
+
+        dup_x = vdupq_n_u32(x);
+        key = vdupq_n_u8(0);
+        new_x = vreinterpretq_u8_u32(dup_x);
+        tmp = vaeseq_u8(new_x, key);
+        tmp = vaesmcq_u8(tmp);
+
+        return vgetq_lane_u32(vreinterpretq_u32_u8(tmp), 0);
+#endif
+}
+
+/**
+ * @brief Sbox S1 maps a 2x32bit input to a 2x32bit output
+ *
+ * @param[in/out] x1 32-bit word to be passed through S1 box
+ * @param[in/out] x2 32-bit word to be passed through S1 box
+ */
+static inline void S1_box_2(uint32_t *x1, uint32_t *x2)
+{
+#ifdef NO_AESNI
+        /* reuse S1_box() for NO_AESNI path */
+        *x1 = S1_box(*x1);
+        *x2 = S1_box(*x2);
+#else
+        const uint8x16_t m_zero = vdupq_n_u8(0);
+        uint32x4_t m1, m2;
+        uint8x16_t r1, r2;
+
+        m1 = vdupq_n_u32(*x1);
+        r1 = vaeseq_u8(vreinterpretq_u8_u32(m1), m_zero);
+        r1 = vaesmcq_u8(r1);
+        m2 = vdupq_n_u32(*x2);
+        r2 = vaeseq_u8(vreinterpretq_u8_u32(m2), m_zero);
+        r2 = vaesmcq_u8(r2);
+
+        *x1 =
vgetq_lane_u32(vreinterpretq_u32_u8(r1), 0); + *x2 = vgetq_lane_u32(vreinterpretq_u32_u8(r2), 0); +#endif +} + +/** + * @brief Sbox S1 maps a 4x32bit input to a 4x32bit output + * + * @param[in] x vector of 4 32-bit words to be passed through S1 box + * + * @return 4x32-bits from \a x transformed through S1 box + */ +static inline uint32x4_t S1_box_4(const uint32x4_t x) +{ +#ifdef NO_AESNI + union xmm_reg key, v, vt; + + key.qword[0] = key.qword[1] = 0; + + /* + * - Broadcast 32-bit word across XMM + * - Perform AES operations + */ + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 0); + emulate_AESENC(&vt, &key); + v.dword[0] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 1); + emulate_AESENC(&vt, &key); + v.dword[1] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 2); + emulate_AESENC(&vt, &key); + v.dword[2] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 3); + emulate_AESENC(&vt, &key); + v.dword[3] = vt.dword[0]; + + return vld1q_u32(&v.dword[0]); +#else + const uint8x16_t m_zero = vdupq_n_u8(0); + uint8x16_t m1, m2, m3, m4; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 1))); + m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 2))); + m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 3))); + + m1 = vaeseq_u8(m1, m_zero); + m1 = vaesmcq_u8(m1); + m2 = vaeseq_u8(m2, m_zero); + m2 = vaesmcq_u8(m2); + m3 = vaeseq_u8(m3, m_zero); + m3 = vaesmcq_u8(m3); + m4 = vaeseq_u8(m4, m_zero); + m4 = vaesmcq_u8(m4); + + /* + * Put results of AES operations back into + * two vectors of 32-bit words + * + * First step: + * r1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ] + * r2 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ] + */ + r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4)); + r1 = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + /* + * The last step: + * r1 = [ 0-63 m1 | 0-63 m3 ] => + * [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ] + */ + + return r1; +#endif +} + +/** + * @brief Sbox S2 maps a 32-bit input to a 32-bit output + * + * @param[in] x 32-bit word to be passed through S2 box + * + * @return \a x transformed through S2 box + */ +static inline uint32_t S2_box(const uint32_t x) +{ +#ifdef NO_AESNI + /* Perform invSR(SQ(x)) transform */ + const uint32x4_t par_lut = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(vdupq_n_u32(x)), + snow3g_invSR_SQ)); + const uint32_t new_x = vgetq_lane_u32(par_lut, 0); + union xmm_reg key, v, v_fixup; + + key.qword[0] = key.qword[1] = 0; + + v.dword[0] = v.dword[1] = + v.dword[2] = v.dword[3] = new_x; + + v_fixup = v; + + emulate_AESENC(&v, &key); + emulate_AESENCLAST(&v_fixup, &key); + + const uint8x16_t ret_mixc = vreinterpretq_u8_u32( + vld1q_u32(&v.dword[0])); + const uint8x16_t ret_nomixc = vreinterpretq_u8_u32( + vld1q_u32(&v_fixup.dword[0])); + + return s2_mixc_fixup_scalar(ret_nomixc, ret_mixc); +#else + +#ifndef SAFE_LOOKUP + const uint8_t *w3 = (const uint8_t *)&snow3g_table_S2[x & 0xff]; + const uint8_t *w1 = (const uint8_t *)&snow3g_table_S2[(x >> 16) & 0xff]; + const uint8_t *w2 = (const uint8_t *)&snow3g_table_S2[(x >> 8) & 0xff]; + const uint8_t *w0 = (const uint8_t *)&snow3g_table_S2[(x >> 24) & 0xff]; + + return *((const uint32_t 
*)&w3[3]) ^ + *((const uint32_t *)&w1[1]) ^ + *((const uint32_t *)&w2[2]) ^ + *((const uint32_t *)&w0[0]); + +#else + uint32x4_t par_lut; + uint8x16_t m, key, ret_nomixc, ret_mixc; + + /* Perform invSR(SQ(x)) transform */ + par_lut = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(vdupq_n_u32(x)), + snow3g_invSR_SQ)); + + m = vreinterpretq_u8_u32(vdupq_n_u32((vgetq_lane_u32(par_lut, 0)))); + key = vdupq_n_u8(0); + + ret_nomixc = vaeseq_u8(m, key); + ret_mixc = vaesmcq_u8(ret_nomixc); + + return s2_mixc_fixup_scalar(ret_nomixc, ret_mixc); +#endif + +#endif +} + +/** + * @brief Sbox S2 maps a 2x32bit input to a 2x32bit output + * + * @param[in/out] x1 32-bit word to be passed through S2 box + * @param[in/out] x2 32-bit word to be passed through S2 box + */ +static inline void S2_box_2(uint32_t *x1, uint32_t *x2) +{ +#if defined(NO_AESNI) || !defined(SAFE_LOOKUP) + *x1 = S2_box(*x1); + *x2 = S2_box(*x2); +#else + /* Perform invSR(SQ(x)) transform through a lookup table */ + const uint8x16_t m_zero = vdupq_n_u8(0); + uint32x4_t x_vec, ret_nomixc, ret_mixc, res; + + x_vec = vdupq_n_u32(x1[0]); + x_vec = vsetq_lane_u32(x2[0], x_vec, 1); + + const uint32x4_t new_x = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(x_vec), + snow3g_invSR_SQ)); + uint8x16_t m1, m2, f1, f2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 1))); + + f1 = vaeseq_u8(m1, m_zero); // no_mixc + m1 = vaesmcq_u8(f1); + f2 = vaeseq_u8(m2, m_zero); + m2 = vaesmcq_u8(f2); + /* + * Put results of AES operations back into one vector + * for further fix up + */ + ret_nomixc = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2)); + ret_mixc = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + + res = s2_mixc_fixup_4(vreinterpretq_u8_u32(ret_nomixc), vreinterpretq_u8_u32(ret_mixc)); + + *x1 = vgetq_lane_u32(res, 0); + *x2 = vgetq_lane_u32(res, 1); +#endif +} + +/** + * @brief Sbox S2 maps a 4x32bit input to a 4x32bit output + * + * @param[in] x vector of 4 32-bit words to be passed through S2 box + * + * @return 4x32-bits from \a x transformed through S2 box + */ +static inline uint32x4_t S2_box_4(const uint32x4_t x) +{ + /* Perform invSR(SQ(x)) transform through a lookup table */ + const uint32x4_t new_x = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(x), + snow3g_invSR_SQ)); + + /* use AESNI operations for the rest of the S2 box */ +#ifdef NO_AESNI + union xmm_reg key, v, f; + union xmm_reg vt, ft; + + key.qword[0] = key.qword[1] = 0; + + /* + * - Broadcast 32-bit word across XMM and + * perform AES operations + * - Save result 32-bit words in v and f vectors. 
+ * 'f' is used for fix-up of mixed columns only + */ + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 0); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[0] = vt.dword[0]; + f.dword[0] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 1); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[1] = vt.dword[0]; + f.dword[1] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 2); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[2] = vt.dword[0]; + f.dword[2] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 3); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[3] = vt.dword[0]; + f.dword[3] = ft.dword[0]; + + return s2_mixc_fixup_4(vreinterpretq_u8_u32(vld1q_u32(&f.dword[0])), + vreinterpretq_u8_u32(vld1q_u32(&v.dword[0]))); +#else + const uint8x16_t zero = vdupq_n_u8(0); + uint8x16_t m1, m2, m3, m4, f1, f2, f3, f4, mixc, no_mixc; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 1))); + m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 2))); + m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 3))); + + f1 = vaeseq_u8(m1, zero); // no_mixc + m1 = vaesmcq_u8(f1); + f2 = vaeseq_u8(m2, zero); + m2 = vaesmcq_u8(f2); + f3 = vaeseq_u8(m3, zero); + m3 = vaesmcq_u8(f3); + f4 = vaeseq_u8(m4, zero); + m4 = vaesmcq_u8(f4); + + /* + * Put results of AES operations back into + * two vectors of 32-bit words + * + * First step: + * m1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ] + * m3 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ] + */ + /* + * The last step: + * m1 = [ 0-63 m1 | 0-63 m3 ] => + * [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ] + * f1 = [ 0-63 f1 | 0-63 f3 ] => + * [ 0-31 f1 | 0-31 f2 | 0-31 f3 | 0-31 f4 ] + */ + r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4)); + mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + + r1 = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(f3), vreinterpretq_u32_u8(f4)); + no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + + return s2_mixc_fixup_4(no_mixc, mixc); +#endif +} + +/** + * @brief Sbox S2 maps a 2x4x32bit input to a 2x4x32bit output + * + * @param[in/out] in_out1 vector of 4 32-bit words to be passed through S2 box + * @param[in/out] in_out2 vector of 4 32-bit words to be passed through S2 box + */ +static inline void S2_box_2x4(uint32x4_t *in_out1, uint32x4_t *in_out2) +{ +#ifdef NO_AESNI + *in_out1 = S2_box_4(*in_out1); + *in_out2 = S2_box_4(*in_out2); +#else + /* + * Perform invSR(SQ(x)) transform through a lookup table and + * use AES operations for the rest of the S2 box + */ + const uint8x16_t zero = vdupq_n_u8(0); + const uint32x4_t x1 = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(*in_out1), + snow3g_invSR_SQ)); + uint8x16_t m1, m2, m3, m4, f1, f2, f3, f4; + uint8x16_t m5, m6, m7, m8, f5, f6, f7, f8; + uint8x16_t mixc, no_mixc; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 
1)));
+        m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 2)));
+        m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 3)));
+
+        /* start shuffling next 128 bits of data */
+        const uint32x4_t x2 = vreinterpretq_u32_u8(
+                lut16x8b_256(vreinterpretq_u8_u32(*in_out2),
+                             snow3g_invSR_SQ));
+
+        m5 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 0)));
+        m6 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 1)));
+        m7 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 2)));
+        m8 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 3)));
+
+        f1 = vaeseq_u8(m1, zero); // no_mixc
+        m1 = vaesmcq_u8(f1);
+        f2 = vaeseq_u8(m2, zero);
+        m2 = vaesmcq_u8(f2);
+        f3 = vaeseq_u8(m3, zero);
+        m3 = vaesmcq_u8(f3);
+        f4 = vaeseq_u8(m4, zero);
+        m4 = vaesmcq_u8(f4);
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4));
+        mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                               vreinterpretq_u64_u32(r2)));
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(f3), vreinterpretq_u32_u8(f4));
+        no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                                  vreinterpretq_u64_u32(r2)));
+
+        *in_out1 = s2_mixc_fixup_4(no_mixc, mixc);
+
+        /* start encrypting next 128 bits */
+        f5 = vaeseq_u8(m5, zero); // no_mixc
+        m5 = vaesmcq_u8(f5);
+        f6 = vaeseq_u8(m6, zero);
+        m6 = vaesmcq_u8(f6);
+        f7 = vaeseq_u8(m7, zero);
+        m7 = vaesmcq_u8(f7);
+        f8 = vaeseq_u8(m8, zero);
+        m8 = vaesmcq_u8(f8);
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(m5), vreinterpretq_u32_u8(m6));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(m7), vreinterpretq_u32_u8(m8));
+        mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                               vreinterpretq_u64_u32(r2)));
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(f5), vreinterpretq_u32_u8(f6));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(f7), vreinterpretq_u32_u8(f8));
+        no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                                  vreinterpretq_u64_u32(r2)));
+
+        *in_out2 = s2_mixc_fixup_4(no_mixc, mixc);
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on 4 8-bit values at the same time
+ *
+ * The function picks the right byte from the register to run the MULalpha
+ * operation on. MULalpha is implemented through 8 16-byte tables, looked up
+ * with the TBL instruction (vqtbl1q_u8, the NEON counterpart of x86 pshufb).
+ * This approach is possible because the MULalpha operation is linear
+ * over GF(2).
+ * Final operation result is calculated via byte re-arrangement on
+ * the lookup results and an XOR operation.
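The nibble-split trick works because a GF(2)-linear byte map f satisfies f(x) = f(x & 0xf0) ^ f(x & 0x0f), so one 256-entry table can be replaced by two 16-entry tables per result byte. A self-contained sketch of that identity (editor-added; the sample map is AES xtime, a stand-in linear map, not the SNOW3G tables):

```c
#include <assert.h>
#include <stdint.h>

/* a sample GF(2)-linear byte map: multiply by x modulo 0x11b (AES xtime) */
static uint32_t f(const uint8_t v)
{
        return (uint32_t)(((v << 1) ^ ((v & 0x80) ? 0x1b : 0)) & 0xff);
}

int main(void)
{
        uint32_t lo[16], hi[16];

        /* build the nibble tables from the map itself */
        for (int i = 0; i < 16; i++) {
                lo[i] = f((uint8_t)i);        /* f(low nibble) */
                hi[i] = f((uint8_t)(i << 4)); /* f(high nibble << 4) */
        }

        /* two 16-entry lookups + XOR reproduce the full 256-entry map */
        for (int x = 0; x < 256; x++)
                assert((hi[x >> 4] ^ lo[x & 0x0f]) == f((uint8_t)x));

        return 0;
}
```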
+ *
+ * @param [in] L0 4 x 32-bit LFSR[0]
+ * @return 4 x 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+uint32x4_t MULa_4(const uint32x4_t L0)
+{
+#ifdef SAFE_LOOKUP
+        const uint8_t gather_clear_mask[] = {
+                0x03,0x07,0x0b,0x0f,0xff,0xff,0xff,0xff,
+                0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        };
+        const uint8x16_t low_nibble_mask = vdupq_n_u8(0x0f);
+        const uint8x16_t clear_mask = vld1q_u8(gather_clear_mask);
+        uint8x16_t th, tl, b0, b1, b2, b3;
+
+        th = vqtbl1q_u8(vreinterpretq_u8_u32(L0), clear_mask);
+
+        tl = th & low_nibble_mask;
+        b0 = vld1q_u8(snow3g_MULa_byte0_low);
+        b1 = vld1q_u8(snow3g_MULa_byte1_low);
+        b2 = vld1q_u8(snow3g_MULa_byte2_low);
+        b3 = vld1q_u8(snow3g_MULa_byte3_low);
+
+        b0 = vqtbl1q_u8(b0, tl);
+        b1 = vqtbl1q_u8(b1, tl);
+        b2 = vqtbl1q_u8(b2, tl);
+        b3 = vqtbl1q_u8(b3, tl);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        tl = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        b0 = vld1q_u8(snow3g_MULa_byte0_hi);
+        b1 = vld1q_u8(snow3g_MULa_byte1_hi);
+        b2 = vld1q_u8(snow3g_MULa_byte2_hi);
+        b3 = vld1q_u8(snow3g_MULa_byte3_hi);
+
+        th = vshrq_n_u8(th, 4) & low_nibble_mask;
+
+        b0 = vqtbl1q_u8(b0, th);
+        b1 = vqtbl1q_u8(b1, th);
+        b2 = vqtbl1q_u8(b2, th);
+        b3 = vqtbl1q_u8(b3, th);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        th = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        return vreinterpretq_u32_u8(th ^ tl);
+#else
+        const uint8_t L0IDX0 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 3);
+        const uint8_t L0IDX1 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 7);
+        const uint8_t L0IDX2 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 11);
+        const uint8_t L0IDX3 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 15);
+
+        uint32x4_t ret;
+        uint32_t x0, x1, x2, x3;
+
+        x0 = snow3g_table_A_mul[L0IDX0];
+        x1 = snow3g_table_A_mul[L0IDX1];
+        x2 = snow3g_table_A_mul[L0IDX2];
+        x3 = snow3g_table_A_mul[L0IDX3];
+
+        ret = vdupq_n_u32(x0);
+        ret = vsetq_lane_u32(x1, ret, 1);
+        ret = vsetq_lane_u32(x2, ret, 2);
+        ret = vsetq_lane_u32(x3, ret, 3);
+        return ret;
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on 2 8-bit values at the same time
+ *
+ * @param [in/out] L0_1 On input, 32-bit LFSR[0].
+ *                 On output, 32-bit MULalpha(L0 >> 24)
+ * @param [in/out] L0_2 On input, 32-bit LFSR[0].
+ *                 On output, 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+void MULa_2(uint32_t *L0_1, uint32_t *L0_2)
+{
+#ifdef SAFE_LOOKUP
+        uint32x4_t in, out;
+
+        in = vdupq_n_u32(*L0_1);
+        in = vsetq_lane_u32(*L0_2, in, 1);
+        out = MULa_4(in);
+
+        *L0_1 = vgetq_lane_u32(out, 0);
+        *L0_2 = vgetq_lane_u32(out, 1);
+#else
+        *L0_1 = snow3g_table_A_mul[*L0_1 >> 24];
+        *L0_2 = snow3g_table_A_mul[*L0_2 >> 24];
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on an 8-bit value.
+ *
+ * @param [in] L0 32-bit LFSR[0]
+ * @return 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+uint32_t MULa(const uint32_t L0)
+{
+#ifdef SAFE_LOOKUP
+        const uint32x4_t L0_vec = vdupq_n_u32(L0);
+
+        return vgetq_lane_u32(MULa_4(L0_vec), 0);
+#else
+        return snow3g_table_A_mul[L0 >> 24];
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on 4 8-bit values at the same time
+ *
+ * The function picks the right byte from the register to run the DIValpha
+ * operation on. DIValpha is implemented through 8 16-byte tables, looked up
+ * with the TBL instruction (vqtbl1q_u8, the NEON counterpart of x86 pshufb).
+ * This approach is possible because the DIValpha operation is linear
+ * over GF(2).
+ * Final operation result is calculated via byte re-arrangement on
+ * the lookup results and an XOR operation.
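For reference, snow3g_table_A_mul and snow3g_table_A_div precompute the MULalpha and DIValpha maps, which the SNOW 3G specification defines in terms of the MULx/MULxPOW primitives; a plain-C sketch of those primitives follows (editor-added; the actual table contents ship in snow3g_tables.h):

```c
#include <stdint.h>

/* MULx from the SNOW 3G spec: multiply by x in GF(2^8),
 * reducing by the byte polynomial 'c' when the MSB is set */
static uint8_t MULx(const uint8_t v, const uint8_t c)
{
        return (uint8_t)((v << 1) ^ ((v & 0x80) ? c : 0));
}

/* MULxPOW: apply MULx 'i' times (iterative form of the spec recursion) */
static uint8_t MULxPOW(uint8_t v, unsigned i, const uint8_t c)
{
        while (i--)
                v = MULx(v, c);
        return v;
}
```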
+ *
+ * @param [in] L11 4 x 32-bit LFSR[11]
+ * @return 4 x 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+uint32x4_t DIVa_4(const uint32x4_t L11)
+{
+#ifdef SAFE_LOOKUP
+        const uint8_t gather_clear_mask[] = {
+                0x00,0x04,0x08,0x0c,0xff,0xff,0xff,0xff,
+                0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        };
+        const uint8x16_t low_nibble_mask = vdupq_n_u8(0x0f);
+        const uint8x16_t clear_mask = vld1q_u8(gather_clear_mask);
+        uint8x16_t th, tl, b0, b1, b2, b3;
+
+        th = vqtbl1q_u8(vreinterpretq_u8_u32(L11), clear_mask);
+
+        tl = th & low_nibble_mask;
+        b0 = vld1q_u8(snow3g_DIVa_byte0_low);
+        b1 = vld1q_u8(snow3g_DIVa_byte1_low);
+        b2 = vld1q_u8(snow3g_DIVa_byte2_low);
+        b3 = vld1q_u8(snow3g_DIVa_byte3_low);
+
+        b0 = vqtbl1q_u8(b0, tl);
+        b1 = vqtbl1q_u8(b1, tl);
+        b2 = vqtbl1q_u8(b2, tl);
+        b3 = vqtbl1q_u8(b3, tl);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        tl = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        b0 = vld1q_u8(snow3g_DIVa_byte0_hi);
+        b1 = vld1q_u8(snow3g_DIVa_byte1_hi);
+        b2 = vld1q_u8(snow3g_DIVa_byte2_hi);
+        b3 = vld1q_u8(snow3g_DIVa_byte3_hi);
+
+        th = vshrq_n_u8(th, 4) & low_nibble_mask;
+
+        b0 = vqtbl1q_u8(b0, th);
+        b1 = vqtbl1q_u8(b1, th);
+        b2 = vqtbl1q_u8(b2, th);
+        b3 = vqtbl1q_u8(b3, th);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        th = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        return vreinterpretq_u32_u8(th ^ tl);
+#else
+        const uint8_t L11IDX0 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 0);
+        const uint8_t L11IDX1 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 4);
+        const uint8_t L11IDX2 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 8);
+        const uint8_t L11IDX3 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 12);
+
+        uint32x4_t ret;
+        uint32_t x0, x1, x2, x3;
+
+        x0 = snow3g_table_A_div[L11IDX0];
+        x1 = snow3g_table_A_div[L11IDX1];
+        x2 = snow3g_table_A_div[L11IDX2];
+        x3 = snow3g_table_A_div[L11IDX3];
+
+        ret = vdupq_n_u32(x0);
+        ret = vsetq_lane_u32(x1, ret, 1);
+        ret = vsetq_lane_u32(x2, ret, 2);
+        ret = vsetq_lane_u32(x3, ret, 3);
+
+        return ret;
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on 2 8-bit values at the same time
+ *
+ * @param [in/out] L11_1 On input, 32-bit LFSR[11].
+ *                 On output, 32-bit DIValpha(L11 & 0xff)
+ * @param [in/out] L11_2 On input, 32-bit LFSR[11].
+ *                 On output, 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+void DIVa_2(uint32_t *L11_1, uint32_t *L11_2)
+{
+#ifdef SAFE_LOOKUP
+        uint32x4_t in, out;
+
+        in = vdupq_n_u32(*L11_1);
+        in = vsetq_lane_u32(*L11_2, in, 1);
+        out = DIVa_4(in);
+
+        *L11_1 = vgetq_lane_u32(out, 0);
+        *L11_2 = vgetq_lane_u32(out, 1);
+#else
+        *L11_1 = snow3g_table_A_div[*L11_1 & 0xff];
+        *L11_2 = snow3g_table_A_div[*L11_2 & 0xff];
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on an 8-bit value.
+ *
+ * @param [in] L11 32-bit LFSR[11]
+ * @return 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+uint32_t DIVa(const uint32_t L11)
+{
+#ifdef SAFE_LOOKUP
+        const uint32x4_t L11_vec = vdupq_n_u32(L11);
+
+        return vgetq_lane_u32(DIVa_4(L11_vec), 0);
+#else
+        return snow3g_table_A_div[L11 & 0xff];
+#endif
+}
+
+/**
+ * @brief ClockFSM function as defined in SNOW3G standard
+ *
+ * The FSM has 2 input words S5 and S15 from the LFSR and
+ * produces a 32-bit output word F.
+ * + * @param[in/out] pCtx context structure + */ +static inline uint32_t ClockFSM_1(snow3gKeyState1_t *pCtx) +{ + const uint32_t F = (pCtx->LFSR_S[15] + pCtx->FSM_R1) ^ pCtx->FSM_R2; + const uint32_t R = (pCtx->FSM_R3 ^ pCtx->LFSR_S[5]) + pCtx->FSM_R2; + + pCtx->FSM_R3 = S2_box(pCtx->FSM_R2); + pCtx->FSM_R2 = S1_box(pCtx->FSM_R1); + pCtx->FSM_R1 = R; + + return F; +} + +/** + * @brief ClockLFSR function as defined in SNOW3G standard + * @param[in/out] pCtx context structure + */ +static inline void ClockLFSR_1(snow3gKeyState1_t *pCtx) +{ + const uint32_t S0 = pCtx->LFSR_S[0]; + const uint32_t S11 = pCtx->LFSR_S[11]; + const uint32_t V = pCtx->LFSR_S[2] ^ + MULa(S0) ^ + DIVa(S11) ^ + (S0 << 8) ^ + (S11 >> 8); + unsigned i; + + /* LFSR array shift by 1 position */ + for (i = 0; i < 15; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 1]; + + pCtx->LFSR_S[15] = V; +} + +/** + * @brief Initializes the key schedule for 1 buffer for SNOW3G f8/f9. + * + * @param[in/out] pCtx Context where the scheduled keys are stored + * @param[in] pKeySched Key schedule + * @param[in] pIV IV + */ +static inline void +snow3gStateInitialize_1(snow3gKeyState1_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV) +{ + uint32_t FSM1, FSM2, FSM3; + const uint32_t *pIV32 = pIV; + int i; + + /* LFSR initialisation */ + for (i = 0; i < 4; i++) { + const uint32_t K = pKeySched->k[i]; + const uint32_t L = ~K; + + pCtx->LFSR_S[i + 4] = K; + pCtx->LFSR_S[i + 12] = K; + pCtx->LFSR_S[i + 0] = L; + pCtx->LFSR_S[i + 8] = L; + } + + pCtx->LFSR_S[15] ^= BSWAP32(pIV32[3]); + pCtx->LFSR_S[12] ^= BSWAP32(pIV32[2]); + pCtx->LFSR_S[10] ^= BSWAP32(pIV32[1]); + pCtx->LFSR_S[9] ^= BSWAP32(pIV32[0]); + + /* FSM initialization */ + FSM2 = 0; + FSM3 = 0; + FSM1 = 0; + + for (i = 0; i < 16; i++) { + const uint32_t L0 = pCtx->LFSR_S[0]; + const uint32_t L1 = pCtx->LFSR_S[1]; + const uint32_t L11 = pCtx->LFSR_S[11]; + const uint32_t L12 = pCtx->LFSR_S[12]; + uint32_t MULa_L0 = L0; + uint32_t MULa_L1 = L1; + uint32_t DIVa_L11 = L11; + uint32_t DIVa_L12 = L12; + + MULa_2(&MULa_L0, &MULa_L1); + DIVa_2(&DIVa_L11, &DIVa_L12); + + /* clock FSM + clock LFSR + clockFSM + clock LFSR */ + const uint32_t F0 = + (pCtx->LFSR_S[15] + FSM1) ^ FSM2; /* (s15 + R1) ^ R2 */ + + const uint32_t V0 = + pCtx->LFSR_S[2] ^ + MULa_L0 ^ /* MUL(s0,0 ) */ + DIVa_L11 ^ /* DIV(s11,3 )*/ + (L0 << 8) ^ /* (s0,1 || s0,2 || s0,3 || 0x00) */ + (L11 >> 8) ^ /* (0x00 || s11,0 || s11,1 || s11,2 ) */ + F0; + + const uint32_t R0 = + (FSM3 ^ pCtx->LFSR_S[5]) + FSM2; /* R2 + (R3 ^ s5 ) */ + + uint32_t s1_box_step1 = FSM1; + uint32_t s1_box_step2 = R0; + + S1_box_2(&s1_box_step1, &s1_box_step2); + + uint32_t s2_box_step1 = FSM2; + uint32_t s2_box_step2 = s1_box_step1; /* S1_box(R0) */ + + S2_box_2(&s2_box_step1, &s2_box_step2); + + FSM1 = (s2_box_step1 ^ pCtx->LFSR_S[6]) + s1_box_step1; + + const uint32_t F1 = (V0 + R0) ^ s1_box_step1; + + const uint32_t V1 = pCtx->LFSR_S[3] ^ + MULa_L1 ^ + DIVa_L12 ^ + (L1 << 8) ^ (L12 >> 8) ^ F1; + + FSM2 = s1_box_step2; + FSM3 = s2_box_step2; + + /* shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + } + + /* set FSM into scheduling structure */ + pCtx->FSM_R3 = FSM3; + pCtx->FSM_R2 = FSM2; + pCtx->FSM_R1 = FSM1; +} + +/** + * @brief Generates 5 words of key stream used in the initial stages of F9. 
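As a usage sketch (editor-added; it mirrors what the F8/F9ентry points later in this file do, using the single-buffer helpers defined in this header), raw keystream is drawn from a scalar state like so, discarding the first clocked word after initialization:

```c
/* Editor's sketch: raw keystream generation with the scalar state.
 * The 32-byte output size is illustrative. */
static void snow3g_keystream_example(const snow3g_key_schedule_t *pKeySched,
                                     const void *pIV, uint64_t out[4])
{
        snow3gKeyState1_t ctx;
        int i;

        /* load LFSR/FSM from key schedule and IV, run init rounds */
        snow3gStateInitialize_1(&ctx, pKeySched, pIV);

        /* clock FSM and LFSR once, discard the first keystream word */
        (void) snow3g_keystream_1_4(&ctx);

        /* then draw 8 bytes of keystream per call */
        for (i = 0; i < 4; i++)
                out[i] = snow3g_keystream_1_8(&ctx);
}
```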
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in/out] pKeyStream Pointer to the generated keystream
+ */
+static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx,
+                                             uint32_t *pKeyStream)
+{
+        int i;
+
+        (void) ClockFSM_1(pCtx);
+        ClockLFSR_1(pCtx);
+
+        for (i = 0; i < 5; i++) {
+                pKeyStream[i] = ClockFSM_1(pCtx) ^ pCtx->LFSR_S[0];
+                ClockLFSR_1(pCtx);
+        }
+}
+
+/**
+ * @brief LFSR array shift by one (4 lanes)
+ * @param[in] pCtx Context where the scheduled keys are stored
+ */
+static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15;
+}
+
+/**
+ * @brief GF2 modular reduction 128-bits to 64-bits
+ *
+ * SNOW3GCONSTANT/0x1b reduction polynomial applied.
+ *
+ * @param[in] m 128-bit input
+ * @return 128-bit output (only least significant 64-bits are valid)
+ */
+static inline poly64x2_t reduce128_to_64(const poly64x2_t m)
+{
+        const poly64x1_t p = vdup_n_p64((poly64_t)SNOW3GCONSTANT);
+        poly64x2_t x, t;
+
+        /* start reduction */
+        /* top 64-bits of m x p */
+        x = (poly64x2_t)vmull_p64(vgetq_lane_p64(m, 1), (poly64_t)p);
+        t = m ^ x;
+
+        /*
+         * repeat multiply and xor in case
+         * 'x' product was bigger than 64 bits
+         */
+        x = (poly64x2_t)vmull_p64(vgetq_lane_p64(x, 1), (poly64_t)p);
+        t = t ^ x;
+
+        return t;
+}
+
+/**
+ * @brief GF2 modular multiplication 64-bits x 64-bits with reduction
+ *
+ * Implements MUL64 function from the standard.
+ * SNOW3GCONSTANT/0x1b reduction polynomial applied.
+ *
+ * @param[in] a 64-bit input
+ * @param[in] b 64-bit input
+ * @return 64-bit output
+ */
+static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b)
+{
+        poly64x2_t m;
+
+        m = (poly64x2_t)vmull_p64(a, b); /* m = a x b */
+
+        m = reduce128_to_64(m); /* reduction */
+
+        return vgetq_lane_u64(vreinterpretq_u64_p64(m), 0);
+}
+
+/**
+ * @brief ClockLFSR sub-function as defined in SNOW3G standard (4 lanes)
+ *
+ * @param[in] L0 LFSR[0]
+ * @param[in] L11 LFSR[11]
+ * @return table_Alpha_div[LFSR[11] & 0xff] ^ table_Alpha_mul[LFSR[0] >> 24]
+ */
+static inline uint32x4_t C0_C11_4(const uint32x4_t L0, const uint32x4_t L11)
+{
+        const uint32x4_t SL11 = DIVa_4(L11);
+        const uint32x4_t SL0 = MULa_4(L0);
+
+        return SL11 ^ SL0;
+}
+
+/**
+ * @brief ClockLFSR function as defined in SNOW3G standard (4 lanes)
+ *
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *     ^ table_Alpha_mul[LFSR[0] >> 24]
+ *     ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ */
+static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        uint32x4_t S, T, U;
+
+        U = pCtx->LFSR_X[pCtx->iLFSR_X];
+        S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) & 15];
+        const uint32x4_t X2 = C0_C11_4(U, S);
+
+        T = vshlq_n_u32(U, 8);
+        S = vshrq_n_u32(S, 8);
+        U = T ^ pCtx->LFSR_X[(pCtx->iLFSR_X + 2) & 15];
+        ShiftLFSR_4(pCtx);
+
+        S = S ^ U;
+        S = S ^ X2;
+        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = S;
+}
+
+/**
+ * @brief ClockFSM function as defined in SNOW3G standard
+ *
+ * It operates on 4 packets/lanes at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 x 4 bytes of key stream
+ */
+static inline uint32x4_t ClockFSM_4(snow3gKeyState4_t *pCtx)
+{
+        const uint32_t iLFSR_X = pCtx->iLFSR_X;
+        const uint32x4_t F =
+                pCtx->LFSR_X[(iLFSR_X + 15) & 15] + pCtx->FSM_X[0];
+        const uint32x4_t R =
+                vaddq_u32(pCtx->LFSR_X[(iLFSR_X + 5) & 15] ^ pCtx->FSM_X[2],
+                          pCtx->FSM_X[1]);
+
+        const uint32x4_t ret = F ^ pCtx->FSM_X[1];
+
+        pCtx->FSM_X[2] = S2_box_4(pCtx->FSM_X[1]);
+        pCtx->FSM_X[1] = S1_box_4(pCtx->FSM_X[0]);
+        pCtx->FSM_X[0] = R;
+
+        return ret;
+}
+
+/**
+ * @brief Generates 4 bytes of key stream 1 buffer at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 bytes of key stream
+ */
+static inline uint32_t snow3g_keystream_1_4(snow3gKeyState1_t *pCtx)
+{
+        const uint32_t F = ClockFSM_1(pCtx);
+        const uint32_t ks = F ^ pCtx->LFSR_S[0];
+
+        ClockLFSR_1(pCtx);
+        return ks;
+}
+
+/**
+ * @brief Generates 8 bytes of key stream for 1 buffer at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 8 bytes of a key stream
+ */
+static inline uint64_t snow3g_keystream_1_8(snow3gKeyState1_t *pCtx)
+{
+        /*
+         * Merged clock FSM + clock LFSR + clock FSM + clock LFSR
+         * in order to avoid redundant processing and
+         * to reduce instruction dependencies
+         */
+        const uint32_t L0 = pCtx->LFSR_S[0];
+        const uint32_t L1 = pCtx->LFSR_S[1];
+        const uint32_t L11 = pCtx->LFSR_S[11];
+        const uint32_t L12 = pCtx->LFSR_S[12];
+        uint32_t MULa_L0 = L0;
+        uint32_t MULa_L1 = L1;
+        uint32_t DIVa_L11 = L11;
+        uint32_t DIVa_L12 = L12;
+
+        MULa_2(&MULa_L0, &MULa_L1);
+        DIVa_2(&DIVa_L11, &DIVa_L12);
+
+        const uint32_t V0 =
+                pCtx->LFSR_S[2] ^
+                MULa_L0 ^
+                DIVa_L11 ^
+                (L0 << 8) ^
+                (L11 >> 8);
+
+        const uint32_t V1 =
+                pCtx->LFSR_S[3] ^
+                MULa_L1 ^
+                DIVa_L12 ^
+                (L1 << 8) ^
+                (L12 >> 8);
+
+        const uint32_t F0 =
+                (pCtx->LFSR_S[15] + pCtx->FSM_R1) ^ L0 ^ pCtx->FSM_R2;
+        const uint32_t R0 =
+                (pCtx->FSM_R3 ^ pCtx->LFSR_S[5]) + pCtx->FSM_R2;
+
+        uint32_t s1_box_step1 = pCtx->FSM_R1;
+        uint32_t s1_box_step2 = R0;
+
+        S1_box_2(&s1_box_step1, &s1_box_step2);
+
+        uint32_t s2_box_step1 = pCtx->FSM_R2;
+        uint32_t s2_box_step2 = s1_box_step1;
+
+        S2_box_2(&s2_box_step1, &s2_box_step2);
+
+        /*
+         * At this stage FSM_R mapping is as follows:
+         * FSM_R2 = s1_box_step1
+         * FSM_R3 = s2_box_step1
+         */
+        const uint32_t F1 = (V0 + R0) ^ L1 ^ s1_box_step1;
+
+        pCtx->FSM_R3 = s2_box_step2;
+        pCtx->FSM_R2 = s1_box_step2;
+        pCtx->FSM_R1 = (s2_box_step1 ^ pCtx->LFSR_S[6]) + s1_box_step1;
+
+        /* Shift LFSR twice */
+        ShiftTwiceLFSR_1(pCtx);
+
+        /* key stream mode LFSR update */
+        pCtx->LFSR_S[14] = V0;
+        pCtx->LFSR_S[15] = V1;
+
+        return (((uint64_t) F0) << 32) | ((uint64_t) F1);
+}
+
+/**
+ * @brief Generates 4 bytes of key stream 4 buffers at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 x 4 bytes of key stream
+ */
+static inline uint32x4_t snow3g_keystream_4_4(snow3gKeyState4_t *pCtx)
+{
+        const uint32x4_t keyStream =
+                ClockFSM_4(pCtx) ^ pCtx->LFSR_X[pCtx->iLFSR_X];
+
+        ClockLFSR_4(pCtx);
+        return keyStream;
+}
+
+/**
+ * @brief Generates 8 bytes of key stream 4 buffers at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in/out] pKeyStreamLo Pointer to lower end of generated key stream
+ * @param[in/out] pKeyStreamHi Pointer to higher end of generated key stream
+ */
+static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
+                                        uint32x4_t *pKeyStreamLo,
+                                        uint32x4_t *pKeyStreamHi)
+{
+        const uint32x4_t L0 = pCtx->LFSR_X[pCtx->iLFSR_X];
+        const uint32x4_t L2 = pCtx->LFSR_X[(pCtx->iLFSR_X + 2) & 15];
+        const uint32x4_t L11 = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) & 15];
+
+        const uint32x4_t L1 = pCtx->LFSR_X[(pCtx->iLFSR_X + 1) & 15];
+        const uint32x4_t L3 = pCtx->LFSR_X[(pCtx->iLFSR_X + 3) & 15];
+        const uint32x4_t L12 = pCtx->LFSR_X[(pCtx->iLFSR_X + 12) & 15];
+
+        const uint32x4_t L5 =
pCtx->LFSR_X[(pCtx->iLFSR_X + 5) & 15]; + const uint32x4_t L6 = pCtx->LFSR_X[(pCtx->iLFSR_X + 6) & 15]; + const uint32x4_t L15 = pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15]; + + const uint32x4_t V0 = veorq_u32(veorq_u32(C0_C11_4(L0, L11), L2), + veorq_u32(vshlq_n_u32(L0, 8), + vshrq_n_u32(L11, 8))); + + const uint32x4_t V1 = veorq_u32(veorq_u32(C0_C11_4(L1, L12), L3), + veorq_u32(vshlq_n_u32(L1, 8), + vshrq_n_u32(L12, 8))); + + /* ======== first set of 4 bytes */ + + const uint32x4_t s1_box_step1 = S1_box_4(pCtx->FSM_X[0]); /* do early */ + + const uint32x4_t R0 = vaddq_u32(veorq_u32(L5, pCtx->FSM_X[2]), + pCtx->FSM_X[1]); + + const uint32x4_t F0 = veorq_u32(vaddq_u32(L15, pCtx->FSM_X[0]), + pCtx->FSM_X[1]); + const uint32x4_t L = F0 ^ L0; + + const uint32x4_t F1 = veorq_u32(vaddq_u32(V0, R0), s1_box_step1); + const uint32x4_t H = F1 ^ L1; + + /* Merge L & H sets for output */ + *pKeyStreamLo = vzip1q_u32(H, L); + *pKeyStreamHi = vzip2q_u32(H, L); + + uint32x4_t s2_box_step1 = pCtx->FSM_X[1]; + uint32x4_t s2_box_step2 = s1_box_step1; + + S2_box_2x4(&s2_box_step1, &s2_box_step2); + + /* + * At this stage FSM_X mapping is as follows: + * FSM_X[2] = s2_box_step1 + * FSM_X[1] = s1_box_step1 + * FSM_X[0] = R0 + */ + + /* Shift LFSR twice */ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 2) & 15; + + /* LFSR Update */ + pCtx->LFSR_X[(pCtx->iLFSR_X + 14) & 15] = V0; + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = V1; + + const uint32x4_t s1_box_step2 = S1_box_4(R0); + + const uint32x4_t R1 = vaddq_u32(veorq_u32(L6, s2_box_step1), + s1_box_step1); + + /* Final FSM_X update + * FSM_X[2] = s2_box_step2 + * FSM_X[1] = s1_box_step2 + * FSM_X[0] = R1 + */ + pCtx->FSM_X[2] = s2_box_step2; + pCtx->FSM_X[1] = s1_box_step2; + pCtx->FSM_X[0] = R1; +} + +/** + * @brief Generates 16 bytes of key stream 4 buffers at a time + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param[in/out] pKeyStream Pointer to store generated key stream + */ +static inline void snow3g_keystream_4_16(snow3gKeyState4_t *pCtx, + uint32x4_t pKeyStream[4]) +{ + static const uint64_t sm[2] = { + /* mask for byte swapping 64-bit words */ + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL + }; + uint32x4_t ksL1, ksL2, ksH1, ksH2; + + snow3g_keystream_4_8(pCtx, &ksL1, &ksH1); + snow3g_keystream_4_8(pCtx, &ksL2, &ksH2); + + const uint8x16_t swapMask = vreinterpretq_u8_u64(vld1q_u64(sm)); + + pKeyStream[0] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip1q_u64( + vreinterpretq_u64_u32(ksL1), + vreinterpretq_u64_u32(ksL2))), + swapMask)); + pKeyStream[1] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip2q_u64( + vreinterpretq_u64_u32(ksL1), + vreinterpretq_u64_u32(ksL2))), + swapMask)); + pKeyStream[2] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip1q_u64( + vreinterpretq_u64_u32(ksH1), + vreinterpretq_u64_u32(ksH2))), + swapMask)); + pKeyStream[3] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip2q_u64( + vreinterpretq_u64_u32(ksH1), + vreinterpretq_u64_u32(ksH2))), + swapMask)); +} + +/** + * @brief Initializes the key schedule for 4 buffers for SNOW3G f8/f9. 
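The 4-lane state filled by this initializer is laid out structure-of-arrays style: each LFSR_X[i] holds one LFSR stage for all four buffers, one buffer per 32-bit lane, with iLFSR_X rotating the stage index. A hypothetical accessor illustrating the layout (editor-added; snow3gStateConvert_4() further below performs the full extraction):

```c
/* Editor's sketch: read LFSR stage 'stage' of buffer 'buffer' (0-3)
 * from a 4-lane state. */
static uint32_t get_lfsr_stage(const snow3gKeyState4_t *ctx,
                               const unsigned stage, const unsigned buffer)
{
        const uint32x4_t v = ctx->LFSR_X[(stage + ctx->iLFSR_X) & 15];

        /* vgetq_lane_u32 requires a compile-time lane index */
        switch (buffer & 3) {
        case 0: return vgetq_lane_u32(v, 0);
        case 1: return vgetq_lane_u32(v, 1);
        case 2: return vgetq_lane_u32(v, 2);
        default: return vgetq_lane_u32(v, 3);
        }
}
```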
+ * + * @param [in] pCtx Context where the scheduled keys are stored + * @param [in] pKeySched Key schedule + * @param [in] pIV1 IV for buffer 1 + * @param [in] pIV2 IV for buffer 2 + * @param [in] pIV3 IV for buffer 3 + * @param [in] pIV4 IV for buffer 4 + */ +static inline void +snow3gStateInitialize_4(snow3gKeyState4_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4) +{ + uint32x4_t R, S, T, U; + uint32x4_t T0, T1; + int i; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 128b IV into register */ + static const uint64_t sm[2] = { + 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL + }; + + R = vld1q_u32(pIV1); + S = vld1q_u32(pIV2); + T = vld1q_u32(pIV3); + U = vld1q_u32(pIV4); + + /* initialize the array block */ + for (i = 0; i < 4; i++) { + const uint32_t K = pKeySched->k[i]; + const uint32_t L = ~K; + const uint32x4_t VK = vdupq_n_u32(K); + const uint32x4_t VL = vdupq_n_u32(L); + + pCtx->LFSR_X[i + 4] = + pCtx->LFSR_X[i + 12] = VK; + pCtx->LFSR_X[i + 0] = + pCtx->LFSR_X[i + 8] = VL; + } + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap */ + const uint64x2_t swapMask = vld1q_u64(sm); + + R = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(R), + vreinterpretq_u8_u64(swapMask))); + S = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(S), + vreinterpretq_u8_u64(swapMask))); + T = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(T), + vreinterpretq_u8_u64(swapMask))); + U = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(U), + vreinterpretq_u8_u64(swapMask))); + + /* row/column dword inversion */ + T0 = vzip1q_u32(R, S); + R = vzip2q_u32(R, S); + T1 = vzip1q_u32(T, U); + T = vzip2q_u32(T, U); + + /* row/column qword inversion */ + U = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(R), + vreinterpretq_u64_u32(T))); + T = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(R), + vreinterpretq_u64_u32(T))); + S = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(T0), + vreinterpretq_u64_u32(T1))); + R = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(T0), + vreinterpretq_u64_u32(T1))); + + /* IV ^ LFSR */ + pCtx->LFSR_X[15] = pCtx->LFSR_X[15] ^ U; + pCtx->LFSR_X[12] = pCtx->LFSR_X[12] ^ T; + pCtx->LFSR_X[10] = pCtx->LFSR_X[10] ^ S; + pCtx->LFSR_X[9] = pCtx->LFSR_X[9] ^ R; + pCtx->iLFSR_X = 0; + + /* FSM initialization */ + pCtx->FSM_X[0] = pCtx->FSM_X[1] = + pCtx->FSM_X[2] = vdupq_n_u32(0); + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + T1 = ClockFSM_4(pCtx); + + ClockLFSR_4(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] ^ T1; + } +} + + +static inline void +preserve_bits(uint64_t *KS, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the key stream and the input + * (input only in out-of-place case) */ + *KS &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the key stream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + *KS |= 
BSWAP64(safeOutBuf->b64 & ~swapMask);
+        }
+}
+
+/**
+ * @brief Core SNOW3G F8 bit algorithm for the 3GPP confidentiality algorithm
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in] pIn Input buffer
+ * @param[out] pOut Output buffer
+ * @param[in] lengthInBits length in bits of the data to be encrypted
+ * @param[in] offsetInBits offset in input buffer, where data are valid
+ */
+static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx,
+                                 const void *pIn,
+                                 void *pOut,
+                                 const uint32_t lengthInBits,
+                                 const uint32_t offsetInBits)
+{
+        const uint8_t *pBufferIn = pIn;
+        uint8_t *pBufferOut = pOut;
+        uint32_t cipherLengthInBits = lengthInBits;
+        uint64_t shiftrem = 0;
+        uint64_t KS8, KS8bit; /* 8 bytes of key stream */
+        const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+        uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+        /* Offset into the first byte (0 - 7 bits) */
+        uint32_t remainOffset = offsetInBits % 8;
+        uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+        SafeBuf safeInBuf = {0};
+        SafeBuf safeOutBuf = {0};
+
+        /* Now run the block cipher */
+
+        /* Start with potential partial block (due to offset and length) */
+        KS8 = snow3g_keystream_1_8(pCtx);
+        KS8bit = KS8 >> remainOffset;
+
+        /* Only one block to encrypt */
+        if (cipherLengthInBits < (64 - remainOffset)) {
+                byteLength = (cipherLengthInBits + 7) / 8;
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+                /*
+                 * If operation is Out-of-place and there is offset
+                 * to be applied, "remainOffset" bits from the output buffer
+                 * need to be preserved (only applicable to first byte,
+                 * since remainOffset is up to 7 bits)
+                 */
+                if ((pIn != pOut) && remainOffset) {
+                        const uint8_t mask8 = (uint8_t)
+                                (1 << (8 - remainOffset)) - 1;
+
+                        safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                                (pcBufferOut[0] & ~mask8);
+                }
+                /* If last byte is a partial byte, the last bits of the output
+                 * need to be preserved */
+                const uint8_t bitlen_with_off = remainOffset +
+                        cipherLengthInBits;
+
+                if ((bitlen_with_off & 0x7) != 0)
+                        preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+                                      &safeOutBuf, &safeInBuf,
+                                      bitlen_with_off, byteLength);
+
+                xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+                return;
+        }
+        /*
+         * If operation is Out-of-place and there is offset
+         * to be applied, "remainOffset" bits from the output buffer
+         * need to be preserved (only applicable to first byte,
+         * since remainOffset is up to 7 bits)
+         */
+        if ((pIn != pOut) && remainOffset) {
+                const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1;
+
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+                safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                        (pcBufferOut[0] & ~mask8);
+                xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit);
+                pcBufferIn += SNOW3G_BLOCK_SIZE;
+        } else {
+                /* At least 64 bits to produce (including offset) */
+                pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit);
+        }
+
+        if (remainOffset != 0)
+                shiftrem = KS8 << (64 - remainOffset);
+        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset;
+        pcBufferOut += SNOW3G_BLOCK_SIZE;
+
+        while (cipherLengthInBits) {
+                /* produce the next block of key stream */
+                KS8 = snow3g_keystream_1_8(pCtx);
+                KS8bit = (KS8 >> remainOffset) | shiftrem;
+                if (remainOffset != 0)
+                        shiftrem = KS8 << (64 - remainOffset);
+                if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) {
+                        pcBufferIn = xor_keystrm_rev(pcBufferOut,
+                                                     pcBufferIn, KS8bit);
+                        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8;
+                        pcBufferOut +=
SNOW3G_BLOCK_SIZE; + /* loop variant */ + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if ((cipherLengthInBits & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + CLEAR_VAR(&KS8, sizeof(KS8)); + CLEAR_VAR(&KS8bit, sizeof(KS8bit)); + CLEAR_MEM(&safeInBuf, sizeof(safeInBuf)); + CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +/** + * @brief Core SNOW3G F8 algorithm for the 3GPP confidentiality algorithm + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param[in] pIn Input buffer + * @param[out] pOut Output buffer + * @param[in] lengthInBytes length in bytes of the data to be encrypted + */ +static inline void f8_snow3g(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBytes) +{ + uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */ + const uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */ + const uint32_t bytes = lengthInBytes & 3; /* remaining bytes */ + uint32_t KS4; /* 4 bytes of key stream */ + uint64_t KS8; /* 8 bytes of key stream */ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + + /* process 64 bits at a time */ + while (qwords--) { + /* generate key stream 8 bytes at a time */ + KS8 = snow3g_keystream_1_8(pCtx); + + /* xor key stream 8 bytes at a time */ + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8); + pBufferOut += SNOW3G_8_BYTES; + } + + /* check for remaining 0 to 7 bytes */ + if (0 != words) { + if (bytes) { + /* 5 to 7 last bytes, process 8 bytes */ + uint8_t buftemp[8]; + uint8_t safeBuff[8]; + + memset(safeBuff, 0, SNOW3G_8_BYTES); + KS8 = snow3g_keystream_1_8(pCtx); + memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes); + xor_keystrm_rev(buftemp, safeBuff, KS8); + memcpy_keystrm(pBufferOut, buftemp, 4 + bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } else { + /* exactly 4 last bytes */ + KS4 = snow3g_keystream_1_4(pCtx); + xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4); + } + } else if (0 != bytes) { + /* 1 to 3 last bytes */ + uint8_t buftemp[4]; + uint8_t safeBuff[4]; + + memset(safeBuff, 0, SNOW3G_4_BYTES); + KS4 = snow3g_keystream_1_4(pCtx); + memcpy_keystream_32(safeBuff, pBufferIn, bytes); + xor_keystream_reverse_32(buftemp, safeBuff, KS4); + memcpy_keystream_32(pBufferOut, buftemp, bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_VAR(&KS8, sizeof(KS8)); +#endif +} + +/** + * @brief Extracts one state from a 4 buffer state structure. 
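A worked example of the qwords/words/bytes split used by f8_snow3g() above (editor's sketch): lengthInBytes = 13 gives one full 8-byte block, then the "5 to 7 last bytes" path handles the remaining 4 + 1 bytes:

```c
#include <assert.h>
#include <stdint.h>

/* same decomposition as f8_snow3g(): full 64-bit blocks, an optional
 * 32-bit word, and 1-3 trailing bytes */
static void tail_split(const uint32_t len,
                       uint32_t *qwords, uint32_t *words, uint32_t *bytes)
{
        *qwords = len / 8; /* full 64-bit blocks */
        *words = len & 4;  /* remaining 32-bit word, 4 if present */
        *bytes = len & 3;  /* remaining 1-3 bytes */
}

int main(void)
{
        uint32_t q, w, b;

        tail_split(13, &q, &w, &b);
        assert(q == 1 && w == 4 && b == 1);
        return 0;
}
```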
+ *
+ * @param[in] pSrcState Pointer to the source state
+ * @param[out] pDstState Pointer to the destination state
+ * @param[in] NumBuffer Buffer number
+ */
+static inline void snow3gStateConvert_4(const snow3gKeyState4_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        const uint32_t NumBuffer)
+{
+        const uint32_t iLFSR_X = pSrcState->iLFSR_X;
+        const uint32x4_t *LFSR_X = pSrcState->LFSR_X;
+        uint32_t i;
+
+        for (i = 0; i < 16; i++) {
+                const uint32_t *pLFSR_X =
+                        (const uint32_t *) &LFSR_X[(i + iLFSR_X) & 15];
+
+                pDstState->LFSR_S[i] = pLFSR_X[NumBuffer];
+        }
+
+        const uint32_t *pFSM_X0 = (const uint32_t *)&pSrcState->FSM_X[0];
+        const uint32_t *pFSM_X1 = (const uint32_t *)&pSrcState->FSM_X[1];
+        const uint32_t *pFSM_X2 = (const uint32_t *)&pSrcState->FSM_X[2];
+
+        pDstState->FSM_R1 = pFSM_X0[NumBuffer];
+        pDstState->FSM_R2 = pFSM_X1[NumBuffer];
+        pDstState->FSM_R3 = pFSM_X2[NumBuffer];
+}
+
+/**
+ * @brief Provides size of key schedule structure
+ * @return Size of the key schedule structure in bytes
+ */
+size_t SNOW3G_KEY_SCHED_SIZE(void)
+{
+        return sizeof(snow3g_key_schedule_t);
+}
+
+/**
+ * @brief Key schedule initialisation
+ * @param[in] pKey pointer to a 16-byte key
+ * @param[out] pCtx pointer to key schedule structure
+ * @return Operation status
+ * @retval 0 all OK
+ * @retval -1 parameter error
+ */
+int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx)
+{
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (pCtx == NULL))
+                return -1;
+#endif
+
+        const uint32_t *pKey32 = pKey;
+
+        pCtx->k[3] = BSWAP32(pKey32[0]);
+        pCtx->k[2] = BSWAP32(pKey32[1]);
+        pCtx->k[1] = BSWAP32(pKey32[2]);
+        pCtx->k[0] = BSWAP32(pKey32[3]);
+
+        return 0;
+}
+
+/**
+ * @brief Single buffer F8 encrypt/decrypt
+ *
+ * Single buffer enc/dec with IV and precomputed key schedule
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[out] pBufferOut pointer to an output buffer
+ * @param[in] lengthInBytes message length in bytes
+ */
+void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        void *pBufferOut,
+                        const uint32_t lengthInBytes)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pBufferOut == NULL) ||
+            (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN))
+                return;
+#endif
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        snow3gKeyState1_t ctx;
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_1_4(&ctx);
+
+        f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes);
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief Single bit-length buffer F8 encrypt/decrypt
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[out] pBufferOut pointer to an output buffer
+ * @param[in] lengthInBits message length in bits
+ * @param[in] offsetInBits message offset in bits
+ */
+void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
+                            const void *pIV,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t lengthInBits,
+                            const uint32_t offsetInBits)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pBufferOut ==
NULL) || + (lengthInBits == 0)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + snow3gKeyState1_t ctx; + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx); + + f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits); + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/** + * @brief Two buffer F8 encrypt/decrypt with the same key schedule + * + * Two buffers enc/dec with the same key schedule. + * The 2 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + * + * @param[in] pHandle pointer to precomputed key schedule + * @param[in] pIV1 pointer to IV + * @param[in] pIV2 pointer to IV + * @param[in] pBufIn1 pointer to an input buffer + * @param[in] pBufOut1 pointer to an output buffer + * @param[in] lenInBytes1 message size in bytes + * @param[in] pBufIn2 pointer to an input buffer + * @param[in] pBufOut2 pointer to an output buffer + * @param[in] lenInBytes2 message size in bytes + */ +void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + snow3gKeyState1_t ctx1, ctx2; + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx1, pHandle, pIV1); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx1); + + /* data processing for packet 1 */ + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx2, pHandle, pIV2); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx2); + + /* data processing for packet 2 */ + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx1, sizeof(ctx1)); + CLEAR_MEM(&ctx2, sizeof(ctx2)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +/** + * @brief Four buffer F8 encrypt/decrypt with the same key schedule + * + * Four packets enc/dec with the same key schedule. + * The 4 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
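+ *
+ * Usage sketch (illustrative only; the key, IVs, buffers and lengths are
+ * hypothetical and caller-provided):
+ *
+ *     snow3g_key_schedule_t ks;
+ *
+ *     if (SNOW3G_INIT_KEY_SCHED(key16, &ks) == 0)
+ *             SNOW3G_F8_4_BUFFER(&ks, iv1, iv2, iv3, iv4,
+ *                                in1, out1, len1, in2, out2, len2,
+ *                                in3, out3, len3, in4, out4, len4);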
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV1 pointer to IV
+ * @param[in] pIV2 pointer to IV
+ * @param[in] pIV3 pointer to IV
+ * @param[in] pIV4 pointer to IV
+ * @param[in] pBufferIn1 pointer to an input buffer
+ * @param[out] pBufferOut1 pointer to an output buffer
+ * @param[in] lengthInBytes1 message size in bytes
+ * @param[in] pBufferIn2 pointer to an input buffer
+ * @param[out] pBufferOut2 pointer to an output buffer
+ * @param[in] lengthInBytes2 message size in bytes
+ * @param[in] pBufferIn3 pointer to an input buffer
+ * @param[out] pBufferOut3 pointer to an output buffer
+ * @param[in] lengthInBytes3 message size in bytes
+ * @param[in] pBufferIn4 pointer to an input buffer
+ * @param[out] pBufferOut4 pointer to an output buffer
+ * @param[in] lengthInBytes4 message size in bytes
+ */
+void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pBufferIn1,
+                        void *pBufferOut1,
+                        const uint32_t lengthInBytes1,
+                        const void *pBufferIn2,
+                        void *pBufferOut2,
+                        const uint32_t lengthInBytes2,
+                        const void *pBufferIn3,
+                        void *pBufferOut3,
+                        const uint32_t lengthInBytes3,
+                        const void *pBufferIn4,
+                        void *pBufferOut4,
+                        const uint32_t lengthInBytes4)
+{
+        const size_t num_lanes = 4;
+        snow3gKeyState4_t ctx;
+        uint32_t lenInBytes[4];
+        uint8_t *pBufferOut[4];
+        const uint8_t *pBufferIn[4];
+        uint32_t bytes, qwords, i;
+
+        length_copy_4(lenInBytes, lengthInBytes1, lengthInBytes2,
+                      lengthInBytes3, lengthInBytes4);
+
+        cptr_copy_4((const void **)pBufferIn,
+                    pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4);
+
+        ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2,
+                   pBufferOut3, pBufferOut4);
+
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) ||
+            (pIV1 == NULL) || (pIV2 == NULL) ||
+            (pIV3 == NULL) || (pIV4 == NULL))
+                return;
+
+        if (!cptr_check((const void * const *)pBufferIn, num_lanes) ||
+            !ptr_check((void **)pBufferOut, num_lanes) ||
+            !length_check(lenInBytes, num_lanes))
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        /* find min common length */
+        bytes = length_find_min(lenInBytes, num_lanes);
+        qwords = bytes / SNOW3G_8_BYTES;
+
+        /* subtract min common length from all buffers */
+        length_sub(lenInBytes, num_lanes, qwords * SNOW3G_8_BYTES);
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_4_4(&ctx);
+
+        /* generate 16 bytes of key stream at a time on all 4 streams */
+        while (qwords >= 2) {
+                uint32x4_t ks[4];
+
+                snow3g_keystream_4_16(&ctx, ks);
+
+                for (i = 0; i < num_lanes; i++) {
+                        const uint32x4_t in =
+                                vld1q_u32((const uint32_t *)pBufferIn[i]);
+
+                        vst1q_u32((uint32_t *)pBufferOut[i], in ^ ks[i]);
+
+                        pBufferOut[i] += (2 * SNOW3G_8_BYTES);
+                        pBufferIn[i] += (2 * SNOW3G_8_BYTES);
+                }
+
+                qwords = qwords - 2;
+        }
+
+        while (qwords--) {
+                uint32x4_t H, L; /* 8 bytes of key stream per buffer */
+
+                snow3g_keystream_4_8(&ctx, &L, &H);
+
+                pBufferIn[0] = xor_keystrm_rev(pBufferOut[0], pBufferIn[0],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(L), 0));
+                pBufferIn[1] = xor_keystrm_rev(pBufferOut[1], pBufferIn[1],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(L), 1));
+                pBufferIn[2] = xor_keystrm_rev(pBufferOut[2], pBufferIn[2],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(H), 0));
+                pBufferIn[3] = xor_keystrm_rev(pBufferOut[3], pBufferIn[3],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(H), 1));
+
+                for (i = 0; i < num_lanes; i++)
+                        pBufferOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remainder of each buffer:
+         * - extract the LFSR and FSM structures
+         * - continue processing one buffer at a time
+         */
+        for (i = 0; i < num_lanes; i++) {
+                snow3gKeyState1_t ctx_t;
+
+                if (lenInBytes[i] == 0)
+                        continue;
+                snow3gStateConvert_4(&ctx, &ctx_t, i);
+                f8_snow3g(&ctx_t, pBufferIn[i], pBufferOut[i], lenInBytes[i]);
+        }
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief Multiple-key 8 buffer F8 encrypt/decrypt
+ *
+ * Eight packets enc/dec with eight respective key schedules.
+ * The 8 IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
+ *
+ * @param[in] pKey pointer to an array of key schedules
+ * @param[in] IV pointer to an array of IV's
+ * @param[in] BufferIn pointer to an array of input buffers
+ * @param[out] BufferOut pointer to an array of output buffers
+ * @param[in] lengthInBytes pointer to an array of message lengths in bytes
+ */
+void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
+                                 const void * const IV[],
+                                 const void * const BufferIn[],
+                                 void *BufferOut[],
+                                 const uint32_t lengthInBytes[])
+{
+        const size_t num_lanes = 8;
+
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) ||
+            (BufferOut == NULL) || (lengthInBytes == NULL))
+                return;
+
+        if (!ptr_check(BufferOut, num_lanes) || !cptr_check(IV, num_lanes) ||
+            !cptr_check((const void * const *)pKey, num_lanes) ||
+            !cptr_check(BufferIn, num_lanes) ||
+            !length_check(lengthInBytes, num_lanes))
+                return;
+#endif
+        for (uint32_t i = 0; i < num_lanes; i++)
+                SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i],
+                                   lengthInBytes[i]);
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+}
+
+/**
+ * @brief 8 buffer F8 encrypt/decrypt with the same key schedule
+ *
+ * Eight packets enc/dec with the same key schedule.
+ * The 8 IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
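+ *
+ * Note: this entry point is serviced as two consecutive 4-buffer
+ * operations (see the SNOW3G_F8_4_BUFFER calls below), so the eight
+ * packets are processed as two groups of four.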
+ * + * @param[in] pHandle pointer to precomputed key schedule + * @param[in] pIV1 pointer to IV + * @param[in] pIV2 pointer to IV + * @param[in] pIV3 pointer to IV + * @param[in] pIV4 pointer to IV + * @param[in] pIV5 pointer to IV + * @param[in] pIV6 pointer to IV + * @param[in] pIV7 pointer to IV + * @param[in] pIV8 pointer to IV + * @param[in] pBufIn1 pointer to an input buffer + * @param[in] pBufOut1 pointer to an output buffer + * @param[in] lenInBytes1 message size in bytes + * @param[in] pBufIn2 pointer to an input buffer + * @param[in] pBufOut2 pointer to an output buffer + * @param[in] lenInBytes2 message size in bytes + * @param[in] pBufIn3 pointer to an input buffer + * @param[in] pBufOut3 pointer to an output buffer + * @param[in] lenInBytes3 message size in bytes + * @param[in] pBufIn4 pointer to an input buffer + * @param[in] pBufOut4 pointer to an output buffer + * @param[in] lenInBytes4 message size in bytes + * @param[in] pBufIn5 pointer to an input buffer + * @param[in] pBufOut5 pointer to an output buffer + * @param[in] lenInBytes5 message size in bytes + * @param[in] pBufIn6 pointer to an input buffer + * @param[in] pBufOut6 pointer to an output buffer + * @param[in] lenInBytes6 message size in bytes + * @param[in] pBufIn7 pointer to an input buffer + * @param[in] pBufOut7 pointer to an output buffer + * @param[in] lenInBytes7 message size in bytes + * @param[in] pBufIn8 pointer to an input buffer + * @param[in] pBufOut8 pointer to an output buffer + * @param[in] lenInBytes8 message size in bytes + */ +void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2, + const void *pBufIn3, + void *pBufOut3, + const uint32_t lenInBytes3, + const void *pBufIn4, + void *pBufOut4, + const uint32_t lenInBytes4, + const void *pBufIn5, + void *pBufOut5, + const uint32_t lenInBytes5, + const void *pBufIn6, + void *pBufOut6, + const uint32_t lenInBytes6, + const void *pBufIn7, + void *pBufOut7, + const uint32_t lenInBytes7, + const void *pBufIn8, + void *pBufOut8, + const uint32_t lenInBytes8) +{ + uint32_t lengthInBytes[8]; + const uint8_t *pBufferIn[8]; + const void *pIV[8]; + uint8_t *pBufferOut[8]; + + length_copy_8(lengthInBytes, + lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4, + lenInBytes5, lenInBytes6, lenInBytes7, lenInBytes8); + + cptr_copy_8((const void **)pBufferIn, + pBufIn1, pBufIn2, pBufIn3, pBufIn4, + pBufIn5, pBufIn6, pBufIn7, pBufIn8); + + cptr_copy_8(pIV, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, pIV7, pIV8); + + ptr_copy_8((void **)pBufferOut, + pBufOut1, pBufOut2, pBufOut3, pBufOut4, + pBufOut5, pBufOut6, pBufOut7, pBufOut8); + +#ifdef SAFE_PARAM + const size_t num_lanes = 8; + + if (pHandle == NULL) + return; + + if (!length_check(lengthInBytes, num_lanes) || + !cptr_check((const void * const *)pBufferIn, num_lanes) || + !cptr_check(pIV, num_lanes) || + !ptr_check((void **)pBufferOut, num_lanes)) + return; +#endif + + SNOW3G_F8_4_BUFFER(pHandle, + pIV[0], pIV[1], pIV[2], pIV[3], + pBufferIn[0], pBufferOut[0], lengthInBytes[0], + pBufferIn[1], pBufferOut[1], lengthInBytes[1], + pBufferIn[2], pBufferOut[2], lengthInBytes[2], + pBufferIn[3], pBufferOut[3], lengthInBytes[3]); + + SNOW3G_F8_4_BUFFER(pHandle, + pIV[4], pIV[5], pIV[6], pIV[7], + pBufferIn[4], 
pBufferOut[4], lengthInBytes[4],
+                       pBufferIn[5], pBufferOut[5], lengthInBytes[5],
+                       pBufferIn[6], pBufferOut[6], lengthInBytes[6],
+                       pBufferIn[7], pBufferOut[7], lengthInBytes[7]);
+}
+
+/**
+ * @brief Single-key N buffer F8 encrypt/decrypt
+ *
+ * Performs F8 enc/dec on N packets.
+ * The input IV's are passed in Little Endian format.
+ * The KeySchedule is in Little Endian format.
+ *
+ * @param[in] pCtx pointer to a key schedule
+ * @param[in] IV pointer to an array of IV's
+ * @param[in] pBufferIn pointer to an array of input buffers
+ * @param[out] pBufferOut pointer to an array of output buffers
+ * @param[in] bufLenInBytes pointer to an array of message lengths in bytes
+ * @param[in] packetCount number of packets to process (N)
+ */
+void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx,
+                        const void * const IV[],
+                        const void * const pBufferIn[],
+                        void *pBufferOut[],
+                        const uint32_t bufLenInBytes[],
+                        const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        if (!cptr_check(IV, packetCount) ||
+            !cptr_check(pBufferIn, packetCount) ||
+            !ptr_check(pBufferOut, packetCount) ||
+            !length_check(bufLenInBytes, packetCount))
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%d)\n", packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0, tempLen = 0;
+        uint8_t *srctempbuff;
+        uint8_t *dsttempbuff;
+        uint8_t *ivtempbuff;
+        uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+        memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+        memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *));
+        memcpy((void *)pIV, IV, packetCount * sizeof(void *));
+
+        packet_index = packetCount;
+
+        while (packet_index--) {
+
+                /* check if all packets are sorted by decreasing length */
+                if (packet_index > 0 && lensBuf[packet_index - 1] <
+                    lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                }
+        }
+
+        if (sortNeeded) {
+
+                /* sort packets in decreasing buffer size from [0] to the
+                   [n]th packet, where buffer[0] will contain the longest
+                   buffer and buffer[n] will contain the shortest buffer.
+                   4 arrays are swapped:
+                   - pointers to input buffers
+                   - pointers to output buffers
+                   - pointers to input IV's
+                   - input buffer lengths */
+                packet_index = packetCount;
+                while (packet_index--) {
+
+                        inner_index = packet_index;
+                        while (inner_index--) {
+
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+
+                                        /* swap buffers to arrange in
+                                           descending order from [0].
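+                                           E.g. (hypothetical sizes) lengths
+                                           {64, 256, 128} end up as
+                                           {256, 128, 64} once the outer
+                                           loop completes. */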
+                                        srctempbuff = pSrcBuf[packet_index];
+                                        dsttempbuff = pDstBuf[packet_index];
+                                        ivtempbuff = pIV[packet_index];
+                                        tempLen = lensBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                }
+                        } /* for inner packet index (inner bubble-sort) */
+                } /* for outer packet index (outer bubble-sort) */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* @todo process 8 buffers at-a-time */
+        /* process 4 buffers at-a-time */
+        while (pktCnt >= 4) {
+                pktCnt -= 4;
+                SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3]);
+                packet_index += 4;
+        }
+
+        /* process 2 packets at-a-time */
+        while (pktCnt >= 2) {
+                pktCnt -= 2;
+                SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1]);
+                packet_index += 2;
+        }
+
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/**
+ * @brief Multi-key N buffer F8 encrypt/decrypt
+ *
+ * Performs F8 enc/dec on N packets.
+ * The input IV's are passed in Little Endian format.
+ * The KeySchedule is in Little Endian format.
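+ *
+ * Note: unlike the single-key variant above, no multi-buffer batching is
+ * applied here yet; after the length sort, each packet is processed
+ * individually with its own key schedule (see the @todo markers below).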
+ * + * @param[in] pCtx pointer to an array of key schedules + * @param[in] IV pointer to an array of IV's + * @param[in] pBufferIn pointer to an array of input buffers + * @param[out] pBufferOut pointer to an array of output buffers + * @param[in] bufLenInBytes pointer to an array of message lengths in bytes + * @param[in] packetCount number of packets to process (N) + */ +void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((pCtx[i] == NULL) || (IV[i] == NULL) || + (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) || + (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + if (packetCount > NUM_PACKETS_16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + snow3g_key_schedule_t *tempCtx; + + memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. 
+                                           */
+                                        srctempbuff = pSrcBuf[packet_index];
+                                        dsttempbuff = pDstBuf[packet_index];
+                                        ivtempbuff = pIV[packet_index];
+                                        tempLen = lensBuf[packet_index];
+                                        tempCtx = pCtxBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+                                        pCtxBuf[packet_index] =
+                                                pCtxBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                        pCtxBuf[inner_index] = tempCtx;
+                                }
+                        } /* for inner packet index (inner bubble-sort) */
+                } /* for outer packet index (outer bubble-sort) */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* @todo process 8 buffers at-a-time */
+        /* @todo process 4 buffers at-a-time */
+        /* @todo process 2 packets at-a-time */
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0],
+                                   pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/**
+ * @brief Single buffer bit-length F9 function
+ *
+ * Single buffer digest with IV and precomputed key schedule.
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[in] lengthInBits message length in bits
+ * @param[out] pDigest pointer to store the F9 digest
+ */
+void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        const uint64_t lengthInBits,
+                        void *pDigest)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pDigest == NULL) ||
+            (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+                return;
+#endif
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        snow3gKeyState1_t ctx;
+        uint32_t z[5];
+        uint64_t lengthInQwords, E, P;
+        uint64_t i;
+        const uint64_t *inputBuffer;
+
+        inputBuffer = (const uint64_t *)pBufferIn;
+
+        /* Initialize the SNOW3G key schedule */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /* Generate 5 key stream words */
+        snow3g_f9_keystream_words(&ctx, &z[0]);
+
+        P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+
+        lengthInQwords = lengthInBits / 64;
+
+        E = 0;
+        i = 0;
+
+        if (lengthInQwords > 8) {
+                /* compute P^2, P^3 and P^4 */
+                const uint64_t P2 = multiply_and_reduce64(P, P);
+                const uint64_t P3 = multiply_and_reduce64(P2, P);
+                const uint64_t P4 = multiply_and_reduce64(P3, P);
+                const uint64_t bs[2] = {0x0001020304050607ULL,
+                                        0x08090a0b0c0d0e0fULL};
+                const uint8x16_t bswap2x64 =
+                        vreinterpretq_u8_u64(vld1q_u64(bs));
+                uint64_t ch[2] = {0xffffffffffffffffULL, 0};
+                const poly64x2_t clear_hi64 = vld1q_p64((poly64_t *)ch);
+                const uint64_t *m_ptr = &inputBuffer[i];
+                poly64x2_t EV = vdupq_n_p64(0);
+
+                for (; (i + 3) < lengthInQwords; i += 4) {
+                        uint64_t m0, m1, m2, m3;
+                        uint64x2_t M1_t, M2_t;
+                        poly64x2_t t1, t2, t3;
+                        /* load 2 x 128-bits and byte swap 64-bit words */
+                        M1_t = vld1q_u64(m_ptr);
+                        m_ptr += 2;
+                        M2_t = vld1q_u64(m_ptr);
+                        m_ptr += 2;
+                        M1_t = vreinterpretq_u64_u8(vqtbl1q_u8(
+                                vreinterpretq_u8_u64(M1_t), bswap2x64));
+                        M2_t = vreinterpretq_u64_u8(vqtbl1q_u8(
+                                vreinterpretq_u8_u64(M2_t), bswap2x64));
+                        m0 = vgetq_lane_u64(M1_t, 0);
+                        m1 = vgetq_lane_u64(M1_t, 1);
+                        m2 = vgetq_lane_u64(M2_t, 0);
+                        m3 = vgetq_lane_u64(M2_t, 1);
+
+                        /* add current EV to the first word of
the message */ + m0 = m0 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 0); + m1 = m1 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 1); + + /* t1 = (M0 x P4) + (M1 x P3) + (M2 x P2) + (M3 x P1) */ + t1 = (poly64x2_t)vmull_p64(m2, P2); + t2 = (poly64x2_t)vmull_p64(m3, P); + + t1 = t1 ^ t2; + + t2 = (poly64x2_t)vmull_p64(m0, P4); + t3 = (poly64x2_t)vmull_p64(m1, P3); + + t2 = t2 ^ t3; + t1 = t2 ^ t1; + + /* reduce 128-bit product */ + EV = reduce128_to_64(t1); + + /* clear top 64-bits for the subsequent add/xor */ + EV = EV & clear_hi64; + } + + for (; (i + 1) < lengthInQwords; i+= 2) { + poly64x2_t t1, t2; + uint64x2_t M_t; + + /* load 128-bits and byte swap 64-bit words */ + M_t = vld1q_u64(m_ptr); + m_ptr += 2; + M_t = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(M_t), bswap2x64)); + + /* add current EV to the first word of the message */ + M_t = M_t ^ EV; + + poly64_t M0 = vgetq_lane_u64(M_t, 0); + poly64_t M1 = vgetq_lane_u64(M_t, 1); + + /* t1 = (M0 x P2) + (M1 x P1) */ + t1 = (poly64x2_t)vmull_p64(M0, P2); + t2 = (poly64x2_t)vmull_p64(M1, P); + t1 = t1 ^ t2; + + /* reduce 128-bit product */ + EV = reduce128_to_64(t1); + + /* clear top 64-bits for the subsequent add/xor */ + EV = EV & clear_hi64; + } + E = vgetq_lane_u64(vreinterpretq_u64_p64(EV), 0); + } + + { + /* all blocks except the last one */ + uint64_t V; + for (; i < lengthInQwords; i++) { + V = BSWAP64(inputBuffer[i]); + E = multiply_and_reduce64(E ^ V, P); + } +#ifdef SAFE_DATA + CLEAR_VAR(&V, sizeof(V)); +#endif + } + + /* last bits of last block if any left */ + uint64_t rem_bits = lengthInBits % 64; + if (rem_bits) { + uint64_t V; + /* last bytes, do not go past end of buffer */ + memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8); + V = BSWAP64(V); + /* mask extra bits */ + V &= (((uint64_t)-1) << (64 - rem_bits)); + E = multiply_and_reduce64(E ^ V, P); +#ifdef SAFE_DATA + CLEAR_VAR(&V, sizeof(V)); +#endif + } + + /* Multiply by Q */ + E = multiply_and_reduce64(E ^ lengthInBits, + (((uint64_t)z[2] << 32) | ((uint64_t)z[3]))); + + /* Final MAC */ + *(uint32_t *)pDigest = + (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32)); + +#ifdef SAFE_DATA + CLEAR_VAR(&E, sizeof(E)); + CLEAR_VAR(&P, sizeof(P)); + CLEAR_MEM(&z, sizeof(z)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} +#endif /* SNOW3G_COMMON_H */ diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 0e0204864ff2c28a5b02333699d1011962b7102e..f5a8334041a12ff7c211a2d54d310a2bd4cbeeed 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -395,4 +395,9 @@ init_mb_mgr_avx2_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_avx512_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs); + #endif /* IMB_IPSEC_MB_INTERNAL_H */ diff --git a/lib/include/noaesni.h b/lib/include/noaesni.h index 19e19fa3a28629ed3f519845ab063763782591ce..57f564dd19acbe09f6a230a913c5fb3b48dc5f4b 100644 --- a/lib/include/noaesni.h +++ b/lib/include/noaesni.h @@ -30,6 +30,14 @@ #ifndef NOAESNI_H #define NOAESNI_H +IMB_DLL_EXPORT void init_mb_mgr_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64_no_aesni(IMB_MGR *state); 
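+
+/*
+ * Minimal usage sketch for the AArch64 no-AESNI manager entry points
+ * (illustrative only; error handling is omitted and the zero flags
+ * argument is a hypothetical choice):
+ *
+ *     IMB_MGR *mgr = alloc_mb_mgr(0);
+ *
+ *     init_mb_mgr_aarch64_no_aesni(mgr);
+ *     IMB_JOB *job = get_next_job_aarch64_no_aesni(mgr);
+ */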
+IMB_DLL_EXPORT uint32_t queue_size_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64_no_aesni(IMB_MGR *state); + IMB_DLL_EXPORT void init_mb_mgr_sse_no_aesni(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *submit_job_sse_no_aesni(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_sse_no_aesni(IMB_MGR *state); diff --git a/lib/include/snow3g.h b/lib/include/snow3g.h index 24555e2f0c190dba6c52c38d0f2f28ff1f30b280..13bdbb156d9414eab0c7a5bfcea5be16b0967b7d 100644 --- a/lib/include/snow3g.h +++ b/lib/include/snow3g.h @@ -655,4 +655,243 @@ snow3g_f9_1_buffer_internal_vaes_avx512(const uint64_t *pBufferIn, const uint32_t KS[5], const uint64_t lengthInBits); +/***************************************************************************** + * AARCH64 +******************************************************************************/ +void +snow3g_f8_1_buffer_bit_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); +void +snow3g_f8_8_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t 
bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64(void); + +int +snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AARCH64 NO-AESNI + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * + const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64_no_aesni(void); + +int +snow3g_init_key_sched_aarch64_no_aesni(const 
void *pKey, + snow3g_key_schedule_t *pCtx); #endif /* _SNOW3G_H_ */ diff --git a/lib/include/snow3g_tables.h b/lib/include/snow3g_tables.h index 8b43f286c27beabc176922b6e3d9a4f4ca858e2e..bc87e6da91d899cb89854b05801e183017c3b90f 100644 --- a/lib/include/snow3g_tables.h +++ b/lib/include/snow3g_tables.h @@ -29,6 +29,9 @@ #define _SNOW3G_TABLES_H_ #include +#include "ipsec-mb.h" + +#ifdef __x86_64__ #include "constant_lookup.h" #if defined (AVX) || defined (AVX2) @@ -37,6 +40,8 @@ #define SNOW3G_SAFE_LUT8(table, idx, size) LOOKUP8_SSE(table, idx, size) #endif /* AVX || AVX2 */ +#endif /* __x86_64__ */ + extern const int snow3g_table_A_mul[256]; extern const int snow3g_table_A_div[256]; extern const uint8_t snow3g_invSR_SQ[256]; diff --git a/lib/include/wireless_common.h b/lib/include/wireless_common.h index 4f838c4ac3eadd7c7931b13d751cd36dd4349559..6851f9aeefa72dabf42dcec053dc973e92f3b023 100644 --- a/lib/include/wireless_common.h +++ b/lib/include/wireless_common.h @@ -29,11 +29,14 @@ #define _WIRELESS_COMMON_H_ #include + +#ifdef __x86_64__ #ifdef LINUX #include #else #include #endif +#endif /* __x86_64__ */ #define NUM_PACKETS_1 1 #define NUM_PACKETS_2 2 @@ -50,6 +53,7 @@ #define BSWAP64 _byteswap_uint64 #endif +#ifndef __aarch64__ typedef union _m128_u { uint8_t byte[16]; uint16_t word[8]; @@ -64,6 +68,7 @@ typedef union _m64_u { uint32_t dword[2]; uint64_t m; } m64_t; +#endif static inline uint32_t bswap4(const uint32_t val) { @@ -174,6 +179,7 @@ memcpy_keystrm(uint8_t *pDst, const uint8_t *pSrc, const uint32_t len) } } +#ifndef __aarch64__ /** * @brief Save start and end of the buffer around message * @@ -403,4 +409,5 @@ IMB_DLL_LOCAL void asm_XorKeyStream32B_avx2(const void *pIn, void *pOut, IMB_DLL_LOCAL void asm_XorKeyStream64B_avx512(const void *pIn, void *pOut, const void *pKey); +#endif /* __aarch64__ */ #endif /* _WIRELESS_COMMON_H_ */ diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index a386b031970bcea66f50e42b66d559b7254e56b8..4cbcd060a8e0b366422d186820c1b2af60618f31 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -123,6 +123,7 @@ typedef enum { IMB_ARCH_AVX, IMB_ARCH_AVX2, IMB_ARCH_AVX512, + IMB_ARCH_AARCH64, IMB_ARCH_NUM, } IMB_ARCH; @@ -984,6 +985,9 @@ typedef uint32_t (*crc32_fn_t)(const void *, const uint64_t); #define IMB_FEATURE_BMI2 (1ULL << 18) #define IMB_FEATURE_AESNI_EMU (1ULL << 19) +#define IMB_FEATURE_AARCH64 (1ULL << 32) +#define IMB_FEATURE_ASIMD (1ULL << 33) + /* TOP LEVEL (IMB_MGR) Data structure fields */ #define IMB_MAX_JOBS 128 @@ -1307,6 +1311,14 @@ IMB_DLL_EXPORT uint32_t queue_size_sse(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_completed_job_sse(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_next_job_sse(IMB_MGR *state); +IMB_DLL_EXPORT void init_mb_mgr_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64(IMB_MGR *state); + /** * @brief Automatically initialize most performant * Multi-buffer manager based on CPU features diff --git a/lib/no-aesni/aesni_emu.c b/lib/no-aesni/aesni_emu.c index 9fc35db8ee823161d752b44bd0926fa3405c69d7..29492234837ac4ed82ebf945886679f942e1e732 100644 --- a/lib/no-aesni/aesni_emu.c +++ b/lib/no-aesni/aesni_emu.c @@ -28,16 +28,24 @@ /* 
========================================================================== */
 /* AESNI emulation API and helper functions */
 /* ========================================================================== */
+#define AESNI_EMU
+
 #include "ipsec-mb.h"
 #include "aesni_emu.h"
-#include "include/constant_lookup.h"
+#ifdef __aarch64__
+#include "aarch64/constant_lookup_aarch64.h"
+#include <arm_neon.h>
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
+#include "include/constant_lookup.h"
 #ifdef LINUX
 #include <x86intrin.h>
 #else
 #include <intrin.h>
 #endif
+#endif /* __x86_64__ */
 
 static const DECLARE_ALIGNED(uint8_t aes_sbox[16][16], 16) = {
         { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
@@ -132,23 +140,41 @@ static uint32_t rot(const uint32_t x)
 static void substitute_bytes(union xmm_reg *dst, const union xmm_reg *src)
 {
+#ifdef __aarch64__
+        uint8x16_t vx = vld1q_u8((uint8_t const *) &src->byte[0]);
+
+        IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
+
+        vx = lookup_16x8bit_neon(vx, aes_sbox);
+        vst1q_u8((uint8_t *) &dst->byte[0], vx);
+#else
         __m128i vx = _mm_loadu_si128((const __m128i *) &src->byte[0]);
 
         IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
 
         vx = lookup_16x8bit_sse(vx, aes_sbox);
         _mm_storeu_si128((__m128i *) &dst->byte[0], vx);
+#endif
 }
 
 static void inverse_substitute_bytes(union xmm_reg *dst,
                                      const union xmm_reg *src)
 {
+#ifdef __aarch64__
+        uint8x16_t vx = vld1q_u8((uint8_t const *) &src->byte[0]);
+
+        IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
+
+        vx = lookup_16x8bit_neon(vx, aes_isbox);
+        vst1q_u8((uint8_t *) &dst->byte[0], vx);
+#else
         __m128i vx = _mm_loadu_si128((const __m128i *) &src->byte[0]);
 
         IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
 
         vx = lookup_16x8bit_sse(vx, aes_isbox);
         _mm_storeu_si128((__m128i *) &dst->byte[0], vx);
+#endif
 }
 
 static uint8_t gfmul(const uint8_t x, const uint8_t y)
diff --git a/perf/Makefile b/perf/Makefile
index 67d58449be058db956330dbac497ced77ccd936f..3dc13dc20df9a9f24750523902fce91695e1c53e 100644
--- a/perf/Makefile
+++ b/perf/Makefile
@@ -1,8 +1,8 @@
 # Copyright (c) 2017-2022, Intel Corporation
-# 
+#
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-# 
+#
 # * Redistributions of source code must retain the above copyright notice,
 #   this list of conditions and the following disclaimer.
 # * Redistributions in binary form must reproduce the above copyright
@@ -11,7 +11,7 @@
 # * Neither the name of Intel Corporation nor the names of its contributors
 #   may be used to endorse or promote products derived from this software
 #   without specific prior written permission.
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -26,6 +26,9 @@ EXE=ipsec_perf INSTPATH ?= /usr/include/ipsec-mb.h LIB_DIR ?= ../lib + +ARCH = $(shell uname -m) + NASM ?= nasm MINGW ?= $(shell $(CC) -dM -E - < /dev/null | grep -i mingw | wc -l | sed 's/^ *//') @@ -47,11 +50,14 @@ endif # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported # "-fcf-protection" needs to be checked separately +ifeq ($(ARCH),x86_64) ifeq ($(MINGW),0) CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \ $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l)) CET_LDFLAGS=-r -z ibt -z shstk -endif +endif # MINGW +endif # x86_64 + ifeq ($(CC_HAS_CET),1) CFLAGS += -fcf-protection=full endif @@ -82,12 +88,18 @@ ifeq ($(MINGW),0) CFLAGS += -O3 -fPIE -fstack-protector -D_FORTIFY_SOURCE=2 else CFLAGS += -O2 -fPIE -endif -endif +endif # MINGW +endif # DEBUG SOURCES := ipsec_perf.c msr.c +ifneq ($(ARCH),aarch64) ASM_SOURCES := misc.asm OBJECTS := $(SOURCES:%.c=%.o) $(ASM_SOURCES:%.asm=%.o) +else +ASM_SOURCES := misc_aarch64.S +CFLAGS += -march=armv8-a +OBJECTS := $(SOURCES:%.c=%.o) $(ASM_SOURCES:%.S=%.o) +endif # AARCH64 CHECKPATCH ?= checkpatch.pl CPPCHECK ?= cppcheck @@ -95,12 +107,17 @@ CPPCHECK ?= cppcheck .PHONY: all clean style cppcheck # rule for compiling assembly code with producing dependencies +ifneq ($(ARCH),aarch64) %.o:%.asm $(NASM) -MD $(@:.o=.d) -MT $@ -o $@ $(NASM_FLAGS) $< ifeq ($(CC_HAS_CET),1) $(LD) $(CET_LDFLAGS) -o $@.tmp $@ mv $@.tmp $@ -endif +endif # CC_HAS_CET +else +%.o:%.S + $(CC) -c $(CFLAGS) $< -o $@ +endif # AARCH64 all: $(EXE) diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index 3200536adc75716a0aaa6b161ea57a2817239073..97dc132769cbd60014a3defb0f67f9b31098594b 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -43,19 +43,21 @@ #define __forceinline static __forceinline #define __func__ __FUNCTION__ #define strcasecmp _stricmp -#else +#else /* _WIN32 */ #include +#ifdef __x86_64__ #include +#endif /* __x86_64__ */ #define __forceinline static inline __attribute__((always_inline)) #include #include #if defined (__FreeBSD__) #include typedef cpuset_t cpu_set_t; -#else +#else /* __FreeBSD__ */ #include -#endif -#endif +#endif /* __FreeBSD__ */ +#endif /* _WIN32 */ #include @@ -112,6 +114,7 @@ enum arch_type_e { ARCH_AVX, ARCH_AVX2, ARCH_AVX512, + ARCH_AARCH64, NUM_ARCHS }; @@ -192,13 +195,13 @@ enum test_hash_alg_e { /* Struct storing cipher parameters */ struct params_s { - IMB_CIPHER_DIRECTION cipher_dir; - enum test_cipher_mode_e cipher_mode; - enum test_hash_alg_e hash_alg; - uint32_t aes_key_size; - uint32_t size_aes; - uint64_t aad_size; - uint32_t num_sizes; + IMB_CIPHER_DIRECTION cipher_dir; + enum test_cipher_mode_e cipher_mode; + enum test_hash_alg_e hash_alg; + uint32_t aes_key_size; + uint32_t size_aes; + uint64_t aad_size; + uint32_t num_sizes; uint32_t core; }; @@ -223,7 +226,8 @@ const struct str_value_mapping arch_str_map[] = { {.name = "SSE", .values.arch_type = ARCH_SSE }, {.name = "AVX", .values.arch_type = ARCH_AVX }, {.name = "AVX2", .values.arch_type = ARCH_AVX2 }, - {.name = "AVX512", .values.arch_type = ARCH_AVX512 } + {.name = "AVX512", .values.arch_type = ARCH_AVX512 }, + {.name = "AARCH64",.values.arch_type = ARCH_AARCH64 }, }; const struct str_value_mapping 
cipher_algo_str_map[] = { @@ -879,7 +883,7 @@ struct custom_job_params custom_job_params = { .cipher_dir = IMB_DIR_ENCRYPT }; -uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1}; /* uses all function sets */ +uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1}; /* uses all function sets */ int use_gcm_job_api = 0; int use_gcm_sgl_api = 0; int use_unhalted_cycles = 0; /* read unhalted cycles instead of tsc */ @@ -1064,8 +1068,8 @@ static int set_affinity(const int cpu) /* Set affinity of current process to cpu */ #if defined(__FreeBSD__) - ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, - sizeof(cpuset), &cpuset); + ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, + sizeof(cpuset), &cpuset); #else ret = sched_setaffinity(0, sizeof(cpuset), &cpuset); #endif @@ -1158,6 +1162,16 @@ static int set_avg_unhalted_cycle_cost(const int core, uint64_t *value) return 0; } +static inline uint64_t perf_rdtscp(void) +{ +#ifdef __aarch64__ + return rdtscp(); +#else + uint32_t aux; + return __rdtscp(&aux); +#endif +} + /* Freeing allocated memory */ static void free_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) { @@ -1238,7 +1252,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) uint8_t *buf = NULL; imb_uint128_t *keys = NULL; #ifdef LINUX - int ret; + int ret; #endif if (p_keys == NULL || p_buffer == NULL) { @@ -1249,7 +1263,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) #ifdef LINUX ret = posix_memalign((void **) &buf, alignment, bufs_size); - if (ret != 0) { + if (ret != 0) { fprintf(stderr, "Could not malloc buf\n"); exit(EXIT_FAILURE); } @@ -1263,7 +1277,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) #ifdef LINUX ret = posix_memalign((void **) &keys, alignment, keys_size); - if (ret != 0) { + if (ret != 0) { fprintf(stderr, "Could not allocate memory for keys!\n"); free_mem(&buf, &keys); exit(EXIT_FAILURE); @@ -1588,11 +1602,10 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, static DECLARE_ALIGNED(imb_uint128_t auth_iv, 16); static uint32_t ipad[5], opad[5], digest[3]; static DECLARE_ALIGNED(uint32_t k1_expanded[11 * 4], 16); - static DECLARE_ALIGNED(uint8_t k2[16], 16); - static DECLARE_ALIGNED(uint8_t k3[16], 16); + static DECLARE_ALIGNED(uint8_t k2[16], 16); + static DECLARE_ALIGNED(uint8_t k3[16], 16); static DECLARE_ALIGNED(struct gcm_key_data gdata_key, 512); uint64_t time = 0; - uint32_t aux; uint8_t gcm_key[32]; uint8_t next_iv[IMB_AES_BLOCK_SIZE]; @@ -1863,7 +1876,7 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); for (i = 0; i < num_iter; i++) { job = IMB_GET_NEXT_JOB(mb_mgr); @@ -1909,7 +1922,7 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, time = (read_cycles(params->core) - rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; if (!num_iter) return time; @@ -1983,7 +1996,6 @@ do_test_gcm(struct params_s *params, uint8_t *key; uint8_t *aad = NULL; uint64_t time = 0; - uint32_t aux; key = (uint8_t *) malloc(sizeof(uint8_t) * params->aes_key_size); if (!key) { @@ -2020,7 +2032,7 @@ do_test_gcm(struct params_s *params, time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); if (params->aes_key_size == IMB_KEY_128_BYTES) { if (use_gcm_sgl_api) @@ -2074,14 +2086,14 @@ do_test_gcm(struct params_s *params, rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; } else { /*DECRYPT*/ #ifndef 
_WIN32 if (use_unhalted_cycles) time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); if (params->aes_key_size == IMB_KEY_128_BYTES) { if (use_gcm_sgl_api) @@ -2135,7 +2147,7 @@ do_test_gcm(struct params_s *params, rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; } free(key); @@ -2270,8 +2282,8 @@ print_times(struct variant_s *variant_list, struct params_s *params, uint32_t sz; if (plot_output_option == 0) { - const char *func_names[4] = { - "SSE", "AVX", "AVX2", "AVX512" + const char *func_names[NUM_ARCHS] = { + "SSE", "AVX", "AVX2", "AVX512", "AARCH64" }; const char *c_mode_names[TEST_NUM_CIPHER_TESTS - 1] = { "CBC", "CNTR", "CNTR+8", "CNTR_BITLEN", "CNTR_BITLEN4", @@ -2481,11 +2493,12 @@ run_tests(void *arg) params.hash_alg = custom_job_params.hash_alg; /* Performing tests for each selected architecture */ - for (arch = ARCH_SSE; arch <= ARCH_AVX512; arch++) { + for (arch = ARCH_SSE; arch < NUM_ARCHS; arch++) { if (archs[arch] == 0) continue; switch (arch) { +#ifdef __x86_64__ case ARCH_SSE: init_mb_mgr_sse(p_mgr); break; @@ -2495,9 +2508,19 @@ run_tests(void *arg) case ARCH_AVX2: init_mb_mgr_avx2(p_mgr); break; - default: /* ARCH_AV512 */ + case ARCH_AVX512: init_mb_mgr_avx512(p_mgr); break; +#endif /* __x86_64__ */ + +#ifdef __aarch64__ + case ARCH_AARCH64: + init_mb_mgr_aarch64(p_mgr); + break; +#endif /* __aarch64__ */ + default: + fprintf(stderr, "Invalid architecture: %d\n", arch); + goto exit_failure; } process_variant(p_mgr, arch, ¶ms, @@ -2559,7 +2582,7 @@ static void usage(void) "-h: print this message\n" "-c: Use cold cache, it uses warm as default\n" "-w: Use warm cache\n" - "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512)\n" + "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64)\n" "--arch-best: detect available architectures and run only on the best one\n" "--cipher-dir: Select cipher direction to run on the custom test " "(encrypt/decrypt) (default = encrypt)\n" @@ -2664,6 +2687,7 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI; const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; + const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; IMB_MGR *p_mgr = NULL; enum arch_type_e arch_id; @@ -2693,6 +2717,9 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) if ((p_mgr->features & detect_sse) != detect_sse) arch_support[ARCH_SSE] = 0; + if ((p_mgr->features & detect_aarch64) != detect_aarch64) + arch_support[ARCH_AARCH64] = 0; + free_mb_mgr(p_mgr); return 0; @@ -2894,6 +2921,7 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI; const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; + const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; IMB_MGR *p_mgr = NULL; uint64_t detected_features = 0; @@ -2934,6 +2962,11 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) return 0; } + if ((detected_features & detect_aarch64) == detect_aarch64) { + arch_support[ARCH_AARCH64] = 1; + return 0; + } + fprintf(stderr, "Arch detection: no architecture available!\n"); return -1; } @@ -3220,38 +3253,38 @@ int main(int argc, char *argv[]) } num_sizes_list = JOB_SIZE_IMIX_LIST; - /* - * Calculate accumulated distribution of - * probabilities per job size - 
*/ - distribution_total[0] = imix_list[0]; - for (i = 1; i < (int)imix_list_count; i++) - distribution_total[i] = imix_list[i] + - distribution_total[i-1]; + /* + * Calculate accumulated distribution of + * probabilities per job size + */ + distribution_total[0] = imix_list[0]; + for (i = 1; i < (int)imix_list_count; i++) + distribution_total[i] = imix_list[i] + + distribution_total[i-1]; /* Use always same seed */ srand(0); - /* Calculate a random sequence of packet sizes, + /* Calculate a random sequence of packet sizes, based on distribution */ - for (i = 0; i < (int)JOB_SIZE_IMIX_LIST; i++) { - uint16_t random_number = rand() % - distribution_total[imix_list_count - 1]; + for (i = 0; i < (int)JOB_SIZE_IMIX_LIST; i++) { + uint16_t random_number = rand() % + distribution_total[imix_list_count - 1]; uint16_t j; - for (j = 0; j < imix_list_count; j++) - if (random_number < distribution_total[j]) - break; + for (j = 0; j < imix_list_count; j++) + if (random_number < distribution_total[j]) + break; - job_size_imix_list[i] = job_size_list[j]; - } + job_size_imix_list[i] = job_size_list[j]; + } - /* Calculate average buffer size for the IMIX distribution */ - for (i = 0; i < (int)imix_list_count; i++) - average_job_size += job_size_list[i] * - imix_list[i]; + /* Calculate average buffer size for the IMIX distribution */ + for (i = 0; i < (int)imix_list_count; i++) + average_job_size += job_size_list[i] * + imix_list[i]; - average_job_size /= - distribution_total[imix_list_count - 1]; + average_job_size /= + distribution_total[imix_list_count - 1]; } cipher_size_list = (uint32_t *) malloc(sizeof(uint32_t) * num_sizes_list); @@ -3317,6 +3350,9 @@ int main(int argc, char *argv[]) if (tsc_detect) fprintf(stderr, "TSC scaling to core cycles: %.3f\n", get_tsc_to_core_scale(turbo_enabled)); +#ifdef __aarch64__ + fprintf(stderr, "CNT frequency: %ld\n", read_cntfreq()); +#endif fprintf(stderr, "Authentication size = cipher size + %u\n" @@ -3330,6 +3366,7 @@ int main(int argc, char *argv[]) if (custom_job_params.cipher_mode == TEST_CCM) fprintf(stderr, "CCM AAD = %"PRIu64"\n", ccm_aad_size); +#ifdef __x86_64__ if (archs[ARCH_SSE]) { IMB_MGR *p_mgr = alloc_mb_mgr(flags); @@ -3343,6 +3380,15 @@ int main(int argc, char *argv[]) "Using" : "Not using"); free_mb_mgr(p_mgr); } +#else + IMB_MGR *p_mgr = alloc_mb_mgr(flags); + + if (p_mgr == NULL) { + fprintf(stderr, "Error allocating MB_MGR structure!\n"); + return EXIT_FAILURE; + } + free_mb_mgr(p_mgr); +#endif /* __x86_64__ */ memset(t_info, 0, sizeof(t_info)); init_offsets(cache_type); diff --git a/perf/misc.h b/perf/misc.h index 0f6b504dbccf1a32d7ac57e31113ef49455d99d6..2fcdbe9a63a65501935077b35cc958481be12efa 100644 --- a/perf/misc.h +++ b/perf/misc.h @@ -34,3 +34,8 @@ * @return Number of TSC cycles measured while in fixed cost loop */ uint64_t measure_tsc(const uint64_t cycles); + +#ifdef __aarch64__ +uint64_t rdtscp(void); +uint64_t read_cntfreq(void); +#endif diff --git a/perf/misc_aarch64.S b/perf/misc_aarch64.S new file mode 100644 index 0000000000000000000000000000000000000000..3a3d112ba87bfc2d13cdd8b289a231c8e7dfe45a --- /dev/null +++ b/perf/misc_aarch64.S @@ -0,0 +1,60 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+uint64_t rdtscp(void);
+uint64_t read_cntfreq(void);
+#endif
diff --git a/perf/misc_aarch64.S b/perf/misc_aarch64.S
new file mode 100644
index 0000000000000000000000000000000000000000..3a3d112ba87bfc2d13cdd8b289a231c8e7dfe45a
--- /dev/null
+++ b/perf/misc_aarch64.S
@@ -0,0 +1,60 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.global measure_tsc
+.type measure_tsc,%function
+.align 5
+measure_tsc:
+        mov x5,x0
+        mrs x6,CNTVCT_EL0
+fixed_loop:
+        sub x5,x5,#1
+        cbnz x5,fixed_loop
+
+        mrs x7,CNTVCT_EL0
+        sub x0,x7,x6
+
+        ret
+
+
+.global rdtscp
+.type rdtscp,%function
+.align 5
+rdtscp:
+        mrs x0,CNTVCT_EL0
+
+        ret
+
+
+.global read_cntfreq
+.type read_cntfreq,%function
+.align 5
+read_cntfreq:
+        mrs x0,CNTFRQ_EL0
+
+        ret
diff --git a/test/Makefile b/test/Makefile
index 949001c509bf0f99f8185c33ea417a6afe5298fe..8aab75995d331f6648275d8a0cb0fbb64a6b26d0 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -32,6 +32,8 @@ ACVP_APP := acvp_app
 INSTPATH ?= /usr/include/ipsec-mb.h
 LIB_DIR ?= ../lib
 
+ARCH = $(shell uname -m)
+
 USE_YASM ?= n
 YASM ?= yasm
 NASM ?= nasm
@@ -48,11 +50,13 @@ CFLAGS = -MMD -D_GNU_SOURCE -DNO_COMPAT_IMB_API_053 \
 
 # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported
 # "-fcf-protection" needs to be checked separately
+ifeq ($(ARCH),x86_64)
 ifeq ($(MINGW),0)
 CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \
              $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l))
 CET_LDFLAGS=-r -z ibt -z shstk
-endif
+endif # MINGW
+endif # x86_64
 
 ifeq ($(CC_HAS_CET),1)
 CFLAGS += -fcf-protection=full
@@ -97,8 +101,8 @@ ifeq ($(MINGW),0)
 CFLAGS += -O3
 else
 CFLAGS += -O2
-endif
-endif
+endif # MINGW
+endif # DEBUG
 
 ACVP_LOC ?= /usr/local/acvp
 ACVP_HDR ?= $(ACVP_LOC)/include
@@ -108,6 +112,27 @@ ACVP_LDFLAGS = -L$(ACVP_LIB) $(LDFLAGS)
 ACVP_LDLIBS = -lacvp $(LDLIBS)
 
 # ipsec_MB_testapp modules
+ifeq ($(ARCH),aarch64)
+SOURCES := main.c utils.c api_test.c snow3g_test.c direct_api_test.c clear_mem_test.c
+OBJECTS := $(SOURCES:%.c=%.o)
+
+# ipsec_xvalid_test modules
+XVALID_ASM := misc_aarch64.S
+XVALID_SOURCES := ipsec_xvalid.c utils.c
+XVALID_OBJECTS := $(XVALID_SOURCES:%.c=%.o) $(XVALID_ASM:%.S=%.o)
+
+# fuzz modules
+FUZZ_SOURCES := job_api_fuzz_test.c
+
+# list of present dependency files
+DEP_FILES = $(wildcard ./*.d)
+
+# rule for compiling assembly code and producing dependencies
+%.o:%.S
+	$(CC) -c $(CFLAGS) $< -o $@
+endif # aarch64
+
+ifeq ($(ARCH),x86_64)
 SOURCES := main.c gcm_test.c ctr_test.c customop_test.c des_test.c ccm_test.c \
 	cmac_test.c utils.c hmac_sha1_test.c hmac_sha256_sha512_test.c \
 	hmac_md5_test.c aes_test.c sha_test.c chained_test.c api_test.c pon_test.c \
@@ -144,7 +169,8 @@ endif
 ifeq ($(CC_HAS_CET),1)
 	$(LD) $(CET_LDFLAGS) -o $@.tmp $@
 	mv $@.tmp $@
-endif
+endif # CC_HAS_CET
+endif # x86_64
 
 # targets come here
 all: $(TEST_APP) $(XVALID_APP) $(FUZZ_APP) $(ACVP_APP)
diff --git a/test/api_test.c b/test/api_test.c
index b50a0cce60be476ae6065b6e92d3bc80bfcbfac4..476a9a89f08fe91905b70d34884d2cf2786ff3e8 100644
--- a/test/api_test.c
+++ b/test/api_test.c
@@ -679,6 +679,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -709,6 +713,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -741,6 +749,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -793,6 +805,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                     hash == IMB_AUTH_CRC6_IUUP_HEADER ||
                     hash == IMB_AUTH_POLY1305)
                         continue;
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -847,6 +863,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
         for (dir = IMB_DIR_ENCRYPT; dir <= IMB_DIR_DECRYPT; dir++)
                 for (hash = IMB_AUTH_HMAC_SHA_1; hash < IMB_AUTH_NUM;
                      hash++) {
+#ifdef __aarch64__
+                        if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                                continue;
+#endif
 
                         switch (hash) {
                         /*
@@ -1060,6 +1080,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1089,6 +1113,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1118,6 +1146,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1148,6 +1180,12 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
              order++)
                 for (cipher = IMB_CIPHER_CBC; cipher < IMB_CIPHER_NUM;
                      cipher++) {
+#ifdef __aarch64__
+                        if ((cipher != IMB_CIPHER_NULL) &&
+                            (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                                continue;
+#endif
+
                         fill_in_job(&template_job, cipher, IMB_DIR_ENCRYPT,
                                     hash, order, &chacha_ctx, &gcm_ctx);
@@ -1181,6 +1219,11 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
              order++)
                 for (cipher = IMB_CIPHER_CBC; cipher < IMB_CIPHER_NUM;
                      cipher++) {
+#ifdef __aarch64__
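+                        /*
+                         * The aarch64 port currently implements only the
+                         * SNOW3G and NULL cipher paths, so skip the rest.
+                         */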
+                        if ((cipher != IMB_CIPHER_NULL) &&
+                            (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                                continue;
+#endif
                         /*
                          * Skip cipher algorithms belonging to AEAD
                          * algorithms, as the test is for cipher
@@ -1251,6 +1294,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1298,6 +1345,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1503,6 +1554,7 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
          * OTHER MISC TESTS
          */
 
+#ifndef __aarch64__
         /* CBCS NULL NEXT IV TEST */
         for (order = IMB_ORDER_CIPHER_HASH; order <= IMB_ORDER_HASH_CIPHER;
              order++)
@@ -1529,6 +1581,7 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                         return 1;
                 printf(".");
         }
+#endif
 
         /* clean up */
         while (IMB_FLUSH_JOB(mb_mgr) != NULL)
diff --git a/test/clear_mem_test.c b/test/clear_mem_test.c
index 50804e007c1780b45247aa3367e115e334374b9b..2443bd9d5247cfec0d51e950d9350a168caea6a1 100644
--- a/test/clear_mem_test.c
+++ b/test/clear_mem_test.c
@@ -31,7 +31,15 @@
 #include 
 #include "ipsec-mb.h"
+
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
 #include "gcm_ctr_vectors_test.h"
+#endif /* __x86_64__ */
+
 #include "utils.h"
 
 #define MAX_RAND 1024
diff --git a/test/direct_api_test.c b/test/direct_api_test.c
index faba01e4bbfb5473442b5ab594120eec511d9788..d1718e2b0e3ed67c41cfc4b148d16758999156dd 100644
--- a/test/direct_api_test.c
+++ b/test/direct_api_test.c
@@ -33,7 +33,15 @@
 #include 
 #include 
+
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
 #include "gcm_ctr_vectors_test.h"
+#endif /* __x86_64__ */
+
 #include "utils.h"
 
 #define BUF_SIZE ((uint32_t)sizeof(struct gcm_key_data))
@@ -64,6 +72,7 @@ seg_handler(int signum)
 }
 #endif /* DEBUG */
 
+#ifndef __aarch64__
 /*
  * @brief Performs direct GCM API invalid param tests
  */
@@ -875,6 +884,7 @@ test_kasumi_api(struct IMB_MGR *mgr)
         printf("\n");
         return 0;
 }
+#endif /* __aarch64__ */
 
 /*
  * @brief Performs direct SNOW3G API invalid param tests
@@ -1141,6 +1151,7 @@ direct_api_test(struct IMB_MGR *mb_mgr)
                 goto dir_api_exit;
         }
 
+#ifdef __x86_64__
         errors += test_gcm_api(mb_mgr);
         run++;
 
@@ -1158,6 +1169,7 @@ direct_api_test(struct IMB_MGR *mb_mgr)
 
         errors += test_kasumi_api(mb_mgr);
         run++;
+#endif /* __x86_64__ */
 
         errors += test_snow3g_api(mb_mgr);
         run++;
@@ -1170,5 +1182,5 @@ direct_api_test(struct IMB_MGR *mb_mgr)
 #ifndef DEBUG
         signal(SIGSEGV, handler);
 #endif
-	return errors;
+        return errors;
 }
diff --git a/test/ipsec_xvalid.c b/test/ipsec_xvalid.c
index df5112448339cb7ebcdfd6140043211b04083c95..5c861216994272009e7c6c9833d3939315bc4313 100644
--- a/test/ipsec_xvalid.c
+++ b/test/ipsec_xvalid.c
@@ -50,11 +50,18 @@
 #define __func__ __FUNCTION__
 #define strcasecmp _stricmp
 #else
-#include 
 #define BSWAP64 __builtin_bswap64
 #endif
 
 #include 
 
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif
+
+#ifdef __x86_64__
+#include 
+#endif
+
 /* maximum size of a test buffer */
 #define JOB_SIZE_TOP (16 * 1024)
 
@@ -99,10 +106,10 @@ static uint64_t pattern8_plain_text;
 
 struct params_s {
         IMB_CIPHER_MODE cipher_mode; /* CBC, CNTR, DES, GCM etc. */
         IMB_HASH_ALG hash_alg; /* SHA-1 or others... */
-	uint32_t key_size;
-	uint32_t buf_size;
-	uint64_t aad_size;
-	uint32_t num_sizes;
+        uint32_t key_size;
+        uint32_t buf_size;
+        uint64_t aad_size;
+        uint32_t num_sizes;
 };
 
 /* Struct storing all expanded keys */
@@ -112,8 +119,8 @@ struct cipher_auth_keys {
         uint8_t ipad[IMB_SHA512_DIGEST_SIZE_IN_BYTES];
         uint8_t opad[IMB_SHA512_DIGEST_SIZE_IN_BYTES];
         DECLARE_ALIGNED(uint32_t k1_expanded[15 * 4], 16);
-	DECLARE_ALIGNED(uint8_t k2[32], 16);
-	DECLARE_ALIGNED(uint8_t k3[16], 16);
+        DECLARE_ALIGNED(uint8_t k2[32], 16);
+        DECLARE_ALIGNED(uint8_t k3[16], 16);
         DECLARE_ALIGNED(uint32_t enc_keys[15 * 4], 16);
         DECLARE_ALIGNED(uint32_t dec_keys[15 * 4], 16);
         DECLARE_ALIGNED(struct gcm_key_data gdata_key, 64);
@@ -152,11 +159,12 @@ struct str_value_mapping {
 
 const struct str_value_mapping arch_str_map[] = {
         {.name = "NONE", .values.arch_type = IMB_ARCH_NONE },
-        {.name = "SSE", .values.arch_type = IMB_ARCH_SSE },
         {.name = "NO-AESNI", .values.arch_type = IMB_ARCH_NOAESNI },
+        {.name = "SSE", .values.arch_type = IMB_ARCH_SSE },
         {.name = "AVX", .values.arch_type = IMB_ARCH_AVX },
         {.name = "AVX2", .values.arch_type = IMB_ARCH_AVX2 },
-        {.name = "AVX512", .values.arch_type = IMB_ARCH_AVX512 }
+        {.name = "AVX512", .values.arch_type = IMB_ARCH_AVX512 },
+        {.name = "AARCH64", .values.arch_type = IMB_ARCH_AARCH64 },
 };
 
 struct str_value_mapping cipher_algo_str_map[] = {
@@ -673,8 +681,8 @@ struct custom_job_params custom_job_params = {
 };
 
 /* AESNI_EMU disabled by default */
-uint8_t enc_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1};
-uint8_t dec_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1};
+uint8_t enc_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1, 1};
+uint8_t dec_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1, 1};
 
 uint64_t flags = 0; /* flags passed to alloc_mb_mgr() */
 
@@ -1455,6 +1463,7 @@ prepare_keys(IMB_MGR *mb_mgr, struct cipher_auth_keys *keys,
         }
 
         switch (params->cipher_mode) {
+#ifndef __aarch64__
         case IMB_CIPHER_GCM:
                 switch (params->key_size) {
                 case IMB_KEY_128_BYTES:
@@ -1532,6 +1541,11 @@ prepare_keys(IMB_MGR *mb_mgr, struct cipher_auth_keys *keys,
                 memcpy(k2, ciph_key, 16);
                 memcpy(k2 + 16, ciph_key + 16, 16);
                 break;
+#else
+        case IMB_CIPHER_SNOW3G_UEA2_BITLEN:
+                memcpy(k2, ciph_key, 16);
+                break;
+#endif
         case IMB_CIPHER_NULL:
                 /* No operation needed */
                 break;
@@ -1609,8 +1623,9 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
 
         dump_gps();
         switch (arch) {
-        case IMB_ARCH_SSE:
+#ifdef __x86_64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 dump_xmms_sse();
                 simd_size = XMM_MEM_SIZE;
                 break;
@@ -1626,6 +1641,15 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
                 dump_zmms();
                 simd_size = ZMM_MEM_SIZE;
                 break;
+#endif
+
+#ifdef __aarch64__
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                dump_simd_regs();
+                simd_size = SIMD_MEM_SIZE;
+                break;
+#endif
 
         default:
                 fprintf(stderr, "Error getting the architecture\n");
@@ -1659,6 +1683,8 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
              ooo_ptr++, i++) {
                 void *ooo_mgr_p = *ooo_ptr;
 
+                if (ooo_mgr_p == NULL) continue;
+
                 if (search_patterns(ooo_mgr_p,
                                     get_ooo_mgr_size(ooo_mgr_p, i)) == 0) {
                         fprintf(stderr,
@@ -1675,8 +1701,9 @@ static void
 clear_scratch_simd(const IMB_ARCH arch)
 {
         switch (arch) {
-        case IMB_ARCH_SSE:
+#ifdef __x86_64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 clr_scratch_xmms_sse();
                 break;
         case IMB_ARCH_AVX:
@@ -1688,6 +1715,14 @@ clear_scratch_simd(const IMB_ARCH arch)
         case IMB_ARCH_AVX512:
                 clr_scratch_zmms();
                 break;
+#endif
+
+#ifdef __aarch64__
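+        /* both aarch64 arch types share the same ASIMD scratch registers */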
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                CLEAR_SCRATCH_SIMD_REGS();
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2348,8 +2383,9 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         }
 
         switch (enc_arch) {
-        case IMB_ARCH_SSE:
+#ifndef __aarch64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 init_mb_mgr_sse(enc_mgr);
                 break;
         case IMB_ARCH_AVX:
@@ -2361,6 +2397,12 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         case IMB_ARCH_AVX512:
                 init_mb_mgr_avx512(enc_mgr);
                 break;
+#else
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                init_mb_mgr_aarch64(enc_mgr);
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2385,8 +2427,9 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         }
 
         switch (dec_arch) {
-        case IMB_ARCH_SSE:
+#ifndef __aarch64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 init_mb_mgr_sse(dec_mgr);
                 break;
         case IMB_ARCH_AVX:
@@ -2398,6 +2441,12 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         case IMB_ARCH_AVX512:
                 init_mb_mgr_avx512(dec_mgr);
                 break;
+#else
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                init_mb_mgr_aarch64(dec_mgr);
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2423,7 +2472,11 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
                 /* Skip IMB_CIPHER_CUSTOM */
                 if (c_mode == IMB_CIPHER_CUSTOM)
                         continue;
-
+#ifdef __aarch64__
+                if ((c_mode != IMB_CIPHER_NULL) &&
+                    (c_mode != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                        continue;
+#endif
                 params->cipher_mode = c_mode;
 
                 for (hash_alg = IMB_AUTH_HMAC_SHA_1;
@@ -2432,7 +2485,11 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
                         /* Skip IMB_AUTH_CUSTOM */
                         if (hash_alg == IMB_AUTH_CUSTOM)
                                 continue;
-
+#ifdef __aarch64__
+                        if ((hash_alg != IMB_AUTH_NULL) &&
+                            (hash_alg != IMB_AUTH_SNOW3G_UIA2_BITLEN))
+                                continue;
+#endif
                         /* Skip not supported combinations */
                         if ((c_mode == IMB_CIPHER_GCM &&
                              hash_alg != IMB_AUTH_AES_GMAC) ||
@@ -2571,18 +2628,29 @@ static void usage(const char *app_name)
                 "where args are zero or more\n"
                 "-h: print this message\n"
                 "-v: verbose, prints extra information\n"
+#ifdef __aarch64__
+                "--enc-arch: encrypting with architecture "
+                "(NO-AESNI/AARCH64)\n"
+                "--dec-arch: decrypting with architecture "
+                "(NO-AESNI/AARCH64)\n"
+#else
                 "--enc-arch: encrypting with architecture "
                 "(NO-AESNI/SSE/AVX/AVX2/AVX512)\n"
                 "--dec-arch: decrypting with architecture "
                 "(NO-AESNI/SSE/AVX/AVX2/AVX512)\n"
+#endif
                 "--cipher-algo: Select cipher algorithm to run on the custom "
                 "test\n"
                 "--hash-algo: Select hash algorithm to run on the custom test\n"
                 "--aead-algo: Select AEAD algorithm to run on the custom test\n"
+#ifdef __aarch64__
+                "--no-aarch64: Don't do AARCH64\n"
+#else
                 "--no-avx512: Don't do AVX512\n"
                 "--no-avx2: Don't do AVX2\n"
                 "--no-avx: Don't do AVX\n"
                 "--no-sse: Don't do SSE\n"
+#endif
                 "--aesni-emu: Do AESNI_EMU (disabled by default)\n"
                 "--shani-on: use SHA extensions, default: auto-detect\n"
                 "--shani-off: don't use SHA extensions\n"
diff --git a/test/job_api_fuzz_test.c b/test/job_api_fuzz_test.c
index 7703402b34ee603e54674f92c4803558bbccd790..6e69c5fc9c3c34d53c5e5b82aaffc81f1a40400e 100644
--- a/test/job_api_fuzz_test.c
+++ b/test/job_api_fuzz_test.c
@@ -462,6 +462,12 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t dataSize)
         if (ar == NULL) {
                 init_mb_mgr_auto(p_mgr, &arch);
         } else {
+#ifdef __aarch64__
+                if (strcmp(ar, "aarch64") == 0)
+                        init_mb_mgr_aarch64(p_mgr);
+#endif /* aarch64 */
+
+#ifdef __x86_64__
(strcmp(ar, "avx") == 0) init_mb_mgr_avx(p_mgr); else if (strcmp(ar, "avx2") == 0) @@ -470,6 +476,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t dataSize) init_mb_mgr_avx512(p_mgr); else if (strcmp(ar, "sse") == 0) init_mb_mgr_sse(p_mgr); +#endif /* x86_64 */ else init_mb_mgr_auto(p_mgr, &arch); } diff --git a/test/main.c b/test/main.c index dda1518b0a5c7c530b8265bda72af87d08d6c393..fe5b2aaec84be1669e7a1344f7b2497b9b3ca701 100644 --- a/test/main.c +++ b/test/main.c @@ -65,7 +65,9 @@ extern int direct_api_param_test(struct IMB_MGR *mb_mgr); typedef int (*imb_test_t)(struct IMB_MGR *mb_mgr); +#ifdef __x86_64__ #include "do_test.h" +#endif #ifdef _WIN32 #define strcasecmp _stricmp @@ -80,6 +82,7 @@ struct imb_test { }; struct imb_test tests[] = { +#ifdef __x86_64__ { .str = "KAT", .fn = known_answer_test, @@ -91,7 +94,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "CTR", + .str = "CTR", .fn = ctr_test, .enabled = 1 }, @@ -101,7 +104,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "XCBC", + .str = "XCBC", .fn = xcbc_test, .enabled = 1 }, @@ -176,7 +179,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "CHAINED", + .str = "CHAINED", .fn = chained_test, .enabled = 1 }, @@ -239,26 +242,50 @@ struct imb_test tests[] = { .str = "DIRECT_API_PARAM", .fn = direct_api_param_test, .enabled = 1 - } + }, +#endif + +#ifdef __aarch64__ + { + .str = "SNOW3G", + .fn = snow3g_test, + .enabled = 1 + }, + { + .str = "API", + .fn = api_test, + .enabled = 1 + }, + { + .str = "DIRECT_API", + .fn = direct_api_test, + .enabled = 1 + }, + { + .str = "CLEAR_MEM", + .fn = clear_mem_test, + .enabled = 1 + }, +#endif }; static void usage(const char *name) { - fprintf(stderr, + fprintf(stderr, "Usage: %s [args], where args are zero or more\n" "--test-type TEST_NAME : Run single test type\n" "--stop-on-fail: Stop test execution if a test fails\n" "--no-aesni-emu: Don't do AESNI emulation\n" "--no-avx512: Don't do AVX512\n" - "--no-avx2: Don't do AVX2\n" - "--no-avx: Don't do AVX\n" - "--no-sse: Don't do SSE\n" + "--no-avx2: Don't do AVX2\n" + "--no-avx: Don't do AVX\n" + "--no-sse: Don't do SSE\n" "--auto-detect: auto detects current architecture " "to run the tests\n Note: Auto detection " "option now run by default and will be removed in the future\n" - "--shani-on: use SHA extensions, default: auto-detect\n" - "--shani-off: don't use SHA extensions\n", name); + "--shani-on: use SHA extensions, default: auto-detect\n" + "--shani-off: don't use SHA extensions\n", name); } static void @@ -281,6 +308,7 @@ print_hw_features(void) { IMB_FEATURE_GFNI, "GFNI" }, { IMB_FEATURE_AVX512_IFMA, "AVX512-IFMA" }, { IMB_FEATURE_BMI2, "BMI2" }, + { IMB_FEATURE_AARCH64, "AARCH64" }, }; IMB_MGR *p_mgr = NULL; unsigned i; @@ -366,17 +394,17 @@ main(int argc, char **argv) if (detect_arch(arch_support) < 0) return EXIT_FAILURE; - for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-h") == 0) { - usage(argv[0]); - return EXIT_SUCCESS; - } else if (update_flags_and_archs(argv[i], + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + usage(argv[0]); + return EXIT_SUCCESS; + } else if (update_flags_and_archs(argv[i], arch_support, &flags)) - continue; - else if (strcmp(argv[i], "--auto-detect") == 0) + continue; + else if (strcmp(argv[i], "--auto-detect") == 0) (void) auto_detect; /* legacy option - to be removed */ - else if (strcmp(argv[i], "--stop-on-fail") == 0) + else if (strcmp(argv[i], "--stop-on-fail") == 0) stop_on_fail = 1; else if (strcmp(argv[i], "--test-type") == 0) { 
                         unsigned selected_test;
@@ -396,6 +424,10 @@ main(int argc, char **argv)
                         }
                         i++;
                 }
+                else {
+                        usage(argv[0]);
+                        return EXIT_FAILURE;
+                }
         }
 
         /* Go through architectures */
@@ -427,8 +459,16 @@ main(int argc, char **argv)
                 }
 
                 switch (atype) {
-                case IMB_ARCH_SSE:
+#ifdef __aarch64__
                 case IMB_ARCH_NOAESNI:
+                case IMB_ARCH_AARCH64:
+                        init_mb_mgr_aarch64(p_mgr);
+                        break;
+#endif
+
+#ifdef __x86_64__
+                case IMB_ARCH_NOAESNI:
+                case IMB_ARCH_SSE:
                         init_mb_mgr_sse(p_mgr);
                         break;
                 case IMB_ARCH_AVX:
@@ -440,6 +480,7 @@ main(int argc, char **argv)
                 case IMB_ARCH_AVX512:
                         init_mb_mgr_avx512(p_mgr);
                         break;
+#endif
                 }
 
                 print_tested_arch(p_mgr->features, atype);
diff --git a/test/misc.h b/test/misc.h
index 2fc3be284fe5f3d843bcd10af00a30dc68995ebc..1d1e2563e767771f042eff03093a5883c29485d3 100644
--- a/test/misc.h
+++ b/test/misc.h
@@ -28,6 +28,7 @@
 #ifndef XVALIDAPP_MISC_H
 #define XVALIDAPP_MISC_H
 
+#ifdef __x86_64__
 /* RAX, RBX, RCX, RDX, RDI, RSI, R8-R15 */
 #define GP_MEM_SIZE 14*8
 
@@ -55,4 +56,25 @@ void clr_scratch_xmms_avx(void);
 void clr_scratch_ymms(void);
 void clr_scratch_zmms(void);
 
+#endif /* __x86_64__ */
+
+#ifdef __aarch64__
+/* x0-x28 */
+#define GP_MEM_SIZE 29*8
+
+#define SIMD_MEM_SIZE 32*16
+
+/* Memory allocated */
+uint8_t gps[GP_MEM_SIZE];
+uint8_t simd_regs[SIMD_MEM_SIZE];
+
+/* Read the stack pointer */
+void *rdrsp(void);
+
+/* Functions to dump all registers into predefined memory */
+void dump_gps(void);
+void dump_simd_regs(void);
+
+#endif /* __aarch64__ */
+
 #endif /* XVALIDAPP_MISC_H */
diff --git a/test/misc_aarch64.S b/test/misc_aarch64.S
new file mode 100644
index 0000000000000000000000000000000000000000..a355ca2ff0594385abcbecd3deaf9f16dfa445c7
--- /dev/null
+++ b/test/misc_aarch64.S
@@ -0,0 +1,87 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.global dump_gps
+.type dump_gps,%function
+.align 5
+dump_gps:
+        str x28,[sp,#-16]!
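+        /* x28 is used as the base pointer below; it is saved here, then
+         * stored into the buffer (and restored) after x0-x27 */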
+        ldr x28,=gps
+        stp x0,x1,[x28]
+        stp x2,x3,[x28,#16]
+        stp x4,x5,[x28,#32]
+        stp x6,x7,[x28,#48]
+        stp x8,x9,[x28,#64]
+        stp x10,x11,[x28,#80]
+        stp x12,x13,[x28,#96]
+        stp x14,x15,[x28,#112]
+        stp x16,x17,[x28,#128]
+        stp x18,x19,[x28,#144]
+        stp x20,x21,[x28,#160]
+        stp x22,x23,[x28,#176]
+        stp x24,x25,[x28,#192]
+        stp x26,x27,[x28,#208]
+        ldr x0,[sp],#16
+        str x0,[x28,#224]
+        mov x28,x0
+
+        ret
+
+
+.global dump_simd_regs
+.type dump_simd_regs,%function
+.align 5
+dump_simd_regs:
+        ldr x0,=simd_regs
+        stp q0,q1,[x0]
+        stp q2,q3,[x0,#32]
+        stp q4,q5,[x0,#64]
+        stp q6,q7,[x0,#96]
+        stp q8,q9,[x0,#128]
+        stp q10,q11,[x0,#160]
+        stp q12,q13,[x0,#192]
+        stp q14,q15,[x0,#224]
+        stp q16,q17,[x0,#256]
+        stp q18,q19,[x0,#288]
+        stp q20,q21,[x0,#320]
+        stp q22,q23,[x0,#352]
+        stp q24,q25,[x0,#384]
+        stp q26,q27,[x0,#416]
+        stp q28,q29,[x0,#448]
+        stp q30,q31,[x0,#480]
+
+        ret
+
+
+.global rdrsp
+.type rdrsp,%function
+.align 5
+rdrsp:
+        mov x0,sp
+
+        ret
diff --git a/test/utils.c b/test/utils.c
index f16b3b8f4fbf6f4294fd6091f1da96b616eb65ec..46912677f37abd60ff2f3739955e193b0337fc30 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -181,6 +181,8 @@ update_flags_and_archs(const char *arg,
                 arch_support[IMB_ARCH_AVX] = 0;
         else if (strcmp(arg, "--no-sse") == 0)
                 arch_support[IMB_ARCH_SSE] = 0;
+        else if (strcmp(arg, "--no-aarch64") == 0)
+                arch_support[IMB_ARCH_AARCH64] = 0;
         else if (strcmp(arg, "--aesni-emu") == 0)
                 arch_support[IMB_ARCH_NOAESNI] = 1;
         else if (strcmp(arg, "--no-aesni-emu") == 0)
@@ -212,7 +214,16 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM])
                 IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI;
         const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx;
         const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2;
+
+        const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI;
+
+#ifdef __x86_64__
         const uint64_t detect_noaesni = IMB_FEATURE_SSE4_2 | IMB_FEATURE_CMOV;
+#endif
+
+#ifdef __aarch64__
+        const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD;
+#endif
 
         IMB_MGR *p_mgr = NULL;
         IMB_ARCH arch_id;
@@ -246,13 +257,17 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM])
         if ((p_mgr->features & detect_noaesni) != detect_noaesni)
                 arch_support[IMB_ARCH_NOAESNI] = 0;
 
+        if ((p_mgr->features & detect_aarch64) != detect_aarch64)
+                arch_support[IMB_ARCH_AARCH64] = 0;
+
         free_mb_mgr(p_mgr);
 
         if (arch_support[IMB_ARCH_NOAESNI] == 0 &&
             arch_support[IMB_ARCH_SSE] == 0 &&
             arch_support[IMB_ARCH_AVX] == 0 &&
             arch_support[IMB_ARCH_AVX2] == 0 &&
-            arch_support[IMB_ARCH_AVX512] == 0) {
+            arch_support[IMB_ARCH_AVX512] == 0 &&
+            arch_support[IMB_ARCH_AARCH64] == 0) {
                 fprintf(stderr, "No available architecture detected!\n");
                 return -1;
         }
@@ -270,7 +285,7 @@ void
 print_tested_arch(const uint64_t features, const IMB_ARCH arch)
 {
         static const char *arch_str_tab[IMB_ARCH_NUM] = {
-                "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512"
+                "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64"
         };
         const char *feat = "";
 
@@ -278,6 +293,7 @@ print_tested_arch(const uint64_t features, const IMB_ARCH arch)
         case IMB_ARCH_NOAESNI:
         case IMB_ARCH_AVX2:
         case IMB_ARCH_AVX:
+        case IMB_ARCH_AARCH64:
                 break;
         case IMB_ARCH_SSE:
                 if (features & IMB_FEATURE_SHANI) {