diff --git a/README.md b/README.md index d683ff03fbe4bb0c1585c7b38a956c2bc00810f2..056812213331933d714a7a9b440daa797f9433aa 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Table 1. List of supported cipher algorithms and their implementations. | KASUMI-F8 | Y | N | N | N | N | N | N | | ZUC-EEA3 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | | ZUC-EEA3-256 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | -| SNOW3G-UEA2 | N | Y | Y | Y | Y x16 | Y x16 | N | +| SNOW3G-UEA2 | N | Y | Y | Y | Y x16 | Y x16 | Y | | AES128-CBCS(9) | N | Y(1) | Y(3) | N | N | Y(6) | N | | Chacha20 | N | Y | Y | Y | Y | N | N | | Chacha20 AEAD | N | Y | Y | Y | Y | N | N | @@ -133,7 +133,7 @@ Table 2. List of supported integrity algorithms and their implementations. | KASUMI-F9 | Y | N | N | N | N | N | N | | ZUC-EIA3 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | | ZUC-EIA3-256 | N | Y x4 | Y x4 | Y x8 | Y x16 | Y x16 | N | -| SNOW3G-UIA2(8) | N | Y by4 | Y by4 | N | Y by32 | Y by32 | N | +| SNOW3G-UIA2(8) | N | Y by4 | Y by4 | N | Y by32 | Y by32 | Y | | DOCSIS-CRC32(4) | N | Y | Y | N | Y | Y | N | | HEC | N | Y | Y | N | N | N | N | | POLY1305 | Y | N | N | N | Y | Y | N | diff --git a/lib/Makefile b/lib/Makefile index 14cc191af5c98e4d7a7fdf8d4e991d9cba385d66..195fc76f78cd72afed226aeb0a7a2190f99e06cc 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -24,7 +24,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # - LIB = libIPSec_MB SHARED ?= y IMB_HDR = ipsec-mb.h @@ -35,6 +34,8 @@ ifeq ($(IMB_VERSION),) $(error "Failed to detect library version!") endif +ARCH = $(shell uname -m) + VERSION = $(shell echo $(IMB_VERSION) | cut -d. -f1-3) SO_VERSION = $(shell echo $(VERSION) | cut -d. -f1) @@ -46,6 +47,7 @@ MAN1 = libipsec-mb.7 MAN2 = libipsec-mb-dev.7 NOLDCONFIG ?= n +ifeq ($(ARCH),x86_64) USE_YASM ?= n YASM ?= yasm NASM ?= nasm @@ -66,12 +68,14 @@ NASM_GE_MINOR = $(shell [ $(NASM_MINOR_VER) -ge $(NASM_MINOR_REQ) ] && echo true ifneq ($(NASM_GE_MAJOR),true) $(warning "NASM version found: $(NASM_VERSION)") $(error "Minimum required: $(NASM_MAJOR_REQ).$(NASM_MINOR_REQ)") -endif +endif # NASM_GE_MAJOR ifneq ($(NASM_GE_MINOR),true) $(warning "NASM version found: $(NASM_VERSION)") $(error "Minimum required: $(NASM_MAJOR_REQ).$(NASM_MINOR_REQ)") -endif -endif +endif # NASM_GE_MINOR +endif # NASM_VERSION + +endif # x86_64 OBJ_DIR ?= obj LIB_DIR ?= . 
@@ -85,11 +89,13 @@ MINGW ?= $(shell $(CC) -dM -E - < /dev/null | grep -i mingw | wc -l | sed 's/^ * # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported # "-fcf-protection" needs to be checked separately +ifeq ($(ARCH),x86_64) ifeq ($(MINGW),0) CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \ $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l)) CET_LDFLAGS=-r -z ibt -z shstk -endif +endif # MINGW +endif # x86_64 CFLAGS := -DNO_COMPAT_IMB_API_053 $(EXTRA_CFLAGS) $(INCLUDES) \ -W -Wall -Wextra -Wmissing-declarations -Wpointer-arith \ -Wcast-qual -Wundef -Wwrite-strings \ @@ -116,7 +122,7 @@ NASM_FLAGS := -Werror -fwin64 -Xvc -gcv8 -DWIN_ABI $(NASM_INCLUDES) else YASM_FLAGS := -f x64 -f elf64 -X gnu -g dwarf2 -DLINUX -D__linux__ $(YASM_INCLUDES) NASM_FLAGS := -Werror -felf64 -Xgnu -gdwarf -DLINUX -D__linux__ $(NASM_INCLUDES) -endif +endif # MINGW DEBUG_OPT ?= -O0 ifeq ($(DEBUG),y) @@ -130,8 +136,8 @@ CFLAGS += -fstack-protector -D_FORTIFY_SOURCE=2 else OPT = -O2 LDFLAGS += -s -endif -endif +endif # MINGW +endif # DEBUG ifeq ($(SAFE_OPTIONS), n) SAFE_DATA = n @@ -168,40 +174,56 @@ CFLAGS_NO_SIMD = $(CFLAGS) -O1 CFLAGS += $(OPT) # Set generic architectural optimizations +ifeq ($(ARCH),x86_64) OPT_X86 := -msse4.2 OPT_SSE := -msse4.2 -maes -mpclmul OPT_AVX := -mavx -maes -mpclmul OPT_AVX2 := -mavx2 -maes -mpclmul OPT_AVX512 := -mavx2 -maes -mpclmul # -mavx512f is not available until gcc 4.9 OPT_NOAESNI := -msse4.2 -mno-aes -mno-pclmul +endif # x86_64 + +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_NOAESNI := -march=armv8-a +endif # aarch64 # Set architectural optimizations for GCC/CC ifeq ($(CC),$(filter $(CC),gcc cc)) GCC_VERSION = $(shell $(CC) -dumpversion | cut -d. -f1) GCC_GE_V5 = $(shell [ $(GCC_VERSION) -ge 5 ] && echo true) ifeq ($(GCC_GE_V5),true) +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_NOAESNI := -march=armv8-a +else OPT_SSE := -march=nehalem -maes -mpclmul OPT_AVX := -march=sandybridge -maes -mpclmul OPT_AVX2 := -march=haswell -maes -mpclmul OPT_AVX512 := -march=broadwell -maes -mpclmul OPT_NOAESNI := -march=nehalem -mno-pclmul -endif -endif +endif # AARCH64 +endif # GCC +endif # CC # Set architectural optimizations for clang ifeq ($(CC),$(filter $(CC),clang)) CLANG_VERSION = $(shell $(CC) --version | head -n 1 | cut -d ' ' -f 3) CLANG_GE_V381 = $(shell test "$(CLANG_VERSION)" \> "3.8.0" && echo true) ifeq ($(CLANG_GE_V381),true) +ifeq ($(ARCH),aarch64) +OPT_AARCH64 := -march=armv8-a+crypto+aes +else OPT_SSE := -march=nehalem -maes -mpclmul OPT_AVX := -march=sandybridge -maes -mpclmul OPT_AVX2 := -march=haswell -maes -mpclmul OPT_AVX512 := -march=broadwell -maes -mpclmul -endif +endif # AARCH64 +endif # CLANG # remove CFLAGS that clang warns about CFLAGS := $(subst -fno-delete-null-pointer-checks,,$(CFLAGS)) CFLAGS := $(subst -fno-strict-overflow,,$(CFLAGS)) -endif +endif # CC # so or static build ifeq ($(SHARED),y) @@ -211,7 +233,7 @@ LIBNAME = $(LIB).dll else LIBNAME = $(LIB).so.$(VERSION) LDFLAGS += -Wl,-z,noexecstack -Wl,-z,relro -Wl,-z,now -endif +endif # MINGW LIBPERM = 0755 ifeq ($(CC_HAS_CET),1) LDFLAGS += -fcf-protection=full -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error @@ -224,7 +246,7 @@ LDFLAGS += -g ifeq ($(CC_HAS_CET),1) LDFLAGS += -fcf-protection=full endif -endif +endif # shared # warning messages SAFE_PARAM_MSG1="SAFE_PARAM option not set." 
@@ -242,6 +264,24 @@ SAFE_OPTIONS_MSG2="All safe options enabled by default." # # List of C modules (any origin) # +ifeq ($(ARCH),aarch64) +c_lib_objs := \ + mb_mgr_aarch64.o \ + mb_mgr_auto_aarch64.o \ + mb_mgr_aarch64_no_aesni.o \ + alloc_aarch64.o \ + clear_mem_aarch64.o \ + cpu_features_aarch64.o \ + version.o \ + aesni_emu.o \ + snow3g_aarch64.o \ + snow3g_aarch64_no_aesni.o \ + snow3g_tables.o \ + snow3g_iv.o \ + error.o +asm_generic_lib_objs := \ + lookup_16x8bit_neon.o +else c_lib_objs := \ mb_mgr_avx.o \ mb_mgr_avx2.o \ @@ -300,7 +340,6 @@ asm_generic_lib_objs := \ crc32_const.o \ poly1305.o \ chacha20_poly1305.o - # # List of ASM modules (no-aesni directory) # @@ -604,8 +643,8 @@ asm_avx512_lib_objs := \ mb_mgr_zuc_submit_flush_avx512.o \ mb_mgr_zuc_submit_flush_gfni_avx512.o \ chacha20_avx512.o \ - poly_avx512.o \ - poly_fma_avx512.o \ + poly_avx512.o \ + poly_fma_avx512.o \ ethernet_fcs_avx512.o \ crc16_x25_avx512.o \ crc32_refl_by16_vclmul_avx512.o \ @@ -649,9 +688,15 @@ asm_avx512_gcm_objs := \ aes128_gmac_by48_api_vaes_avx512.o aes192_gmac_by48_api_vaes_avx512.o aes256_gmac_by48_api_vaes_avx512.o \ aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o +endif # AARCH64 + # # build object files lists # +ifeq ($(ARCH),aarch64) +asm_obj_files := $(asm_generic_lib_objs) +c_obj_files := $(c_lib_objs) +else asm_obj_files := $(asm_generic_lib_objs) \ $(asm_sse_lib_objs) $(asm_sse_gcm_objs) \ $(asm_avx_lib_objs) $(asm_avx_gcm_objs) \ @@ -662,6 +707,7 @@ ifeq ($(AESNI_EMU), y) asm_obj_files := $(asm_obj_files) $(asm_noaesni_lib_objs) $(asm_noaesni_gcm_objs) endif c_obj_files := $(c_lib_objs) $(c_gcm_objs) +endif # AARCH64 # # aggregate all objects files together and prefix with OBJDIR @@ -705,7 +751,7 @@ else endif else $(AR) -qcs $@ $^ -endif +endif # SHARED ifeq ($(SAFE_PARAM), n) @echo "NOTE:" $(SAFE_PARAM_MSG1) $(SAFE_PARAM_MSG2) endif @@ -764,6 +810,16 @@ $(dep_target_files): | $(OBJ_DIR) # - dependency file construction is part of the compilation # +ifeq ($(ARCH),aarch64) +$(OBJ_DIR)/%.o:aarch64/%.c + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:x86_64/%.c + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:aarch64/%.S + $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ +$(OBJ_DIR)/%.o:no-aesni/%.c + $(CC) -MMD $(OPT_NOAESNI) -c $(CFLAGS) $< -o $@ +else $(OBJ_DIR)/%.o:x86_64/%.c $(CC) -MMD $(OPT_X86) -c $(CFLAGS) $< -o $@ @@ -846,7 +902,8 @@ endif ifeq ($(CC_HAS_CET),1) $(LD) $(CET_LDFLAGS) -o $@.tmp $@ mv $@.tmp $@ -endif +endif # CC_HAS_CET +endif # AARCH64 $(OBJ_DIR): mkdir $(OBJ_DIR) @@ -905,7 +962,7 @@ help: @echo "SAFE_OPTIONS=n" @echo " - Disable all safe options (enabled by default)" - +ifneq ($(ARCH), aarch64) CHECKPATCH ?= checkpatch.pl # checkpatch ignore settings: # SPACING - produces false positives with typedefs and * @@ -935,6 +992,7 @@ CHECKPATCH_FLAGS = --no-tree --no-signoff --emacs --no-color --ignore CODE_INDEN SOURCES_DIRS := . 
sse avx avx2 avx512 include no-aesni
 SOURCES := $(foreach dir,$(SOURCES_DIRS),$(wildcard $(dir)/*.[ch]) $(wildcard $(dir)/*.asm) $(wildcard $(dir)/*.inc))
 SOURCES_STYLE := $(foreach infile,$(SOURCES),$(infile)_style_check)
+endif # AARCH64
 
 .PHONY: style
 style: $(SOURCES_STYLE)
diff --git a/lib/aarch64/alloc_aarch64.c b/lib/aarch64/alloc_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..cc9e980ab89a29fc453b8535bf6b8db3b3eb7867
--- /dev/null
+++ b/lib/aarch64/alloc_aarch64.c
@@ -0,0 +1,245 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+
+#include <stdlib.h> /* posix_memalign() and free() */
+
+#include <string.h>
+#include "ipsec-mb.h"
+#include "ipsec_ooo_mgr.h"
+#include "cpu_feature.h"
+#include "error.h"
+
+#define IMB_OOO_ROAD_BLOCK 0xDEADCAFEDEADCAFEULL
+
+#define ALIGNMENT 64
+#define ALIGN(x, y) ((x + (y - 1)) & (~(y - 1)))
+
+#define OOO_INFO(imb_mgr_ooo_ptr_name__, ooo_mgr_type__) \
+        { offsetof(IMB_MGR, imb_mgr_ooo_ptr_name__), \
+          ALIGN(sizeof(ooo_mgr_type__), ALIGNMENT), \
+          offsetof(ooo_mgr_type__, road_block) }
+
+const struct {
+        size_t ooo_ptr_offset;
+        size_t ooo_aligned_size;
+        size_t road_block_offset;
+} ooo_mgr_table[] = {
+};
+
+/**
+ * @brief Calculates necessary memory size for IMB_MGR.
+ *
+ * @return Size for IMB_MGR (aligned to 64 bytes)
+ */
+size_t imb_get_mb_mgr_size(void)
+{
+        size_t ooo_total_size = 0;
+        unsigned i;
+
+        for (i = 0; i < IMB_DIM(ooo_mgr_table); i++)
+                ooo_total_size += ooo_mgr_table[i].ooo_aligned_size;
+        /*
+         * Add 64 bytes into the maximum size calculation to
+         * make sure there is enough room to align the OOO managers.
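+         * (With ALIGNMENT equal to 64 this covers the worst case, where
+         * the first manager needs up to 63 bytes of padding.)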
+ */ + return (sizeof(IMB_MGR) + ooo_total_size + ALIGNMENT); +} + +static uint8_t *get_ooo_ptr(IMB_MGR *mgr, const size_t offset) +{ + uint8_t *mgr_offset = &((uint8_t *) mgr)[offset]; + uint8_t **ptr = (uint8_t **) mgr_offset; + + return *ptr; +} + +static void set_ooo_ptr(IMB_MGR *mgr, const size_t offset, uint8_t *new_ptr) +{ + uint8_t *mgr_offset = &((uint8_t *) mgr)[offset]; + uint8_t **ptr = (uint8_t **) mgr_offset; + + *ptr = new_ptr; +} + +static void set_road_block(uint8_t *ooo_ptr, const size_t offset) +{ + uint64_t *p_road_block = (uint64_t *) &ooo_ptr[offset]; + + *p_road_block = IMB_OOO_ROAD_BLOCK; +} + +/* + * Set last 8 bytes of OOO mgrs to predefined pattern + * + * This is to assist in searching for sensitive data remaining + * in the heap after algorithmic code completes + */ +static void set_ooo_mgr_road_block(IMB_MGR *mgr) +{ + unsigned n; + + for (n = 0; n < IMB_DIM(ooo_mgr_table); n++) + set_road_block(get_ooo_ptr(mgr, + ooo_mgr_table[n].ooo_ptr_offset), + ooo_mgr_table[n].road_block_offset); +} + + +/** + * @brief Initializes IMB_MGR pointers to out-of-order managers with + * use of externally allocated memory. + * + * imb_get_mb_mgr_size() should be called to know how much memory + * should be allocated externally. + * + * init_mb_mgr_XXX() must be called after this function call, + * whereas XXX is the desired architecture (including "auto"), + * only if reset_mgr is set to 0. + * + * @param mem_ptr a pointer to allocated memory + * @param flags multi-buffer manager flags + * IMB_FLAG_SHANI_OFF - disable use (and detection) of SHA extensions. + * IMB_FLAG_AESNI_OFF - disable use (and detection) of AES extensions. + * + * @param reset_mgr if 0, IMB_MGR structure is not cleared, else it is. + * + * @return Pointer to IMB_MGR structure + */ +IMB_MGR *imb_set_pointers_mb_mgr(void *mem_ptr, const uint64_t flags, + const unsigned reset_mgr) +{ + if (mem_ptr == NULL) { + imb_set_errno(mem_ptr, ENOMEM); + return NULL; + } + + IMB_MGR *ptr = (IMB_MGR *) mem_ptr; + uint8_t *ptr8 = (uint8_t *) ptr; + uint8_t *free_mem = &ptr8[ALIGN(sizeof(IMB_MGR), ALIGNMENT)]; + const size_t mem_size = imb_get_mb_mgr_size(); + unsigned i; + + if (reset_mgr) { + /* Zero out MB_MGR memory */ + memset(mem_ptr, 0, mem_size); + } else { + IMB_ARCH used_arch = (IMB_ARCH) ptr->used_arch; + + /* Reset function pointers from previously used architecture */ + switch (used_arch) { + case IMB_ARCH_NOAESNI: + init_mb_mgr_aarch64_no_aesni_internal(ptr, 0); + break; + case IMB_ARCH_AARCH64: + init_mb_mgr_aarch64_internal(ptr, 0); + break; + default: + break; + } + } + + imb_set_errno(ptr, 0); + ptr->flags = flags; /* save the flags for future use in init */ + ptr->features = cpu_feature_adjust(flags, cpu_feature_detect()); + + /* Set OOO pointers */ + for (i = 0; i < IMB_DIM(ooo_mgr_table); i++) { + set_ooo_ptr(ptr, ooo_mgr_table[i].ooo_ptr_offset, free_mem); + free_mem = &free_mem[ooo_mgr_table[i].ooo_aligned_size]; + IMB_ASSERT((uintptr_t)(free_mem - ptr8) <= mem_size); + } + set_ooo_mgr_road_block(ptr); + + return ptr; +} + + +static void * +alloc_aligned_mem(const size_t size) +{ + void *ptr; + + const size_t alignment = 64; + if (posix_memalign((void **)&ptr, alignment, size)) + return NULL; + + IMB_ASSERT(ptr != NULL); + + memset(ptr, 0, size); + + return ptr; +} + +static void +free_mem(void *ptr) +{ + free(ptr); +} + +/** + * @brief Allocates memory for multi-buffer manager instance + * + * For binary compatibility between library versions + * it is recommended to use this API. 
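+ *
+ * A minimal usage sketch (illustrative only; error handling trimmed):
+ * @code
+ * IMB_MGR *mgr = alloc_mb_mgr(0);
+ *
+ * if (mgr != NULL) {
+ *         init_mb_mgr_auto(mgr, NULL);
+ *         // ... submit and flush jobs here ...
+ *         free_mb_mgr(mgr);
+ * }
+ * @endcode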
+ *
+ * @return Pointer to allocated memory for MB_MGR structure
+ * @retval NULL on allocation error
+ */
+IMB_MGR *alloc_mb_mgr(uint64_t flags)
+{
+        IMB_MGR *ptr = NULL;
+
+        ptr = alloc_aligned_mem(sizeof(IMB_MGR));
+        IMB_ASSERT(ptr != NULL);
+        if (ptr != NULL) {
+                imb_set_errno(ptr, 0);
+                ptr->flags = flags; /* save the flags for future use in init */
+                ptr->features = cpu_feature_adjust(flags,
+                                                   cpu_feature_detect());
+        } else {
+                imb_set_errno(ptr, ENOMEM);
+                return NULL;
+        }
+
+        return ptr;
+}
+
+/**
+ * @brief Frees memory allocated previously by alloc_mb_mgr()
+ *
+ * @param ptr a pointer to allocated MB_MGR structure
+ *
+ */
+void free_mb_mgr(IMB_MGR *ptr)
+{
+        IMB_ASSERT(ptr != NULL);
+
+        /* Free IMB_MGR */
+        free_mem(ptr);
+}
diff --git a/lib/aarch64/clear_mem_aarch64.c b/lib/aarch64/clear_mem_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..bf50dfe3a97bb1ab86118e7c2e61fba2c9d311bc
--- /dev/null
+++ b/lib/aarch64/clear_mem_aarch64.c
@@ -0,0 +1,36 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "ipsec-mb.h"
+#include <string.h>
+
+void imb_clear_mem(void *mem, const size_t size)
+{
+        if (mem == NULL)
+                return;
+        memset(mem, 0, size);
+}
diff --git a/lib/aarch64/clear_regs_mem_aarch64.h b/lib/aarch64/clear_regs_mem_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b9587d12965828bada0b7af0a74aeecf2fcd398
--- /dev/null
+++ b/lib/aarch64/clear_regs_mem_aarch64.h
@@ -0,0 +1,109 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef CLEAR_REGS_H
+#define CLEAR_REGS_H
+
+#include <string.h>
+
+#define GPR_EOR_SELF(reg) "eor " #reg "," #reg "," #reg ";"
+
+#define CLEAR_SCRATCH_GPS() \
+do { \
+        asm volatile( \
+                GPR_EOR_SELF(x0) \
+                GPR_EOR_SELF(x1) \
+                GPR_EOR_SELF(x2) \
+                GPR_EOR_SELF(x3) \
+                GPR_EOR_SELF(x4) \
+                GPR_EOR_SELF(x5) \
+                GPR_EOR_SELF(x6) \
+                GPR_EOR_SELF(x7) \
+                GPR_EOR_SELF(x8) \
+                GPR_EOR_SELF(x9) \
+                GPR_EOR_SELF(x10) \
+                GPR_EOR_SELF(x11) \
+                GPR_EOR_SELF(x12) \
+                GPR_EOR_SELF(x13) \
+                GPR_EOR_SELF(x14) \
+                GPR_EOR_SELF(x15) \
+                GPR_EOR_SELF(x16) \
+                GPR_EOR_SELF(x17) \
+                GPR_EOR_SELF(x18) \
+                :::"x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11", \
+                "x12","x13","x14","x15","x16","x17","x18","x19","x20","x21","x22", \
+                "x23","x24","x25","x26","x27","x28"); \
+} while (0)
+
+#define SIMD_EOR_SELF(reg) "eor " #reg ".16b," #reg ".16b," #reg ".16b;"
+
+#define CLEAR_SCRATCH_SIMD_REGS() \
+do { \
+        asm volatile( \
+                SIMD_EOR_SELF(v0) \
+                SIMD_EOR_SELF(v1) \
+                SIMD_EOR_SELF(v2) \
+                SIMD_EOR_SELF(v3) \
+                SIMD_EOR_SELF(v4) \
+                SIMD_EOR_SELF(v5) \
+                SIMD_EOR_SELF(v6) \
+                SIMD_EOR_SELF(v7) \
+                SIMD_EOR_SELF(v16) \
+                SIMD_EOR_SELF(v17) \
+                SIMD_EOR_SELF(v18) \
+                SIMD_EOR_SELF(v19) \
+                SIMD_EOR_SELF(v20) \
+                SIMD_EOR_SELF(v21) \
+                SIMD_EOR_SELF(v22) \
+                SIMD_EOR_SELF(v23) \
+                SIMD_EOR_SELF(v24) \
+                SIMD_EOR_SELF(v25) \
+                SIMD_EOR_SELF(v26) \
+                SIMD_EOR_SELF(v27) \
+                SIMD_EOR_SELF(v28) \
+                SIMD_EOR_SELF(v29) \
+                SIMD_EOR_SELF(v30) \
+                SIMD_EOR_SELF(v31) \
+                :::"v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18", \
+                "v19","v20","v21","v22","v23","v24","v25","v26","v27","v28", \
+                "v29","v30","v31"); \
+} while (0)
+
+static inline void
+clear_mem(void *mem, const size_t size)
+{
+        memset(mem, 0, size);
+}
+
+static inline void
+clear_var(void *var, const size_t size)
+{
+        memset(var, 0, size);
+}
+
+#endif /* CLEAR_REGS_H */
diff --git a/lib/aarch64/constant_lookup_aarch64.h b/lib/aarch64/constant_lookup_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..d55d341312b2eeec1b7dc5cdb5b3f990c03e3f56
--- /dev/null
+++ b/lib/aarch64/constant_lookup_aarch64.h
@@ -0,0 +1,48 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef CONSTANT_LOOKUP_H
+#define CONSTANT_LOOKUP_H
+
+#include "ipsec-mb.h"
+
+#include <arm_neon.h>
+
+/**
+ * @brief Constant time and parallel NEON lookup function on table of
+ *        256 elements of 8-bit values.
+ *
+ * @param[in] indexes vector with 16 8-bit indexes
+ * @param[in] table   pointer to 256 element table
+ *
+ * @return Vector with 16 8-bit values corresponding to the indexes
+ */
+IMB_DLL_LOCAL uint8x16_t
+lookup_16x8bit_neon(const uint8x16_t indexes, const void *table);
+
+#endif /* CONSTANT_LOOKUP_H */
diff --git a/lib/aarch64/cpu_features_aarch64.c b/lib/aarch64/cpu_features_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..2aae71a25a8e16b23c6f5d789e0b20a535cbf3c2
--- /dev/null
+++ b/lib/aarch64/cpu_features_aarch64.c
@@ -0,0 +1,78 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "cpu_feature.h"
+#include <sys/auxv.h>  /* getauxval() and AT_HWCAP */
+#include <asm/hwcap.h> /* HWCAP_ASIMD and HWCAP_AES */
+
+static uint32_t detect_asimd(void)
+{
+        return getauxval(AT_HWCAP) & HWCAP_ASIMD;
+}
+
+static uint32_t detect_aes(void)
+{
+        return getauxval(AT_HWCAP) & HWCAP_AES;
+}
+
+uint64_t cpu_feature_detect(void)
+{
+        uint64_t features = 0;
+#ifdef __aarch64__
+        features |= IMB_FEATURE_AARCH64;
+#endif
+        if (detect_asimd()) {
+                features |= IMB_FEATURE_ASIMD;
+                if (detect_aes())
+                        features |= IMB_FEATURE_AESNI;
+        }
+
+#ifdef SAFE_DATA
+        features |= IMB_FEATURE_SAFE_DATA;
+#endif
+#ifdef SAFE_PARAM
+        features |= IMB_FEATURE_SAFE_PARAM;
+#endif
+
+        return features;
+}
+
+uint64_t cpu_feature_adjust(const uint64_t flags, uint64_t features)
+{
+        if (flags & IMB_FLAG_AESNI_OFF)
+                features &= ~IMB_FEATURE_AESNI;
+
+        return features;
+}
+
+/* External function to retrieve feature flags */
+uint64_t imb_get_feature_flags(void)
+{
+        return cpu_feature_detect();
+}
+
diff --git a/lib/aarch64/lookup_16x8bit_neon.S b/lib/aarch64/lookup_16x8bit_neon.S
new file mode 100644
index 0000000000000000000000000000000000000000..1d55641674cf7a84291a0ab6f1aa52ccae9d8323
--- /dev/null
+++ b/lib/aarch64/lookup_16x8bit_neon.S
@@ -0,0 +1,57 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/ +.globl lookup_16x8bit_neon +.type lookup_16x8bit_neon,%function +.align 5 +lookup_16x8bit_neon: +/* param[in]: x0 table pointer + * v0 indexes + */ + // table: v16-v31 + mov v1.16b,v0.16b + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 + ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0] + + movi v2.16b,#64 + + eor v3.16b,v3.16b,v3.16b + tbx v3.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b + sub v1.16b,v1.16b,v2.16b + tbx v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v1.16b + + mov v0.16b,v3.16b + + ret diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c new file mode 100644 index 0000000000000000000000000000000000000000..c7dad7ade82ee3b50d2679fab82b35eab1467bfe --- /dev/null +++ b/lib/aarch64/mb_mgr_aarch64.c @@ -0,0 +1,123 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "ipsec-mb.h"
+#include "include/snow3g.h"
+
+#include "include/cpu_feature.h"
+#include "include/error.h"
+#include "clear_regs_mem_aarch64.h"
+#include "include/noaesni.h"
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_aarch64
+#define FLUSH_JOB flush_job_aarch64
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64
+#define GET_NEXT_JOB get_next_job_aarch64
+#define GET_COMPLETED_JOB get_completed_job_aarch64
+
+#define QUEUE_SIZE queue_size_aarch64
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64
+
+/* ====================================================================== */
+
+static void
+reset_ooo_mgrs(IMB_MGR *state)
+{
+        (void) state; /* no out-of-order managers on this architecture yet */
+        return;
+}
+
+IMB_DLL_LOCAL void
+init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs)
+{
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+
+        /* reset error status */
+        imb_set_errno(state, 0);
+
+        state->features = cpu_feature_adjust(state->flags,
+                                             cpu_feature_detect());
+
+        /* Set architecture for future checks */
+        state->used_arch = (uint32_t) IMB_ARCH_AARCH64;
+
+        if (!(state->features & IMB_FEATURE_AESNI)) {
+                init_mb_mgr_aarch64_no_aesni(state);
+                return;
+        }
+
+        if (reset_mgrs) {
+                reset_ooo_mgrs(state);
+
+                /* Init "in order" components */
+                state->next_job = 0;
+                state->earliest_job = -1;
+        }
+
+        /* set AARCH64 handlers */
+        state->get_next_job = get_next_job_aarch64;
+        state->submit_job = submit_job_aarch64;
+        state->submit_job_nocheck = submit_job_nocheck_aarch64;
+        state->get_completed_job = get_completed_job_aarch64;
+        state->flush_job = flush_job_aarch64;
+        state->queue_size = queue_size_aarch64;
+
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64;
+        state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64;
+        state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64;
+        state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64;
+        state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64;
+        state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64;
+        state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_aarch64;
+        state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_aarch64;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64;
+}
+
+void
+init_mb_mgr_aarch64(IMB_MGR *state)
+{
+        init_mb_mgr_aarch64_internal(state, 1);
+}
+
+#include "mb_mgr_code_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
new file mode 100644
index 0000000000000000000000000000000000000000..be858f54e6dc03c977eb59917e793a9cc1fb0b09
--- /dev/null
+++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c
@@ -0,0 +1,112 @@
+/*********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#include "ipsec-mb.h"
+#include "include/snow3g.h"
+
+#include "include/noaesni.h"
+#include "include/error.h"
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB submit_job_aarch64_no_aesni
+#define FLUSH_JOB flush_job_aarch64_no_aesni
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_no_aesni
+#define GET_NEXT_JOB get_next_job_aarch64_no_aesni
+#define GET_COMPLETED_JOB get_completed_job_aarch64_no_aesni
+
+#define QUEUE_SIZE queue_size_aarch64_no_aesni
+
+/* ====================================================================== */
+
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64
+
+/* ====================================================================== */
+static void
+reset_ooo_mgrs(IMB_MGR *state)
+{
+        (void) state; /* no out-of-order managers on this architecture yet */
+        return;
+}
+
+IMB_DLL_LOCAL void
+init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs)
+{
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+        imb_set_errno(state, 0);
+
+        /* Set architecture for future checks */
+        state->used_arch = (uint32_t) IMB_ARCH_NOAESNI;
+
+        if (reset_mgrs) {
+                reset_ooo_mgrs(state);
+
+                /* Init "in order" components */
+                state->next_job = 0;
+                state->earliest_job = -1;
+        }
+
+        /* set AARCH64 NO AESNI handlers */
+        state->get_next_job = get_next_job_aarch64_no_aesni;
+        state->submit_job = submit_job_aarch64_no_aesni;
+        state->submit_job_nocheck = submit_job_nocheck_aarch64_no_aesni;
+        state->get_completed_job = get_completed_job_aarch64_no_aesni;
+        state->flush_job = flush_job_aarch64_no_aesni;
+        state->queue_size = queue_size_aarch64_no_aesni;
+
+        state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_no_aesni;
+        state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_no_aesni;
+        state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64_no_aesni;
+        state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64_no_aesni;
+        state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64_no_aesni;
+        state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64_no_aesni;
+        state->snow3g_f8_8_buffer_multikey =
+                snow3g_f8_8_buffer_multikey_aarch64_no_aesni;
+        state->snow3g_f8_n_buffer_multikey =
+                snow3g_f8_n_buffer_multikey_aarch64_no_aesni;
+        state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64_no_aesni;
+        state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64_no_aesni;
+        state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64_no_aesni;
+}
+
+void
+init_mb_mgr_aarch64_no_aesni(IMB_MGR *state)
+{
+        init_mb_mgr_aarch64_no_aesni_internal(state, 1);
+}
+
+#include "mb_mgr_code_aarch64.h"
diff --git a/lib/aarch64/mb_mgr_auto_aarch64.c b/lib/aarch64/mb_mgr_auto_aarch64.c
new file mode 100644
index 0000000000000000000000000000000000000000..b4c0797e1eac1f0852135e62e991f74b6cd4a1a4
--- /dev/null
+++ b/lib/aarch64/mb_mgr_auto_aarch64.c
@@ -0,0 +1,73 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#include "ipsec-mb.h"
+#include "cpu_feature.h"
+#include "noaesni.h"
+#include "error.h"
+
+/**
+ * @brief Automatically initializes the most performant
+ *        multi-buffer manager based on CPU features
+ *
+ * @param [in] state Pointer to MB_MGR struct
+ * @param [out] arch Pointer to arch enum to be set (can be NULL)
+ */
+void
+init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch)
+{
+        IMB_ARCH arch_detected = IMB_ARCH_NONE;
+        const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI;
+        const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD;
+
+        /* reset error status */
+        imb_set_errno(state, 0);
+
+#ifdef SAFE_PARAM
+        if (state == NULL) {
+                imb_set_errno(NULL, IMB_ERR_NULL_MBMGR);
+                return;
+        }
+#endif
+        if ((state->features & detect_aarch64) == detect_aarch64) {
+                init_mb_mgr_aarch64(state);
+                arch_detected = IMB_ARCH_AARCH64;
+                goto init_mb_mgr_auto_ret;
+        }
+        if ((state->features & detect_noaesni) == detect_noaesni) {
+                init_mb_mgr_aarch64_no_aesni(state);
+                arch_detected = IMB_ARCH_NOAESNI;
+                goto init_mb_mgr_auto_ret;
+        }
+
+        imb_set_errno(state, ENODEV);
+
+init_mb_mgr_auto_ret:
+        if (arch != NULL)
+                *arch = arch_detected;
+}
diff --git a/lib/aarch64/mb_mgr_code_aarch64.h b/lib/aarch64/mb_mgr_code_aarch64.h
new file mode 100644
index 0000000000000000000000000000000000000000..97343d12894bf33102c757b17625898f35b2333f
--- /dev/null
+++ b/lib/aarch64/mb_mgr_code_aarch64.h
@@ -0,0 +1,540 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef MB_MGR_CODE_H
+#define MB_MGR_CODE_H
+
+/*
+ * This contains the bulk of the mb_mgr code, with #define's to build
+ * an AARCH64 version (see mb_mgr_aarch64.c).
+ *
+ * get_next_job() returns a job object. This must be filled in and returned
+ * via submit_job() before get_next_job() is called again.
+ *
+ * submit_job() and flush_job() return a job object. This job object ceases
+ * to be usable at the next call to get_next_job().
+ */
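+
+/*
+ * Typical caller loop (an illustrative sketch only; IMB_GET_NEXT_JOB(),
+ * IMB_SUBMIT_JOB() and IMB_GET_COMPLETED_JOB() are the public wrapper
+ * macros from ipsec-mb.h that dispatch to the functions below):
+ *
+ *     IMB_JOB *job = IMB_GET_NEXT_JOB(mgr);
+ *
+ *     // ... fill in the job fields ...
+ *     job = IMB_SUBMIT_JOB(mgr);
+ *     while (job != NULL) {
+ *             // ... check job->status ...
+ *             job = IMB_GET_COMPLETED_JOB(mgr);
+ *     }
+ */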
+
+#include <string.h> /* memcpy(), memset() */
+
+#include "clear_regs_mem_aarch64.h"
+#include "ipsec-mb.h"
+#include "error.h"
+
+#define BSWAP64 __builtin_bswap64
+
+/*
+ * JOBS() and ADV_JOBS() moved into mb_mgr_code.h
+ * get_next_job() and get_completed_job() API's are no longer inlines.
+ * For binary compatibility they have been made proper symbols.
+ */
+__forceinline
+IMB_JOB *JOBS(IMB_MGR *state, const int offset)
+{
+        char *cp = (char *)state->jobs;
+
+        return (IMB_JOB *)(cp + offset);
+}
+
+__forceinline
+void ADV_JOBS(int *ptr)
+{
+        *ptr += sizeof(IMB_JOB);
+        if (*ptr >= (int) (IMB_MAX_JOBS * sizeof(IMB_JOB)))
+                *ptr = 0;
+}
+
+__forceinline
+IMB_JOB *
+submit_snow3g_uea2_job(IMB_MGR *state, IMB_JOB *job)
+{
+        const snow3g_key_schedule_t *key = job->enc_keys;
+        const uint32_t msg_bitlen =
+                        (const uint32_t)job->msg_len_to_cipher_in_bits;
+        const uint32_t msg_bitoff =
+                        (const uint32_t)job->cipher_start_src_offset_in_bits;
+
+        /* Use bit length API if
+         * - msg length is not a multiple of bytes
+         * - bit offset is not a multiple of bytes
+         */
+        if ((msg_bitlen & 0x07) || (msg_bitoff & 0x07)) {
+                IMB_SNOW3G_F8_1_BUFFER_BIT(state, key, job->iv, job->src,
+                                           job->dst, msg_bitlen, msg_bitoff);
+        } else {
+                const uint32_t msg_bytelen = msg_bitlen >> 3;
+                const uint32_t msg_byteoff = msg_bitoff >> 3;
+                const void *src = job->src + msg_byteoff;
+                void *dst = job->dst + msg_byteoff;
+
+                IMB_SNOW3G_F8_1_BUFFER(state, key, job->iv, src,
+                                       dst, msg_bytelen);
+        }
+
+        job->status |= IMB_STATUS_COMPLETED_CIPHER;
+        return job;
+}
+
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
+{
+        if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
+                return submit_snow3g_uea2_job(state, job);
+        } else { /* assume IMB_CIPHER_NULL */
+                job->status |= IMB_STATUS_COMPLETED_CIPHER;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_AES_ENC(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+        (void) job;
+
+        return NULL;
+}
+
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
+{
+        if (IMB_CIPHER_SNOW3G_UEA2_BITLEN == job->cipher_mode) {
+                return submit_snow3g_uea2_job(state, job);
+        } else {
+                /* assume IMB_CIPHER_NULL */
+                job->status |= IMB_STATUS_COMPLETED_CIPHER;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_AES_DEC(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+        (void) job;
+
+        return NULL;
+}
+
+/* ========================================================================= */
+/* Hash submit & flush functions */
+/* ========================================================================= */
+__forceinline
+IMB_JOB *
+SUBMIT_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
+{
+        switch (job->hash_alg) {
+        case IMB_AUTH_SNOW3G_UIA2_BITLEN:
+                IMB_SNOW3G_F9_1_BUFFER(state, (const snow3g_key_schedule_t *)
+                                job->u.SNOW3G_UIA2._key,
+                                job->u.SNOW3G_UIA2._iv,
+                                job->src + job->hash_start_src_offset_in_bytes,
+                                job->msg_len_to_hash_in_bits,
+                                job->auth_tag_output);
+                job->status |= IMB_STATUS_COMPLETED_AUTH;
+                return job;
+        default:
+                job->status |= IMB_STATUS_COMPLETED_AUTH;
+                return job;
+        }
+}
+
+__forceinline
+IMB_JOB *
+FLUSH_JOB_HASH(IMB_MGR *state, IMB_JOB *job)
+{
+        (void) state;
+
+        switch (job->hash_alg) {
+        default:
+                if (!(job->status & IMB_STATUS_COMPLETED_AUTH)) {
+                        job->status |= IMB_STATUS_COMPLETED_AUTH;
+                        return job;
+                }
+                return NULL;
+        }
+}
+
+
+/* ========================================================================= */
+/* Job submit & flush functions
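+ *
+ * Note: only SNOW3G UEA2/UIA2 and the NULL cipher/hash are wired up
+ * here; is_job_invalid() below rejects all other algorithms.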
*/ +/* ========================================================================= */ + +#define SNOW3G_MAX_BITLEN (UINT32_MAX) +#define MB_MAX_LEN16 ((1 << 16) - 2) + +__forceinline int +is_job_invalid(IMB_MGR *state, const IMB_JOB *job) +{ + switch (job->cipher_mode) { + case IMB_CIPHER_NULL: + /* + * No checks required for this mode + * @note NULL cipher doesn't perform memory copy operation + * from source to destination + */ + break; + case IMB_CIPHER_SNOW3G_UEA2_BITLEN: + if (job->src == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_SRC); + return 1; + } + if (job->dst == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_DST); + return 1; + } + if (job->iv == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_IV); + return 1; + } + if (job->enc_keys == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_KEY); + return 1; + } + if (job->key_len_in_bytes != UINT64_C(16)) { + imb_set_errno(state, IMB_ERR_JOB_KEY_LEN); + return 1; + } + if (job->msg_len_to_cipher_in_bits == 0 || + job->msg_len_to_cipher_in_bits > SNOW3G_MAX_BITLEN) { + imb_set_errno(state, IMB_ERR_JOB_CIPH_LEN); + return 1; + } + if (job->iv_len_in_bytes != UINT64_C(16)) { + imb_set_errno(state, IMB_ERR_JOB_IV_LEN); + return 1; + } + break; + default: + imb_set_errno(state, IMB_ERR_CIPH_MODE); + return 1; + } + + switch (job->hash_alg) { + case IMB_AUTH_NULL: + break; + case IMB_AUTH_SNOW3G_UIA2_BITLEN: + if (job->src == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_SRC); + return 1; + } + if ((job->msg_len_to_hash_in_bits == 0) || + (job->msg_len_to_hash_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(state, IMB_ERR_JOB_AUTH_LEN); + return 1; + } + if (job->u.SNOW3G_UIA2._key == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_KEY); + return 1; + } + if (job->u.SNOW3G_UIA2._iv == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_IV); + return 1; + } + if (job->auth_tag_output_len_in_bytes != UINT64_C(4)) { + imb_set_errno(state, IMB_ERR_JOB_AUTH_TAG_LEN); + return 1; + } + if (job->auth_tag_output == NULL) { + imb_set_errno(state, IMB_ERR_JOB_NULL_AUTH); + return 1; + } + break; + default: + imb_set_errno(state, IMB_ERR_HASH_ALGO); + return 1; + } + return 0; +} + +__forceinline +IMB_JOB *SUBMIT_JOB_AES(IMB_MGR *state, IMB_JOB *job) +{ + if (job->cipher_direction == IMB_DIR_ENCRYPT) + job = SUBMIT_JOB_AES_ENC(state, job); + else + job = SUBMIT_JOB_AES_DEC(state, job); + + return job; +} + +__forceinline +IMB_JOB *FLUSH_JOB_AES(IMB_MGR *state, IMB_JOB *job) +{ + if (job->cipher_direction == IMB_DIR_ENCRYPT) + job = FLUSH_JOB_AES_ENC(state, job); + else + job = FLUSH_JOB_AES_DEC(state, job); + + return job; +} + +/* submit a half-completed job, based on the status */ +__forceinline +IMB_JOB *RESUBMIT_JOB(IMB_MGR *state, IMB_JOB *job) +{ + while (job != NULL && job->status < IMB_STATUS_COMPLETED) { + if (job->status == IMB_STATUS_COMPLETED_AUTH) + job = SUBMIT_JOB_AES(state, job); + else /* assumed job->status = IMB_STATUS_COMPLETED_CIPHER */ + job = SUBMIT_JOB_HASH(state, job); + } + + return job; +} + +__forceinline +IMB_JOB *submit_new_job(IMB_MGR *state, IMB_JOB *job) +{ + if (job->chain_order == IMB_ORDER_CIPHER_HASH) + job = SUBMIT_JOB_AES(state, job); + else + job = SUBMIT_JOB_HASH(state, job); + + job = RESUBMIT_JOB(state, job); + return job; +} + +__forceinline +void complete_job(IMB_MGR *state, IMB_JOB *job) +{ + if (job->chain_order == IMB_ORDER_CIPHER_HASH) { + /* while() loop optimized for cipher_hash order */ + while (job->status < IMB_STATUS_COMPLETED) { + IMB_JOB *tmp = FLUSH_JOB_AES(state, job); + + if (tmp == NULL) 
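+                                /* no cipher job flushed; try the hash side */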
+ tmp = FLUSH_JOB_HASH(state, job); + + (void) RESUBMIT_JOB(state, tmp); + } + } else { + /* while() loop optimized for hash_cipher order */ + while (job->status < IMB_STATUS_COMPLETED) { + IMB_JOB *tmp = FLUSH_JOB_HASH(state, job); + + if (tmp == NULL) + tmp = FLUSH_JOB_AES(state, job); + + (void) RESUBMIT_JOB(state, tmp); + } + } +} + +__forceinline +IMB_JOB * +submit_job_and_check(IMB_MGR *state, const int run_check) +{ + IMB_JOB *job = NULL; + + job = JOBS(state, state->next_job); + + if (run_check) { + if (is_job_invalid(state, job)) { + job->status = IMB_STATUS_INVALID_ARGS; + } else { + job->status = IMB_STATUS_BEING_PROCESSED; + job = submit_new_job(state, job); + } + } else { + job->status = IMB_STATUS_BEING_PROCESSED; + job = submit_new_job(state, job); + } + + if (state->earliest_job < 0) { + /* state was previously empty */ + if (job == NULL) + state->earliest_job = state->next_job; + ADV_JOBS(&state->next_job); + goto exit; + } + + ADV_JOBS(&state->next_job); + + if (state->earliest_job == state->next_job) { + /* Full */ + job = JOBS(state, state->earliest_job); + complete_job(state, job); + ADV_JOBS(&state->earliest_job); + goto exit; + } + + /* not full */ + job = JOBS(state, state->earliest_job); + if (job->status < IMB_STATUS_COMPLETED) { + job = NULL; + goto exit; + } + + ADV_JOBS(&state->earliest_job); +exit: +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + return job; +} + +IMB_JOB * +SUBMIT_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return submit_job_and_check(state, 1); +} + +IMB_JOB * +SUBMIT_JOB_NOCHECK(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return submit_job_and_check(state, 0); +} + +IMB_JOB * +FLUSH_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + IMB_JOB *job; + if (state->earliest_job < 0) + return NULL; /* empty */ + + job = JOBS(state, state->earliest_job); + complete_job(state, job); + + ADV_JOBS(&state->earliest_job); + + if (state->earliest_job == state->next_job) + state->earliest_job = -1; /* becomes empty */ + +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + return job; +} + +/* ========================================================================= */ + +uint32_t +QUEUE_SIZE(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return 0; + } +#endif + int a, b; + + if (state->earliest_job < 0) + return 0; + a = state->next_job / sizeof(IMB_JOB); + b = state->earliest_job / sizeof(IMB_JOB); + return ((a-b) & (IMB_MAX_JOBS-1)); +} + +IMB_JOB * +GET_COMPLETED_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + IMB_JOB *job; + + if (state->earliest_job < 0) + return NULL; + + job = JOBS(state, state->earliest_job); + if (job->status < IMB_STATUS_COMPLETED) + return NULL; + + ADV_JOBS(&state->earliest_job); + + if (state->earliest_job == state->next_job) + state->earliest_job 
= -1; + + return job; +} + +IMB_JOB * +GET_NEXT_JOB(IMB_MGR *state) +{ + /* reset error status */ + imb_set_errno(state, 0); + +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return NULL; + } +#endif + + return JOBS(state, state->next_job); +} + +#endif /* MB_MGR_CODE_H */ diff --git a/lib/aarch64/snow3g_aarch64.c b/lib/aarch64/snow3g_aarch64.c new file mode 100644 index 0000000000000000000000000000000000000000..6ff912bb4460d5fa0f6925dff70549cf4cb9a385 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64.c @@ -0,0 +1,41 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64 +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64 +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64 +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64 +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64 +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64 +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64 +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64 +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64 +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64 +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64 + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_no_aesni.c b/lib/aarch64/snow3g_aarch64_no_aesni.c new file mode 100644 index 0000000000000000000000000000000000000000..fbc861b23eddfc3373e838a67b5c8c885f52c9b7 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64_no_aesni.c @@ -0,0 +1,43 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ +#define NO_AESNI +#define AESNI_EMU +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64_no_aesni +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64_no_aesni +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64_no_aesni +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64_no_aesni +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64_no_aesni +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_no_aesni +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_no_aesni +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_no_aesni + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_common_aarch64.h b/lib/aarch64/snow3g_common_aarch64.h new file mode 100644 index 0000000000000000000000000000000000000000..3b1651212112f0141dbd0715a89396031c8fb266 --- /dev/null +++ b/lib/aarch64/snow3g_common_aarch64.h @@ -0,0 +1,2905 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. 
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+#ifndef SNOW3G_COMMON_H
+#define SNOW3G_COMMON_H
+
+#include <stdio.h>  /* printf() */
+#include <string.h> /* memset(), memcpy() */
+#include <arm_neon.h>
+
+#include "ipsec-mb.h"
+#include "include/wireless_common.h"
+#include "snow3g.h"
+#include "snow3g_tables.h"
+#include "constant_lookup_aarch64.h"
+#include "clear_regs_mem_aarch64.h"
+#ifdef NO_AESNI
+#include "include/aesni_emu.h"
+#endif
+
+#define CLEAR_MEM clear_mem
+#define CLEAR_VAR clear_var
+
+#define MAX_KEY_LEN (16)
+#define SNOW3G_4_BYTES (4)
+#define SNOW3G_8_BYTES (8)
+#define SNOW3G_8_BITS (8)
+#define SNOW3G_16_BYTES (16)
+#define SNOW3G_16_BITS (16)
+
+#define SNOW3G_BLOCK_SIZE (8)
+
+#define SNOW3G_KEY_LEN_IN_BYTES (16) /* 128b */
+#define SNOW3G_IV_LEN_IN_BYTES (16)  /* 128b */
+
+#define SNOW3GCONSTANT (0x1b)
+
+/* Range of input data for SNOW3G is from 1 to 2^32 bits */
+#define SNOW3G_MIN_LEN 1
+#define SNOW3G_MAX_BITLEN (UINT32_MAX)
+#define SNOW3G_MAX_BYTELEN (UINT32_MAX / 8)
+
+typedef union SafeBuffer {
+        uint64_t b64;
+        uint32_t b32[2];
+        uint8_t b8[SNOW3G_8_BYTES];
+} SafeBuf;
+
+typedef struct snow3gKeyState1_s {
+        /* 16 LFSR stages */
+        uint32_t LFSR_S[16];
+        /* 3 FSM states */
+        uint32_t FSM_R3;
+        uint32_t FSM_R2;
+        uint32_t FSM_R1;
+} DECLARE_ALIGNED(snow3gKeyState1_t, 16);
+
+typedef struct snow3gKeyState4_s {
+        /* 16 LFSR stages */
+        uint32x4_t LFSR_X[16];
+        /* 3 FSM states */
+        uint32x4_t FSM_X[3];
+        uint32_t iLFSR_X;
+} snow3gKeyState4_t;
+
+/**
+ * @brief Finds minimum 32-bit value in an array
+ * @return Min 32-bit value
+ */
+static inline uint32_t
+length_find_min(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+        uint32_t min = 0;
+
+        if (dim_array > 0)
+                min = out_array[0];
+
+        for (i = 1; i < dim_array; i++)
+                if (out_array[i] < min)
+                        min = out_array[i];
+
+        return min;
+}
+
+/**
+ * @brief Subtracts \a subv from a vector of 32-bit words
+ */
+static inline void
+length_sub(uint32_t *out_array, const size_t dim_array, const uint32_t subv)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                out_array[i] -= subv;
+}
+
+/**
+ * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values
+ * @retval 0 incorrect length value found
+ * @retval 1 all OK
+ */
+static inline uint32_t
+length_check(const uint32_t *out_array, const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++) {
+                if ((out_array[i] == 0) ||
+                    (out_array[i] > SNOW3G_MAX_BYTELEN))
+                        return 0;
+        }
+
+        return 1;
+}
+
+/**
+ * @brief Copies 4 32-bit length values into an array
+ */
+static inline void
+length_copy_4(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+}
+
+/**
+ * @brief Copies 8 32-bit length values into an array
+ */
+static inline void
+length_copy_8(uint32_t *out_array,
+              const uint32_t length1, const uint32_t length2,
+              const uint32_t length3, const uint32_t length4,
+              const uint32_t length5, const uint32_t length6,
+              const uint32_t length7, const uint32_t length8)
+{
+        out_array[0] = length1;
+        out_array[1] = length2;
+        out_array[2] = length3;
+        out_array[3] = length4;
+        out_array[4] = length5;
+        out_array[5] = length6;
+        out_array[6] = length7;
+        out_array[7] = length8;
+}
+
+/**
+ * @brief Checks vector of pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+ptr_check(void *out_array[], const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL)
+                        return 0;
+
+        return 1;
+}
+
+/**
+ * @brief Checks vector of const pointers against NULL
+ * @retval 0 incorrect pointer found
+ * @retval 1 all OK
+ */
+static inline int
+cptr_check(const void * const out_array[], const size_t dim_array)
+{
+        size_t i;
+
+        for (i = 0; i < dim_array; i++)
+                if (out_array[i] == NULL)
+                        return 0;
+
+        return 1;
+}
+
+/**
+ * @brief Copies 4 pointers into an array
+ */
+static inline void
+ptr_copy_4(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 4 const pointers into an array
+ */
+static inline void
+cptr_copy_4(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+}
+
+/**
+ * @brief Copies 8 pointers into an array
+ */
+static inline void
+ptr_copy_8(void *out_array[],
+           void *ptr1, void *ptr2, void *ptr3, void *ptr4,
+           void *ptr5, void *ptr6, void *ptr7, void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+/**
+ * @brief Copies 8 const pointers into an array
+ */
+static inline void
+cptr_copy_8(const void *out_array[],
+            const void *ptr1, const void *ptr2,
+            const void *ptr3, const void *ptr4,
+            const void *ptr5, const void *ptr6,
+            const void *ptr7, const void *ptr8)
+{
+        out_array[0] = ptr1;
+        out_array[1] = ptr2;
+        out_array[2] = ptr3;
+        out_array[3] = ptr4;
+        out_array[4] = ptr5;
+        out_array[5] = ptr6;
+        out_array[6] = ptr7;
+        out_array[7] = ptr8;
+}
+
+/**
+ * @brief Wrapper for safe lookup of 16 indexes in 256x8-bit table
+ * @param[in] indexes vector of 16x8-bit indexes to be looked up
+ * @param[in] lut pointer to a 256x8-bit table
+ * @return 16x8-bit values looked up in \a lut using 16x8-bit \a indexes
+ */
+static inline uint8x16_t lut16x8b_256(const uint8x16_t indexes, const void *lut)
+{
+        return lookup_16x8bit_neon(indexes, lut);
+}
+
+/**
+ * @brief LFSR array shift by 2 positions
+ * @param[in/out] pCtx key state context structure
+ */
+static inline void ShiftTwiceLFSR_1(snow3gKeyState1_t *pCtx)
+{
+        int i;
+
+        for (i = 0; i < 14; i++)
+                pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 2];
+}
+
+/**
+ * @brief SNOW3G S2 mix column correction function
+ *
+ * Mix column AES GF() reduction poly is 0x1B and SNOW3G reduction poly is 0x69.
+ * The fix-up value is 0x1B ^ 0x69 = 0x72 and needs to be applied on selected
+ * bytes of the 32-bit word.
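A minimal scalar sketch of this fix-up logic (editor's illustration, not part of the patch; the test word 0x80000000 is arbitrary) checks the constant and evaluates the four mask functions described in this comment:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* AES MixColumns poly ^ SNOW3G poly gives the fix-up constant */
        assert((0x1b ^ 0x69) == 0x72);

        /* sign (bit 7) of each byte of the 'no_mixc' word */
        const uint32_t no_mixc = 0x80000000; /* a = 1, b = 0, c = 0, d = 0 */
        const int a = (no_mixc >> 31) & 1;
        const int b = (no_mixc >> 23) & 1;
        const int c = (no_mixc >> 15) & 1;
        const int d = (no_mixc >> 7) & 1;

        /* mask functions from the comment: XOR of adjacent sign bits */
        const uint8_t mask0 = (c ^ d) ? 0x72 : 0x00;
        const uint8_t mask1 = (b ^ c) ? 0x72 : 0x00;
        const uint8_t mask2 = (a ^ b) ? 0x72 : 0x00;
        const uint8_t mask3 = (d ^ a) ? 0x72 : 0x00;

        /* bytes 3..0 of the correction word XORed into 'mixc' */
        printf("fixup bytes: %02x %02x %02x %02x\n",
               mask3, mask2, mask1, mask0);
        return 0;
}
```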
+ *
+ * The 'aese' operation does not perform the mix column operation, which
+ * makes it possible to determine the fix-up value to be applied to the
+ * result of 'aese + aesmc' in order to produce the correct result for SNOW3G.
+ *
+ * This function implements a more scalable SIMD method to apply the fix-up
+ * value to multiple streams at the same time.
+ *
+ * a = \a no_mixc bit-31
+ * b = \a no_mixc bit-23
+ * c = \a no_mixc bit-15
+ * d = \a no_mixc bit-7
+ *
+ * mask0_f(), mask1_f(), mask2_f() and mask3_f() functions
+ * specify if corresponding byte of \a mixc word, i.e. 0, 1, 2 or 3
+ * respectively, should be corrected.
+ * Definition of the functions:
+ * mask0_f(a, b, c, d) = c'd + cd' => c xor d
+ * mask1_f(a, b, c, d) = b'c + bc' => b xor c
+ * mask2_f(a, b, c, d) = a'b + ab' => a xor b
+ * mask3_f(a, b, c, d) = a'd + ad' => d xor a
+ * The above are resolved through SIMD instructions: and, cmlt and
+ * xor. As a result, a mask is obtained with a 0xff byte value at the
+ * positions that require the 0x72 fix-up value to be applied.
+ *
+ * @param no_mixc result of 'aese' operation, 4 x 32-bit words
+ * @param mixc result of 'aese + aesmc' operation, 4 x 32-bit words
+ *
+ * @return corrected \a mixc for SNOW3G S2, 4 x 32-bit words
+ */
+static inline uint32x4_t s2_mixc_fixup_4(const uint8x16_t no_mixc, const uint8x16_t mixc)
+{
+        const uint32_t ror8[4] = {0x00030201, 0x04070605, 0x080b0a09, 0x0c0f0e0d};
+        uint8x16_t pattern, pattern_shuf, idx, mask, fixup;
+
+        pattern = vcltzq_s8(vreinterpretq_s8_u8(no_mixc));
+        idx = vreinterpretq_u8_u32(vld1q_u32(ror8));
+        pattern_shuf = vqtbl1q_u8(pattern, idx);
+
+        mask = vdupq_n_u8(0x72);
+        pattern = pattern ^ pattern_shuf;
+
+        fixup = mask & pattern;
+        return vreinterpretq_u32_u8(veorq_u8(fixup, mixc));
+}
+
+/**
+ * @brief SNOW3G S2 mix column correction function
+ *
+ * @param no_mixc result of 'aese' operation, 32-bit word index 0 only
+ * @param mixc result of 'aese + aesmc' operation, 32-bit word index 0 only
+ *
+ * @return corrected \a mixc 32-bit word for SNOW3G S2
+ */
+static inline uint32_t
+s2_mixc_fixup_scalar(const uint8x16_t no_mixc, const uint8x16_t mixc)
+{
+        return vgetq_lane_u32(s2_mixc_fixup_4(no_mixc, mixc), 0);
+}
+
+/**
+ * @brief Sbox S1 maps a 32bit input to a 32bit output
+ *
+ * @param[in] x 32-bit word to be passed through S1 box
+ *
+ * @return \a x transformed through S1 box
+ */
+static inline uint32_t S1_box(const uint32_t x)
+{
+#ifdef NO_AESNI
+        union xmm_reg key, v;
+
+        key.qword[0] = key.qword[1] = 0;
+
+        v.dword[0] = v.dword[1] =
+                v.dword[2] = v.dword[3] = x;
+
+        emulate_AESENC(&v, &key);
+        return v.dword[0];
+#else
+        uint32x4_t dup_x;
+        uint8x16_t new_x, key, tmp;
+
+        dup_x = vdupq_n_u32(x);
+        key = vdupq_n_u8(0);
+        new_x = vreinterpretq_u8_u32(dup_x);
+        tmp = vaeseq_u8(new_x, key);
+        tmp = vaesmcq_u8(tmp);
+
+        return vgetq_lane_u32(vreinterpretq_u32_u8(tmp), 0);
+#endif
+}
+
+/**
+ * @brief Sbox S1 maps a 2x32bit input to a 2x32bit output
+ *
+ * @param[in/out] x1 32-bit word to be passed through S1 box
+ * @param[in/out] x2 32-bit word to be passed through S1 box
+ */
+static inline void S1_box_2(uint32_t *x1, uint32_t *x2)
+{
+#ifdef NO_AESNI
+        /* reuse S1_box() for NO_AESNI path */
+        *x1 = S1_box(*x1);
+        *x2 = S1_box(*x2);
+#else
+        const uint8x16_t m_zero = vdupq_n_u8(0);
+        uint32x4_t m1, m2;
+        uint8x16_t r1, r2;
+
+        m1 = vdupq_n_u32(*x1);
+        r1 = vaeseq_u8(vreinterpretq_u8_u32(m1), m_zero);
+        r1 = vaesmcq_u8(r1);
+        m2 = vdupq_n_u32(*x2);
+        r2 = vaeseq_u8(vreinterpretq_u8_u32(m2), m_zero);
+        r2 = vaesmcq_u8(r2);
+
+        *x1 =
vgetq_lane_u32(vreinterpretq_u32_u8(r1), 0); + *x2 = vgetq_lane_u32(vreinterpretq_u32_u8(r2), 0); +#endif +} + +/** + * @brief Sbox S1 maps a 4x32bit input to a 4x32bit output + * + * @param[in] x vector of 4 32-bit words to be passed through S1 box + * + * @return 4x32-bits from \a x transformed through S1 box + */ +static inline uint32x4_t S1_box_4(const uint32x4_t x) +{ +#ifdef NO_AESNI + union xmm_reg key, v, vt; + + key.qword[0] = key.qword[1] = 0; + + /* + * - Broadcast 32-bit word across XMM + * - Perform AES operations + */ + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 0); + emulate_AESENC(&vt, &key); + v.dword[0] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 1); + emulate_AESENC(&vt, &key); + v.dword[1] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 2); + emulate_AESENC(&vt, &key); + v.dword[2] = vt.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = vgetq_lane_u32(x, 3); + emulate_AESENC(&vt, &key); + v.dword[3] = vt.dword[0]; + + return vld1q_u32(&v.dword[0]); +#else + const uint8x16_t m_zero = vdupq_n_u8(0); + uint8x16_t m1, m2, m3, m4; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 1))); + m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 2))); + m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x, 3))); + + m1 = vaeseq_u8(m1, m_zero); + m1 = vaesmcq_u8(m1); + m2 = vaeseq_u8(m2, m_zero); + m2 = vaesmcq_u8(m2); + m3 = vaeseq_u8(m3, m_zero); + m3 = vaesmcq_u8(m3); + m4 = vaeseq_u8(m4, m_zero); + m4 = vaesmcq_u8(m4); + + /* + * Put results of AES operations back into + * two vectors of 32-bit words + * + * First step: + * r1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ] + * r2 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ] + */ + r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4)); + r1 = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + /* + * The last step: + * r1 = [ 0-63 m1 | 0-63 m3 ] => + * [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ] + */ + + return r1; +#endif +} + +/** + * @brief Sbox S2 maps a 32-bit input to a 32-bit output + * + * @param[in] x 32-bit word to be passed through S2 box + * + * @return \a x transformed through S2 box + */ +static inline uint32_t S2_box(const uint32_t x) +{ +#ifdef NO_AESNI + /* Perform invSR(SQ(x)) transform */ + const uint32x4_t par_lut = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(vdupq_n_u32(x)), + snow3g_invSR_SQ)); + const uint32_t new_x = vgetq_lane_u32(par_lut, 0); + union xmm_reg key, v, v_fixup; + + key.qword[0] = key.qword[1] = 0; + + v.dword[0] = v.dword[1] = + v.dword[2] = v.dword[3] = new_x; + + v_fixup = v; + + emulate_AESENC(&v, &key); + emulate_AESENCLAST(&v_fixup, &key); + + const uint8x16_t ret_mixc = vreinterpretq_u8_u32( + vld1q_u32(&v.dword[0])); + const uint8x16_t ret_nomixc = vreinterpretq_u8_u32( + vld1q_u32(&v_fixup.dword[0])); + + return s2_mixc_fixup_scalar(ret_nomixc, ret_mixc); +#else + +#ifndef SAFE_LOOKUP + const uint8_t *w3 = (const uint8_t *)&snow3g_table_S2[x & 0xff]; + const uint8_t *w1 = (const uint8_t *)&snow3g_table_S2[(x >> 16) & 0xff]; + const uint8_t *w2 = (const uint8_t *)&snow3g_table_S2[(x >> 8) & 0xff]; + const uint8_t *w0 = (const uint8_t *)&snow3g_table_S2[(x >> 24) & 0xff]; + + return *((const uint32_t 
*)&w3[3]) ^ + *((const uint32_t *)&w1[1]) ^ + *((const uint32_t *)&w2[2]) ^ + *((const uint32_t *)&w0[0]); + +#else + uint32x4_t par_lut; + uint8x16_t m, key, ret_nomixc, ret_mixc; + + /* Perform invSR(SQ(x)) transform */ + par_lut = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(vdupq_n_u32(x)), + snow3g_invSR_SQ)); + + m = vreinterpretq_u8_u32(vdupq_n_u32((vgetq_lane_u32(par_lut, 0)))); + key = vdupq_n_u8(0); + + ret_nomixc = vaeseq_u8(m, key); + ret_mixc = vaesmcq_u8(ret_nomixc); + + return s2_mixc_fixup_scalar(ret_nomixc, ret_mixc); +#endif + +#endif +} + +/** + * @brief Sbox S2 maps a 2x32bit input to a 2x32bit output + * + * @param[in/out] x1 32-bit word to be passed through S2 box + * @param[in/out] x2 32-bit word to be passed through S2 box + */ +static inline void S2_box_2(uint32_t *x1, uint32_t *x2) +{ +#if defined(NO_AESNI) || !defined(SAFE_LOOKUP) + *x1 = S2_box(*x1); + *x2 = S2_box(*x2); +#else + /* Perform invSR(SQ(x)) transform through a lookup table */ + const uint8x16_t m_zero = vdupq_n_u8(0); + uint32x4_t x_vec, ret_nomixc, ret_mixc, res; + + x_vec = vdupq_n_u32(x1[0]); + x_vec = vsetq_lane_u32(x2[0], x_vec, 1); + + const uint32x4_t new_x = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(x_vec), + snow3g_invSR_SQ)); + uint8x16_t m1, m2, f1, f2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 1))); + + f1 = vaeseq_u8(m1, m_zero); // no_mixc + m1 = vaesmcq_u8(f1); + f2 = vaeseq_u8(m2, m_zero); + m2 = vaesmcq_u8(f2); + /* + * Put results of AES operations back into one vector + * for further fix up + */ + ret_nomixc = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2)); + ret_mixc = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + + res = s2_mixc_fixup_4(vreinterpretq_u8_u32(ret_nomixc), vreinterpretq_u8_u32(ret_mixc)); + + *x1 = vgetq_lane_u32(res, 0); + *x2 = vgetq_lane_u32(res, 1); +#endif +} + +/** + * @brief Sbox S2 maps a 4x32bit input to a 4x32bit output + * + * @param[in] x vector of 4 32-bit words to be passed through S2 box + * + * @return 4x32-bits from \a x transformed through S2 box + */ +static inline uint32x4_t S2_box_4(const uint32x4_t x) +{ + /* Perform invSR(SQ(x)) transform through a lookup table */ + const uint32x4_t new_x = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(x), + snow3g_invSR_SQ)); + + /* use AESNI operations for the rest of the S2 box */ +#ifdef NO_AESNI + union xmm_reg key, v, f; + union xmm_reg vt, ft; + + key.qword[0] = key.qword[1] = 0; + + /* + * - Broadcast 32-bit word across XMM and + * perform AES operations + * - Save result 32-bit words in v and f vectors. 
+ * 'f' is used for fix-up of mixed columns only + */ + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 0); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[0] = vt.dword[0]; + f.dword[0] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 1); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[1] = vt.dword[0]; + f.dword[1] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 2); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[2] = vt.dword[0]; + f.dword[2] = ft.dword[0]; + + vt.dword[0] = vt.dword[1] = vt.dword[2] = vt.dword[3] = + vgetq_lane_u32(new_x, 3); + ft = vt; + emulate_AESENC(&vt, &key); + emulate_AESENCLAST(&ft, &key); + v.dword[3] = vt.dword[0]; + f.dword[3] = ft.dword[0]; + + return s2_mixc_fixup_4(vreinterpretq_u8_u32(vld1q_u32(&f.dword[0])), + vreinterpretq_u8_u32(vld1q_u32(&v.dword[0]))); +#else + const uint8x16_t zero = vdupq_n_u8(0); + uint8x16_t m1, m2, m3, m4, f1, f2, f3, f4, mixc, no_mixc; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 1))); + m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 2))); + m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(new_x, 3))); + + f1 = vaeseq_u8(m1, zero); // no_mixc + m1 = vaesmcq_u8(f1); + f2 = vaeseq_u8(m2, zero); + m2 = vaesmcq_u8(f2); + f3 = vaeseq_u8(m3, zero); + m3 = vaesmcq_u8(f3); + f4 = vaeseq_u8(m4, zero); + m4 = vaesmcq_u8(f4); + + /* + * Put results of AES operations back into + * two vectors of 32-bit words + * + * First step: + * m1 = [ 0-31 m1 | 0-31 m2 | 32-63 m1 | 32-63 m2 ] + * m3 = [ 0-31 m3 | 0-31 m4 | 32-63 m3 | 32-63 m4 ] + */ + /* + * The last step: + * m1 = [ 0-63 m1 | 0-63 m3 ] => + * [ 0-31 m1 | 0-31 m2 | 0-31 m3 | 0-31 m4 ] + * f1 = [ 0-63 f1 | 0-63 f3 ] => + * [ 0-31 f1 | 0-31 f2 | 0-31 f3 | 0-31 f4 ] + */ + r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4)); + mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + + r1 = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2)); + r2 = vzip1q_u32(vreinterpretq_u32_u8(f3), vreinterpretq_u32_u8(f4)); + no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1), + vreinterpretq_u64_u32(r2))); + + return s2_mixc_fixup_4(no_mixc, mixc); +#endif +} + +/** + * @brief Sbox S2 maps a 2x4x32bit input to a 2x4x32bit output + * + * @param[in/out] in_out1 vector of 4 32-bit words to be passed through S2 box + * @param[in/out] in_out2 vector of 4 32-bit words to be passed through S2 box + */ +static inline void S2_box_2x4(uint32x4_t *in_out1, uint32x4_t *in_out2) +{ +#ifdef NO_AESNI + *in_out1 = S2_box_4(*in_out1); + *in_out2 = S2_box_4(*in_out2); +#else + /* + * Perform invSR(SQ(x)) transform through a lookup table and + * use AES operations for the rest of the S2 box + */ + const uint8x16_t zero = vdupq_n_u8(0); + const uint32x4_t x1 = vreinterpretq_u32_u8( + lut16x8b_256(vreinterpretq_u8_u32(*in_out1), + snow3g_invSR_SQ)); + uint8x16_t m1, m2, m3, m4, f1, f2, f3, f4; + uint8x16_t m5, m6, m7, m8, f5, f6, f7, f8; + uint8x16_t mixc, no_mixc; + uint32x4_t r1, r2; + + m1 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 0))); + m2 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 
1)));
+        m3 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 2)));
+        m4 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x1, 3)));
+
+        /* start shuffling next 128 bits of data */
+        const uint32x4_t x2 = vreinterpretq_u32_u8(
+                lut16x8b_256(vreinterpretq_u8_u32(*in_out2),
+                             snow3g_invSR_SQ));
+
+        m5 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 0)));
+        m6 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 1)));
+        m7 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 2)));
+        m8 = vreinterpretq_u8_u32(vdupq_n_u32(vgetq_lane_u32(x2, 3)));
+
+        f1 = vaeseq_u8(m1, zero); // no_mixc
+        m1 = vaesmcq_u8(f1);
+        f2 = vaeseq_u8(m2, zero);
+        m2 = vaesmcq_u8(f2);
+        f3 = vaeseq_u8(m3, zero);
+        m3 = vaesmcq_u8(f3);
+        f4 = vaeseq_u8(m4, zero);
+        m4 = vaesmcq_u8(f4);
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(m1), vreinterpretq_u32_u8(m2));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(m3), vreinterpretq_u32_u8(m4));
+        mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                               vreinterpretq_u64_u32(r2)));
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(f1), vreinterpretq_u32_u8(f2));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(f3), vreinterpretq_u32_u8(f4));
+        no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                                  vreinterpretq_u64_u32(r2)));
+
+        *in_out1 = s2_mixc_fixup_4(no_mixc, mixc);
+
+        /* start encrypting next 128 bits */
+        f5 = vaeseq_u8(m5, zero); // no_mixc
+        m5 = vaesmcq_u8(f5);
+        f6 = vaeseq_u8(m6, zero);
+        m6 = vaesmcq_u8(f6);
+        f7 = vaeseq_u8(m7, zero);
+        m7 = vaesmcq_u8(f7);
+        f8 = vaeseq_u8(m8, zero);
+        m8 = vaesmcq_u8(f8);
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(m5), vreinterpretq_u32_u8(m6));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(m7), vreinterpretq_u32_u8(m8));
+        mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                               vreinterpretq_u64_u32(r2)));
+
+        r1 = vzip1q_u32(vreinterpretq_u32_u8(f5), vreinterpretq_u32_u8(f6));
+        r2 = vzip1q_u32(vreinterpretq_u32_u8(f7), vreinterpretq_u32_u8(f8));
+        no_mixc = vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u32(r1),
+                                                  vreinterpretq_u64_u32(r2)));
+
+        *in_out2 = s2_mixc_fixup_4(no_mixc, mixc);
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on 4 8-bit values at the same time
+ *
+ * The function picks the right byte from the register to run the MULalpha
+ * operation on. MULalpha is implemented through 8 16-byte tables, looked up
+ * with the TBL instruction (vqtbl1q_u8, the NEON counterpart of x86 pshufb).
+ * This approach is possible because the MULalpha operation is linear
+ * over GF(2).
+ * Final operation result is calculated via byte re-arrangement on
+ * the lookup results and an XOR operation.
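The nibble-split trick works because a GF(2)-linear byte map f satisfies f(x) = f(x & 0xf0) ^ f(x & 0x0f), so one 256-entry table can be replaced by two 16-entry tables per result byte. A self-contained sketch of that identity (editor-added; the sample map is AES xtime, a stand-in linear map, not the SNOW3G tables):

```c
#include <assert.h>
#include <stdint.h>

/* a sample GF(2)-linear byte map: multiply by x modulo 0x11b (AES xtime) */
static uint32_t f(const uint8_t v)
{
        return (uint32_t)(((v << 1) ^ ((v & 0x80) ? 0x1b : 0)) & 0xff);
}

int main(void)
{
        uint32_t lo[16], hi[16];

        /* build the nibble tables from the map itself */
        for (int i = 0; i < 16; i++) {
                lo[i] = f((uint8_t)i);        /* f(low nibble) */
                hi[i] = f((uint8_t)(i << 4)); /* f(high nibble << 4) */
        }

        /* two 16-entry lookups + XOR reproduce the full 256-entry map */
        for (int x = 0; x < 256; x++)
                assert((hi[x >> 4] ^ lo[x & 0x0f]) == f((uint8_t)x));

        return 0;
}
```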
+ *
+ * @param [in] L0 4 x 32-bit LFSR[0]
+ * @return 4 x 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+uint32x4_t MULa_4(const uint32x4_t L0)
+{
+#ifdef SAFE_LOOKUP
+        const uint8_t gather_clear_mask[] = {
+                0x03,0x07,0x0b,0x0f,0xff,0xff,0xff,0xff,
+                0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        };
+        const uint8x16_t low_nibble_mask = vdupq_n_u8(0x0f);
+        const uint8x16_t clear_mask = vld1q_u8(gather_clear_mask);
+        uint8x16_t th, tl, b0, b1, b2, b3;
+
+        th = vqtbl1q_u8(vreinterpretq_u8_u32(L0), clear_mask);
+
+        tl = th & low_nibble_mask;
+        b0 = vld1q_u8(snow3g_MULa_byte0_low);
+        b1 = vld1q_u8(snow3g_MULa_byte1_low);
+        b2 = vld1q_u8(snow3g_MULa_byte2_low);
+        b3 = vld1q_u8(snow3g_MULa_byte3_low);
+
+        b0 = vqtbl1q_u8(b0, tl);
+        b1 = vqtbl1q_u8(b1, tl);
+        b2 = vqtbl1q_u8(b2, tl);
+        b3 = vqtbl1q_u8(b3, tl);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        tl = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        b0 = vld1q_u8(snow3g_MULa_byte0_hi);
+        b1 = vld1q_u8(snow3g_MULa_byte1_hi);
+        b2 = vld1q_u8(snow3g_MULa_byte2_hi);
+        b3 = vld1q_u8(snow3g_MULa_byte3_hi);
+
+        th = vshrq_n_u8(th, 4) & low_nibble_mask;
+
+        b0 = vqtbl1q_u8(b0, th);
+        b1 = vqtbl1q_u8(b1, th);
+        b2 = vqtbl1q_u8(b2, th);
+        b3 = vqtbl1q_u8(b3, th);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        th = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        return vreinterpretq_u32_u8(th ^ tl);
+#else
+        const uint8_t L0IDX0 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 3);
+        const uint8_t L0IDX1 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 7);
+        const uint8_t L0IDX2 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 11);
+        const uint8_t L0IDX3 = vgetq_lane_u8(vreinterpretq_u8_u32(L0), 15);
+
+        uint32x4_t ret;
+        uint32_t x0, x1, x2, x3;
+
+        x0 = snow3g_table_A_mul[L0IDX0];
+        x1 = snow3g_table_A_mul[L0IDX1];
+        x2 = snow3g_table_A_mul[L0IDX2];
+        x3 = snow3g_table_A_mul[L0IDX3];
+
+        ret = vdupq_n_u32(x0);
+        ret = vsetq_lane_u32(x1, ret, 1);
+        ret = vsetq_lane_u32(x2, ret, 2);
+        ret = vsetq_lane_u32(x3, ret, 3);
+        return ret;
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on 2 8-bit values at the same time
+ *
+ * @param [in/out] L0_1 On input, 32-bit LFSR[0].
+ *                 On output, 32-bit MULalpha(L0 >> 24)
+ * @param [in/out] L0_2 On input, 32-bit LFSR[0].
+ *                 On output, 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+void MULa_2(uint32_t *L0_1, uint32_t *L0_2)
+{
+#ifdef SAFE_LOOKUP
+        uint32x4_t in, out;
+
+        in = vdupq_n_u32(*L0_1);
+        in = vsetq_lane_u32(*L0_2, in, 1);
+        out = MULa_4(in);
+
+        *L0_1 = vgetq_lane_u32(out, 0);
+        *L0_2 = vgetq_lane_u32(out, 1);
+#else
+        *L0_1 = snow3g_table_A_mul[*L0_1 >> 24];
+        *L0_2 = snow3g_table_A_mul[*L0_2 >> 24];
+#endif
+}
+
+/**
+ * @brief MULalpha SNOW3G operation on an 8-bit value.
+ *
+ * @param [in] L0 32-bit LFSR[0]
+ * @return 32-bit MULalpha(L0 >> 24)
+ */
+static inline
+uint32_t MULa(const uint32_t L0)
+{
+#ifdef SAFE_LOOKUP
+        const uint32x4_t L0_vec = vdupq_n_u32(L0);
+
+        return vgetq_lane_u32(MULa_4(L0_vec), 0);
+#else
+        return snow3g_table_A_mul[L0 >> 24];
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on 4 8-bit values at the same time
+ *
+ * The function picks the right byte from the register to run the DIValpha
+ * operation on. DIValpha is implemented through 8 16-byte tables, looked up
+ * with the TBL instruction (vqtbl1q_u8, the NEON counterpart of x86 pshufb).
+ * This approach is possible because the DIValpha operation is linear
+ * over GF(2).
+ * Final operation result is calculated via byte re-arrangement on
+ * the lookup results and an XOR operation.
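For reference, snow3g_table_A_mul and snow3g_table_A_div precompute the MULalpha and DIValpha maps, which the SNOW 3G specification defines in terms of the MULx/MULxPOW primitives; a plain-C sketch of those primitives follows (editor-added; the actual table contents ship in snow3g_tables.h):

```c
#include <stdint.h>

/* MULx from the SNOW 3G spec: multiply by x in GF(2^8),
 * reducing by the byte polynomial 'c' when the MSB is set */
static uint8_t MULx(const uint8_t v, const uint8_t c)
{
        return (uint8_t)((v << 1) ^ ((v & 0x80) ? c : 0));
}

/* MULxPOW: apply MULx 'i' times (iterative form of the spec recursion) */
static uint8_t MULxPOW(uint8_t v, unsigned i, const uint8_t c)
{
        while (i--)
                v = MULx(v, c);
        return v;
}
```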
+ *
+ * @param [in] L11 4 x 32-bit LFSR[11]
+ * @return 4 x 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+uint32x4_t DIVa_4(const uint32x4_t L11)
+{
+#ifdef SAFE_LOOKUP
+        const uint8_t gather_clear_mask[] = {
+                0x00,0x04,0x08,0x0c,0xff,0xff,0xff,0xff,
+                0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
+        };
+        const uint8x16_t low_nibble_mask = vdupq_n_u8(0x0f);
+        const uint8x16_t clear_mask = vld1q_u8(gather_clear_mask);
+        uint8x16_t th, tl, b0, b1, b2, b3;
+
+        th = vqtbl1q_u8(vreinterpretq_u8_u32(L11), clear_mask);
+
+        tl = th & low_nibble_mask;
+        b0 = vld1q_u8(snow3g_DIVa_byte0_low);
+        b1 = vld1q_u8(snow3g_DIVa_byte1_low);
+        b2 = vld1q_u8(snow3g_DIVa_byte2_low);
+        b3 = vld1q_u8(snow3g_DIVa_byte3_low);
+
+        b0 = vqtbl1q_u8(b0, tl);
+        b1 = vqtbl1q_u8(b1, tl);
+        b2 = vqtbl1q_u8(b2, tl);
+        b3 = vqtbl1q_u8(b3, tl);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        tl = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        b0 = vld1q_u8(snow3g_DIVa_byte0_hi);
+        b1 = vld1q_u8(snow3g_DIVa_byte1_hi);
+        b2 = vld1q_u8(snow3g_DIVa_byte2_hi);
+        b3 = vld1q_u8(snow3g_DIVa_byte3_hi);
+
+        th = vshrq_n_u8(th, 4) & low_nibble_mask;
+
+        b0 = vqtbl1q_u8(b0, th);
+        b1 = vqtbl1q_u8(b1, th);
+        b2 = vqtbl1q_u8(b2, th);
+        b3 = vqtbl1q_u8(b3, th);
+
+        b0 = vzip1q_u8(b0, b1);
+        b2 = vzip1q_u8(b2, b3);
+        th = vreinterpretq_u8_u16(vzip1q_u16(vreinterpretq_u16_u8(b0),
+                                             vreinterpretq_u16_u8(b2)));
+
+        return vreinterpretq_u32_u8(th ^ tl);
+#else
+        const uint8_t L11IDX0 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 0);
+        const uint8_t L11IDX1 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 4);
+        const uint8_t L11IDX2 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 8);
+        const uint8_t L11IDX3 = vgetq_lane_u8(vreinterpretq_u8_u32(L11), 12);
+
+        uint32x4_t ret;
+        uint32_t x0, x1, x2, x3;
+
+        x0 = snow3g_table_A_div[L11IDX0];
+        x1 = snow3g_table_A_div[L11IDX1];
+        x2 = snow3g_table_A_div[L11IDX2];
+        x3 = snow3g_table_A_div[L11IDX3];
+
+        ret = vdupq_n_u32(x0);
+        ret = vsetq_lane_u32(x1, ret, 1);
+        ret = vsetq_lane_u32(x2, ret, 2);
+        ret = vsetq_lane_u32(x3, ret, 3);
+
+        return ret;
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on 2 8-bit values at the same time
+ *
+ * @param [in/out] L11_1 On input, 32-bit LFSR[11].
+ *                 On output, 32-bit DIValpha(L11 & 0xff)
+ * @param [in/out] L11_2 On input, 32-bit LFSR[11].
+ *                 On output, 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+void DIVa_2(uint32_t *L11_1, uint32_t *L11_2)
+{
+#ifdef SAFE_LOOKUP
+        uint32x4_t in, out;
+
+        in = vdupq_n_u32(*L11_1);
+        in = vsetq_lane_u32(*L11_2, in, 1);
+        out = DIVa_4(in);
+
+        *L11_1 = vgetq_lane_u32(out, 0);
+        *L11_2 = vgetq_lane_u32(out, 1);
+#else
+        *L11_1 = snow3g_table_A_div[*L11_1 & 0xff];
+        *L11_2 = snow3g_table_A_div[*L11_2 & 0xff];
+#endif
+}
+
+/**
+ * @brief DIValpha SNOW3G operation on an 8-bit value.
+ *
+ * @param [in] L11 32-bit LFSR[11]
+ * @return 32-bit DIValpha(L11 & 0xff)
+ */
+static inline
+uint32_t DIVa(const uint32_t L11)
+{
+#ifdef SAFE_LOOKUP
+        const uint32x4_t L11_vec = vdupq_n_u32(L11);
+
+        return vgetq_lane_u32(DIVa_4(L11_vec), 0);
+#else
+        return snow3g_table_A_div[L11 & 0xff];
+#endif
+}
+
+/**
+ * @brief ClockFSM function as defined in SNOW3G standard
+ *
+ * The FSM has 2 input words S5 and S15 from the LFSR and
+ * produces a 32-bit output word F.
+ * + * @param[in/out] pCtx context structure + */ +static inline uint32_t ClockFSM_1(snow3gKeyState1_t *pCtx) +{ + const uint32_t F = (pCtx->LFSR_S[15] + pCtx->FSM_R1) ^ pCtx->FSM_R2; + const uint32_t R = (pCtx->FSM_R3 ^ pCtx->LFSR_S[5]) + pCtx->FSM_R2; + + pCtx->FSM_R3 = S2_box(pCtx->FSM_R2); + pCtx->FSM_R2 = S1_box(pCtx->FSM_R1); + pCtx->FSM_R1 = R; + + return F; +} + +/** + * @brief ClockLFSR function as defined in SNOW3G standard + * @param[in/out] pCtx context structure + */ +static inline void ClockLFSR_1(snow3gKeyState1_t *pCtx) +{ + const uint32_t S0 = pCtx->LFSR_S[0]; + const uint32_t S11 = pCtx->LFSR_S[11]; + const uint32_t V = pCtx->LFSR_S[2] ^ + MULa(S0) ^ + DIVa(S11) ^ + (S0 << 8) ^ + (S11 >> 8); + unsigned i; + + /* LFSR array shift by 1 position */ + for (i = 0; i < 15; i++) + pCtx->LFSR_S[i] = pCtx->LFSR_S[i + 1]; + + pCtx->LFSR_S[15] = V; +} + +/** + * @brief Initializes the key schedule for 1 buffer for SNOW3G f8/f9. + * + * @param[in/out] pCtx Context where the scheduled keys are stored + * @param[in] pKeySched Key schedule + * @param[in] pIV IV + */ +static inline void +snow3gStateInitialize_1(snow3gKeyState1_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV) +{ + uint32_t FSM1, FSM2, FSM3; + const uint32_t *pIV32 = pIV; + int i; + + /* LFSR initialisation */ + for (i = 0; i < 4; i++) { + const uint32_t K = pKeySched->k[i]; + const uint32_t L = ~K; + + pCtx->LFSR_S[i + 4] = K; + pCtx->LFSR_S[i + 12] = K; + pCtx->LFSR_S[i + 0] = L; + pCtx->LFSR_S[i + 8] = L; + } + + pCtx->LFSR_S[15] ^= BSWAP32(pIV32[3]); + pCtx->LFSR_S[12] ^= BSWAP32(pIV32[2]); + pCtx->LFSR_S[10] ^= BSWAP32(pIV32[1]); + pCtx->LFSR_S[9] ^= BSWAP32(pIV32[0]); + + /* FSM initialization */ + FSM2 = 0; + FSM3 = 0; + FSM1 = 0; + + for (i = 0; i < 16; i++) { + const uint32_t L0 = pCtx->LFSR_S[0]; + const uint32_t L1 = pCtx->LFSR_S[1]; + const uint32_t L11 = pCtx->LFSR_S[11]; + const uint32_t L12 = pCtx->LFSR_S[12]; + uint32_t MULa_L0 = L0; + uint32_t MULa_L1 = L1; + uint32_t DIVa_L11 = L11; + uint32_t DIVa_L12 = L12; + + MULa_2(&MULa_L0, &MULa_L1); + DIVa_2(&DIVa_L11, &DIVa_L12); + + /* clock FSM + clock LFSR + clockFSM + clock LFSR */ + const uint32_t F0 = + (pCtx->LFSR_S[15] + FSM1) ^ FSM2; /* (s15 + R1) ^ R2 */ + + const uint32_t V0 = + pCtx->LFSR_S[2] ^ + MULa_L0 ^ /* MUL(s0,0 ) */ + DIVa_L11 ^ /* DIV(s11,3 )*/ + (L0 << 8) ^ /* (s0,1 || s0,2 || s0,3 || 0x00) */ + (L11 >> 8) ^ /* (0x00 || s11,0 || s11,1 || s11,2 ) */ + F0; + + const uint32_t R0 = + (FSM3 ^ pCtx->LFSR_S[5]) + FSM2; /* R2 + (R3 ^ s5 ) */ + + uint32_t s1_box_step1 = FSM1; + uint32_t s1_box_step2 = R0; + + S1_box_2(&s1_box_step1, &s1_box_step2); + + uint32_t s2_box_step1 = FSM2; + uint32_t s2_box_step2 = s1_box_step1; /* S1_box(R0) */ + + S2_box_2(&s2_box_step1, &s2_box_step2); + + FSM1 = (s2_box_step1 ^ pCtx->LFSR_S[6]) + s1_box_step1; + + const uint32_t F1 = (V0 + R0) ^ s1_box_step1; + + const uint32_t V1 = pCtx->LFSR_S[3] ^ + MULa_L1 ^ + DIVa_L12 ^ + (L1 << 8) ^ (L12 >> 8) ^ F1; + + FSM2 = s1_box_step2; + FSM3 = s2_box_step2; + + /* shift LFSR twice */ + ShiftTwiceLFSR_1(pCtx); + + pCtx->LFSR_S[14] = V0; + pCtx->LFSR_S[15] = V1; + } + + /* set FSM into scheduling structure */ + pCtx->FSM_R3 = FSM3; + pCtx->FSM_R2 = FSM2; + pCtx->FSM_R1 = FSM1; +} + +/** + * @brief Generates 5 words of key stream used in the initial stages of F9. 
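As a usage sketch (editor-added; it mirrors what the F8/F9ентry points later in this file do, using the single-buffer helpers defined in this header), raw keystream is drawn from a scalar state like so, discarding the first clocked word after initialization:

```c
/* Editor's sketch: raw keystream generation with the scalar state.
 * The 32-byte output size is illustrative. */
static void snow3g_keystream_example(const snow3g_key_schedule_t *pKeySched,
                                     const void *pIV, uint64_t out[4])
{
        snow3gKeyState1_t ctx;
        int i;

        /* load LFSR/FSM from key schedule and IV, run init rounds */
        snow3gStateInitialize_1(&ctx, pKeySched, pIV);

        /* clock FSM and LFSR once, discard the first keystream word */
        (void) snow3g_keystream_1_4(&ctx);

        /* then draw 8 bytes of keystream per call */
        for (i = 0; i < 4; i++)
                out[i] = snow3g_keystream_1_8(&ctx);
}
```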
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in/out] pKeyStream Pointer to the generated keystream
+ */
+static inline void snow3g_f9_keystream_words(snow3gKeyState1_t *pCtx,
+                                             uint32_t *pKeyStream)
+{
+        int i;
+
+        (void) ClockFSM_1(pCtx);
+        ClockLFSR_1(pCtx);
+
+        for (i = 0; i < 5; i++) {
+                pKeyStream[i] = ClockFSM_1(pCtx) ^ pCtx->LFSR_S[0];
+                ClockLFSR_1(pCtx);
+        }
+}
+
+/**
+ * @brief LFSR array shift by one (4 lanes)
+ * @param[in] pCtx Context where the scheduled keys are stored
+ */
+static inline void ShiftLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        pCtx->iLFSR_X = (pCtx->iLFSR_X + 1) & 15;
+}
+
+/**
+ * @brief GF2 modular reduction 128-bits to 64-bits
+ *
+ * SNOW3GCONSTANT/0x1b reduction polynomial applied.
+ *
+ * @param[in] m 128-bit input
+ * @return 128-bit output (only least significant 64-bits are valid)
+ */
+static inline poly64x2_t reduce128_to_64(const poly64x2_t m)
+{
+        const poly64x1_t p = vdup_n_p64((poly64_t)SNOW3GCONSTANT);
+        poly64x2_t x, t;
+
+        /* start reduction */
+        /* top 64-bits of m x p */
+        x = (poly64x2_t)vmull_p64(vgetq_lane_p64(m, 1), (poly64_t)p);
+        t = m ^ x;
+
+        /*
+         * repeat multiply and xor in case
+         * 'x' product was bigger than 64 bits
+         */
+        x = (poly64x2_t)vmull_p64(vgetq_lane_p64(x, 1), (poly64_t)p);
+        t = t ^ x;
+
+        return t;
+}
+
+/**
+ * @brief GF2 modular multiplication 64-bits x 64-bits with reduction
+ *
+ * Implements MUL64 function from the standard.
+ * SNOW3GCONSTANT/0x1b reduction polynomial applied.
+ *
+ * @param[in] a 64-bit input
+ * @param[in] b 64-bit input
+ * @return 64-bit output
+ */
+static inline uint64_t multiply_and_reduce64(uint64_t a, uint64_t b)
+{
+        poly64x2_t m;
+
+        m = (poly64x2_t)vmull_p64(a, b); /* m = a x b */
+
+        m = reduce128_to_64(m); /* reduction */
+
+        return vgetq_lane_u64(vreinterpretq_u64_p64(m), 0);
+}
+
+/**
+ * @brief ClockLFSR sub-function as defined in SNOW3G standard (4 lanes)
+ *
+ * @param[in] L0 LFSR[0]
+ * @param[in] L11 LFSR[11]
+ * @return table_Alpha_div[LFSR[11] & 0xff] ^ table_Alpha_mul[LFSR[0] >> 24]
+ */
+static inline uint32x4_t C0_C11_4(const uint32x4_t L0, const uint32x4_t L11)
+{
+        const uint32x4_t SL11 = DIVa_4(L11);
+        const uint32x4_t SL0 = MULa_4(L0);
+
+        return SL11 ^ SL0;
+}
+
+/**
+ * @brief ClockLFSR function as defined in SNOW3G standard (4 lanes)
+ *
+ * S = table_Alpha_div[LFSR[11] & 0xff]
+ *     ^ table_Alpha_mul[LFSR[0] >> 24]
+ *     ^ LFSR[2] ^ LFSR[0] << 8 ^ LFSR[11] >> 8
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ */
+static inline void ClockLFSR_4(snow3gKeyState4_t *pCtx)
+{
+        uint32x4_t S, T, U;
+
+        U = pCtx->LFSR_X[pCtx->iLFSR_X];
+        S = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) & 15];
+        const uint32x4_t X2 = C0_C11_4(U, S);
+
+        T = vshlq_n_u32(U, 8);
+        S = vshrq_n_u32(S, 8);
+        U = T ^ pCtx->LFSR_X[(pCtx->iLFSR_X + 2) & 15];
+        ShiftLFSR_4(pCtx);
+
+        S = S ^ U;
+        S = S ^ X2;
+        pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = S;
+}
+
+/**
+ * @brief ClockFSM function as defined in SNOW3G standard
+ *
+ * It operates on 4 packets/lanes at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 x 4 bytes of key stream
+ */
+static inline uint32x4_t ClockFSM_4(snow3gKeyState4_t *pCtx)
+{
+        const uint32_t iLFSR_X = pCtx->iLFSR_X;
+        const uint32x4_t F =
+                pCtx->LFSR_X[(iLFSR_X + 15) & 15] + pCtx->FSM_X[0];
+        const uint32x4_t R =
+                vaddq_u32(pCtx->LFSR_X[(iLFSR_X + 5) & 15] ^ pCtx->FSM_X[2],
+                          pCtx->FSM_X[1]);
+
+        const uint32x4_t ret = F ^ pCtx->FSM_X[1];
+
+        pCtx->FSM_X[2] = S2_box_4(pCtx->FSM_X[1]);
+        pCtx->FSM_X[1] = S1_box_4(pCtx->FSM_X[0]);
+        pCtx->FSM_X[0] = R;
+
+        return ret;
+}
+
+/**
+ * @brief Generates 4 bytes of key stream 1 buffer at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 bytes of key stream
+ */
+static inline uint32_t snow3g_keystream_1_4(snow3gKeyState1_t *pCtx)
+{
+        const uint32_t F = ClockFSM_1(pCtx);
+        const uint32_t ks = F ^ pCtx->LFSR_S[0];
+
+        ClockLFSR_1(pCtx);
+        return ks;
+}
+
+/**
+ * @brief Generates 8 bytes of key stream for 1 buffer at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 8 bytes of a key stream
+ */
+static inline uint64_t snow3g_keystream_1_8(snow3gKeyState1_t *pCtx)
+{
+        /*
+         * Merged clock FSM + clock LFSR + clock FSM + clock LFSR
+         * in order to avoid redundant processing and
+         * to reduce instruction dependencies
+         */
+        const uint32_t L0 = pCtx->LFSR_S[0];
+        const uint32_t L1 = pCtx->LFSR_S[1];
+        const uint32_t L11 = pCtx->LFSR_S[11];
+        const uint32_t L12 = pCtx->LFSR_S[12];
+        uint32_t MULa_L0 = L0;
+        uint32_t MULa_L1 = L1;
+        uint32_t DIVa_L11 = L11;
+        uint32_t DIVa_L12 = L12;
+
+        MULa_2(&MULa_L0, &MULa_L1);
+        DIVa_2(&DIVa_L11, &DIVa_L12);
+
+        const uint32_t V0 =
+                pCtx->LFSR_S[2] ^
+                MULa_L0 ^
+                DIVa_L11 ^
+                (L0 << 8) ^
+                (L11 >> 8);
+
+        const uint32_t V1 =
+                pCtx->LFSR_S[3] ^
+                MULa_L1 ^
+                DIVa_L12 ^
+                (L1 << 8) ^
+                (L12 >> 8);
+
+        const uint32_t F0 =
+                (pCtx->LFSR_S[15] + pCtx->FSM_R1) ^ L0 ^ pCtx->FSM_R2;
+        const uint32_t R0 =
+                (pCtx->FSM_R3 ^ pCtx->LFSR_S[5]) + pCtx->FSM_R2;
+
+        uint32_t s1_box_step1 = pCtx->FSM_R1;
+        uint32_t s1_box_step2 = R0;
+
+        S1_box_2(&s1_box_step1, &s1_box_step2);
+
+        uint32_t s2_box_step1 = pCtx->FSM_R2;
+        uint32_t s2_box_step2 = s1_box_step1;
+
+        S2_box_2(&s2_box_step1, &s2_box_step2);
+
+        /*
+         * At this stage FSM_R mapping is as follows:
+         * FSM_R2 = s1_box_step1
+         * FSM_R3 = s2_box_step1
+         */
+        const uint32_t F1 = (V0 + R0) ^ L1 ^ s1_box_step1;
+
+        pCtx->FSM_R3 = s2_box_step2;
+        pCtx->FSM_R2 = s1_box_step2;
+        pCtx->FSM_R1 = (s2_box_step1 ^ pCtx->LFSR_S[6]) + s1_box_step1;
+
+        /* Shift LFSR twice */
+        ShiftTwiceLFSR_1(pCtx);
+
+        /* key stream mode LFSR update */
+        pCtx->LFSR_S[14] = V0;
+        pCtx->LFSR_S[15] = V1;
+
+        return (((uint64_t) F0) << 32) | ((uint64_t) F1);
+}
+
+/**
+ * @brief Generates 4 bytes of key stream 4 buffers at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @return 4 x 4 bytes of key stream
+ */
+static inline uint32x4_t snow3g_keystream_4_4(snow3gKeyState4_t *pCtx)
+{
+        const uint32x4_t keyStream =
+                ClockFSM_4(pCtx) ^ pCtx->LFSR_X[pCtx->iLFSR_X];
+
+        ClockLFSR_4(pCtx);
+        return keyStream;
+}
+
+/**
+ * @brief Generates 8 bytes of key stream 4 buffers at a time
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in/out] pKeyStreamLo Pointer to lower end of generated key stream
+ * @param[in/out] pKeyStreamHi Pointer to higher end of generated key stream
+ */
+static inline void snow3g_keystream_4_8(snow3gKeyState4_t *pCtx,
+                                        uint32x4_t *pKeyStreamLo,
+                                        uint32x4_t *pKeyStreamHi)
+{
+        const uint32x4_t L0 = pCtx->LFSR_X[pCtx->iLFSR_X];
+        const uint32x4_t L2 = pCtx->LFSR_X[(pCtx->iLFSR_X + 2) & 15];
+        const uint32x4_t L11 = pCtx->LFSR_X[(pCtx->iLFSR_X + 11) & 15];
+
+        const uint32x4_t L1 = pCtx->LFSR_X[(pCtx->iLFSR_X + 1) & 15];
+        const uint32x4_t L3 = pCtx->LFSR_X[(pCtx->iLFSR_X + 3) & 15];
+        const uint32x4_t L12 = pCtx->LFSR_X[(pCtx->iLFSR_X + 12) & 15];
+
+        const uint32x4_t L5 =
pCtx->LFSR_X[(pCtx->iLFSR_X + 5) & 15]; + const uint32x4_t L6 = pCtx->LFSR_X[(pCtx->iLFSR_X + 6) & 15]; + const uint32x4_t L15 = pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15]; + + const uint32x4_t V0 = veorq_u32(veorq_u32(C0_C11_4(L0, L11), L2), + veorq_u32(vshlq_n_u32(L0, 8), + vshrq_n_u32(L11, 8))); + + const uint32x4_t V1 = veorq_u32(veorq_u32(C0_C11_4(L1, L12), L3), + veorq_u32(vshlq_n_u32(L1, 8), + vshrq_n_u32(L12, 8))); + + /* ======== first set of 4 bytes */ + + const uint32x4_t s1_box_step1 = S1_box_4(pCtx->FSM_X[0]); /* do early */ + + const uint32x4_t R0 = vaddq_u32(veorq_u32(L5, pCtx->FSM_X[2]), + pCtx->FSM_X[1]); + + const uint32x4_t F0 = veorq_u32(vaddq_u32(L15, pCtx->FSM_X[0]), + pCtx->FSM_X[1]); + const uint32x4_t L = F0 ^ L0; + + const uint32x4_t F1 = veorq_u32(vaddq_u32(V0, R0), s1_box_step1); + const uint32x4_t H = F1 ^ L1; + + /* Merge L & H sets for output */ + *pKeyStreamLo = vzip1q_u32(H, L); + *pKeyStreamHi = vzip2q_u32(H, L); + + uint32x4_t s2_box_step1 = pCtx->FSM_X[1]; + uint32x4_t s2_box_step2 = s1_box_step1; + + S2_box_2x4(&s2_box_step1, &s2_box_step2); + + /* + * At this stage FSM_X mapping is as follows: + * FSM_X[2] = s2_box_step1 + * FSM_X[1] = s1_box_step1 + * FSM_X[0] = R0 + */ + + /* Shift LFSR twice */ + pCtx->iLFSR_X = (pCtx->iLFSR_X + 2) & 15; + + /* LFSR Update */ + pCtx->LFSR_X[(pCtx->iLFSR_X + 14) & 15] = V0; + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = V1; + + const uint32x4_t s1_box_step2 = S1_box_4(R0); + + const uint32x4_t R1 = vaddq_u32(veorq_u32(L6, s2_box_step1), + s1_box_step1); + + /* Final FSM_X update + * FSM_X[2] = s2_box_step2 + * FSM_X[1] = s1_box_step2 + * FSM_X[0] = R1 + */ + pCtx->FSM_X[2] = s2_box_step2; + pCtx->FSM_X[1] = s1_box_step2; + pCtx->FSM_X[0] = R1; +} + +/** + * @brief Generates 16 bytes of key stream 4 buffers at a time + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param[in/out] pKeyStream Pointer to store generated key stream + */ +static inline void snow3g_keystream_4_16(snow3gKeyState4_t *pCtx, + uint32x4_t pKeyStream[4]) +{ + static const uint64_t sm[2] = { + /* mask for byte swapping 64-bit words */ + 0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL + }; + uint32x4_t ksL1, ksL2, ksH1, ksH2; + + snow3g_keystream_4_8(pCtx, &ksL1, &ksH1); + snow3g_keystream_4_8(pCtx, &ksL2, &ksH2); + + const uint8x16_t swapMask = vreinterpretq_u8_u64(vld1q_u64(sm)); + + pKeyStream[0] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip1q_u64( + vreinterpretq_u64_u32(ksL1), + vreinterpretq_u64_u32(ksL2))), + swapMask)); + pKeyStream[1] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip2q_u64( + vreinterpretq_u64_u32(ksL1), + vreinterpretq_u64_u32(ksL2))), + swapMask)); + pKeyStream[2] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip1q_u64( + vreinterpretq_u64_u32(ksH1), + vreinterpretq_u64_u32(ksH2))), + swapMask)); + pKeyStream[3] = vreinterpretq_u32_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(vzip2q_u64( + vreinterpretq_u64_u32(ksH1), + vreinterpretq_u64_u32(ksH2))), + swapMask)); +} + +/** + * @brief Initializes the key schedule for 4 buffers for SNOW3G f8/f9. 
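The 4-lane state filled by this initializer is laid out structure-of-arrays style: each LFSR_X[i] holds one LFSR stage for all four buffers, one buffer per 32-bit lane, with iLFSR_X rotating the stage index. A hypothetical accessor illustrating the layout (editor-added; snow3gStateConvert_4() further below performs the full extraction):

```c
/* Editor's sketch: read LFSR stage 'stage' of buffer 'buffer' (0-3)
 * from a 4-lane state. */
static uint32_t get_lfsr_stage(const snow3gKeyState4_t *ctx,
                               const unsigned stage, const unsigned buffer)
{
        const uint32x4_t v = ctx->LFSR_X[(stage + ctx->iLFSR_X) & 15];

        /* vgetq_lane_u32 requires a compile-time lane index */
        switch (buffer & 3) {
        case 0: return vgetq_lane_u32(v, 0);
        case 1: return vgetq_lane_u32(v, 1);
        case 2: return vgetq_lane_u32(v, 2);
        default: return vgetq_lane_u32(v, 3);
        }
}
```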
+ * + * @param [in] pCtx Context where the scheduled keys are stored + * @param [in] pKeySched Key schedule + * @param [in] pIV1 IV for buffer 1 + * @param [in] pIV2 IV for buffer 2 + * @param [in] pIV3 IV for buffer 3 + * @param [in] pIV4 IV for buffer 4 + */ +static inline void +snow3gStateInitialize_4(snow3gKeyState4_t *pCtx, + const snow3g_key_schedule_t *pKeySched, + const void *pIV1, const void *pIV2, + const void *pIV3, const void *pIV4) +{ + uint32x4_t R, S, T, U; + uint32x4_t T0, T1; + int i; + + /* Initialize the LFSR table from constants, Keys, and IV */ + + /* Load complete 128b IV into register */ + static const uint64_t sm[2] = { + 0x0c0d0e0f08090a0bULL, 0x0405060700010203ULL + }; + + R = vld1q_u32(pIV1); + S = vld1q_u32(pIV2); + T = vld1q_u32(pIV3); + U = vld1q_u32(pIV4); + + /* initialize the array block */ + for (i = 0; i < 4; i++) { + const uint32_t K = pKeySched->k[i]; + const uint32_t L = ~K; + const uint32x4_t VK = vdupq_n_u32(K); + const uint32x4_t VL = vdupq_n_u32(L); + + pCtx->LFSR_X[i + 4] = + pCtx->LFSR_X[i + 12] = VK; + pCtx->LFSR_X[i + 0] = + pCtx->LFSR_X[i + 8] = VL; + } + /* Update the schedule structure with IVs */ + /* Store the 4 IVs in LFSR by a column/row matrix swap + * after endianness correction */ + + /* endianness swap */ + const uint64x2_t swapMask = vld1q_u64(sm); + + R = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(R), + vreinterpretq_u8_u64(swapMask))); + S = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(S), + vreinterpretq_u8_u64(swapMask))); + T = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(T), + vreinterpretq_u8_u64(swapMask))); + U = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(U), + vreinterpretq_u8_u64(swapMask))); + + /* row/column dword inversion */ + T0 = vzip1q_u32(R, S); + R = vzip2q_u32(R, S); + T1 = vzip1q_u32(T, U); + T = vzip2q_u32(T, U); + + /* row/column qword inversion */ + U = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(R), + vreinterpretq_u64_u32(T))); + T = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(R), + vreinterpretq_u64_u32(T))); + S = vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(T0), + vreinterpretq_u64_u32(T1))); + R = vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(T0), + vreinterpretq_u64_u32(T1))); + + /* IV ^ LFSR */ + pCtx->LFSR_X[15] = pCtx->LFSR_X[15] ^ U; + pCtx->LFSR_X[12] = pCtx->LFSR_X[12] ^ T; + pCtx->LFSR_X[10] = pCtx->LFSR_X[10] ^ S; + pCtx->LFSR_X[9] = pCtx->LFSR_X[9] ^ R; + pCtx->iLFSR_X = 0; + + /* FSM initialization */ + pCtx->FSM_X[0] = pCtx->FSM_X[1] = + pCtx->FSM_X[2] = vdupq_n_u32(0); + + /* Initialisation rounds */ + for (i = 0; i < 32; i++) { + T1 = ClockFSM_4(pCtx); + + ClockLFSR_4(pCtx); + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] = + pCtx->LFSR_X[(pCtx->iLFSR_X + 15) & 15] ^ T1; + } +} + + +static inline void +preserve_bits(uint64_t *KS, + const uint8_t *pcBufferOut, const uint8_t *pcBufferIn, + SafeBuf *safeOutBuf, SafeBuf *safeInBuf, + const uint8_t bit_len, const uint8_t byte_len) +{ + const uint64_t mask = UINT64_MAX << (SNOW3G_BLOCK_SIZE * 8 - bit_len); + + /* Clear the last bits of the key stream and the input + * (input only in out-of-place case) */ + *KS &= mask; + if (pcBufferIn != pcBufferOut) { + const uint64_t swapMask = BSWAP64(mask); + + safeInBuf->b64 &= swapMask; + + /* + * Merge the last bits from the output, to be preserved, + * in the key stream, to be XOR'd with the input + * (which last bits are 0, maintaining the output bits) + */ + memcpy_keystrm(safeOutBuf->b8, pcBufferOut, byte_len); + *KS |= 
BSWAP64(safeOutBuf->b64 & ~swapMask);
+        }
+}
+
+/**
+ * @brief Core SNOW3G F8 bit algorithm for the 3GPP confidentiality algorithm
+ *
+ * @param[in] pCtx Context where the scheduled keys are stored
+ * @param[in] pIn Input buffer
+ * @param[out] pOut Output buffer
+ * @param[in] lengthInBits length in bits of the data to be encrypted
+ * @param[in] offsetInBits offset in input buffer, where data are valid
+ */
+static inline void f8_snow3g_bit(snow3gKeyState1_t *pCtx,
+                                 const void *pIn,
+                                 void *pOut,
+                                 const uint32_t lengthInBits,
+                                 const uint32_t offsetInBits)
+{
+        const uint8_t *pBufferIn = pIn;
+        uint8_t *pBufferOut = pOut;
+        uint32_t cipherLengthInBits = lengthInBits;
+        uint64_t shiftrem = 0;
+        uint64_t KS8, KS8bit; /* 8 bytes of key stream */
+        const uint8_t *pcBufferIn = pBufferIn + (offsetInBits / 8);
+        uint8_t *pcBufferOut = pBufferOut + (offsetInBits / 8);
+        /* Offset into the first byte (0 - 7 bits) */
+        uint32_t remainOffset = offsetInBits % 8;
+        uint32_t byteLength = (cipherLengthInBits + 7) / 8;
+        SafeBuf safeInBuf = {0};
+        SafeBuf safeOutBuf = {0};
+
+        /* Now run the block cipher */
+
+        /* Start with potential partial block (due to offset and length) */
+        KS8 = snow3g_keystream_1_8(pCtx);
+        KS8bit = KS8 >> remainOffset;
+
+        /* Only one block to encrypt */
+        if (cipherLengthInBits < (64 - remainOffset)) {
+                byteLength = (cipherLengthInBits + 7) / 8;
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, byteLength);
+                /*
+                 * If operation is Out-of-place and there is offset
+                 * to be applied, "remainOffset" bits from the output buffer
+                 * need to be preserved (only applicable to first byte,
+                 * since remainOffset is up to 7 bits)
+                 */
+                if ((pIn != pOut) && remainOffset) {
+                        const uint8_t mask8 = (uint8_t)
+                                (1 << (8 - remainOffset)) - 1;
+
+                        safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                                (pcBufferOut[0] & ~mask8);
+                }
+                /* If last byte is a partial byte, the last bits of the output
+                 * need to be preserved */
+                const uint8_t bitlen_with_off = remainOffset +
+                        cipherLengthInBits;
+
+                if ((bitlen_with_off & 0x7) != 0)
+                        preserve_bits(&KS8bit, pcBufferOut, pcBufferIn,
+                                      &safeOutBuf, &safeInBuf,
+                                      bitlen_with_off, byteLength);
+
+                xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit);
+                memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength);
+                return;
+        }
+        /*
+         * If operation is Out-of-place and there is offset
+         * to be applied, "remainOffset" bits from the output buffer
+         * need to be preserved (only applicable to first byte,
+         * since remainOffset is up to 7 bits)
+         */
+        if ((pIn != pOut) && remainOffset) {
+                const uint8_t mask8 = (uint8_t)(1 << (8 - remainOffset)) - 1;
+
+                memcpy_keystrm(safeInBuf.b8, pcBufferIn, 8);
+                safeInBuf.b8[0] = (safeInBuf.b8[0] & mask8) |
+                        (pcBufferOut[0] & ~mask8);
+                xor_keystrm_rev(pcBufferOut, safeInBuf.b8, KS8bit);
+                pcBufferIn += SNOW3G_BLOCK_SIZE;
+        } else {
+                /* At least 64 bits to produce (including offset) */
+                pcBufferIn = xor_keystrm_rev(pcBufferOut, pcBufferIn, KS8bit);
+        }
+
+        if (remainOffset != 0)
+                shiftrem = KS8 << (64 - remainOffset);
+        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8 - remainOffset;
+        pcBufferOut += SNOW3G_BLOCK_SIZE;
+
+        while (cipherLengthInBits) {
+                /* produce the next block of key stream */
+                KS8 = snow3g_keystream_1_8(pCtx);
+                KS8bit = (KS8 >> remainOffset) | shiftrem;
+                if (remainOffset != 0)
+                        shiftrem = KS8 << (64 - remainOffset);
+                if (cipherLengthInBits >= SNOW3G_BLOCK_SIZE * 8) {
+                        pcBufferIn = xor_keystrm_rev(pcBufferOut,
+                                                     pcBufferIn, KS8bit);
+                        cipherLengthInBits -= SNOW3G_BLOCK_SIZE * 8;
+                        pcBufferOut +=
SNOW3G_BLOCK_SIZE; + /* loop variant */ + } else { + /* end of the loop, handle the last bytes */ + byteLength = (cipherLengthInBits + 7) / 8; + memcpy_keystrm(safeInBuf.b8, pcBufferIn, + byteLength); + + /* If last byte is a partial byte, the last bits + * of the output need to be preserved */ + if ((cipherLengthInBits & 0x7) != 0) + preserve_bits(&KS8bit, pcBufferOut, pcBufferIn, + &safeOutBuf, &safeInBuf, + cipherLengthInBits, byteLength); + + xor_keystrm_rev(safeOutBuf.b8, safeInBuf.b8, KS8bit); + memcpy_keystrm(pcBufferOut, safeOutBuf.b8, byteLength); + cipherLengthInBits = 0; + } + } +#ifdef SAFE_DATA + CLEAR_VAR(&KS8, sizeof(KS8)); + CLEAR_VAR(&KS8bit, sizeof(KS8bit)); + CLEAR_MEM(&safeInBuf, sizeof(safeInBuf)); + CLEAR_MEM(&safeOutBuf, sizeof(safeOutBuf)); +#endif +} + +/** + * @brief Core SNOW3G F8 algorithm for the 3GPP confidentiality algorithm + * + * @param[in] pCtx Context where the scheduled keys are stored + * @param[in] pIn Input buffer + * @param[out] pOut Output buffer + * @param[in] lengthInBytes length in bytes of the data to be encrypted + */ +static inline void f8_snow3g(snow3gKeyState1_t *pCtx, + const void *pIn, + void *pOut, + const uint32_t lengthInBytes) +{ + uint32_t qwords = lengthInBytes / SNOW3G_8_BYTES; /* number of qwords */ + const uint32_t words = lengthInBytes & 4; /* remaining word if not 0 */ + const uint32_t bytes = lengthInBytes & 3; /* remaining bytes */ + uint32_t KS4; /* 4 bytes of key stream */ + uint64_t KS8; /* 8 bytes of key stream */ + const uint8_t *pBufferIn = pIn; + uint8_t *pBufferOut = pOut; + + /* process 64 bits at a time */ + while (qwords--) { + /* generate key stream 8 bytes at a time */ + KS8 = snow3g_keystream_1_8(pCtx); + + /* xor key stream 8 bytes at a time */ + pBufferIn = xor_keystrm_rev(pBufferOut, pBufferIn, KS8); + pBufferOut += SNOW3G_8_BYTES; + } + + /* check for remaining 0 to 7 bytes */ + if (0 != words) { + if (bytes) { + /* 5 to 7 last bytes, process 8 bytes */ + uint8_t buftemp[8]; + uint8_t safeBuff[8]; + + memset(safeBuff, 0, SNOW3G_8_BYTES); + KS8 = snow3g_keystream_1_8(pCtx); + memcpy_keystrm(safeBuff, pBufferIn, 4 + bytes); + xor_keystrm_rev(buftemp, safeBuff, KS8); + memcpy_keystrm(pBufferOut, buftemp, 4 + bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } else { + /* exactly 4 last bytes */ + KS4 = snow3g_keystream_1_4(pCtx); + xor_keystream_reverse_32(pBufferOut, pBufferIn, KS4); + } + } else if (0 != bytes) { + /* 1 to 3 last bytes */ + uint8_t buftemp[4]; + uint8_t safeBuff[4]; + + memset(safeBuff, 0, SNOW3G_4_BYTES); + KS4 = snow3g_keystream_1_4(pCtx); + memcpy_keystream_32(safeBuff, pBufferIn, bytes); + xor_keystream_reverse_32(buftemp, safeBuff, KS4); + memcpy_keystream_32(pBufferOut, buftemp, bytes); +#ifdef SAFE_DATA + CLEAR_MEM(&safeBuff, sizeof(safeBuff)); + CLEAR_MEM(&buftemp, sizeof(buftemp)); +#endif + } + +#ifdef SAFE_DATA + CLEAR_VAR(&KS4, sizeof(KS4)); + CLEAR_VAR(&KS8, sizeof(KS8)); +#endif +} + +/** + * @brief Extracts one state from a 4 buffer state structure. 
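A worked example of the qwords/words/bytes split used by f8_snow3g() above (editor's sketch): lengthInBytes = 13 gives one full 8-byte block, then the "5 to 7 last bytes" path handles the remaining 4 + 1 bytes:

```c
#include <assert.h>
#include <stdint.h>

/* same decomposition as f8_snow3g(): full 64-bit blocks, an optional
 * 32-bit word, and 1-3 trailing bytes */
static void tail_split(const uint32_t len,
                       uint32_t *qwords, uint32_t *words, uint32_t *bytes)
{
        *qwords = len / 8; /* full 64-bit blocks */
        *words = len & 4;  /* remaining 32-bit word, 4 if present */
        *bytes = len & 3;  /* remaining 1-3 bytes */
}

int main(void)
{
        uint32_t q, w, b;

        tail_split(13, &q, &w, &b);
        assert(q == 1 && w == 4 && b == 1);
        return 0;
}
```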
+ *
+ * @param[in] pSrcState Pointer to the source state
+ * @param[out] pDstState Pointer to the destination state
+ * @param[in] NumBuffer Buffer number
+ */
+static inline void snow3gStateConvert_4(const snow3gKeyState4_t *pSrcState,
+                                        snow3gKeyState1_t *pDstState,
+                                        const uint32_t NumBuffer)
+{
+        const uint32_t iLFSR_X = pSrcState->iLFSR_X;
+        const uint32x4_t *LFSR_X = pSrcState->LFSR_X;
+        uint32_t i;
+
+        for (i = 0; i < 16; i++) {
+                const uint32_t *pLFSR_X =
+                        (const uint32_t *) &LFSR_X[(i + iLFSR_X) & 15];
+
+                pDstState->LFSR_S[i] = pLFSR_X[NumBuffer];
+        }
+
+        const uint32_t *pFSM_X0 = (const uint32_t *)&pSrcState->FSM_X[0];
+        const uint32_t *pFSM_X1 = (const uint32_t *)&pSrcState->FSM_X[1];
+        const uint32_t *pFSM_X2 = (const uint32_t *)&pSrcState->FSM_X[2];
+
+        pDstState->FSM_R1 = pFSM_X0[NumBuffer];
+        pDstState->FSM_R2 = pFSM_X1[NumBuffer];
+        pDstState->FSM_R3 = pFSM_X2[NumBuffer];
+}
+
+/**
+ * @brief Provides size of key schedule structure
+ * @return Size of the key schedule structure in bytes
+ */
+size_t SNOW3G_KEY_SCHED_SIZE(void)
+{
+        return sizeof(snow3g_key_schedule_t);
+}
+
+/**
+ * @brief Key schedule initialisation
+ * @param[in] pKey pointer to a 16-byte key
+ * @param[out] pCtx pointer to key schedule structure
+ * @return Operation status
+ * @retval 0 all OK
+ * @retval -1 parameter error
+ */
+int SNOW3G_INIT_KEY_SCHED(const void *pKey, snow3g_key_schedule_t *pCtx)
+{
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (pCtx == NULL))
+                return -1;
+#endif
+
+        const uint32_t *pKey32 = pKey;
+
+        pCtx->k[3] = BSWAP32(pKey32[0]);
+        pCtx->k[2] = BSWAP32(pKey32[1]);
+        pCtx->k[1] = BSWAP32(pKey32[2]);
+        pCtx->k[0] = BSWAP32(pKey32[3]);
+
+        return 0;
+}
+
+/**
+ * @brief Single buffer F8 encrypt/decrypt
+ *
+ * Single buffer enc/dec with IV and precomputed key schedule
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[out] pBufferOut pointer to an output buffer
+ * @param[in] lengthInBytes message length in bytes
+ */
+void SNOW3G_F8_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        void *pBufferOut,
+                        const uint32_t lengthInBytes)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pBufferOut == NULL) ||
+            (lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN))
+                return;
+#endif
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        snow3gKeyState1_t ctx;
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_1_4(&ctx);
+
+        f8_snow3g(&ctx, pBufferIn, pBufferOut, lengthInBytes);
+
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief Single bit-length buffer F8 encrypt/decrypt
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[out] pBufferOut pointer to an output buffer
+ * @param[in] lengthInBits message length in bits
+ * @param[in] offsetInBits message offset in bits
+ */
+void SNOW3G_F8_1_BUFFER_BIT(const snow3g_key_schedule_t *pHandle,
+                            const void *pIV,
+                            const void *pBufferIn,
+                            void *pBufferOut,
+                            const uint32_t lengthInBits,
+                            const uint32_t offsetInBits)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pBufferOut ==
NULL) || + (lengthInBits == 0)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + snow3gKeyState1_t ctx; + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx, pHandle, pIV); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx); + + f8_snow3g_bit(&ctx, pBufferIn, pBufferOut, lengthInBits, offsetInBits); + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} + +/** + * @brief Two buffer F8 encrypt/decrypt with the same key schedule + * + * Two buffers enc/dec with the same key schedule. + * The 2 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. + * + * @param[in] pHandle pointer to precomputed key schedule + * @param[in] pIV1 pointer to IV + * @param[in] pIV2 pointer to IV + * @param[in] pBufIn1 pointer to an input buffer + * @param[in] pBufOut1 pointer to an output buffer + * @param[in] lenInBytes1 message size in bytes + * @param[in] pBufIn2 pointer to an input buffer + * @param[in] pBufOut2 pointer to an output buffer + * @param[in] lenInBytes2 message size in bytes + */ +void SNOW3G_F8_2_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2) +{ +#ifdef SAFE_PARAM + if ((pHandle == NULL) || (pIV1 == NULL) || (pIV2 == NULL) || + (pBufIn1 == NULL) || (pBufOut1 == NULL) || + (pBufIn2 == NULL) || (pBufOut2 == NULL) || + (lenInBytes1 == 0) || (lenInBytes1 > SNOW3G_MAX_BYTELEN) || + (lenInBytes2 == 0) || (lenInBytes2 > SNOW3G_MAX_BYTELEN)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + snow3gKeyState1_t ctx1, ctx2; + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx1, pHandle, pIV1); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx1); + + /* data processing for packet 1 */ + f8_snow3g(&ctx1, pBufIn1, pBufOut1, lenInBytes1); + + /* Initialize the schedule from the IV */ + snow3gStateInitialize_1(&ctx2, pHandle, pIV2); + + /* Clock FSM and LFSR once, ignore the key stream */ + (void) snow3g_keystream_1_4(&ctx2); + + /* data processing for packet 2 */ + f8_snow3g(&ctx2, pBufIn2, pBufOut2, lenInBytes2); + +#ifdef SAFE_DATA + CLEAR_MEM(&ctx1, sizeof(ctx1)); + CLEAR_MEM(&ctx2, sizeof(ctx2)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + +} + +/** + * @brief Four buffer F8 encrypt/decrypt with the same key schedule + * + * Four packets enc/dec with the same key schedule. + * The 4 IVs are independent and are passed as an array of pointers. + * Each buffer and data length are separate. 
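+ *
+ * Usage sketch (illustrative only; the key, IVs, buffers and lengths are
+ * hypothetical and caller-provided):
+ *
+ *     snow3g_key_schedule_t ks;
+ *
+ *     if (SNOW3G_INIT_KEY_SCHED(key16, &ks) == 0)
+ *             SNOW3G_F8_4_BUFFER(&ks, iv1, iv2, iv3, iv4,
+ *                                in1, out1, len1, in2, out2, len2,
+ *                                in3, out3, len3, in4, out4, len4);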
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV1 pointer to IV
+ * @param[in] pIV2 pointer to IV
+ * @param[in] pIV3 pointer to IV
+ * @param[in] pIV4 pointer to IV
+ * @param[in] pBufferIn1 pointer to an input buffer
+ * @param[out] pBufferOut1 pointer to an output buffer
+ * @param[in] lengthInBytes1 message size in bytes
+ * @param[in] pBufferIn2 pointer to an input buffer
+ * @param[out] pBufferOut2 pointer to an output buffer
+ * @param[in] lengthInBytes2 message size in bytes
+ * @param[in] pBufferIn3 pointer to an input buffer
+ * @param[out] pBufferOut3 pointer to an output buffer
+ * @param[in] lengthInBytes3 message size in bytes
+ * @param[in] pBufferIn4 pointer to an input buffer
+ * @param[out] pBufferOut4 pointer to an output buffer
+ * @param[in] lengthInBytes4 message size in bytes
+ */
+void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV1,
+                        const void *pIV2,
+                        const void *pIV3,
+                        const void *pIV4,
+                        const void *pBufferIn1,
+                        void *pBufferOut1,
+                        const uint32_t lengthInBytes1,
+                        const void *pBufferIn2,
+                        void *pBufferOut2,
+                        const uint32_t lengthInBytes2,
+                        const void *pBufferIn3,
+                        void *pBufferOut3,
+                        const uint32_t lengthInBytes3,
+                        const void *pBufferIn4,
+                        void *pBufferOut4,
+                        const uint32_t lengthInBytes4)
+{
+        const size_t num_lanes = 4;
+        snow3gKeyState4_t ctx;
+        uint32_t lenInBytes[4];
+        uint8_t *pBufferOut[4];
+        const uint8_t *pBufferIn[4];
+        uint32_t bytes, qwords, i;
+
+        length_copy_4(lenInBytes, lengthInBytes1, lengthInBytes2,
+                      lengthInBytes3, lengthInBytes4);
+
+        cptr_copy_4((const void **)pBufferIn,
+                    pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4);
+
+        ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2,
+                   pBufferOut3, pBufferOut4);
+
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) ||
+            (pIV1 == NULL) || (pIV2 == NULL) ||
+            (pIV3 == NULL) || (pIV4 == NULL))
+                return;
+
+        if (!cptr_check((const void * const *)pBufferIn, num_lanes) ||
+            !ptr_check((void **)pBufferOut, num_lanes) ||
+            !length_check(lenInBytes, num_lanes))
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        /* find min common length */
+        bytes = length_find_min(lenInBytes, num_lanes);
+        qwords = bytes / SNOW3G_8_BYTES;
+
+        /* subtract min common length from all buffers */
+        length_sub(lenInBytes, num_lanes, qwords * SNOW3G_8_BYTES);
+
+        /* Initialize the schedule from the IV */
+        snow3gStateInitialize_4(&ctx, pHandle, pIV1, pIV2, pIV3, pIV4);
+
+        /* Clock FSM and LFSR once, ignore the key stream */
+        (void) snow3g_keystream_4_4(&ctx);
+
+        /* generate 16 bytes of key stream at a time on all 4 streams */
+        while (qwords >= 2) {
+                uint32x4_t ks[4];
+
+                snow3g_keystream_4_16(&ctx, ks);
+
+                for (i = 0; i < num_lanes; i++) {
+                        const uint32x4_t in =
+                                vld1q_u32((const uint32_t *)pBufferIn[i]);
+
+                        vst1q_u32((uint32_t *)pBufferOut[i], in ^ ks[i]);
+
+                        pBufferOut[i] += (2 * SNOW3G_8_BYTES);
+                        pBufferIn[i] += (2 * SNOW3G_8_BYTES);
+                }
+
+                qwords = qwords - 2;
+        }
+
+        while (qwords--) {
+                uint32x4_t H, L; /* 8 bytes of key stream per buffer */
+
+                snow3g_keystream_4_8(&ctx, &L, &H);
+
+                pBufferIn[0] = xor_keystrm_rev(pBufferOut[0], pBufferIn[0],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(L), 0));
+                pBufferIn[1] = xor_keystrm_rev(pBufferOut[1], pBufferIn[1],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(L), 1));
+                pBufferIn[2] = xor_keystrm_rev(pBufferOut[2], pBufferIn[2],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(H), 0));
+                pBufferIn[3] = xor_keystrm_rev(pBufferOut[3], pBufferIn[3],
+                               vgetq_lane_u64(vreinterpretq_u64_u32(H), 1));
+
+                for (i = 0; i < num_lanes; i++)
+                        pBufferOut[i] += SNOW3G_8_BYTES;
+        }
+
+        /* process the remainder of each buffer:
+         * - extract the LFSR and FSM structures
+         * - continue processing one buffer at a time
+         */
+        for (i = 0; i < num_lanes; i++) {
+                snow3gKeyState1_t ctx_t;
+
+                if (lenInBytes[i] == 0)
+                        continue;
+                snow3gStateConvert_4(&ctx, &ctx_t, i);
+                f8_snow3g(&ctx_t, pBufferIn[i], pBufferOut[i], lenInBytes[i]);
+        }
+#ifdef SAFE_DATA
+        CLEAR_MEM(&ctx, sizeof(ctx));
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+}
+
+/**
+ * @brief Multiple-key 8 buffer F8 encrypt/decrypt
+ *
+ * Eight packets enc/dec with eight respective key schedules.
+ * The 8 IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
+ *
+ * @param[in] pKey pointer to an array of key schedules
+ * @param[in] IV pointer to an array of IV's
+ * @param[in] BufferIn pointer to an array of input buffers
+ * @param[out] BufferOut pointer to an array of output buffers
+ * @param[in] lengthInBytes pointer to an array of message lengths in bytes
+ */
+void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[],
+                                 const void * const IV[],
+                                 const void * const BufferIn[],
+                                 void *BufferOut[],
+                                 const uint32_t lengthInBytes[])
+{
+        const size_t num_lanes = 8;
+
+#ifdef SAFE_PARAM
+        if ((pKey == NULL) || (IV == NULL) || (BufferIn == NULL) ||
+            (BufferOut == NULL) || (lengthInBytes == NULL))
+                return;
+
+        if (!ptr_check(BufferOut, num_lanes) || !cptr_check(IV, num_lanes) ||
+            !cptr_check((const void * const *)pKey, num_lanes) ||
+            !cptr_check(BufferIn, num_lanes) ||
+            !length_check(lengthInBytes, num_lanes))
+                return;
+#endif
+        for (uint32_t i = 0; i < num_lanes; i++)
+                SNOW3G_F8_1_BUFFER(pKey[i], IV[i], BufferIn[i], BufferOut[i],
+                                   lengthInBytes[i]);
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_GPS();
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif
+}
+
+/**
+ * @brief 8 buffer F8 encrypt/decrypt with the same key schedule
+ *
+ * Eight packets enc/dec with the same key schedule.
+ * The 8 IVs are independent and are passed as an array of pointers.
+ * Each buffer and data length are separate.
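+ *
+ * Note: this entry point is serviced as two consecutive 4-buffer
+ * operations (see the SNOW3G_F8_4_BUFFER calls below), so the eight
+ * packets are processed as two groups of four.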
+ * + * @param[in] pHandle pointer to precomputed key schedule + * @param[in] pIV1 pointer to IV + * @param[in] pIV2 pointer to IV + * @param[in] pIV3 pointer to IV + * @param[in] pIV4 pointer to IV + * @param[in] pIV5 pointer to IV + * @param[in] pIV6 pointer to IV + * @param[in] pIV7 pointer to IV + * @param[in] pIV8 pointer to IV + * @param[in] pBufIn1 pointer to an input buffer + * @param[in] pBufOut1 pointer to an output buffer + * @param[in] lenInBytes1 message size in bytes + * @param[in] pBufIn2 pointer to an input buffer + * @param[in] pBufOut2 pointer to an output buffer + * @param[in] lenInBytes2 message size in bytes + * @param[in] pBufIn3 pointer to an input buffer + * @param[in] pBufOut3 pointer to an output buffer + * @param[in] lenInBytes3 message size in bytes + * @param[in] pBufIn4 pointer to an input buffer + * @param[in] pBufOut4 pointer to an output buffer + * @param[in] lenInBytes4 message size in bytes + * @param[in] pBufIn5 pointer to an input buffer + * @param[in] pBufOut5 pointer to an output buffer + * @param[in] lenInBytes5 message size in bytes + * @param[in] pBufIn6 pointer to an input buffer + * @param[in] pBufOut6 pointer to an output buffer + * @param[in] lenInBytes6 message size in bytes + * @param[in] pBufIn7 pointer to an input buffer + * @param[in] pBufOut7 pointer to an output buffer + * @param[in] lenInBytes7 message size in bytes + * @param[in] pBufIn8 pointer to an input buffer + * @param[in] pBufOut8 pointer to an output buffer + * @param[in] lenInBytes8 message size in bytes + */ +void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufIn1, + void *pBufOut1, + const uint32_t lenInBytes1, + const void *pBufIn2, + void *pBufOut2, + const uint32_t lenInBytes2, + const void *pBufIn3, + void *pBufOut3, + const uint32_t lenInBytes3, + const void *pBufIn4, + void *pBufOut4, + const uint32_t lenInBytes4, + const void *pBufIn5, + void *pBufOut5, + const uint32_t lenInBytes5, + const void *pBufIn6, + void *pBufOut6, + const uint32_t lenInBytes6, + const void *pBufIn7, + void *pBufOut7, + const uint32_t lenInBytes7, + const void *pBufIn8, + void *pBufOut8, + const uint32_t lenInBytes8) +{ + uint32_t lengthInBytes[8]; + const uint8_t *pBufferIn[8]; + const void *pIV[8]; + uint8_t *pBufferOut[8]; + + length_copy_8(lengthInBytes, + lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4, + lenInBytes5, lenInBytes6, lenInBytes7, lenInBytes8); + + cptr_copy_8((const void **)pBufferIn, + pBufIn1, pBufIn2, pBufIn3, pBufIn4, + pBufIn5, pBufIn6, pBufIn7, pBufIn8); + + cptr_copy_8(pIV, pIV1, pIV2, pIV3, pIV4, pIV5, pIV6, pIV7, pIV8); + + ptr_copy_8((void **)pBufferOut, + pBufOut1, pBufOut2, pBufOut3, pBufOut4, + pBufOut5, pBufOut6, pBufOut7, pBufOut8); + +#ifdef SAFE_PARAM + const size_t num_lanes = 8; + + if (pHandle == NULL) + return; + + if (!length_check(lengthInBytes, num_lanes) || + !cptr_check((const void * const *)pBufferIn, num_lanes) || + !cptr_check(pIV, num_lanes) || + !ptr_check((void **)pBufferOut, num_lanes)) + return; +#endif + + SNOW3G_F8_4_BUFFER(pHandle, + pIV[0], pIV[1], pIV[2], pIV[3], + pBufferIn[0], pBufferOut[0], lengthInBytes[0], + pBufferIn[1], pBufferOut[1], lengthInBytes[1], + pBufferIn[2], pBufferOut[2], lengthInBytes[2], + pBufferIn[3], pBufferOut[3], lengthInBytes[3]); + + SNOW3G_F8_4_BUFFER(pHandle, + pIV[4], pIV[5], pIV[6], pIV[7], + pBufferIn[4], 
pBufferOut[4], lengthInBytes[4],
+                       pBufferIn[5], pBufferOut[5], lengthInBytes[5],
+                       pBufferIn[6], pBufferOut[6], lengthInBytes[6],
+                       pBufferIn[7], pBufferOut[7], lengthInBytes[7]);
+}
+
+/**
+ * @brief Single-key N buffer F8 encrypt/decrypt
+ *
+ * Performs F8 enc/dec on N packets.
+ * The input IV's are passed in Little Endian format.
+ * The KeySchedule is in Little Endian format.
+ *
+ * @param[in] pCtx pointer to a key schedule
+ * @param[in] IV pointer to an array of IV's
+ * @param[in] pBufferIn pointer to an array of input buffers
+ * @param[out] pBufferOut pointer to an array of output buffers
+ * @param[in] bufLenInBytes pointer to an array of message lengths in bytes
+ * @param[in] packetCount number of packets to process (N)
+ */
+void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx,
+                        const void * const IV[],
+                        const void * const pBufferIn[],
+                        void *pBufferOut[],
+                        const uint32_t bufLenInBytes[],
+                        const uint32_t packetCount)
+{
+#ifdef SAFE_PARAM
+        if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) ||
+            (pBufferOut == NULL) || (bufLenInBytes == NULL))
+                return;
+
+        if (!cptr_check(IV, packetCount) ||
+            !cptr_check(pBufferIn, packetCount) ||
+            !ptr_check(pBufferOut, packetCount) ||
+            !length_check(bufLenInBytes, packetCount))
+                return;
+#endif
+
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        if (packetCount > NUM_PACKETS_16) {
+                pBufferOut[0] = NULL;
+                printf("packetCount too high (%d)\n", packetCount);
+                return;
+        }
+
+        uint32_t packet_index, inner_index, pktCnt = packetCount;
+        int sortNeeded = 0, tempLen = 0;
+        uint8_t *srctempbuff;
+        uint8_t *dsttempbuff;
+        uint8_t *ivtempbuff;
+        uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL};
+        uint8_t *pIV[NUM_PACKETS_16] = {NULL};
+        uint32_t lensBuf[NUM_PACKETS_16] = {0};
+
+        memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t));
+        memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *));
+        memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *));
+        memcpy((void *)pIV, IV, packetCount * sizeof(void *));
+
+        packet_index = packetCount;
+
+        while (packet_index--) {
+
+                /* check if all packets are sorted by decreasing length */
+                if (packet_index > 0 && lensBuf[packet_index - 1] <
+                    lensBuf[packet_index]) {
+                        /* this packet array is not correctly sorted */
+                        sortNeeded = 1;
+                }
+        }
+
+        if (sortNeeded) {
+
+                /* sort packets in decreasing buffer size from [0] to the
+                   [n]th packet, where buffer[0] will contain the longest
+                   buffer and buffer[n] will contain the shortest buffer.
+                   4 arrays are swapped:
+                   - pointers to input buffers
+                   - pointers to output buffers
+                   - pointers to input IV's
+                   - input buffer lengths */
+                packet_index = packetCount;
+                while (packet_index--) {
+
+                        inner_index = packet_index;
+                        while (inner_index--) {
+
+                                if (lensBuf[packet_index] >
+                                    lensBuf[inner_index]) {
+
+                                        /* swap buffers to arrange in
+                                           descending order from [0].
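+                                           E.g. (hypothetical sizes) lengths
+                                           {64, 256, 128} end up as
+                                           {256, 128, 64} once the outer
+                                           loop completes. */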
+                                        srctempbuff = pSrcBuf[packet_index];
+                                        dsttempbuff = pDstBuf[packet_index];
+                                        ivtempbuff = pIV[packet_index];
+                                        tempLen = lensBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                }
+                        } /* for inner packet index (inner bubble-sort) */
+                } /* for outer packet index (outer bubble-sort) */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* @todo process 8 buffers at-a-time */
+        /* process 4 buffers at-a-time */
+        while (pktCnt >= 4) {
+                pktCnt -= 4;
+                SNOW3G_F8_4_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pIV[packet_index + 2],
+                                   pIV[packet_index + 3],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1],
+                                   pSrcBuf[packet_index + 2],
+                                   pDstBuf[packet_index + 2],
+                                   lensBuf[packet_index + 2],
+                                   pSrcBuf[packet_index + 3],
+                                   pDstBuf[packet_index + 3],
+                                   lensBuf[packet_index + 3]);
+                packet_index += 4;
+        }
+
+        /* process 2 packets at-a-time */
+        while (pktCnt >= 2) {
+                pktCnt -= 2;
+                SNOW3G_F8_2_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pIV[packet_index + 1],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0],
+                                   pSrcBuf[packet_index + 1],
+                                   pDstBuf[packet_index + 1],
+                                   lensBuf[packet_index + 1]);
+                packet_index += 2;
+        }
+
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtx, pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/**
+ * @brief Multi-key N buffer F8 encrypt/decrypt
+ *
+ * Performs F8 enc/dec on N packets.
+ * The input IV's are passed in Little Endian format.
+ * The KeySchedule is in Little Endian format.
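+ *
+ * Note: unlike the single-key variant above, no multi-buffer batching is
+ * applied here yet; after the length sort, each packet is processed
+ * individually with its own key schedule (see the @todo markers below).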
+ * + * @param[in] pCtx pointer to an array of key schedules + * @param[in] IV pointer to an array of IV's + * @param[in] pBufferIn pointer to an array of input buffers + * @param[out] pBufferOut pointer to an array of output buffers + * @param[in] bufLenInBytes pointer to an array of message lengths in bytes + * @param[in] packetCount number of packets to process (N) + */ +void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufLenInBytes[], + const uint32_t packetCount) +{ +#ifdef SAFE_PARAM + uint32_t i; + + if ((pCtx == NULL) || (IV == NULL) || (pBufferIn == NULL) || + (pBufferOut == NULL) || (bufLenInBytes == NULL)) + return; + + for (i = 0; i < packetCount; i++) + if ((pCtx[i] == NULL) || (IV[i] == NULL) || + (pBufferIn[i] == NULL) || (pBufferOut[i] == NULL) || + (bufLenInBytes[i] == 0) || + (bufLenInBytes[i] > SNOW3G_MAX_BYTELEN)) + return; +#endif +#ifdef SAFE_DATA + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ + + if (packetCount > NUM_PACKETS_16) { + pBufferOut[0] = NULL; + printf("packetCount too high (%d)\n", packetCount); + return; + } + + uint32_t packet_index, inner_index, pktCnt = packetCount; + int sortNeeded = 0, tempLen = 0; + uint8_t *srctempbuff; + uint8_t *dsttempbuff; + uint8_t *ivtempbuff; + snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; + uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_16] = {0}; + snow3g_key_schedule_t *tempCtx; + + memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); + memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, pBufferIn, packetCount * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); + memcpy((void *)pIV, IV, packetCount * sizeof(void *)); + + packet_index = packetCount; + + while (packet_index--) { + + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + packet_index = packetCount; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. 
+                                           */
+                                        srctempbuff = pSrcBuf[packet_index];
+                                        dsttempbuff = pDstBuf[packet_index];
+                                        ivtempbuff = pIV[packet_index];
+                                        tempLen = lensBuf[packet_index];
+                                        tempCtx = pCtxBuf[packet_index];
+
+                                        pSrcBuf[packet_index] =
+                                                pSrcBuf[inner_index];
+                                        pDstBuf[packet_index] =
+                                                pDstBuf[inner_index];
+                                        pIV[packet_index] = pIV[inner_index];
+                                        lensBuf[packet_index] =
+                                                lensBuf[inner_index];
+                                        pCtxBuf[packet_index] =
+                                                pCtxBuf[inner_index];
+
+                                        pSrcBuf[inner_index] = srctempbuff;
+                                        pDstBuf[inner_index] = dsttempbuff;
+                                        pIV[inner_index] = ivtempbuff;
+                                        lensBuf[inner_index] = tempLen;
+                                        pCtxBuf[inner_index] = tempCtx;
+                                }
+                        } /* for inner packet index (inner bubble-sort) */
+                } /* for outer packet index (outer bubble-sort) */
+        } /* if sortNeeded */
+
+        packet_index = 0;
+        /* @todo process 8 buffers at-a-time */
+        /* @todo process 4 buffers at-a-time */
+        /* @todo process 2 packets at-a-time */
+        /* remaining packets are processed 1 at a time */
+        while (pktCnt--) {
+                SNOW3G_F8_1_BUFFER(pCtxBuf[packet_index + 0],
+                                   pIV[packet_index + 0],
+                                   pSrcBuf[packet_index + 0],
+                                   pDstBuf[packet_index + 0],
+                                   lensBuf[packet_index + 0]);
+                packet_index++;
+        }
+}
+
+/**
+ * @brief Single buffer bit-length F9 function
+ *
+ * Single buffer digest with IV and precomputed key schedule.
+ *
+ * @param[in] pHandle pointer to precomputed key schedule
+ * @param[in] pIV pointer to IV
+ * @param[in] pBufferIn pointer to an input buffer
+ * @param[in] lengthInBits message length in bits
+ * @param[out] pDigest pointer to store the F9 digest
+ */
+void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle,
+                        const void *pIV,
+                        const void *pBufferIn,
+                        const uint64_t lengthInBits,
+                        void *pDigest)
+{
+#ifdef SAFE_PARAM
+        if ((pHandle == NULL) || (pIV == NULL) ||
+            (pBufferIn == NULL) || (pDigest == NULL) ||
+            (lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN))
+                return;
+#endif
+#ifdef SAFE_DATA
+        CLEAR_SCRATCH_SIMD_REGS();
+#endif /* SAFE_DATA */
+
+        snow3gKeyState1_t ctx;
+        uint32_t z[5];
+        uint64_t lengthInQwords, E, P;
+        uint64_t i;
+        const uint64_t *inputBuffer;
+
+        inputBuffer = (const uint64_t *)pBufferIn;
+
+        /* Initialize the SNOW3G key schedule */
+        snow3gStateInitialize_1(&ctx, pHandle, pIV);
+
+        /* Generate 5 key stream words */
+        snow3g_f9_keystream_words(&ctx, &z[0]);
+
+        P = ((uint64_t)z[0] << 32) | ((uint64_t)z[1]);
+
+        lengthInQwords = lengthInBits / 64;
+
+        E = 0;
+        i = 0;
+
+        if (lengthInQwords > 8) {
+                /* compute P^2, P^3 and P^4 */
+                const uint64_t P2 = multiply_and_reduce64(P, P);
+                const uint64_t P3 = multiply_and_reduce64(P2, P);
+                const uint64_t P4 = multiply_and_reduce64(P3, P);
+                const uint64_t bs[2] = {0x0001020304050607ULL,
+                                        0x08090a0b0c0d0e0fULL};
+                const uint8x16_t bswap2x64 =
+                        vreinterpretq_u8_u64(vld1q_u64(bs));
+                uint64_t ch[2] = {0xffffffffffffffffULL, 0};
+                const poly64x2_t clear_hi64 = vld1q_p64((poly64_t *)ch);
+                const uint64_t *m_ptr = &inputBuffer[i];
+                poly64x2_t EV = vdupq_n_p64(0);
+
+                for (; (i + 3) < lengthInQwords; i += 4) {
+                        uint64_t m0, m1, m2, m3;
+                        uint64x2_t M1_t, M2_t;
+                        poly64x2_t t1, t2, t3;
+                        /* load 2 x 128-bits and byte swap 64-bit words */
+                        M1_t = vld1q_u64(m_ptr);
+                        m_ptr += 2;
+                        M2_t = vld1q_u64(m_ptr);
+                        m_ptr += 2;
+                        M1_t = vreinterpretq_u64_u8(vqtbl1q_u8(
+                                vreinterpretq_u8_u64(M1_t), bswap2x64));
+                        M2_t = vreinterpretq_u64_u8(vqtbl1q_u8(
+                                vreinterpretq_u8_u64(M2_t), bswap2x64));
+                        m0 = vgetq_lane_u64(M1_t, 0);
+                        m1 = vgetq_lane_u64(M1_t, 1);
+                        m2 = vgetq_lane_u64(M2_t, 0);
+                        m3 = vgetq_lane_u64(M2_t, 1);
+
+                        /* add current EV to the first word of
the message */ + m0 = m0 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 0); + m1 = m1 ^ vgetq_lane_u64(vreinterpretq_u64_p64(EV), 1); + + /* t1 = (M0 x P4) + (M1 x P3) + (M2 x P2) + (M3 x P1) */ + t1 = (poly64x2_t)vmull_p64(m2, P2); + t2 = (poly64x2_t)vmull_p64(m3, P); + + t1 = t1 ^ t2; + + t2 = (poly64x2_t)vmull_p64(m0, P4); + t3 = (poly64x2_t)vmull_p64(m1, P3); + + t2 = t2 ^ t3; + t1 = t2 ^ t1; + + /* reduce 128-bit product */ + EV = reduce128_to_64(t1); + + /* clear top 64-bits for the subsequent add/xor */ + EV = EV & clear_hi64; + } + + for (; (i + 1) < lengthInQwords; i+= 2) { + poly64x2_t t1, t2; + uint64x2_t M_t; + + /* load 128-bits and byte swap 64-bit words */ + M_t = vld1q_u64(m_ptr); + m_ptr += 2; + M_t = vreinterpretq_u64_u8(vqtbl1q_u8( + vreinterpretq_u8_u64(M_t), bswap2x64)); + + /* add current EV to the first word of the message */ + M_t = M_t ^ EV; + + poly64_t M0 = vgetq_lane_u64(M_t, 0); + poly64_t M1 = vgetq_lane_u64(M_t, 1); + + /* t1 = (M0 x P2) + (M1 x P1) */ + t1 = (poly64x2_t)vmull_p64(M0, P2); + t2 = (poly64x2_t)vmull_p64(M1, P); + t1 = t1 ^ t2; + + /* reduce 128-bit product */ + EV = reduce128_to_64(t1); + + /* clear top 64-bits for the subsequent add/xor */ + EV = EV & clear_hi64; + } + E = vgetq_lane_u64(vreinterpretq_u64_p64(EV), 0); + } + + { + /* all blocks except the last one */ + uint64_t V; + for (; i < lengthInQwords; i++) { + V = BSWAP64(inputBuffer[i]); + E = multiply_and_reduce64(E ^ V, P); + } +#ifdef SAFE_DATA + CLEAR_VAR(&V, sizeof(V)); +#endif + } + + /* last bits of last block if any left */ + uint64_t rem_bits = lengthInBits % 64; + if (rem_bits) { + uint64_t V; + /* last bytes, do not go past end of buffer */ + memcpy(&V, &inputBuffer[i], (rem_bits + 7) / 8); + V = BSWAP64(V); + /* mask extra bits */ + V &= (((uint64_t)-1) << (64 - rem_bits)); + E = multiply_and_reduce64(E ^ V, P); +#ifdef SAFE_DATA + CLEAR_VAR(&V, sizeof(V)); +#endif + } + + /* Multiply by Q */ + E = multiply_and_reduce64(E ^ lengthInBits, + (((uint64_t)z[2] << 32) | ((uint64_t)z[3]))); + + /* Final MAC */ + *(uint32_t *)pDigest = + (uint32_t)BSWAP64(E ^ ((uint64_t)z[4] << 32)); + +#ifdef SAFE_DATA + CLEAR_VAR(&E, sizeof(E)); + CLEAR_VAR(&P, sizeof(P)); + CLEAR_MEM(&z, sizeof(z)); + CLEAR_MEM(&ctx, sizeof(ctx)); + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif /* SAFE_DATA */ +} +#endif /* SNOW3G_COMMON_H */ diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 0e0204864ff2c28a5b02333699d1011962b7102e..f5a8334041a12ff7c211a2d54d310a2bd4cbeeed 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -395,4 +395,9 @@ init_mb_mgr_avx2_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_avx512_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs); + #endif /* IMB_IPSEC_MB_INTERNAL_H */ diff --git a/lib/include/noaesni.h b/lib/include/noaesni.h index 19e19fa3a28629ed3f519845ab063763782591ce..57f564dd19acbe09f6a230a913c5fb3b48dc5f4b 100644 --- a/lib/include/noaesni.h +++ b/lib/include/noaesni.h @@ -30,6 +30,14 @@ #ifndef NOAESNI_H #define NOAESNI_H +IMB_DLL_EXPORT void init_mb_mgr_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64_no_aesni(IMB_MGR *state); 
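+
+/*
+ * Minimal usage sketch for the AArch64 no-AESNI manager entry points
+ * (illustrative only; error handling is omitted and the zero flags
+ * argument is a hypothetical choice):
+ *
+ *     IMB_MGR *mgr = alloc_mb_mgr(0);
+ *
+ *     init_mb_mgr_aarch64_no_aesni(mgr);
+ *     IMB_JOB *job = get_next_job_aarch64_no_aesni(mgr);
+ */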
+IMB_DLL_EXPORT uint32_t queue_size_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64_no_aesni(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64_no_aesni(IMB_MGR *state); + IMB_DLL_EXPORT void init_mb_mgr_sse_no_aesni(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *submit_job_sse_no_aesni(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_sse_no_aesni(IMB_MGR *state); diff --git a/lib/include/snow3g.h b/lib/include/snow3g.h index 24555e2f0c190dba6c52c38d0f2f28ff1f30b280..13bdbb156d9414eab0c7a5bfcea5be16b0967b7d 100644 --- a/lib/include/snow3g.h +++ b/lib/include/snow3g.h @@ -655,4 +655,243 @@ snow3g_f9_1_buffer_internal_vaes_avx512(const uint64_t *pBufferIn, const uint32_t KS[5], const uint64_t lengthInBits); +/***************************************************************************** + * AARCH64 +******************************************************************************/ +void +snow3g_f8_1_buffer_bit_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); +void +snow3g_f8_8_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t 
bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64(void); + +int +snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * AARCH64 NO-AESNI + ******************************************************************************/ +void +snow3g_f8_1_buffer_bit_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); + +void +snow3g_f8_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void +snow3g_f8_2_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * + const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * const + pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64_no_aesni(void); + +int +snow3g_init_key_sched_aarch64_no_aesni(const 
void *pKey, + snow3g_key_schedule_t *pCtx); #endif /* _SNOW3G_H_ */ diff --git a/lib/include/snow3g_tables.h b/lib/include/snow3g_tables.h index 8b43f286c27beabc176922b6e3d9a4f4ca858e2e..bc87e6da91d899cb89854b05801e183017c3b90f 100644 --- a/lib/include/snow3g_tables.h +++ b/lib/include/snow3g_tables.h @@ -29,6 +29,9 @@ #define _SNOW3G_TABLES_H_ #include +#include "ipsec-mb.h" + +#ifdef __x86_64__ #include "constant_lookup.h" #if defined (AVX) || defined (AVX2) @@ -37,6 +40,8 @@ #define SNOW3G_SAFE_LUT8(table, idx, size) LOOKUP8_SSE(table, idx, size) #endif /* AVX || AVX2 */ +#endif /* __x86_64__ */ + extern const int snow3g_table_A_mul[256]; extern const int snow3g_table_A_div[256]; extern const uint8_t snow3g_invSR_SQ[256]; diff --git a/lib/include/wireless_common.h b/lib/include/wireless_common.h index 4f838c4ac3eadd7c7931b13d751cd36dd4349559..6851f9aeefa72dabf42dcec053dc973e92f3b023 100644 --- a/lib/include/wireless_common.h +++ b/lib/include/wireless_common.h @@ -29,11 +29,14 @@ #define _WIRELESS_COMMON_H_ #include + +#ifdef __x86_64__ #ifdef LINUX #include #else #include #endif +#endif /* __x86_64__ */ #define NUM_PACKETS_1 1 #define NUM_PACKETS_2 2 @@ -50,6 +53,7 @@ #define BSWAP64 _byteswap_uint64 #endif +#ifndef __aarch64__ typedef union _m128_u { uint8_t byte[16]; uint16_t word[8]; @@ -64,6 +68,7 @@ typedef union _m64_u { uint32_t dword[2]; uint64_t m; } m64_t; +#endif static inline uint32_t bswap4(const uint32_t val) { @@ -174,6 +179,7 @@ memcpy_keystrm(uint8_t *pDst, const uint8_t *pSrc, const uint32_t len) } } +#ifndef __aarch64__ /** * @brief Save start and end of the buffer around message * @@ -403,4 +409,5 @@ IMB_DLL_LOCAL void asm_XorKeyStream32B_avx2(const void *pIn, void *pOut, IMB_DLL_LOCAL void asm_XorKeyStream64B_avx512(const void *pIn, void *pOut, const void *pKey); +#endif /* __aarch64__ */ #endif /* _WIRELESS_COMMON_H_ */ diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index a386b031970bcea66f50e42b66d559b7254e56b8..4cbcd060a8e0b366422d186820c1b2af60618f31 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -123,6 +123,7 @@ typedef enum { IMB_ARCH_AVX, IMB_ARCH_AVX2, IMB_ARCH_AVX512, + IMB_ARCH_AARCH64, IMB_ARCH_NUM, } IMB_ARCH; @@ -984,6 +985,9 @@ typedef uint32_t (*crc32_fn_t)(const void *, const uint64_t); #define IMB_FEATURE_BMI2 (1ULL << 18) #define IMB_FEATURE_AESNI_EMU (1ULL << 19) +#define IMB_FEATURE_AARCH64 (1ULL << 32) +#define IMB_FEATURE_ASIMD (1ULL << 33) + /* TOP LEVEL (IMB_MGR) Data structure fields */ #define IMB_MAX_JOBS 128 @@ -1307,6 +1311,14 @@ IMB_DLL_EXPORT uint32_t queue_size_sse(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_completed_job_sse(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_next_job_sse(IMB_MGR *state); +IMB_DLL_EXPORT void init_mb_mgr_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64(IMB_MGR *state); + /** * @brief Automatically initialize most performant * Multi-buffer manager based on CPU features diff --git a/lib/no-aesni/aesni_emu.c b/lib/no-aesni/aesni_emu.c index 9fc35db8ee823161d752b44bd0926fa3405c69d7..29492234837ac4ed82ebf945886679f942e1e732 100644 --- a/lib/no-aesni/aesni_emu.c +++ b/lib/no-aesni/aesni_emu.c @@ -28,16 +28,24 @@ /* 
========================================================================== */
 /* AESNI emulation API and helper functions */
 /* ========================================================================== */
+#define AESNI_EMU
+
 #include "ipsec-mb.h"
 #include "aesni_emu.h"
-#include "include/constant_lookup.h"
+#ifdef __aarch64__
+#include "aarch64/constant_lookup_aarch64.h"
+#include <arm_neon.h>
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
+#include "include/constant_lookup.h"
 #ifdef LINUX
 #include <x86intrin.h>
 #else
 #include <intrin.h>
 #endif
+#endif /* __x86_64__ */
 
 static const DECLARE_ALIGNED(uint8_t aes_sbox[16][16], 16) = {
         { 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
@@ -132,23 +140,41 @@ static uint32_t rot(const uint32_t x)
 static void substitute_bytes(union xmm_reg *dst, const union xmm_reg *src)
 {
+#ifdef __aarch64__
+        uint8x16_t vx = vld1q_u8((uint8_t const *) &src->byte[0]);
+
+        IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
+
+        vx = lookup_16x8bit_neon(vx, aes_sbox);
+        vst1q_u8((uint8_t *) &dst->byte[0], vx);
+#else
         __m128i vx = _mm_loadu_si128((const __m128i *) &src->byte[0]);
 
         IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
 
         vx = lookup_16x8bit_sse(vx, aes_sbox);
         _mm_storeu_si128((__m128i *) &dst->byte[0], vx);
+#endif
 }
 
 static void inverse_substitute_bytes(union xmm_reg *dst,
                                      const union xmm_reg *src)
 {
+#ifdef __aarch64__
+        uint8x16_t vx = vld1q_u8((uint8_t const *) &src->byte[0]);
+
+        IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
+
+        vx = lookup_16x8bit_neon(vx, aes_isbox);
+        vst1q_u8((uint8_t *) &dst->byte[0], vx);
+#else
         __m128i vx = _mm_loadu_si128((const __m128i *) &src->byte[0]);
 
         IMB_ASSERT(MAX_BYTES_PER_XMM == 16);
 
         vx = lookup_16x8bit_sse(vx, aes_isbox);
         _mm_storeu_si128((__m128i *) &dst->byte[0], vx);
+#endif
 }
 
 static uint8_t gfmul(const uint8_t x, const uint8_t y)
diff --git a/perf/Makefile b/perf/Makefile
index 67d58449be058db956330dbac497ced77ccd936f..3dc13dc20df9a9f24750523902fce91695e1c53e 100644
--- a/perf/Makefile
+++ b/perf/Makefile
@@ -1,8 +1,8 @@
 # Copyright (c) 2017-2022, Intel Corporation
-# 
+#
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions are met:
-# 
+#
 # * Redistributions of source code must retain the above copyright notice,
 #   this list of conditions and the following disclaimer.
 # * Redistributions in binary form must reproduce the above copyright
@@ -11,7 +11,7 @@
 # * Neither the name of Intel Corporation nor the names of its contributors
 #   may be used to endorse or promote products derived from this software
 #   without specific prior written permission.
-# +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -26,6 +26,9 @@ EXE=ipsec_perf INSTPATH ?= /usr/include/ipsec-mb.h LIB_DIR ?= ../lib + +ARCH = $(shell uname -m) + NASM ?= nasm MINGW ?= $(shell $(CC) -dM -E - < /dev/null | grep -i mingw | wc -l | sed 's/^ *//') @@ -47,11 +50,14 @@ endif # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported # "-fcf-protection" needs to be checked separately +ifeq ($(ARCH),x86_64) ifeq ($(MINGW),0) CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \ $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l)) CET_LDFLAGS=-r -z ibt -z shstk -endif +endif # MINGW +endif # x86_64 + ifeq ($(CC_HAS_CET),1) CFLAGS += -fcf-protection=full endif @@ -82,12 +88,18 @@ ifeq ($(MINGW),0) CFLAGS += -O3 -fPIE -fstack-protector -D_FORTIFY_SOURCE=2 else CFLAGS += -O2 -fPIE -endif -endif +endif # MINGW +endif # DEBUG SOURCES := ipsec_perf.c msr.c +ifneq ($(ARCH),aarch64) ASM_SOURCES := misc.asm OBJECTS := $(SOURCES:%.c=%.o) $(ASM_SOURCES:%.asm=%.o) +else +ASM_SOURCES := misc_aarch64.S +CFLAGS += -march=armv8-a +OBJECTS := $(SOURCES:%.c=%.o) $(ASM_SOURCES:%.S=%.o) +endif # AARCH64 CHECKPATCH ?= checkpatch.pl CPPCHECK ?= cppcheck @@ -95,12 +107,17 @@ CPPCHECK ?= cppcheck .PHONY: all clean style cppcheck # rule for compiling assembly code with producing dependencies +ifneq ($(ARCH),aarch64) %.o:%.asm $(NASM) -MD $(@:.o=.d) -MT $@ -o $@ $(NASM_FLAGS) $< ifeq ($(CC_HAS_CET),1) $(LD) $(CET_LDFLAGS) -o $@.tmp $@ mv $@.tmp $@ -endif +endif # CC_HAS_CET +else +%.o:%.S + $(CC) -c $(CFLAGS) $< -o $@ +endif # AARCH64 all: $(EXE) diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index 3200536adc75716a0aaa6b161ea57a2817239073..97dc132769cbd60014a3defb0f67f9b31098594b 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -43,19 +43,21 @@ #define __forceinline static __forceinline #define __func__ __FUNCTION__ #define strcasecmp _stricmp -#else +#else /* _WIN32 */ #include +#ifdef __x86_64__ #include +#endif /* __x86_64__ */ #define __forceinline static inline __attribute__((always_inline)) #include #include #if defined (__FreeBSD__) #include typedef cpuset_t cpu_set_t; -#else +#else /* __FreeBSD__ */ #include -#endif -#endif +#endif /* __FreeBSD__ */ +#endif /* _WIN32 */ #include @@ -112,6 +114,7 @@ enum arch_type_e { ARCH_AVX, ARCH_AVX2, ARCH_AVX512, + ARCH_AARCH64, NUM_ARCHS }; @@ -192,13 +195,13 @@ enum test_hash_alg_e { /* Struct storing cipher parameters */ struct params_s { - IMB_CIPHER_DIRECTION cipher_dir; - enum test_cipher_mode_e cipher_mode; - enum test_hash_alg_e hash_alg; - uint32_t aes_key_size; - uint32_t size_aes; - uint64_t aad_size; - uint32_t num_sizes; + IMB_CIPHER_DIRECTION cipher_dir; + enum test_cipher_mode_e cipher_mode; + enum test_hash_alg_e hash_alg; + uint32_t aes_key_size; + uint32_t size_aes; + uint64_t aad_size; + uint32_t num_sizes; uint32_t core; }; @@ -223,7 +226,8 @@ const struct str_value_mapping arch_str_map[] = { {.name = "SSE", .values.arch_type = ARCH_SSE }, {.name = "AVX", .values.arch_type = ARCH_AVX }, {.name = "AVX2", .values.arch_type = ARCH_AVX2 }, - {.name = "AVX512", .values.arch_type = ARCH_AVX512 } + {.name = "AVX512", .values.arch_type = ARCH_AVX512 }, + {.name = "AARCH64",.values.arch_type = ARCH_AARCH64 }, }; const struct str_value_mapping 
cipher_algo_str_map[] = { @@ -879,7 +883,7 @@ struct custom_job_params custom_job_params = { .cipher_dir = IMB_DIR_ENCRYPT }; -uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1}; /* uses all function sets */ +uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1}; /* uses all function sets */ int use_gcm_job_api = 0; int use_gcm_sgl_api = 0; int use_unhalted_cycles = 0; /* read unhalted cycles instead of tsc */ @@ -1064,8 +1068,8 @@ static int set_affinity(const int cpu) /* Set affinity of current process to cpu */ #if defined(__FreeBSD__) - ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, - sizeof(cpuset), &cpuset); + ret = cpuset_setaffinity(CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, + sizeof(cpuset), &cpuset); #else ret = sched_setaffinity(0, sizeof(cpuset), &cpuset); #endif @@ -1158,6 +1162,16 @@ static int set_avg_unhalted_cycle_cost(const int core, uint64_t *value) return 0; } +static inline uint64_t perf_rdtscp(void) +{ +#ifdef __aarch64__ + return rdtscp(); +#else + uint32_t aux; + return __rdtscp(&aux); +#endif +} + /* Freeing allocated memory */ static void free_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) { @@ -1238,7 +1252,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) uint8_t *buf = NULL; imb_uint128_t *keys = NULL; #ifdef LINUX - int ret; + int ret; #endif if (p_keys == NULL || p_buffer == NULL) { @@ -1249,7 +1263,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) #ifdef LINUX ret = posix_memalign((void **) &buf, alignment, bufs_size); - if (ret != 0) { + if (ret != 0) { fprintf(stderr, "Could not malloc buf\n"); exit(EXIT_FAILURE); } @@ -1263,7 +1277,7 @@ static void init_mem(uint8_t **p_buffer, imb_uint128_t **p_keys) #ifdef LINUX ret = posix_memalign((void **) &keys, alignment, keys_size); - if (ret != 0) { + if (ret != 0) { fprintf(stderr, "Could not allocate memory for keys!\n"); free_mem(&buf, &keys); exit(EXIT_FAILURE); @@ -1588,11 +1602,10 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, static DECLARE_ALIGNED(imb_uint128_t auth_iv, 16); static uint32_t ipad[5], opad[5], digest[3]; static DECLARE_ALIGNED(uint32_t k1_expanded[11 * 4], 16); - static DECLARE_ALIGNED(uint8_t k2[16], 16); - static DECLARE_ALIGNED(uint8_t k3[16], 16); + static DECLARE_ALIGNED(uint8_t k2[16], 16); + static DECLARE_ALIGNED(uint8_t k3[16], 16); static DECLARE_ALIGNED(struct gcm_key_data gdata_key, 512); uint64_t time = 0; - uint32_t aux; uint8_t gcm_key[32]; uint8_t next_iv[IMB_AES_BLOCK_SIZE]; @@ -1863,7 +1876,7 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); for (i = 0; i < num_iter; i++) { job = IMB_GET_NEXT_JOB(mb_mgr); @@ -1909,7 +1922,7 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, time = (read_cycles(params->core) - rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; if (!num_iter) return time; @@ -1983,7 +1996,6 @@ do_test_gcm(struct params_s *params, uint8_t *key; uint8_t *aad = NULL; uint64_t time = 0; - uint32_t aux; key = (uint8_t *) malloc(sizeof(uint8_t) * params->aes_key_size); if (!key) { @@ -2020,7 +2032,7 @@ do_test_gcm(struct params_s *params, time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); if (params->aes_key_size == IMB_KEY_128_BYTES) { if (use_gcm_sgl_api) @@ -2074,14 +2086,14 @@ do_test_gcm(struct params_s *params, rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; } else { /*DECRYPT*/ #ifndef 
_WIN32 if (use_unhalted_cycles) time = read_cycles(params->core); else #endif - time = __rdtscp(&aux); + time = perf_rdtscp(); if (params->aes_key_size == IMB_KEY_128_BYTES) { if (use_gcm_sgl_api) @@ -2135,7 +2147,7 @@ do_test_gcm(struct params_s *params, rd_cycles_cost) - time; else #endif - time = __rdtscp(&aux) - time; + time = perf_rdtscp() - time; } free(key); @@ -2270,8 +2282,8 @@ print_times(struct variant_s *variant_list, struct params_s *params, uint32_t sz; if (plot_output_option == 0) { - const char *func_names[4] = { - "SSE", "AVX", "AVX2", "AVX512" + const char *func_names[NUM_ARCHS] = { + "SSE", "AVX", "AVX2", "AVX512", "AARCH64" }; const char *c_mode_names[TEST_NUM_CIPHER_TESTS - 1] = { "CBC", "CNTR", "CNTR+8", "CNTR_BITLEN", "CNTR_BITLEN4", @@ -2481,11 +2493,12 @@ run_tests(void *arg) params.hash_alg = custom_job_params.hash_alg; /* Performing tests for each selected architecture */ - for (arch = ARCH_SSE; arch <= ARCH_AVX512; arch++) { + for (arch = ARCH_SSE; arch < NUM_ARCHS; arch++) { if (archs[arch] == 0) continue; switch (arch) { +#ifdef __x86_64__ case ARCH_SSE: init_mb_mgr_sse(p_mgr); break; @@ -2495,9 +2508,19 @@ run_tests(void *arg) case ARCH_AVX2: init_mb_mgr_avx2(p_mgr); break; - default: /* ARCH_AV512 */ + case ARCH_AVX512: init_mb_mgr_avx512(p_mgr); break; +#endif /* __x86_64__ */ + +#ifdef __aarch64__ + case ARCH_AARCH64: + init_mb_mgr_aarch64(p_mgr); + break; +#endif /* __aarch64__ */ + default: + fprintf(stderr, "Invalid architecture: %d\n", arch); + goto exit_failure; } process_variant(p_mgr, arch, ¶ms, @@ -2559,7 +2582,7 @@ static void usage(void) "-h: print this message\n" "-c: Use cold cache, it uses warm as default\n" "-w: Use warm cache\n" - "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512)\n" + "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64)\n" "--arch-best: detect available architectures and run only on the best one\n" "--cipher-dir: Select cipher direction to run on the custom test " "(encrypt/decrypt) (default = encrypt)\n" @@ -2664,6 +2687,7 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI; const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; + const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; IMB_MGR *p_mgr = NULL; enum arch_type_e arch_id; @@ -2693,6 +2717,9 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) if ((p_mgr->features & detect_sse) != detect_sse) arch_support[ARCH_SSE] = 0; + if ((p_mgr->features & detect_aarch64) != detect_aarch64) + arch_support[ARCH_AARCH64] = 0; + free_mb_mgr(p_mgr); return 0; @@ -2894,6 +2921,7 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI; const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; + const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; IMB_MGR *p_mgr = NULL; uint64_t detected_features = 0; @@ -2934,6 +2962,11 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) return 0; } + if ((detected_features & detect_aarch64) == detect_aarch64) { + arch_support[ARCH_AARCH64] = 1; + return 0; + } + fprintf(stderr, "Arch detection: no architecture available!\n"); return -1; } @@ -3220,38 +3253,38 @@ int main(int argc, char *argv[]) } num_sizes_list = JOB_SIZE_IMIX_LIST; - /* - * Calculate accumulated distribution of - * probabilities per job size - 
*/ - distribution_total[0] = imix_list[0]; - for (i = 1; i < (int)imix_list_count; i++) - distribution_total[i] = imix_list[i] + - distribution_total[i-1]; + /* + * Calculate accumulated distribution of + * probabilities per job size + */ + distribution_total[0] = imix_list[0]; + for (i = 1; i < (int)imix_list_count; i++) + distribution_total[i] = imix_list[i] + + distribution_total[i-1]; /* Use always same seed */ srand(0); - /* Calculate a random sequence of packet sizes, + /* Calculate a random sequence of packet sizes, based on distribution */ - for (i = 0; i < (int)JOB_SIZE_IMIX_LIST; i++) { - uint16_t random_number = rand() % - distribution_total[imix_list_count - 1]; + for (i = 0; i < (int)JOB_SIZE_IMIX_LIST; i++) { + uint16_t random_number = rand() % + distribution_total[imix_list_count - 1]; uint16_t j; - for (j = 0; j < imix_list_count; j++) - if (random_number < distribution_total[j]) - break; + for (j = 0; j < imix_list_count; j++) + if (random_number < distribution_total[j]) + break; - job_size_imix_list[i] = job_size_list[j]; - } + job_size_imix_list[i] = job_size_list[j]; + } - /* Calculate average buffer size for the IMIX distribution */ - for (i = 0; i < (int)imix_list_count; i++) - average_job_size += job_size_list[i] * - imix_list[i]; + /* Calculate average buffer size for the IMIX distribution */ + for (i = 0; i < (int)imix_list_count; i++) + average_job_size += job_size_list[i] * + imix_list[i]; - average_job_size /= - distribution_total[imix_list_count - 1]; + average_job_size /= + distribution_total[imix_list_count - 1]; } cipher_size_list = (uint32_t *) malloc(sizeof(uint32_t) * num_sizes_list); @@ -3317,6 +3350,9 @@ int main(int argc, char *argv[]) if (tsc_detect) fprintf(stderr, "TSC scaling to core cycles: %.3f\n", get_tsc_to_core_scale(turbo_enabled)); +#ifdef __aarch64__ + fprintf(stderr, "CNT frequency: %ld\n", read_cntfreq()); +#endif fprintf(stderr, "Authentication size = cipher size + %u\n" @@ -3330,6 +3366,7 @@ int main(int argc, char *argv[]) if (custom_job_params.cipher_mode == TEST_CCM) fprintf(stderr, "CCM AAD = %"PRIu64"\n", ccm_aad_size); +#ifdef __x86_64__ if (archs[ARCH_SSE]) { IMB_MGR *p_mgr = alloc_mb_mgr(flags); @@ -3343,6 +3380,15 @@ int main(int argc, char *argv[]) "Using" : "Not using"); free_mb_mgr(p_mgr); } +#else + IMB_MGR *p_mgr = alloc_mb_mgr(flags); + + if (p_mgr == NULL) { + fprintf(stderr, "Error allocating MB_MGR structure!\n"); + return EXIT_FAILURE; + } + free_mb_mgr(p_mgr); +#endif /* __x86_64__ */ memset(t_info, 0, sizeof(t_info)); init_offsets(cache_type); diff --git a/perf/misc.h b/perf/misc.h index 0f6b504dbccf1a32d7ac57e31113ef49455d99d6..2fcdbe9a63a65501935077b35cc958481be12efa 100644 --- a/perf/misc.h +++ b/perf/misc.h @@ -34,3 +34,8 @@ * @return Number of TSC cycles measured while in fixed cost loop */ uint64_t measure_tsc(const uint64_t cycles); + +#ifdef __aarch64__ +uint64_t rdtscp(void); +uint64_t read_cntfreq(void); +#endif diff --git a/perf/misc_aarch64.S b/perf/misc_aarch64.S new file mode 100644 index 0000000000000000000000000000000000000000..3a3d112ba87bfc2d13cdd8b289a231c8e7dfe45a --- /dev/null +++ b/perf/misc_aarch64.S @@ -0,0 +1,60 @@ +/********************************************************************** + Copyright(c) 2021 Arm Corporation All rights reserved. 
+uint64_t rdtscp(void);
+uint64_t read_cntfreq(void);
+#endif
diff --git a/perf/misc_aarch64.S b/perf/misc_aarch64.S
new file mode 100644
index 0000000000000000000000000000000000000000..3a3d112ba87bfc2d13cdd8b289a231c8e7dfe45a
--- /dev/null
+++ b/perf/misc_aarch64.S
@@ -0,0 +1,60 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.global measure_tsc
+.type measure_tsc,%function
+.align 5
+measure_tsc:
+        mov x5,x0
+        mrs x6,CNTVCT_EL0
+fixed_loop:
+        sub x5,x5,#1
+        cbnz x5,fixed_loop
+
+        mrs x7,CNTVCT_EL0
+        sub x0,x7,x6
+
+        ret
+
+
+.global rdtscp
+.type rdtscp,%function
+.align 5
+rdtscp:
+        mrs x0,CNTVCT_EL0
+
+        ret
+
+
+.global read_cntfreq
+.type read_cntfreq,%function
+.align 5
+read_cntfreq:
+        mrs x0,CNTFRQ_EL0
+
+        ret
diff --git a/test/Makefile b/test/Makefile
index 949001c509bf0f99f8185c33ea417a6afe5298fe..8aab75995d331f6648275d8a0cb0fbb64a6b26d0 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -32,6 +32,8 @@ ACVP_APP := acvp_app
 INSTPATH ?= /usr/include/ipsec-mb.h
 LIB_DIR ?= ../lib
 
+ARCH = $(shell uname -m)
+
 USE_YASM ?= n
 YASM ?= yasm
 NASM ?= nasm
@@ -48,11 +50,13 @@ CFLAGS = -MMD -D_GNU_SOURCE -DNO_COMPAT_IMB_API_053 \
 
 # if "-z ibt" is supported then assume "-z shstk, -z cet-report=error" are also supported
 # "-fcf-protection" needs to be checked separately
+ifeq ($(ARCH),x86_64)
 ifeq ($(MINGW),0)
 CC_HAS_CET = $(and $(shell $(CC) --target-help 2> /dev/null | grep -m1 -e "-z ibt" | wc -l), \
              $(shell $(CC) --help=common 2> /dev/null | grep -m1 -e "-fcf-protection" | wc -l))
 CET_LDFLAGS=-r -z ibt -z shstk
-endif
+endif # MINGW
+endif # x86_64
 
 ifeq ($(CC_HAS_CET),1)
 CFLAGS += -fcf-protection=full
@@ -97,8 +101,8 @@ ifeq ($(MINGW),0)
 CFLAGS += -O3
 else
 CFLAGS += -O2
-endif
-endif
+endif # MINGW
+endif # DEBUG
 
 ACVP_LOC ?= /usr/local/acvp
 ACVP_HDR ?= $(ACVP_LOC)/include
@@ -108,6 +112,27 @@ ACVP_LDFLAGS = -L$(ACVP_LIB) $(LDFLAGS)
 ACVP_LDLIBS = -lacvp $(LDLIBS)
 
 # ipsec_MB_testapp modules
+ifeq ($(ARCH),aarch64)
+SOURCES := main.c utils.c api_test.c snow3g_test.c direct_api_test.c clear_mem_test.c
+OBJECTS := $(SOURCES:%.c=%.o)
+
+# ipsec_xvalid_test modules
+XVALID_ASM := misc_aarch64.S
+XVALID_SOURCES := ipsec_xvalid.c utils.c
+XVALID_OBJECTS := $(XVALID_SOURCES:%.c=%.o) $(XVALID_ASM:%.S=%.o)
+
+# fuzz modules
+FUZZ_SOURCES := job_api_fuzz_test.c
+
+# list of present dependency files
+DEP_FILES = $(wildcard ./*.d)
+
+# rule for compiling assembly code and producing dependencies
+%.o:%.S
+	$(CC) -c $(CFLAGS) $< -o $@
+endif # aarch64
+
+ifeq ($(ARCH),x86_64)
 SOURCES := main.c gcm_test.c ctr_test.c customop_test.c des_test.c ccm_test.c \
 	cmac_test.c utils.c hmac_sha1_test.c hmac_sha256_sha512_test.c \
 	hmac_md5_test.c aes_test.c sha_test.c chained_test.c api_test.c pon_test.c \
@@ -144,7 +169,8 @@ endif
 ifeq ($(CC_HAS_CET),1)
 	$(LD) $(CET_LDFLAGS) -o $@.tmp $@
 	mv $@.tmp $@
-endif
+endif # CC_HAS_CET
+endif # x86_64
 
 # targets come here
 all: $(TEST_APP) $(XVALID_APP) $(FUZZ_APP) $(ACVP_APP)
diff --git a/test/api_test.c b/test/api_test.c
index b50a0cce60be476ae6065b6e92d3bc80bfcbfac4..476a9a89f08fe91905b70d34884d2cf2786ff3e8 100644
--- a/test/api_test.c
+++ b/test/api_test.c
@@ -679,6 +679,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -709,6 +713,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -741,6 +749,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                 if (hash == IMB_AUTH_NULL || hash == IMB_AUTH_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -793,6 +805,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
                     hash == IMB_AUTH_CRC6_IUUP_HEADER ||
                     hash == IMB_AUTH_POLY1305)
                         continue;
+#ifdef __aarch64__
+                if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip hash algorithms belonging to AEAD
@@ -847,6 +863,10 @@ test_job_invalid_mac_args(struct IMB_MGR *mb_mgr)
         for (dir = IMB_DIR_ENCRYPT; dir <= IMB_DIR_DECRYPT; dir++)
                 for (hash = IMB_AUTH_HMAC_SHA_1; hash < IMB_AUTH_NUM;
                      hash++) {
+#ifdef __aarch64__
+                        if (hash != IMB_AUTH_SNOW3G_UIA2_BITLEN)
+                                continue;
+#endif
 
                         switch (hash) {
                         /*
@@ -1060,6 +1080,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1089,6 +1113,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1118,6 +1146,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1148,6 +1180,12 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
              order++)
                 for (cipher = IMB_CIPHER_CBC; cipher < IMB_CIPHER_NUM;
                      cipher++) {
+#ifdef __aarch64__
+                        if ((cipher != IMB_CIPHER_NULL) &&
+                            (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                                continue;
+#endif
+
                         fill_in_job(&template_job, cipher, IMB_DIR_ENCRYPT,
                                     hash, order, &chacha_ctx, &gcm_ctx);
@@ -1181,6 +1219,11 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
              order++)
                 for (cipher = IMB_CIPHER_CBC; cipher < IMB_CIPHER_NUM;
                      cipher++) {
+#ifdef __aarch64__
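+                        /*
+                         * The aarch64 port currently implements only the
+                         * SNOW3G and NULL cipher paths, so skip the rest.
+                         */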
+                        if ((cipher != IMB_CIPHER_NULL) &&
+                            (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                                continue;
+#endif
                         /*
                          * Skip cipher algorithms belonging to AEAD
                          * algorithms, as the test is for cipher
@@ -1251,6 +1294,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1298,6 +1345,10 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                 if (cipher == IMB_CIPHER_NULL || cipher == IMB_CIPHER_CUSTOM)
                         continue;
 
+#ifdef __aarch64__
+                if (cipher != IMB_CIPHER_SNOW3G_UEA2_BITLEN)
+                        continue;
+#endif
 
                 /*
                  * Skip cipher algorithms belonging to AEAD
@@ -1503,6 +1554,7 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
          * OTHER MISC TESTS
          */
 
+#ifndef __aarch64__
         /* CBCS NULL NEXT IV TEST */
         for (order = IMB_ORDER_CIPHER_HASH; order <= IMB_ORDER_HASH_CIPHER;
              order++)
@@ -1529,6 +1581,7 @@ test_job_invalid_cipher_args(struct IMB_MGR *mb_mgr)
                         return 1;
                 printf(".");
         }
+#endif
 
         /* clean up */
         while (IMB_FLUSH_JOB(mb_mgr) != NULL)
diff --git a/test/clear_mem_test.c b/test/clear_mem_test.c
index 50804e007c1780b45247aa3367e115e334374b9b..2443bd9d5247cfec0d51e950d9350a168caea6a1 100644
--- a/test/clear_mem_test.c
+++ b/test/clear_mem_test.c
@@ -31,7 +31,15 @@
 #include 
 #include "ipsec-mb.h"
+
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
 #include "gcm_ctr_vectors_test.h"
+#endif /* __x86_64__ */
+
 #include "utils.h"
 
 #define MAX_RAND 1024
diff --git a/test/direct_api_test.c b/test/direct_api_test.c
index faba01e4bbfb5473442b5ab594120eec511d9788..d1718e2b0e3ed67c41cfc4b148d16758999156dd 100644
--- a/test/direct_api_test.c
+++ b/test/direct_api_test.c
@@ -33,7 +33,15 @@
 #include 
 #include 
+
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif /* __aarch64__ */
+
+#ifdef __x86_64__
 #include "gcm_ctr_vectors_test.h"
+#endif /* __x86_64__ */
+
 #include "utils.h"
 
 #define BUF_SIZE ((uint32_t)sizeof(struct gcm_key_data))
@@ -64,6 +72,7 @@ seg_handler(int signum)
 }
 #endif /* DEBUG */
 
+#ifndef __aarch64__
 /*
  * @brief Performs direct GCM API invalid param tests
  */
@@ -875,6 +884,7 @@ test_kasumi_api(struct IMB_MGR *mgr)
         printf("\n");
         return 0;
 }
+#endif /* __aarch64__ */
 
 /*
  * @brief Performs direct SNOW3G API invalid param tests
@@ -1141,6 +1151,7 @@ direct_api_test(struct IMB_MGR *mb_mgr)
                 goto dir_api_exit;
         }
 
+#ifdef __x86_64__
         errors += test_gcm_api(mb_mgr);
         run++;
 
@@ -1158,6 +1169,7 @@ direct_api_test(struct IMB_MGR *mb_mgr)
 
         errors += test_kasumi_api(mb_mgr);
         run++;
+#endif /* __x86_64__ */
 
         errors += test_snow3g_api(mb_mgr);
         run++;
@@ -1170,5 +1182,5 @@ direct_api_test(struct IMB_MGR *mb_mgr)
 #ifndef DEBUG
         signal(SIGSEGV, handler);
 #endif
-	return errors;
+        return errors;
 }
diff --git a/test/ipsec_xvalid.c b/test/ipsec_xvalid.c
index df5112448339cb7ebcdfd6140043211b04083c95..5c861216994272009e7c6c9833d3939315bc4313 100644
--- a/test/ipsec_xvalid.c
+++ b/test/ipsec_xvalid.c
@@ -50,11 +50,18 @@
 #define __func__ __FUNCTION__
 #define strcasecmp _stricmp
 #else
-#include 
 #define BSWAP64 __builtin_bswap64
 #endif
 
 #include 
 
+#ifdef __aarch64__
+#include "aarch64/clear_regs_mem_aarch64.h"
+#endif
+
+#ifdef __x86_64__
+#include 
+#endif
+
 /* maximum size of a test buffer */
 #define JOB_SIZE_TOP (16 * 1024)
 
@@ -99,10 +106,10 @@ static uint64_t pattern8_plain_text;
 
 struct params_s {
         IMB_CIPHER_MODE cipher_mode; /* CBC, CNTR, DES, GCM etc. */
         IMB_HASH_ALG hash_alg; /* SHA-1 or others... */
-	uint32_t key_size;
-	uint32_t buf_size;
-	uint64_t aad_size;
-	uint32_t num_sizes;
+        uint32_t key_size;
+        uint32_t buf_size;
+        uint64_t aad_size;
+        uint32_t num_sizes;
 };
 
 /* Struct storing all expanded keys */
@@ -112,8 +119,8 @@ struct cipher_auth_keys {
         uint8_t ipad[IMB_SHA512_DIGEST_SIZE_IN_BYTES];
         uint8_t opad[IMB_SHA512_DIGEST_SIZE_IN_BYTES];
         DECLARE_ALIGNED(uint32_t k1_expanded[15 * 4], 16);
-	DECLARE_ALIGNED(uint8_t k2[32], 16);
-	DECLARE_ALIGNED(uint8_t k3[16], 16);
+        DECLARE_ALIGNED(uint8_t k2[32], 16);
+        DECLARE_ALIGNED(uint8_t k3[16], 16);
         DECLARE_ALIGNED(uint32_t enc_keys[15 * 4], 16);
         DECLARE_ALIGNED(uint32_t dec_keys[15 * 4], 16);
         DECLARE_ALIGNED(struct gcm_key_data gdata_key, 64);
@@ -152,11 +159,12 @@ struct str_value_mapping {
 
 const struct str_value_mapping arch_str_map[] = {
         {.name = "NONE", .values.arch_type = IMB_ARCH_NONE },
-        {.name = "SSE", .values.arch_type = IMB_ARCH_SSE },
         {.name = "NO-AESNI", .values.arch_type = IMB_ARCH_NOAESNI },
+        {.name = "SSE", .values.arch_type = IMB_ARCH_SSE },
         {.name = "AVX", .values.arch_type = IMB_ARCH_AVX },
         {.name = "AVX2", .values.arch_type = IMB_ARCH_AVX2 },
-        {.name = "AVX512", .values.arch_type = IMB_ARCH_AVX512 }
+        {.name = "AVX512", .values.arch_type = IMB_ARCH_AVX512 },
+        {.name = "AARCH64", .values.arch_type = IMB_ARCH_AARCH64 },
 };
 
 struct str_value_mapping cipher_algo_str_map[] = {
@@ -673,8 +681,8 @@ struct custom_job_params custom_job_params = {
 };
 
 /* AESNI_EMU disabled by default */
-uint8_t enc_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1};
-uint8_t dec_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1};
+uint8_t enc_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1, 1};
+uint8_t dec_archs[IMB_ARCH_NUM] = {0, 0, 1, 1, 1, 1, 1};
 
 uint64_t flags = 0; /* flags passed to alloc_mb_mgr() */
 
@@ -1455,6 +1463,7 @@ prepare_keys(IMB_MGR *mb_mgr, struct cipher_auth_keys *keys,
         }
 
         switch (params->cipher_mode) {
+#ifndef __aarch64__
         case IMB_CIPHER_GCM:
                 switch (params->key_size) {
                 case IMB_KEY_128_BYTES:
@@ -1532,6 +1541,11 @@ prepare_keys(IMB_MGR *mb_mgr, struct cipher_auth_keys *keys,
                 memcpy(k2, ciph_key, 16);
                 memcpy(k2 + 16, ciph_key + 16, 16);
                 break;
+#else
+        case IMB_CIPHER_SNOW3G_UEA2_BITLEN:
+                memcpy(k2, ciph_key, 16);
+                break;
+#endif
         case IMB_CIPHER_NULL:
                 /* No operation needed */
                 break;
@@ -1609,8 +1623,9 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
 
         dump_gps();
         switch (arch) {
-        case IMB_ARCH_SSE:
+#ifdef __x86_64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 dump_xmms_sse();
                 simd_size = XMM_MEM_SIZE;
                 break;
@@ -1626,6 +1641,15 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
                 dump_zmms();
                 simd_size = ZMM_MEM_SIZE;
                 break;
+#endif
+
+#ifdef __aarch64__
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                dump_simd_regs();
+                simd_size = SIMD_MEM_SIZE;
+                break;
+#endif
 
         default:
                 fprintf(stderr, "Error getting the architecture\n");
@@ -1659,6 +1683,8 @@ perform_safe_checks(IMB_MGR *mgr, const IMB_ARCH arch, const char *dir)
              ooo_ptr++, i++) {
                 void *ooo_mgr_p = *ooo_ptr;
 
+                if (ooo_mgr_p == NULL) continue;
+
                 if (search_patterns(ooo_mgr_p,
                                     get_ooo_mgr_size(ooo_mgr_p, i)) == 0) {
                         fprintf(stderr,
@@ -1675,8 +1701,9 @@ static void
 clear_scratch_simd(const IMB_ARCH arch)
 {
         switch (arch) {
-        case IMB_ARCH_SSE:
+#ifdef __x86_64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 clr_scratch_xmms_sse();
                 break;
         case IMB_ARCH_AVX:
@@ -1688,6 +1715,14 @@ clear_scratch_simd(const IMB_ARCH arch)
         case IMB_ARCH_AVX512:
                 clr_scratch_zmms();
                 break;
+#endif
+
+#ifdef __aarch64__
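+        /* both aarch64 arch types share the same ASIMD scratch registers */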
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                CLEAR_SCRATCH_SIMD_REGS();
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2348,8 +2383,9 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         }
 
         switch (enc_arch) {
-        case IMB_ARCH_SSE:
+#ifndef __aarch64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 init_mb_mgr_sse(enc_mgr);
                 break;
         case IMB_ARCH_AVX:
@@ -2361,6 +2397,12 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         case IMB_ARCH_AVX512:
                 init_mb_mgr_avx512(enc_mgr);
                 break;
+#else
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                init_mb_mgr_aarch64(enc_mgr);
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2385,8 +2427,9 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         }
 
         switch (dec_arch) {
-        case IMB_ARCH_SSE:
+#ifndef __aarch64__
         case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_SSE:
                 init_mb_mgr_sse(dec_mgr);
                 break;
         case IMB_ARCH_AVX:
@@ -2398,6 +2441,12 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
         case IMB_ARCH_AVX512:
                 init_mb_mgr_avx512(dec_mgr);
                 break;
+#else
+        case IMB_ARCH_NOAESNI:
+        case IMB_ARCH_AARCH64:
+                init_mb_mgr_aarch64(dec_mgr);
+                break;
+#endif
         default:
                 fprintf(stderr, "Invalid architecture\n");
                 exit(EXIT_FAILURE);
@@ -2423,7 +2472,11 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
                 /* Skip IMB_CIPHER_CUSTOM */
                 if (c_mode == IMB_CIPHER_CUSTOM)
                         continue;
-
+#ifdef __aarch64__
+                if ((c_mode != IMB_CIPHER_NULL) &&
+                    (c_mode != IMB_CIPHER_SNOW3G_UEA2_BITLEN))
+                        continue;
+#endif
                 params->cipher_mode = c_mode;
 
                 for (hash_alg = IMB_AUTH_HMAC_SHA_1;
@@ -2432,7 +2485,11 @@ run_test(const IMB_ARCH enc_arch, const IMB_ARCH dec_arch,
                         /* Skip IMB_AUTH_CUSTOM */
                         if (hash_alg == IMB_AUTH_CUSTOM)
                                 continue;
-
+#ifdef __aarch64__
+                        if ((hash_alg != IMB_AUTH_NULL) &&
+                            (hash_alg != IMB_AUTH_SNOW3G_UIA2_BITLEN))
+                                continue;
+#endif
                         /* Skip not supported combinations */
                         if ((c_mode == IMB_CIPHER_GCM &&
                              hash_alg != IMB_AUTH_AES_GMAC) ||
@@ -2571,18 +2628,29 @@ static void usage(const char *app_name)
                 "where args are zero or more\n"
                 "-h: print this message\n"
                 "-v: verbose, prints extra information\n"
+#ifdef __aarch64__
+                "--enc-arch: encrypting with architecture "
+                "(NO-AESNI/AARCH64)\n"
+                "--dec-arch: decrypting with architecture "
+                "(NO-AESNI/AARCH64)\n"
+#else
                 "--enc-arch: encrypting with architecture "
                 "(NO-AESNI/SSE/AVX/AVX2/AVX512)\n"
                 "--dec-arch: decrypting with architecture "
                 "(NO-AESNI/SSE/AVX/AVX2/AVX512)\n"
+#endif
                 "--cipher-algo: Select cipher algorithm to run on the custom "
                 "test\n"
                 "--hash-algo: Select hash algorithm to run on the custom test\n"
                 "--aead-algo: Select AEAD algorithm to run on the custom test\n"
+#ifdef __aarch64__
+                "--no-aarch64: Don't do AARCH64\n"
+#else
                 "--no-avx512: Don't do AVX512\n"
                 "--no-avx2: Don't do AVX2\n"
                 "--no-avx: Don't do AVX\n"
                 "--no-sse: Don't do SSE\n"
+#endif
                 "--aesni-emu: Do AESNI_EMU (disabled by default)\n"
                 "--shani-on: use SHA extensions, default: auto-detect\n"
                 "--shani-off: don't use SHA extensions\n"
diff --git a/test/job_api_fuzz_test.c b/test/job_api_fuzz_test.c
index 7703402b34ee603e54674f92c4803558bbccd790..6e69c5fc9c3c34d53c5e5b82aaffc81f1a40400e 100644
--- a/test/job_api_fuzz_test.c
+++ b/test/job_api_fuzz_test.c
@@ -462,6 +462,12 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t dataSize)
         if (ar == NULL) {
                 init_mb_mgr_auto(p_mgr, &arch);
         } else {
+#ifdef __aarch64__
+                if (strcmp(ar, "aarch64") == 0)
+                        init_mb_mgr_aarch64(p_mgr);
+#endif /* aarch64 */
+
+#ifdef __x86_64__
(strcmp(ar, "avx") == 0) init_mb_mgr_avx(p_mgr); else if (strcmp(ar, "avx2") == 0) @@ -470,6 +476,7 @@ int LLVMFuzzerTestOneInput(const uint8_t *data, size_t dataSize) init_mb_mgr_avx512(p_mgr); else if (strcmp(ar, "sse") == 0) init_mb_mgr_sse(p_mgr); +#endif /* x86_64 */ else init_mb_mgr_auto(p_mgr, &arch); } diff --git a/test/main.c b/test/main.c index dda1518b0a5c7c530b8265bda72af87d08d6c393..fe5b2aaec84be1669e7a1344f7b2497b9b3ca701 100644 --- a/test/main.c +++ b/test/main.c @@ -65,7 +65,9 @@ extern int direct_api_param_test(struct IMB_MGR *mb_mgr); typedef int (*imb_test_t)(struct IMB_MGR *mb_mgr); +#ifdef __x86_64__ #include "do_test.h" +#endif #ifdef _WIN32 #define strcasecmp _stricmp @@ -80,6 +82,7 @@ struct imb_test { }; struct imb_test tests[] = { +#ifdef __x86_64__ { .str = "KAT", .fn = known_answer_test, @@ -91,7 +94,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "CTR", + .str = "CTR", .fn = ctr_test, .enabled = 1 }, @@ -101,7 +104,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "XCBC", + .str = "XCBC", .fn = xcbc_test, .enabled = 1 }, @@ -176,7 +179,7 @@ struct imb_test tests[] = { .enabled = 1 }, { - .str = "CHAINED", + .str = "CHAINED", .fn = chained_test, .enabled = 1 }, @@ -239,26 +242,50 @@ struct imb_test tests[] = { .str = "DIRECT_API_PARAM", .fn = direct_api_param_test, .enabled = 1 - } + }, +#endif + +#ifdef __aarch64__ + { + .str = "SNOW3G", + .fn = snow3g_test, + .enabled = 1 + }, + { + .str = "API", + .fn = api_test, + .enabled = 1 + }, + { + .str = "DIRECT_API", + .fn = direct_api_test, + .enabled = 1 + }, + { + .str = "CLEAR_MEM", + .fn = clear_mem_test, + .enabled = 1 + }, +#endif }; static void usage(const char *name) { - fprintf(stderr, + fprintf(stderr, "Usage: %s [args], where args are zero or more\n" "--test-type TEST_NAME : Run single test type\n" "--stop-on-fail: Stop test execution if a test fails\n" "--no-aesni-emu: Don't do AESNI emulation\n" "--no-avx512: Don't do AVX512\n" - "--no-avx2: Don't do AVX2\n" - "--no-avx: Don't do AVX\n" - "--no-sse: Don't do SSE\n" + "--no-avx2: Don't do AVX2\n" + "--no-avx: Don't do AVX\n" + "--no-sse: Don't do SSE\n" "--auto-detect: auto detects current architecture " "to run the tests\n Note: Auto detection " "option now run by default and will be removed in the future\n" - "--shani-on: use SHA extensions, default: auto-detect\n" - "--shani-off: don't use SHA extensions\n", name); + "--shani-on: use SHA extensions, default: auto-detect\n" + "--shani-off: don't use SHA extensions\n", name); } static void @@ -281,6 +308,7 @@ print_hw_features(void) { IMB_FEATURE_GFNI, "GFNI" }, { IMB_FEATURE_AVX512_IFMA, "AVX512-IFMA" }, { IMB_FEATURE_BMI2, "BMI2" }, + { IMB_FEATURE_AARCH64, "AARCH64" }, }; IMB_MGR *p_mgr = NULL; unsigned i; @@ -366,17 +394,17 @@ main(int argc, char **argv) if (detect_arch(arch_support) < 0) return EXIT_FAILURE; - for (i = 1; i < argc; i++) { - if (strcmp(argv[i], "-h") == 0) { - usage(argv[0]); - return EXIT_SUCCESS; - } else if (update_flags_and_archs(argv[i], + for (i = 1; i < argc; i++) { + if (strcmp(argv[i], "-h") == 0) { + usage(argv[0]); + return EXIT_SUCCESS; + } else if (update_flags_and_archs(argv[i], arch_support, &flags)) - continue; - else if (strcmp(argv[i], "--auto-detect") == 0) + continue; + else if (strcmp(argv[i], "--auto-detect") == 0) (void) auto_detect; /* legacy option - to be removed */ - else if (strcmp(argv[i], "--stop-on-fail") == 0) + else if (strcmp(argv[i], "--stop-on-fail") == 0) stop_on_fail = 1; else if (strcmp(argv[i], "--test-type") == 0) { 
                         unsigned selected_test;
@@ -396,6 +424,10 @@ main(int argc, char **argv)
                         }
                         i++;
                 }
+                else {
+                        usage(argv[0]);
+                        return EXIT_FAILURE;
+                }
         }
 
         /* Go through architectures */
@@ -427,8 +459,16 @@ main(int argc, char **argv)
                 }
 
                 switch (atype) {
-                case IMB_ARCH_SSE:
+#ifdef __aarch64__
                 case IMB_ARCH_NOAESNI:
+                case IMB_ARCH_AARCH64:
+                        init_mb_mgr_aarch64(p_mgr);
+                        break;
+#endif
+
+#ifdef __x86_64__
+                case IMB_ARCH_NOAESNI:
+                case IMB_ARCH_SSE:
                         init_mb_mgr_sse(p_mgr);
                         break;
                 case IMB_ARCH_AVX:
@@ -440,6 +480,7 @@ main(int argc, char **argv)
                 case IMB_ARCH_AVX512:
                         init_mb_mgr_avx512(p_mgr);
                         break;
+#endif
                 }
 
                 print_tested_arch(p_mgr->features, atype);
diff --git a/test/misc.h b/test/misc.h
index 2fc3be284fe5f3d843bcd10af00a30dc68995ebc..1d1e2563e767771f042eff03093a5883c29485d3 100644
--- a/test/misc.h
+++ b/test/misc.h
@@ -28,6 +28,7 @@
 #ifndef XVALIDAPP_MISC_H
 #define XVALIDAPP_MISC_H
 
+#ifdef __x86_64__
 /* RAX, RBX, RCX, RDX, RDI, RSI, R8-R15 */
 #define GP_MEM_SIZE 14*8
 
@@ -55,4 +56,25 @@ void clr_scratch_xmms_avx(void);
 void clr_scratch_ymms(void);
 void clr_scratch_zmms(void);
 
+#endif /* __x86_64__ */
+
+#ifdef __aarch64__
+/* x0-x28 */
+#define GP_MEM_SIZE 29*8
+
+#define SIMD_MEM_SIZE 32*16
+
+/* Memory allocated */
+uint8_t gps[GP_MEM_SIZE];
+uint8_t simd_regs[SIMD_MEM_SIZE];
+
+/* Read the stack pointer */
+void *rdrsp(void);
+
+/* Functions to dump all registers into predefined memory */
+void dump_gps(void);
+void dump_simd_regs(void);
+
+#endif /* __aarch64__ */
+
 #endif /* XVALIDAPP_MISC_H */
diff --git a/test/misc_aarch64.S b/test/misc_aarch64.S
new file mode 100644
index 0000000000000000000000000000000000000000..a355ca2ff0594385abcbecd3deaf9f16dfa445c7
--- /dev/null
+++ b/test/misc_aarch64.S
@@ -0,0 +1,87 @@
+/**********************************************************************
+  Copyright(c) 2021 Arm Corporation All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions
+  are met:
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+    * Neither the name of Arm Corporation nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+  OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************/
+.global dump_gps
+.type dump_gps,%function
+.align 5
+dump_gps:
+        str x28,[sp,#-16]!
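+        /* x28 is used as the base pointer below; it is saved here, then
+         * stored into the buffer (and restored) after x0-x27 */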
+        ldr x28,=gps
+        stp x0,x1,[x28]
+        stp x2,x3,[x28,#16]
+        stp x4,x5,[x28,#32]
+        stp x6,x7,[x28,#48]
+        stp x8,x9,[x28,#64]
+        stp x10,x11,[x28,#80]
+        stp x12,x13,[x28,#96]
+        stp x14,x15,[x28,#112]
+        stp x16,x17,[x28,#128]
+        stp x18,x19,[x28,#144]
+        stp x20,x21,[x28,#160]
+        stp x22,x23,[x28,#176]
+        stp x24,x25,[x28,#192]
+        stp x26,x27,[x28,#208]
+        ldr x0,[sp],#16
+        str x0,[x28,#224]
+        mov x28,x0
+
+        ret
+
+
+.global dump_simd_regs
+.type dump_simd_regs,%function
+.align 5
+dump_simd_regs:
+        ldr x0,=simd_regs
+        stp q0,q1,[x0]
+        stp q2,q3,[x0,#32]
+        stp q4,q5,[x0,#64]
+        stp q6,q7,[x0,#96]
+        stp q8,q9,[x0,#128]
+        stp q10,q11,[x0,#160]
+        stp q12,q13,[x0,#192]
+        stp q14,q15,[x0,#224]
+        stp q16,q17,[x0,#256]
+        stp q18,q19,[x0,#288]
+        stp q20,q21,[x0,#320]
+        stp q22,q23,[x0,#352]
+        stp q24,q25,[x0,#384]
+        stp q26,q27,[x0,#416]
+        stp q28,q29,[x0,#448]
+        stp q30,q31,[x0,#480]
+
+        ret
+
+
+.global rdrsp
+.type rdrsp,%function
+.align 5
+rdrsp:
+        mov x0,sp
+
+        ret
diff --git a/test/utils.c b/test/utils.c
index f16b3b8f4fbf6f4294fd6091f1da96b616eb65ec..46912677f37abd60ff2f3739955e193b0337fc30 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -181,6 +181,8 @@ update_flags_and_archs(const char *arg,
                 arch_support[IMB_ARCH_AVX] = 0;
         else if (strcmp(arg, "--no-sse") == 0)
                 arch_support[IMB_ARCH_SSE] = 0;
+        else if (strcmp(arg, "--no-aarch64") == 0)
+                arch_support[IMB_ARCH_AARCH64] = 0;
         else if (strcmp(arg, "--aesni-emu") == 0)
                 arch_support[IMB_ARCH_NOAESNI] = 1;
         else if (strcmp(arg, "--no-aesni-emu") == 0)
@@ -212,7 +214,16 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM])
                 IMB_FEATURE_AVX | IMB_FEATURE_CMOV | IMB_FEATURE_AESNI;
         const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx;
         const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2;
+
+        const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI;
+
+#ifdef __x86_64__
         const uint64_t detect_noaesni = IMB_FEATURE_SSE4_2 | IMB_FEATURE_CMOV;
+#endif
+
+#ifdef __aarch64__
+        const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD;
+#endif
 
         IMB_MGR *p_mgr = NULL;
         IMB_ARCH arch_id;
@@ -246,13 +257,17 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM])
         if ((p_mgr->features & detect_noaesni) != detect_noaesni)
                 arch_support[IMB_ARCH_NOAESNI] = 0;
 
+        if ((p_mgr->features & detect_aarch64) != detect_aarch64)
+                arch_support[IMB_ARCH_AARCH64] = 0;
+
         free_mb_mgr(p_mgr);
 
         if (arch_support[IMB_ARCH_NOAESNI] == 0 &&
             arch_support[IMB_ARCH_SSE] == 0 &&
             arch_support[IMB_ARCH_AVX] == 0 &&
             arch_support[IMB_ARCH_AVX2] == 0 &&
-            arch_support[IMB_ARCH_AVX512] == 0) {
+            arch_support[IMB_ARCH_AVX512] == 0 &&
+            arch_support[IMB_ARCH_AARCH64] == 0) {
                 fprintf(stderr, "No available architecture detected!\n");
                 return -1;
         }
@@ -270,7 +285,7 @@ void
 print_tested_arch(const uint64_t features, const IMB_ARCH arch)
 {
         static const char *arch_str_tab[IMB_ARCH_NUM] = {
-                "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512"
+                "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64"
         };
         const char *feat = "";
 
@@ -278,6 +293,7 @@ print_tested_arch(const uint64_t features, const IMB_ARCH arch)
         case IMB_ARCH_NOAESNI:
         case IMB_ARCH_AVX2:
         case IMB_ARCH_AVX:
+        case IMB_ARCH_AARCH64:
                 break;
         case IMB_ARCH_SSE:
                 if (features & IMB_FEATURE_SHANI) {