diff --git a/lib/Makefile b/lib/Makefile index 327a171d8b70fb1e74d3679bab1bc9cb2a4c7279..9797c54d3cfb406d8a2270179cbad07da5c0a79e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -1,5 +1,5 @@ # -# Copyright (c) 2012-2022, Intel Corporation +# Copyright (c) 2012-2023, Intel Corporation # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: @@ -198,6 +198,7 @@ endif # x86_64 ifeq ($(ARCH),aarch64) OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_SVE := -march=armv8-a+sve+crypto+aes OPT_NOAESNI := -march=armv8-a endif # aarch64 @@ -207,7 +208,14 @@ GCC_VERSION = $(shell $(CC) -dumpversion | cut -d. -f1) GCC_GE_V5 = $(shell [ $(GCC_VERSION) -ge 5 ] && echo true) ifeq ($(GCC_GE_V5),true) ifeq ($(ARCH),aarch64) +GCC_GE_V11 = $(shell [ $(GCC_VERSION) -ge 11 ] && echo true) +#arm sve requires gcc-11 or newer. +ifneq ($(GCC_GE_V11),true) +$(warning "GCC version found: $(GCC_VERSION)") +$(error "Minimum required: 11") +endif # GCC_GE_V11 OPT_AARCH64 := -march=armv8-a+crypto+aes +OPT_SVE := -march=armv8-a+sve+crypto+aes OPT_NOAESNI := -march=armv8-a else OPT_SSE := -march=nehalem -maes -mpclmul @@ -282,6 +290,7 @@ SAFE_OPTIONS_MSG2="All safe options enabled by default." 
ifeq ($(ARCH),aarch64) c_lib_objs := \ mb_mgr_aarch64.o \ + mb_mgr_aarch64_sve256.o \ mb_mgr_aarch64_no_aesni.o \ mb_mgr_auto_aarch64.o \ alloc_aarch64.o \ @@ -302,7 +311,10 @@ c_lib_objs := \ mb_mgr_zuc_submit_flush_aarch64.o \ mb_mgr_zuc_submit_flush_aarch64_no_aesni.o \ mb_mgr_snow3g_submit_flush_aarch64.o \ - mb_mgr_snow3g_submit_flush_aarch64_no_aesni.o + mb_mgr_snow3g_submit_flush_aarch64_no_aesni.o \ + mb_mgr_snow3g_submit_flush_aarch64_sve256.o \ + snow3g_aarch64_sve256.o \ + snow3g_impl_aarch64_sve256.o asm_generic_lib_objs := \ lookup_16x8bit_neon.o else @@ -863,6 +875,8 @@ $(dep_target_files): | $(OBJ_DIR) # ifeq ($(ARCH),aarch64) +$(OBJ_DIR)/cpu_features_aarch64.o:aarch64/cpu_features_aarch64.c + $(CC) -MMD $(OPT_SVE) -c $(CFLAGS) $< -o $@ $(OBJ_DIR)/%.o:aarch64/%.c $(CC) -MMD $(OPT_AARCH64) -c $(CFLAGS) $< -o $@ $(OBJ_DIR)/%.o:x86_64/%.c diff --git a/lib/aarch64/cpu_features_aarch64.c b/lib/aarch64/cpu_features_aarch64.c index a34e2cb400cdff1d226908af56f727dcc9d95301..1f1191d6c82cdde371aa608548b175823a850d0e 100644 --- a/lib/aarch64/cpu_features_aarch64.c +++ b/lib/aarch64/cpu_features_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -29,6 +29,7 @@ #include "cpu_feature.h" #include <sys/auxv.h> #include <asm/hwcap.h> +#include <arm_sve.h> static uint32_t detect_asimd(void) { @@ -45,6 +46,11 @@ static uint32_t detect_pmull(void) return getauxval(AT_HWCAP) & HWCAP_PMULL; } +static uint32_t detect_sve(void) +{ + return getauxval(AT_HWCAP) & HWCAP_SVE; +} + uint64_t cpu_feature_detect(void) { uint64_t features = 0; @@ -58,6 +64,12 @@ uint64_t cpu_feature_detect(void) if (detect_pmull()) features |= IMB_FEATURE_PMULL; } + if (detect_sve()) { + volatile uint64_t sve_width = svcntw(); + if (sve_width >= (256 / 32)) { + features |= IMB_FEATURE_SVE256; + } + } #ifdef SAFE_DATA features |= IMB_FEATURE_SAFE_DATA; diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c index e1c19d74cedb31c642f82cf995e5df91d1a357dc..808998768508bd507e4060756c3aa034ecdf75b8 100644 --- a/lib/aarch64/mb_mgr_aarch64.c +++ b/lib/aarch64/mb_mgr_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -79,6 +79,14 @@ IMB_JOB *flush_job_snow3g_uea2_aarch64_common(IMB_MGR *state); IMB_JOB *submit_job_snow3g_uia2_aarch64_common(IMB_MGR *state, IMB_JOB *job); IMB_JOB *flush_job_snow3g_uia2_aarch64_common(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); /* ====================================================================== */ #define SUBMIT_JOB submit_job_aarch64 @@ -218,8 +226,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uea2_ooo->job_in_lane)); memset(snow3g_uea2_ooo->bits_fixup, 0, sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); snow3g_uea2_ooo->init_mask = 0; - snow3g_uea2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uea2_ooo->num_lanes_inuse = 0; snow3g_uea2_ooo->init_done = 0; memset(snow3g_uea2_ooo->ks, 0, @@ -232,8 +243,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uia2_ooo->job_in_lane)); memset(snow3g_uia2_ooo->bits_fixup, 0, sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); snow3g_uia2_ooo->init_mask = 0; - snow3g_uia2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uia2_ooo->num_lanes_inuse = 0; snow3g_uia2_ooo->init_done = 0; memset(snow3g_uia2_ooo->ks, 0, @@ -271,6 +285,10 @@ init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs) flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; 
submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; + submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; + flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; + submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; + flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; return; } @@ -317,5 +335,4 @@ init_mb_mgr_aarch64(IMB_MGR *state) { init_mb_mgr_aarch64_internal(state, 1); } - #include "mb_mgr_code_aarch64.h" diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c index fedb481738feb8eb498295612a5433eba22675b6..a3c4a9d9b096a25771dff4257a84b2886cf0c9c8 100644 --- a/lib/aarch64/mb_mgr_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************* - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -153,8 +153,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uea2_ooo->job_in_lane)); memset(snow3g_uea2_ooo->bits_fixup, 0, sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); snow3g_uea2_ooo->init_mask = 0; - snow3g_uea2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uea2_ooo->num_lanes_inuse = 0; snow3g_uea2_ooo->init_done = 0; memset(snow3g_uea2_ooo->ks, 0, @@ -167,8 +170,11 @@ reset_ooo_mgrs(IMB_MGR *state) sizeof(snow3g_uia2_ooo->job_in_lane)); memset(snow3g_uia2_ooo->bits_fixup, 0, sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); snow3g_uia2_ooo->init_mask = 0; - snow3g_uia2_ooo->unused_lanes = 0xFF03020100; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; snow3g_uia2_ooo->num_lanes_inuse = 0; snow3g_uia2_ooo->init_done = 0; memset(snow3g_uia2_ooo->ks, 0, diff --git a/lib/aarch64/mb_mgr_aarch64_sve256.c b/lib/aarch64/mb_mgr_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..d3d2bf4d1150fa804366a081d25fe392bfd40a66 --- /dev/null +++ b/lib/aarch64/mb_mgr_aarch64_sve256.c @@ -0,0 +1,340 @@ +/********************************************************************** + Copyright(c) 2021-2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#include <stdint.h> +#include <string.h> +#include <stdio.h> +#include <stdlib.h> + +#include "ipsec-mb.h" +#include "include/snow3g.h" +#include "include/zuc_internal.h" + +#include "include/cpu_feature.h" +#include "include/error.h" +#include "clear_regs_mem_aarch64.h" +#include "include/noaesni.h" +#include "include/ipsec_ooo_mgr.h" + +IMB_JOB *submit_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, + IMB_JOB *job); +IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); + +IMB_JOB *submit_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state); + +IMB_JOB
*submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); + +IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, + IMB_JOB *job); +IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); +/* ====================================================================== */ + +#define SUBMIT_JOB submit_job_aarch64_sve256 +#define FLUSH_JOB flush_job_aarch64_sve256 +#define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_sve256 +#define GET_NEXT_JOB get_next_job_aarch64_sve256 +#define GET_COMPLETED_JOB get_completed_job_aarch64_sve256 + +#define QUEUE_SIZE queue_size_aarch64_sve256 + +/* ====================================================================== */ + +#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64 +#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64 + +/* ====================================================================== */ +#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64 +#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64 +#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64 +#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64 +#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64 +#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64 +#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64 +#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64 +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64 +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64 +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64 +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64 + + +static IMB_JOB * +(*submit_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc_eea3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc_eea3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + 
submit_job_zuc_eia3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc_eia3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc256_eea3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc256_eea3_aarch64_common; + +static IMB_JOB * +(*submit_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = + submit_job_zuc256_eia3_aarch64_common; + +static IMB_JOB * +(*flush_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = + flush_job_zuc256_eia3_aarch64_common; + +static IMB_JOB * +(*submit_job_snow3g_uea2_aarch64)(IMB_MGR *state, IMB_JOB *job) = + submit_job_snow3g_uea2_aarch64_sve256; + +static IMB_JOB * +(*flush_job_snow3g_uea2_aarch64)(IMB_MGR *state) = + flush_job_snow3g_uea2_aarch64_sve256; + +static IMB_JOB * +(*submit_job_snow3g_uia2_aarch64)(IMB_MGR *state, IMB_JOB *job) = + submit_job_snow3g_uia2_aarch64_sve256; + +static IMB_JOB * +(*flush_job_snow3g_uia2_aarch64)(IMB_MGR *state) = + flush_job_snow3g_uia2_aarch64_sve256; +static void +reset_ooo_mgrs(IMB_MGR *state) +{ + MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; + MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; + MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; + MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; + + /* Init ZUC out-of-order fields */ + memset(zuc_eea3_ooo->lens, 0, + sizeof(zuc_eea3_ooo->lens)); + memset(zuc_eea3_ooo->job_in_lane, 0, + sizeof(zuc_eea3_ooo->job_in_lane)); + zuc_eea3_ooo->unused_lanes = 0xFF03020100; + zuc_eea3_ooo->num_lanes_inuse = 0; + memset(&zuc_eea3_ooo->state, 0, + sizeof(zuc_eea3_ooo->state)); + zuc_eea3_ooo->init_not_done = 0; + zuc_eea3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc_eia3_ooo->lens, 0xFF, + 
sizeof(zuc_eia3_ooo->lens)); + memset(zuc_eia3_ooo->job_in_lane, 0, + sizeof(zuc_eia3_ooo->job_in_lane)); + zuc_eia3_ooo->unused_lanes = 0xFF03020100; + zuc_eia3_ooo->num_lanes_inuse = 0; + memset(&zuc_eia3_ooo->state, 0, + sizeof(zuc_eia3_ooo->state)); + zuc_eia3_ooo->init_not_done = 0; + zuc_eia3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc256_eea3_ooo->lens, 0, + sizeof(zuc256_eea3_ooo->lens)); + memset(zuc256_eea3_ooo->job_in_lane, 0, + sizeof(zuc256_eea3_ooo->job_in_lane)); + zuc256_eea3_ooo->unused_lanes = 0xFF03020100; + zuc256_eea3_ooo->num_lanes_inuse = 0; + memset(&zuc256_eea3_ooo->state, 0, + sizeof(zuc256_eea3_ooo->state)); + zuc256_eea3_ooo->init_not_done = 0; + zuc256_eea3_ooo->unused_lane_bitmask = 0x0f; + + memset(zuc256_eia3_ooo->lens, 0xFF, + sizeof(zuc256_eia3_ooo->lens)); + memset(zuc256_eia3_ooo->job_in_lane, 0, + sizeof(zuc256_eia3_ooo->job_in_lane)); + zuc256_eia3_ooo->unused_lanes = 0xFF03020100; + zuc256_eia3_ooo->num_lanes_inuse = 0; + memset(&zuc256_eia3_ooo->state, 0, + sizeof(zuc256_eia3_ooo->state)); + zuc256_eia3_ooo->init_not_done = 0; + zuc256_eia3_ooo->unused_lane_bitmask = 0x0f; + + /* Init SNOW3G out-of-order fields */ + memset(snow3g_uea2_ooo->lens, 0, + sizeof(snow3g_uea2_ooo->lens)); + memset(snow3g_uea2_ooo->job_in_lane, 0, + sizeof(snow3g_uea2_ooo->job_in_lane)); + memset(snow3g_uea2_ooo->bits_fixup, 0, + sizeof(snow3g_uea2_ooo->bits_fixup)); + memset(&(snow3g_uea2_ooo->args), 0, + sizeof(snow3g_uea2_ooo->args)); + snow3g_uea2_ooo->init_mask = 0; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; + snow3g_uea2_ooo->num_lanes_inuse = 0; + snow3g_uea2_ooo->init_done = 0; + memset(snow3g_uea2_ooo->ks, 0, + sizeof(snow3g_uea2_ooo->ks)); + snow3g_uea2_ooo->road_block = 0; + + memset(snow3g_uia2_ooo->lens, 0, + sizeof(snow3g_uia2_ooo->lens)); + memset(snow3g_uia2_ooo->job_in_lane, 0, + sizeof(snow3g_uia2_ooo->job_in_lane)); + memset(snow3g_uia2_ooo->bits_fixup, 0, + 
sizeof(snow3g_uia2_ooo->bits_fixup)); + memset(&(snow3g_uia2_ooo->args), 0, + sizeof(snow3g_uia2_ooo->args)); + snow3g_uia2_ooo->init_mask = 0; + // each 4 bit indicate one lane, at most 16 buffer + snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; + snow3g_uia2_ooo->num_lanes_inuse = 0; + snow3g_uia2_ooo->init_done = 0; + memset(snow3g_uia2_ooo->ks, 0, + sizeof(snow3g_uia2_ooo->ks)); + snow3g_uia2_ooo->road_block = 0; + return; +} + +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs) +{ +#ifdef SAFE_PARAM + if (state == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_MBMGR); + return; + } +#endif + + /* reset error status */ + imb_set_errno(state, 0); + + state->features = cpu_feature_adjust(state->flags, + cpu_feature_detect()); + + /* Set architecture for future checks */ + state->used_arch = (uint32_t) IMB_ARCH_SVE256; + + if (!(state->features & IMB_FEATURE_AESNI)) { + init_mb_mgr_aarch64_no_aesni(state); + submit_job_zuc_eea3_aarch64 = submit_job_zuc_eea3_aarch64_no_aesni; + flush_job_zuc_eea3_aarch64 = flush_job_zuc_eea3_aarch64_no_aesni; + submit_job_zuc_eia3_aarch64 = submit_job_zuc_eia3_aarch64_no_aesni; + flush_job_zuc_eia3_aarch64 = flush_job_zuc_eia3_aarch64_no_aesni; + submit_job_zuc256_eea3_aarch64 = submit_job_zuc256_eea3_aarch64_no_aesni; + flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; + submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; + flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; + submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; + flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; + submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; + flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; + return; + } + + if (reset_mgrs) { + reset_ooo_mgrs(state); + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + } + + /* 
set AARCH64 handlers */ + state->get_next_job = get_next_job_aarch64_sve256; + state->submit_job = submit_job_aarch64_sve256; + state->submit_job_nocheck = submit_job_nocheck_aarch64_sve256; + state->get_completed_job = get_completed_job_aarch64_sve256; + state->flush_job = flush_job_aarch64_sve256; + state->queue_size = queue_size_aarch64_sve256; + + state->eea3_1_buffer = zuc_eea3_1_buffer_aarch64; + state->eea3_4_buffer = zuc_eea3_4_buffer_aarch64; + state->eea3_n_buffer = zuc_eea3_n_buffer_aarch64; + state->zuc256_eea3_1_buffer = zuc256_eea3_1_buffer_aarch64; + state->eia3_1_buffer = zuc_eia3_1_buffer_aarch64; + state->eia3_n_buffer = zuc_eia3_n_buffer_aarch64; + state->zuc256_eia3_1_buffer = zuc256_eia3_1_buffer_aarch64; + + state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_sve256; + state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_sve256; + state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_aarch64_sve256; + state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_aarch64_sve256; + state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_aarch64_sve256; + state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_aarch64_sve256; + state->snow3g_f8_4_buffer_multikey = snow3g_f8_4_buffer_multikey_aarch64_sve256; + state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_aarch64_sve256; + state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_aarch64_sve256; + state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_aarch64_sve256; + state->snow3g_init_key_sched = snow3g_init_key_sched_aarch64_sve256; + state->snow3g_key_sched_size = snow3g_key_sched_size_aarch64_sve256; +} + +void +init_mb_mgr_aarch64_sve256(IMB_MGR *state) +{ + IMB_ASSERT(state->features & IMB_FEATURE_SVE256); + init_mb_mgr_aarch64_sve256_internal(state, 1); +} +#include "mb_mgr_code_aarch64.h" diff --git a/lib/aarch64/mb_mgr_auto_aarch64.c b/lib/aarch64/mb_mgr_auto_aarch64.c index b4c0797e1eac1f0852135e62e991f74b6cd4a1a4..0bf93fe23b1d20af21a7be8baa3e8e8fbbc3f860 100644 --- 
a/lib/aarch64/mb_mgr_auto_aarch64.c +++ b/lib/aarch64/mb_mgr_auto_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -26,6 +26,7 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************/ +#include #include "ipsec-mb.h" #include "cpu_feature.h" #include "noaesni.h" @@ -44,6 +45,7 @@ init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch) IMB_ARCH arch_detected = IMB_ARCH_NONE; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; const uint64_t detect_noaesni = IMB_FEATURE_AARCH64 | IMB_FEATURE_ASIMD; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; /* reset error status */ imb_set_errno(state, 0); @@ -54,6 +56,11 @@ init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch) return; } #endif + if ((state->features & detect_sve256) == detect_sve256) { + init_mb_mgr_aarch64_sve256(state); + arch_detected = IMB_ARCH_SVE256; + goto init_mb_mgr_auto_ret; + } if ((state->features & detect_aarch64) == detect_aarch64) { init_mb_mgr_aarch64(state); arch_detected = IMB_ARCH_AARCH64; diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c index ce55bbfbfe691cd93be64abc2484345e67d58874..adb33784b0168d2ebe28501ad694e19019cdf414 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -31,12 +31,14 @@ #define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_common #define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_common #define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_common -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64 -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64 -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64 -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64 -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64 - +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64 +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64 +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64 +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64 #endif -#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" +#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define snow3gKeyStateMulti_t snow3gKeyState4_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" \ No newline at end of file diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c index 56eafb845a3cccd1c52f82b854b5eaf615e3fe8b..ebe852c3384785d2713a0a2654248f81725eb3b4 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -31,11 +31,14 @@ #define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_no_aesni #define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_no_aesni #define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64_no_aesni -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64_no_aesni -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64_no_aesni -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64_no_aesni +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_no_aesni #endif -#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" +#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define snow3gKeyStateMulti_t snow3gKeyState4_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" \ No newline at end of file diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..3392457e23c84ce9d4d15c398e9773958054350e --- /dev/null +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_aarch64_sve256.c @@ -0,0 +1,44 @@ +/********************************************************************** + Copyright(c) 2023 Arm Corporation All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#ifndef SUBMIT_JOB_SNOW3G_UEA2 +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_sve256 +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_sve256 +#define SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB snow3g_f8_8_buffer_initialize_aarch64_sve256_asm +#define SNOW3G_F8_MULTI_BUFFER_STREAM_JOB snow3g_f8_8_buffer_stream_aarch64_sve256_asm +#define SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB snow3g_f9_8_buffer_keystream_aarch64_sve256_asm +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_sve256 +#endif + +#define SNOW3G_MB_MAX_LANES_SIMD 8 +#define snow3gKeyStateMulti_t snow3gKeyState8_t + +#include "mb_mgr_snow3g_submit_flush_common_aarch64.h" diff --git a/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h index 0773c2c338af20743764871062bed0ea63a7d659..e55d39789a029fa2fc9cf73be1a8579a90761e63 100644 --- a/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h +++ b/lib/aarch64/mb_mgr_snow3g_submit_flush_common_aarch64.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -35,11 +35,20 @@ #include #include #include +#ifdef SAFE_PARAM +#include "error.h" +#endif -#define SNOW3G_MB_MAX_LANES_SIMD 4 +#define UNUSED_LANE_MASK_BITS 4 +#define UNUSED_LANE_MASK 0xF +#if SNOW3G_MB_MAX_LANES_SIMD == 4 #define INIT_DONE_MASK 0x0F +#elif SNOW3G_MB_MAX_LANES_SIMD == 8 +#define INIT_DONE_MASK 0xFF +#endif #define INIT_ALL_DONE INIT_DONE_MASK + #define JOB_IS_COMPLETED(state, i) \ (((state->job_in_lane[i]) != NULL) && (state->args.byte_length[i] == 0)) #define JOB_NOT_INITIALIZED(state, i) \ @@ -55,11 +64,34 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state); IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job); IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state); +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); + +void SNOW3G_F8_MULTI_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *ks); + +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + static void snow3g_mb_mgr_insert_uea2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job) { - uint64_t used_lane_idx = state->unused_lanes & 0xff; + uint64_t used_lane_idx = state->unused_lanes & UNUSED_LANE_MASK; assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD); - state->unused_lanes = state->unused_lanes >> 8; + state->unused_lanes = state->unused_lanes >> UNUSED_LANE_MASK_BITS; + state->num_lanes_inuse++; state->args.iv[used_lane_idx] = job->iv; state->args.keys[used_lane_idx] = job->enc_keys; state->args.in[used_lane_idx] = job->src + job->cipher_start_src_offset_in_bytes; 
@@ -73,9 +105,9 @@ static void snow3g_mb_mgr_insert_uea2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job static void snow3g_mb_mgr_insert_uia2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job) { - uint64_t used_lane_idx = state->unused_lanes & 0xff; + uint64_t used_lane_idx = state->unused_lanes & UNUSED_LANE_MASK; assert(used_lane_idx < SNOW3G_MB_MAX_LANES_SIMD); - state->unused_lanes = state->unused_lanes >> 8; + state->unused_lanes = state->unused_lanes >> UNUSED_LANE_MASK_BITS; state->num_lanes_inuse++; state->args.iv[used_lane_idx] = job->u.SNOW3G_UIA2._iv; state->args.keys[used_lane_idx] = job->u.SNOW3G_UIA2._key; @@ -83,7 +115,7 @@ static void snow3g_mb_mgr_insert_uia2_job(MB_MGR_SNOW3G_OOO *state, IMB_JOB *job state->args.out[used_lane_idx] = job->auth_tag_output; state->args.INITIALIZED[used_lane_idx] = 0; state->lens[used_lane_idx] = job->msg_len_to_hash_in_bits; - state->init_done = state->init_done & (~(1 << used_lane_idx) & 0xff); + state->init_done = state->init_done & (~(1 << used_lane_idx) & INIT_DONE_MASK); state->job_in_lane[used_lane_idx] = job; } @@ -97,30 +129,16 @@ static IMB_JOB *snow3g_mb_mgr_free_uea2_job(MB_MGR_SNOW3G_OOO *state) ret = state->job_in_lane[i]; ret->status |= IMB_STATUS_COMPLETED_CIPHER; state->job_in_lane[i] = NULL; - state->unused_lanes = state->unused_lanes << 8; + state->unused_lanes = state->unused_lanes << UNUSED_LANE_MASK_BITS; state->unused_lanes |= i; + state->num_lanes_inuse--; state->lens[i] = 0; state->args.INITIALIZED[i] = 0; #ifdef SAFE_DATA - state->args.LFSR_0[i] = 0; - state->args.LFSR_1[i] = 0; - state->args.LFSR_2[i] = 0; - state->args.LFSR_3[i] = 0; - state->args.LFSR_4[i] = 0; - state->args.LFSR_5[i] = 0; - state->args.LFSR_6[i] = 0; - state->args.LFSR_7[i] = 0; - state->args.LFSR_8[i] = 0; - state->args.LFSR_9[i] = 0; - state->args.LFSR_10[i] = 0; - state->args.LFSR_11[i] = 0; - state->args.LFSR_12[i] = 0; - state->args.LFSR_13[i] = 0; - state->args.LFSR_14[i] = 0; - state->args.LFSR_15[i] = 0; - 
state->args.FSM_1[i] = 0; - state->args.FSM_2[i] = 0; - state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } #endif break; } @@ -136,33 +154,18 @@ static IMB_JOB *snow3g_mb_mgr_free_uia2_job(MB_MGR_SNOW3G_OOO *state, int i) ret = state->job_in_lane[i]; ret->status |= IMB_STATUS_COMPLETED_AUTH; state->job_in_lane[i] = NULL; - state->unused_lanes = state->unused_lanes << 8; + state->unused_lanes = state->unused_lanes << UNUSED_LANE_MASK_BITS; state->unused_lanes |= i; state->num_lanes_inuse--; state->lens[i] = 0; state->args.INITIALIZED[i] = 0; - state->init_done = state->init_done & (~(1 << i) & 0xff); + state->init_done = state->init_done & (~(1 << i) & INIT_DONE_MASK); #ifdef SAFE_DATA - state->args.LFSR_0[i] = 0; - state->args.LFSR_1[i] = 0; - state->args.LFSR_2[i] = 0; - state->args.LFSR_3[i] = 0; - state->args.LFSR_4[i] = 0; - state->args.LFSR_5[i] = 0; - state->args.LFSR_6[i] = 0; - state->args.LFSR_7[i] = 0; - state->args.LFSR_8[i] = 0; - state->args.LFSR_9[i] = 0; - state->args.LFSR_10[i] = 0; - state->args.LFSR_11[i] = 0; - state->args.LFSR_12[i] = 0; - state->args.LFSR_13[i] = 0; - state->args.LFSR_14[i] = 0; - state->args.LFSR_15[i] = 0; - state->args.FSM_1[i] = 0; - state->args.FSM_2[i] = 0; - state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } for (int k = 0; k < 5; k++) { state->ks[i * 5 + k] = 0; } @@ -172,119 +175,35 @@ static IMB_JOB *snow3g_mb_mgr_free_uia2_job(MB_MGR_SNOW3G_OOO *state, int i) } __forceinline -void cpy_snow3g_state_to_ctx_1(snow3gKeyState1_t* ctx, MB_MGR_SNOW3G_OOO* state, const int num_lane) { - SNOW3G_ARGS args = state->args; - ctx->LFSR_S[0] = args.LFSR_0[num_lane]; - ctx->LFSR_S[1] = args.LFSR_1[num_lane]; - ctx->LFSR_S[2] = args.LFSR_2[num_lane]; - ctx->LFSR_S[3] = 
args.LFSR_3[num_lane]; - ctx->LFSR_S[4] = args.LFSR_4[num_lane]; - ctx->LFSR_S[5] = args.LFSR_5[num_lane]; - ctx->LFSR_S[6] = args.LFSR_6[num_lane]; - ctx->LFSR_S[7] = args.LFSR_7[num_lane]; - ctx->LFSR_S[8] = args.LFSR_8[num_lane]; - ctx->LFSR_S[9] = args.LFSR_9[num_lane]; - ctx->LFSR_S[10] = args.LFSR_10[num_lane]; - ctx->LFSR_S[11] = args.LFSR_11[num_lane]; - ctx->LFSR_S[12] = args.LFSR_12[num_lane]; - ctx->LFSR_S[13] = args.LFSR_13[num_lane]; - ctx->LFSR_S[14] = args.LFSR_14[num_lane]; - ctx->LFSR_S[15] = args.LFSR_15[num_lane]; - ctx->FSM_R1 = args.FSM_1[num_lane]; - ctx->FSM_R2 = args.FSM_2[num_lane]; - ctx->FSM_R3 = args.FSM_3[num_lane]; -} - -__forceinline -void cpy_snow3g_ctx_to_state_after_stream(MB_MGR_SNOW3G_OOO* state, snow3gKeyState4_t* ctx) { - SNOW3G_ARGS *args = &(state->args); - const uint32_t *pLFSR_0 = (const uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X]; - const uint32_t *pLFSR_1 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15]; - const uint32_t *pLFSR_2 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15]; - const uint32_t *pLFSR_3 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15]; - const uint32_t *pLFSR_4 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15]; - const uint32_t *pLFSR_5 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15]; - const uint32_t *pLFSR_6 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15]; - const uint32_t *pLFSR_7 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15]; - const uint32_t *pLFSR_8 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15]; - const uint32_t *pLFSR_9 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15]; - const uint32_t *pLFSR_10 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15]; - const uint32_t *pLFSR_11 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15]; - const uint32_t *pLFSR_12 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15]; - const uint32_t *pLFSR_13 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) 
& 15]; - const uint32_t *pLFSR_14 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15]; - const uint32_t *pLFSR_15 = (const uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15]; - const uint32_t *pFSM_1 = (const uint32_t *) &ctx->FSM_X[0]; - const uint32_t *pFSM_2 = (const uint32_t *) &ctx->FSM_X[1]; - const uint32_t *pFSM_3 = (const uint32_t *) &ctx->FSM_X[2]; - for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (!JOB_IS_COMPLETED(state, i)) { - args->LFSR_0[i] = pLFSR_0[i]; - args->LFSR_1[i] = pLFSR_1[i]; - args->LFSR_2[i] = pLFSR_2[i]; - args->LFSR_3[i] = pLFSR_3[i]; - args->LFSR_4[i] = pLFSR_4[i]; - args->LFSR_5[i] = pLFSR_5[i]; - args->LFSR_6[i] = pLFSR_6[i]; - args->LFSR_7[i] = pLFSR_7[i]; - args->LFSR_8[i] = pLFSR_8[i]; - args->LFSR_9[i] = pLFSR_9[i]; - args->LFSR_10[i] = pLFSR_10[i]; - args->LFSR_11[i] = pLFSR_11[i]; - args->LFSR_12[i] = pLFSR_12[i]; - args->LFSR_13[i] = pLFSR_13[i]; - args->LFSR_14[i] = pLFSR_14[i]; - args->LFSR_15[i] = pLFSR_15[i]; - args->FSM_1[i] = pFSM_1[i]; - args->FSM_2[i] = pFSM_2[i]; - args->FSM_3[i] = pFSM_3[i]; - } +void cpy_state_to_ctx1(snow3gKeyStateMulti_t* state, snow3gKeyState1_t* ctx, const int num_lane) { + uint32_t iLFSR_X = state->iLFSR_X; + uint32_t *src = (uint32_t *)&(state->LFSR_X[0]); + uint32_t *dst = (uint32_t *)&(ctx->LFSR_S[0]); + for (int i = 0; i < 16; i++) { + dst[i] = src[((i + iLFSR_X) % 16) * SNOW3G_MB_MAX_LANES_SIMD + num_lane]; + } + for (int i = 16; i < 19; i++) { + dst[i] = src[i * SNOW3G_MB_MAX_LANES_SIMD + num_lane]; } } __forceinline -void cpy_snow3g_state_to_ctx_after_initialize(snow3gKeyState4_t* ctx, MB_MGR_SNOW3G_OOO* state) { - SNOW3G_ARGS *args = &(state->args); - uint32_t *pLFSR_0 = (uint32_t *) &ctx->LFSR_X[ctx->iLFSR_X]; - uint32_t *pLFSR_1 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 1) & 15]; - uint32_t *pLFSR_2 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 2) & 15]; - uint32_t *pLFSR_3 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 3) & 15]; - uint32_t *pLFSR_4 = (uint32_t *) 
&ctx->LFSR_X[(ctx->iLFSR_X + 4) & 15]; - uint32_t *pLFSR_5 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 5) & 15]; - uint32_t *pLFSR_6 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 6) & 15]; - uint32_t *pLFSR_7 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 7) & 15]; - uint32_t *pLFSR_8 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 8) & 15]; - uint32_t *pLFSR_9 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 9) & 15]; - uint32_t *pLFSR_10 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 10) & 15]; - uint32_t *pLFSR_11 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 11) & 15]; - uint32_t *pLFSR_12 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 12) & 15]; - uint32_t *pLFSR_13 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 13) & 15]; - uint32_t *pLFSR_14 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 14) & 15]; - uint32_t *pLFSR_15 = (uint32_t *) &ctx->LFSR_X[(ctx->iLFSR_X + 15) & 15]; - uint32_t *pFSM_1 = (uint32_t *) &ctx->FSM_X[0]; - uint32_t *pFSM_2 = (uint32_t *) &ctx->FSM_X[1]; - uint32_t *pFSM_3 = (uint32_t *) &ctx->FSM_X[2]; +void cpy_newly_intialized_ctx_to_state(snow3gKeyStateMulti_t* new, MB_MGR_SNOW3G_OOO* state) { + snow3gKeyStateMulti_t* ctx = (snow3gKeyStateMulti_t *)&(state->args.LFSR_0[0]); + uint32_t* dst = (uint32_t *)&(ctx->LFSR_X[0]); + uint32_t* src = (uint32_t *)&(new->LFSR_X[0]); + uint32_t dst_iLFSR = ctx->iLFSR_X; + uint32_t src_iLFSR = new->iLFSR_X; for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (JOB_INITIALIZED(state, i)) { - pLFSR_0[i] = args->LFSR_0[i]; - pLFSR_1[i] = args->LFSR_1[i]; - pLFSR_2[i] = args->LFSR_2[i]; - pLFSR_3[i] = args->LFSR_3[i]; - pLFSR_4[i] = args->LFSR_4[i]; - pLFSR_5[i] = args->LFSR_5[i]; - pLFSR_6[i] = args->LFSR_6[i]; - pLFSR_7[i] = args->LFSR_7[i]; - pLFSR_8[i] = args->LFSR_8[i]; - pLFSR_9[i] = args->LFSR_9[i]; - pLFSR_10[i] = args->LFSR_10[i]; - pLFSR_11[i] = args->LFSR_11[i]; - pLFSR_12[i] = args->LFSR_12[i]; - pLFSR_13[i] = args->LFSR_13[i]; - pLFSR_14[i] = args->LFSR_14[i]; - pLFSR_15[i] = args->LFSR_15[i]; - pFSM_1[i] = 
args->FSM_1[i]; - pFSM_2[i] = args->FSM_2[i]; - pFSM_3[i] = args->FSM_3[i]; + if (JOB_NOT_INITIALIZED(state, i)) { + for (int j = 0; j < (16 + 3); j++) { + dst[((j + dst_iLFSR) % 16) * SNOW3G_MB_MAX_LANES_SIMD + i] = + src[((j + src_iLFSR) % 16) * SNOW3G_MB_MAX_LANES_SIMD + i]; + } + for (int j = 16; j < 19; j++) { + dst[j * SNOW3G_MB_MAX_LANES_SIMD + i] = src[j * SNOW3G_MB_MAX_LANES_SIMD + i]; + } + state->args.INITIALIZED[i] = 1; } } } @@ -292,6 +211,35 @@ void cpy_snow3g_state_to_ctx_after_initialize(snow3gKeyState4_t* ctx, MB_MGR_SNO IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, IMB_JOB *job) { +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->enc_keys == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + if (job->iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->dst == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return NULL; + } + if ((job->msg_len_to_cipher_in_bits == 0) || + (job->msg_len_to_cipher_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return NULL; + } +#endif + MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uea2_ooo; uint32_t msg_bitlen = job->msg_len_to_cipher_in_bits; uint32_t msg_bitoff = job->cipher_start_src_offset_in_bits; @@ -315,52 +263,41 @@ IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, if (ret != NULL) return ret; - if (snow3g_state->unused_lanes != 0xff) + if(snow3g_state->num_lanes_inuse < SNOW3G_MB_MAX_LANES_SIMD) return NULL; uint32_t min_word_len = UINT32_MAX; - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); + snow3gKeyStateMulti_t *pCtx = (snow3gKeyStateMulti_t 
*)&(snow3g_state->args.LFSR_0[0]); + snow3gKeyStateMulti_t tmp_ctx; - cpy_snow3g_state_to_ctx_after_initialize(&ctx, snow3g_state); + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&tmp_ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void**)snow3g_state->args.iv); + cpy_newly_intialized_ctx_to_state(&tmp_ctx, snow3g_state); for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - if (JOB_NOT_INITIALIZED(snow3g_state, i)) { - snow3g_state->args.INITIALIZED[i] = 1; - } min_word_len = (min_word_len < snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES) ? min_word_len : snow3g_state->args.byte_length[i] / SNOW3G_4_BYTES; } - SNOW3G_F8_4_BUFFER_STREAM(&ctx, - snow3g_state->args.in[0],snow3g_state->args.out[0], - snow3g_state->args.in[1],snow3g_state->args.out[1], - snow3g_state->args.in[2],snow3g_state->args.out[2], - snow3g_state->args.in[3],snow3g_state->args.out[3], - min_word_len * SNOW3G_4_BYTES); + SNOW3G_F8_MULTI_BUFFER_STREAM_JOB(pCtx, + (const void **)snow3g_state->args.in, + (void **)snow3g_state->args.out, + min_word_len * SNOW3G_4_BYTES); for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - snow3g_state->args.in[i] = (uint8_t *)snow3g_state->args.in[i] + - min_word_len * SNOW3G_4_BYTES; - snow3g_state->args.out[i] = (uint8_t *)snow3g_state->args.out[i] + - min_word_len * SNOW3G_4_BYTES; snow3g_state->args.byte_length[i] -= min_word_len * SNOW3G_4_BYTES; } - cpy_snow3g_ctx_to_state_after_stream(snow3g_state, &ctx); - for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - //if less than one word left, finish job here. + // if less than one word left, finish job here. 
if (snow3g_state->args.byte_length[i] < SNOW3G_4_BYTES && snow3g_state->args.byte_length[i] != 0) { snow3gKeyState1_t ctx_1; - cpy_snow3g_state_to_ctx_1(&ctx_1, snow3g_state, i); - SNOW3G_F8_1_BUFFER_STREAM(&ctx_1, snow3g_state->args.in[i], - snow3g_state->args.out[i], - snow3g_state->args.byte_length[i]); + cpy_state_to_ctx1(pCtx, &ctx_1, i); + SNOW3G_F8_1_BUFFER_STREAM_JOB(&ctx_1, snow3g_state->args.in[i], + snow3g_state->args.out[i], + snow3g_state->args.byte_length[i]); snow3g_state->args.byte_length[i] = 0; } } @@ -368,7 +305,7 @@ IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, ret = snow3g_mb_mgr_free_uea2_job(snow3g_state); #ifdef SAFE_DATA - //data has been cleard in snow3g_mb_mgr_free_uea2_job. + // data has been cleared in snow3g_mb_mgr_free_uea2_job. #endif return ret; @@ -389,7 +326,7 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) ret = snow3g_state->job_in_lane[i]; if (JOB_NOT_INITIALIZED(snow3g_state, i)) { - //if not initialized + // if not initialized IMB_SNOW3G_F8_1_BUFFER(state, snow3g_state->args.keys[i], snow3g_state->args.iv[i], snow3g_state->args.in[i], @@ -397,39 +334,26 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) snow3g_state->args.byte_length[i]); } else { snow3gKeyState1_t ctx; - cpy_snow3g_state_to_ctx_1(&ctx, snow3g_state, i); - SNOW3G_F8_1_BUFFER_STREAM(&ctx, snow3g_state->args.in[i], - snow3g_state->args.out[i], - snow3g_state->args.byte_length[i]); + snow3gKeyStateMulti_t* state = (snow3gKeyStateMulti_t*)&(snow3g_state->args.LFSR_0[0]); + cpy_state_to_ctx1(state, &ctx, i); + SNOW3G_F8_1_BUFFER_STREAM_JOB(&ctx, snow3g_state->args.in[i], + snow3g_state->args.out[i], + snow3g_state->args.byte_length[i]); } ret->status |= IMB_STATUS_COMPLETED_CIPHER; snow3g_state->lens[i] = 0; snow3g_state->job_in_lane[i] = NULL; - snow3g_state->unused_lanes = snow3g_state->unused_lanes << 8; + snow3g_state->unused_lanes = snow3g_state->unused_lanes << UNUSED_LANE_MASK_BITS; snow3g_state->unused_lanes |= i; + snow3g_state->num_lanes_inuse--;
snow3g_state->args.byte_length[i] = 0; snow3g_state->args.INITIALIZED[i] = 0; #ifdef SAFE_DATA - snow3g_state->args.LFSR_0[i] = 0; - snow3g_state->args.LFSR_1[i] = 0; - snow3g_state->args.LFSR_2[i] = 0; - snow3g_state->args.LFSR_3[i] = 0; - snow3g_state->args.LFSR_4[i] = 0; - snow3g_state->args.LFSR_5[i] = 0; - snow3g_state->args.LFSR_6[i] = 0; - snow3g_state->args.LFSR_7[i] = 0; - snow3g_state->args.LFSR_8[i] = 0; - snow3g_state->args.LFSR_9[i] = 0; - snow3g_state->args.LFSR_10[i] = 0; - snow3g_state->args.LFSR_11[i] = 0; - snow3g_state->args.LFSR_12[i] = 0; - snow3g_state->args.LFSR_13[i] = 0; - snow3g_state->args.LFSR_14[i] = 0; - snow3g_state->args.LFSR_15[i] = 0; - snow3g_state->args.FSM_1[i] = 0; - snow3g_state->args.FSM_2[i] = 0; - snow3g_state->args.FSM_3[i] = 0; + uint32_t* key_state = (uint32_t *)&(snow3g_state->args.LFSR_0[0]); + for (int k = 0; k < (16 + 3); k++) { + key_state[k * SNOW3G_MB_MAX_LANES_SIMD + i] = 0; + } #endif return ret; } @@ -440,36 +364,59 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state) IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job) { +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->u.SNOW3G_UIA2._key == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + if (job->u.SNOW3G_UIA2._iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->auth_tag_output == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return NULL; + } + if ((job->msg_len_to_hash_in_bits == 0) || + (job->msg_len_to_hash_in_bits > SNOW3G_MAX_BITLEN)) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return NULL; + } +#endif MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo; IMB_JOB *ret = NULL; snow3g_mb_mgr_insert_uia2_job(snow3g_state, job); - if (snow3g_state->unused_lanes != 0xff) + if (snow3g_state->num_lanes_inuse < SNOW3G_MB_MAX_LANES_SIMD) return 
NULL; if (snow3g_state->init_done == 0) { - //all lanes are not initialized. - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, - snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); - SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx, - &snow3g_state->ks[0*5], - &snow3g_state->ks[1*5], - &snow3g_state->ks[2*5], - &snow3g_state->ks[3*5]); + // all lanes are not initialized. + snow3gKeyStateMulti_t ctx; + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void**)snow3g_state->args.iv); + SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(&ctx, + snow3g_state->ks); snow3g_state->init_done = INIT_ALL_DONE; } for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { if (snow3g_state->init_done & (1 << i)) { - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i], - snow3g_state->lens[i], snow3g_state->args.out[i]); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[i*5], snow3g_state->args.in[i], + snow3g_state->lens[i], snow3g_state->args.out[i]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i); break; } @@ -483,14 +430,14 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state) MB_MGR_SNOW3G_OOO *snow3g_state = state->snow3g_uia2_ooo; if (snow3g_state->num_lanes_inuse == 0) { - //empty + // empty return NULL; } for (int i = 0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { if (snow3g_state->init_done & (1<<i)) { - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[i*5], snow3g_state->args.in[i], - snow3g_state->lens[i], snow3g_state->args.out[i]); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[i*5], snow3g_state->args.in[i], + snow3g_state->lens[i], snow3g_state->args.out[i]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, i); return ret; } @@ -503,29 +450,24 @@ IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state) } } for (int i = 
0; i < SNOW3G_MB_MAX_LANES_SIMD; i++) { - //copy keys and ivs to empty lane + // copy keys and ivs to empty lane if (JOB_IS_NULL(snow3g_state, i)) { snow3g_state->args.keys[i] = snow3g_state->args.keys[lane_idx]; snow3g_state->args.iv[i] = snow3g_state->args.iv[lane_idx]; } } - snow3gKeyState4_t ctx; - SNOW3G_F8_4_BUFFER_INITIALIZE(&ctx, - snow3g_state->args.keys[0], snow3g_state->args.keys[1], - snow3g_state->args.keys[2], snow3g_state->args.keys[3], - snow3g_state->args.iv[0],snow3g_state->args.iv[1], - snow3g_state->args.iv[2],snow3g_state->args.iv[3]); - SNOW3G_F9_4_BUFFER_KEYSTREAM(&ctx, - &snow3g_state->ks[0*5], - &snow3g_state->ks[1*5], - &snow3g_state->ks[2*5], - &snow3g_state->ks[3*5]); - //pick a initialized lane - SNOW3G_F9_1_BUFFER_DIGEST(&snow3g_state->ks[lane_idx*5], snow3g_state->args.in[lane_idx], - snow3g_state->lens[lane_idx], snow3g_state->args.out[lane_idx]); + snow3gKeyStateMulti_t ctx; + SNOW3G_F8_MULTI_BUFFER_INITIALIZE_JOB(&ctx, + (const snow3g_key_schedule_t **)snow3g_state->args.keys, + (const void **)snow3g_state->args.iv); + SNOW3G_F9_MULTI_BUFFER_KEYSTREAM_JOB(&ctx, + snow3g_state->ks); + // pick an initialized lane + SNOW3G_F9_1_BUFFER_DIGEST_JOB(&snow3g_state->ks[lane_idx*5], snow3g_state->args.in[lane_idx], + snow3g_state->lens[lane_idx], snow3g_state->args.out[lane_idx]); ret = snow3g_mb_mgr_free_uia2_job(snow3g_state, lane_idx); return ret; } -#endif //MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H +#endif // MB_MGR_SNOW3G_SUBMIT_FLUSH_AARCH64_H diff --git a/lib/aarch64/snow3g_aarch64.c b/lib/aarch64/snow3g_aarch64.c index 4b4172fd6f2380a05fd592295c4ac8d8ef2fa986..deccbb380d46bb68ae3cfab0d7f840df3d0ce0a6 100644 --- a/lib/aarch64/snow3g_aarch64.c +++ b/lib/aarch64/snow3g_aarch64.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved.
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -38,10 +38,9 @@ #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64 #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64 #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64 -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64 -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64 -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64 -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64 -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64 - +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64 +#define SNOW3G_F8_4_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64 +#define SNOW3G_F8_4_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64 +#define SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64 #include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_no_aesni.c b/lib/aarch64/snow3g_aarch64_no_aesni.c index f5a9e589bd07fc3f240ac2894efcfa48d0a9768f..995c0202fc2b5403aa629419974c61144f77af39 100644 --- a/lib/aarch64/snow3g_aarch64_no_aesni.c +++ b/lib/aarch64/snow3g_aarch64_no_aesni.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -40,10 +40,10 @@ #define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_no_aesni #define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_no_aesni #define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_INITIALIZE snow3g_f8_4_buffer_initialize_aarch64_no_aesni -#define SNOW3G_F8_4_BUFFER_STREAM snow3g_f8_4_buffer_stream_aarch64_no_aesni -#define SNOW3G_F8_1_BUFFER_STREAM snow3g_f8_1_buffer_stream_aarch64_no_aesni -#define SNOW3G_F9_1_BUFFER_DIGEST snow3g_f9_1_buffer_digest_aarch64_no_aesni -#define SNOW3G_F9_4_BUFFER_KEYSTREAM snow3g_f9_4_buffer_keystream_aarch64_no_aesni +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER_INITIALIZE_JOB snow3g_f8_4_buffer_initialize_aarch64_no_aesni +#define SNOW3G_F8_4_BUFFER_STREAM_JOB snow3g_f8_4_buffer_stream_aarch64_no_aesni +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_no_aesni +#define SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB snow3g_f9_4_buffer_keystream_aarch64_no_aesni #include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_aarch64_sve256.c b/lib/aarch64/snow3g_aarch64_sve256.c new file mode 100644 index 0000000000000000000000000000000000000000..ef71297353e9d82e86a44713ba79dbc2956f5f29 --- /dev/null +++ b/lib/aarch64/snow3g_aarch64_sve256.c @@ -0,0 +1,49 @@ +/********************************************************************** + Copyright(c) 2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+**********************************************************************/ +#define AARCH64_SVE256 +#define SNOW3G_F8_1_BUFFER_BIT snow3g_f8_1_buffer_bit_aarch64_sve256 +#define SNOW3G_F8_1_BUFFER snow3g_f8_1_buffer_aarch64_sve256 +#define SNOW3G_F8_2_BUFFER snow3g_f8_2_buffer_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER snow3g_f8_4_buffer_aarch64_sve256 +#define SNOW3G_F8_8_BUFFER snow3g_f8_8_buffer_aarch64_sve256 +#define SNOW3G_F8_N_BUFFER snow3g_f8_n_buffer_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER_MULTIKEY snow3g_f8_4_buffer_multikey_aarch64_sve256 +#define SNOW3G_F8_8_BUFFER_MULTIKEY snow3g_f8_8_buffer_multikey_aarch64_sve256 +#define SNOW3G_F8_N_BUFFER_MULTIKEY snow3g_f8_n_buffer_multikey_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER snow3g_f9_1_buffer_aarch64_sve256 +#define SNOW3G_INIT_KEY_SCHED snow3g_init_key_sched_aarch64_sve256 +#define SNOW3G_KEY_SCHED_SIZE snow3g_key_sched_size_aarch64_sve256 +#define SNOW3G_F8_1_BUFFER_STREAM_JOB snow3g_f8_1_buffer_stream_aarch64_sve256 +#define SNOW3G_F9_1_BUFFER_DIGEST_JOB snow3g_f9_1_buffer_digest_aarch64_sve256 +#define SNOW3G_F8_4_BUFFER_ASM snow3g_f8_4_buffer_aarch64_neon_asm +#define SNOW3G_F8_8_BUFFER_ASM snow3g_f8_8_buffer_aarch64_sve256_asm +#define SNOW3G_F8_8_BUFFER_MULTIKEY_ASM snow3g_f8_8_buffer_multikey_aarch64_sve256_asm + + +#include "snow3g_common_aarch64.h" diff --git a/lib/aarch64/snow3g_common_aarch64.h b/lib/aarch64/snow3g_common_aarch64.h index c4c60c21add0eaaf8b733a8dd011319a276deac0..e565a55ac5a87a3f33f68772447dbbeae5b3dd31 100644 --- a/lib/aarch64/snow3g_common_aarch64.h +++ b/lib/aarch64/snow3g_common_aarch64.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2021-2022 Arm Corporation All rights reserved. + Copyright(c) 2021-2023 Arm Corporation All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -50,6 +50,40 @@ #define CLEAR_MEM clear_mem #define CLEAR_VAR clear_var +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F8_4_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); + +void SNOW3G_F8_4_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes); + +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest); + +void SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *keystream); + +void SNOW3G_F8_8_BUFFER_ASM(const snow3g_key_schedule_t *key, + const void **iv, + const void **in, + void **out, + uint32_t lengthInBytes[]); + +void SNOW3G_F8_8_BUFFER_MULTIKEY_ASM(const snow3g_key_schedule_t **key, + const void **iv, + const void **in, + void **out, + uint32_t lengthInBytes[]); + /** * @brief Wrapper for safe lookup of 16 indexes in 256x8-bit table * @param[in] indexes vector of 16x8-bit indexes to be looked up @@ -1735,6 +1769,79 @@ static inline void snow3gStateConvert_4(const snow3gKeyState4_t *pSrcState, pDstState->FSM_R3 = pFSM_X2[NumBuffer]; } +static inline void sortLanesByLength(const snow3g_key_schedule_t** pCtxBuf, + const void** pIV, + const void** pSrcBuf, + void** pDstBuf, + uint32_t* lensBuf, + const uint32_t packet_count) +{ + uint32_t packet_index, inner_index; + uint32_t sortNeeded = 0, tempLen = 0; + const void *srctempbuff; + void *dsttempbuff; + const void *ivtempbuff; + const snow3g_key_schedule_t *tempCtx; + + packet_index = packet_count; + while (packet_index--) { + /* check if all packets are sorted by decreasing length */ + if (packet_index > 0 && lensBuf[packet_index - 1] < + lensBuf[packet_index]) { + /* this packet array is not correctly 
sorted */ + sortNeeded = 1; + } + } + + if (sortNeeded) { + /* sort packets in decreasing buffer size from [0] to [n]th + packet, where buffer[0] will contain longest buffer and + buffer[n] will contain the shortest buffer. + 4 arrays are swapped : + - pointers to input buffers + - pointers to output buffers + - pointers to input IV's + - input buffer lengths */ + + packet_index = packet_count; + while (packet_index--) { + inner_index = packet_index; + while (inner_index--) { + if (lensBuf[packet_index] > + lensBuf[inner_index]) { + /* swap buffers to arrange in + descending order from [0]. */ + srctempbuff = pSrcBuf[packet_index]; + dsttempbuff = pDstBuf[packet_index]; + ivtempbuff = pIV[packet_index]; + tempLen = lensBuf[packet_index]; + + pSrcBuf[packet_index] = + pSrcBuf[inner_index]; + pDstBuf[packet_index] = + pDstBuf[inner_index]; + pIV[packet_index] = pIV[inner_index]; + lensBuf[packet_index] = + lensBuf[inner_index]; + + pSrcBuf[inner_index] = srctempbuff; + pDstBuf[inner_index] = dsttempbuff; + pIV[inner_index] = ivtempbuff; + lensBuf[inner_index] = tempLen; + + if (pCtxBuf != NULL) { + tempCtx = pCtxBuf[packet_index]; + pCtxBuf[packet_index] = + pCtxBuf[inner_index]; + pCtxBuf[inner_index] = tempCtx; + } + + } + } /* for inner packet index (inner bubble-sort) */ + } /* for outer packet index (outer bubble-sort) */ + } /* if sortNeeded */ +} + /** * @brief Provides size of key schedule structure * @return Key schedule structure in bytes @@ -2204,8 +2311,8 @@ void SNOW3G_F8_4_BUFFER(const snow3g_key_schedule_t *pHandle, const size_t num_lanes = 4; snow3gKeyState4_t ctx; uint32_t lenInBytes[4]; - uint8_t *pBufferOut[4]; - const uint8_t *pBufferIn[4]; + void *pBufferOut[4]; + const void * pBufferIn[4]; uint32_t bytes, qwords, i; length_copy_4(lenInBytes, lengthInBytes1, lengthInBytes2, @@ -2355,6 +2462,27 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], if (!length_check(lengthInBytes, num_lanes)) return; #endif +#ifdef 
AARCH64_SVE256 + const void *pSrcBuf[NUM_PACKETS_8] = {NULL}; + void *pDstBuf[NUM_PACKETS_8] = {NULL}; + const void *pIV[NUM_PACKETS_8] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_8] = {0}; + const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_8] = {NULL}; + + memcpy((void *)lensBuf, lengthInBytes, NUM_PACKETS_8 * sizeof(uint32_t)); + memcpy((void *)pSrcBuf, BufferIn, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pDstBuf, BufferOut, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pIV, IV, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pCtxBuf, pKey, NUM_PACKETS_8 * sizeof(void *)); + + sortLanesByLength(pCtxBuf, pIV, pSrcBuf, pDstBuf, lensBuf, NUM_PACKETS_8); + SNOW3G_F8_8_BUFFER_MULTIKEY_ASM(pCtxBuf, + pIV, + pSrcBuf, + pDstBuf, + lensBuf); + +#else SNOW3G_F8_4_BUFFER_MULTIKEY(pKey[0], pKey[1], pKey[2], pKey[3], IV[0], IV[1], IV[2], IV[3], BufferIn[0], BufferOut[0], lengthInBytes[0], @@ -2368,6 +2496,7 @@ void SNOW3G_F8_8_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pKey[], BufferIn[5], BufferOut[5], lengthInBytes[5], BufferIn[6], BufferOut[6], lengthInBytes[6], BufferIn[7], BufferOut[7], lengthInBytes[7]); +#endif #ifdef SAFE_DATA CLEAR_SCRATCH_GPS(); CLEAR_SCRATCH_SIMD_REGS(); @@ -2451,9 +2580,9 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, const uint32_t lenInBytes8) { uint32_t lengthInBytes[8]; - const uint8_t *pBufferIn[8]; + const void *pBufferIn[8]; const void *pIV[8]; - uint8_t *pBufferOut[8]; + void *pBufferOut[8]; length_copy_8(lengthInBytes, lenInBytes1, lenInBytes2, lenInBytes3, lenInBytes4, @@ -2491,7 +2620,26 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, if (!length_check(lengthInBytes, num_lanes)) return; #endif +#ifdef AARCH64_SVE256 + const void *pSrcBuf[NUM_PACKETS_8] = {NULL}; + void *pDstBuf[NUM_PACKETS_8] = {NULL}; + const void *tmpIV[NUM_PACKETS_8] = {NULL}; + uint32_t lensBuf[NUM_PACKETS_8] = {0}; + + memcpy((void *)lensBuf, lengthInBytes, NUM_PACKETS_8 * sizeof(uint32_t)); + memcpy((void 
*)pSrcBuf, pBufferIn, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)pDstBuf, pBufferOut, NUM_PACKETS_8 * sizeof(void *)); + memcpy((void *)tmpIV, pIV, NUM_PACKETS_8 * sizeof(void *)); + sortLanesByLength(NULL, tmpIV, pSrcBuf, pDstBuf, lensBuf, NUM_PACKETS_8); + + SNOW3G_F8_8_BUFFER_ASM(pHandle, + tmpIV, + pSrcBuf, + pDstBuf, + lensBuf); + +#else SNOW3G_F8_4_BUFFER(pHandle, pIV[0], pIV[1], pIV[2], pIV[3], pBufferIn[0], pBufferOut[0], lengthInBytes[0], @@ -2505,6 +2653,7 @@ void SNOW3G_F8_8_BUFFER(const snow3g_key_schedule_t *pHandle, pBufferIn[5], pBufferOut[5], lengthInBytes[5], pBufferIn[6], pBufferOut[6], lengthInBytes[6], pBufferIn[7], pBufferOut[7], lengthInBytes[7]); +#endif } /** @@ -2559,14 +2708,10 @@ void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, return; } - uint32_t packet_index, inner_index, pktCnt = packetCount; - int sortNeeded = 0, tempLen = 0; - uint8_t *srctempbuff; - uint8_t *dsttempbuff; - uint8_t *ivtempbuff; - uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t packet_index, pktCnt = packetCount; + const void *pSrcBuf[NUM_PACKETS_16] = {NULL}; + void *pDstBuf[NUM_PACKETS_16] = {NULL}; + const void *pIV[NUM_PACKETS_16] = {NULL}; uint32_t lensBuf[NUM_PACKETS_16] = {0}; memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); @@ -2574,63 +2719,48 @@ void SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); memcpy((void *)pIV, IV, packetCount * sizeof(void *)); - packet_index = packetCount; - - while (packet_index--) { - - /* check if all packets are sorted by decreasing length */ - if (packet_index > 0 && lensBuf[packet_index - 1] < - lensBuf[packet_index]) { - /* this packet array is not correctly sorted */ - sortNeeded = 1; - } - } - - if (sortNeeded) { - - /* sort packets in decreasing buffer size from [0] to - [n]th packet, ** where buffer[0] will contain 
longest - buffer and buffer[n] will contain the shortest buffer. - 4 arrays are swapped : - - pointers to input buffers - - pointers to output buffers - - pointers to input IV's - - input buffer lengths */ - packet_index = packetCount; - while (packet_index--) { - - inner_index = packet_index; - while (inner_index--) { - - if (lensBuf[packet_index] > - lensBuf[inner_index]) { - - /* swap buffers to arrange in - descending order from [0]. */ - srctempbuff = pSrcBuf[packet_index]; - dsttempbuff = pDstBuf[packet_index]; - ivtempbuff = pIV[packet_index]; - tempLen = lensBuf[packet_index]; - - pSrcBuf[packet_index] = - pSrcBuf[inner_index]; - pDstBuf[packet_index] = - pDstBuf[inner_index]; - pIV[packet_index] = pIV[inner_index]; - lensBuf[packet_index] = - lensBuf[inner_index]; - - pSrcBuf[inner_index] = srctempbuff; - pDstBuf[inner_index] = dsttempbuff; - pIV[inner_index] = ivtempbuff; - lensBuf[inner_index] = tempLen; - } - } /* for inner packet index (inner bubble-sort) */ - } /* for outer packet index (outer bubble-sort) */ - } /* if sortNeeded */ + sortLanesByLength(NULL, pIV, pSrcBuf, pDstBuf, lensBuf, packetCount); packet_index = 0; +#ifdef AARCH64_SVE256 /* process 8 buffers at-a-time */ + while (pktCnt >= 8) { + pktCnt -= 8; + SNOW3G_F8_8_BUFFER(pCtx, pIV[packet_index + 0], + pIV[packet_index + 1], + pIV[packet_index + 2], + pIV[packet_index + 3], + pIV[packet_index + 4], + pIV[packet_index + 5], + pIV[packet_index + 6], + pIV[packet_index + 7], + pSrcBuf[packet_index + 0], + pDstBuf[packet_index + 0], + lensBuf[packet_index + 0], + pSrcBuf[packet_index + 1], + pDstBuf[packet_index + 1], + lensBuf[packet_index + 1], + pSrcBuf[packet_index + 2], + pDstBuf[packet_index + 2], + lensBuf[packet_index + 2], + pSrcBuf[packet_index + 3], + pDstBuf[packet_index + 3], + lensBuf[packet_index + 3], + pSrcBuf[packet_index + 4], + pDstBuf[packet_index + 4], + lensBuf[packet_index + 4], + pSrcBuf[packet_index + 5], + pDstBuf[packet_index + 5], + lensBuf[packet_index + 5], + 
pSrcBuf[packet_index + 6], + pDstBuf[packet_index + 6], + lensBuf[packet_index + 6], + pSrcBuf[packet_index + 7], + pDstBuf[packet_index + 7], + lensBuf[packet_index + 7]); + packet_index += 8; + } +#endif /* process 4 buffers at-a-time */ while (pktCnt >= 4) { pktCnt -= 4; @@ -2726,17 +2856,12 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], return; } - uint32_t packet_index, inner_index, pktCnt = packetCount; - int sortNeeded = 0, tempLen = 0; - uint8_t *srctempbuff; - uint8_t *dsttempbuff; - uint8_t *ivtempbuff; - snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pSrcBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pDstBuf[NUM_PACKETS_16] = {NULL}; - uint8_t *pIV[NUM_PACKETS_16] = {NULL}; + uint32_t packet_index, pktCnt = packetCount; + const snow3g_key_schedule_t *pCtxBuf[NUM_PACKETS_16] = {NULL}; + const void *pSrcBuf[NUM_PACKETS_16] = {NULL}; + void *pDstBuf[NUM_PACKETS_16] = {NULL}; + const void *pIV[NUM_PACKETS_16] = {NULL}; uint32_t lensBuf[NUM_PACKETS_16] = {0}; - snow3g_key_schedule_t *tempCtx; memcpy((void *)pCtxBuf, pCtx, packetCount * sizeof(void *)); memcpy((void *)lensBuf, bufLenInBytes, packetCount * sizeof(uint32_t)); @@ -2744,60 +2869,7 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], memcpy((void *)pDstBuf, pBufferOut, packetCount * sizeof(void *)); memcpy((void *)pIV, IV, packetCount * sizeof(void *)); - packet_index = packetCount; - - while (packet_index--) { - - /* check if all packets are sorted by decreasing length */ - if (packet_index > 0 && lensBuf[packet_index - 1] < - lensBuf[packet_index]) { - /* this packet array is not correctly sorted */ - sortNeeded = 1; - } - } - - if (sortNeeded) { - /* sort packets in decreasing buffer size from [0] to [n]th - packet, where buffer[0] will contain longest buffer and - buffer[n] will contain the shortest buffer. 
- 4 arrays are swapped : - - pointers to input buffers - - pointers to output buffers - - pointers to input IV's - - input buffer lengths */ - packet_index = packetCount; - while (packet_index--) { - inner_index = packet_index; - while (inner_index--) { - if (lensBuf[packet_index] > - lensBuf[inner_index]) { - /* swap buffers to arrange in - descending order from [0]. */ - srctempbuff = pSrcBuf[packet_index]; - dsttempbuff = pDstBuf[packet_index]; - ivtempbuff = pIV[packet_index]; - tempLen = lensBuf[packet_index]; - tempCtx = pCtxBuf[packet_index]; - - pSrcBuf[packet_index] = - pSrcBuf[inner_index]; - pDstBuf[packet_index] = - pDstBuf[inner_index]; - pIV[packet_index] = pIV[inner_index]; - lensBuf[packet_index] = - lensBuf[inner_index]; - pCtxBuf[packet_index] = - pCtxBuf[inner_index]; - - pSrcBuf[inner_index] = srctempbuff; - pDstBuf[inner_index] = dsttempbuff; - pIV[inner_index] = ivtempbuff; - lensBuf[inner_index] = tempLen; - pCtxBuf[inner_index] = tempCtx; - } - } /* for inner packet index (inner bubble-sort) */ - } /* for outer packet index (outer bubble-sort) */ - } /* if sortNeeded */ + sortLanesByLength(pCtxBuf, pIV, pSrcBuf, pDstBuf, lensBuf, packetCount); packet_index = 0; /* process 8 buffers at-a-time */ @@ -2814,52 +2886,25 @@ void SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t * const pCtx[], } } +#ifndef AARCH64_SVE256 /** * @brief Initializes the four keys for SNOW3G f8/f9. + * Only called by JOB API. 
* * @param [in/out] pCtx Pointer to snow3g state - * @param [in] pKeySched1 Key1 schedule - * @param [in] pKeySched2 Key2 schedule - * @param [in] pKeySched3 Key3 schedule - * @param [in] pKeySched4 Key4 schedule - * @param [in] pIV1 IV for buffer 1 - * @param [in] pIV2 IV for buffer 2 - * @param [in] pIV3 IV for buffer 3 - * @param [in] pIV4 IV for buffer 4 + * @param [in] pKeySched pointer to key schedule + * @param [in] pIV pointer to IV */ void -SNOW3G_F8_4_BUFFER_INITIALIZE(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4) +SNOW3G_F8_4_BUFFER_INITIALIZE_JOB(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV) { -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - } - if (pKeySched1 == NULL || pKeySched2 == NULL || - pKeySched3 == NULL || pKeySched4 == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); - return; - } - if ((pIV1 == NULL) || pIV2 == NULL || - (pIV3 == NULL) || (pIV4 == NULL)) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } -#endif /* Initialize the schedule from the IV */ snow3gStateInitialize_4_multikey((snow3gKeyState4_t *)pCtx, - pKeySched1, pKeySched2, - pKeySched3, pKeySched4, - pIV1, pIV2, pIV3, pIV4); + pKeySched[0], pKeySched[1], + pKeySched[2], pKeySched[3], + pIV[0], pIV[1], pIV[2], pIV[3]); /* Clock FSM and LFSR once, ignore the key stream */ (void) snow3g_keystream_4_4((snow3gKeyState4_t *)pCtx); @@ -2869,58 +2914,21 @@ SNOW3G_F8_4_BUFFER_INITIALIZE(void *pCtx, /** * @brief Four buffer F8 encrypt/decrypt after initialize. + * Only called by JOB API. 
* * @param[in/out] pCtx pointer to snow3g state - * @param[in] pBufferIn1 pointer to an input buffer - * @param[out] pBufferOut1 pointer to an output buffer - * @param[in] pBufferIn2 pointer to an input buffer - * @param[out] pBufferOut2 pointer to an output buffer - * @param[in] pBufferIn3 pointer to an input buffer - * @param[out] pBufferOut3 pointer to an output buffer - * @param[in] pBufferIn4 pointer to an input buffer - * @param[out] pBufferOut4 pointer to an output buffer + * @param[in] pBufferIn pointer to an input buffer array + * @param[out] pBufferOut pointer to an output buffer array + * @param[in] lengthInBytes message length in bytes */ -void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, - const void *pBufferIn1, - void *pBufferOut1, - const void *pBufferIn2, - void *pBufferOut2, - const void *pBufferIn3, - void *pBufferOut3, - const void *pBufferIn4, - void *pBufferOut4, - const uint32_t lengthInBytes) +void SNOW3G_F8_4_BUFFER_STREAM_JOB(void *pCtx, + const void **pBufferIn, + void **pBufferOut, + const uint32_t lengthInBytes) { const uint32_t num_lanes = 4; snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx; uint32_t words; - uint8_t *pBufferOut[4]; - const uint8_t *pBufferIn[4]; - - cptr_copy_4((const void **)pBufferIn, - pBufferIn1, pBufferIn2, pBufferIn3, pBufferIn4); - - ptr_copy_4((void **)pBufferOut, pBufferOut1, pBufferOut2, - pBufferOut3, pBufferOut4); -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (!cptr_check((const void * const *)pBufferIn, - num_lanes, - IMB_ERR_NULL_SRC)) - return; - if (!ptr_check((void **)pBufferOut, num_lanes, IMB_ERR_NULL_DST)) - return; - if ((lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } -#endif #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); @@ -2949,9 +2957,11 @@ void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, 
CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ } +#endif /** * @brief One buffer F8 encrypt/decrypt after initialize. + * Only called by JOB API. * * One packet enc/dec after initialize. * @@ -2960,32 +2970,11 @@ void SNOW3G_F8_4_BUFFER_STREAM(void *pCtx, * @param[out] pBufferOut pointer to an output buffer * @param[in] lengthInBytes length in bytes */ -void SNOW3G_F8_1_BUFFER_STREAM(void *pCtx, - const void *pBufferIn, - void *pBufferOut, - const uint32_t lengthInBytes) +void SNOW3G_F8_1_BUFFER_STREAM_JOB(void *pCtx, + const void *pBufferIn, + void *pBufferOut, + const uint32_t lengthInBytes) { -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - if ((lengthInBytes == 0) || (lengthInBytes > SNOW3G_MAX_BYTELEN)) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } -#endif f8_snow3g((snow3gKeyState1_t *)pCtx, pBufferIn, pBufferOut, lengthInBytes); #ifdef SAFE_DATA CLEAR_SCRATCH_GPS(); @@ -3048,7 +3037,7 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, /*Generate 5 key stream words*/ snow3g_f9_keystream_words(&ctx, &z[0]); - SNOW3G_F9_1_BUFFER_DIGEST(z, pBufferIn, lengthInBits, pDigest); + SNOW3G_F9_1_BUFFER_DIGEST_JOB(z, pBufferIn, lengthInBits, pDigest); #ifdef SAFE_DATA CLEAR_MEM(&z, sizeof(z)); @@ -3068,33 +3057,11 @@ void SNOW3G_F9_1_BUFFER(const snow3g_key_schedule_t *pHandle, * @param[in] lengthInBits message length in bits * @param[out] pDigest pointer to store the F9 digest */ -void SNOW3G_F9_1_BUFFER_DIGEST(const uint32_t z[5], - const void *pBufferIn, - const uint64_t lengthInBits, - void *pDigest) +void SNOW3G_F9_1_BUFFER_DIGEST_JOB(const uint32_t z[5], + const void *pBufferIn, + const uint64_t lengthInBits, + void *pDigest) { -#ifdef SAFE_PARAM - /* 
reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - if (z == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH_KEY); - return; - } - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - if (pDigest == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - if ((lengthInBits == 0) || (lengthInBits > SNOW3G_MAX_BITLEN)) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } -#endif - #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ @@ -3235,37 +3202,25 @@ void SNOW3G_F9_1_BUFFER_DIGEST(const uint32_t z[5], #endif /* SAFE_DATA */ } +#ifndef AARCH64_SVE256 /** * @brief Four buffer F9 keystream generation. + * Only called by JOB API. * - * @param[in/out] pCtx pointer to snow3g state - * @param[out] ks1 pointer to output keystream1 - * @param[out] ks2 pointer to output keystream2 - * @param[out] ks3 pointer to output keystream3 - * @param[out] ks4 pointer to output keystream4 + * @param[in/out] pCtx pointer to snow3g state + * @param[out] keystream pointer to output keystream */ -void SNOW3G_F9_4_BUFFER_KEYSTREAM(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]) + +void SNOW3G_F9_4_BUFFER_KEYSTREAM_JOB(void *pCtx, + uint32_t *keystream) { snow3gKeyState4_t *ctx = (snow3gKeyState4_t *)pCtx; -#ifdef SAFE_PARAM - /* reset error status */ - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - if (pCtx == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_CTX); - return; - } - if (ks1 == NULL || ks2 == NULL || ks3 == NULL || ks4 == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH_KEY); - return; - } -#endif + uint32_t *ks1, *ks2, *ks3, *ks4; + ks1 = keystream; + ks2 = keystream + 5; + ks3 = keystream + 10; + ks4 = keystream + 15; #ifdef SAFE_DATA CLEAR_SCRATCH_SIMD_REGS(); @@ -3284,5 +3239,6 @@ void SNOW3G_F9_4_BUFFER_KEYSTREAM(void *pCtx, CLEAR_SCRATCH_SIMD_REGS(); #endif /* SAFE_DATA */ } +#endif #endif /* SNOW3G_COMMON_H */ diff --git 
a/lib/aarch64/snow3g_impl_aarch64_sve256.S b/lib/aarch64/snow3g_impl_aarch64_sve256.S new file mode 100644 index 0000000000000000000000000000000000000000..42934f3848b88876558b95fd57a8e48398617db3 --- /dev/null +++ b/lib/aarch64/snow3g_impl_aarch64_sve256.S @@ -0,0 +1,1532 @@ +/******************************************************************************* + Copyright (c) 2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +.arch armv8-a+sve+crypto + +#define VECTOR_LEN 32 + +.section .data + +.align 8 +.type snow3g_invSR_SQ, %object +snow3g_invSR_SQ: +.byte 0xC2, 0xA6, 0x8F, 0x0A, 0x0D, 0xBE, 0xA7, 0x08 +.byte 0x1D, 0x99, 0x45, 0x59, 0x13, 0xD2, 0x11, 0x9F +.byte 0xAE, 0xE6, 0xD4, 0xA4, 0x92, 0x8D, 0x58, 0xC1 +.byte 0xD0, 0x97, 0xC8, 0x84, 0x9D, 0x4F, 0xBC, 0x3B +.byte 0x2D, 0xEB, 0x27, 0x53, 0x72, 0x4E, 0xE3, 0xEE +.byte 0xDA, 0x7F, 0xAA, 0x4D, 0x5C, 0x2F, 0x44, 0xDB +.byte 0x3E, 0x3A, 0x67, 0xC5, 0xC3, 0x6A, 0x16, 0x4C +.byte 0x38, 0xCC, 0xD7, 0xDD, 0x70, 0x62, 0xF2, 0x19 +.byte 0x10, 0x09, 0x98, 0x4B, 0x61, 0xC9, 0x86, 0x03 +.byte 0xA8, 0x6B, 0x5A, 0x33, 0x6E, 0x54, 0x5D, 0x8C +.byte 0x41, 0x1A, 0xF7, 0xF6, 0x82, 0xC6, 0xF8, 0x80 +.byte 0xC0, 0xC7, 0xFE, 0xB3, 0x65, 0x2C, 0x7B, 0xBA +.byte 0xB4, 0xFC, 0x2A, 0x22, 0x0C, 0x73, 0xF5, 0x5F +.byte 0x64, 0x68, 0x2E, 0x94, 0xB2, 0x24, 0x35, 0x14 +.byte 0x78, 0xFB, 0xBF, 0x48, 0xDE, 0xED, 0x43, 0x07 +.byte 0xB6, 0x32, 0xE4, 0xBD, 0x74, 0x7D, 0x57, 0x46 +.byte 0x3C, 0x37, 0xC4, 0xB7, 0x51, 0x8A, 0xF3, 0x55 +.byte 0x6C, 0xCF, 0x79, 0xAB, 0x77, 0xA3, 0xE1, 0x93 +.byte 0xD5, 0x6D, 0x81, 0x5B, 0x2B, 0x9A, 0x7E, 0x8B +.byte 0x04, 0xB5, 0x85, 0xD3, 0x91, 0xA1, 0x47, 0x52 +.byte 0xA5, 0xEC, 0xD6, 0xBB, 0x20, 0x87, 0x26, 0xF0 +.byte 0xAF, 0x4A, 0x89, 0xF4, 0xCE, 0x25, 0xCB, 0x50 +.byte 0x00, 0x3F, 0xD9, 0x42, 0x90, 0x21, 0x3D, 0xA9 +.byte 0xE7, 0x29, 0x01, 0xF1, 0x36, 0x5E, 0xFA, 0xCD +.byte 0xE5, 0x31, 0x1B, 0x05, 0xFD, 0x9E, 0xA0, 0x76 +.byte 0x30, 0xB1, 0x75, 0xB0, 0x9B, 0x56, 0xEA, 0x1C +.byte 0xEF, 0x06, 0x69, 0x7A, 0x95, 0x88, 0x15, 0xFF +.byte 0xCA, 0xAC, 0x0E, 0x23, 0xD8, 0x0F, 0x28, 0x0B +.byte 0x18, 0xF9, 0x63, 0x1E, 0x83, 0x66, 0x39, 0x9C +.byte 0xE2, 0x49, 0x1F, 0xE8, 0xD1, 0x34, 0x7C, 0xA2 +.byte 0xB9, 0xE0, 0x02, 0x12, 0xE9, 0xDF, 0xAD, 0x71 +.byte 0x96, 0x8E, 0x6F, 0xB8, 0x40, 0x60, 0x17, 0xDC +.size snow3g_invSR_SQ,.-snow3g_invSR_SQ + +.align 8 
+.type snow3g_MULa, %object +snow3g_MULa: +.byte 0x00, 0x13, 0x26, 0x35, 0x4C, 0x5F, 0x6A, 0x79 +.byte 0x98, 0x8B, 0xBE, 0xAD, 0xD4, 0xC7, 0xF2, 0xE1 +.byte 0x00, 0xCF, 0x37, 0xF8, 0x6E, 0xA1, 0x59, 0x96 +.byte 0xDC, 0x13, 0xEB, 0x24, 0xB2, 0x7D, 0x85, 0x4A +.byte 0x00, 0x9F, 0x97, 0x08, 0x87, 0x18, 0x10, 0x8F +.byte 0xA7, 0x38, 0x30, 0xAF, 0x20, 0xBF, 0xB7, 0x28 +.byte 0x00, 0xE1, 0x6B, 0x8A, 0xD6, 0x37, 0xBD, 0x5C +.byte 0x05, 0xE4, 0x6E, 0x8F, 0xD3, 0x32, 0xB8, 0x59 +.byte 0x00, 0x99, 0x9B, 0x02, 0x9F, 0x06, 0x04, 0x9D +.byte 0x97, 0x0E, 0x0C, 0x95, 0x08, 0x91, 0x93, 0x0A +.byte 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77 +.byte 0x88, 0x99, 0xAA, 0xBB, 0xCC, 0xDD, 0xEE, 0xFF +.byte 0x00, 0xE7, 0x67, 0x80, 0xCE, 0x29, 0xA9, 0x4E +.byte 0x35, 0xD2, 0x52, 0xB5, 0xFB, 0x1C, 0x9C, 0x7B +.byte 0x00, 0x0A, 0x14, 0x1E, 0x28, 0x22, 0x3C, 0x36 +.byte 0x50, 0x5A, 0x44, 0x4E, 0x78, 0x72, 0x6C, 0x66 +.size snow3g_MULa,.-snow3g_MULa + +.align 8 +.type snow3g_DIVa, %object +snow3g_DIVa: +.byte 0x00, 0xCD, 0x33, 0xFE, 0x66, 0xAB, 0x55, 0x98 +.byte 0xCC, 0x01, 0xFF, 0x32, 0xAA, 0x67, 0x99, 0x54 +.byte 0x00, 0x40, 0x80, 0xC0, 0xA9, 0xE9, 0x29, 0x69 +.byte 0xFB, 0xBB, 0x7B, 0x3B, 0x52, 0x12, 0xD2, 0x92 +.byte 0x00, 0x0F, 0x1E, 0x11, 0x3C, 0x33, 0x22, 0x2D +.byte 0x78, 0x77, 0x66, 0x69, 0x44, 0x4B, 0x5A, 0x55 +.byte 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48 +.byte 0xC0, 0xD8, 0xF0, 0xE8, 0xA0, 0xB8, 0x90, 0x88 +.byte 0x00, 0x31, 0x62, 0x53, 0xC4, 0xF5, 0xA6, 0x97 +.byte 0x21, 0x10, 0x43, 0x72, 0xE5, 0xD4, 0x87, 0xB6 +.byte 0x00, 0x5F, 0xBE, 0xE1, 0xD5, 0x8A, 0x6B, 0x34 +.byte 0x03, 0x5C, 0xBD, 0xE2, 0xD6, 0x89, 0x68, 0x37 +.byte 0x00, 0xF0, 0x49, 0xB9, 0x92, 0x62, 0xDB, 0x2B +.byte 0x8D, 0x7D, 0xC4, 0x34, 0x1F, 0xEF, 0x56, 0xA6 +.byte 0x00, 0x29, 0x52, 0x7B, 0xA4, 0x8D, 0xF6, 0xDF +.byte 0xE1, 0xC8, 0xB3, 0x9A, 0x45, 0x6C, 0x17, 0x3E +.size snow3g_DIVa,.-snow3g_DIVa + +.align 6 +.type n_inv_aes_shift_row, %object +n_inv_aes_shift_row: +.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 
0x01, 0x0e, 0x0b +.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 +.byte 0x10, 0x1d, 0x1a, 0x17, 0x14, 0x11, 0x1e, 0x1b +.byte 0x18, 0x15, 0x12, 0x1f, 0x1c, 0x19, 0x16, 0x13 +.size n_inv_aes_shift_row,.-n_inv_aes_shift_row + +.align 6 +.type ror8, %object +ror8: +.word 0x00030201, 0x04070605, 0x080b0a09, 0x0c0f0e0d +.word 0x10131211, 0x14171615, 0x181b1a19, 0x1c1f1e1d +.size ror8,.-ror8 + +.align 6 +.type gather_clear_mask_mul, %object +gather_clear_mask_mul: +.byte 0x03, 0x07, 0x0b, 0x0f, 0x13, 0x17, 0x1b, 0x1f +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.size gather_clear_mask_mul,.-gather_clear_mask_mul + +.align 6 +.type gather_clear_mask_div, %object +gather_clear_mask_div: +.byte 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +.size gather_clear_mask_div,.-gather_clear_mask_div + +.align 6 +.type iv_swap_mask, %object +iv_swap_mask: +.quad 0x0405060700010203, 0x0c0d0e0f08090a0b +.size iv_swap_mask,.-iv_swap_mask + +.section .text + +#define START_FUNC(fn) .globl fn; \ + .type fn, %function; \ + .align 6; \ + fn: + +#define END_FUNC(fn) .size fn,.-fn + +.macro declare_register name:req, reg:req +.ifdef def_\name + .unreq \name +.endif + .set def_\name, 0 + \name .req \reg +.endm + +declare_register LFSR_S0, z12 +declare_register LFSR_S1, z13 +declare_register LFSR_S2, z14 +declare_register LFSR_S3, z15 +declare_register LFSR_S4, z16 +declare_register LFSR_S5, z17 +declare_register LFSR_S6, z18 +declare_register LFSR_S7, z19 +declare_register LFSR_S8, z20 +declare_register LFSR_S9, z21 +declare_register LFSR_S10, z22 +declare_register LFSR_S11, z23 +declare_register LFSR_S12, z24 +declare_register LFSR_S13, z25 +declare_register LFSR_S14, z26 +declare_register LFSR_S15, z27 
+declare_register FSM_R1, z28 +declare_register FSM_R2, z29 +declare_register FSM_R3, z30 +declare_register zINV_SHIFT_ROW, z31 +declare_register zTMP0, z0 +declare_register zTMP1, z1 +declare_register zTMP2, z2 +declare_register zTMP3, z3 +declare_register zTMP4, z4 +declare_register zTMP5, z5 +declare_register zTMP6, z6 +declare_register zTMP7, z7 +declare_register zTMP8, z8 +declare_register zTMP9, z9 +declare_register zTMP10, z10 +declare_register zTMP11, z11 +declare_register vTMP0, v0 +declare_register vTMP1, v1 +declare_register vTMP2, v2 +declare_register vTMP3, v3 +declare_register vTMP4, v4 +declare_register vTMP5, v5 +declare_register vTMP6, v6 +declare_register vTMP7, v7 +declare_register vTMP8, v8 +declare_register vTMP9, v9 +declare_register vTMP10, v10 +declare_register vTMP11, v11 +declare_register xTMP0, x13 +declare_register xTMP1, x14 +declare_register xTMP2, x15 +declare_register xTMP3, x16 +declare_register xTMP4, x17 +declare_register xTMP5, x18 +declare_register xTMP6, x19 +declare_register xTMP7, x20 +declare_register xTMP8, x21 +declare_register xTMP9, x22 +declare_register xTMP10, x23 +declare_register xTMP11, x24 +declare_register xTMP12, x25 +declare_register xTMP13, x26 +declare_register xTMP14, x27 +declare_register xTMP15, x28 +declare_register xTMP16, x9 +declare_register xTMP17, x10 +declare_register xTMP18, x11 +declare_register xTMP19, x12 + +declare_register wTMP15, w28 +declare_register wTMP16, w9 +declare_register wTMP17, w10 +declare_register wTMP18, w11 +declare_register wTMP19, w12 +declare_register PRED8, p3 +declare_register PRED32, p4 +declare_register PRED32_HALF1, p5 +declare_register PRED32_HALF2, p6 +declare_register pTMP0, p2 + +.macro FUNC_SCALAR_SAVE + stp x19, x20, [sp, -80]! 
+ stp x21, x22, [sp, 16]
+ stp x23, x24, [sp, 32]
+ stp x25, x26, [sp, 48]
+ stp x27, x28, [sp, 64]
+.endm
+
+.macro FUNC_SCALAR_RESTORE
+ ldp x21, x22, [sp, 16]
+ ldp x23, x24, [sp, 32]
+ ldp x25, x26, [sp, 48]
+ ldp x27, x28, [sp, 64]
+ ldp x19, x20, [sp], 80
+.endm
+
+.macro FUNC_VECTOR_SAVE
+ stp d8, d9, [sp, -64]!
+ stp d10, d11, [sp, 16]
+ stp d12, d13, [sp, 32]
+ stp d14, d15, [sp, 48]
+.endm
+
+.macro FUNC_VECTOR_RESTORE
+ ldp d10, d11, [sp, 16]
+ ldp d12, d13, [sp, 32]
+ ldp d14, d15, [sp, 48]
+ ldp d8, d9, [sp], 64
+.endm
+
+/*
+ * S1_BOX_8_SVE256()
+ *
+ * params
+ * \x - input value
+ * \rslt - return value
+ * uses
+ * zTMP0-2
+ */
+.macro S1_BOX_8_SVE256 x, rslt
+ tbl zTMP0.B, \x\().B, zINV_SHIFT_ROW.B
+ compact zTMP1.S, PRED32_HALF2, zTMP0.S
+ movi vTMP2.16B, #0
+ aese vTMP0.16B, vTMP2.16B
+ aesmc vTMP0.16B, vTMP0.16B
+ aese VTMP1.16B, vTMP2.16B
+ aesmc vTMP1.16B, vTMP1.16B
+ insr zTMP1.D, X0
+ insr zTMP1.D, X0
+ mov \rslt\().S, PRED32_HALF1/M, zTMP0.S
+ mov \rslt\().S, PRED32_HALF2/M, zTMP1.S
+.endm
+
+/*
+ * LOOKUP_32X8BIT_SVE256()
+ *
+ * params
+ * \index - input value
+ * \lookup - lookup table
+ * \rslt - return value
+ * uses
+ * zTMP0-2
+ */
+.macro LOOKUP_32X8BIT_SVE256 index, lookup, rslt
+ mov zTMP0.B, PRED8/Z, #32
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #0, MUL VL]
+ tbl \rslt\().B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #1, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #2, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #3, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B
+ eor \rslt\().D, \rslt\().D, zTMP2.D
+
+ ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #4, MUL VL]
+ tbl zTMP2.B, {zTMP1.B}, \index\().B
+ sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #5, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #6, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D + + ld1b {zTMP1.B}, PRED8/Z, [\lookup\(), #7, MUL VL] + tbl zTMP2.B, {zTMP1.B}, \index\().B + sub \index\().B, \index\().B, zTMP0.B + eor \rslt\().D, \rslt\().D, zTMP2.D +.endm + +/* + * S2_MIXC_FIXUP_8_SVE256() + * + * params + * \no_mixc - input value + * \mixc - lookup table + * \rslt - return value + * uses + * xTMP0, zTMP0-2 + */ +.macro S2_MIXC_FIXUP_8_SVE256 no_mixc, mixc, rslt + // PAT = CMLT(NO_MIXC); + cmplt pTMP0.B, PRED8/Z, \no_mixc\().B, #0 + mov zTMP1.B, pTMP0/Z, #0xFF + // PAT_SHUF = TBL(PAT, ROR8); + adrp xTMP0, ror8 + add xTMP0, xTMP0, #:lo12:ror8 + ld1b {zTMP0.B}, PRED8/Z, [xTMP0] + tbl zTMP2.B, {zTMP1.B}, zTMP0.B + // RSLT = MIXC ^ (0X72 AND (PAT ^ PAT_SHUF)) + eor zTMP1.D, zTMP1.D, zTMP2.D + mov zTMP2.B, PRED8/Z, #0x72 + and zTMP0.D, zTMP2.D, zTMP1.D + eor \rslt\().D, zTMP0.D, \mixc\().D +.endm + +/* + * S2_BOX_8_SVE256() + * + * params + * \x - input value + * \rslt - return value + * uses + * xTMP0, zTMP0-4 + */ +.macro S2_BOX_8_SVE256 x, rslt + // NEW_X = TBL(LOOKUP(X, snow3g_invSR_SQ), inv_aes_shift_row); + adrp xTMP0, snow3g_invSR_SQ + add xTMP0, xTMP0, #:lo12:snow3g_invSR_SQ + LOOKUP_32X8BIT_SVE256 \x\(), xTMP0, \rslt\() + tbl zTMP3.B, \rslt\().B, zINV_SHIFT_ROW.B + compact zTMP1.S, PRED32_HALF2, zTMP3.S + // NOMIXC = AESE(NEW_X, 0) + movi vTMP2.16B, #0 + aese vTMP3.16B, vTMP2.16B + aese vTMP1.16B, vTMP2.16B + // MIXC = AESMC(NOMIXC) + aesmc vTMP4.16B, vTMP3.16B + aesmc vTMP0.16B, vTMP1.16B + insr zTMP1.D, X0 + insr zTMP1.D, X0 + insr zTMP0.D, X0 + insr zTMP0.D, X0 + mov zTMP3.S, PRED32_HALF2/M, zTMP1.S + mov zTMP4.S, 
PRED32_HALF2/M, zTMP0.S + // S2_MIXC_FIXUP(NOMIXC, MIXC) + S2_MIXC_FIXUP_8_SVE256 zTMP3, zTMP4, \rslt\() +.endm + +/* + * MUL_DIV_A_8_SVE256() + * + * params + * \S - input value, S0 or S11 + * \rslt - return value + * uses + * xTMP0, zTMP0-4 + */ +.macro MUL_DIV_A_8_SVE256 MUL_OR_DIV S, rslt + // L = S0,3 & 0x0F + mov zTMP0.B, PRED8/Z, #0x0F +.ifc \MUL_OR_DIV, MUL + adrp xTMP0, gather_clear_mask_mul + add xTMP0, xTMP0, #:lo12:gather_clear_mask_mul +.else + adrp xTMP0, gather_clear_mask_div + add xTMP0, xTMP0, #:lo12:gather_clear_mask_div +.endif + ld1b {zTMP1.B}, PRED8/Z, [xTMP0] + + // TL = TBL8(MULa_B0, L) || TBL8(MULa_B1, L) || TBL8(MULa_B2, L) || TBL8(MULa_B3, L) + tbl zTMP1.B, \S\().B, zTMP1.B + and zTMP0.D, zTMP1.D, zTMP0.D +.ifc \MUL_OR_DIV, MUL + adrp xTMP0, snow3g_MULa + add xTMP0, xTMP0, #:lo12:snow3g_MULa +.else + adrp xTMP0, snow3g_DIVa + add xTMP0, xTMP0, #:lo12:snow3g_DIVa +.endif + ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32 + tbl zTMP2.B, zTMP2.B, zTMP0.B + tbl zTMP3.B, zTMP3.B, zTMP0.B + zip1 zTMP2.B, zTMP2.B, zTMP3.B + ld1 {vTMP3.16b,vTMP4.16b},[xTMP0],#32 + tbl zTMP3.B, zTMP3.B, zTMP0.B + tbl zTMP4.B, zTMP4.B, zTMP0.B + zip1 zTMP3.B, zTMP3.B, zTMP4.B + zip1 zTMP0.H, zTMP2.H, zTMP3.H + + // H = S0,3 & 0xF0 + lsr zTMP1.B, PRED8/M, zTMP1.B, #4 + + // TH = TBL8(MULa_B4, H) || TBL8(MULa_B5, H) || TBL8(MULa_B6, H) || TBL8(MULa_B7, H) + ld1 {vTMP2.16b,vTMP3.16b},[xTMP0],#32 + tbl zTMP2.B, zTMP2.B, zTMP1.B + tbl zTMP3.B, zTMP3.B, zTMP1.B + zip1 zTMP2.B, zTMP2.B, zTMP3.B + ld1 {vTMP3.16b,vTMP4.16b},[xTMP0] + tbl zTMP3.B, zTMP3.B, zTMP1.B + tbl zTMP4.B, zTMP4.B, zTMP1.B + zip1 zTMP3.B, zTMP3.B, zTMP4.B + zip1 zTMP1.H, zTMP2.H, zTMP3.H + + // RSLT = TL ^ TH + eor \rslt\().D, zTMP1.D, zTMP0.D +.endm + +/* + * CLOCK_FSM_8_SVE256() + * + * params + * \F - input value + * uses + * xTMP0, zTMP0-5 + */ +.macro CLOCK_FSM_8_SVE256 F + // F = (S15 + R1) ^ R2 + // R = R2 + (R3 ^ S5) + add \F\().S, LFSR_S15.S, FSM_R1.S + eor zTMP5.D, LFSR_S5.D, FSM_R3.D + eor \F\().D, 
\F\().D, FSM_R2.D + add zTMP5.S, zTMP5.S, FSM_R2.S + // R3 = S2(R2); + S2_BOX_8_SVE256 FSM_R2, FSM_R3 + // R2 = S1(R1); + S1_BOX_8_SVE256 FSM_R1, FSM_R2 + // R1 = R; + mov FSM_R1.D, zTMP5.D +.endm + +/* + * SHIFT_LFSR_8_SVE256() + * + * uses + * zTMP0-2 + */ +.macro SHIFT_LFSR_8_SVE256 S15 + mov zTMP0.D, LFSR_S4.D + mov zTMP1.D, LFSR_S8.D + mov zTMP2.D, LFSR_S12.D + + mov LFSR_S0.D, LFSR_S1.D + mov LFSR_S4.D, LFSR_S5.D + mov LFSR_S8.D, LFSR_S9.D + mov LFSR_S12.D, LFSR_S13.D + + mov LFSR_S1.D, LFSR_S2.D + mov LFSR_S5.D, LFSR_S6.D + mov LFSR_S9.D, LFSR_S10.D + mov LFSR_S13.D, LFSR_S14.D + + mov LFSR_S2.D, LFSR_S3.D + mov LFSR_S6.D, LFSR_S7.D + mov LFSR_S10.D, LFSR_S11.D + mov LFSR_S14.D, LFSR_S15.D + + mov LFSR_S3.D, zTMP0.D + mov LFSR_S7.D, zTMP1.D + mov LFSR_S11.D, zTMP2.D + mov LFSR_S15.D, \S15\().D +.endm + +/* + * CLOCK_LFSR_8_SVE256() + * + * uses + * xTMP0, zTMP0-6 + */ +.macro CLOCK_LFSR_8_SVE256 + // V = (S0 << 8) ^ MULa(S0) ^ S2 ^ (S11 >> 8) ^ DIVa(S11) + MUL_DIV_A_8_SVE256 MUL LFSR_S0, zTMP5 + MUL_DIV_A_8_SVE256 DIV LFSR_S11, zTMP6 + eor zTMP5.D, zTMP5.D, zTMP6.D + lsl zTMP3.S, LFSR_S0.S, #8 + lsr zTMP1.S, LFSR_S11.S, #8 + eor zTMP3.D, zTMP3.D, zTMP1.D + eor zTMP5.D, zTMP5.D, LFSR_S2.D + eor zTMP3.D, zTMP3.D, zTMP5.D + SHIFT_LFSR_8_SVE256 zTMP3 +.endm + +/* + * SNOW3G_KEYSTREAM_8_4_SVE256() + * + * params + * \KEY - output keystream + * uses + * xTMP0, zTMP0-6 + */ +.macro SNOW3G_KEYSTREAM_8_4_SVE256 KEY + CLOCK_FSM_8_SVE256 \KEY\() + eor \KEY\().D, \KEY\().D, LFSR_S0.D + CLOCK_LFSR_8_SVE256 +.endm + +/* + * INTERLEAVE_IV_KEY_8() + * + * uses + * xTMP0, zTMP0-3 when SWAP == 0 + * xTMP0, zTMP0-4 when SWAP == 1 + */ +.macro INTERLEAVE_IV_KEY_8 SWAP RSLT0, RSLT1, RSLT2, RSLT3, \ + ADDR1, ADDR2, ADDR3, ADDR4, \ + ADDR5, ADDR6, ADDR7, ADDR8 + ld1 {v\RSLT0\().4S}, [\ADDR1\()] + ld1 {vTMP0.4S}, [\ADDR2\()] + ld1 {v\RSLT1\().4S}, [\ADDR3\()] + ld1 {vTMP1.4S}, [\ADDR4\()] + ld1 {v\RSLT2\().4S}, [\ADDR5\()] + ld1 {vTMP2.4S}, [\ADDR6\()] + ld1 {v\RSLT3\().4S}, 
[\ADDR7\()] + ld1 {vTMP3.4S}, [\ADDR8\()] +.if \SWAP == 1 + adrp xTMP0, iv_swap_mask + add xTMP0, xTMP0, #:lo12:iv_swap_mask + ld1 {vTMP4.4S}, [xTMP0] + tbl vTMP0.16B, {vTMP0.16B}, vTMP4.16B + tbl vTMP1.16B, {vTMP1.16B}, vTMP4.16B + tbl vTMP2.16B, {vTMP2.16B}, vTMP4.16B + tbl vTMP3.16B, {vTMP3.16B}, vTMP4.16B + tbl v\RSLT0\().16B, {v\RSLT0\().16B}, vTMP4.16B + tbl v\RSLT1\().16B, {v\RSLT1\().16B}, vTMP4.16B + tbl v\RSLT2\().16B, {v\RSLT2\().16B}, vTMP4.16B + tbl v\RSLT3\().16B, {v\RSLT3\().16B}, vTMP4.16B +.endif + zip1 z\RSLT0\().S, z\RSLT0\().S, zTMP0.S + zip1 z\RSLT1\().S, z\RSLT1\().S, zTMP1.S + zip1 z\RSLT2\().S, z\RSLT2\().S, zTMP2.S + zip1 z\RSLT3\().S, z\RSLT3\().S, zTMP3.S + + zip1 zTMP0.D, z\RSLT0\().D, z\RSLT1\().D + zip2 zTMP1.D, z\RSLT0\().D, z\RSLT1\().D + zip1 zTMP2.D, z\RSLT2\().D, z\RSLT3\().D + zip2 zTMP3.D, z\RSLT2\().D, z\RSLT3\().D + + compact z\RSLT1\().S, PRED32_HALF2, zTMP0.S + mov z\RSLT1\().S, PRED32_HALF2/M, zTMP2.S + insr zTMP2.D, x0 + insr zTMP2.D, x0 + sel z\RSLT0\().S, PRED32_HALF1, zTMP0.S, zTMP2.S + + compact z\RSLT3\().S, PRED32_HALF2, zTMP1.S + mov z\RSLT3\().S, PRED32_HALF2/M, zTMP3.S + insr zTMP3.D, x0 + insr zTMP3.D, x0 + sel z\RSLT2\().S, PRED32_HALF1, zTMP1.S, zTMP3.S +.endm + +/* + * SNOW3G_INITIALIZE_8_SVE256_FIRST() + * + * uses + * zTMP0-8 + */ +.macro SNOW3G_INITIALIZE_8_SVE256_FIRST KEYADDR1 KEYADDR2 KEYADDR3 KEYADDR4 \ + KEYADDR5 KEYADDR6 KEYADDR7 KEYADDR8 \ + IVADDR1 IVADDR2 IVADDR3 IVADDR4 \ + IVADDR5 IVADDR6 IVADDR7 IVADDR8 + INTERLEAVE_IV_KEY_8 0, 4, 5, 6, 7, \ + \KEYADDR1\(), \KEYADDR2\(), \KEYADDR3\(), \KEYADDR4\(), \ + \KEYADDR5\(), \KEYADDR6\(), \KEYADDR7\(), \KEYADDR8\() + mov LFSR_S4.D, zTMP4.D + mov LFSR_S5.D, zTMP5.D + mov LFSR_S6.D, zTMP6.D + mov LFSR_S7.D, zTMP7.D + mov LFSR_S12.D, zTMP4.D + mov LFSR_S13.D, zTMP5.D + mov LFSR_S14.D, zTMP6.D + mov LFSR_S15.D, zTMP7.D + not LFSR_S0.S, PRED32/M, zTMP4.S + not LFSR_S1.S, PRED32/M, zTMP5.S + not LFSR_S2.S, PRED32/M, zTMP6.S + not LFSR_S3.S, PRED32/M, zTMP7.S + 
mov LFSR_S8.D, LFSR_S0.D + mov LFSR_S9.D, LFSR_S1.D + mov LFSR_S10.D, LFSR_S2.D + mov LFSR_S11.D, LFSR_S3.D + + INTERLEAVE_IV_KEY_8 1, 5, 6, 7, 8, \ + \IVADDR1\(), \IVADDR2\(), \IVADDR3\(), \IVADDR4\(), \ + \IVADDR5\(), \IVADDR6\(), \IVADDR7\(), \IVADDR8\() + + eor LFSR_S15.D, LFSR_S15.D, zTMP8.D + eor LFSR_S12.D, LFSR_S12.D, zTMP7.D + eor LFSR_S10.D, LFSR_S10.D, zTMP6.D + eor LFSR_S9.D, LFSR_S9.D, zTMP5.D + + mov FSM_R1.B, PRED8/Z, #0 + mov FSM_R2.B, PRED8/Z, #0 + mov FSM_R3.B, PRED8/Z, #0 +.endm + +/* + * SNOW3G_INITIALIZE_8_SVE256_SECOND() + * + * uses + * xTMP0, zTMP0-7 + */ +.macro SNOW3G_INITIALIZE_8_SVE256_SECOND +.rept 32 + CLOCK_FSM_8_SVE256 zTMP7 + CLOCK_LFSR_8_SVE256 + eor LFSR_S15.D, LFSR_S15.D, zTMP7.D +.endr + CLOCK_FSM_8_SVE256 zTMP7 + CLOCK_LFSR_8_SVE256 +.endm + +/* + * SNOW3G_LOAD_CTX_8_SVE256() + * + */ +.macro SNOW3G_LOAD_CTX_8_SVE256 ctx_addr + ld1b {LFSR_S0.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {LFSR_S1.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b {LFSR_S2.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + ld1b {LFSR_S3.B}, PRED8/Z, [\ctx_addr\(), #3, MUL VL] + ld1b {LFSR_S4.B}, PRED8/Z, [\ctx_addr\(), #4, MUL VL] + ld1b {LFSR_S5.B}, PRED8/Z, [\ctx_addr\(), #5, MUL VL] + ld1b {LFSR_S6.B}, PRED8/Z, [\ctx_addr\(), #6, MUL VL] + ld1b {LFSR_S7.B}, PRED8/Z, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + ld1b {LFSR_S8.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {LFSR_S9.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b {LFSR_S10.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + ld1b {LFSR_S11.B}, PRED8/Z, [\ctx_addr\(), #3, MUL VL] + ld1b {LFSR_S12.B}, PRED8/Z, [\ctx_addr\(), #4, MUL VL] + ld1b {LFSR_S13.B}, PRED8/Z, [\ctx_addr\(), #5, MUL VL] + ld1b {LFSR_S14.B}, PRED8/Z, [\ctx_addr\(), #6, MUL VL] + ld1b {LFSR_S15.B}, PRED8/Z, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + ld1b {FSM_R1.B}, PRED8/Z, [\ctx_addr\(), #0, MUL VL] + ld1b {FSM_R2.B}, PRED8/Z, [\ctx_addr\(), #1, MUL VL] + ld1b 
{FSM_R3.B}, PRED8/Z, [\ctx_addr\(), #2, MUL VL] + sub \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*16) +.endm + +/* + * SNOW3G_STORE_CTX_8_SVE256() + * + */ +.macro SNOW3G_STORE_CTX_8_SVE256 ctx_addr + st1b {LFSR_S0.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {LFSR_S1.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {LFSR_S2.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + st1b {LFSR_S3.B}, PRED8, [\ctx_addr\(), #3, MUL VL] + st1b {LFSR_S4.B}, PRED8, [\ctx_addr\(), #4, MUL VL] + st1b {LFSR_S5.B}, PRED8, [\ctx_addr\(), #5, MUL VL] + st1b {LFSR_S6.B}, PRED8, [\ctx_addr\(), #6, MUL VL] + st1b {LFSR_S7.B}, PRED8, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + st1b {LFSR_S8.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {LFSR_S9.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {LFSR_S10.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + st1b {LFSR_S11.B}, PRED8, [\ctx_addr\(), #3, MUL VL] + st1b {LFSR_S12.B}, PRED8, [\ctx_addr\(), #4, MUL VL] + st1b {LFSR_S13.B}, PRED8, [\ctx_addr\(), #5, MUL VL] + st1b {LFSR_S14.B}, PRED8, [\ctx_addr\(), #6, MUL VL] + st1b {LFSR_S15.B}, PRED8, [\ctx_addr\(), #7, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*8) + st1b {FSM_R1.B}, PRED8, [\ctx_addr\(), #0, MUL VL] + st1b {FSM_R2.B}, PRED8, [\ctx_addr\(), #1, MUL VL] + st1b {FSM_R3.B}, PRED8, [\ctx_addr\(), #2, MUL VL] + add \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*3) + str wzr, [\ctx_addr\()] + sub \ctx_addr\(), \ctx_addr\(), #(VECTOR_LEN*19) +.endm + +/* + * CLEAR_VECTORS_SVE256() + * + */ +.macro CLEAR_VECTORS_SVE256 + eor zTMP0.D, zTMP0.D, zTMP0.D + eor zTMP1.D, zTMP1.D, zTMP1.D + eor zTMP2.D, zTMP2.D, zTMP2.D + eor zTMP3.D, zTMP3.D, zTMP3.D + eor zTMP4.D, zTMP4.D, zTMP4.D + eor zTMP5.D, zTMP5.D, zTMP5.D + eor zTMP6.D, zTMP6.D, zTMP6.D + eor zTMP7.D, zTMP7.D, zTMP7.D + eor zTMP8.D, zTMP8.D, zTMP8.D + eor zTMP9.D, zTMP9.D, zTMP9.D + eor zTMP10.D, zTMP10.D, zTMP10.D + eor zTMP11.D, zTMP11.D, zTMP11.D + eor LFSR_S0.D, LFSR_S0.D, LFSR_S0.D + eor LFSR_S1.D, LFSR_S1.D, LFSR_S1.D + 
eor LFSR_S2.D, LFSR_S2.D, LFSR_S2.D + eor LFSR_S3.D, LFSR_S3.D, LFSR_S3.D + eor LFSR_S4.D, LFSR_S4.D, LFSR_S4.D + eor LFSR_S5.D, LFSR_S5.D, LFSR_S5.D + eor LFSR_S6.D, LFSR_S6.D, LFSR_S6.D + eor LFSR_S7.D, LFSR_S7.D, LFSR_S7.D + eor LFSR_S8.D, LFSR_S8.D, LFSR_S8.D + eor LFSR_S9.D, LFSR_S9.D, LFSR_S9.D + eor LFSR_S10.D, LFSR_S10.D, LFSR_S10.D + eor LFSR_S11.D, LFSR_S11.D, LFSR_S11.D + eor LFSR_S12.D, LFSR_S12.D, LFSR_S12.D + eor LFSR_S13.D, LFSR_S13.D, LFSR_S13.D + eor LFSR_S14.D, LFSR_S14.D, LFSR_S14.D + eor LFSR_S15.D, LFSR_S15.D, LFSR_S15.D + eor FSM_R1.D, FSM_R1.D, FSM_R1.D + eor FSM_R2.D, FSM_R2.D, FSM_R2.D + eor FSM_R3.D, FSM_R3.D, FSM_R3.D +.endm +/* + * snow3g_f8_8_buffer_initialize_aarch64_sve256_asm( + * void *ctx, + * snow3g_key_schedule_t **pKeySched, + * void **pIV) + */ +START_FUNC(snow3g_f8_8_buffer_initialize_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + mov xTMP17, x1 + mov xTMP18, x2 + + ldp xTMP0, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP0 xTMP1 xTMP2 xTMP3 xTMP4 xTMP5 xTMP6 xTMP7\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + SNOW3G_STORE_CTX_8_SVE256 x0 + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_initialize_aarch64_sve256_asm) + +#ifndef GATHER_SCATTER_IMPL +/* + * snow3g_f8_8_buffer_stream_aarch64_sve256_asm(void *ctx, + * void **in, + * void **out, + * uint32_t lengthInBytes) + * + */ +START_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + 
FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + mov xTMP17, x1 + mov xTMP18, x2 + mov xTMP19, x3 + + SNOW3G_LOAD_CTX_8_SVE256 x0 + ldp xTMP16, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + cmp xTMP19, #16 + b.lt GEN8 + +GEN16_LOOP: + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + ld1 {vTMP0.4S}, [xTMP16], #16 + ld1 {vTMP4.4S}, [xTMP1], #16 + ld1 {vTMP1.4S}, [xTMP2], #16 + ld1 {vTMP5.4S}, [xTMP3], #16 + ld1 {vTMP2.4S}, [xTMP4], #16 + ld1 {vTMP6.4S}, [xTMP5], #16 + ld1 {vTMP3.4S}, [xTMP6], #16 + ld1 {vTMP7.4S}, [xTMP7], #16 + insr zTMP4.D, x0 + insr zTMP5.D, x0 + insr zTMP6.D, x0 + insr zTMP7.D, x0 + insr zTMP4.D, x0 + insr zTMP5.D, x0 + insr zTMP6.D, x0 + insr zTMP7.D, x0 + mov zTMP0.S, PRED32_HALF2/M, zTMP4.S + mov zTMP1.S, PRED32_HALF2/M, zTMP5.S + mov zTMP2.S, PRED32_HALF2/M, zTMP6.S + mov zTMP3.S, PRED32_HALF2/M, zTMP7.S + eor zTMP0.D, zTMP0.D, zTMP8.D + eor zTMP1.D, zTMP1.D, zTMP9.D + eor zTMP2.D, zTMP2.D, zTMP10.D + eor zTMP3.D, zTMP3.D, zTMP11.D + + compact zTMP4.S, PRED32_HALF2, zTMP0.S + compact zTMP5.S, 
PRED32_HALF2, zTMP1.S + compact zTMP6.S, PRED32_HALF2, zTMP2.S + compact zTMP7.S, PRED32_HALF2, zTMP3.S + + st1 {vTMP0.4S}, [xTMP8], #16 + st1 {vTMP4.4S}, [xTMP9], #16 + st1 {vTMP1.4S}, [xTMP10], #16 + st1 {vTMP5.4S}, [xTMP11], #16 + st1 {vTMP2.4S}, [xTMP12], #16 + st1 {vTMP6.4S}, [xTMP13], #16 + st1 {vTMP3.4S}, [xTMP14], #16 + st1 {vTMP7.4S}, [xTMP15], #16 + + sub xTMP19, xTMP19, #16 + cmp xTMP19, #16 + b.ge GEN16_LOOP + +GEN8: + cmp xTMP19, #8 + b.lt GEN4 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + zip1 zTMP10.S, zTMP8.S, zTMP9.S + zip2 zTMP11.S, zTMP8.S, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + ld1 {vTMP0.D}[0], [xTMP16], #8 + ld1 {vTMP0.D}[1], [xTMP1], #8 + ld1 {vTMP1.D}[0], [xTMP2], #8 + ld1 {vTMP1.D}[1], [xTMP3], #8 + ld1 {vTMP2.D}[0], [xTMP4], #8 + ld1 {vTMP2.D}[1], [xTMP5], #8 + ld1 {vTMP3.D}[0], [xTMP6], #8 + ld1 {vTMP3.D}[1], [xTMP7], #8 + + compact zTMP4.S, PRED32_HALF2, zTMP10.S + compact zTMP5.S, PRED32_HALF2, zTMP11.S + + eor vTMP0.16B, vTMP0.16B, vTMP10.16B + eor vTMP1.16B, vTMP1.16B, vTMP4.16B + eor vTMP2.16B, vTMP2.16B, vTMP11.16B + eor vTMP3.16B, vTMP3.16B, vTMP5.16B + + st1 {vTMP0.D}[0], [xTMP8], #8 + st1 {vTMP0.D}[1], [xTMP9], #8 + st1 {vTMP1.D}[0], [xTMP10], #8 + st1 {vTMP1.D}[1], [xTMP11], #8 + st1 {vTMP2.D}[0], [xTMP12], #8 + st1 {vTMP2.D}[1], [xTMP13], #8 + st1 {vTMP3.D}[0], [xTMP14], #8 + st1 {vTMP3.D}[1], [xTMP15], #8 + + sub xTMP19, xTMP19, #8 + +GEN4: + cmp xTMP19, #4 + b.lt FINISH + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + revb zTMP8.S, PRED32/M, zTMP8.S + + ld1 {vTMP0.S}[0], [xTMP16], #4 + ld1 {vTMP0.S}[1], [xTMP1], #4 + ld1 {vTMP0.S}[2], [xTMP2], #4 + ld1 {vTMP0.S}[3], [xTMP3], #4 + ld1 {vTMP1.S}[0], [xTMP4], #4 + ld1 {vTMP1.S}[1], [xTMP5], #4 + ld1 {vTMP1.S}[2], [xTMP6], #4 + ld1 {vTMP1.S}[3], [xTMP7], #4 + + compact zTMP4.S, PRED32_HALF2, zTMP8.S + + eor vTMP0.16B, vTMP0.16B, vTMP8.16B + eor vTMP1.16B, vTMP1.16B, vTMP4.16B + + st1 {vTMP0.S}[0], [xTMP8], #4 + st1 
{vTMP0.S}[1], [xTMP9], #4 + st1 {vTMP0.S}[2], [xTMP10], #4 + st1 {vTMP0.S}[3], [xTMP11], #4 + st1 {vTMP1.S}[0], [xTMP12], #4 + st1 {vTMP1.S}[1], [xTMP13], #4 + st1 {vTMP1.S}[2], [xTMP14], #4 + st1 {vTMP1.S}[3], [xTMP15], #4 + +FINISH: + SNOW3G_STORE_CTX_8_SVE256 x0 + mov xTMP17, x1 + mov xTMP18, x2 + stp xTMP16, xTMP1, [xTMP17], #16 + stp xTMP2, xTMP3, [xTMP17], #16 + stp xTMP4, xTMP5, [xTMP17], #16 + stp xTMP6, xTMP7, [xTMP17] + stp xTMP8, xTMP9, [xTMP18], #16 + stp xTMP10, xTMP11, [xTMP18], #16 + stp xTMP12, xTMP13, [xTMP18], #16 + stp xTMP14, xTMP15, [xTMP18] + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + +#else + +/* + * snow3g_f8_8_buffer_stream_aarch64_sve256_asm(void *ctx, + * void **in, + * void **out, + * uint32_t lengthInBytes) + * + * NOTE: This implementation uses SVE gather load and scatter store, + * but the performance is 10% worse than implementation using + * contiguous load and store. + */ +START_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + cbz x3, FINISH_GS + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + SNOW3G_LOAD_CTX_8_SVE256 x0 + ld1d {zTMP10.D}, PRED32/Z, [x1] + ld1d {zTMP0.D}, PRED32/Z, [x1, #1, MUL VL] + uzp1 zTMP10.S, zTMP10.S, zTMP0.S + ld1d {zTMP11.D}, PRED32/Z, [x2] + ld1d {zTMP0.D}, PRED32/Z, [x2, #1, MUL VL] + uzp1 zTMP11.S, zTMP11.S, zTMP0.S + + ldr xTMP18, [x1] + ldr xTMP19, [x2] + bfm xTMP18, XZR, #0, #31 + bfm xTMP19, XZR, #0, #31 + mov xTMP17, #0 + +GEN4_LOOP: + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + revb zTMP8.S, PRED32/M, zTMP8.S + + ld1w {zTMP9.S}, PRED32/Z, [xTMP18, zTMP10.S, UXTW] + eor zTMP9.D, zTMP9.D, zTMP8.D + st1w {zTMP9.S}, PRED32, [xTMP19, zTMP11.S, UXTW] + add xTMP18, xTMP18, #4 + add xTMP19, xTMP19, #4 + add xTMP17, 
xTMP17, #4 + cmp xTMP17, x3 + b.lt GEN4_LOOP + + SNOW3G_STORE_CTX_8_SVE256 x0 + + cpy zTMP9.D, PRED32/M, x3 + ld1d {zTMP10.D}, PRED32/Z, [x1] + ld1d {zTMP11.D}, PRED32/Z, [x1, #1, MUL VL] + add zTMP10.D, zTMP10.D, zTMP9.D + add zTMP11.D, zTMP11.D, zTMP9.D + st1d {zTMP10.D}, PRED32, [x1] + st1d {zTMP11.D}, PRED32, [x1, #1, MUL VL] + + ld1d {zTMP10.D}, PRED32/Z, [x2] + ld1d {zTMP11.D}, PRED32/Z, [x2, #1, MUL VL] + add zTMP10.D, zTMP10.D, zTMP9.D + add zTMP11.D, zTMP11.D, zTMP9.D + st1d {zTMP10.D}, PRED32, [x2] + st1d {zTMP11.D}, PRED32, [x2, #1, MUL VL] + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE +FINISH_GS: + ret +END_FUNC(snow3g_f8_8_buffer_stream_aarch64_sve256_asm) + +#endif + +/* OUT = IN XOR OUT + * use this macro to generate output when LEN is less than 16 + * use: vTMP0 */ +.macro X_BYTE_STREAM IN, OUT, KEY, LEN + cmp \LEN\(), #8 + b.lt 4f + ld1 {vTMP0.D}[0], [\IN\()], #8 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.D}[0], [\OUT\()], #8 + mov \KEY\().D[0], \KEY\().D[1] + sub \LEN\(), \LEN\(), #8 +4: + cmp \LEN\(), #4 + b.lt 2f + ld1 {vTMP0.S}[0], [\IN\()], #4 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.S}[0], [\OUT\()], #4 + mov \KEY\().S[0], \KEY\().S[1] + sub \LEN\(), \LEN\(), #4 +2: + cmp \LEN\(), #2 + b.lt 1f + ld1 {vTMP0.H}[0], [\IN\()], #2 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.H}[0], [\OUT\()], #2 + mov \KEY\().H[0], \KEY\().H[1] + sub \LEN\(), \LEN\(), #2 +1: + cmp \LEN\(), #1 + b.lt 0f + ld1 {vTMP0.B}[0], [\IN\()], #1 + eor vTMP0.16B, vTMP0.16B, \KEY\().16B + st1 {vTMP0.B}[0], [\OUT\()], #1 +0: +.endm + +.macro GEN_1_TO_8_LANES LANE_NR SUFFIX + CURR_LEN .req wTMP17 + LEFT_LEN .req wTMP17 + LEN .req wTMP18 + IN1 .req xTMP1 + IN2 .req xTMP2 + IN3 .req xTMP3 + IN4 .req xTMP4 + IN5 .req xTMP5 + IN6 .req xTMP6 + IN7 .req xTMP7 + IN8 .req xTMP8 + OUT1 .req xTMP9 + OUT2 .req xTMP10 + OUT3 .req xTMP11 + OUT4 .req xTMP12 + OUT5 .req xTMP13 + OUT6 .req xTMP14 + OUT7 .req xTMP15 + OUT8 .req xTMP16 + // lanes are sorted by length 
decrease + // lane1.len >= lane2.len >= .... lane8.len + // load length + ldr CURR_LEN, [x4, #(4*(\LANE_NR\()-1))] +GEN_\LANE_NR\()LANES\SUFFIX\(): + cmp CURR_LEN, LEN + b.lt FINISH_\LANE_NR\()TH\SUFFIX\() + + // load 16byte x LANE_NR input + ld1 {vTMP0.4S}, [IN1], #16 +.ifge \LANE_NR - 2 + ld1 {vTMP4.4S}, [IN2], #16 +.ifge \LANE_NR - 3 + ld1 {vTMP1.4S}, [IN3], #16 +.ifge \LANE_NR - 4 + ld1 {vTMP5.4S}, [IN4], #16 +.ifge \LANE_NR - 5 + ld1 {vTMP2.4S}, [IN5], #16 +.ifge \LANE_NR - 6 + ld1 {vTMP6.4S}, [IN6], #16 +.ifge \LANE_NR - 7 + ld1 {vTMP3.4S}, [IN7], #16 +.ifge \LANE_NR - 8 + ld1 {vTMP7.4S}, [IN8], #16 +.endif +.endif +.endif +.endif +.endif +.endif +.endif + + // merge 16byte x LANE_NR input into at most 4 SVE registers +.rept 2 +.ifge \LANE_NR - 2 + insr zTMP4.D, x0 +.ifge \LANE_NR - 4 + insr zTMP5.D, x0 +.ifge \LANE_NR - 6 + insr zTMP6.D, x0 +.ifge \LANE_NR - 8 + insr zTMP7.D, x0 +.endif +.endif +.endif +.endif +.endr +.ifge \LANE_NR - 2 + mov zTMP0.S, PRED32_HALF2/M, zTMP4.S +.ifge \LANE_NR - 4 + mov zTMP1.S, PRED32_HALF2/M, zTMP5.S +.ifge \LANE_NR - 6 + mov zTMP2.S, PRED32_HALF2/M, zTMP6.S +.ifge \LANE_NR - 8 + mov zTMP3.S, PRED32_HALF2/M, zTMP7.S +.endif +.endif +.endif +.endif + + // XOR with generated keystream + eor zTMP0.D, zTMP0.D, zTMP8.D +.ifge \LANE_NR - 3 + eor zTMP1.D, zTMP1.D, zTMP9.D +.ifge \LANE_NR - 5 + eor zTMP2.D, zTMP2.D, zTMP10.D +.ifge \LANE_NR - 7 + eor zTMP3.D, zTMP3.D, zTMP11.D +.endif +.endif +.endif + + // compact SVE register into NEON register for store +.ifge \LANE_NR - 2 + compact zTMP4.S, PRED32_HALF2, zTMP0.S +.ifge \LANE_NR - 4 + compact zTMP5.S, PRED32_HALF2, zTMP1.S +.ifge \LANE_NR - 6 + compact zTMP6.S, PRED32_HALF2, zTMP2.S +.ifge \LANE_NR - 8 + compact zTMP7.S, PRED32_HALF2, zTMP3.S +.endif +.endif +.endif +.endif + + // store to 16byte x LANE_NR output + st1 {vTMP0.4S}, [OUT1], #16 +.ifge \LANE_NR - 2 + st1 {vTMP4.4S}, [OUT2], #16 +.ifge \LANE_NR - 3 + st1 {vTMP1.4S}, [OUT3], #16 +.ifge \LANE_NR - 4 + st1 {vTMP5.4S}, 
[OUT4], #16 +.ifge \LANE_NR - 5 + st1 {vTMP2.4S}, [OUT5], #16 +.ifge \LANE_NR - 6 + st1 {vTMP6.4S}, [OUT6], #16 +.ifge \LANE_NR - 7 + st1 {vTMP3.4S}, [OUT7], #16 +.ifge \LANE_NR - 8 + st1 {vTMP7.4S}, [OUT8], #16 +.endif +.endif +.endif +.endif +.endif +.endif +.endif + + // update number of generated output + add LEN, LEN, #16 + + // generate 16byte x 8lanes keystream + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D +.ifge \LANE_NR - 3 + zip2 zTMP9.D, zTMP0.D, zTMP2.D +.ifge \LANE_NR - 5 + zip1 zTMP10.D, zTMP1.D, zTMP3.D +.ifge \LANE_NR - 7 + zip2 zTMP11.D, zTMP1.D, zTMP3.D +.endif +.endif +.endif + + revb zTMP8.S, PRED32/M, zTMP8.S +.ifge \LANE_NR - 3 + revb zTMP9.S, PRED32/M, zTMP9.S +.ifge \LANE_NR - 5 + revb zTMP10.S, PRED32/M, zTMP10.S +.ifge \LANE_NR - 7 + revb zTMP11.S, PRED32/M, zTMP11.S +.endif +.endif +.endif + b GEN_\LANE_NR\()LANES\SUFFIX\() + +FINISH_\LANE_NR\()TH\SUFFIX\(): + add CURR_LEN, CURR_LEN, 16 + sub LEFT_LEN, CURR_LEN, LEN +.if \LANE_NR == 8 + compact zTMP1.S, PRED32_HALF2, zTMP11.S + X_BYTE_STREAM IN8, OUT8, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 7 + X_BYTE_STREAM IN7, OUT7, vTMP11, LEFT_LEN +.endif +.if \LANE_NR == 6 + compact zTMP1.S, PRED32_HALF2, zTMP10.S + X_BYTE_STREAM IN6, OUT6, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 5 + X_BYTE_STREAM IN5, OUT5, vTMP10, LEFT_LEN +.endif +.if \LANE_NR == 4 + compact zTMP1.S, PRED32_HALF2, zTMP9.S + X_BYTE_STREAM IN4, OUT4, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 3 + X_BYTE_STREAM IN3, OUT3, vTMP9, LEFT_LEN +.endif +.if \LANE_NR == 2 + compact zTMP1.S, PRED32_HALF2, zTMP8.S + X_BYTE_STREAM IN2, OUT2, vTMP1, LEFT_LEN +.endif +.if \LANE_NR == 1 + X_BYTE_STREAM IN1, OUT1, vTMP8, LEFT_LEN +.endif +.endm + +/* + * 
snow3g_f8_8_buffer_aarch64_sve256_asm(void *key, + * void **iv, + * void **in, + * void **out, + * uint32_t lengthInBytes[]) + * + */ +START_FUNC(snow3g_f8_8_buffer_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + // key + mov xTMP17, x0 + // iv + mov xTMP18, x1 + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17 xTMP17\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + + mov xTMP17, x2 + mov xTMP18, x3 + + // in + ldp xTMP1, xTMP2, [xTMP17], #16 + ldp xTMP3, xTMP4, [xTMP17], #16 + ldp xTMP5, xTMP6, [xTMP17], #16 + ldp xTMP7, xTMP8, [xTMP17] + // out + ldp xTMP9, xTMP10, [xTMP18], #16 + ldp xTMP11, xTMP12, [xTMP18], #16 + ldp xTMP13, xTMP14, [xTMP18], #16 + ldp xTMP15, xTMP16, [xTMP18] + + mov wTMP18, #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb zTMP11.S, PRED32/M, zTMP11.S + + GEN_1_TO_8_LANES 8 _SINGLE_KEY + GEN_1_TO_8_LANES 7 _SINGLE_KEY + GEN_1_TO_8_LANES 6 _SINGLE_KEY + GEN_1_TO_8_LANES 5 _SINGLE_KEY + GEN_1_TO_8_LANES 4 _SINGLE_KEY + GEN_1_TO_8_LANES 3 _SINGLE_KEY + GEN_1_TO_8_LANES 2 _SINGLE_KEY + GEN_1_TO_8_LANES 1 
_SINGLE_KEY + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_aarch64_sve256_asm) + +/* + * snow3g_f8_8_buffer_multikey_aarch64_sve256_asm(void **key, + * void **iv, + * void **in, + * void **out, + * uint32_t lengthInBytes[]) + * + */ +START_FUNC(snow3g_f8_8_buffer_multikey_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + // key + mov xTMP17, x0 + // iv + mov xTMP18, x1 + + // key + ldp xTMP0, xTMP1, [xTMP17], #16 + ldp xTMP2, xTMP3, [xTMP17], #16 + ldp xTMP4, xTMP5, [xTMP17], #16 + ldp xTMP6, xTMP7, [xTMP17] + // iv + ldp xTMP8, xTMP9, [xTMP18], #16 + ldp xTMP10, xTMP11, [xTMP18], #16 + ldp xTMP12, xTMP13, [xTMP18], #16 + ldp xTMP14, xTMP15, [xTMP18] + + SNOW3G_INITIALIZE_8_SVE256_FIRST xTMP0 xTMP1 xTMP2 xTMP3 xTMP4 xTMP5 xTMP6 xTMP7\ + xTMP8 xTMP9 xTMP10 xTMP11 xTMP12 xTMP13 xTMP14 xTMP15 + SNOW3G_INITIALIZE_8_SVE256_SECOND + + mov xTMP17, x2 + mov xTMP18, x3 + + // in + ldp xTMP1, xTMP2, [xTMP17], #16 + ldp xTMP3, xTMP4, [xTMP17], #16 + ldp xTMP5, xTMP6, [xTMP17], #16 + ldp xTMP7, xTMP8, [xTMP17] + // out + ldp xTMP9, xTMP10, [xTMP18], #16 + ldp xTMP11, xTMP12, [xTMP18], #16 + ldp xTMP13, xTMP14, [xTMP18], #16 + ldp xTMP15, xTMP16, [xTMP18] + + mov wTMP18, #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + revb zTMP8.S, PRED32/M, zTMP8.S + revb zTMP9.S, PRED32/M, zTMP9.S + revb zTMP10.S, PRED32/M, zTMP10.S + revb 
zTMP11.S, PRED32/M, zTMP11.S + + GEN_1_TO_8_LANES 8 _MULTI_KEY + GEN_1_TO_8_LANES 7 _MULTI_KEY + GEN_1_TO_8_LANES 6 _MULTI_KEY + GEN_1_TO_8_LANES 5 _MULTI_KEY + GEN_1_TO_8_LANES 4 _MULTI_KEY + GEN_1_TO_8_LANES 3 _MULTI_KEY + GEN_1_TO_8_LANES 2 _MULTI_KEY + GEN_1_TO_8_LANES 1 _MULTI_KEY + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f8_8_buffer_multikey_aarch64_sve256_asm) + +/* + * snow3g_f9_8_buffer_keystream_aarch64_sve256_asm(void *pCtx, + * uint32_t* ks) + * + */ +START_FUNC(snow3g_f9_8_buffer_keystream_aarch64_sve256_asm) + FUNC_SCALAR_SAVE + ptrue PRED8.B, ALL + ptrue PRED32.S, ALL + ptrue PRED32_HALF1.S, VL4 + not PRED32_HALF2.B, PRED32/Z, PRED32_HALF1.B + FUNC_VECTOR_SAVE + adrp xTMP0, n_inv_aes_shift_row + add xTMP0, xTMP0, #:lo12:n_inv_aes_shift_row + ld1b {zINV_SHIFT_ROW.B}, PRED8/Z, [xTMP0] + + mov xTMP8, x1 + add xTMP9, xTMP8, #20 + add xTMP10, xTMP9, #20 + add xTMP11, xTMP10, #20 + add xTMP12, xTMP11, #20 + add xTMP13, xTMP12, #20 + add xTMP14, xTMP13, #20 + add xTMP15, xTMP14, #20 + + SNOW3G_LOAD_CTX_8_SVE256 x0 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP9 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP10 + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP11 + zip1 zTMP0.S, zTMP8.S, zTMP9.S + zip2 zTMP1.S, zTMP8.S, zTMP9.S + zip1 zTMP2.S, zTMP10.S, zTMP11.S + zip2 zTMP3.S, zTMP10.S, zTMP11.S + zip1 zTMP8.D, zTMP0.D, zTMP2.D + zip2 zTMP9.D, zTMP0.D, zTMP2.D + zip1 zTMP10.D, zTMP1.D, zTMP3.D + zip2 zTMP11.D, zTMP1.D, zTMP3.D + compact zTMP4.S, PRED32_HALF2, zTMP8.S + compact zTMP5.S, PRED32_HALF2, zTMP9.S + compact zTMP6.S, PRED32_HALF2, zTMP10.S + compact zTMP7.S, PRED32_HALF2, zTMP11.S + + st1 {vTMP8.4S}, [xTMP8], #16 + st1 {vTMP4.4S}, [xTMP9], #16 + st1 {vTMP9.4S}, [xTMP10], #16 + st1 {vTMP5.4S}, [xTMP11], #16 + st1 {vTMP10.4S}, [xTMP12], #16 + st1 {vTMP6.4S}, [xTMP13], #16 + st1 {vTMP11.4S}, [xTMP14], #16 + st1 {vTMP7.4S}, [xTMP15], #16 + + SNOW3G_KEYSTREAM_8_4_SVE256 zTMP8 + compact zTMP4.S, PRED32_HALF2, zTMP8.S + + st1 
{vTMP8.S}[0], [xTMP8], #4 + st1 {vTMP8.S}[1], [xTMP9], #4 + st1 {vTMP8.S}[2], [xTMP10], #4 + st1 {vTMP8.S}[3], [xTMP11], #4 + st1 {vTMP4.S}[0], [xTMP12], #4 + st1 {vTMP4.S}[1], [xTMP13], #4 + st1 {vTMP4.S}[2], [xTMP14], #4 + st1 {vTMP4.S}[3], [xTMP15], #4 + + FUNC_VECTOR_RESTORE + FUNC_SCALAR_RESTORE + ret +END_FUNC(snow3g_f9_8_buffer_keystream_aarch64_sve256_asm) \ No newline at end of file diff --git a/lib/aarch64/snow3g_internal.h b/lib/aarch64/snow3g_internal.h index 8b7e79224a5ceee3e24418ba862b92bd28d6cb82..dfe8adaec8ea297e169dbddf10b9940bd9f36c12 100644 --- a/lib/aarch64/snow3g_internal.h +++ b/lib/aarch64/snow3g_internal.h @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2022 Arm Corporation All rights reserved. + Copyright(c) 2022-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -63,9 +63,9 @@ typedef struct snow3gKeyState1_s { /* 16 LFSR stages */ uint32_t LFSR_S[16]; /* 3 FSM states */ - uint32_t FSM_R3; - uint32_t FSM_R2; uint32_t FSM_R1; + uint32_t FSM_R2; + uint32_t FSM_R3; } DECLARE_ALIGNED(snow3gKeyState1_t, 16); typedef struct snow3gKeyState4_s { @@ -76,6 +76,14 @@ typedef struct snow3gKeyState4_s { uint32_t iLFSR_X; } snow3gKeyState4_t; +typedef struct snow3gKeyState8_s { + /* 16 LFSR stages */ + uint32x4x2_t LFSR_X[16]; + /* 3 FSM states */ + uint32x4x2_t FSM_X[3]; + uint32_t iLFSR_X; +} snow3gKeyState8_t; + /** * @brief Finds minimum 32-bit value in an array * @return Min 32-bit value @@ -134,6 +142,32 @@ length_check(const uint32_t *out_array, const size_t dim_array) return 1; } + +/** + * @brief Checks vector of length values against 0 and SNOW3G_MAX_BYTELEN values + * @retval 0 incorrect length value found + * @retval 1 all OK + */ +static inline uint32_t +length64_check(const uint64_t *out_array, const size_t dim_array) +{ + size_t i; + + if (out_array == NULL) 
{ + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return 0; + } + + for (i = 0; i < dim_array; i++) { + if ((out_array[i] == 0) || + (out_array[i] > SNOW3G_MAX_BYTELEN)) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return 0; + } + } + + return 1; +} #endif /** diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 982aaee8dc8706ff035027be1c8fead67fcffeff..67f6230c76988146a6381001d70938fca61d68fb 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -785,5 +785,7 @@ IMB_DLL_LOCAL void init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void +init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs); #endif /* IMB_IPSEC_MB_INTERNAL_H */ diff --git a/lib/include/snow3g.h b/lib/include/snow3g.h index 9bf40ae85f1cec766b1037793dfbd9cde5eee188..12326b0b2ebf00d54a335bcf58473fc699ee7835 100644 --- a/lib/include/snow3g.h +++ b/lib/include/snow3g.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2009-2022, Intel Corporation + Copyright (c) 2009-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -782,58 +782,172 @@ snow3g_f8_n_buffer_multikey_aarch64(const snow3g_key_schedule_t * const pCtx[], void *pBufferOut[], const uint32_t bufferLenInBytes[], const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + 
const uint64_t lengthInBits, + void *pDigest); + +size_t +snow3g_key_sched_size_aarch64(void); + +int +snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); + +/******************************************************************************* + * SVE + ******************************************************************************/ void -snow3g_f8_4_buffer_initialize_aarch64(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4); +snow3g_f8_1_buffer_bit_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, + const void *pBufferIn, + void *pBufferOut, + const uint32_t cipherLengthInBits, + const uint32_t offsetInBits); void -snow3g_f8_1_buffer_stream_aarch64(void *pCtx, +snow3g_f8_1_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, const void *pBufferIn, void *pBufferOut, const uint32_t lengthInBytes); void -snow3g_f8_4_buffer_stream_aarch64(void *pCtx, +snow3g_f8_2_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2); + +void +snow3g_f8_4_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, const void *pBufferIn1, void *pBufferOut1, + const uint32_t lengthInBytes1, const void *pBufferIn2, void *pBufferOut2, + const uint32_t lengthInBytes2, const void *pBufferIn3, void *pBufferOut3, + const uint32_t lengthInBytes3, const void *pBufferIn4, void *pBufferOut4, - const uint32_t lengthInBytes); - -void -snow3g_f9_1_buffer_aarch64(const snow3g_key_schedule_t *pCtx, - const void *pIV, - const void *pBufferIn, - const uint64_t 
lengthInBits, - void *pDigest); + const uint32_t lengthInBytes4); void -snow3g_f9_1_buffer_digest_aarch64(const uint32_t z[5], +snow3g_f8_4_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t *pCtx1, + const snow3g_key_schedule_t *pCtx2, + const snow3g_key_schedule_t *pCtx3, + const snow3g_key_schedule_t *pCtx4, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4); + +void +snow3g_f8_8_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV1, + const void *pIV2, + const void *pIV3, + const void *pIV4, + const void *pIV5, + const void *pIV6, + const void *pIV7, + const void *pIV8, + const void *pBufferIn1, + void *pBufferOut1, + const uint32_t lengthInBytes1, + const void *pBufferIn2, + void *pBufferOut2, + const uint32_t lengthInBytes2, + const void *pBufferIn3, + void *pBufferOut3, + const uint32_t lengthInBytes3, + const void *pBufferIn4, + void *pBufferOut4, + const uint32_t lengthInBytes4, + const void *pBufferIn5, + void *pBufferOut5, + const uint32_t lengthInBytes5, + const void *pBufferIn6, + void *pBufferOut6, + const uint32_t lengthInBytes6, + const void *pBufferIn7, + void *pBufferOut7, + const uint32_t lengthInBytes7, + const void *pBufferIn8, + void *pBufferOut8, + const uint32_t lengthInBytes8); + +void +snow3g_f8_8_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t * const pCtx[], + const void * const pIV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t lengthInBytes[]); + +void +snow3g_f8_n_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t 
bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f8_n_buffer_multikey_aarch64_sve256(const snow3g_key_schedule_t * const pCtx[], + const void * const IV[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t bufferLenInBytes[], + const uint32_t bufferCount); + +void +snow3g_f9_1_buffer_aarch64_sve256(const snow3g_key_schedule_t *pCtx, + const void *pIV, const void *pBufferIn, const uint64_t lengthInBits, void *pDigest); -void -snow3g_f9_4_buffer_keystream_aarch64(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]); - size_t -snow3g_key_sched_size_aarch64(void); +snow3g_key_sched_size_aarch64_sve256(void); int -snow3g_init_key_sched_aarch64(const void *pKey, snow3g_key_schedule_t *pCtx); +snow3g_init_key_sched_aarch64_sve256(const void *pKey, + snow3g_key_schedule_t *pCtx); + + +void +snow3g_f8_8_buffer_initialize_aarch64_sve256(void *pCtx, + const snow3g_key_schedule_t **pKeySched, + const void **pIV); +void +snow3g_f8_8_buffer_stream_aarch64_sve256(void *pCtx, + const void **in, + void **out, + uint32_t lengthInBytes); /******************************************************************************* * AARCH64 NO-AESNI @@ -965,33 +1079,6 @@ snow3g_f8_n_buffer_multikey_aarch64_no_aesni(const snow3g_key_schedule_t * const const uint32_t bufferLenInBytes[], const uint32_t bufferCount); -void -snow3g_f8_4_buffer_initialize_aarch64_no_aesni(void *pCtx, - const snow3g_key_schedule_t *pKeySched1, - const snow3g_key_schedule_t *pKeySched2, - const snow3g_key_schedule_t *pKeySched3, - const snow3g_key_schedule_t *pKeySched4, - const void *pIV1, const void *pIV2, - const void *pIV3, const void *pIV4); - -void -snow3g_f8_1_buffer_stream_aarch64_no_aesni(void *pCtx, - const void *pBufferIn, - void *pBufferOut, - const uint32_t lengthInBytes); - -void -snow3g_f8_4_buffer_stream_aarch64_no_aesni(void *pCtx, - const void *pBufferIn1, - void *pBufferOut1, - const void *pBufferIn2, - void *pBufferOut2, - 
const void *pBufferIn3, - void *pBufferOut3, - const void *pBufferIn4, - void *pBufferOut4, - const uint32_t lengthInBytes); - void snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, const void *pIV, @@ -999,19 +1086,6 @@ snow3g_f9_1_buffer_aarch64_no_aesni(const snow3g_key_schedule_t *pCtx, const uint64_t lengthInBits, void *pDigest); -void -snow3g_f9_1_buffer_digest_aarch64_no_aesni(const uint32_t z[5], - const void *pBufferIn, - const uint64_t lengthInBits, - void *pDigest); - -void -snow3g_f9_4_buffer_keystream_aarch64_no_aesni(void *pCtx, - uint32_t ks1[5], - uint32_t ks2[5], - uint32_t ks3[5], - uint32_t ks4[5]); - size_t snow3g_key_sched_size_aarch64_no_aesni(void); diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 4de506b435f128f12b886f0abb394e193df64891..04b61e1ddafcb743fd45814a433a6a7716175df1 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -124,6 +124,7 @@ typedef enum { IMB_ARCH_AVX2, IMB_ARCH_AVX512, IMB_ARCH_AARCH64, + IMB_ARCH_SVE256, IMB_ARCH_NUM, } IMB_ARCH; @@ -1077,6 +1078,7 @@ typedef uint32_t (*crc32_fn_t)(const void *, const uint64_t); #define IMB_FEATURE_AARCH64 (1ULL << 32) #define IMB_FEATURE_ASIMD (1ULL << 33) #define IMB_FEATURE_PMULL (1ULL << 34) +#define IMB_FEATURE_SVE256 (1ULL << 35) /* TOP LEVEL (IMB_MGR) Data structure fields */ @@ -1556,6 +1558,13 @@ IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64(IMB_MGR *state); IMB_DLL_EXPORT uint32_t queue_size_aarch64(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64(IMB_MGR *state); +IMB_DLL_EXPORT void init_mb_mgr_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB 
*submit_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *submit_job_nocheck_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *flush_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT uint32_t queue_size_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_completed_job_aarch64_sve256(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB *get_next_job_aarch64_sve256(IMB_MGR *state); /** * @brief Automatically initialize most performant diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index d3e96a386602c608385d9bc9ab1140697aefb250..55d392aeb7b602a55ed03b4295f1f199e8d5320c 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -1,5 +1,5 @@ /********************************************************************** - Copyright(c) 2017-2022, Intel Corporation All rights reserved. + Copyright(c) 2017-2023, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions @@ -123,6 +123,7 @@ enum arch_type_e { ARCH_AVX2, ARCH_AVX512, ARCH_AARCH64, + ARCH_SVE256, NUM_ARCHS }; @@ -242,6 +243,7 @@ const struct str_value_mapping arch_str_map[] = { {.name = "AVX2", .values.arch_type = ARCH_AVX2 }, {.name = "AVX512", .values.arch_type = ARCH_AVX512 }, {.name = "AARCH64",.values.arch_type = ARCH_AARCH64 }, + {.name = "SVE256", .values.arch_type = ARCH_SVE256 }, }; const struct str_value_mapping cipher_algo_str_map[] = { @@ -947,7 +949,7 @@ struct custom_job_params custom_job_params = { .cipher_dir = IMB_DIR_ENCRYPT }; -uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1}; /* uses all function sets */ +uint8_t archs[NUM_ARCHS] = {1, 1, 1, 1, 1, 1}; /* uses all function sets */ int use_job_api = 0; int use_gcm_sgl_api = 0; int use_unhalted_cycles = 0; /* read unhalted cycles instead of tsc */ @@ -2994,7 +2996,7 @@ print_times(struct variant_s *variant_list, struct params_s *params, if (plot_output_option == 0) { const char *func_names[NUM_ARCHS] = { - "SSE", "AVX", 
"AVX2", "AVX512", "AARCH64" + "SSE", "AVX", "AVX2", "AVX512", "AARCH64", "SVE256" }; const char *c_mode_names[TEST_NUM_CIPHER_TESTS - 1] = { "CBC", "CNTR", "CNTR+8", "CNTR_BITLEN", "CNTR_BITLEN4", @@ -3234,6 +3236,9 @@ run_tests(void *arg) case ARCH_AARCH64: init_mb_mgr_aarch64(p_mgr); break; + case ARCH_SVE256: + init_mb_mgr_aarch64_sve256(p_mgr); + break; #endif /* __aarch64__ */ default: fprintf(stderr, "Invalid architecture: %d\n", arch); @@ -3305,7 +3310,7 @@ static void usage(void) "-h: print this message\n" "-c: Use cold cache, it uses warm as default\n" "-w: Use warm cache\n" - "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64)\n" + "--arch: run only tests on specified architecture (SSE/AVX/AVX2/AVX512/AARCH64/SVE)\n" "--arch-best: detect available architectures and run only on the best one\n" "--cipher-dir: Select cipher direction to run on the custom test " "(encrypt/decrypt) (default = encrypt)\n" @@ -3420,6 +3425,7 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; IMB_MGR *p_mgr = NULL; enum arch_type_e arch_id; @@ -3452,6 +3458,9 @@ detect_arch(unsigned int arch_support[NUM_ARCHS]) if ((p_mgr->features & detect_aarch64) != detect_aarch64) arch_support[ARCH_AARCH64] = 0; + if ((p_mgr->features & detect_sve256) != detect_sve256) + arch_support[ARCH_SVE256] = 0; + free_mb_mgr(p_mgr); return 0; @@ -3654,6 +3663,7 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) const uint64_t detect_avx2 = IMB_FEATURE_AVX2 | detect_avx; const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; IMB_MGR *p_mgr = 
NULL; uint64_t detected_features = 0; @@ -3699,6 +3709,10 @@ detect_best_arch(uint8_t arch_support[NUM_ARCHS]) return 0; } + if ((detected_features & detect_sve256) == detect_sve256) { + arch_support[ARCH_SVE256] = 1; + return 0; + } fprintf(stderr, "Arch detection: no architecture available!\n"); return -1; } diff --git a/test/main.c b/test/main.c index 8dd4033495596b93acac9e8076a580d0316229c1..bc61efc718a39c86ad1ba75d9d080e2751b5cdb2 100644 --- a/test/main.c +++ b/test/main.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2012-2022, Intel Corporation + Copyright (c) 2012-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -316,6 +316,7 @@ print_hw_features(void) { IMB_FEATURE_AVX512_IFMA, "AVX512-IFMA" }, { IMB_FEATURE_BMI2, "BMI2" }, { IMB_FEATURE_AARCH64, "AARCH64" }, + { IMB_FEATURE_SVE256, "SVE256" }, }; IMB_MGR *p_mgr = NULL; unsigned i; @@ -471,6 +472,9 @@ main(int argc, char **argv) case IMB_ARCH_AARCH64: init_mb_mgr_aarch64(p_mgr); break; + case IMB_ARCH_SVE256: + init_mb_mgr_aarch64_sve256(p_mgr); + break; #endif #ifdef __x86_64__ diff --git a/test/utils.c b/test/utils.c index a894d81f58949af64fcbfd5f6b9ebf8bc9a25991..44e8b36da35f1cba42960b175a8a6646b2056ea1 100644 --- a/test/utils.c +++ b/test/utils.c @@ -1,5 +1,5 @@ /***************************************************************************** - Copyright (c) 2018-2022, Intel Corporation + Copyright (c) 2018-2023, Intel Corporation Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -220,6 +220,7 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) const uint64_t detect_avx512 = IMB_FEATURE_AVX512_SKX | detect_avx2; const uint64_t detect_aarch64 = IMB_FEATURE_AARCH64 | IMB_FEATURE_AESNI; + const uint64_t detect_sve256 = 
IMB_FEATURE_AARCH64 | IMB_FEATURE_SVE256; #ifdef __x86_64__ const uint64_t detect_noaesni = IMB_FEATURE_SSE4_2 | IMB_FEATURE_CMOV; @@ -264,6 +265,10 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) if ((p_mgr->features & detect_aarch64) != detect_aarch64) arch_support[IMB_ARCH_AARCH64] = 0; + if ((p_mgr->features & detect_sve256) != detect_sve256) { + arch_support[IMB_ARCH_SVE256] = 0; + } + free_mb_mgr(p_mgr); if (arch_support[IMB_ARCH_NOAESNI] == 0 && @@ -271,7 +276,8 @@ detect_arch(uint8_t arch_support[IMB_ARCH_NUM]) arch_support[IMB_ARCH_AVX] == 0 && arch_support[IMB_ARCH_AVX2] == 0 && arch_support[IMB_ARCH_AVX512] == 0 && - arch_support[IMB_ARCH_AARCH64] == 0) { + arch_support[IMB_ARCH_AARCH64] == 0 && + arch_support[IMB_ARCH_SVE256] == 0) { fprintf(stderr, "No available architecture detected!\n"); return -1; } @@ -289,7 +295,7 @@ void print_tested_arch(const uint64_t features, const IMB_ARCH arch) { static const char *arch_str_tab[IMB_ARCH_NUM] = { - "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64" + "NONE", "NO-AESNI", "SSE", "AVX", "AVX2", "AVX512", "AARCH64", "SVE256" }; const char *feat = ""; @@ -298,6 +304,7 @@ print_tested_arch(const uint64_t features, const IMB_ARCH arch) case IMB_ARCH_AVX2: case IMB_ARCH_AVX: case IMB_ARCH_AARCH64: + case IMB_ARCH_SVE256: break; case IMB_ARCH_SSE: if (features & IMB_FEATURE_SHANI) {