From 7c8a58d8babc67d08cb400f0ced3bc1d4269ce08 Mon Sep 17 00:00:00 2001
From: "fisher.yu"
Date: Tue, 11 Apr 2023 06:43:01 +0000
Subject: [PATCH 1/2] Modify MB manager initialization implementation.

1. Use macros to point to the submit/flush functions for each uarch,
   instead of using function pointers.
2. Move reset_ooo_mgrs() to mb_mgr_code_aarch64.h, to reduce duplicated
   code.

Change-Id: I9d95cce82274d7da111c35d63fe5ee2b1485922d
---
 lib/aarch64/mb_mgr_aarch64.c          | 231 ++------------------------
 lib/aarch64/mb_mgr_aarch64_no_aesni.c | 122 +-------------
 lib/aarch64/mb_mgr_aarch64_sve256.c   | 231 ++------------------------
 lib/aarch64/mb_mgr_code_aarch64.h     | 103 ++++++++++++
 4 files changed, 132 insertions(+), 555 deletions(-)

diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c
index 28612733..935cba1e 100644
--- a/lib/aarch64/mb_mgr_aarch64.c
+++ b/lib/aarch64/mb_mgr_aarch64.c
@@ -40,55 +40,6 @@
 #include "include/noaesni.h"
 #include "include/ipsec_ooo_mgr.h"

-IMB_JOB *submit_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state,
-                                              IMB_JOB *job);
-IMB_JOB *flush_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state,
-                                            IMB_JOB *job);
-IMB_JOB *flush_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state,
-                                                 IMB_JOB *job);
-IMB_JOB *flush_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state,
-                                               IMB_JOB *job);
-IMB_JOB *flush_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state,
-                                              IMB_JOB *job);
-IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state,
-                                            IMB_JOB *job);
-IMB_JOB *flush_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state,
-                                                 IMB_JOB *job);
-IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state,
-                                               IMB_JOB *job);
-IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state);
-
-IMB_JOB *submit_job_snow3g_uea2_aarch64_common(IMB_MGR *state,
-                                               IMB_JOB *job);
-IMB_JOB *flush_job_snow3g_uea2_aarch64_common(IMB_MGR *state);
-
-IMB_JOB *submit_job_snow3g_uia2_aarch64_common(IMB_MGR *state,
-                                               IMB_JOB *job);
-IMB_JOB *flush_job_snow3g_uia2_aarch64_common(IMB_MGR *state);
-
-IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state,
-                                                 IMB_JOB *job);
-IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state);
-
-IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state,
-                                                 IMB_JOB *job);
-IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state);
-/* ====================================================================== */
-
 #define SUBMIT_JOB submit_job_aarch64
 #define FLUSH_JOB flush_job_aarch64
 #define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64
@@ -97,162 +48,20 @@ IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state);

 #define QUEUE_SIZE queue_size_aarch64

-/* ====================================================================== */
-
-#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64
-#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64
-
-/* ====================================================================== */
-#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64
-#define FLUSH_JOB_ZUC_EEA3 
flush_job_zuc_eea3_aarch64 -#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64 -#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64 -#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64 -#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64 -#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64 -#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64 -#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64 -#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64 -#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64 -#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64 - - -static IMB_JOB * -(*submit_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc_eea3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc_eea3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc_eia3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc_eia3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc256_eea3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc256_eea3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc256_eia3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc256_eia3_aarch64_common; - -static IMB_JOB * -(*submit_job_snow3g_uea2_aarch64)(IMB_MGR *state, IMB_JOB *job) = - submit_job_snow3g_uea2_aarch64_common; - -static IMB_JOB * -(*flush_job_snow3g_uea2_aarch64)(IMB_MGR *state) = - flush_job_snow3g_uea2_aarch64_common; - -static IMB_JOB * -(*submit_job_snow3g_uia2_aarch64)(IMB_MGR *state, IMB_JOB *job) = - submit_job_snow3g_uia2_aarch64_common; - -static IMB_JOB * -(*flush_job_snow3g_uia2_aarch64)(IMB_MGR *state) = - flush_job_snow3g_uia2_aarch64_common; -static void -reset_ooo_mgrs(IMB_MGR *state) -{ - MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; - MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; - MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; - MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; - - /* Init ZUC out-of-order fields */ - memset(zuc_eea3_ooo->lens, 0, - sizeof(zuc_eea3_ooo->lens)); - memset(zuc_eea3_ooo->job_in_lane, 0, - sizeof(zuc_eea3_ooo->job_in_lane)); - zuc_eea3_ooo->unused_lanes = 0xFF03020100; - zuc_eea3_ooo->num_lanes_inuse = 0; - memset(&zuc_eea3_ooo->state, 0, - sizeof(zuc_eea3_ooo->state)); - zuc_eea3_ooo->init_not_done = 0; - zuc_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc_eia3_ooo->lens, 0xFF, - sizeof(zuc_eia3_ooo->lens)); - memset(zuc_eia3_ooo->job_in_lane, 0, - sizeof(zuc_eia3_ooo->job_in_lane)); - zuc_eia3_ooo->unused_lanes = 0xFF03020100; - zuc_eia3_ooo->num_lanes_inuse = 0; - memset(&zuc_eia3_ooo->state, 0, - sizeof(zuc_eia3_ooo->state)); - zuc_eia3_ooo->init_not_done = 0; - zuc_eia3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eea3_ooo->lens, 0, - sizeof(zuc256_eea3_ooo->lens)); - memset(zuc256_eea3_ooo->job_in_lane, 0, - sizeof(zuc256_eea3_ooo->job_in_lane)); - zuc256_eea3_ooo->unused_lanes = 0xFF03020100; - zuc256_eea3_ooo->num_lanes_inuse = 0; - 
memset(&zuc256_eea3_ooo->state, 0, - sizeof(zuc256_eea3_ooo->state)); - zuc256_eea3_ooo->init_not_done = 0; - zuc256_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eia3_ooo->lens, 0xFF, - sizeof(zuc256_eia3_ooo->lens)); - memset(zuc256_eia3_ooo->job_in_lane, 0, - sizeof(zuc256_eia3_ooo->job_in_lane)); - zuc256_eia3_ooo->unused_lanes = 0xFF03020100; - zuc256_eia3_ooo->num_lanes_inuse = 0; - memset(&zuc256_eia3_ooo->state, 0, - sizeof(zuc256_eia3_ooo->state)); - zuc256_eia3_ooo->init_not_done = 0; - zuc256_eia3_ooo->unused_lane_bitmask = 0x0f; - - /* Init SNOW3G out-of-order fields */ - memset(snow3g_uea2_ooo->lens, 0, - sizeof(snow3g_uea2_ooo->lens)); - memset(snow3g_uea2_ooo->job_in_lane, 0, - sizeof(snow3g_uea2_ooo->job_in_lane)); - memset(snow3g_uea2_ooo->bits_fixup, 0, - sizeof(snow3g_uea2_ooo->bits_fixup)); - memset(&(snow3g_uea2_ooo->args), 0, - sizeof(snow3g_uea2_ooo->args)); - snow3g_uea2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uea2_ooo->num_lanes_inuse = 0; - snow3g_uea2_ooo->init_done = 0; - memset(snow3g_uea2_ooo->ks, 0, - sizeof(snow3g_uea2_ooo->ks)); - - memset(snow3g_uia2_ooo->lens, 0, - sizeof(snow3g_uia2_ooo->lens)); - memset(snow3g_uia2_ooo->job_in_lane, 0, - sizeof(snow3g_uia2_ooo->job_in_lane)); - memset(snow3g_uia2_ooo->bits_fixup, 0, - sizeof(snow3g_uia2_ooo->bits_fixup)); - memset(&(snow3g_uia2_ooo->args), 0, - sizeof(snow3g_uia2_ooo->args)); - snow3g_uia2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uia2_ooo->num_lanes_inuse = 0; - snow3g_uia2_ooo->init_done = 0; - memset(snow3g_uia2_ooo->ks, 0, - sizeof(snow3g_uia2_ooo->ks)); - return; -} +#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64_common +#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64_common +#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64_common +#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64_common +#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64_common +#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64_common +#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64_common +#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64_common +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64_common +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_common +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_common +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_common + +static void reset_ooo_mgrs(IMB_MGR *state); IMB_DLL_LOCAL void init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs) @@ -275,18 +84,6 @@ init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs) if (!(state->features & IMB_FEATURE_AESNI)) { init_mb_mgr_aarch64_no_aesni(state); - submit_job_zuc_eea3_aarch64 = submit_job_zuc_eea3_aarch64_no_aesni; - flush_job_zuc_eea3_aarch64 = flush_job_zuc_eea3_aarch64_no_aesni; - submit_job_zuc_eia3_aarch64 = submit_job_zuc_eia3_aarch64_no_aesni; - flush_job_zuc_eia3_aarch64 = flush_job_zuc_eia3_aarch64_no_aesni; - submit_job_zuc256_eea3_aarch64 = submit_job_zuc256_eea3_aarch64_no_aesni; - flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; - submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; - flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; - submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; - 
flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; - submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; - flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; return; } diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c index d66f8d92..ec64f354 100644 --- a/lib/aarch64/mb_mgr_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c @@ -38,31 +38,6 @@ #include "include/error.h" #include "include/ipsec_ooo_mgr.h" -IMB_JOB *submit_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); - -IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); -/* ====================================================================== */ - #define SUBMIT_JOB submit_job_aarch64_no_aesni #define FLUSH_JOB flush_job_aarch64_no_aesni #define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_no_aesni @@ -71,12 +46,6 @@ IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); #define QUEUE_SIZE queue_size_aarch64_no_aesni -/* ====================================================================== */ - -#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64 -#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64 - -/* ====================================================================== */ #define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64_no_aesni #define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64_no_aesni #define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64_no_aesni @@ -90,96 +59,7 @@ IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); #define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_no_aesni #define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_no_aesni - -static void -reset_ooo_mgrs(IMB_MGR *state) -{ - MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; - MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; - MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; - MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; - - /* Init ZUC out-of-order fields */ - memset(zuc_eea3_ooo->lens, 0, - sizeof(zuc_eea3_ooo->lens)); - memset(zuc_eea3_ooo->job_in_lane, 0, - sizeof(zuc_eea3_ooo->job_in_lane)); - zuc_eea3_ooo->unused_lanes = 0xFF03020100; - zuc_eea3_ooo->num_lanes_inuse = 0; - memset(&zuc_eea3_ooo->state, 0, - sizeof(zuc_eea3_ooo->state)); - zuc_eea3_ooo->init_not_done = 0; - zuc_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc_eia3_ooo->lens, 0xFF, - sizeof(zuc_eia3_ooo->lens)); - memset(zuc_eia3_ooo->job_in_lane, 0, - sizeof(zuc_eia3_ooo->job_in_lane)); - zuc_eia3_ooo->unused_lanes = 0xFF03020100; - zuc_eia3_ooo->num_lanes_inuse = 0; - 
memset(&zuc_eia3_ooo->state, 0, - sizeof(zuc_eia3_ooo->state)); - zuc_eia3_ooo->init_not_done = 0; - zuc_eia3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eea3_ooo->lens, 0, - sizeof(zuc256_eea3_ooo->lens)); - memset(zuc256_eea3_ooo->job_in_lane, 0, - sizeof(zuc256_eea3_ooo->job_in_lane)); - zuc256_eea3_ooo->unused_lanes = 0xFF03020100; - zuc256_eea3_ooo->num_lanes_inuse = 0; - memset(&zuc256_eea3_ooo->state, 0, - sizeof(zuc256_eea3_ooo->state)); - zuc256_eea3_ooo->init_not_done = 0; - zuc256_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eia3_ooo->lens, 0xFF, - sizeof(zuc256_eia3_ooo->lens)); - memset(zuc256_eia3_ooo->job_in_lane, 0, - sizeof(zuc256_eia3_ooo->job_in_lane)); - zuc256_eia3_ooo->unused_lanes = 0xFF03020100; - zuc256_eia3_ooo->num_lanes_inuse = 0; - memset(&zuc256_eia3_ooo->state, 0, - sizeof(zuc256_eia3_ooo->state)); - zuc256_eia3_ooo->init_not_done = 0; - zuc256_eia3_ooo->unused_lane_bitmask = 0x0f; - - /* Init SNOW3G out-of-order fields */ - memset(snow3g_uea2_ooo->lens, 0, - sizeof(snow3g_uea2_ooo->lens)); - memset(snow3g_uea2_ooo->job_in_lane, 0, - sizeof(snow3g_uea2_ooo->job_in_lane)); - memset(snow3g_uea2_ooo->bits_fixup, 0, - sizeof(snow3g_uea2_ooo->bits_fixup)); - memset(&(snow3g_uea2_ooo->args), 0, - sizeof(snow3g_uea2_ooo->args)); - snow3g_uea2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uea2_ooo->num_lanes_inuse = 0; - snow3g_uea2_ooo->init_done = 0; - memset(snow3g_uea2_ooo->ks, 0, - sizeof(snow3g_uea2_ooo->ks)); - - memset(snow3g_uia2_ooo->lens, 0, - sizeof(snow3g_uia2_ooo->lens)); - memset(snow3g_uia2_ooo->job_in_lane, 0, - sizeof(snow3g_uia2_ooo->job_in_lane)); - memset(snow3g_uia2_ooo->bits_fixup, 0, - sizeof(snow3g_uia2_ooo->bits_fixup)); - memset(&(snow3g_uia2_ooo->args), 0, - sizeof(snow3g_uia2_ooo->args)); - snow3g_uia2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uia2_ooo->num_lanes_inuse = 0; - snow3g_uia2_ooo->init_done = 0; - memset(snow3g_uia2_ooo->ks, 0, - sizeof(snow3g_uia2_ooo->ks)); - return; -} +static void reset_ooo_mgrs(IMB_MGR *state); IMB_DLL_LOCAL void init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs) diff --git a/lib/aarch64/mb_mgr_aarch64_sve256.c b/lib/aarch64/mb_mgr_aarch64_sve256.c index ea759a88..b6c425a6 100644 --- a/lib/aarch64/mb_mgr_aarch64_sve256.c +++ b/lib/aarch64/mb_mgr_aarch64_sve256.c @@ -40,55 +40,6 @@ #include "include/noaesni.h" #include "include/ipsec_ooo_mgr.h" -IMB_JOB *submit_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eea3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eea3_aarch64_common(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); - 
-IMB_JOB *submit_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eia3_aarch64_no_aesni(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state, - IMB_JOB *job); -IMB_JOB *flush_job_zuc256_eia3_aarch64_common(MB_MGR_ZUC_OOO *state); - -IMB_JOB *submit_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uea2_aarch64_sve256(IMB_MGR *state); - -IMB_JOB *submit_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uia2_aarch64_sve256(IMB_MGR *state); - -IMB_JOB *submit_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uea2_aarch64_no_aesni(IMB_MGR *state); - -IMB_JOB *submit_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state, - IMB_JOB *job); -IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); -/* ====================================================================== */ - #define SUBMIT_JOB submit_job_aarch64_sve256 #define FLUSH_JOB flush_job_aarch64_sve256 #define SUBMIT_JOB_NOCHECK submit_job_nocheck_aarch64_sve256 @@ -97,162 +48,20 @@ IMB_JOB *flush_job_snow3g_uia2_aarch64_no_aesni(IMB_MGR *state); #define QUEUE_SIZE queue_size_aarch64_sve256 -/* ====================================================================== */ - -#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AARCH64 -#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AARCH64 - -/* ====================================================================== */ -#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64 -#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64 -#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_aarch64 -#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64 -#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64 -#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64 -#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64 -#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64 -#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64 -#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64 -#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64 -#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64 - - -static IMB_JOB * -(*submit_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc_eea3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc_eea3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc_eia3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc_eia3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc256_eea3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc256_eea3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc256_eea3_aarch64_common; - -static IMB_JOB * -(*submit_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state, IMB_JOB *job) = - submit_job_zuc256_eia3_aarch64_common; - -static IMB_JOB * -(*flush_job_zuc256_eia3_aarch64)(MB_MGR_ZUC_OOO *state) = - flush_job_zuc256_eia3_aarch64_common; - -static IMB_JOB * -(*submit_job_snow3g_uea2_aarch64)(IMB_MGR *state, IMB_JOB *job) = - submit_job_snow3g_uea2_aarch64_sve256; - -static IMB_JOB * -(*flush_job_snow3g_uea2_aarch64)(IMB_MGR *state) = - flush_job_snow3g_uea2_aarch64_sve256; - -static IMB_JOB * -(*submit_job_snow3g_uia2_aarch64)(IMB_MGR 
*state, IMB_JOB *job) = - submit_job_snow3g_uia2_aarch64_sve256; - -static IMB_JOB * -(*flush_job_snow3g_uia2_aarch64)(IMB_MGR *state) = - flush_job_snow3g_uia2_aarch64_sve256; -static void -reset_ooo_mgrs(IMB_MGR *state) -{ - MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; - MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; - MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; - MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; - MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; - - /* Init ZUC out-of-order fields */ - memset(zuc_eea3_ooo->lens, 0, - sizeof(zuc_eea3_ooo->lens)); - memset(zuc_eea3_ooo->job_in_lane, 0, - sizeof(zuc_eea3_ooo->job_in_lane)); - zuc_eea3_ooo->unused_lanes = 0xFF03020100; - zuc_eea3_ooo->num_lanes_inuse = 0; - memset(&zuc_eea3_ooo->state, 0, - sizeof(zuc_eea3_ooo->state)); - zuc_eea3_ooo->init_not_done = 0; - zuc_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc_eia3_ooo->lens, 0xFF, - sizeof(zuc_eia3_ooo->lens)); - memset(zuc_eia3_ooo->job_in_lane, 0, - sizeof(zuc_eia3_ooo->job_in_lane)); - zuc_eia3_ooo->unused_lanes = 0xFF03020100; - zuc_eia3_ooo->num_lanes_inuse = 0; - memset(&zuc_eia3_ooo->state, 0, - sizeof(zuc_eia3_ooo->state)); - zuc_eia3_ooo->init_not_done = 0; - zuc_eia3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eea3_ooo->lens, 0, - sizeof(zuc256_eea3_ooo->lens)); - memset(zuc256_eea3_ooo->job_in_lane, 0, - sizeof(zuc256_eea3_ooo->job_in_lane)); - zuc256_eea3_ooo->unused_lanes = 0xFF03020100; - zuc256_eea3_ooo->num_lanes_inuse = 0; - memset(&zuc256_eea3_ooo->state, 0, - sizeof(zuc256_eea3_ooo->state)); - zuc256_eea3_ooo->init_not_done = 0; - zuc256_eea3_ooo->unused_lane_bitmask = 0x0f; - - memset(zuc256_eia3_ooo->lens, 0xFF, - sizeof(zuc256_eia3_ooo->lens)); - memset(zuc256_eia3_ooo->job_in_lane, 0, - sizeof(zuc256_eia3_ooo->job_in_lane)); - zuc256_eia3_ooo->unused_lanes = 0xFF03020100; - zuc256_eia3_ooo->num_lanes_inuse = 0; - memset(&zuc256_eia3_ooo->state, 0, - sizeof(zuc256_eia3_ooo->state)); - zuc256_eia3_ooo->init_not_done = 0; - zuc256_eia3_ooo->unused_lane_bitmask = 0x0f; - - /* Init SNOW3G out-of-order fields */ - memset(snow3g_uea2_ooo->lens, 0, - sizeof(snow3g_uea2_ooo->lens)); - memset(snow3g_uea2_ooo->job_in_lane, 0, - sizeof(snow3g_uea2_ooo->job_in_lane)); - memset(snow3g_uea2_ooo->bits_fixup, 0, - sizeof(snow3g_uea2_ooo->bits_fixup)); - memset(&(snow3g_uea2_ooo->args), 0, - sizeof(snow3g_uea2_ooo->args)); - snow3g_uea2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uea2_ooo->num_lanes_inuse = 0; - snow3g_uea2_ooo->init_done = 0; - memset(snow3g_uea2_ooo->ks, 0, - sizeof(snow3g_uea2_ooo->ks)); - - memset(snow3g_uia2_ooo->lens, 0, - sizeof(snow3g_uia2_ooo->lens)); - memset(snow3g_uia2_ooo->job_in_lane, 0, - sizeof(snow3g_uia2_ooo->job_in_lane)); - memset(snow3g_uia2_ooo->bits_fixup, 0, - sizeof(snow3g_uia2_ooo->bits_fixup)); - memset(&(snow3g_uia2_ooo->args), 0, - sizeof(snow3g_uia2_ooo->args)); - snow3g_uia2_ooo->init_mask = 0; - // each 4 bit indicate one lane, at most 16 buffer - snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210; - snow3g_uia2_ooo->num_lanes_inuse = 0; - snow3g_uia2_ooo->init_done = 0; - memset(snow3g_uia2_ooo->ks, 0, - sizeof(snow3g_uia2_ooo->ks)); - return; -} +#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_aarch64_common +#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_aarch64_common +#define SUBMIT_JOB_ZUC_EIA3 
submit_job_zuc_eia3_aarch64_common +#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_aarch64_common +#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64_common +#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_aarch64_common +#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_aarch64_common +#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_aarch64_common +#define SUBMIT_JOB_SNOW3G_UEA2 submit_job_snow3g_uea2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UEA2 flush_job_snow3g_uea2_aarch64_sve256 +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_aarch64_sve256 +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_aarch64_sve256 + +static void reset_ooo_mgrs(IMB_MGR *state); IMB_DLL_LOCAL void init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs) @@ -275,18 +84,6 @@ init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs) if (!(state->features & IMB_FEATURE_AESNI)) { init_mb_mgr_aarch64_no_aesni(state); - submit_job_zuc_eea3_aarch64 = submit_job_zuc_eea3_aarch64_no_aesni; - flush_job_zuc_eea3_aarch64 = flush_job_zuc_eea3_aarch64_no_aesni; - submit_job_zuc_eia3_aarch64 = submit_job_zuc_eia3_aarch64_no_aesni; - flush_job_zuc_eia3_aarch64 = flush_job_zuc_eia3_aarch64_no_aesni; - submit_job_zuc256_eea3_aarch64 = submit_job_zuc256_eea3_aarch64_no_aesni; - flush_job_zuc256_eea3_aarch64 = flush_job_zuc256_eea3_aarch64_no_aesni; - submit_job_zuc256_eia3_aarch64 = submit_job_zuc256_eia3_aarch64_no_aesni; - flush_job_zuc256_eia3_aarch64 = flush_job_zuc256_eia3_aarch64_no_aesni; - submit_job_snow3g_uea2_aarch64 = submit_job_snow3g_uea2_aarch64_no_aesni; - flush_job_snow3g_uea2_aarch64 = flush_job_snow3g_uea2_aarch64_no_aesni; - submit_job_snow3g_uia2_aarch64 = submit_job_snow3g_uia2_aarch64_no_aesni; - flush_job_snow3g_uia2_aarch64 = flush_job_snow3g_uia2_aarch64_no_aesni; return; } diff --git a/lib/aarch64/mb_mgr_code_aarch64.h b/lib/aarch64/mb_mgr_code_aarch64.h index 560adc58..1723b1b6 100644 --- a/lib/aarch64/mb_mgr_code_aarch64.h +++ b/lib/aarch64/mb_mgr_code_aarch64.h @@ -48,6 +48,109 @@ #define BSWAP64 __builtin_bswap64 +IMB_JOB *SUBMIT_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job); +IMB_JOB *SUBMIT_JOB_ZUC256_EEA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job); +IMB_JOB *FLUSH_JOB_ZUC_EEA3(MB_MGR_ZUC_OOO *state); +IMB_JOB *FLUSH_JOB_ZUC256_EEA3(MB_MGR_ZUC_OOO *state); +IMB_JOB *SUBMIT_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job); +IMB_JOB *SUBMIT_JOB_ZUC256_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job); +IMB_JOB *FLUSH_JOB_ZUC_EIA3(MB_MGR_ZUC_OOO *state); +IMB_JOB *FLUSH_JOB_ZUC256_EIA3(MB_MGR_ZUC_OOO *state); +IMB_JOB *SUBMIT_JOB_SNOW3G_UEA2(IMB_MGR *state, IMB_JOB *job); +IMB_JOB *FLUSH_JOB_SNOW3G_UEA2(IMB_MGR *state); +IMB_JOB *SUBMIT_JOB_SNOW3G_UIA2(IMB_MGR *state, IMB_JOB *job); +IMB_JOB *FLUSH_JOB_SNOW3G_UIA2(IMB_MGR *state); + +static void +reset_ooo_mgrs(IMB_MGR *state) +{ + MB_MGR_ZUC_OOO *zuc_eea3_ooo = state->zuc_eea3_ooo; + MB_MGR_ZUC_OOO *zuc_eia3_ooo = state->zuc_eia3_ooo; + MB_MGR_ZUC_OOO *zuc256_eea3_ooo = state->zuc256_eea3_ooo; + MB_MGR_ZUC_OOO *zuc256_eia3_ooo = state->zuc256_eia3_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; + MB_MGR_SNOW3G_OOO *snow3g_uia2_ooo = state->snow3g_uia2_ooo; + + /* Init ZUC out-of-order fields */ + memset(zuc_eea3_ooo->lens, 0, + sizeof(zuc_eea3_ooo->lens)); + memset(zuc_eea3_ooo->job_in_lane, 0, + sizeof(zuc_eea3_ooo->job_in_lane)); + zuc_eea3_ooo->unused_lanes = 0xFF03020100; + zuc_eea3_ooo->num_lanes_inuse = 0; + memset(&zuc_eea3_ooo->state, 0, + sizeof(zuc_eea3_ooo->state)); 
+        zuc_eea3_ooo->init_not_done = 0;
+        zuc_eea3_ooo->unused_lane_bitmask = 0x0f;
+
+        memset(zuc_eia3_ooo->lens, 0xFF,
+               sizeof(zuc_eia3_ooo->lens));
+        memset(zuc_eia3_ooo->job_in_lane, 0,
+               sizeof(zuc_eia3_ooo->job_in_lane));
+        zuc_eia3_ooo->unused_lanes = 0xFF03020100;
+        zuc_eia3_ooo->num_lanes_inuse = 0;
+        memset(&zuc_eia3_ooo->state, 0,
+               sizeof(zuc_eia3_ooo->state));
+        zuc_eia3_ooo->init_not_done = 0;
+        zuc_eia3_ooo->unused_lane_bitmask = 0x0f;
+
+        memset(zuc256_eea3_ooo->lens, 0,
+               sizeof(zuc256_eea3_ooo->lens));
+        memset(zuc256_eea3_ooo->job_in_lane, 0,
+               sizeof(zuc256_eea3_ooo->job_in_lane));
+        zuc256_eea3_ooo->unused_lanes = 0xFF03020100;
+        zuc256_eea3_ooo->num_lanes_inuse = 0;
+        memset(&zuc256_eea3_ooo->state, 0,
+               sizeof(zuc256_eea3_ooo->state));
+        zuc256_eea3_ooo->init_not_done = 0;
+        zuc256_eea3_ooo->unused_lane_bitmask = 0x0f;
+
+        memset(zuc256_eia3_ooo->lens, 0xFF,
+               sizeof(zuc256_eia3_ooo->lens));
+        memset(zuc256_eia3_ooo->job_in_lane, 0,
+               sizeof(zuc256_eia3_ooo->job_in_lane));
+        zuc256_eia3_ooo->unused_lanes = 0xFF03020100;
+        zuc256_eia3_ooo->num_lanes_inuse = 0;
+        memset(&zuc256_eia3_ooo->state, 0,
+               sizeof(zuc256_eia3_ooo->state));
+        zuc256_eia3_ooo->init_not_done = 0;
+        zuc256_eia3_ooo->unused_lane_bitmask = 0x0f;
+
+        /* Init SNOW3G out-of-order fields */
+        memset(snow3g_uea2_ooo->lens, 0,
+               sizeof(snow3g_uea2_ooo->lens));
+        memset(snow3g_uea2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uea2_ooo->job_in_lane));
+        memset(snow3g_uea2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uea2_ooo->bits_fixup));
+        memset(&(snow3g_uea2_ooo->args), 0,
+               sizeof(snow3g_uea2_ooo->args));
+        snow3g_uea2_ooo->init_mask = 0;
+        // each 4 bits indicate one lane, at most 16 buffers
+        snow3g_uea2_ooo->unused_lanes = 0xFEDCBA9876543210;
+        snow3g_uea2_ooo->num_lanes_inuse = 0;
+        snow3g_uea2_ooo->init_done = 0;
+        memset(snow3g_uea2_ooo->ks, 0,
+               sizeof(snow3g_uea2_ooo->ks));
+
+        memset(snow3g_uia2_ooo->lens, 0,
+               sizeof(snow3g_uia2_ooo->lens));
+        memset(snow3g_uia2_ooo->job_in_lane, 0,
+               sizeof(snow3g_uia2_ooo->job_in_lane));
+        memset(snow3g_uia2_ooo->bits_fixup, 0,
+               sizeof(snow3g_uia2_ooo->bits_fixup));
+        memset(&(snow3g_uia2_ooo->args), 0,
+               sizeof(snow3g_uia2_ooo->args));
+        snow3g_uia2_ooo->init_mask = 0;
+        // each 4 bits indicate one lane, at most 16 buffers
+        snow3g_uia2_ooo->unused_lanes = 0xFEDCBA9876543210;
+        snow3g_uia2_ooo->num_lanes_inuse = 0;
+        snow3g_uia2_ooo->init_done = 0;
+        memset(snow3g_uia2_ooo->ks, 0,
+               sizeof(snow3g_uia2_ooo->ks));
+        return;
+}
+
 /*
  * JOBS() and ADV_JOBS() moved into mb_mgr_code.h
  * get_next_job() and get_completed_job() API's are no longer inlines.
--
GitLab

From 0e2bedf7e504d1ee192597940f0ecf51672020ba Mon Sep 17 00:00:00 2001
From: "fisher.yu"
Date: Thu, 4 May 2023 02:53:19 +0000
Subject: [PATCH 2/2] Optimize ZUC assembly implementation.

The following changes are made in this patch:
1. Assign vector registers to LFSR0-15, BRC_X0-2 and F_R1-2, to reduce
   loads and stores.
2. Adjust alignment and remove unused constant data.
3. Use the SLI instruction to implement rotates.
4. Use the BIC instruction to clear the MSB in the mod (2^31-1)
   reductions.
5. Use the TRN1 instruction to implement the halfword swap in
   bits_reorg4.
6. Remove redundant instructions in S0_compute_NEON and MUL_TBL_NEON.
7. Reorder instructions in nonlin_fun4, lfsr_updt4, bits_reorg4,
   MUL_TBL_NEON and S0_compute_NEON to reduce instruction dependencies.
8. Remove the ADD of W in working mode.
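To make items 3 and 4 concrete: per 32-bit lane, a USHR/SLI pair fuses the
two halves of a rotate into two instructions, and the BIC immediate clears
bit 31 so values stay inside the mod (2^31 - 1) ring. Below is a scalar C
sketch of the two helpers; it is illustrative only, not part of this patch,
and the function names merely mirror the add_mod31/rot_mod31 macros.

/* Scalar model of the add_mod31/rot_mod31 NEON macros (illustrative). */
#include <stdint.h>
#include <stdio.h>

#define MASK31 0x7FFFFFFFu

/* add two 31-bit values and reduce mod (2^31 - 1):
 * ADD, USHR #31 to extract the carry, BIC bit 31, ADD the carry back */
static uint32_t add_mod31(uint32_t a, uint32_t b)
{
        uint32_t s = a + b;                   /* fits in 32 bits */
        return (s & MASK31) + (s >> 31);      /* fold the carry back in */
}

/* multiply a 31-bit value by 2^r mod (2^31 - 1):
 * USHR #(31 - r) then SLI #r form the rotate, BIC keeps 31 bits */
static uint32_t rot_mod31(uint32_t x, unsigned r)
{
        return ((x << r) | (x >> (31 - r))) & MASK31;
}

int main(void)
{
        /* e.g. the feedback term 2^15 * s15 + 2^17 * s13 from lfsr_updt4 */
        uint32_t s15 = 0x12345678u, s13 = 0x0ABCDEF0u;
        printf("%08x\n", add_mod31(rot_mod31(s15, 15), rot_mod31(s13, 17)));
        return 0;
}
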
Change-Id: I15f1afb57b0036a3e8aa2183dafd11d0fd28c8d2 --- lib/aarch64/zuc_common.S | 4 +- lib/aarch64/zuc_sbox.S | 90 ++---- lib/aarch64/zuc_simd.S | 600 ++++++++++++++++----------------------- 3 files changed, 265 insertions(+), 429 deletions(-) diff --git a/lib/aarch64/zuc_common.S b/lib/aarch64/zuc_common.S index 6f4ee78a..fc61ba03 100644 --- a/lib/aarch64/zuc_common.S +++ b/lib/aarch64/zuc_common.S @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2021 Arm Corporation All rights reserved. + Copyright (c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -79,7 +79,7 @@ mask_S1: #define START_FUNC(fn) .globl fn; \ .type fn, %function; \ - .align 16; \ + .align 5; \ fn: #define END_FUNC(fn) .size fn,.-fn diff --git a/lib/aarch64/zuc_sbox.S b/lib/aarch64/zuc_sbox.S index 5b79d452..f8445bb6 100644 --- a/lib/aarch64/zuc_sbox.S +++ b/lib/aarch64/zuc_sbox.S @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2021 Arm Corporation All rights reserved. + Copyright (c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -31,95 +31,63 @@ #include "aarch64/aesni_emu_aarch64.S" .section .data -.align 16 +.align 4 .type P1, %object P1: .byte 0x09, 0x0F, 0x00, 0x0E, 0x0F, 0x0F, 0x02, 0x0A, 0x00, 0x04, 0x00, 0x0C, 0x07, 0x05, 0x03, 0x09 .size P1,.-P1 -.align 16 +.align 4 .type P2, %object P2: .byte 0x08, 0x0D, 0x06, 0x05, 0x07, 0x00, 0x0C, 0x04, 0x0B, 0x01, 0x0E, 0x0A, 0x0F, 0x03, 0x09, 0x02 .size P2,.-P2 -.align 16 +.align 4 .type P3, %object P3: .byte 0x02, 0x06, 0x0A, 0x06, 0x00, 0x0D, 0x0A, 0x0F, 0x03, 0x03, 0x0D, 0x05, 0x00, 0x09, 0x0C, 0x0D .size P3,.-P3 -.align 16 -.type Low_nibble_mask, %object -Low_nibble_mask: - .byte 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f -.size Low_nibble_mask,.-Low_nibble_mask - -.align 16 -.type High_nibble_mask, %object -High_nibble_mask: - .byte 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0 -.size High_nibble_mask,.-High_nibble_mask - -.align 16 +.align 4 .type Aes_to_Zuc_mul_low_nibble, %object Aes_to_Zuc_mul_low_nibble: .byte 0x00, 0x01, 0x82, 0x83, 0x9e, 0x9f, 0x1c, 0x1d, 0x24, 0x25, 0xa6, 0xa7, 0xba, 0xbb, 0x38, 0x39 .size Aes_to_Zuc_mul_low_nibble,.-Aes_to_Zuc_mul_low_nibble -.align 16 +.align 4 .type Aes_to_Zuc_mul_high_nibble, %object Aes_to_Zuc_mul_high_nibble: .byte 0x00, 0xd5, 0x08, 0xdd, 0x7c, 0xa9, 0x74, 0xa1, 0x9c, 0x49, 0x94, 0x41, 0xe0, 0x35, 0xe8, 0x3d .size Aes_to_Zuc_mul_high_nibble,.-Aes_to_Zuc_mul_high_nibble -.align 16 +.align 4 .type Shuf_mask, %object Shuf_mask: .byte 0x00, 0x0D, 0x0A, 0x07, 0x04, 0x01, 0x0e, 0x0b, 0x08, 0x05, 0x02, 0x0f, 0x0C, 0x09, 0x06, 0x03 .size Shuf_mask,.-Shuf_mask -.align 16 -.type Cancel_aes, %object -Cancel_aes: - .byte 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63, 0x63 -.size Cancel_aes,.-Cancel_aes - -.align 16 +.align 4 .type Comb_matrix_mul_low_nibble, %object Comb_matrix_mul_low_nibble: .byte 0x55, 0x41, 0xff, 0xeb, 0x24, 0x30, 0x8e, 0x9a, 0xe2, 0xf6, 0x48, 0x5c, 0x93, 0x87, 0x39, 0x2d .size Comb_matrix_mul_low_nibble,.-Comb_matrix_mul_low_nibble -.align 16 +.align 4 
.type Comb_matrix_mul_high_nibble, %object Comb_matrix_mul_high_nibble: .byte 0x55, 0xba, 0xcc, 0x23, 0x15, 0xfa, 0x8c, 0x63, 0x09, 0xe6, 0x90, 0x7f, 0x49, 0xa6, 0xd0, 0x3f .size Comb_matrix_mul_high_nibble,.-Comb_matrix_mul_high_nibble -.align 16 -.type Const_comb_matrix, %object -Const_comb_matrix: - .byte 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55 -.size Const_comb_matrix,.-Const_comb_matrix - #define xPage x23 .macro MUL_TBL_NEON vIN, vLO, vHI_OUT, vTMP - adrp xPage,Low_nibble_mask - add xPage, xPage, #:lo12:Low_nibble_mask - ld1 {\vTMP\().16b}, [xPage] + movi \vTMP\().16b, 0x0F and \vTMP\().16b, \vIN\().16b, \vTMP\().16b + ushr \vIN\().16b, \vIN\().16b, #4 tbl \vLO\().16b, {\vLO\().16b}, \vTMP\().16b - - adrp xPage,High_nibble_mask - add xPage, xPage, #:lo12:High_nibble_mask - ld1 {\vTMP\().16b}, [xPage] - and \vTMP\().16b, \vIN\().16b, \vTMP\().16b - ushr \vTMP\().2d, \vTMP\().2d, #4 - - tbl \vHI_OUT\().16b, {\vHI_OUT\().16b}, \vTMP\().16b + tbl \vHI_OUT\().16b, {\vHI_OUT\().16b}, \vIN\().16b eor \vHI_OUT\().16b, \vHI_OUT\().16b, \vLO\().16b .endm @@ -127,18 +95,10 @@ Const_comb_matrix: * Compute 16 S0 box values from 16 bytes, stored in SIMD register */ .macro S0_compute_NEON IN_OUT, vTMP1, vTMP2 - mov \vTMP1\().16b, \IN_OUT\().16b - adrp xPage, Low_nibble_mask - add xPage, xPage, #:lo12:Low_nibble_mask - ld1 {\vTMP2\().16b}, [xPage] + movi \vTMP2\().16b, 0x0F + ushr \vTMP1\().16b, \IN_OUT\().16b, #4 // x1 and \IN_OUT\().16b, \IN_OUT\().16b, \vTMP2\().16b // x2 - adrp xPage, High_nibble_mask - add xPage, xPage, #:lo12:High_nibble_mask - ld1 {\vTMP2\().16b}, [xPage] - and \vTMP1\().16b, \vTMP1\().16b, \vTMP2\().16b - ushr \vTMP1\().2d, \vTMP1\().2d, #4 // x1 - adrp xPage, P1 add xPage, xPage, #:lo12:P1 ld1 {\vTMP2\().16b}, [xPage] @@ -159,11 +119,10 @@ Const_comb_matrix: // s << 4 (since high nibble of each byte is 0, no masking is required) shl \IN_OUT\().2d, \IN_OUT\().2d, #4 - orr \IN_OUT\().16b, \IN_OUT\().16b, \vTMP1\().16b // t = (s << 4) | r + orr \vTMP1\().16b, \IN_OUT\().16b, \vTMP1\().16b // t = (s << 4) | r // Rotate left 5 bits in each byte, within a SIMD register - mov \vTMP1\().16b, \IN_OUT\().16b - ushr \IN_OUT\().16b, \IN_OUT\().16b, #3 + ushr \IN_OUT\().16b, \vTMP1\().16b, #3 sli \IN_OUT\().16b, \vTMP1\().16b, #5 .endm @@ -196,9 +155,7 @@ Const_comb_matrix: ld1 {\vTMP1\().16b}, [xPage] tbl \vTMP1\().16b, {\vTMP2\().16b}, \vTMP1\().16b - adrp xPage, Cancel_aes - add xPage, xPage, #:lo12:Cancel_aes - ld1 {\vTMP2\().16b}, [xPage] + movi \vTMP2\().16b, 0x63 INTEL_AESNCLAST \vTMP1, \vTMP2, \vTMP3 @@ -212,9 +169,7 @@ Const_comb_matrix: MUL_TBL_NEON \vTMP1, \vTMP2, \vIN_OUT, \vTMP3 - adrp xPage, Const_comb_matrix - add xPage, xPage, #:lo12:Const_comb_matrix - ld1 {\vTMP3\().16b}, [xPage] + movi \vTMP3\().16b, 0x55 eor \vIN_OUT\().16b, \vIN_OUT\().16b, \vTMP3\().16b .endm @@ -237,9 +192,8 @@ Const_comb_matrix: ld1 {\vTMP1\().16b}, [xPage] tbl \vTMP1\().16b, {\vTMP2\().16b}, \vTMP1\().16b - adrp xPage, Cancel_aes - add xPage, xPage, #:lo12:Cancel_aes - ld1 {\vTMP2\().16b}, [xPage] + movi \vTMP2\().16b, 0x63 + EMULATE_AESENCLAST \vTMP1, \vTMP2, \vTMP3 @@ -253,9 +207,7 @@ Const_comb_matrix: MUL_TBL_NEON \vTMP1, \vTMP2, \vIN_OUT, \vTMP3 - adrp xPage, Const_comb_matrix - add xPage, xPage, #:lo12:Const_comb_matrix - ld1 {\vTMP3\().16b}, [xPage] + movi \vTMP3\().16b, 0x55 eor \vIN_OUT\().16b, \vIN_OUT\().16b, \vTMP3\().16b .endm #endif // ifndef _ZUC_SOBX_INC_ diff --git a/lib/aarch64/zuc_simd.S b/lib/aarch64/zuc_simd.S index 
5e4c3e85..c1cece98 100644 --- a/lib/aarch64/zuc_simd.S +++ b/lib/aarch64/zuc_simd.S @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2021 Arm Corporation All rights reserved. + Copyright (c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -46,14 +46,6 @@ .arch armv8-a+crypto .section .data -.align 16 -.type Ek_d, %object -Ek_d: -.word 0x0044D700, 0x0026BC00, 0x00626B00, 0x00135E00 -.word 0x00578900, 0x0035E200, 0x00713500, 0x0009AF00 -.word 0x004D7800, 0x002F1300, 0x006BC400, 0x001AF100 -.word 0x005E2600, 0x003C4D00, 0x00789A00, 0x0047AC00 -.size Ek_d,.-Ek_d // Constants to be used to initialize the LFSR registers // This table contains four different sets of constants: @@ -61,7 +53,7 @@ Ek_d: // 64-127 bytes: Authentication with tag size = 4 // 128-191 bytes: Authentication with tag size = 8 // 192-255 bytes: Authentication with tag size = 16 -.align 16 +.align 8 .type EK256_d64, %object EK256_d64: .word 0x00220000, 0x002F0000, 0x00240000, 0x002A0000 @@ -82,7 +74,16 @@ EK256_d64: .word 0x00400000, 0x00520000, 0x00100000, 0x00300000 .size EK256_d64,.-EK256_d64 -.align 16 +.align 6 +.type Ek_d, %object +Ek_d: +.word 0x0044D700, 0x0026BC00, 0x00626B00, 0x00135E00 +.word 0x00578900, 0x0035E200, 0x00713500, 0x0009AF00 +.word 0x004D7800, 0x002F1300, 0x006BC400, 0x001AF100 +.word 0x005E2600, 0x003C4D00, 0x00789A00, 0x0047AC00 +.size Ek_d,.-Ek_d + +.align 6 .type shuf_mask_key, %object shuf_mask_key: .word 0x00FFFFFF, 0x01FFFFFF, 0x02FFFFFF, 0x03FFFFFF @@ -91,7 +92,7 @@ shuf_mask_key: .word 0x0CFFFFFF, 0x0DFFFFFF, 0x0EFFFFFF, 0x0FFFFFFF .size shuf_mask_key,.-shuf_mask_key -.align 16 +.align 6 .type shuf_mask_iv, %object shuf_mask_iv: .word 0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03 @@ -100,7 +101,7 @@ shuf_mask_iv: .word 0xFFFFFF0C, 0xFFFFFF0D, 0xFFFFFF0E, 0xFFFFFF0F .size shuf_mask_iv,.-shuf_mask_iv -.align 16 +.align 4 .type KS_reorder, %object KS_reorder: .quad 0x0302010007060504, 0x070605040b0a0908 @@ -116,6 +117,16 @@ KS_reorder: .altmacro declare_register xTMP x23 +/* v0-v15 are assigned to LFSR0-15, should not be reused in CIPHERNx4B_4 */ +/* v24-v26 are assigned to BRCX0-2, could be reused in CIPHERNx4B_4 */ +declare_register vBRCX0 v24 +declare_register vBRCX1 v25 +declare_register vBRCX2 v26 +/* v27-v28 are assigned to FR1-2, should not be reused in CIPHERNx4B_4 */ +declare_register vFR1 v27 +declare_register vFR2 v28 +declare_register qFR1 q27 +declare_register qFR2 q28 .macro FUNC_SAVE stp x29, x30, [sp, -160]! 
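Note on item 5, ahead of the bits_reorg4 hunk below: TRN1 on .8h lanes builds
each BRC_X word by concatenating the low halfwords of its two source
registers, replacing the old SHL/USHR/EOR sequence. A scalar C model of what
one 32-bit lane computes follows; cat16 and bits_reorg are hypothetical names
for illustration, and in the real macro the s[] indices are taken modulo 16
relative to the current round number.

#include <stdint.h>

/* (hi16 << 16) | lo16: what one TRN1 .8h lane produces */
static uint32_t cat16(uint32_t hi, uint32_t lo)
{
        return ((hi & 0xFFFFu) << 16) | (lo & 0xFFFFu);
}

/* bits_reorg for one lane; s[] are the 31-bit LFSR cells */
static void bits_reorg(const uint32_t s[16], uint32_t x[4])
{
        x[0] = cat16(s[15] >> 15, s[14]);  /* X0 = s15H || s14L */
        x[1] = cat16(s[11], s[9] >> 15);   /* X1 = s11L || s9H  */
        x[2] = cat16(s[7],  s[5] >> 15);   /* X2 = s7L  || s5H  */
        x[3] = cat16(s[2],  s[0] >> 15);   /* X3 = s2L  || s0H  */
}

Because TRN1 reads both halfword sources directly from registers, each X word
costs one USHR plus one TRN1 instead of the previous shift/shift/EOR chain.
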
@@ -163,250 +174,143 @@ declare_register xTMP x23 eor v\LFSR\().16b, v\LFSR\().16b, \EKD_MASK\().16b .endm -.macro rot_mod32 vOUT, vIN, ROTATE - mov \vOUT\().16b, \vIN\().16b +.macro rot_mod32 vOUT, vIN, ROTATE, vTMP + ushr \vOUT\().4s, \vIN\().4s, 32-\ROTATE + sli \vOUT\().4s, \vIN\().4s, \ROTATE +.endm - shl \vOUT\().4s, \vOUT\().4s, \ROTATE - ushr v27.4s, \vIN\().4s, 32-\ROTATE +.macro TRANSPOSE4_U32 V_0, V_1, V_2, V_3, T_0, T_1, T_2, T_3 + zip1 v\T_0\().4s, v\V_0\().4s, v\V_1\().4s // T_0 = {b1 a1 b0 a0} + zip2 v\T_1\().4s, v\V_0\().4s, v\V_1\().4s // T_1 = {b3 a3 b2 a2} + zip1 v\T_2\().4s, v\V_2\().4s, v\V_3\().4s // T_2 = {d1 c1 d0 c0} + zip2 v\T_3\().4s, v\V_2\().4s, v\V_3\().4s // T_3 = {d3 c3 d2 c2} + + zip1 v\V_0\().2d, v\T_0\().2d, v\T_2\().2d // V_0 = {d0 c0 b0 a0} + zip2 v\V_1\().2d, v\T_0\().2d, v\T_2\().2d // V_1 = {d1 c1 b1 a1} + zip1 v\V_2\().2d, v\T_1\().2d, v\T_3\().2d // V_2 = {d2 c2 b2 a2} + zip2 v\V_3\().2d, v\T_1\().2d, v\T_3\().2d // V_3 = {d3 c3 b3 a3} +.endm - eor \vOUT\().16b, \vOUT\().16b, v27.16b +.macro USHR_4S vd, n, rot + ushr \vd\().4s, v\n\().4s, #\rot\() .endm -.macro TRANSPOSE4_U32 V_0, V_1, V_2, V_3, T_0, T_1, T_2, T_3 - zip1 \T_0\().4s, \V_0\().4s, \V_1\().4s // T_0 = {b1 a1 b0 a0} - zip2 \T_1\().4s, \V_0\().4s, \V_1\().4s // T_1 = {b3 a3 b2 a2} - zip1 \T_2\().4s, \V_2\().4s, \V_3\().4s // T_2 = {d1 c1 d0 c0} - zip2 \T_3\().4s, \V_2\().4s, \V_3\().4s // T_3 = {d3 c3 d2 c2} - - zip1 \V_0\().2d, \T_0\().2d, \T_2\().2d // V_0 = {d0 c0 b0 a0} - zip2 \V_1\().2d, \T_0\().2d, \T_2\().2d // V_1 = {d1 c1 b1 a1} - zip1 \V_2\().2d, \T_1\().2d, \T_3\().2d // V_2 = {d2 c2 b2 a2} - zip2 \V_3\().2d, \T_1\().2d, \T_3\().2d // V_3 = {d3 c3 b3 a3} +.macro STR_Q i, addrreg, offset + str q\i\(), [\addrreg\(), #\offset\()] .endm -.macro load_lfsr STATE, ROUND_NUM, REG_IDX, TMP, LFSR - add \TMP, \ROUND_NUM, \REG_IDX - and \TMP, \TMP, #0xf - lsl \TMP, \TMP, #4 - add \TMP, \TMP, \STATE - ld1 {\LFSR\().16b}, [\TMP] +.macro LDR_Q i, addrreg, offset + ldr q\i\(), [\addrreg\(), #\offset\()] .endm -.macro store_lfsr STATE, ROUND_NUM, REG_IDX, TMP, LFSR - add \TMP, \ROUND_NUM, \REG_IDX - and \TMP, \TMP, #0xf - lsl \TMP, \TMP, #4 - add \TMP, \TMP, \STATE - st1 {\LFSR\().16b}, [\TMP] +.macro TRN1_8H vd, n, m + trn1 \vd\().8h, v\n\().8h, v\m\().8h .endm -.macro bits_reorg4 STATE, IS_NUMBER=1, ROUND_NUM, TMP, OUTPUT_X3=0, X3 - // v15 = LFSR_S15 - // v14 = LFSR_S14 - // v11 = LFSR_S11 - // v9 = LFSR_S9 - // v7 = LFSR_S7 - // v5 = LFSR_S5 - // v2 = LFSR_S2 - // v0 = LFSR_S0 -.if \IS_NUMBER == 1 - ldr q15, [\STATE, ((15 + \ROUND_NUM) % 16)*16] - ldr q14, [\STATE, ((14 + \ROUND_NUM) % 16)*16] - ldr q11, [\STATE, ((11 + \ROUND_NUM) % 16)*16] - ldr q9, [\STATE, (( 9 + \ROUND_NUM) % 16)*16] - ldr q7, [\STATE, (( 7 + \ROUND_NUM) % 16)*16] - ldr q5, [\STATE, (( 5 + \ROUND_NUM) % 16)*16] - ldr q2, [\STATE, (( 2 + \ROUND_NUM) % 16)*16] - ldr q0, [\STATE, (( 0 + \ROUND_NUM) % 16)*16] -.else - load_lfsr \STATE, \ROUND_NUM, 15, \TMP, v15 - load_lfsr \STATE, \ROUND_NUM, 14, \TMP, v14 - load_lfsr \STATE, \ROUND_NUM, 11, \TMP, v11 - load_lfsr \STATE, \ROUND_NUM, 9, \TMP, v9 - load_lfsr \STATE, \ROUND_NUM, 7, \TMP, v7 - load_lfsr \STATE, \ROUND_NUM, 5, \TMP, v5 - load_lfsr \STATE, \ROUND_NUM, 2, \TMP, v2 - load_lfsr \STATE, \ROUND_NUM, 0, \TMP, v0 +.macro bits_reorg4 ROUND_NUM, OUTPUT_X3=0, X3 + USHR_4S v20, %((15 + \ROUND_NUM) % 16), 15 + USHR_4S v21, %((9 + \ROUND_NUM) % 16), 15 + USHR_4S v22, %((5 + \ROUND_NUM) % 16), 15 +.if \OUTPUT_X3 == 1 + USHR_4S v29, %((0 + \ROUND_NUM) % 16), 15 .endif - eor 
v1.16b, v1.16b, v1.16b - ushr v15.4s, v15.4s, #15 - shl v15.4s, v15.4s, #16 - shl v14.4s, v14.4s, #16 - ushr v14.4s, v14.4s, #16 - eor v15.16b, v15.16b, v14.16b - str q15, [\STATE, OFS_X0] // BRC_X0 - - shl v11.4s, v11.4s, #16 - ushr v9.4s, v9.4s, #15 - eor v11.16b, v11.16b, v9.16b - str q11, [\STATE, OFS_X1] // BRC_X1 - - shl v7.4s, v7.4s, #16 - ushr v5.4s, v5.4s, #15 - eor v7.16b, v7.16b, v5.16b - str q7, [\STATE, OFS_X2] // BRC_X2 + TRN1_8H vBRCX0, %((14 + \ROUND_NUM) % 16), 20 + TRN1_8H vBRCX1, 21, %((11 + \ROUND_NUM) % 16) + TRN1_8H vBRCX2, 22, %((7 + \ROUND_NUM) % 16) .if \OUTPUT_X3 == 1 - shl v2.4s, v2.4s, #16 - ushr v0.4s, v0.4s, #15 - eor v\X3\().16b, v2.16b, v0.16b // BRC_X3 + TRN1_8H v\X3\(), 29, %((2 + \ROUND_NUM) % 16) // BRC_X3 .endif .endm -.macro nonlin_fun4 STATE, OUTPUT_W=0, V_W +.macro nonlin_fun4 OUTPUT_W=0, V_W .if \OUTPUT_W == 1 - add xTMP, \STATE, OFS_X0 - ld1 {\V_W\().4s}, [xTMP] - add xTMP, \STATE, OFS_R1 - ld1 {v25.4s}, [xTMP] - eor \V_W\().16b, \V_W\().16b, v25.16b - add xTMP, \STATE, OFS_R2 - ld1 {v25.4s}, [xTMP] - add \V_W\().4s, \V_W\().4s, v25.4s // W = (BRC_X0 ^ F_R1) + F_R2 + eor \V_W\().16b, vBRCX0.16b, vFR1.16b + add \V_W\().4s, \V_W\().4s, vFR2.4s // W = (BRC_X0 ^ F_R1) + F_R2 .endif - - add xTMP, \STATE, OFS_R1 - ld1 {v1.4s}, [xTMP] - add xTMP, \STATE, OFS_X1 - ld1 {v2.4s}, [xTMP] - add v1.4s, v1.4s, v2.4s // W1 = F_R1 + BRC_X1 - - add xTMP, \STATE, OFS_R2 - ld1 {v3.4s}, [xTMP] - add xTMP, \STATE, OFS_X2 - ld1 {v2.4s}, [xTMP] - eor v2.16b, v3.16b, v2.16b // W2 = F_R2 + BRC_X2 - - mov v3.16b, v1.16b - mov v4.16b, v2.16b - shl v1.4s, v1.4s, #16 - shl v2.4s, v2.4s, #16 - ushr v3.4s, v3.4s, #16 - ushr v4.4s, v4.4s, #16 - eor v1.16b, v1.16b, v4.16b // W1L || W2H - eor v2.16b, v2.16b, v3.16b // W2L || W1H - - rot_mod32 v3, v1, 2 - rot_mod32 v4, v1, 10 - rot_mod32 v5, v1, 18 - rot_mod32 v6, v1, 24 - eor v1.16b, v1.16b, v3.16b - eor v1.16b, v1.16b, v4.16b - eor v1.16b, v1.16b, v5.16b - eor v1.16b, v1.16b, v6.16b // v1 = U = L1(P) - - rot_mod32 v3, v2, 8 - rot_mod32 v4, v2, 14 - rot_mod32 v5, v2, 22 - rot_mod32 v6, v2, 30 - eor v2.16b, v2.16b, v3.16b - eor v2.16b, v2.16b, v4.16b - eor v2.16b, v2.16b, v5.16b - eor v2.16b, v2.16b, v6.16b // v2 = V = L2(Q) - - // shuffle U and V to have all S0 lookups in v1 and all S1 lookups in v2 + add v23.4s, vFR1.4s, vBRCX1.4s // W1 = F_R1 + BRC_X1 + eor v24.16b, vFR2.16b, vBRCX2.16b // W2 = F_R2 ^ BRC_X2 + + ushr v25.4s, v23.4s, #16 + ushr v26.4s, v24.4s, #16 + trn1 v21.8h, v25.8h, v24.8h // W1L || W2H + trn1 v20.8h, v26.8h, v23.8h // W2L || W1H + + ushr v26.4s, v21.4s, 32-8 + ushr v23.4s, v20.4s, 32-10 + ushr v27.4s, v21.4s, 32-14 + ushr v22.4s, v20.4s, 32-2 + ushr v28.4s, v21.4s, 32-22 + ushr v24.4s, v20.4s, 32-18 + ushr v31.4s, v21.4s, 32-30 + ushr v25.4s, v20.4s, 32-24 + sli v26.4s, v21.4s, 8 + sli v23.4s, v20.4s, 10 + sli v27.4s, v21.4s, 14 + sli v22.4s, v20.4s, 2 + sli v28.4s, v21.4s, 22 + sli v24.4s, v20.4s, 18 + sli v31.4s, v21.4s, 30 + sli v25.4s, v20.4s, 24 + eor v26.16b, v26.16b, v27.16b + eor v22.16b, v22.16b, v23.16b + eor v28.16b, v28.16b, v31.16b + eor v24.16b, v24.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v20.16b, v20.16b, v22.16b + eor v21.16b, v21.16b, v28.16b // v21 = V = L2(Q) + eor v20.16b, v20.16b, v24.16b // v20 = U = L1(P) + + // shuffle U and V to have all S0 lookups in v20 and all S1 lookups in v21 // Compress all S0 and S1 input values in each register - ushr v3.8h, v1.8h, #8 - shl v3.8h, v3.8h, #8 - ushr v4.8h, v2.8h, #8 - - shl v6.8h, v1.8h, #8 - shl v7.8h, v2.8h, #8 - ushr v7.8h, v7.8h, 
#8 - - eor v1.16b, v3.16b, v4.16b // All S0 input values - eor v2.16b, v6.16b, v7.16b // All S1 input values + trn1 v23.16b, v21.16b, v20.16b + trn2 v22.16b, v21.16b, v20.16b // Compute S0 and S1 values - S0_compute_NEON v1, v3, v4 - S1_compute_NEON v2, v3, v4, v5 - - // Need to shuffle back v1 & v2 before storing output + S0_compute_NEON v22, v20, v21 + S1_compute_NEON v23, v20, v21, v31 + // Need to shuffle back v20 & v21 before storing output // (revert what was done before S0 and S1 computations) - shl v3.8h, v1.8h, #8 - ushr v1.8h, v1.8h, #8 - shl v1.8h, v1.8h, #8 - - ushr v4.8h, v2.8h, #8 - shl v2.8h, v2.8h, #8 - ushr v2.8h, v2.8h, #8 - - eor v1.16b, v1.16b, v4.16b - eor v2.16b, v2.16b, v3.16b - - add xTMP, \STATE, OFS_R1 - st1 {v1.16b}, [xTMP] - add xTMP, \STATE, OFS_R2 - st1 {v2.16b}, [xTMP] + trn1 vFR2.16b, v23.16b, v22.16b + trn2 vFR1.16b, v23.16b, v22.16b .endm // add_mod31() // add two 32-bit args and reduce mod (2^31-1) -.macro add_mod31 V_1, V_2, vTMP - add \V_1\().4s, \V_1\().4s, \V_2\().4s - ushr \vTMP\().4s, \V_1\().4s, #31 - shl \V_1\().4s, \V_1\().4s, #1 - ushr \V_1\().4s, \V_1\().4s, #1 - add \V_1\().4s, \V_1\().4s, \vTMP\().4s +.macro add_mod31 d, n, TMP + add v\d\().4s, v\d\().4s, v\n\().4s + ushr v\TMP\().4s, v\d\().4s, #31 + bic v\d\().4s, #0x80, LSL #24 + add v\d\().4s, v\d\().4s, v\TMP\().4s .endm // rot_mod31() // rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1) -.macro rot_mod31 ARG, BITS, TMP - shl \TMP\().4s, \ARG\().4s, \BITS - ushr \ARG\().4s, \ARG\().4s, (31 - \BITS) - eor \ARG\().16b, \TMP\().16b, \ARG\().16b - shl \ARG\().4s, \ARG\().4s, #1 - ushr \ARG\().4s, \ARG\().4s, #1 +.macro rot_mod31 ARG, DST, BITS + ushr v\DST\().4s, v\ARG\().4s, (31 - \BITS) + sli v\DST\().4s, v\ARG\().4s, \BITS + bic v\DST\().4s, #0x80, LSL #24 .endm -.macro lfsr_updt4 STATE, IS_NUM=0, ROUND_NUM, TMP, V_W - // - // v1 = LFSR_S0 - // v4 = LFSR_S4 - // v10 = LFSR_S10 - // v13 = LFSR_S13 - // v15 = LFSR_S15 - // -.if \IS_NUM == 1 - add xTMP, \STATE, (( 0 + \ROUND_NUM) % 16)*16 - ld1 {v1.16b}, [xTMP] - add xTMP, \STATE, (( 4 + \ROUND_NUM) % 16)*16 - ld1 {v4.16b}, [xTMP] - add xTMP, \STATE, ((10 + \ROUND_NUM) % 16)*16 - ld1 {v10.16b}, [xTMP] - add xTMP, \STATE, ((13 + \ROUND_NUM) % 16)*16 - ld1 {v13.16b}, [xTMP] - add xTMP, \STATE, ((15 + \ROUND_NUM) % 16)*16 - ld1 {v15.16b}, [xTMP] -.else - load_lfsr \STATE, \ROUND_NUM, 0, \TMP, v1 - load_lfsr \STATE, \ROUND_NUM, 4, \TMP, v4 - load_lfsr \STATE, \ROUND_NUM, 10, \TMP, v10 - load_lfsr \STATE, \ROUND_NUM, 13, \TMP, v13 - load_lfsr \STATE, \ROUND_NUM, 15, \TMP, v15 -.endif - +// TMP vreg: v20-v26, v31 +.macro lfsr_updt4 ROUND_NUM, W_ADD=0, V_W // Calculate LFSR feedback - add_mod31 \V_W, v1, v31 - rot_mod31 v1, 8, v31 - add_mod31 \V_W, v1, v31 - rot_mod31 v4, 20, v31 - add_mod31 \V_W, v4, v31 - rot_mod31 v10, 21, v31 - add_mod31 \V_W, v10, v31 - rot_mod31 v13, 17, v31 - add_mod31 \V_W, v13, v31 - rot_mod31 v15, 15, v31 - add_mod31 \V_W, v15, v31 - -.if \IS_NUM == 1 - add xTMP, \STATE, ((0 + \ROUND_NUM) % 16)*16 - st1 {\V_W\().16b}, [xTMP] -.else - store_lfsr \STATE, \ROUND_NUM, 0, \TMP, \V_W + // s0 = w>>1 + 2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0 mod (2^31-1); + + rot_mod31 %((0 + \ROUND_NUM) % 16), 20, 8 + rot_mod31 %((4 + \ROUND_NUM) % 16), 21, 20 + rot_mod31 %((10 + \ROUND_NUM) % 16), 22, 21 + rot_mod31 %((13 + \ROUND_NUM) % 16), 23, 17 + rot_mod31 %((15 + \ROUND_NUM) % 16), 24, 15 + add_mod31 20, 21, 25 + add_mod31 22, 23, 26 + add_mod31 %((0 + \ROUND_NUM) % 16), 24, 31 + add_mod31 20, 22, 25 + add_mod31 
%((0 + \ROUND_NUM) % 16), 20, 31 + +.if \W_ADD == 1 + add_mod31 %((0 + \ROUND_NUM) % 16), \V_W, 26 .endif - // LFSR_S16 = (LFSR_S15++) = v1 .endm .macro load_key_iv i, j, pKe, pIv, off @@ -666,17 +570,8 @@ declare_register xTMP x23 FUNC_SAVE - // Zero out R1, R2(only lower 128bits) - eor v0.16b, v0.16b, v0.16b -.set I, 0 -.rept 2 - str q0, [pState, OFS_R1 + I*16] -.set I, (I + 1) -.endr - .if \KEY_SIZE == 128 - - // Load key and IVs + // Load key and IVs to v16-v23 .set off, 0 .set i, 16 .set j, 20 @@ -689,37 +584,37 @@ declare_register xTMP x23 // Initialize all LFSR registers .set off, 0 +.set idx_off, 0 .rept 4 adrp xTMP, shuf_mask_key - ldr q4, [xTMP, #:lo12:shuf_mask_key + off] + ldr q24, [xTMP, #:lo12:shuf_mask_key + off] adrp xTMP, shuf_mask_iv - ldr q5, [xTMP, #:lo12:shuf_mask_iv + off] + ldr q25, [xTMP, #:lo12:shuf_mask_iv + off] adrp xTMP, Ek_d - ldr q6, [xTMP, #:lo12:Ek_d + off] + ldr q26, [xTMP, #:lo12:Ek_d + off] -.set idx, 0 +.set idx, idx_off .set i, 16 .set j, 20 .rept 4 - INIT_LFSR_128 %i, %j, v4, v5, v6, %idx, v7 + INIT_LFSR_128 %i, %j, v24, v25, v26, %idx, v27 .set idx, (idx + 1) .set i, (i + 1) .set j, (j + 1) .endr +.set id0, idx_off +.set id1, (id0 + 1) +.set id2, (id1 + 1) +.set id3, (id2 + 1) + // store 4xLFSR registers in memory (reordering first, // so all SX registers are together) - TRANSPOSE4_U32 v0, v1, v2, v3, v4, v5, v6, v7 - -.set i, 0 -.rept 4 - str_vi %i, pState, off -.set i, (i+1) -.endr + TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 .set off, (off + 16) +.set idx_off, (idx_off + 4) .endr - .else // KEY_SIZE == 256 // Get pointer to constants (depending on tag size, this will point at // constants for encryption, authentication with 4-byte, 8-byte or 16-byte tags) @@ -753,40 +648,50 @@ declare_register xTMP x23 .endr // Read, transpose and store, so all S_X from the 4 packets are in the same register -.set off, 0 +.set idx_off, 0 .rept 4 -.irp idx,0,1,2,3 - ldr q\idx, [pState, 16*\idx+off] + +.set idx, idx_off +.rept 4 + LDR_Q %idx, pState, %(16*idx) +.set idx, (idx + 1) .endr - TRANSPOSE4_U32 v0, v1, v2, v3, v4, v5, v6, v7 +.set id0, idx_off +.set id1, (id0 + 1) +.set id2, (id1 + 1) +.set id3, (id2 + 1) + TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 -.irp idx,0,1,2,3 - str q\idx, [pState, 16*\idx+off] +.set idx_off, (idx_off + 4) .endr -.set off, (off + 64) -.endr .endif // KEY_SIZE == 256 - mov x9, 0 -1: - cmp x9, 32 - b.eq 2f + // Zero out R1, R2(only lower 128bits) + eor vFR1.16b, vFR1.16b, vFR1.16b + eor vFR2.16b, vFR2.16b, vFR2.16b + +.set init_round_num, 0 +.rept 32 // Shift LFSR 32-times, update state variables - bits_reorg4 pState, 0, x9, x10 - nonlin_fun4 pState, 1, v0 - ushr v0.4s, v0.4s, #1 // Shift out LSB of W - lfsr_updt4 pState, 0, x9, x10, v0 // W (v0) used in LFSR update - not set to zero - add x9, x9, #1 - b 1b + bits_reorg4 init_round_num, 0, no_reg + nonlin_fun4 1, v29 + ushr v29.4s, v29.4s, #1 // Shift out LSB of W + lfsr_updt4 init_round_num, 1, 29 // W (v0) used in LFSR update - not set to zero +.set init_round_num, (init_round_num + 1) +.endr 2: // And once more, initial round from keygen phase = 33 times - bits_reorg4 pState, 1, 0, no_reg - nonlin_fun4 pState, 0, no_reg - eor v0.16b, v0.16b, v0.16b - lfsr_updt4 pState, 1, 0, no_reg, v0 + bits_reorg4 0, 0, no_reg + nonlin_fun4 0, no_reg + lfsr_updt4 0, 0, no_reg + + STORE_LFSR_LIST pState, 0 + + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] FUNC_RESTORE @@ -926,109 +831,68 @@ end_load: .set N, 1 .set round, (\INITIAL_ROUND + N) .rept \NROUNDS - 
bits_reorg4 pState, 1, round, no_reg, 1, %(N+15) - nonlin_fun4 pState, 1, v0 + bits_reorg4 round, 1, %(N+15) + nonlin_fun4 1, v29 // OFS_XR XOR W (v0) - eor_vi %(N+15), %(N+15), v0 - eor v0.16b, v0.16b, v0.16b - lfsr_updt4 pState, 1, round, no_reg, v0 + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 round, 0, no_reg .set N, (N + 1) .set round, (round + 1) .endr - TRANSPOSE4_U32 v16, v17, v18, v19, v20, v21, v22, v23 + TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 // XOR Input buffer with keystream in rounds of 16B - ldp x20, x21, [pIn, #0] - ldp x22, x23, [pIn, #16] - -.if \LAST_CALL == 4 - ldr x24, [x20] - add x24, x24, \OFFSET - umov w25, v30.h[0] - simd_load_16 v7, x24, x25 - ldr x24, [x21] - add x24, x24, \OFFSET - umov w25, v30.h[1] - simd_load_16 v8, x24, x25 - ldr x24, [x22] - add x24, x24, \OFFSET - umov w25, v30.h[2] - simd_load_16 v9, x24, x25 - ldr x24, [x23] - add x24, x24, \OFFSET - umov w25, v30.h[3] - simd_load_16 v10, x24, x25 -.else - ldr q7, [x20, \OFFSET] - ldr q8, [x21, \OFFSET] - ldr q9, [x22, \OFFSET] - ldr q10, [x23, \OFFSET] -.endif rev32 v16.16b, v16.16b + ldr q24, [x20, \OFFSET] rev32 v17.16b, v17.16b + ldr q25, [x21, \OFFSET] rev32 v18.16b, v18.16b + ldr q26, [x22, \OFFSET] rev32 v19.16b, v19.16b + ldr q31, [x19, \OFFSET] - eor v16.16b, v16.16b, v7.16b - eor v17.16b, v17.16b, v8.16b - eor v18.16b, v18.16b, v9.16b - eor v19.16b, v19.16b, v10.16b - - ldp x20, x21, [pOut, #0] - ldp x22, x23, [pOut, #16] + eor v16.16b, v16.16b, v24.16b + eor v17.16b, v17.16b, v25.16b + eor v18.16b, v18.16b, v26.16b + eor v19.16b, v19.16b, v31.16b .if \LAST_CALL == 1 umov w25, v30.h[0] - simd_store_16 x20, v16, x25, \OFFSET + simd_store_16 x26, v16, x25, \OFFSET umov w25, v30.h[1] - simd_store_16 x21, v17, x25, \OFFSET + simd_store_16 x27, v17, x25, \OFFSET umov w25, v30.h[2] - simd_store_16 x22, v18, x25, \OFFSET + simd_store_16 x28, v18, x25, \OFFSET umov w25, v30.h[3] - simd_store_16 x23, v19, x25, \OFFSET + simd_store_16 x29, v19, x25, \OFFSET .else - str q16, [x20, \OFFSET] - str q17, [x21, \OFFSET] - str q18, [x22, \OFFSET] - str q19, [x23, \OFFSET] + str q16, [x26, \OFFSET] + str q17, [x27, \OFFSET] + str q18, [x28, \OFFSET] + str q19, [x29, \OFFSET] .endif .endm -// This macro reorder the LFSR registers -// after N rounds (1 <= N <= 15), since the registers -// are shifted every round -// -// The macro clobbers v0-15 -// -.macro load_lfsr_from_state_to_vi i, STATE - add xTMP, \STATE, 16*\i - ld1 {v\i\().16b}, [xTMP] -.endm - -.macro store_lfsr_from_vj_to_state i, j, STATE - add xTMP, \STATE, 16*\i - st1 {v\j\().16b}, [xTMP] -.endm - -.macro REORDER_LFSR STATE, NUM_ROUNDS -.if \NUM_ROUNDS != 16 -.set i, 0 +.macro STORE_LFSR_LIST STATE, NUM_ROUNDS +.set round_num, \NUM_ROUNDS +.set offset, 0 .rept 16 - load_lfsr_from_state_to_vi %i, \STATE -.set i, (i + 1) + STR_Q %((round_num) % 16), \STATE\(), offset +.set round_num, (round_num + 1) +.set offset, (offset + 16) .endr +.endm -.set i, 0 -.set j, \NUM_ROUNDS +.macro LOAD_LFSR_LIST STATE, NUM_ROUNDS +.set round_num, \NUM_ROUNDS +.set offset, 0 .rept 16 - store_lfsr_from_vj_to_state %i, %j, \STATE -.set i, (i + 1) -.set j, ((j + 1) % 16) + LDR_Q %((round_num) % 16), \STATE\(), offset +.set round_num, (round_num + 1) +.set offset, (offset + 16) .endr -.endif // NUM_ROUNDS != 16 - .endm .macro store_vi_to_keyaddr i, addr1, addr2, addr3, addr4 @@ -1044,15 +908,18 @@ end_load: FUNC_SAVE + ldr qFR1, [pState, #OFS_R1] + ldr qFR2, [pState, #OFS_R2] + LOAD_LFSR_LIST pState, 0 + // Generate N*4B of keystream in N rounds .set N, 1 .rept 
\NUM_ROUNDS - bits_reorg4 pState, 1, N, no_reg, 1, %(N+15) - nonlin_fun4 pState, 1, v0 + bits_reorg4 N, 1, %(N+15) + nonlin_fun4 1, v29 // OFS_XR XOR W (v0) - eor_vi %(N+15), %(N+15), v0 - eor v0.16b, v0.16b, v0.16b - lfsr_updt4 pState, 1, N, no_reg, v0 + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 N, 0, no_reg .set N, (N + 1) .endr @@ -1060,7 +927,7 @@ end_load: ldp x12, x13, [pKS, 16] .if \NUM_ROUNDS == 4 - TRANSPOSE4_U32 v16, v17, v18, v19, v20, v21, v22, v23 + TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 st1 {v16.16b}, [x10] st1 {v17.16b}, [x11] st1 {v18.16b}, [x12] @@ -1077,12 +944,15 @@ end_load: .endr .endif + STORE_LFSR_LIST pState, \NUM_ROUNDS + + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + #ifdef SAFE_DATA eor v0.16b, v0.16b, v0.16b #endif - REORDER_LFSR pState, \NUM_ROUNDS - FUNC_RESTORE .endm @@ -1375,6 +1245,12 @@ START_FUNC(ZUC_CIPHER_4) FUNC_SAVE + ldp x20, x21, [pIn, #0] + ldp x22, x19, [pIn, #16] + + ldp x26, x27, [pOut, #0] + ldp x28, x29, [pOut, #16] + // Convert all lengths from UINT16_MAX (indicating that lane is not valid) to min length dup v0.8h, min_len ld1 {v1.4h}, [lengths] @@ -1413,6 +1289,11 @@ START_FUNC(ZUC_CIPHER_4) eor buf_idx, buf_idx, buf_idx + LOAD_LFSR_LIST pState, 0 + + ldr qFR1, [pState, #OFS_R1] + ldr qFR2, [pState, #OFS_R2] + loop_cipher64: cmp min_len, #64 b.lt exit_loop_cipher64 @@ -1426,12 +1307,11 @@ loop_cipher64: .set round_off, (round_off + 4) .endr b loop_cipher64 - exit_loop_cipher64: // Check if there are more bytes left to encrypt add w6, min_len, 3 lsr w6, w6, #2 // number of rounds left (round up length to nearest multiple of 4B) - cbz w6, exit_final_rounds + cbz w6, store_lfsr_and_exit cmp w6, 8 b.eq _num_final_rounds_is_8 @@ -1475,7 +1355,7 @@ _final_rounds_is_1_3: .irp I,1,2,3,4 _num_final_rounds_is_\I: CIPHERNx4B_4 \I, 0, buf_idx, 1 - REORDER_LFSR pState, \I + STORE_LFSR_LIST pState, \I add buf_idx, buf_idx, \I * 4 b exit_final_rounds .endr @@ -1486,7 +1366,7 @@ _num_final_rounds_is_\I: add buf_idx, buf_idx, #16 CIPHERNx4B_4 (\I-4), 4, buf_idx, 1 add buf_idx, buf_idx, ((\I-4)*4) - REORDER_LFSR pState, \I + STORE_LFSR_LIST pState, \I b exit_final_rounds .endr @@ -1498,7 +1378,7 @@ _num_final_rounds_is_\I: add buf_idx, buf_idx, #16 CIPHERNx4B_4 (\I-8), 8, buf_idx, 1 add buf_idx, buf_idx, ((\I-8)*4) - REORDER_LFSR pState, \I + STORE_LFSR_LIST pState, \I b exit_final_rounds .endr @@ -1512,11 +1392,12 @@ _num_final_rounds_is_\I: add buf_idx, buf_idx, #16 CIPHERNx4B_4 (\I-12), 12, buf_idx, 1 add buf_idx, buf_idx, ((\I-12)*4) - REORDER_LFSR pState, \I + STORE_LFSR_LIST pState, \I b exit_final_rounds .endr - +store_lfsr_and_exit: + STORE_LFSR_LIST pState, 0 exit_final_rounds: // update in/out pointers dup v0.2d, buf_idx @@ -1529,6 +1410,9 @@ exit_final_rounds: add v2.2d, v2.2d, v0.2d st1 {v1.2d, v2.2d}, [pOut] + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + #ifdef SAFE_DATA eor v0.16b, v0.16b, v0.16b #endif -- GitLab
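For reference, the nonlinear function that the reordered nonlin_fun4
implements can be modelled per 32-bit lane in scalar C. This is a sketch up
to, but not including, the S0/S1 box lookups that the macro performs
afterwards; the names are mine, and rotl32 corresponds to one fused USHR/SLI
pair in the assembly.

#include <stdint.h>

static uint32_t rotl32(uint32_t x, unsigned r)
{
        return (x << r) | (x >> (32 - r));   /* one USHR + SLI pair per term */
}

/* ZUC linear transforms, matching the eight rotate amounts in nonlin_fun4 */
static uint32_t zuc_l1(uint32_t x)
{
        return x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24);
}

static uint32_t zuc_l2(uint32_t x)
{
        return x ^ rotl32(x, 8) ^ rotl32(x, 14) ^ rotl32(x, 22) ^ rotl32(x, 30);
}

/* one lane of nonlin_fun4 up to the S-box stage: returns W, leaves U/V in
 * r1/r2; the real macro then maps U and V through S0/S1 to form F_R1/F_R2 */
static uint32_t nonlin_fun(uint32_t x0, uint32_t x1, uint32_t x2,
                           uint32_t *r1, uint32_t *r2)
{
        uint32_t w  = (x0 ^ *r1) + *r2;        /* W  = (BRC_X0 ^ F_R1) + F_R2 */
        uint32_t w1 = *r1 + x1;                /* W1 = F_R1 + BRC_X1          */
        uint32_t w2 = *r2 ^ x2;                /* W2 = F_R2 ^ BRC_X2          */
        uint32_t p  = (w1 << 16) | (w2 >> 16); /* W1L || W2H (one TRN1)       */
        uint32_t q  = (w2 << 16) | (w1 >> 16); /* W2L || W1H (one TRN1)       */

        *r1 = zuc_l1(p);                       /* U = L1(P) */
        *r2 = zuc_l2(q);                       /* V = L2(Q) */
        return w;
}

Keeping F_R1/F_R2 pinned in v27/v28 (vFR1/vFR2), as the patch does, lets this
whole function run without the OFS_R1/OFS_R2 loads and stores the old code
needed, which is where much of the saving from item 1 comes from.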