diff --git a/lib/aarch64/mb_mgr_aarch64.c b/lib/aarch64/mb_mgr_aarch64.c index 935cba1e59ea360dde0777d6e80110c03f7b5f6a..99b89eed590fe2be308ecffa48c3ba3b45610bc5 100644 --- a/lib/aarch64/mb_mgr_aarch64.c +++ b/lib/aarch64/mb_mgr_aarch64.c @@ -107,9 +107,11 @@ init_mb_mgr_aarch64_internal(IMB_MGR *state, const int reset_mgrs) state->eea3_4_buffer = zuc_eea3_4_buffer_aarch64; state->eea3_n_buffer = zuc_eea3_n_buffer_aarch64; state->zuc256_eea3_1_buffer = zuc256_eea3_1_buffer_aarch64; + state->zuc256_eea3_n_buffer = zuc256_eea3_n_buffer_aarch64; state->eia3_1_buffer = zuc_eia3_1_buffer_aarch64; state->eia3_n_buffer = zuc_eia3_n_buffer_aarch64; state->zuc256_eia3_1_buffer = zuc256_eia3_1_buffer_aarch64; + state->zuc256_eia3_n_buffer = zuc256_eia3_n_buffer_aarch64; state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64; state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64; diff --git a/lib/aarch64/mb_mgr_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_aarch64_no_aesni.c index ec64f3541b0c2890bcb914795f1cc3046ff3bb4f..40eaf5ff5997f5571892914699a2f7c2109a51d3 100644 --- a/lib/aarch64/mb_mgr_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_aarch64_no_aesni.c @@ -96,9 +96,11 @@ init_mb_mgr_aarch64_no_aesni_internal(IMB_MGR *state, const int reset_mgrs) state->eea3_4_buffer = zuc_eea3_4_buffer_aarch64_no_aesni; state->eea3_n_buffer = zuc_eea3_n_buffer_aarch64_no_aesni; state->zuc256_eea3_1_buffer = zuc256_eea3_1_buffer_aarch64_no_aesni; + state->zuc256_eea3_n_buffer = zuc256_eea3_n_buffer_aarch64_no_aesni; state->eia3_1_buffer = zuc_eia3_1_buffer_aarch64_no_aesni; state->eia3_n_buffer = zuc_eia3_n_buffer_aarch64_no_aesni; state->zuc256_eia3_1_buffer = zuc256_eia3_1_buffer_aarch64_no_aesni; + state->zuc256_eia3_n_buffer = zuc256_eia3_n_buffer_aarch64_no_aesni; state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_no_aesni; state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_no_aesni; diff --git a/lib/aarch64/mb_mgr_aarch64_sve256.c b/lib/aarch64/mb_mgr_aarch64_sve256.c index b6c425a66a9b34b23b9050e22a8595b3ccc57d18..e8bced59e09c1ecb437e04c270f87dcd8ea0e69e 100644 --- a/lib/aarch64/mb_mgr_aarch64_sve256.c +++ b/lib/aarch64/mb_mgr_aarch64_sve256.c @@ -107,9 +107,11 @@ init_mb_mgr_aarch64_sve256_internal(IMB_MGR *state, const int reset_mgrs) state->eea3_4_buffer = zuc_eea3_4_buffer_aarch64; state->eea3_n_buffer = zuc_eea3_n_buffer_aarch64; state->zuc256_eea3_1_buffer = zuc256_eea3_1_buffer_aarch64; + state->zuc256_eea3_n_buffer = zuc256_eea3_n_buffer_aarch64; state->eia3_1_buffer = zuc_eia3_1_buffer_aarch64; state->eia3_n_buffer = zuc_eia3_n_buffer_aarch64; state->zuc256_eia3_1_buffer = zuc256_eia3_1_buffer_aarch64; + state->zuc256_eia3_n_buffer = zuc256_eia3_n_buffer_aarch64; state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_aarch64_sve256; state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_aarch64_sve256; diff --git a/lib/aarch64/mb_mgr_code_aarch64.h b/lib/aarch64/mb_mgr_code_aarch64.h index 1723b1b6adedf8a1b3afdac4bbd7460529b3f5a8..4db3491caf5dede56a8faa66540d23455c4b87a6 100644 --- a/lib/aarch64/mb_mgr_code_aarch64.h +++ b/lib/aarch64/mb_mgr_code_aarch64.h @@ -212,11 +212,12 @@ submit_zuc_eea3_job(IMB_MGR *state, IMB_JOB *job) const uint8_t *pSrc = job->src; uint8_t *pDst = job->dst; uint32_t byteLength = job->msg_len_to_cipher_in_bytes; + uint64_t ivLength = job->iv_len_in_bytes; if (16 == job->key_len_in_bytes) { IMB_ZUC_EEA3_1_BUFFER(state, pKeys, pIvs, pSrc, pDst, byteLength); } else { - IMB_ZUC256_EEA3_1_BUFFER(state, pKeys, pIvs, pSrc, pDst, byteLength); + IMB_ZUC256_EEA3_1_BUFFER(state, pKeys, pIvs, ivLength, pSrc, pDst, byteLength); } job->status |= IMB_STATUS_COMPLETED_CIPHER; @@ -315,11 +316,14 @@ submit_zuc_eia3_job(IMB_MGR *state, IMB_JOB *job) const uint8_t *pSrc = job->src; uint32_t bitLength = job->msg_len_to_hash_in_bits; uint32_t *pMacI = (uint32_t *)job->auth_tag_output; + uint64_t ivLength = job->iv_len_in_bytes; + uint64_t tagLength = job->auth_tag_output_len_in_bytes; if (IMB_AUTH_ZUC_EIA3_BITLEN == job->hash_alg) { IMB_ZUC_EIA3_1_BUFFER(state, pKeys, pIvs, pSrc, bitLength, pMacI); } else { - IMB_ZUC256_EIA3_1_BUFFER(state, pKeys, pIvs, pSrc, bitLength, pMacI); + IMB_ZUC256_EIA3_1_BUFFER(state, pKeys, pIvs, ivLength, pSrc, + bitLength, pMacI, tagLength); } job->status |= IMB_STATUS_COMPLETED_AUTH; diff --git a/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64.c b/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64.c index 66787469a66fa882a3ba25e188d82352ff0137d6..4f4358e04326a52965b7ca01299997f41e5da00a 100644 --- a/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64.c +++ b/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64.c @@ -1,6 +1,38 @@ +/********************************************************************** + Copyright(c) 2022-2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + #include "include/ipsec_ooo_mgr.h" #include "include/zuc_internal.h" #include +#ifdef SAFE_PARAM +#include "error.h" +#endif #ifndef SUBMIT_JOB_ZUC128_EEA3 #define SUBMIT_JOB_ZUC128_EEA3 submit_job_zuc_eea3_aarch64_common @@ -156,7 +188,7 @@ static IMB_JOB *zuc_mb_mgr_free_eia3_job(MB_MGR_ZUC_OOO *state) ret = state->job_in_lane[i]; state->job_in_lane[i] = NULL; ret->status |= IMB_STATUS_COMPLETED_AUTH; - state->lens[i] = 0xffff; + state->lens[i] = 0xffffffff; state->unused_lanes = state->unused_lanes << 8; state->unused_lanes |= i; state->unused_lane_bitmask |= (1 << i); @@ -175,6 +207,40 @@ static IMB_JOB *zuc_mb_mgr_submit_eea3_job(MB_MGR_ZUC_OOO *state, IMB_JOB *job, ZUC_TYPE zuc) { +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->enc_keys == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + if (job->iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->dst == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return NULL; + } + if ((job->msg_len_to_cipher_in_bytes == 0) || + (job->msg_len_to_cipher_in_bytes > ZUC_MAX_BYTELEN)) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return NULL; + } + if (zuc == ZUC_256) { + if (job->iv_len_in_bytes != 23 && job->iv_len_in_bytes != 25) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return NULL; + } + } +#endif IMB_JOB *ret = NULL; uint32_t state_tmp[MAX_ZUC_STATE_SZ] = {0}; uint32_t min_len = state->lens[0]; @@ -231,7 +297,7 @@ static IMB_JOB *zuc_mb_mgr_submit_eea3_job(MB_MGR_ZUC_OOO *state, return ret; } -static IMB_JOB *zuc_mb_mgr_flash_eea3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE zuc) +static IMB_JOB *zuc_mb_mgr_flush_eea3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE zuc) { IMB_JOB *ret = NULL; uint32_t state_tmp[MAX_ZUC_STATE_SZ] = {0}; @@ -241,10 +307,10 @@ static IMB_JOB *zuc_mb_mgr_flash_eea3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE zuc) if(state->unused_lanes >> 39) return ret; - // Set length = 0xFFFF in NULL jobs + // Set length = 0xFFFFFFFF in NULL jobs for (i = 0; i < ZUC_MB_MAX_LANES_SIMD; i++) { if(state->job_in_lane[i] == NULL) - state->lens[i] = 0xffff; + state->lens[i] = 0xffffffff; } ret = zuc_mb_mgr_free_eea3_job(state); @@ -328,12 +394,49 @@ static IMB_JOB *zuc_mb_mgr_flash_eea3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE zuc) static IMB_JOB *zuc_mb_mgr_submit_eia3_job(MB_MGR_ZUC_OOO *state, IMB_JOB *job, - ZUC_TYPE key) + ZUC_TYPE zuc) { + +#ifdef SAFE_PARAM + /* reset error status */ + if (imb_errno != 0) + imb_set_errno(NULL, 0); + + if (job->u.ZUC_EIA3._key == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_EXP_KEY); + return NULL; + } + + if (job->src == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return NULL; + } + if (job->auth_tag_output == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return NULL; + } + if ((job->msg_len_to_hash_in_bits == 0) || + (job->msg_len_to_hash_in_bits > ZUC_MAX_BITLEN)) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return NULL; + } + if (zuc == ZUC_256) { + if (job->u.ZUC_EIA3._iv == NULL && job->u.ZUC_EIA3._iv23 == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + } else { + if (job->u.ZUC_EIA3._iv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return NULL; + } + } +#endif + IMB_JOB *ret = NULL; unsigned int i; - zuc_mb_mgr_insert_eia3_job(state, job, key); + zuc_mb_mgr_insert_eia3_job(state, job, zuc); if(state->unused_lanes != 0xff) return NULL; @@ -342,7 +445,7 @@ static IMB_JOB *zuc_mb_mgr_submit_eia3_job(MB_MGR_ZUC_OOO *state, if(ret != NULL) return ret; - if(key == ZUC_128) + if(zuc == ZUC_128) ZUC_EIA3_4_BUFFER((const void * const *)state->args.keys, (const uint8_t *)state->args.iv, (const void * const *)state->args.in, @@ -355,7 +458,8 @@ static IMB_JOB *zuc_mb_mgr_submit_eia3_job(MB_MGR_ZUC_OOO *state, (const void * const *)state->args.in, (uint32_t **)state->args.out, state->lens, - (const void * const *)state->job_in_lane); + (const void * const *)state->job_in_lane, + job->auth_tag_output_len_in_bytes); // clear all lengths(function will authenticate all buffers) for (i = 0; i < ZUC_MB_MAX_LANES_SIMD; i++) { @@ -367,7 +471,7 @@ static IMB_JOB *zuc_mb_mgr_submit_eia3_job(MB_MGR_ZUC_OOO *state, return ret; } -static IMB_JOB *zuc_mb_mgr_flash_eia3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE key) +static IMB_JOB *zuc_mb_mgr_flush_eia3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE key) { IMB_JOB *ret = NULL; uint32_t min_len, i, idx = 0; @@ -380,10 +484,10 @@ static IMB_JOB *zuc_mb_mgr_flash_eia3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE key) if (ret != NULL) return ret; - // Set length = 0xFFFF in NULL jobs + // Set length = 0xFFFFFFFF in NULL jobs for (i = 0; i < ZUC_MB_MAX_LANES_SIMD; i++) { if(state->job_in_lane[i] == NULL) - state->lens[i] = 0xffff; + state->lens[i] = 0xffffffff; } min_len = state->lens[0]; @@ -419,12 +523,13 @@ static IMB_JOB *zuc_mb_mgr_flash_eia3_job(MB_MGR_ZUC_OOO *state, ZUC_TYPE key) (const void * const *)state->args.in, (uint32_t **)state->args.out, state->lens, - (const void * const *)state->job_in_lane); + (const void * const *)state->job_in_lane, + state->job_in_lane[idx]->auth_tag_output_len_in_bytes); - // clear all lengths of valid jobs and set to FFFF to NULL jobs + // clear all lengths of valid jobs and set to FFFFFFFF to NULL jobs for (i = 0; i < ZUC_MB_MAX_LANES_SIMD; i++) { if (JOB_IS_NULL(state, i)) { - state->lens[i] = 0xffff; + state->lens[i] = 0xffffffff; } else { state->lens[i] = 0; } @@ -447,12 +552,12 @@ IMB_JOB *SUBMIT_JOB_ZUC256_EEA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job) IMB_JOB *FLUSH_JOB_ZUC128_EEA3(MB_MGR_ZUC_OOO *state) { - return zuc_mb_mgr_flash_eea3_job(state, ZUC_128); + return zuc_mb_mgr_flush_eea3_job(state, ZUC_128); } IMB_JOB *FLUSH_JOB_ZUC256_EEA3(MB_MGR_ZUC_OOO *state) { - return zuc_mb_mgr_flash_eea3_job(state, ZUC_256); + return zuc_mb_mgr_flush_eea3_job(state, ZUC_256); } IMB_JOB *SUBMIT_JOB_ZUC128_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job) @@ -467,10 +572,10 @@ IMB_JOB *SUBMIT_JOB_ZUC256_EIA3(MB_MGR_ZUC_OOO *state, IMB_JOB *job) IMB_JOB *FLUSH_JOB_ZUC128_EIA3(MB_MGR_ZUC_OOO *state) { - return zuc_mb_mgr_flash_eia3_job(state, ZUC_128); + return zuc_mb_mgr_flush_eia3_job(state, ZUC_128); } IMB_JOB *FLUSH_JOB_ZUC256_EIA3(MB_MGR_ZUC_OOO *state) { - return zuc_mb_mgr_flash_eia3_job(state, ZUC_256); + return zuc_mb_mgr_flush_eia3_job(state, ZUC_256); } diff --git a/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64_no_aesni.c b/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64_no_aesni.c index c0fb04359d45d4a9b1f975e24b38ec6015d6304a..e23be62473192dd839e625ec93ccc359380cfb5e 100644 --- a/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64_no_aesni.c +++ b/lib/aarch64/mb_mgr_zuc_submit_flush_aarch64_no_aesni.c @@ -1,3 +1,32 @@ +/********************************************************************** + Copyright(c) 2022-2023 Arm Corporation All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + * Neither the name of Arm Corporation nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************/ + #define SUBMIT_JOB_ZUC128_EEA3 submit_job_zuc_eea3_aarch64_no_aesni #define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_aarch64_no_aesni #define FLUSH_JOB_ZUC128_EEA3 flush_job_zuc_eea3_aarch64_no_aesni diff --git a/lib/aarch64/zuc_aarch64_no_aesni_top.c b/lib/aarch64/zuc_aarch64_no_aesni_top.c index dd2f6b26ba1e6a83cfe116456ba6805e7144ceae..ba1703edd03d68641b45fef44252d5f469086ff2 100644 --- a/lib/aarch64/zuc_aarch64_no_aesni_top.c +++ b/lib/aarch64/zuc_aarch64_no_aesni_top.c @@ -24,1377 +24,37 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ - -/*----------------------------------------------------------------------- -* zuc_aarch64_no_aesni_top.c -*----------------------------------------------------------------------- -* An implementation of ZUC, the core algorithm for the -* 3GPP Confidentiality and Integrity algorithms. -* -*-----------------------------------------------------------------------*/ - -#include - -#include "include/zuc_internal.h" -#include "ipsec-mb.h" -#include "clear_regs_mem_aarch64.h" -#include "include/error.h" - -#define NUM_BUFS 4 -#define KEYSTR_ROUND_LEN 16 - -static inline -void _zuc_eea3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) -{ - DECLARE_ALIGNED(ZucState_t zucState, 16); - DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16); - - const uint64_t *pIn64 = NULL; - uint64_t *pOut64 = NULL, *pKeyStream64 = NULL; - uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL; - - uint32_t numKeyStreamsPerPkt = length / KEYSTR_ROUND_LEN; - const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN; - - asm_ZucInitialization_aarch64_no_aesni(pKey, pIv, &(zucState)); - - /* Loop over all the Quad-Words in input buffer and XOR with the 64bits - * of generated keystream - */ - pOut64 = (uint64_t *) pBufferOut; - pIn64 = (const uint64_t *) pBufferIn; - - while (numKeyStreamsPerPkt--) { - /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64_no_aesni((uint32_t *) &keyStream[0], &zucState); - - /* XOR The Keystream generated with the input buffer here */ - pKeyStream64 = (uint64_t *)keyStream; - asm_XorKeyStream16B_aarch64(pIn64, pOut64, pKeyStream64); - pIn64 += 2; - pOut64 += 2; - } - - /* Check for remaining 0 to 15 bytes */ - if(numBytesLeftOver) { - /* buffer to store 16 bytes of keystream */ - DECLARE_ALIGNED(uint8_t tempSrc[KEYSTR_ROUND_LEN], 16); - DECLARE_ALIGNED(uint8_t tempDst[KEYSTR_ROUND_LEN], 16); - const uint8_t *pIn8 = (const uint8_t *) pBufferIn; - uint8_t *pOut8 = (uint8_t *) pBufferOut; - const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1; - - asm_ZucGenKeystream_aarch64_no_aesni((uint32_t *) &keyStream[0], &zucState, num4BRounds); - - /* copy the remaining bytes into temporary buffer and XOR with - * the 16 bytes of keystream. Then copy on the valid bytes back - * to the output buffer */ - memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver], numBytesLeftOver); - pKeyStream64 = (uint64_t *) &keyStream[0]; - pTemp64 = (uint64_t *) &tempSrc[0]; - pdstTemp64 = (uint64_t *) &tempDst[0]; - - asm_XorKeyStream16B_aarch64(pTemp64, pdstTemp64, pKeyStream64); - memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0], numBytesLeftOver); - -#ifdef SAFE_DATA - clear_mem(tempSrc, sizeof(tempSrc)); - clear_mem(tempDst, sizeof(tempDst)); -#endif - - } -#ifdef SAFE_DATA - /* Clear sensitive data in stack */ - clear_mem(keyStream, sizeof(keyStream)); - clear_mem(&zucState, sizeof(zucState)); -#endif -} - -static inline -void _zuc256_eea3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) -{ - DECLARE_ALIGNED(ZucState_t zucState, 16); - DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16); - - const uint64_t *pIn64 = NULL; - uint64_t *pOut64 = NULL, *pKeyStream64 = NULL; - uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL; - - uint32_t numKeyStreamsPerPkt = length/ KEYSTR_ROUND_LEN; - const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN; - - asm_Zuc256Initialization_aarch64_no_aesni(pKey, pIv, &(zucState), 2); - - /* Loop over all the Quad-Words in input buffer and XOR with the 64bits - * of generated keystream - */ - pOut64 = (uint64_t *) pBufferOut; - pIn64 = (const uint64_t *) pBufferIn; - - while (numKeyStreamsPerPkt--) { - /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64_no_aesni((uint32_t *) &keyStream[0], &zucState); - - /* XOR The Keystream generated with the input buffer here */ - pKeyStream64 = (uint64_t *)keyStream; - asm_XorKeyStream16B_aarch64(pIn64, pOut64, pKeyStream64); - pIn64 += 2; - pOut64 += 2; - } - - /* Check for remaining 0 to 15 bytes */ - if(numBytesLeftOver) { - /* buffer to store 16 bytes of keystream */ - DECLARE_ALIGNED(uint8_t tempSrc[KEYSTR_ROUND_LEN], 16); - DECLARE_ALIGNED(uint8_t tempDst[KEYSTR_ROUND_LEN], 16); - const uint8_t *pIn8 = (const uint8_t *) pBufferIn; - uint8_t *pOut8 = (uint8_t *) pBufferOut; - const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1; - - asm_ZucGenKeystream_aarch64_no_aesni((uint32_t *) &keyStream[0], &zucState, num4BRounds); - - /* copy the remaining bytes into temporary buffer and XOR with - * the 64-bytes of keystream. Then copy on the valid bytes back - * to the output buffer */ - memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver], numBytesLeftOver); - pKeyStream64 = (uint64_t *) &keyStream[0]; - pTemp64 = (uint64_t *) &tempSrc[0]; - pdstTemp64 = (uint64_t *) &tempDst[0]; - - asm_XorKeyStream16B_aarch64(pTemp64, pdstTemp64, pKeyStream64); - memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0], numBytesLeftOver); - -#ifdef SAFE_DATA - imb_clear_mem(tempSrc, sizeof(tempSrc)); - imb_clear_mem(tempDst, sizeof(tempDst)); -#endif - } - -} - -static inline -void _zuc_eea3_4_buffer_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) -{ - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - unsigned int i; - /* Calculate the minimum input packet size */ - uint32_t bytes1 = (length[0] < length[1] ? - length[0] : length[1]); - uint32_t bytes2 = (length[2] < length[3] ? - length[2] : length[3]); - /* min number of bytes */ - uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2; - uint32_t numKeyStreamsPerPkt; - uint16_t remainBytes[NUM_BUFS] = {0}; - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - /* structure to store the 4 IV's */ - DECLARE_ALIGNED(uint8_t ivs[4*32], 16); - uint32_t numBytesLeftOver = 0; - const uint8_t *pTempBufInPtr = NULL; - uint8_t *pTempBufOutPtr = NULL; - const uint64_t *pIn64[NUM_BUFS]= {NULL}; - uint64_t *pOut64[NUM_BUFS] = {NULL}; - uint64_t *pKeyStream64 = NULL; - - /* - * Calculate the number of bytes left over for each packet, - * and setup the Keys and IVs - */ - for (i = 0; i < NUM_BUFS; i++) { - remainBytes[i] = length[i]; - keys.pKeys[i] = pKey[i]; - memcpy(ivs + i*32, pIv[i], 16); - } - - asm_ZucInitialization_4_aarch64_no_aesni(&keys, ivs, &state); - - for (i = 0; i < NUM_BUFS; i++) { - pOut64[i] = (uint64_t *) pBufferOut[i]; - pIn64[i] = (const uint64_t *) pBufferIn[i]; - } - - /* Encrypt common length of all buffers */ - asm_ZucCipher_4_aarch64_no_aesni(&state, pIn64, pOut64, - remainBytes, (uint16_t) bytes); - - /* process each packet separately for the remaining bytes */ - for (i = 0; i < NUM_BUFS; i++) { - if (remainBytes[i]) { - /* need to copy the zuc state to single packet state */ - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; - - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; - - numKeyStreamsPerPkt = remainBytes[i] / KEYSTR_ROUND_LEN; - numBytesLeftOver = remainBytes[i] % KEYSTR_ROUND_LEN; - - pTempBufInPtr = pBufferIn[i]; - pTempBufOutPtr = pBufferOut[i]; - - /* update the output and input pointers here to point - * to the i'th buffers */ - pOut64[0] = (uint64_t *) &pTempBufOutPtr[length[i] - - remainBytes[i]]; - pIn64[0] = (const uint64_t *) &pTempBufInPtr[length[i] - - remainBytes[i]]; - - while (numKeyStreamsPerPkt--) { - /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64_no_aesni( - (uint32_t *) keyStr[0], - &singlePktState); - pKeyStream64 = (uint64_t *) keyStr[0]; - asm_XorKeyStream16B_aarch64(pIn64[0], pOut64[0], pKeyStream64); - pIn64[0] += 2; - pOut64[0] += 2; - } - - /* Check for remaining 0 to 15 bytes */ - if (numBytesLeftOver) { - DECLARE_ALIGNED(uint8_t tempSrc[16], 64); - DECLARE_ALIGNED(uint8_t tempDst[16], 64); - uint64_t *pTempSrc64; - uint64_t *pTempDst64; - uint32_t offset = length[i] - numBytesLeftOver; - const uint64_t num4BRounds = - ((numBytesLeftOver - 1) / 4) + 1; - - asm_ZucGenKeystream_aarch64_no_aesni((uint32_t *)&keyStr[0], - &singlePktState, - num4BRounds); - /* copy the remaining bytes into temporary - * buffer and XOR with the 16 bytes of - * keystream. Then copy on the valid bytes back - * to the output buffer */ - memcpy(&tempSrc[0], &pTempBufInPtr[offset], - numBytesLeftOver); - memset(&tempSrc[numBytesLeftOver], 0, - 16 - numBytesLeftOver); - - pKeyStream64 = (uint64_t *) &keyStr[0][0]; - pTempSrc64 = (uint64_t *) &tempSrc[0]; - pTempDst64 = (uint64_t *) &tempDst[0]; - asm_XorKeyStream16B_aarch64(pTempSrc64, pTempDst64, pKeyStream64); - - memcpy(&pTempBufOutPtr[offset], - &tempDst[0], numBytesLeftOver); -#ifdef SAFE_DATA - imb_clear_mem(tempSrc, sizeof(tempSrc)); - imb_clear_mem(tempDst, sizeof(tempDst)); -#endif - } - } - } -#ifdef SAFE_DATA - /* Clear sensitive data in stack */ - imb_clear_mem(keyStr, sizeof(keyStr)); - imb_clear_mem(&singlePktState, sizeof(singlePktState)); - imb_clear_mem(&state, sizeof(state)); - imb_clear_mem(&keys, sizeof(keys)); -#endif -} - -static inline -void _zuc_eea3_4_buffer_no_aesni(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) -{ -#ifdef SAFE_PARAM - unsigned int i; - - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - if (length == NULL) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - - /* Check for NULL pointers and lengths for each buffer */ - for (i = 0; i < NUM_BUFS; i++) { - if (pKey[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - /* Check input data is in range of supported length */ - if (length[i] < ZUC_MIN_BYTELEN || - length[i] > ZUC_MAX_BYTELEN) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - } -#endif - - _zuc_eea3_4_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, pBufferOut, length); - -#ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -static inline -void _zuc_eea3_n_buffer_no_aesni(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - void *pBufferOut[], - const uint32_t length[], - const uint32_t numBuffers) -{ - unsigned int i; - unsigned int packetCount = numBuffers; - -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - if (length == NULL) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - - /* Check for NULL pointers and lengths for each buffer */ - for (i = 0; i < numBuffers; i++) { - if (pKey[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - /* Check input data is in range of supported length */ - if (length[i] < ZUC_MIN_BYTELEN || - length[i] > ZUC_MAX_BYTELEN) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } - } -#endif - i = 0; - - while (packetCount >= NUM_BUFS) { - packetCount -= NUM_BUFS; - _zuc_eea3_4_buffer_no_aesni(&pKey[i], - &pIv[i], - &pBufferIn[i], - &pBufferOut[i], - &length[i]); - i += NUM_BUFS; - } - - while(packetCount--) { - _zuc_eea3_1_buffer_aarch64_no_aesni(pKey[i], - pIv[i], - pBufferIn[i], - pBufferOut[i], - length[i]); - i++; - } - -#ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -void zuc_eea3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) -{ -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - /* Check input data is in range of supported length */ - if (length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } -#endif - - _zuc_eea3_1_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, pBufferOut, length); - -#ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -void zuc_eea3_4_buffer_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) -{ - _zuc_eea3_4_buffer_no_aesni(pKey, pIv, pBufferIn, pBufferOut, length); -} - -void zuc_eea3_n_buffer_aarch64_no_aesni(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - void *pBufferOut[], - const uint32_t length[], - const uint32_t numBuffers) -{ - _zuc_eea3_n_buffer_no_aesni(pKey, pIv, pBufferIn, pBufferOut, length, numBuffers); -} - -void zuc256_eea3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) -{ -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - /* Check input data is in range of supported length */ - if(length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } -#endif - - _zuc256_eea3_1_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, pBufferOut, length); - -#ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -static inline uint64_t rotate_left(uint64_t u, size_t r) -{ - return (((u) << (r)) | ((u) >> (64 - (r)))); -} - -static inline uint64_t load_uint64(const void *ptr) -{ - return *((const uint64_t *)ptr); -} - -static inline -void _zuc_eia3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI, - bool key128) -{ - DECLARE_ALIGNED(ZucState_t zucState, 64); - DECLARE_ALIGNED(uint32_t keyStream[4 * 2], 64); - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - /* generate a key-stream 2 words longer than the input message */ - const uint32_t N = lengthInBits + (2 * ZUC_WORD_BITS); - uint32_t L = (N + 31) / ZUC_WORD_BITS; - uint32_t *pZuc = (uint32_t *) &keyStream[0]; - uint32_t remainingBits = lengthInBits; - uint32_t T = 0; - const uint8_t *pIn8 = (const uint8_t *) pBufferIn; - - if(key128) - asm_ZucInitialization_aarch64_no_aesni(pKey, pIv, &(zucState)); - else { - asm_Zuc256Initialization_aarch64_no_aesni(pKey, pIv, &(zucState), 4); - /* Initialize the tags with the first 4 bytes of keystream */ - asm_ZucGenKeystream4B_aarch64_no_aesni(pZuc, &zucState); - memcpy(&T, pZuc, 4); - } - - asm_ZucGenKeystream16B_aarch64_no_aesni(pZuc, &zucState); - - /* loop over the message bits */ - while (remainingBits >= keyStreamLengthInBits) { - remainingBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainingBits) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStream[4], &zucState); - else - asm_ZucGenKeystream16B_aarch64_no_aesni(&keyStream[4], &zucState); - T = asm_Eia3Round16B_aarch64_no_aesni(T, keyStream, pIn8); - /* Copy the last keystream generated to the first 16 bytes */ - memcpy(&keyStream[0], &keyStream[4], KEYSTR_ROUND_LEN); - pIn8 = &pIn8[KEYSTR_ROUND_LEN]; - } - - /* - * If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainingBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStream[4], &zucState); - T ^= asm_Eia3Remainder_aarch64_no_aesni(&keyStream[0], pIn8, remainingBits); - T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]), - remainingBits % 32); - - if(key128) { - /* save the final MAC-I result, only for 128bit authentification*/ - uint32_t keyBlock = keyStream[L - 1]; - T ^= keyBlock; - } - *pMacI = bswap4(T); - -#ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - imb_clear_mem(keyStream, sizeof(keyStream)); - imb_clear_mem(&zucState, sizeof(zucState)); -#endif -} - -static inline -void _zuc_eia3_4_buffer_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - const uint32_t lengthInBits[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS]) -{ - unsigned int i; - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - /* structure to store the 4 IV's */ - DECLARE_ALIGNED(uint8_t ivs[4*32], 16); - const uint8_t *pIn8[NUM_BUFS] = {NULL}; - uint32_t remainCommonBits; - uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_BUFS], 16) = {NULL}; - unsigned int allCommonBits; - - /* Check if all lengths are equal */ - if ((lengthInBits[0] == lengthInBits[1]) && - (lengthInBits[0] == lengthInBits[2]) && - (lengthInBits[0] == lengthInBits[3])) { - remainCommonBits = lengthInBits[0]; - allCommonBits = 1; - } else { - /* Calculate the minimum input packet size */ - uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); - uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); - - remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; - } - - for (i = 0; i < NUM_BUFS; i++) { - pIn8[i] = (const uint8_t *) pBufferIn[i]; - pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; - keys.pKeys[i] = pKey[i]; - memcpy(ivs + i*32, pIv[i], 16); - } - - asm_ZucInitialization_4_aarch64_no_aesni(&keys, ivs, &state); - - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, pKeyStrArr); - - - /* Point at the next 16 bytes of the key */ - for (i = 0; i < NUM_BUFS; i++) - pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; - - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream8B_4_aarch64_no_aesni(&state, pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - } - - /* Process each packet separately for the remaining bits */ - for (i = 0; i < NUM_BUFS; i++) { - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; - uint32_t *keyStr32 = (uint32_t *) keyStr[i]; - - /* If remaining bits are more than 8 bytes, we need to generate - * at least 8B more of keystream, so we need to copy - * the zuc state to single packet state first */ - if (remainBits > (2*32)) { - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; - - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; - } - - while (remainBits >= keyStreamLengthInBits) { - remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStr32[4], - &singlePktState); - else - asm_ZucGenKeystream16B_aarch64_no_aesni(&keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr32, - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - - /* - * If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStr32[4], - &singlePktState); - - uint32_t keyBlock = keyStr32[L - 1]; - - T[i] ^= asm_Eia3Remainder_aarch64_no_aesni(keyStr32, pIn8[i], remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); - - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i] ^ keyBlock); - } - -#ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - imb_clear_mem(keyStr, sizeof(keyStr)); - imb_clear_mem(&singlePktState, sizeof(singlePktState)); - imb_clear_mem(&state, sizeof(state)); - imb_clear_mem(&keys, sizeof(keys)); -#endif -} - -static inline -void _zuc_eia3_n_buffer_aarch64_no_aesni(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - const uint32_t lengthInBits[], - uint32_t *pMacI[], - const uint32_t numBuffers) -{ - unsigned int i; - unsigned int packetCount = numBuffers; - -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pMacI == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - - if (lengthInBits == NULL) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } - - /* Check for NULL pointers and lengths for each buffer */ - for (i = 0; i < numBuffers; i++) { - if (pKey[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pMacI[i] == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - - /* Check input data is in range of supported length */ - if (lengthInBits[i] < ZUC_MIN_BITLEN || - lengthInBits[i] > ZUC_MAX_BITLEN) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } - } -#endif - i = 0; - - while(packetCount >= 4) { - packetCount -=4; - _zuc_eia3_4_buffer_aarch64_no_aesni(&pKey[i], - &pIv[i], - &pBufferIn[i], - &lengthInBits[i], - &pMacI[i]); - i+=4; - } - - while(packetCount--) { - _zuc_eia3_1_buffer_aarch64_no_aesni(pKey[i], - pIv[i], - pBufferIn[i], - lengthInBits[i], - pMacI[i], - true); - i++; - } - -#ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif - -} - -void zuc_eia3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI) -{ -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pMacI == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - - /* Check input data is in range of supported length */ - if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } -#endif - - _zuc_eia3_1_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, lengthInBits, pMacI, true); - -#ifdef SAFE_DATA - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -void zuc_eia3_4_buffer_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - const uint32_t lengthInBits[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS]) -{ - _zuc_eia3_4_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, lengthInBits, pMacI); -} - -void zuc_eia3_n_buffer_aarch64_no_aesni(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - const uint32_t lengthInBits[], - uint32_t *pMacI[], - const uint32_t numBuffers) -{ - _zuc_eia3_n_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, lengthInBits, pMacI, numBuffers); -} - -void zuc256_eia3_1_buffer_aarch64_no_aesni(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI) -{ -#ifdef SAFE_PARAM - if (imb_errno != 0) - imb_set_errno(NULL, 0); - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pMacI == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_AUTH); - return; - } - - /* Check input data is in range of supported length */ - if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN) { - imb_set_errno(NULL, IMB_ERR_AUTH_LEN); - return; - } -#endif - - _zuc_eia3_1_buffer_aarch64_no_aesni(pKey, pIv, pBufferIn, lengthInBits, pMacI, false); - -#ifdef SAFE_DATA - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); -#endif -} - -void -zuc_eia3_4_buffer_job_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) -{ - unsigned int i; - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - const uint8_t *pIn8[NUM_BUFS] = {NULL}; - uint32_t remainCommonBits; - uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - uint32_t *pKeyStrArr[NUM_BUFS] = {NULL}; - unsigned int allCommonBits; - - /* Check if all lengths are equal */ - if ((lengthInBits[0] == lengthInBits[1]) && - (lengthInBits[0] == lengthInBits[2]) && - (lengthInBits[0] == lengthInBits[3])) { - remainCommonBits = lengthInBits[0]; - allCommonBits = 1; - } else { - /* Calculate the minimum input packet size */ - uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); - uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); - - remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; - } - - for (i = 0; i < NUM_BUFS; i++) { - pIn8[i] = (const uint8_t *) pBufferIn[i]; - pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; - keys.pKeys[i] = pKey[i]; - } - - asm_ZucInitialization_4_aarch64_no_aesni(&keys, ivs, &state); - - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, pKeyStrArr); - - /* Point at the next 16 bytes of the key */ - for (i = 0; i < NUM_BUFS; i++) - pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; - - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream8B_4_aarch64_no_aesni(&state, - pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, - pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - } - - /* Process each packet separately for the remaining bits */ - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; - uint32_t *keyStr32 = (uint32_t *) keyStr[i]; - - /* If remaining bits are more than 8 bytes, we need to generate - * at least 8B more of keystream, so we need to copy - * the zuc state to single packet state first */ - if (remainBits > (2*32)) { - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; - - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; - } - - while (remainBits >= keyStreamLengthInBits) { - remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream8B_aarch64_no_aesni( - &keyStr32[4], - &singlePktState); - else - asm_ZucGenKeystream16B_aarch64_no_aesni( - &keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr32, - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - - /* - * If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStr32[4], - &singlePktState); - - uint32_t keyBlock = keyStr32[L - 1]; - - T[i] ^= asm_Eia3Remainder_aarch64_no_aesni(keyStr32, pIn8[i], - remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); - - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i] ^ keyBlock); - } - -#ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - clear_mem(keyStr, sizeof(keyStr)); - clear_mem(&singlePktState, sizeof(singlePktState)); - clear_mem(&state, sizeof(state)); - clear_mem(&keys, sizeof(keys)); -#endif -} - -void -zuc256_eia3_4_buffer_job_aarch64_no_aesni(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) -{ - unsigned int i; - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - const uint8_t *pIn8[NUM_BUFS] = {NULL}; - uint32_t remainCommonBits; - uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - uint32_t *pKeyStrArr[NUM_BUFS] = {NULL}; - unsigned int allCommonBits; - - /* Check if all lengths are equal */ - if ((lengthInBits[0] == lengthInBits[1]) && - (lengthInBits[0] == lengthInBits[2]) && - (lengthInBits[0] == lengthInBits[3])) { - remainCommonBits = lengthInBits[0]; - allCommonBits = 1; - } else { - /* Calculate the minimum input packet size */ - uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); - uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); - - remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; - } - - for (i = 0; i < NUM_BUFS; i++) { - pIn8[i] = (const uint8_t *) pBufferIn[i]; - pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; - keys.pKeys[i] = pKey[i]; - } - - /* TODO: Handle 8 and 16-byte digest cases */ - asm_Zuc256Initialization_4_aarch64_no_aesni(&keys, ivs, &state, 4); - - /* Initialize the tags with the first 4 bytes of keystream */ - asm_ZucGenKeystream4B_4_aarch64_no_aesni(&state, pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) - memcpy(&T[i], pKeyStrArr[i], 4); - - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, pKeyStrArr); - - /* Point at the next 16 bytes of the key */ - for (i = 0; i < NUM_BUFS; i++) - pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; - - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 4 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream4B_4_aarch64_no_aesni(&state, - pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64_no_aesni(&state, - pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - } - - /* Process each packet separately for the remaining bits */ - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; - uint32_t *keyStr32 = (uint32_t *) keyStr[i]; - - /* If remaining bits are more than 4 bytes, we need to generate - * at least 4B more of keystream, so we need to copy - * the zuc state to single packet state first */ - if (remainBits > 32) { - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; - - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; - } - - while (remainBits >= keyStreamLengthInBits) { - remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 4 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream_aarch64_no_aesni( - &keyStr32[4], - &singlePktState, 1); - else - asm_ZucGenKeystream16B_aarch64_no_aesni( - &keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64_no_aesni(T[i], keyStr32, - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - - /* - * If remaining bits has more than 1 ZUC WORD (double words), - * keystream needs to have another 1 ZUC WORD (4B) - */ - if (remainBits > 32) - asm_ZucGenKeystream8B_aarch64_no_aesni(&keyStr32[4], - &singlePktState); - - T[i] ^= asm_Eia3Remainder_aarch64_no_aesni(keyStr32, pIn8[i], - remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); - - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i]); - } - -#ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - clear_mem(keyStr, sizeof(keyStr)); - clear_mem(&singlePktState, sizeof(singlePktState)); - clear_mem(&state, sizeof(state)); - clear_mem(&keys, sizeof(keys)); -#endif -} +#define NO_AESNI + +#define ZUC_EEA3_1_BUFFER zuc_eea3_1_buffer_aarch64_no_aesni +#define ZUC_EEA3_4_BUFFER zuc_eea3_4_buffer_aarch64_no_aesni +#define ZUC_EEA3_N_BUFFER zuc_eea3_n_buffer_aarch64_no_aesni +#define ZUC256_EEA3_1_BUFFER zuc256_eea3_1_buffer_aarch64_no_aesni +#define ZUC256_EEA3_N_BUFFER zuc256_eea3_n_buffer_aarch64_no_aesni +#define ZUC_EIA3_1_BUFFER zuc_eia3_1_buffer_aarch64_no_aesni +#define ZUC_EIA3_4_BUFFER zuc_eia3_4_buffer_aarch64_no_aesni +#define ZUC_EIA3_N_BUFFER zuc_eia3_n_buffer_aarch64_no_aesni +#define ZUC_EIA3_4_BUFFER_JOB zuc_eia3_4_buffer_job_aarch64_no_aesni +#define ZUC256_EIA3_1_BUFFER zuc256_eia3_1_buffer_aarch64_no_aesni +#define ZUC256_EIA3_N_BUFFER zuc256_eia3_n_buffer_aarch64_no_aesni +#define ZUC256_EIA3_4_BUFFER_JOB zuc256_eia3_4_buffer_job_aarch64_no_aesni + +#define ASM_ZUC_INITIALIZATION asm_ZucInitialization_aarch64_no_aesni +#define ASM_ZUC_INITIALIZATION_4 asm_ZucInitialization_4_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM4B asm_ZucGenKeystream4B_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM8B asm_ZucGenKeystream8B_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM16B asm_ZucGenKeystream16B_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM4B_4 asm_ZucGenKeystream4B_4_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM8B_4 asm_ZucGenKeystream8B_4_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM16B_4 asm_ZucGenKeystream16B_4_aarch64_no_aesni +#define ASM_ZUC_GEN_KEYSTREAM asm_ZucGenKeystream_aarch64_no_aesni +#define ASM_ZUC_CIPHER_4 asm_ZucCipher_4_aarch64_no_aesni +#define ASM_XOR_KEYSTREAM16B asm_XorKeyStream16B_aarch64_no_aesni +#define ASM_EIA3_ROUND16B asm_Eia3Round16B_aarch64_no_aesni +#define ASM_EIA3_REMAINDER asm_Eia3Remainder_aarch64_no_aesni +#define ASM_ZUC_AUTH_4 asm_ZucAuth_4_aarch64_no_aesni +#define ASM_ZUC256_INITIALIZATION asm_Zuc256Initialization_aarch64_no_aesni +#define ASM_ZUC256_INITIALIZATION_4 asm_Zuc256Initialization_4_aarch64_no_aesni +#define ASM_ZUC256_AUTH_4 asm_Zuc256Auth_4_aarch64_no_aesni + +#include "zuc_aarch64_top.c" \ No newline at end of file diff --git a/lib/aarch64/zuc_aarch64_top.c b/lib/aarch64/zuc_aarch64_top.c index a90c9666fcdb0fe130bc7634aaeab9463fb6166e..0bef8886b362c0e9de68bf5627bc635fe96a81bc 100644 --- a/lib/aarch64/zuc_aarch64_top.c +++ b/lib/aarch64/zuc_aarch64_top.c @@ -1,5 +1,5 @@ /******************************************************************************* - Copyright (c) 2021-2022 Arm Corporation All rights reserved. + Copyright (c) 2021-2023 Arm Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: @@ -26,13 +26,48 @@ *******************************************************************************/ /*----------------------------------------------------------------------- -* zuc_common_top.c +* zuc_aarch64_top.c *----------------------------------------------------------------------- * An implementation of ZUC, the core algorithm for the * 3GPP Confidentiality and Integrity algorithms. * *-----------------------------------------------------------------------*/ +#ifndef ZUC_EEA3_1_BUFFER + +#define ZUC_EEA3_1_BUFFER zuc_eea3_1_buffer_aarch64 +#define ZUC_EEA3_4_BUFFER zuc_eea3_4_buffer_aarch64 +#define ZUC_EEA3_N_BUFFER zuc_eea3_n_buffer_aarch64 +#define ZUC256_EEA3_1_BUFFER zuc256_eea3_1_buffer_aarch64 +#define ZUC256_EEA3_N_BUFFER zuc256_eea3_n_buffer_aarch64 +#define ZUC_EIA3_1_BUFFER zuc_eia3_1_buffer_aarch64 +#define ZUC_EIA3_4_BUFFER zuc_eia3_4_buffer_aarch64 +#define ZUC_EIA3_N_BUFFER zuc_eia3_n_buffer_aarch64 +#define ZUC_EIA3_4_BUFFER_JOB zuc_eia3_4_buffer_job_aarch64 +#define ZUC256_EIA3_1_BUFFER zuc256_eia3_1_buffer_aarch64 +#define ZUC256_EIA3_N_BUFFER zuc256_eia3_n_buffer_aarch64 +#define ZUC256_EIA3_4_BUFFER_JOB zuc256_eia3_4_buffer_job_aarch64 + +#define ASM_ZUC_INITIALIZATION asm_ZucInitialization_aarch64 +#define ASM_ZUC_INITIALIZATION_4 asm_ZucInitialization_4_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM4B asm_ZucGenKeystream4B_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM8B asm_ZucGenKeystream8B_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM16B asm_ZucGenKeystream16B_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM4B_4 asm_ZucGenKeystream4B_4_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM8B_4 asm_ZucGenKeystream8B_4_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM16B_4 asm_ZucGenKeystream16B_4_aarch64 +#define ASM_ZUC_GEN_KEYSTREAM asm_ZucGenKeystream_aarch64 +#define ASM_ZUC_CIPHER_4 asm_ZucCipher_4_aarch64 +#define ASM_XOR_KEYSTREAM16B asm_XorKeyStream16B_aarch64 +#define ASM_EIA3_ROUND16B asm_Eia3Round16B_aarch64 +#define ASM_EIA3_REMAINDER asm_Eia3Remainder_aarch64 +#define ASM_ZUC_AUTH_4 asm_ZucAuth_4_aarch64 +#define ASM_ZUC256_INITIALIZATION asm_Zuc256Initialization_aarch64 +#define ASM_ZUC256_INITIALIZATION_4 asm_Zuc256Initialization_4_aarch64 +#define ASM_ZUC256_AUTH_4 asm_Zuc256Auth_4_aarch64 + +#endif + #include "include/zuc_internal.h" #include "ipsec-mb.h" #include "clear_regs_mem_aarch64.h" @@ -42,80 +77,35 @@ #define NUM_BUFS 4 #define KEYSTR_ROUND_LEN 16 -static inline -void _zuc_eea3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) +// Read 8*6 bits and store them as 8 partial bytes +// (using 6 least significant bits) +static void expand_from_6_to_8_bytes(uint8_t *pOutput, const uint8_t *pInput) { - DECLARE_ALIGNED(ZucState_t zucState, 16); - DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16); - - const uint64_t *pIn64 = NULL; - uint64_t *pOut64 = NULL, *pKeyStream64 = NULL; - uint64_t *pTemp64 = NULL, *pdstTemp64 = NULL; - - uint32_t numKeyStreamsPerPkt = length/ KEYSTR_ROUND_LEN; - const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN; - - asm_ZucInitialization_aarch64(pKey, pIv, &(zucState)); + uint64_t bit_mask[8] = {0x3f, 0xfc0, 0x3f000, 0xfc0000, 0x3f000000, 0xfc0000000, + 0x3f000000000, 0xfc0000000000}; + uint8_t inputarr[8] = {0}; + uint64_t num64bit; + int i; - /* Loop over all the Quad-Words in input buffer and XOR with the 64bits - * of generated keystream - */ - pOut64 = (uint64_t *) pBufferOut; - pIn64 = (const uint64_t *) pBufferIn; + // store 6 bytes input to 8 bytes array in reverse order, inputarr[i] = 0xfedcba00 + for (i = 0; i <= 5; i++) + inputarr[i] = *(pInput + (5 - i)); - while (numKeyStreamsPerPkt--) { - /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64((uint32_t *) &keyStream[0], &zucState); + // cast 8 bytes array to uint64 number, num64bit=0xabcdef + num64bit = *(uint64_t *)(&inputarr[0]); - /* XOR The Keystream generated with the input buffer here */ - pKeyStream64 = (uint64_t *)keyStream; - asm_XorKeyStream16B_aarch64(pIn64, pOut64, pKeyStream64); - pIn64 += 2; - pOut64 += 2; + for (i = 0; i <= 7; i++) { + *(pOutput + i) = (num64bit & bit_mask[7 - i]) >> (48 - 6 * (i+1)); } - - /* Check for remaining 0 to 15 bytes */ - if(numBytesLeftOver) { - /* buffer to store 16 bytes of keystream */ - DECLARE_ALIGNED(uint8_t tempSrc[KEYSTR_ROUND_LEN], 16); - DECLARE_ALIGNED(uint8_t tempDst[KEYSTR_ROUND_LEN], 16); - const uint8_t *pIn8 = (const uint8_t *) pBufferIn; - uint8_t *pOut8 = (uint8_t *) pBufferOut; - const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1; - - asm_ZucGenKeystream_aarch64((uint32_t *) &keyStream[0], &zucState, num4BRounds); - - /* copy the remaining bytes into temporary buffer and XOR with - * the 64-bytes of keystream. Then copy on the valid bytes back - * to the output buffer */ - memcpy(&tempSrc[0], &pIn8[length - numBytesLeftOver], numBytesLeftOver); - pKeyStream64 = (uint64_t *) &keyStream[0]; - pTemp64 = (uint64_t *) &tempSrc[0]; - pdstTemp64 = (uint64_t *) &tempDst[0]; - - asm_XorKeyStream16B_aarch64(pTemp64, pdstTemp64, pKeyStream64); - memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0], - numBytesLeftOver); - -#ifdef SAFE_DATA - imb_clear_mem(tempSrc, sizeof(tempSrc)); - imb_clear_mem(tempDst, sizeof(tempDst)); -#endif - - } - } static inline -void _zuc256_eea3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) +void _zuc_eea3_1_buffer(const void *pKey, + const void *pIv, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length, + const uint32_t key_size) { DECLARE_ALIGNED(ZucState_t zucState, 16); DECLARE_ALIGNED(uint8_t keyStream[KEYSTR_ROUND_LEN], 16); @@ -127,21 +117,25 @@ void _zuc256_eea3_1_buffer_aarch64(const void *pKey, uint32_t numKeyStreamsPerPkt = length/ KEYSTR_ROUND_LEN; const uint32_t numBytesLeftOver = length % KEYSTR_ROUND_LEN; - asm_Zuc256Initialization_aarch64(pKey, pIv, &(zucState), 2); + if (key_size == 256) { + ASM_ZUC256_INITIALIZATION(pKey, pIv, &(zucState), 2); + } else { + ASM_ZUC_INITIALIZATION(pKey, pIv, &(zucState)); + } /* Loop over all the Quad-Words in input buffer and XOR with the 64bits * of generated keystream - */ + */ pOut64 = (uint64_t *) pBufferOut; pIn64 = (const uint64_t *) pBufferIn; while (numKeyStreamsPerPkt--) { /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64((uint32_t *) &keyStream[0], &zucState); + ASM_ZUC_GEN_KEYSTREAM16B((uint32_t *) &keyStream[0], &zucState); /* XOR The Keystream generated with the input buffer here */ pKeyStream64 = (uint64_t *)keyStream; - asm_XorKeyStream16B_aarch64(pIn64, pOut64, pKeyStream64); + ASM_XOR_KEYSTREAM16B(pIn64, pOut64, pKeyStream64); pIn64 += 2; pOut64 += 2; } @@ -155,7 +149,7 @@ void _zuc256_eea3_1_buffer_aarch64(const void *pKey, uint8_t *pOut8 = (uint8_t *) pBufferOut; const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1; - asm_ZucGenKeystream_aarch64((uint32_t *) &keyStream[0], &zucState, num4BRounds); + ASM_ZUC_GEN_KEYSTREAM((uint32_t *) &keyStream[0], &zucState, num4BRounds); /* copy the remaining bytes into temporary buffer and XOR with * the 64-bytes of keystream. Then copy on the valid bytes back @@ -165,7 +159,7 @@ void _zuc256_eea3_1_buffer_aarch64(const void *pKey, pTemp64 = (uint64_t *) &tempSrc[0]; pdstTemp64 = (uint64_t *) &tempDst[0]; - asm_XorKeyStream16B_aarch64(pTemp64, pdstTemp64, pKeyStream64); + ASM_XOR_KEYSTREAM16B(pTemp64, pdstTemp64, pKeyStream64); memcpy(&pOut8[length - numBytesLeftOver], &tempDst[0], numBytesLeftOver); @@ -173,16 +167,16 @@ void _zuc256_eea3_1_buffer_aarch64(const void *pKey, imb_clear_mem(tempSrc, sizeof(tempSrc)); imb_clear_mem(tempDst, sizeof(tempDst)); #endif - } - + } } static inline -void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) +void _zuc_eea3_4_buffer(const void * const pKey[NUM_BUFS], + const uint8_t *ivs, + const void * const pBufferIn[NUM_BUFS], + void *pBufferOut[NUM_BUFS], + const uint32_t length[NUM_BUFS], + const uint32_t key_size) { DECLARE_ALIGNED(ZucState4_t state, 64); DECLARE_ALIGNED(ZucState_t singlePktState, 64); @@ -195,12 +189,10 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], /* min number of bytes */ uint32_t bytes = (bytes1 < bytes2) ? bytes1 : bytes2; uint32_t numKeyStreamsPerPkt; - DECLARE_ALIGNED(uint16_t remainBytes[NUM_BUFS], 16) = {0}; + DECLARE_ALIGNED(uint32_t remainBytes[NUM_BUFS], 16) = {0}; DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][KEYSTR_ROUND_LEN], 64); /* structure to store the 4 keys */ DECLARE_ALIGNED(ZucKey4_t keys, 64); - /* structure to store the 4 IV's */ - DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); uint32_t numBytesLeftOver = 0; const uint8_t *pTempBufInPtr = NULL; uint8_t *pTempBufOutPtr = NULL; @@ -215,10 +207,12 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], for (i = 0; i < NUM_BUFS; i++) { remainBytes[i] = length[i]; keys.pKeys[i] = pKey[i]; - memcpy(ivs + i*32, pIv[i], 16); } - asm_ZucInitialization_4_aarch64(&keys, ivs, &state); + if (key_size == 256) + ASM_ZUC256_INITIALIZATION_4(&keys, ivs, &state, 2); + else + ASM_ZUC_INITIALIZATION_4(&keys, ivs, &state); for (i = 0; i < NUM_BUFS; i++) { pOut64[i] = (uint64_t *) pBufferOut[i]; @@ -226,8 +220,8 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], } /* Encrypt common length of all buffers */ - asm_ZucCipher_4_aarch64(&state, pIn64, pOut64, - remainBytes, (uint16_t) bytes); + ASM_ZUC_CIPHER_4(&state, pIn64, pOut64, + remainBytes, bytes); /* process each packet separately for the remaining bytes */ for (i = 0; i < NUM_BUFS; i++) { @@ -268,13 +262,12 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], while (numKeyStreamsPerPkt--) { /* Generate the key stream 16 bytes at a time */ - asm_ZucGenKeystream16B_aarch64( - (uint32_t *) keyStr[0], - &singlePktState); + ASM_ZUC_GEN_KEYSTREAM16B((uint32_t *) keyStr[0], + &singlePktState); pKeyStream64 = (uint64_t *) keyStr[0]; - asm_XorKeyStream16B_aarch64(pIn64[0], - pOut64[0], - pKeyStream64); + ASM_XOR_KEYSTREAM16B(pIn64[0], + pOut64[0], + pKeyStream64); pIn64[0] += 2; pOut64[0] += 2; } @@ -289,9 +282,9 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], const uint64_t num4BRounds = ((numBytesLeftOver - 1) / 4) + 1; - asm_ZucGenKeystream_aarch64((uint32_t *)&keyStr[0], - &singlePktState, - num4BRounds); + ASM_ZUC_GEN_KEYSTREAM((uint32_t *)&keyStr[0], + &singlePktState, + num4BRounds); /* copy the remaining bytes into temporary * buffer and XOR with the 16 bytes of * keystream. Then copy on the valid bytes back @@ -304,9 +297,9 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], pKeyStream64 = (uint64_t *) &keyStr[0][0]; pTempSrc64 = (uint64_t *) &tempSrc[0]; pTempDst64 = (uint64_t *) &tempDst[0]; - asm_XorKeyStream16B_aarch64(pTempSrc64, - pTempDst64, - pKeyStream64); + ASM_XOR_KEYSTREAM16B(pTempSrc64, + pTempDst64, + pKeyStream64); memcpy(&pTempBufOutPtr[offset], &tempDst[0], numBytesLeftOver); @@ -327,15 +320,99 @@ void _zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], } static inline -void _zuc_eea3_4_buffer(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) +void _zuc_eea3_n_buffer(const void * const pKey[], + const void * const pIv[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers) +{ + unsigned int i = 0; + unsigned int packetCount = numBuffers; + + while (packetCount >= NUM_BUFS) { + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); + uint32_t iv_idx; + for (iv_idx = 0; iv_idx < NUM_BUFS; iv_idx++) { + memcpy(ivs + iv_idx*32, pIv[iv_idx + i], 16); + } + + packetCount -= NUM_BUFS; + _zuc_eea3_4_buffer(&pKey[i], + ivs, + &pBufferIn[i], + &pBufferOut[i], + &length[i], + 128); + i += NUM_BUFS; + } + + while(packetCount--) { + _zuc_eea3_1_buffer(pKey[i], + pIv[i], + pBufferIn[i], + pBufferOut[i], + length[i], + 128); + i++; + } +} + +void ZUC_EEA3_1_BUFFER(const void *pKey, + const void *pIv, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length) { #ifdef SAFE_PARAM - unsigned int i; + if (imb_errno != 0) + imb_set_errno(NULL, 0); + if (pKey == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } + if (pIv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } + + if (pBufferIn == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } + + if (pBufferOut == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return; + } + + /* Check input data is in range of supported length */ + if (length < ZUC_MIN_BYTELEN || + length > ZUC_MAX_BYTELEN) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return; + } +#endif + + _zuc_eea3_1_buffer(pKey, pIv, pBufferIn, pBufferOut, length, 128); + +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +} + +void ZUC_EEA3_4_BUFFER(const void * const pKey[NUM_BUFS], + const void * const pIv[NUM_BUFS], + const void * const pBufferIn[NUM_BUFS], + void *pBufferOut[NUM_BUFS], + const uint32_t length[NUM_BUFS]) +{ +#ifdef SAFE_PARAM + unsigned int i; if (imb_errno != 0) imb_set_errno(NULL, 0); @@ -396,7 +473,13 @@ void _zuc_eea3_4_buffer(const void * const pKey[NUM_BUFS], } #endif - _zuc_eea3_4_buffer_aarch64(pKey, pIv, pBufferIn, pBufferOut, length); + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); + uint32_t iv_idx; + for (iv_idx = 0; iv_idx < NUM_BUFS; iv_idx++) { + memcpy(ivs + iv_idx*32, pIv[iv_idx], 16); + } + _zuc_eea3_4_buffer(pKey, ivs, pBufferIn, pBufferOut, length, 128); #ifdef SAFE_DATA /* Clear sensitive data in registers */ @@ -405,18 +488,15 @@ void _zuc_eea3_4_buffer(const void * const pKey[NUM_BUFS], #endif } -static inline -void _zuc_eea3_n_buffer(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - void *pBufferOut[], - const uint32_t length[], - const uint32_t numBuffers) +void ZUC_EEA3_N_BUFFER(const void * const pKey[], + const void * const pIv[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers) { - unsigned int i; - unsigned int packetCount = numBuffers; - #ifdef SAFE_PARAM + unsigned int i; if (imb_errno != 0) imb_set_errno(NULL, 0); @@ -476,26 +556,8 @@ void _zuc_eea3_n_buffer(const void * const pKey[], } } #endif - i = 0; - - while (packetCount >= NUM_BUFS) { - packetCount -= NUM_BUFS; - _zuc_eea3_4_buffer(&pKey[i], - &pIv[i], - &pBufferIn[i], - &pBufferOut[i], - &length[i]); - i += NUM_BUFS; - } - while(packetCount--) { - _zuc_eea3_1_buffer_aarch64(pKey[i], - pIv[i], - pBufferIn[i], - pBufferOut[i], - length[i]); - i++; - } + _zuc_eea3_n_buffer(pKey, pIv, pBufferIn, pBufferOut, length, numBuffers); #ifdef SAFE_DATA /* Clear sensitive data in registers */ @@ -504,15 +566,17 @@ void _zuc_eea3_n_buffer(const void * const pKey[], #endif } -void zuc_eea3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) +void ZUC256_EEA3_1_BUFFER(const void *pKey, + const void *pIv, + const uint32_t ivLen, + const void *pBufferIn, + void *pBufferOut, + const uint32_t length) { #ifdef SAFE_PARAM if (imb_errno != 0) imb_set_errno(NULL, 0); + /* Check for NULL pointers */ if (pKey == NULL) { imb_set_errno(NULL, IMB_ERR_NULL_KEY); return; @@ -534,14 +598,27 @@ void zuc_eea3_1_buffer_aarch64(const void *pKey, } /* Check input data is in range of supported length */ - if (length < ZUC_MIN_BYTELEN || - length > ZUC_MAX_BYTELEN) { + if(length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN) { imb_set_errno(NULL, IMB_ERR_CIPH_LEN); return; } + + if (ivLen != 23 && ivLen != 25) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return; + } #endif + uint8_t iv[32]; + if (ivLen == 25) { + memcpy(iv, pIv, 25); + } else { + // copy first 17 bytes + memcpy(iv, pIv, 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(iv + 17, (const uint8_t *)pIv + 17); + } - _zuc_eea3_1_buffer_aarch64(pKey, pIv, pBufferIn, pBufferOut, length); + _zuc_eea3_1_buffer(pKey, iv, pBufferIn, pBufferOut, length, 256); #ifdef SAFE_DATA /* Clear sensitive data in registers */ @@ -550,68 +627,134 @@ void zuc_eea3_1_buffer_aarch64(const void *pKey, #endif } -void zuc_eea3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - void *pBufferOut[NUM_BUFS], - const uint32_t length[NUM_BUFS]) -{ - _zuc_eea3_4_buffer(pKey, pIv, pBufferIn, pBufferOut, length); -} - -void zuc_eea3_n_buffer_aarch64(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - void *pBufferOut[], - const uint32_t length[], - const uint32_t numBuffers) +void ZUC256_EEA3_N_BUFFER(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers) { - _zuc_eea3_n_buffer(pKey, pIv, pBufferIn, pBufferOut, length, numBuffers); -} + unsigned int i, iv_idx; + unsigned int packetCount = numBuffers; + uint8_t ivs[32 * NUM_BUFS]; -void zuc256_eea3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - void *pBufferOut, - const uint32_t length) -{ #ifdef SAFE_PARAM if (imb_errno != 0) imb_set_errno(NULL, 0); - /* Check for NULL pointers */ - if (pKey == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_KEY); - return; - } - - if (pIv == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_IV); - return; - } - - if (pBufferIn == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_SRC); - return; - } - - if (pBufferOut == NULL) { - imb_set_errno(NULL, IMB_ERR_NULL_DST); - return; - } - - /* Check input data is in range of supported length */ - if(length < ZUC_MIN_BYTELEN || length > ZUC_MAX_BYTELEN) { - imb_set_errno(NULL, IMB_ERR_CIPH_LEN); - return; - } + + /* Check for NULL pointers */ + if (pKey == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } + + if (pIv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } + + if (ivLen == NULL) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return; + } + + if (pBufferIn == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } + + if (pBufferOut == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return; + } + + if (length == NULL) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return; + } + + /* Check for NULL pointers and lengths for each buffer */ + for (i = 0; i < numBuffers; i++) { + if (pKey[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } + + if (pIv[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } + + if (ivLen[i] != 23 && ivLen[i] != 25) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return; + } + + if (pBufferIn[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } + + if (pBufferOut[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_DST); + return; + } + + /* Check input data is in range of supported length */ + if (length[i] < ZUC_MIN_BYTELEN || + length[i] > ZUC_MAX_BYTELEN) { + imb_set_errno(NULL, IMB_ERR_CIPH_LEN); + return; + } + } #endif + i = 0; + + while(packetCount >= 4) { + for (iv_idx = 0; iv_idx < 4; iv_idx++) { + if (ivLen[i + iv_idx] == 25) { + memcpy(ivs + iv_idx * 32, pIv[i + iv_idx], 25); + } else { + // copy first 17 bytes + memcpy(ivs + iv_idx * 32, pIv[i + iv_idx], 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(ivs + iv_idx * 32 + 17, + (const uint8_t *)pIv[i + iv_idx] + 17); + } + } + packetCount -= 4; + _zuc_eea3_4_buffer(&pKey[i], + ivs, + &pBufferIn[i], + &pBufferOut[i], + &length[i], + 256); + i += 4; + } - _zuc256_eea3_1_buffer_aarch64(pKey, pIv, pBufferIn, pBufferOut, length); + while(packetCount--) { + if (ivLen[i] == 25) { + memcpy(ivs, pIv[i], 25); + } else { + // copy first 17 bytes + memcpy(ivs, pIv[i], 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(ivs + 17, (const uint8_t *)pIv[i] + 17); + } + _zuc_eea3_1_buffer(pKey[i], + ivs, + pBufferIn[i], + pBufferOut[i], + length[i], + 256); + i++; + } #ifdef SAFE_DATA - /* Clear sensitive data in registers */ - CLEAR_SCRATCH_GPS(); - CLEAR_SCRATCH_SIMD_REGS(); + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); #endif } @@ -626,67 +769,165 @@ static inline uint64_t load_uint64(const void *ptr) } static inline -void _zuc_eia3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI, - bool key128) +void _zuc_eia3_1_buffer(const void *pKey, + const void *pIv, + const void *pBufferIn, + const uint32_t lengthInBits, + uint32_t *pMacI, + const uint32_t key_size, + const uint64_t tag_size) { DECLARE_ALIGNED(ZucState_t zucState, 16); DECLARE_ALIGNED(uint32_t keyStream[4 * 2], 64); const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; /* generate a key-stream 2 words longer than the input message */ - const uint32_t N = lengthInBits + (2 * ZUC_WORD_BITS); - uint32_t L = (N + 31) / ZUC_WORD_BITS; + uint32_t L = (lengthInBits + 31 + (tag_size << 4)) / ZUC_WORD_BITS; uint32_t *pZuc = (uint32_t *) &keyStream[0]; uint32_t remainingBits = lengthInBits; - uint32_t T = 0; + uint32_t T[4] = {0}; const uint8_t *pIn8 = (const uint8_t *) pBufferIn; - - if(key128) - asm_ZucInitialization_aarch64(pKey, pIv, &(zucState)); - else { - asm_Zuc256Initialization_aarch64(pKey, pIv, &(zucState), 4); - /* Initialize the tags with the first 4 bytes of keystream */ - asm_ZucGenKeystream4B_aarch64_no_aesni(pZuc, &zucState); - memcpy(&T, pZuc, 4); + uint32_t last_key_idx = (L - 1) % 4; + + if (key_size == 256) { + ASM_ZUC256_INITIALIZATION(pKey, pIv, &(zucState), tag_size); + /* Initialize the tags with the first 4/8/16 bytes of keystream */ + switch (tag_size) + { + case 4: + ASM_ZUC_GEN_KEYSTREAM4B(pZuc, &zucState); + L -= 1; + break; + case 8: + ASM_ZUC_GEN_KEYSTREAM8B(pZuc, &zucState); + L -= 2; + break; + case 16: + ASM_ZUC_GEN_KEYSTREAM16B(pZuc, &zucState); + L -= 4; + break; + default: + return; + } + memcpy(T, pZuc, tag_size); + } else { + ASM_ZUC_INITIALIZATION(pKey, pIv, &(zucState)); } - asm_ZucGenKeystream16B_aarch64(pZuc, &zucState); + ASM_ZUC_GEN_KEYSTREAM16B(pZuc, &zucState); + /* 4 KS words are generated already */ + L = (L > 4) ? (L - 4) : 0; /* loop over the message bits */ while (remainingBits >= keyStreamLengthInBits) { remainingBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainingBits) - asm_ZucGenKeystream8B_aarch64(&keyStream[4], &zucState); - else - asm_ZucGenKeystream16B_aarch64(&keyStream[4], &zucState); - T = asm_Eia3Round16B_aarch64(T, keyStream, pIn8); - /* Copy the last keystream generated to the first 16 bytes */ - memcpy(&keyStream[0], &keyStream[4], KEYSTR_ROUND_LEN); + + IMB_ASSERT(L > 0); + /* Generate the next key stream */ + ASM_ZUC_GEN_KEYSTREAM16B(&keyStream[4], &zucState); + if (L > 4) { + L -= 4; + } else { + last_key_idx = L - 1; + L = 0; + } + + ASM_EIA3_ROUND16B(T, keyStream, pIn8, tag_size); + /* Copy the last keystream generated to the first 16 bytes */ + memcpy(&keyStream[0], &keyStream[4], KEYSTR_ROUND_LEN); pIn8 = &pIn8[KEYSTR_ROUND_LEN]; } - /* - * If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainingBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64(&keyStream[4], &zucState); - T ^= asm_Eia3Remainder_aarch64(&keyStream[0], pIn8, remainingBits); - T ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]), - remainingBits % 32); + /* Generate final keystream if needed */ + IMB_ASSERT(L <= 4); + if (L > 0) { + ASM_ZUC_GEN_KEYSTREAM(&keyStream[4], + &zucState, L); + last_key_idx = 4 + L - 1; + } + + if (key_size == 256) { + /* Fullfill the last block with 0s, to simpify the compuation + * of last block */ + uint32_t remainBytes = (remainingBits + 7) / 8; + uint8_t pIn8_tmp[16] = {0}; + memcpy(pIn8_tmp, pIn8, remainBytes); + uint32_t clearBits = 8 - (remainingBits % 8); + if (clearBits == 8) + clearBits = 0; + if (remainBytes != 0) { + pIn8_tmp[remainBytes - 1] = ((pIn8_tmp[remainBytes - 1]) >> clearBits) + << clearBits; + } + ASM_EIA3_ROUND16B(T, keyStream, pIn8_tmp, tag_size); + + uint64_t tag, tag1, tag2, ks1, ks2, ks3, ks4; + switch (tag_size) + { + case 4: + T[0] ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); + /* save the final MAC-I result */ + *pMacI = bswap4(T[0]); + break; + case 8: + tag = T[1]; + tag = tag << 32 | T[0]; + ks1 = rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); + ks2 = rotate_left(load_uint64(&keyStream[remainingBits / 32 + 1]), + remainingBits % 32); + ks1 = (ks1 << 32) >> 32; + ks2 = ks2 << 32; + ks1 = ks1 ^ ks2; + tag = tag ^ ks1; + tag = BSWAP64(tag); + tag = (tag >> 32) | (tag << 32); + /* save the final MAC-I result */ + memcpy(pMacI, &tag, tag_size); + break; + case 16: + tag1 = T[1]; + tag1 = tag1 << 32 | T[0]; + tag2 = T[3]; + tag2 = tag2 << 32 | T[2]; + + ks1 = rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); + ks2 = rotate_left(load_uint64(&keyStream[remainingBits / 32 + 1]), + remainingBits % 32); + ks3 = rotate_left(load_uint64(&keyStream[remainingBits / 32 + 2]), + remainingBits % 32); + ks4 = rotate_left(load_uint64(&keyStream[remainingBits / 32 + 3]), + remainingBits % 32); + ks1 = (ks1 << 32) >> 32; + ks2 = ks2 << 32; + ks1 = ks1 ^ ks2; + tag1 = tag1 ^ ks1; + ks3 = (ks3 << 32) >> 32; + ks4 = ks4 << 32; + ks3 = ks3 ^ ks4; + tag2 = tag2 ^ ks3; + tag1 = BSWAP64(tag1); + tag1 = (tag1 >> 32) | (tag1 << 32); + tag2 = BSWAP64(tag2); + tag2 = (tag2 >> 32) | (tag2 << 32); + /* save the final MAC-I result */ + memcpy(pMacI, &tag1, 8); + memcpy(pMacI + 2, &tag2, 8); + break; + default: + break; + } + } else { + T[0] ^= ASM_EIA3_REMAINDER(&keyStream[0], pIn8, remainingBits); + T[0] ^= rotate_left(load_uint64(&keyStream[remainingBits / 32]), + remainingBits % 32); - if(key128) { - /* save the final MAC-I result */ - uint32_t keyBlock = keyStream[L - 1]; - T ^= keyBlock; + /* save the final MAC-I result */ + uint32_t keyBlock = keyStream[last_key_idx]; + T[0] ^= keyBlock; + *pMacI = bswap4(T[0]); } - *pMacI = bswap4(T); #ifdef SAFE_DATA /* Clear sensitive data (in registers and stack) */ @@ -696,11 +937,15 @@ void _zuc_eia3_1_buffer_aarch64(const void *pKey, } static inline -void _zuc_eia3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - const uint32_t lengthInBits[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS]) +void _zuc_eia3_4_buffer(const void * const pKey[NUM_BUFS], + const uint8_t *ivs, + const void * const pBufferIn[NUM_BUFS], + const uint32_t lengthInBits[NUM_BUFS], + uint32_t *pMacI[NUM_BUFS], + const uint32_t job_api, + const void * const job_in_lane[NUM_BUFS], + const uint32_t key_size, + const uint64_t tag_size) { unsigned int i; DECLARE_ALIGNED(ZucState4_t state, 64); @@ -708,84 +953,93 @@ void _zuc_eia3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); /* structure to store the 4 keys */ DECLARE_ALIGNED(ZucKey4_t keys, 64); - /* structure to store the 4 IV's */ - DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); const uint8_t *pIn8[NUM_BUFS] = {NULL}; uint32_t remainCommonBits; uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; + uint32_t T[NUM_BUFS * 4] = {0}; const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_BUFS], 16) = {NULL}; - unsigned int allCommonBits; /* Check if all lengths are equal */ if ((lengthInBits[0] == lengthInBits[1]) && (lengthInBits[0] == lengthInBits[2]) && (lengthInBits[0] == lengthInBits[3])) { remainCommonBits = lengthInBits[0]; - allCommonBits = 1; } else { /* Calculate the minimum input packet size */ uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); + lengthInBits[0] : lengthInBits[1]); uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); + lengthInBits[2] : lengthInBits[3]); remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; } for (i = 0; i < NUM_BUFS; i++) { pIn8[i] = (const uint8_t *) pBufferIn[i]; pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; keys.pKeys[i] = pKey[i]; - memcpy(ivs + i*32, pIv[i], 16); } - asm_ZucInitialization_4_aarch64(&keys, ivs, &state); - - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64(&state, pKeyStrArr); + if (key_size == 256) { + ASM_ZUC256_INITIALIZATION_4(&keys, ivs, &state, tag_size); + /* Initialize the tags with the first 4/8/16 bytes of keystream */ + switch (tag_size) + { + case 4: + ASM_ZUC_GEN_KEYSTREAM4B_4(&state, pKeyStrArr); + break; + case 8: + ASM_ZUC_GEN_KEYSTREAM8B_4(&state, pKeyStrArr); + break; + case 16: + ASM_ZUC_GEN_KEYSTREAM16B_4(&state, pKeyStrArr); + break; + default: + return; + } + for (i = 0; i < NUM_BUFS; i++) + memcpy(&T[i * tag_size / 4], pKeyStrArr[i], tag_size); + } else { + ASM_ZUC_INITIALIZATION_4(&keys, ivs, &state); + } + numKeyStr = remainCommonBits / keyStreamLengthInBits; + if (key_size == 256) { + ASM_ZUC256_AUTH_4(&state, T, pIn8, numKeyStr, pKeyStrArr, tag_size); + } else { + ASM_ZUC_AUTH_4(&state, T, pIn8, numKeyStr, pKeyStrArr); + } + remainCommonBits = remainCommonBits % keyStreamLengthInBits; /* Point at the next 16 bytes of the key */ for (i = 0; i < NUM_BUFS; i++) pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream8B_4_aarch64(&state, pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64(&state, pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - } - /* Process each packet separately for the remaining bits */ for (i = 0; i < NUM_BUFS; i++) { - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; + if (job_api && (job_in_lane[i] == NULL)) + continue; + + uint32_t remainBits = lengthInBits[i] - numKeyStr*keyStreamLengthInBits; uint32_t *keyStr32 = (uint32_t *) keyStr[i]; + uint32_t N = remainBits + ((uint32_t) tag_size << 4); + + /* For zuc256, first tag_sz words have been generated to initialize tags */ + if (key_size == 256) { + N -= tag_size << 3; + } + uint32_t L = ((N + 31) / ZUC_WORD_BITS); + uint32_t last_key_idx = (L - 1) % 4; + /* 4 KS words are generated already */ + L = (L > 4) ? (L - 4) : 0; - /* If remaining bits are more than 8 bytes, we need to generate - * at least 8B more of keystream, so we need to copy - * the zuc state to single packet state first */ - if (remainBits > (2*32)) { + + /* If remaining bits are more than 4 bytes, we need to generate + * at least 4B more of keystream, so we need to copy + * the zuc state to single packet state first + */ + if (L > 0) { singlePktState.lfsrState[0] = state.lfsrState[0][i]; singlePktState.lfsrState[1] = state.lfsrState[1][i]; singlePktState.lfsrState[2] = state.lfsrState[2][i]; @@ -809,39 +1063,125 @@ void _zuc_eia3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], while (remainBits >= keyStreamLengthInBits) { remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream8B_aarch64(&keyStr32[4], - &singlePktState); - else - asm_ZucGenKeystream16B_aarch64(&keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr32, - pIn8[i]); + IMB_ASSERT(L > 0); + /* Generate the next key stream 4 bytes or 16 bytes */ + if (L > 4) { + ASM_ZUC_GEN_KEYSTREAM16B(&keyStr32[4], + &singlePktState); + L -= 4; + } else { + ASM_ZUC_GEN_KEYSTREAM(&keyStr32[4], + &singlePktState, L); + last_key_idx = L - 1; + L = 0; + } + ASM_EIA3_ROUND16B(&T[i * tag_size / 4], keyStr32, + pIn8[i], tag_size); /* Copy the last keystream generated * to the first 16 bytes */ memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; } - /* - * If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64(&keyStr32[4], - &singlePktState); - - uint32_t keyBlock = keyStr32[L - 1]; - - T[i] ^= asm_Eia3Remainder_aarch64(keyStr32, pIn8[i], remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); + /* Generate final keystream if needed */ + IMB_ASSERT(L <= 4); + if (L > 0) { + ASM_ZUC_GEN_KEYSTREAM(&keyStr32[4], + &singlePktState, L); + last_key_idx = 4 + L - 1; + } - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i] ^ keyBlock); + if (key_size == 256) { + /* Fullfill the last block with 0s, to simpify the compuation + * of last block */ + uint32_t remainBytes = (remainBits + 7) / 8; + uint8_t pIn8_tmp[16] = {0}; + memcpy(pIn8_tmp, pIn8[i], remainBytes); + uint32_t clearBits = 8 - (remainBits % 8); + if (clearBits == 8) + clearBits = 0; + if (remainBytes != 0) { + pIn8_tmp[remainBytes - 1] = (pIn8_tmp[remainBytes - 1] + >> clearBits) + << clearBits; + } + ASM_EIA3_ROUND16B(&T[i * tag_size / 4], keyStr32, + pIn8_tmp, tag_size); + + uint64_t tag, tag1, tag2, ks1, ks2, ks3, ks4; + switch (tag_size) + { + case 4: + T[i * tag_size / 4] ^= rotate_left( + load_uint64(&keyStr32[remainBits / 32]), + remainBits % 32); + /* save the final MAC-I result */ + *(pMacI[i]) = bswap4(T[i * tag_size / 4]); + break; + case 8: + tag = T[i * tag_size / 4 + 1]; + tag = tag << 32 | T[i * tag_size / 4]; + + ks1 = rotate_left( + load_uint64(&keyStr32[remainBits / 32]), + remainBits % 32); + ks2 = rotate_left( + load_uint64(&keyStr32[remainBits / 32 + 1]), + remainBits % 32); + ks1 = (ks1 << 32) >> 32; + ks2 = ks2 << 32; + ks1 = ks1 ^ ks2; + tag = tag ^ ks1; + tag = BSWAP64(tag); + tag = (tag >> 32) | (tag << 32); + /* save the final MAC-I result */ + memcpy(pMacI[i], &tag, tag_size); + break; + case 16: + tag1 = T[i * tag_size / 4 + 1]; + tag1 = tag1 << 32 | T[i * tag_size / 4 ]; + tag2 = T[i * tag_size / 4 + 3]; + tag2 = tag2 << 32 | T[i * tag_size / 4 + 2]; + + ks1 = rotate_left( + load_uint64(&keyStr32[remainBits / 32]), + remainBits % 32); + ks2 = rotate_left( + load_uint64(&keyStr32[remainBits / 32 + 1]), + remainBits % 32); + ks3 = rotate_left( + load_uint64(&keyStr32[remainBits / 32 + 2]), + remainBits % 32); + ks4 = rotate_left( + load_uint64(&keyStr32[remainBits / 32 + 3]), + remainBits % 32); + ks1 = (ks1 << 32) >> 32; + ks2 = ks2 << 32; + ks1 = ks1 ^ ks2; + tag1 = tag1 ^ ks1; + ks3 = (ks3 << 32) >> 32; + ks4 = ks4 << 32; + ks3 = ks3 ^ ks4; + tag2 = tag2 ^ ks3; + tag1 = BSWAP64(tag1); + tag1 = (tag1 >> 32) | (tag1 << 32); + tag2 = BSWAP64(tag2); + tag2 = (tag2 >> 32) | (tag2 << 32); + /* save the final MAC-I result */ + memcpy(pMacI[i], &tag1, 8); + memcpy(pMacI[i] + 2, &tag2, 8); + break; + default: + break; + } + } else { + uint32_t keyBlock = keyStr32[last_key_idx]; + T[i] ^= ASM_EIA3_REMAINDER(keyStr32, pIn8[i], remainBits); + T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), + remainBits % 32); + /* save the final MAC-I result */ + *(pMacI[i]) = bswap4(T[i] ^ keyBlock); + } } #ifdef SAFE_DATA @@ -854,17 +1194,90 @@ void _zuc_eia3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], } static inline -void _zuc_eia3_n_buffer_aarch64(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - const uint32_t lengthInBits[], - uint32_t *pMacI[], - const uint32_t numBuffers) +void _zuc_eia3_n_buffer(const void * const pKey[], + const void * const pIv[], + const void * const pBufferIn[], + const uint32_t lengthInBits[], + uint32_t *pMacI[], + const uint32_t numBuffers) { - unsigned int i; + unsigned int i = 0; unsigned int packetCount = numBuffers; + while(packetCount >= 4) { + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); + unsigned int iv_idx; + for (iv_idx = 0; iv_idx < 4; iv_idx++) + memcpy(ivs + iv_idx * 32, pIv[iv_idx + i], 16); + + packetCount -= 4; + _zuc_eia3_4_buffer(&pKey[i], ivs, &pBufferIn[i], + &lengthInBits[i], &pMacI[i], + 0, NULL, 128, 4); + i += 4; + } + + while(packetCount--) { + _zuc_eia3_1_buffer(pKey[i], pIv[i], pBufferIn[i], + lengthInBits[i], pMacI[i], 128, 4); + i++; + } +} + +void ZUC_EIA3_1_BUFFER(const void *pKey, + const void *pIv, + const void *pBufferIn, + const uint32_t lengthInBits, + uint32_t *pMacI) +{ #ifdef SAFE_PARAM + if (imb_errno != 0) + imb_set_errno(NULL, 0); + /* Check for NULL pointers */ + if (pKey == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } + + if (pIv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } + + if (pBufferIn == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } + + if (pMacI == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return; + } + + /* Check input data is in range of supported length */ + if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return; + } +#endif + + _zuc_eia3_1_buffer(pKey, pIv, pBufferIn, lengthInBits, pMacI, 128, 4); + +#ifdef SAFE_DATA + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +} + +void ZUC_EIA3_4_BUFFER(const void * const pKey[NUM_BUFS], + const void * const pIv[NUM_BUFS], + const void * const pBufferIn[NUM_BUFS], + const uint32_t lengthInBits[NUM_BUFS], + uint32_t *pMacI[NUM_BUFS]) +{ +#ifdef SAFE_PARAM + unsigned int i; if (imb_errno != 0) imb_set_errno(NULL, 0); @@ -895,7 +1308,7 @@ void _zuc_eia3_n_buffer_aarch64(const void * const pKey[], } /* Check for NULL pointers and lengths for each buffer */ - for (i = 0; i < numBuffers; i++) { + for (i = 0; i < NUM_BUFS; i++) { if (pKey[i] == NULL) { imb_set_errno(NULL, IMB_ERR_NULL_KEY); return; @@ -924,45 +1337,35 @@ void _zuc_eia3_n_buffer_aarch64(const void * const pKey[], } } #endif - i = 0; - while(packetCount >= 4) { - packetCount -=4; - _zuc_eia3_4_buffer_aarch64(&pKey[i], - &pIv[i], - &pBufferIn[i], - &lengthInBits[i], - &pMacI[i]); - i+=4; - } + /* structure to store the 4 IV's */ + DECLARE_ALIGNED(uint8_t ivs[NUM_BUFS*32], 16); + unsigned int iv_idx; + for (iv_idx = 0; iv_idx < NUM_BUFS; iv_idx++) + memcpy(ivs + iv_idx * 32, pIv[iv_idx], 16); - while(packetCount--) { - _zuc_eia3_1_buffer_aarch64(pKey[i], - pIv[i], - pBufferIn[i], - lengthInBits[i], - pMacI[i], - true); - i++; - } + _zuc_eia3_4_buffer(pKey, ivs, pBufferIn, lengthInBits, + pMacI, 0, NULL, 128, 4); #ifdef SAFE_DATA /* Clear sensitive data in registers */ CLEAR_SCRATCH_GPS(); CLEAR_SCRATCH_SIMD_REGS(); #endif - } -void zuc_eia3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI) +void ZUC_EIA3_N_BUFFER(const void * const pKey[], + const void * const pIv[], + const void * const pBufferIn[], + const uint32_t lengthInBits[], + uint32_t *pMacI[], + const uint32_t numBuffers) { #ifdef SAFE_PARAM + unsigned int i; if (imb_errno != 0) imb_set_errno(NULL, 0); + /* Check for NULL pointers */ if (pKey == NULL) { imb_set_errno(NULL, IMB_ERR_NULL_KEY); @@ -984,45 +1387,59 @@ void zuc_eia3_1_buffer_aarch64(const void *pKey, return; } - /* Check input data is in range of supported length */ - if (lengthInBits < ZUC_MIN_BITLEN || lengthInBits > ZUC_MAX_BITLEN) { + if (lengthInBits == NULL) { imb_set_errno(NULL, IMB_ERR_AUTH_LEN); return; } + + /* Check for NULL pointers and lengths for each buffer */ + for (i = 0; i < numBuffers; i++) { + if (pKey[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } + + if (pIv[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } + + if (pBufferIn[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } + + if (pMacI[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return; + } + + /* Check input data is in range of supported length */ + if (lengthInBits[i] < ZUC_MIN_BITLEN || + lengthInBits[i] > ZUC_MAX_BITLEN) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return; + } + } #endif - _zuc_eia3_1_buffer_aarch64(pKey, pIv, pBufferIn, lengthInBits, pMacI, true); + _zuc_eia3_n_buffer(pKey, pIv, pBufferIn, lengthInBits, + pMacI, numBuffers); #ifdef SAFE_DATA + /* Clear sensitive data in registers */ CLEAR_SCRATCH_GPS(); CLEAR_SCRATCH_SIMD_REGS(); #endif } -void zuc_eia3_4_buffer_aarch64(const void * const pKey[NUM_BUFS], - const void * const pIv[NUM_BUFS], - const void * const pBufferIn[NUM_BUFS], - const uint32_t lengthInBits[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS]) -{ - _zuc_eia3_4_buffer_aarch64(pKey, pIv, pBufferIn, lengthInBits, pMacI); -} - -void zuc_eia3_n_buffer_aarch64(const void * const pKey[], - const void * const pIv[], - const void * const pBufferIn[], - const uint32_t lengthInBits[], - uint32_t *pMacI[], - const uint32_t numBuffers) -{ - _zuc_eia3_n_buffer_aarch64(pKey, pIv, pBufferIn, lengthInBits, pMacI, numBuffers); -} - -void zuc256_eia3_1_buffer_aarch64(const void *pKey, - const void *pIv, - const void *pBufferIn, - const uint32_t lengthInBits, - uint32_t *pMacI) +void ZUC256_EIA3_1_BUFFER(const void *pKey, + const void *pIv, + const uint32_t ivLen, + const void *pBufferIn, + const uint32_t lengthInBits, + uint32_t *pMacI, + const uint64_t tag_size) { #ifdef SAFE_PARAM if (imb_errno != 0) @@ -1053,9 +1470,24 @@ void zuc256_eia3_1_buffer_aarch64(const void *pKey, imb_set_errno(NULL, IMB_ERR_AUTH_LEN); return; } + if (tag_size != 4 && tag_size != 8 && tag_size != 16) { + imb_set_errno(NULL, IMB_ERR_AUTH_TAG_LEN); + return; + } #endif - _zuc_eia3_1_buffer_aarch64(pKey, pIv, pBufferIn, lengthInBits, pMacI, false); + uint8_t iv[32]; + if (ivLen == 25) { + memcpy(iv, pIv, 25); + } else { + // copy first 17 bytes + memcpy(iv, pIv, 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(iv + 17, (const uint8_t *)pIv + 17); + } + + _zuc_eia3_1_buffer(pKey, iv, pBufferIn, lengthInBits, + pMacI, 256, tag_size); #ifdef SAFE_DATA CLEAR_SCRATCH_GPS(); @@ -1063,347 +1495,166 @@ void zuc256_eia3_1_buffer_aarch64(const void *pKey, #endif } -static inline -void _zuc_eia3_4_buffer_job(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) +void ZUC256_EIA3_N_BUFFER(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + const uint32_t lengthInBits[], + uint32_t *pMacI[], + const uint64_t tag_size, + const uint32_t numBuffers) { - unsigned int i; - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - const uint8_t *pIn8[NUM_BUFS] = {NULL}; - uint32_t remainCommonBits; - uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_BUFS], 16) = {NULL}; - unsigned int allCommonBits; + unsigned int i, iv_idx; + unsigned int packetCount = numBuffers; + uint8_t ivs[32 * NUM_BUFS]; - /* Check if all lengths are equal */ - if ((lengthInBits[0] == lengthInBits[1]) && - (lengthInBits[0] == lengthInBits[2]) && - (lengthInBits[0] == lengthInBits[3])) { - remainCommonBits = lengthInBits[0]; - allCommonBits = 1; - } else { - /* Calculate the minimum input packet size */ - uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); - uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); +#ifdef SAFE_PARAM + if (imb_errno != 0) + imb_set_errno(NULL, 0); - remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; + /* Check for NULL pointers */ + if (pKey == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; } - for (i = 0; i < NUM_BUFS; i++) { - pIn8[i] = (const uint8_t *) pBufferIn[i]; - pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; - keys.pKeys[i] = pKey[i]; + if (pIv == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; } - asm_ZucInitialization_4_aarch64(&keys, ivs, &state); - - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64(&state, pKeyStrArr); - - /* Point at the next 16 bytes of the key */ - for (i = 0; i < NUM_BUFS; i++) - pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; - - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream8B_4_aarch64(&state, pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64(&state, pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } + if (ivLen == NULL) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return; } - /* Process each packet separately for the remaining bits */ - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; - uint32_t *keyStr32 = (uint32_t *) keyStr[i]; - - /* If remaining bits are more than 8 bytes, we need to generate - * at least 8B more of keystream, so we need to copy - * the zuc state to single packet state first */ - if (remainBits > (2*32)) { - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; - - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; - } - - while (remainBits >= keyStreamLengthInBits) { - remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); - - /* Generate the next key stream 8 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream8B_aarch64(&keyStr32[4], - &singlePktState); - else - asm_ZucGenKeystream16B_aarch64(&keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr32, - pIn8[i]); - /* Copy the last keystream generated to the first 16 bytes */ - memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; - } - - /* If remaining bits has more than 2 ZUC WORDS (double words), - * keystream needs to have up to another 2 ZUC WORDS (8B) - */ - if (remainBits > (2 * 32)) - asm_ZucGenKeystream8B_aarch64(&keyStr32[4], - &singlePktState); - - uint32_t keyBlock = keyStr32[L - 1]; - - T[i] ^= asm_Eia3Remainder_aarch64(keyStr32, pIn8[i], remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); - - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i] ^ keyBlock); + if (pBufferIn == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; } -#ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - clear_mem(keyStr, sizeof(keyStr)); - clear_mem(&singlePktState, sizeof(singlePktState)); - clear_mem(&state, sizeof(state)); - clear_mem(&keys, sizeof(keys)); -#endif -} - -static inline -void _zuc256_eia3_4_buffer_job(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) -{ - unsigned int i; - DECLARE_ALIGNED(ZucState4_t state, 64); - DECLARE_ALIGNED(ZucState_t singlePktState, 64); - DECLARE_ALIGNED(uint8_t keyStr[NUM_BUFS][2*KEYSTR_ROUND_LEN], 64); - /* structure to store the 4 keys */ - DECLARE_ALIGNED(ZucKey4_t keys, 64); - const uint8_t *pIn8[NUM_BUFS] = {NULL}; - uint32_t remainCommonBits; - uint32_t numKeyStr = 0; - uint32_t T[NUM_BUFS] = {0}; - const uint32_t keyStreamLengthInBits = KEYSTR_ROUND_LEN * 8; - DECLARE_ALIGNED(uint32_t *pKeyStrArr[NUM_BUFS], 16) = {NULL}; - unsigned int allCommonBits; - - /* Check if all lengths are equal */ - if ((lengthInBits[0] == lengthInBits[1]) && - (lengthInBits[0] == lengthInBits[2]) && - (lengthInBits[0] == lengthInBits[3])) { - remainCommonBits = lengthInBits[0]; - allCommonBits = 1; - } else { - /* Calculate the minimum input packet size */ - uint32_t bits1 = (lengthInBits[0] < lengthInBits[1] ? - lengthInBits[0] : lengthInBits[1]); - uint32_t bits2 = (lengthInBits[2] < lengthInBits[3] ? - lengthInBits[2] : lengthInBits[3]); - - remainCommonBits = (bits1 < bits2) ? bits1 : bits2; - allCommonBits = 0; + if (pMacI == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return; } - for (i = 0; i < NUM_BUFS; i++) { - pIn8[i] = (const uint8_t *) pBufferIn[i]; - pKeyStrArr[i] = (uint32_t *) &keyStr[i][0]; - keys.pKeys[i] = pKey[i]; + if (lengthInBits == NULL) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return; } - /* TODO: Handle 8 and 16-byte digest cases */ - asm_Zuc256Initialization_4_aarch64(&keys, ivs, &state, 4); + /* Check for NULL pointers and lengths for each buffer */ + for (i = 0; i < numBuffers; i++) { + if (pKey[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_KEY); + return; + } - /* Initialize the tags with the first 4 bytes of keystream */ - asm_ZucGenKeystream4B_4_aarch64(&state, pKeyStrArr); + if (pIv[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_IV); + return; + } - for (i = 0; i < NUM_BUFS; i++) - memcpy(&T[i], pKeyStrArr[i], 4); + if (ivLen[i] != 23 && ivLen[i] != 25) { + imb_set_errno(NULL, IMB_ERR_IV_LEN); + return; + } - /* Generate 16 bytes at a time */ - asm_ZucGenKeystream16B_4_aarch64(&state, pKeyStrArr); + if (pBufferIn[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_SRC); + return; + } - /* Point at the next 16 bytes of the key */ - for (i = 0; i < NUM_BUFS; i++) - pKeyStrArr[i] = (uint32_t *) &keyStr[i][KEYSTR_ROUND_LEN]; + if (pMacI[i] == NULL) { + imb_set_errno(NULL, IMB_ERR_NULL_AUTH); + return; + } - /* loop over the message bits */ - while (remainCommonBits >= keyStreamLengthInBits) { - remainCommonBits -= keyStreamLengthInBits; - numKeyStr++; - /* Generate the next key stream 4 bytes or 16 bytes */ - if (!remainCommonBits && allCommonBits) - asm_ZucGenKeystream4B_4_aarch64(&state, - pKeyStrArr); - else - asm_ZucGenKeystream16B_4_aarch64(&state, - pKeyStrArr); - - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr[i], - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(&keyStr[i][0], &keyStr[i][KEYSTR_ROUND_LEN], - KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; + /* Check input data is in range of supported length */ + if (lengthInBits[i] < ZUC_MIN_BITLEN || + lengthInBits[i] > ZUC_MAX_BITLEN) { + imb_set_errno(NULL, IMB_ERR_AUTH_LEN); + return; } } - /* Process each packet separately for the remaining bits */ - for (i = 0; i < NUM_BUFS; i++) { - if (job_in_lane[i] == NULL) - continue; - - const uint32_t N = lengthInBits[i] + (2 * ZUC_WORD_BITS); - uint32_t L = ((N + 31) / ZUC_WORD_BITS) - - numKeyStr*(keyStreamLengthInBits / 32); - uint32_t remainBits = lengthInBits[i] - - numKeyStr*keyStreamLengthInBits; - uint32_t *keyStr32 = (uint32_t *) keyStr[i]; - - /* If remaining bits are more than 4 bytes, we need to generate - * at least 4B more of keystream, so we need to copy - * the zuc state to single packet state first - */ - if (remainBits > 32) { - singlePktState.lfsrState[0] = state.lfsrState[0][i]; - singlePktState.lfsrState[1] = state.lfsrState[1][i]; - singlePktState.lfsrState[2] = state.lfsrState[2][i]; - singlePktState.lfsrState[3] = state.lfsrState[3][i]; - singlePktState.lfsrState[4] = state.lfsrState[4][i]; - singlePktState.lfsrState[5] = state.lfsrState[5][i]; - singlePktState.lfsrState[6] = state.lfsrState[6][i]; - singlePktState.lfsrState[7] = state.lfsrState[7][i]; - singlePktState.lfsrState[8] = state.lfsrState[8][i]; - singlePktState.lfsrState[9] = state.lfsrState[9][i]; - singlePktState.lfsrState[10] = state.lfsrState[10][i]; - singlePktState.lfsrState[11] = state.lfsrState[11][i]; - singlePktState.lfsrState[12] = state.lfsrState[12][i]; - singlePktState.lfsrState[13] = state.lfsrState[13][i]; - singlePktState.lfsrState[14] = state.lfsrState[14][i]; - singlePktState.lfsrState[15] = state.lfsrState[15][i]; + if (tag_size != 4 && tag_size != 8 && tag_size != 16) { + imb_set_errno(NULL, IMB_ERR_AUTH_TAG_LEN); + return; + } +#endif + i = 0; - singlePktState.fR1 = state.fR1[i]; - singlePktState.fR2 = state.fR2[i]; + while(packetCount >= 4) { + for (iv_idx = 0; iv_idx < 4; iv_idx++) { + if (ivLen[i + iv_idx] == 25) { + memcpy(ivs + iv_idx * 32, pIv[i + iv_idx], 25); + } else { + // copy first 17 bytes + memcpy(ivs + iv_idx * 32, pIv[i + iv_idx], 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(ivs + iv_idx * 32 + 17, + (const uint8_t *)pIv[i + iv_idx] + 17); + } } + packetCount -= 4; - while (remainBits >= keyStreamLengthInBits) { - remainBits -= keyStreamLengthInBits; - L -= (keyStreamLengthInBits / 32); + _zuc_eia3_4_buffer(&pKey[i], ivs, &pBufferIn[i], &lengthInBits[i], + &pMacI[i], 0, NULL, 256, tag_size); + i += 4; + } - /* Generate the next key stream 4 bytes or 16 bytes */ - if (!remainBits) - asm_ZucGenKeystream_aarch64(&keyStr32[4], - &singlePktState, 1); - else - asm_ZucGenKeystream16B_aarch64(&keyStr32[4], - &singlePktState); - T[i] = asm_Eia3Round16B_aarch64(T[i], keyStr32, - pIn8[i]); - /* Copy the last keystream generated - * to the first 16 bytes */ - memcpy(keyStr32, &keyStr32[4], KEYSTR_ROUND_LEN); - pIn8[i] = &pIn8[i][KEYSTR_ROUND_LEN]; + while(packetCount--) { + if (ivLen[i] == 25) { + memcpy(ivs, pIv[i], 25); + } else { + // copy first 17 bytes + memcpy(ivs, pIv[i], 17); + // expand next 6 bytes to 8 bytes + expand_from_6_to_8_bytes(ivs + 17, (const uint8_t *)pIv[i] + 17); } - - /* If remaining bits has more than 1 ZUC WORD (double word), - * keystream needs to have another ZUC WORD (4B) */ - if (remainBits > 32) - asm_ZucGenKeystream_aarch64(&keyStr32[4], - &singlePktState, 1); - - T[i] ^= asm_Eia3Remainder_aarch64(keyStr32, pIn8[i], remainBits); - T[i] ^= rotate_left(load_uint64(&keyStr32[remainBits / 32]), - remainBits % 32); - - /* save the final MAC-I result */ - *(pMacI[i]) = bswap4(T[i]); + _zuc_eia3_1_buffer(pKey[i], ivs, pBufferIn[i], lengthInBits[i], + pMacI[i], 256, tag_size); + i++; } #ifdef SAFE_DATA - /* Clear sensitive data (in registers and stack) */ - clear_mem(keyStr, sizeof(keyStr)); - clear_mem(&singlePktState, sizeof(singlePktState)); - clear_mem(&state, sizeof(state)); - clear_mem(&keys, sizeof(keys)); + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); #endif } -void zuc_eia3_4_buffer_job_aarch64(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) +void ZUC_EIA3_4_BUFFER_JOB(const void * const pKey[NUM_BUFS], + const uint8_t *ivs, + const void * const pBufferIn[NUM_BUFS], + uint32_t *pMacI[NUM_BUFS], + const uint32_t lengthInBits[NUM_BUFS], + const void * const job_in_lane[NUM_BUFS]) { - _zuc_eia3_4_buffer_job(pKey, ivs, pBufferIn, pMacI, lengthInBits, - job_in_lane); + + _zuc_eia3_4_buffer(pKey, ivs, pBufferIn, lengthInBits, + pMacI, 1, job_in_lane, 128, 4); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif } -void zuc256_eia3_4_buffer_job_aarch64(const void * const pKey[NUM_BUFS], - const uint8_t *ivs, - const void * const pBufferIn[NUM_BUFS], - uint32_t *pMacI[NUM_BUFS], - const uint16_t lengthInBits[NUM_BUFS], - const void * const job_in_lane[NUM_BUFS]) +void ZUC256_EIA3_4_BUFFER_JOB(const void * const pKey[NUM_BUFS], + const uint8_t *ivs, + const void * const pBufferIn[NUM_BUFS], + uint32_t *pMacI[NUM_BUFS], + const uint32_t lengthInBits[NUM_BUFS], + const void * const job_in_lane[NUM_BUFS], + const uint64_t tag_size) { - _zuc256_eia3_4_buffer_job(pKey, ivs, pBufferIn, pMacI, lengthInBits, - job_in_lane); -} + _zuc_eia3_4_buffer(pKey, ivs, pBufferIn, lengthInBits, pMacI, + 1, job_in_lane, 256, tag_size); +#ifdef SAFE_DATA + /* Clear sensitive data in registers */ + CLEAR_SCRATCH_GPS(); + CLEAR_SCRATCH_SIMD_REGS(); +#endif +} \ No newline at end of file diff --git a/lib/aarch64/zuc_common.S b/lib/aarch64/zuc_common.S index fc61ba03d368f446d332b81be8cb6400cdbac57e..7ed729bd510833ba9ff624af5a987425ef62e0b9 100644 --- a/lib/aarch64/zuc_common.S +++ b/lib/aarch64/zuc_common.S @@ -368,10 +368,6 @@ declare_register pD, x22 .set counter, (counter+1) .endr -// Reorder LFSR registers, as not all 16 rounds have been completed -// (if number of rounder is not 4,8 or 16, the only possible case is 2 -// and in that case, we don't have to update the states, as that function -// call is done at the end the algorithm .if \NUM_ROUNDS == 8 ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [pState] st1 {v2.16b, v3.16b}, [pState] @@ -384,6 +380,18 @@ declare_register pD, x22 str q0, [pstate, #48] .endif +.if \NUM_ROUNDS == 2 + mov xTMP, pState + ldp x8, x9, [xTMP], #16 + ldp x10, x11, [xTMP], #16 + ldp x12, x13, [xTMP], #16 + ldp x14, x15, [xTMP] + stp x9, x10, [pState] + stp x11, x12, [pState, #16] + stp x13, x14, [pState, #32] + stp x15, x8, [pState, #48] +.endif + .if \NUM_ROUNDS == 1 mov xTMP, pState ldr w10, [xTMP], #4 diff --git a/lib/aarch64/zuc_simd.S b/lib/aarch64/zuc_simd.S index c1cece9859199e5d13fe0674328c80f438be305a..2cd39a3f70c4d7f41e5658b585a92928d6308f04 100644 --- a/lib/aarch64/zuc_simd.S +++ b/lib/aarch64/zuc_simd.S @@ -28,20 +28,22 @@ #include "zuc_sbox.S" #include "zuc_common.S" -#ifndef ZUC_CIPHER_4 -#define ZUC_CIPHER_4 asm_ZucCipher_4_aarch64 -#define ZUC128_INIT_4 asm_ZucInitialization_4_aarch64 -#define ZUC256_INIT asm_Zuc256Initialization_aarch64 -#define ZUC256_INIT_4 asm_Zuc256Initialization_4_aarch64 -#define ZUC_KEYGEN16B_4 asm_ZucGenKeystream16B_4_aarch64 -#define ZUC_KEYGEN8B_4 asm_ZucGenKeystream8B_4_aarch64 -#define ZUC_KEYGEN4B_4 asm_ZucGenKeystream4B_4_aarch64 -#define ZUC_EIA3ROUND16B asm_Eia3Round16B_aarch64 -#define ZUC_EIA3REMAINDER asm_Eia3Remainder_aarch64 -#define ZUC_XORKEYSTREAM16B asm_XorKeyStream16B_aarch64 +#ifndef ZUC_CIPHER_4 +#define ZUC_CIPHER_4 asm_ZucCipher_4_aarch64 +#define ZUC128_INIT_4 asm_ZucInitialization_4_aarch64 +#define ZUC256_INIT asm_Zuc256Initialization_aarch64 +#define ZUC256_INIT_4 asm_Zuc256Initialization_4_aarch64 +#define ZUC_KEYGEN16B_4 asm_ZucGenKeystream16B_4_aarch64 +#define ZUC_KEYGEN8B_4 asm_ZucGenKeystream8B_4_aarch64 +#define ZUC_KEYGEN4B_4 asm_ZucGenKeystream4B_4_aarch64 +#define ZUC_EIA3ROUND16B asm_Eia3Round16B_aarch64 +#define ZUC_EIA3REMAINDER asm_Eia3Remainder_aarch64 +#define ZUC_XORKEYSTREAM16B asm_XorKeyStream16B_aarch64 +#define ZUC128_AUTH_4 asm_ZucAuth_4_aarch64 +#define ZUC256_AUTH_4 asm_Zuc256Auth_4_aarch64 #endif -#define IMB_FEATURE_PMULL (1ULL << 34) +#define IMB_FEATURE_PMULL (1ULL << 34) .arch armv8-a+crypto @@ -53,8 +55,8 @@ // 64-127 bytes: Authentication with tag size = 4 // 128-191 bytes: Authentication with tag size = 8 // 192-255 bytes: Authentication with tag size = 16 -.align 8 -.type EK256_d64, %object +.align 8 +.type EK256_d64, %object EK256_d64: .word 0x00220000, 0x002F0000, 0x00240000, 0x002A0000 .word 0x006D0000, 0x00400000, 0x00400000, 0x00400000 @@ -72,10 +74,10 @@ EK256_d64: .word 0x006D0000, 0x00400000, 0x00400000, 0x00400000 .word 0x00400000, 0x00400000, 0x00400000, 0x00400000 .word 0x00400000, 0x00520000, 0x00100000, 0x00300000 -.size EK256_d64,.-EK256_d64 +.size EK256_d64,.-EK256_d64 -.align 6 -.type Ek_d, %object +.align 6 +.type Ek_d, %object Ek_d: .word 0x0044D700, 0x0026BC00, 0x00626B00, 0x00135E00 .word 0x00578900, 0x0035E200, 0x00713500, 0x0009AF00 @@ -83,8 +85,8 @@ Ek_d: .word 0x005E2600, 0x003C4D00, 0x00789A00, 0x0047AC00 .size Ek_d,.-Ek_d -.align 6 -.type shuf_mask_key, %object +.align 6 +.type shuf_mask_key, %object shuf_mask_key: .word 0x00FFFFFF, 0x01FFFFFF, 0x02FFFFFF, 0x03FFFFFF .word 0x04FFFFFF, 0x05FFFFFF, 0x06FFFFFF, 0x07FFFFFF @@ -92,45 +94,45 @@ shuf_mask_key: .word 0x0CFFFFFF, 0x0DFFFFFF, 0x0EFFFFFF, 0x0FFFFFFF .size shuf_mask_key,.-shuf_mask_key -.align 6 -.type shuf_mask_iv, %object +.align 6 +.type shuf_mask_iv, %object shuf_mask_iv: -.word 0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03 -.word 0xFFFFFF04, 0xFFFFFF05, 0xFFFFFF06, 0xFFFFFF07 -.word 0xFFFFFF08, 0xFFFFFF09, 0xFFFFFF0A, 0xFFFFFF0B -.word 0xFFFFFF0C, 0xFFFFFF0D, 0xFFFFFF0E, 0xFFFFFF0F -.size shuf_mask_iv,.-shuf_mask_iv - -.align 4 -.type KS_reorder, %object +.word 0xFFFFFF00, 0xFFFFFF01, 0xFFFFFF02, 0xFFFFFF03 +.word 0xFFFFFF04, 0xFFFFFF05, 0xFFFFFF06, 0xFFFFFF07 +.word 0xFFFFFF08, 0xFFFFFF09, 0xFFFFFF0A, 0xFFFFFF0B +.word 0xFFFFFF0C, 0xFFFFFF0D, 0xFFFFFF0E, 0xFFFFFF0F +.size shuf_mask_iv,.-shuf_mask_iv + +.align 4 +.type KS_reorder, %object KS_reorder: -.quad 0x0302010007060504, 0x070605040b0a0908 -.size KS_reorder,.-KS_reorder +.quad 0x0302010007060504, 0x070605040b0a0908 +.size KS_reorder,.-KS_reorder .text -#define OFS_R1 (16*16) -#define OFS_R2 (OFS_R1 + 16) -#define OFS_X0 (OFS_R2 + 16) -#define OFS_X1 (OFS_X0 + 16) -#define OFS_X2 (OFS_X1 + 16) +#define OFS_R1 (16*16) +#define OFS_R2 (OFS_R1 + 16) +#define OFS_X0 (OFS_R2 + 16) +#define OFS_X1 (OFS_X0 + 16) +#define OFS_X2 (OFS_X1 + 16) .altmacro -declare_register xTMP x23 +declare_register xTMP, x23 /* v0-v15 are assigned to LFSR0-15, should not be reused in CIPHERNx4B_4 */ /* v24-v26 are assigned to BRCX0-2, could be reused in CIPHERNx4B_4 */ -declare_register vBRCX0 v24 -declare_register vBRCX1 v25 -declare_register vBRCX2 v26 +declare_register vBRCX0, v24 +declare_register vBRCX1, v25 +declare_register vBRCX2, v26 /* v27-v28 are assigned to FR1-2, should not be reused in CIPHERNx4B_4 */ -declare_register vFR1 v27 -declare_register vFR2 v28 -declare_register qFR1 q27 -declare_register qFR2 q28 +declare_register vFR1, v27 +declare_register vFR2, v28 +declare_register qFR1, q27 +declare_register qFR2, q28 -.macro FUNC_SAVE +.macro FUNC_SAVE stp x29, x30, [sp, -160]! - stp d8, d9, [sp, 16] + stp d8, d9, [sp, 16] stp d10, d11, [sp, 32] stp d12, d13, [sp, 48] stp d14, d15, [sp, 64] @@ -141,7 +143,7 @@ declare_register qFR2 q28 stp x27, x28, [sp, 144] .endm -.macro FUNC_RESTORE +.macro FUNC_RESTORE ldp d8, d9,[sp, 16] ldp d10, d11, [sp, 32] ldp d12, d13, [sp, 48] @@ -166,163 +168,161 @@ declare_register qFR2 q28 // Where k_i is each byte of the key, d_i is a 15-bit constant // and iv_i is each byte of the IV. // -.macro INIT_LFSR_128 KEY, IV, SHUF_KEY, SHUF_IV, EKD_MASK, LFSR, XTMP - tbl v\LFSR\().16b, {v\KEY\().16b}, \SHUF_KEY\().16b - ushr v\LFSR\().4s, v\LFSR\().4s, #1 - tbl \XTMP\().16b, {v\IV\().16b}, \SHUF_IV\().16b - eor v\LFSR\().16b, v\LFSR\().16b, \XTMP\().16b - eor v\LFSR\().16b, v\LFSR\().16b, \EKD_MASK\().16b +.macro INIT_LFSR_128 KEY, IV, SHUF_KEY, SHUF_IV, EKD_MASK, LFSR, XTMP + tbl v\LFSR\().16b, {v\KEY\().16b}, \SHUF_KEY\().16b + ushr v\LFSR\().4s, v\LFSR\().4s, #1 + tbl \XTMP\().16b, {v\IV\().16b}, \SHUF_IV\().16b + eor v\LFSR\().16b, v\LFSR\().16b, \XTMP\().16b + eor v\LFSR\().16b, v\LFSR\().16b, \EKD_MASK\().16b .endm -.macro rot_mod32 vOUT, vIN, ROTATE, vTMP - ushr \vOUT\().4s, \vIN\().4s, 32-\ROTATE - sli \vOUT\().4s, \vIN\().4s, \ROTATE +.macro rot_mod32 vOUT, vIN, ROTATE, vTMP + ushr \vOUT\().4s, \vIN\().4s, 32-\ROTATE + sli \vOUT\().4s, \vIN\().4s, \ROTATE .endm -.macro TRANSPOSE4_U32 V_0, V_1, V_2, V_3, T_0, T_1, T_2, T_3 - zip1 v\T_0\().4s, v\V_0\().4s, v\V_1\().4s // T_0 = {b1 a1 b0 a0} - zip2 v\T_1\().4s, v\V_0\().4s, v\V_1\().4s // T_1 = {b3 a3 b2 a2} - zip1 v\T_2\().4s, v\V_2\().4s, v\V_3\().4s // T_2 = {d1 c1 d0 c0} - zip2 v\T_3\().4s, v\V_2\().4s, v\V_3\().4s // T_3 = {d3 c3 d2 c2} +.macro TRANSPOSE4_U32 V_0, V_1, V_2, V_3, T_0, T_1, T_2, T_3 + zip1 v\T_0\().4s, v\V_0\().4s, v\V_1\().4s // T_0 = {b1 a1 b0 a0} + zip2 v\T_1\().4s, v\V_0\().4s, v\V_1\().4s // T_1 = {b3 a3 b2 a2} + zip1 v\T_2\().4s, v\V_2\().4s, v\V_3\().4s // T_2 = {d1 c1 d0 c0} + zip2 v\T_3\().4s, v\V_2\().4s, v\V_3\().4s // T_3 = {d3 c3 d2 c2} - zip1 v\V_0\().2d, v\T_0\().2d, v\T_2\().2d // V_0 = {d0 c0 b0 a0} - zip2 v\V_1\().2d, v\T_0\().2d, v\T_2\().2d // V_1 = {d1 c1 b1 a1} - zip1 v\V_2\().2d, v\T_1\().2d, v\T_3\().2d // V_2 = {d2 c2 b2 a2} - zip2 v\V_3\().2d, v\T_1\().2d, v\T_3\().2d // V_3 = {d3 c3 b3 a3} + zip1 v\V_0\().2d, v\T_0\().2d, v\T_2\().2d // V_0 = {d0 c0 b0 a0} + zip2 v\V_1\().2d, v\T_0\().2d, v\T_2\().2d // V_1 = {d1 c1 b1 a1} + zip1 v\V_2\().2d, v\T_1\().2d, v\T_3\().2d // V_2 = {d2 c2 b2 a2} + zip2 v\V_3\().2d, v\T_1\().2d, v\T_3\().2d // V_3 = {d3 c3 b3 a3} .endm -.macro USHR_4S vd, n, rot - ushr \vd\().4s, v\n\().4s, #\rot\() +.macro USHR_4S vd, n, rot + ushr \vd\().4s, v\n\().4s, #\rot\() .endm -.macro STR_Q i, addrreg, offset - str q\i\(), [\addrreg\(), #\offset\()] +.macro STR_Q i, addrreg, offset + str q\i\(), [\addrreg\(), #\offset\()] .endm -.macro LDR_Q i, addrreg, offset - ldr q\i\(), [\addrreg\(), #\offset\()] +.macro LDR_Q i, addrreg, offset + ldr q\i\(), [\addrreg\(), #\offset\()] .endm -.macro TRN1_8H vd, n, m - trn1 \vd\().8h, v\n\().8h, v\m\().8h +.macro TRN1_8H vd, n, m + trn1 \vd\().8h, v\n\().8h, v\m\().8h .endm -.macro bits_reorg4 ROUND_NUM, OUTPUT_X3=0, X3 - USHR_4S v20, %((15 + \ROUND_NUM) % 16), 15 - USHR_4S v21, %((9 + \ROUND_NUM) % 16), 15 - USHR_4S v22, %((5 + \ROUND_NUM) % 16), 15 +.macro bits_reorg4 ROUND_NUM, OUTPUT_X3=0, X3 + USHR_4S v20, %((15 + \ROUND_NUM) % 16), 15 + USHR_4S v21, %((9 + \ROUND_NUM) % 16), 15 + USHR_4S v22, %((5 + \ROUND_NUM) % 16), 15 .if \OUTPUT_X3 == 1 - USHR_4S v29, %((0 + \ROUND_NUM) % 16), 15 + USHR_4S v29, %((0 + \ROUND_NUM) % 16), 15 .endif - TRN1_8H vBRCX0, %((14 + \ROUND_NUM) % 16), 20 - TRN1_8H vBRCX1, 21, %((11 + \ROUND_NUM) % 16) - TRN1_8H vBRCX2, 22, %((7 + \ROUND_NUM) % 16) + TRN1_8H vBRCX0, %((14 + \ROUND_NUM) % 16), 20 + TRN1_8H vBRCX1, 21, %((11 + \ROUND_NUM) % 16) + TRN1_8H vBRCX2, 22, %((7 + \ROUND_NUM) % 16) .if \OUTPUT_X3 == 1 - TRN1_8H v\X3\(), 29, %((2 + \ROUND_NUM) % 16) // BRC_X3 + TRN1_8H v\X3\(), 29, %((2 + \ROUND_NUM) % 16) // BRC_X3 .endif .endm -.macro nonlin_fun4 OUTPUT_W=0, V_W +.macro nonlin_fun4 OUTPUT_W=0, V_W .if \OUTPUT_W == 1 - eor \V_W\().16b, vBRCX0.16b, vFR1.16b - add \V_W\().4s, \V_W\().4s, vFR2.4s // W = (BRC_X0 ^ F_R1) + F_R2 + eor \V_W\().16b, vBRCX0.16b, vFR1.16b + add \V_W\().4s, \V_W\().4s, vFR2.4s // W = (BRC_X0 ^ F_R1) + F_R2 .endif - add v23.4s, vFR1.4s, vBRCX1.4s // W1 = F_R1 + BRC_X1 - eor v24.16b, vFR2.16b, vBRCX2.16b // W2 = F_R2 ^ BRC_X2 - - ushr v25.4s, v23.4s, #16 - ushr v26.4s, v24.4s, #16 - trn1 v21.8h, v25.8h, v24.8h // W1L || W2H - trn1 v20.8h, v26.8h, v23.8h // W2L || W1H - - ushr v26.4s, v21.4s, 32-8 - ushr v23.4s, v20.4s, 32-10 - ushr v27.4s, v21.4s, 32-14 - ushr v22.4s, v20.4s, 32-2 - ushr v28.4s, v21.4s, 32-22 - ushr v24.4s, v20.4s, 32-18 - ushr v31.4s, v21.4s, 32-30 - ushr v25.4s, v20.4s, 32-24 - sli v26.4s, v21.4s, 8 - sli v23.4s, v20.4s, 10 - sli v27.4s, v21.4s, 14 - sli v22.4s, v20.4s, 2 - sli v28.4s, v21.4s, 22 - sli v24.4s, v20.4s, 18 - sli v31.4s, v21.4s, 30 - sli v25.4s, v20.4s, 24 - eor v26.16b, v26.16b, v27.16b - eor v22.16b, v22.16b, v23.16b - eor v28.16b, v28.16b, v31.16b - eor v24.16b, v24.16b, v25.16b - eor v21.16b, v21.16b, v26.16b - eor v20.16b, v20.16b, v22.16b - eor v21.16b, v21.16b, v28.16b // v21 = V = L2(Q) - eor v20.16b, v20.16b, v24.16b // v20 = U = L1(P) + add v23.4s, vFR1.4s, vBRCX1.4s // W1 = F_R1 + BRC_X1 + eor v24.16b, vFR2.16b, vBRCX2.16b // W2 = F_R2 ^ BRC_X2 + + ushr v25.4s, v23.4s, #16 + ushr v26.4s, v24.4s, #16 + trn1 v21.8h, v25.8h, v24.8h // W1L || W2H + trn1 v20.8h, v26.8h, v23.8h // W2L || W1H + + ushr v26.4s, v21.4s, 32-8 + ushr v23.4s, v20.4s, 32-10 + ushr v27.4s, v21.4s, 32-14 + ushr v22.4s, v20.4s, 32-2 + ushr v28.4s, v21.4s, 32-22 + ushr v24.4s, v20.4s, 32-18 + ushr v31.4s, v21.4s, 32-30 + ushr v25.4s, v20.4s, 32-24 + sli v26.4s, v21.4s, 8 + sli v23.4s, v20.4s, 10 + sli v27.4s, v21.4s, 14 + sli v22.4s, v20.4s, 2 + sli v28.4s, v21.4s, 22 + sli v24.4s, v20.4s, 18 + sli v31.4s, v21.4s, 30 + sli v25.4s, v20.4s, 24 + eor v26.16b, v26.16b, v27.16b + eor v22.16b, v22.16b, v23.16b + eor v28.16b, v28.16b, v31.16b + eor v24.16b, v24.16b, v25.16b + eor v21.16b, v21.16b, v26.16b + eor v20.16b, v20.16b, v22.16b + eor v21.16b, v21.16b, v28.16b // v21 = V = L2(Q) + eor v20.16b, v20.16b, v24.16b // v20 = U = L1(P) // shuffle U and V to have all S0 lookups in v20 and all S1 lookups in v21 // Compress all S0 and S1 input values in each register - trn1 v23.16b, v21.16b, v20.16b - trn2 v22.16b, v21.16b, v20.16b + trn1 v23.16b, v21.16b, v20.16b + trn2 v22.16b, v21.16b, v20.16b // Compute S0 and S1 values - S0_compute_NEON v22, v20, v21 - S1_compute_NEON v23, v20, v21, v31 + S0_compute_NEON v22, v20, v21 + S1_compute_NEON v23, v20, v21, v31 // Need to shuffle back v20 & v21 before storing output // (revert what was done before S0 and S1 computations) - trn1 vFR2.16b, v23.16b, v22.16b - trn2 vFR1.16b, v23.16b, v22.16b + trn1 vFR2.16b, v23.16b, v22.16b + trn2 vFR1.16b, v23.16b, v22.16b .endm // add_mod31() // add two 32-bit args and reduce mod (2^31-1) -.macro add_mod31 d, n, TMP - add v\d\().4s, v\d\().4s, v\n\().4s - ushr v\TMP\().4s, v\d\().4s, #31 - bic v\d\().4s, #0x80, LSL #24 - add v\d\().4s, v\d\().4s, v\TMP\().4s +.macro add_mod31 d, n, TMP + add v\d\().4s, v\d\().4s, v\n\().4s + ushr v\TMP\().4s, v\d\().4s, #31 + bic v\d\().4s, #0x80, LSL #24 + add v\d\().4s, v\d\().4s, v\TMP\().4s .endm // rot_mod31() // rotate (mult by pow of 2) 32-bit arg and reduce mod (2^31-1) -.macro rot_mod31 ARG, DST, BITS - ushr v\DST\().4s, v\ARG\().4s, (31 - \BITS) - sli v\DST\().4s, v\ARG\().4s, \BITS - bic v\DST\().4s, #0x80, LSL #24 +.macro rot_mod31 ARG, DST, BITS + ushr v\DST\().4s, v\ARG\().4s, (31 - \BITS) + sli v\DST\().4s, v\ARG\().4s, \BITS + bic v\DST\().4s, #0x80, LSL #24 .endm // TMP vreg: v20-v26, v31 -.macro lfsr_updt4 ROUND_NUM, W_ADD=0, V_W +.macro lfsr_updt4 ROUND_NUM, W_ADD=0, V_W // Calculate LFSR feedback // s0 = w>>1 + 2^15*s15 + 2^17*s13 + 2^21*s10 + 2^20*s4 + (1+2^8)*s0 mod (2^31-1); - rot_mod31 %((0 + \ROUND_NUM) % 16), 20, 8 - rot_mod31 %((4 + \ROUND_NUM) % 16), 21, 20 - rot_mod31 %((10 + \ROUND_NUM) % 16), 22, 21 - rot_mod31 %((13 + \ROUND_NUM) % 16), 23, 17 - rot_mod31 %((15 + \ROUND_NUM) % 16), 24, 15 - add_mod31 20, 21, 25 - add_mod31 22, 23, 26 - add_mod31 %((0 + \ROUND_NUM) % 16), 24, 31 - add_mod31 20, 22, 25 - add_mod31 %((0 + \ROUND_NUM) % 16), 20, 31 + rot_mod31 %((0 + \ROUND_NUM) % 16), 20, 8 + rot_mod31 %((4 + \ROUND_NUM) % 16), 21, 20 + rot_mod31 %((10 + \ROUND_NUM) % 16), 22, 21 + rot_mod31 %((13 + \ROUND_NUM) % 16), 23, 17 + rot_mod31 %((15 + \ROUND_NUM) % 16), 24, 15 + add_mod31 20, 21, 25 + add_mod31 22, 23, 26 + add_mod31 %((0 + \ROUND_NUM) % 16), 24, 31 + add_mod31 20, 22, 25 + add_mod31 %((0 + \ROUND_NUM) % 16), 20, 31 .if \W_ADD == 1 - add_mod31 %((0 + \ROUND_NUM) % 16), \V_W, 26 + add_mod31 %((0 + \ROUND_NUM) % 16), \V_W, 26 .endif .endm -.macro load_key_iv i, j, pKe, pIv, off - ldr x8, [pKe, \off] - //ldr x9, [pIv, \off] - //add x9, pIv, \off*4 - ldr q\i, [x8] - ldr q\j, [pIv, \off*4] +.macro load_key_iv i, j, pKe, pIv, off + ldr x8, [pKe, \off] + ldr q\i, [x8] + ldr q\j, [pIv, \off*4] .endm -.macro str_vi i, pState, off - str q\i, [\pState, 4*\off + 16*\i] +.macro str_vi i, pState, off + str q\i, [\pState, 4*\off + 16*\i] .endm // @@ -340,621 +340,1085 @@ declare_register qFR2 q28 // [clobbered] wTP temporary register // [in] CONSTANTS Address to constants // -.macro INIT_LFSR_256 KEY, IV, LFSR0_3, LFSR4_7, LFSR8_11, LFSR12_15, \ - vKEY1, vKEY2, vTMP, xTP, wTP, CONSTANTS - ld1 {\vKEY1\().16b, \vKEY2\().16b}, [\KEY] +.macro INIT_LFSR_256 KEY, IV, LFSR0_3, LFSR4_7, LFSR8_11, LFSR12_15, \ + vKEY1, vKEY2, vTMP, xTP, wTP, CONSTANTS + ld1 {\vKEY1\().16b, \vKEY2\().16b}, [\KEY] // s0 - s3 - eor \LFSR0_3\().16b, \LFSR0_3\().16b, \LFSR0_3\().16b - ins \LFSR0_3\().B[3], \vKEY1\().B[0] // s0 - ins \LFSR0_3\().B[7], \vKEY1\().B[1] // s1 - ins \LFSR0_3\().B[11], \vKEY1\().B[2] // s2 - ins \LFSR0_3\().B[15], \vKEY1\().B[3] // s3 + eor \LFSR0_3\().16b, \LFSR0_3\().16b, \LFSR0_3\().16b + ins \LFSR0_3\().B[3], \vKEY1\().B[0] // s0 + ins \LFSR0_3\().B[7], \vKEY1\().B[1] // s1 + ins \LFSR0_3\().B[11], \vKEY1\().B[2] // s2 + ins \LFSR0_3\().B[15], \vKEY1\().B[3] // s3 ushr \LFSR0_3\().4s, \LFSR0_3\().4s, #1 - ld1 {\vTMP\().16b}, [\CONSTANTS], #16 - orr \LFSR0_3\().16b, \LFSR0_3\().16b, \vTMP\().16b // s0 - s3 + ld1 {\vTMP\().16b}, [\CONSTANTS], #16 + orr \LFSR0_3\().16b, \LFSR0_3\().16b, \vTMP\().16b // s0 - s3 - ins \LFSR0_3\().B[1], \vKEY2\().B[5] // s0 k21 - ins \LFSR0_3\().B[0], \vKEY2\().B[0] // s0 k16 + ins \LFSR0_3\().B[1], \vKEY2\().B[5] // s0 k21 + ins \LFSR0_3\().B[0], \vKEY2\().B[0] // s0 k16 - ins \LFSR0_3\().B[5], \vKEY2\().B[6] // s1 k22 - ins \LFSR0_3\().B[4], \vKEY2\().B[1] // s1 k17 + ins \LFSR0_3\().B[5], \vKEY2\().B[6] // s1 k22 + ins \LFSR0_3\().B[4], \vKEY2\().B[1] // s1 k17 - ins \LFSR0_3\().B[9], \vKEY2\().B[7] // s2 k23 - ins \LFSR0_3\().B[8], \vKEY2\().B[2] // s2 k18 + ins \LFSR0_3\().B[9], \vKEY2\().B[7] // s2 k23 + ins \LFSR0_3\().B[8], \vKEY2\().B[2] // s2 k18 - ins \LFSR0_3\().B[13], \vKEY2\().B[8] // s3 k24 - ins \LFSR0_3\().B[12], \vKEY2\().B[3] // s3 k19 + ins \LFSR0_3\().B[13], \vKEY2\().B[8] // s3 k24 + ins \LFSR0_3\().B[12], \vKEY2\().B[3] // s3 k19 // s4 - s7 - mov xTP, IV // xTP = IV + 0 - eor \LFSR4_7\().16b, \LFSR4_7\().16b, \LFSR4_7\().16b - ins \LFSR4_7\().B[3], \vKEY1\().B[4] // s4 - ld1 {\LFSR4_7\().B}[7], [xTP] // s5 - add xTP, xTP, #1 // xTP = IV + 1 - ld1 {\LFSR4_7\().B}[11], [xTP] // s6 - add xTP, xTP, #9 // xTP = IV + 10 - ld1 {\LFSR4_7\().B}[15], [xTP] // s7 - add xTP, xTP, #-8 // xTP = IV + 2 - - ushr \LFSR4_7\().4s, \LFSR4_7\().4s, #1 - - ins \LFSR4_7\().B[1],\vKEY2\().B[9] // s4 k25 - ins \LFSR4_7\().B[0],\vKEY2\().B[4] // s4 k20 - - ins \LFSR4_7\().B[5],\vKEY1\().B[5] // s5 k5 - ins \LFSR4_7\().B[4],\vKEY2\().B[10] // s5 k26 - - ins \LFSR4_7\().B[9],\vKEY1\().B[6] // s6 k6 - ins \LFSR4_7\().B[8],\vKEY2\().B[11] // s6 k27 - - ins \LFSR4_7\().B[13],\vKEY1\().B[7] // s7 k7 - ld1 {\LFSR4_7\().B}[12], [xTP] // s7 - add xTP, xTP, #15 // xTP = IV + 17 - - ld1 {\vTMP\().16b}, [\CONSTANTS], #16 - orr \LFSR4_7\().16b, \LFSR4_7\().16b, \vTMP\().16b // s4 - s7 - - eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b - ld1 {\vTMP\().B}[6], [xTP] - add xTP, xTP, #1 // xTP = IV + 18 - ld1 {\vTMP\().B}[10], [xTP] - add xTP, xTP, #1 // xTP = IV + 19 - ld1 {\vTMP\().B}[14], [xTP] - add xTP, xTP, #-14 // xTP = IV + 5 + mov xTP, IV // xTP = IV + 0 + eor \LFSR4_7\().16b, \LFSR4_7\().16b, \LFSR4_7\().16b + ins \LFSR4_7\().B[3], \vKEY1\().B[4] // s4 + ld1 {\LFSR4_7\().B}[7], [xTP] // s5 + add xTP, xTP, #1 // xTP = IV + 1 + ld1 {\LFSR4_7\().B}[11], [xTP] // s6 + add xTP, xTP, #9 // xTP = IV + 10 + ld1 {\LFSR4_7\().B}[15], [xTP] // s7 + add xTP, xTP, #-8 // xTP = IV + 2 + + ushr \LFSR4_7\().4s, \LFSR4_7\().4s, #1 + + ins \LFSR4_7\().B[1],\vKEY2\().B[9] // s4 k25 + ins \LFSR4_7\().B[0],\vKEY2\().B[4] // s4 k20 + + ins \LFSR4_7\().B[5],\vKEY1\().B[5] // s5 k5 + ins \LFSR4_7\().B[4],\vKEY2\().B[10] // s5 k26 + + ins \LFSR4_7\().B[9],\vKEY1\().B[6] // s6 k6 + ins \LFSR4_7\().B[8],\vKEY2\().B[11] // s6 k27 + + ins \LFSR4_7\().B[13],\vKEY1\().B[7] // s7 k7 + ld1 {\LFSR4_7\().B}[12], [xTP] // s7 + add xTP, xTP, #15 // xTP = IV + 17 + + ld1 {\vTMP\().16b}, [\CONSTANTS], #16 + orr \LFSR4_7\().16b, \LFSR4_7\().16b, \vTMP\().16b // s4 - s7 + + eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b + ld1 {\vTMP\().B}[6], [xTP] + add xTP, xTP, #1 // xTP = IV + 18 + ld1 {\vTMP\().B}[10], [xTP] + add xTP, xTP, #1 // xTP = IV + 19 + ld1 {\vTMP\().B}[14], [xTP] + add xTP, xTP, #-14 // xTP = IV + 5 // LFSR8_11 = 0x003f0000 0x003f0000 0x003f0000 0x003f0000 movi \LFSR8_11\().4s, 0x3f, lsl 16 - and \vTMP\().16b, \vTMP\().16b, \LFSR8_11\().16b + and \vTMP\().16b, \vTMP\().16b, \LFSR8_11\().16b - orr \LFSR4_7\().16b, \LFSR4_7\().16b, \vTMP\().16b + orr \LFSR4_7\().16b, \LFSR4_7\().16b, \vTMP\().16b // s8 - s11 - eor \LFSR8_11\().16b, \LFSR8_11\().16b, \LFSR8_11\().16b - ins \LFSR8_11\().b[3], \vKEY1\().b[8] // s8 - ins \LFSR8_11\().b[7], \vKEY1\().b[9] // s9 - ld1 {\LFSR8_11\().b}[11], [xTP] // s10 - add xTP, xTP, #-2 // xTP = IV + 3 - ins \LFSR8_11\().b[15], \vKEY1\().b[11] // s11 - - ushr \LFSR8_11\().4s, \LFSR8_11\().4s, #1 - - ld1 {\LFSR8_11\().b}[1], [xTP] // s8 - add xTP, xTP, #8 // xTP = IV + 11 - ld1 {\LFSR8_11\().b}[0], [xTP] // s8 - add xTP, xTP, #1 // xTP = IV + 12 - - ld1 {\LFSR8_11\().b}[5], [xTP] // s9 - add xTP, xTP, #-8 // xTP = IV + 4 - ld1 {\LFSR8_11\().b}[4], [xTP] // s9 - add xTP, xTP, #2 // xTP = IV + 6 - - ins \LFSR8_11\().b[9], \vKEY1\().b[10] // s10 k10 - ins \LFSR8_11\().b[8], \vKEY2\().b[12] // s10 k28 - - ld1 {\LFSR8_11\().b}[13], [xTP] // s11 - add xTP, xTP, #7 // xTP = IV + 13 - ld1 {\LFSR8_11\().b}[12], [xTP] // s11 - add xTP, xTP, #7 // xTP = IV + 20 - - ld1 {\vTMP\().16b}, [\CONSTANTS], #16 - orr \LFSR8_11\().16b, \LFSR8_11\().16b, \vTMP\().16b // s8 - s11 - - eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b - ld1 {\vTMP\().B}[2], [xTP] - add xTP, xTP, #1 // xTP = IV + 21 - ld1 {\vTMP\().B}[6], [xTP] - add xTP, xTP, #1 // xTP = IV + 22 - ld1 {\vTMP\().B}[10], [xTP] - add xTP, xTP, #1 // xTP = IV + 23 - ld1 {\vTMP\().B}[14], [xTP] - add xTP, xTP, #-16 // xTP = IV + 7 + eor \LFSR8_11\().16b, \LFSR8_11\().16b, \LFSR8_11\().16b + ins \LFSR8_11\().b[3], \vKEY1\().b[8] // s8 + ins \LFSR8_11\().b[7], \vKEY1\().b[9] // s9 + ld1 {\LFSR8_11\().b}[11], [xTP] // s10 + add xTP, xTP, #-2 // xTP = IV + 3 + ins \LFSR8_11\().b[15], \vKEY1\().b[11] // s11 + + ushr \LFSR8_11\().4s, \LFSR8_11\().4s, #1 + + ld1 {\LFSR8_11\().b}[1], [xTP] // s8 + add xTP, xTP, #8 // xTP = IV + 11 + ld1 {\LFSR8_11\().b}[0], [xTP] // s8 + add xTP, xTP, #1 // xTP = IV + 12 + + ld1 {\LFSR8_11\().b}[5], [xTP] // s9 + add xTP, xTP, #-8 // xTP = IV + 4 + ld1 {\LFSR8_11\().b}[4], [xTP] // s9 + add xTP, xTP, #2 // xTP = IV + 6 + + ins \LFSR8_11\().b[9], \vKEY1\().b[10] // s10 k10 + ins \LFSR8_11\().b[8], \vKEY2\().b[12] // s10 k28 + + ld1 {\LFSR8_11\().b}[13], [xTP] // s11 + add xTP, xTP, #7 // xTP = IV + 13 + ld1 {\LFSR8_11\().b}[12], [xTP] // s11 + add xTP, xTP, #7 // xTP = IV + 20 + + ld1 {\vTMP\().16b}, [\CONSTANTS], #16 + orr \LFSR8_11\().16b, \LFSR8_11\().16b, \vTMP\().16b // s8 - s11 + + eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b + ld1 {\vTMP\().B}[2], [xTP] + add xTP, xTP, #1 // xTP = IV + 21 + ld1 {\vTMP\().B}[6], [xTP] + add xTP, xTP, #1 // xTP = IV + 22 + ld1 {\vTMP\().B}[10], [xTP] + add xTP, xTP, #1 // xTP = IV + 23 + ld1 {\vTMP\().B}[14], [xTP] + add xTP, xTP, #-16 // xTP = IV + 7 // LFSR12_15 = 0x003f0000 0x003f0000 0x003f0000 0x003f0000 movi \LFSR12_15\().4s, 0x3f, lsl 16 - and \vTMP\().16b, \vTMP\().16b, \LFSR12_15\().16b + and \vTMP\().16b, \vTMP\().16b, \LFSR12_15\().16b - orr \LFSR8_11\().16b, \LFSR8_11\().16b, \vTMP\().16b + orr \LFSR8_11\().16b, \LFSR8_11\().16b, \vTMP\().16b // s12 - s15 - eor \LFSR12_15\().16b, \LFSR12_15\().16b, \LFSR12_15\().16b - ins \LFSR12_15\().b[3], \vKEY1\().b[12] // s12 - ins \LFSR12_15\().b[7], \vKEY1\().b[13] // s13 - ins \LFSR12_15\().b[11], \vKEY1\().b[14] // s14 - ins \LFSR12_15\().b[15], \vKEY1\().b[15] // s15 + eor \LFSR12_15\().16b, \LFSR12_15\().16b, \LFSR12_15\().16b + ins \LFSR12_15\().b[3], \vKEY1\().b[12] // s12 + ins \LFSR12_15\().b[7], \vKEY1\().b[13] // s13 + ins \LFSR12_15\().b[11], \vKEY1\().b[14] // s14 + ins \LFSR12_15\().b[15], \vKEY1\().b[15] // s15 - ushr \LFSR12_15\().4s, \LFSR12_15\().4s, #1 + ushr \LFSR12_15\().4s, \LFSR12_15\().4s, #1 - ld1 {\LFSR12_15\().b}[1], [xTP] // s12 - add xTP, xTP, #7 // xTP = IV + 14 - ld1 {\LFSR12_15\().b}[0], [xTP] // s12 - add xTP, xTP, #1 // xTP = IV + 15 + ld1 {\LFSR12_15\().b}[1], [xTP] // s12 + add xTP, xTP, #7 // xTP = IV + 14 + ld1 {\LFSR12_15\().b}[0], [xTP] // s12 + add xTP, xTP, #1 // xTP = IV + 15 - ld1 {\LFSR12_15\().b}[5], [xTP] // s13 - add xTP, xTP, #-7 // xTP = IV + 8 - ld1 {\LFSR12_15\().b}[4], [xTP] // s13 - add xTP, xTP, #8 // xTP = IV + 16 + ld1 {\LFSR12_15\().b}[5], [xTP] // s13 + add xTP, xTP, #-7 // xTP = IV + 8 + ld1 {\LFSR12_15\().b}[4], [xTP] // s13 + add xTP, xTP, #8 // xTP = IV + 16 - ld1 {\LFSR12_15\().b}[9], [xTP] // s14 - add xTP, xTP, #-7 // xTP = IV + 9 - ld1 {\LFSR12_15\().b}[8], [xTP] // s14 - add xTP, xTP, #15 // xTP = IV + 24 + ld1 {\LFSR12_15\().b}[9], [xTP] // s14 + add xTP, xTP, #-7 // xTP = IV + 9 + ld1 {\LFSR12_15\().b}[8], [xTP] // s14 + add xTP, xTP, #15 // xTP = IV + 24 - ins \LFSR12_15\().b[13], \vKEY2\().b[14] // s15 k30 - ins \LFSR12_15\().b[12], \vKEY2\().b[13] // s15 k29 + ins \LFSR12_15\().b[13], \vKEY2\().b[14] // s15 k30 + ins \LFSR12_15\().b[12], \vKEY2\().b[13] // s15 k29 - ld1 {\vTMP\().16b}, [\CONSTANTS] - orr \LFSR12_15\().16b, \LFSR12_15\().16b, \vTMP\().16b // s12 - s15 + ld1 {\vTMP\().16b}, [\CONSTANTS] + orr \LFSR12_15\().16b, \LFSR12_15\().16b, \vTMP\().16b // s12 - s15 - eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b - ld1 {\vTMP\().b}[2], [xTP] + eor \vTMP\().16b, \vTMP\().16b, \vTMP\().16b + ld1 {\vTMP\().b}[2], [xTP] // vKEY1(released) = 0x003f0000 0x003f0000 0x003f0000 0x003f0000 - movi \vKEY1\().4s, 0x3f, lsl 16 - and \vTMP\().16b, \vTMP\().16b, vKEY1\().16b + movi \vKEY1\().4s, 0x3f, lsl 16 + and \vTMP\().16b, \vTMP\().16b, vKEY1\().16b umov \wTP, \vKEY2\().b[15] - lsr \wTP, \wTP, #4 - lsl \wTP, \wTP, #16 // high nibble of k31 - ins \vTMP\().s[2], \wTP + lsr \wTP, \wTP, #4 + lsl \wTP, \wTP, #16 // high nibble of k31 + ins \vTMP\().s[2], \wTP - umov \wTP, \vKEY2\().b[15] - lsl \wTP, \wTP, #28 - lsr \wTP, \wTP, #12 // low nibble of k31 - ins \vTMP\().s[3], \wTP + umov \wTP, \vKEY2\().b[15] + lsl \wTP, \wTP, #28 + lsr \wTP, \wTP, #12 // low nibble of k31 + ins \vTMP\().s[3], \wTP - orr \LFSR12_15\().16b, \LFSR12_15\().16b, \vTMP\().16b + orr \LFSR12_15\().16b, \LFSR12_15\().16b, \vTMP\().16b .endm -.macro ZUC256_INIT - declare_register pKe, x0 - declare_register pIv, x1 - declare_register pState, x2 - declare_register tag_sz, x3 - declare_register xW, x18 +.macro ZUC256_INIT + declare_register pKe, x0 + declare_register pIv, x1 + declare_register pState, x2 + declare_register tag_sz, x3 + declare_register xW, x18 // save clobbered register FUNC_SAVE - adrp xTMP, EK256_d64 - add xTMP, xTMP, #:lo12:EK256_d64 - rbit tag_sz, tag_sz - clz tag_sz, tag_sz - sub tag_sz, tag_sz, #1 - lsl tag_sz, tag_sz, #6 - add x13, xTMP, tag_sz + adrp xTMP, EK256_d64 + add xTMP, xTMP, #:lo12:EK256_d64 + rbit tag_sz, tag_sz + clz tag_sz, tag_sz + sub tag_sz, tag_sz, #1 + lsl tag_sz, tag_sz, #6 + add x13, xTMP, tag_sz // Expand key - INIT_LFSR_256 x0, x1, v0, v1, v2, v3, v4, v5, v6, x11, w11, x13 - st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [pState] + INIT_LFSR_256 x0, x1, v0, v1, v2, v3, v4, v5, v6, x11, w11, x13 + st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [pState] // Set R1 and R2 to zero - eor fR1, fR1, fR1 - eor fR2, fR2, fR2 + eor fR1, fR1, fR1 + eor fR2, fR2, fR2 -.set counter, 0 -.rept 32 - BITS_REORG counter +.set counter, 0 +.rept 32 + BITS_REORG counter - NONLIN_FUNC 1 - lsr xW, xW, #1 + NONLIN_FUNC 1 + lsr xW, xW, #1 - LFSR_UPDT counter -.set counter, (counter+1) + LFSR_UPDT counter +.set counter, (counter+1) .endr // And once more, initial round from keygen phase = 33 times - BITS_REORG 0 - NONLIN_FUNC 0 - eor xW, xW, xW + BITS_REORG 0 + NONLIN_FUNC 0 + eor xW, xW, xW - LFSR_UPDT 0 + LFSR_UPDT 0 // Save ZUC's state variables - str fR1, [pState, 16*4] - str fR2, [pState, 17*4] - str BRC_X0, [pState, 18*4] - str BRC_X1, [pState, 19*4] - str BRC_X2, [pState, 20*4] - str BRC_X3, [pState, 21*4] + str fR1, [pState, 16*4] + str fR2, [pState, 17*4] + str BRC_X0, [pState, 18*4] + str BRC_X1, [pState, 19*4] + str BRC_X2, [pState, 20*4] + str BRC_X3, [pState, 21*4] // Restore clobbered register FUNC_RESTORE .endm .altmacro -.macro ZUC_INIT_4 KEY_SIZE - declare_register pKe x0 - declare_register pIv x1 - declare_register pState x2 - declare_register tag_sz x3 // Only used in ZUC-256 +.macro ZUC_INIT_4 KEY_SIZE + declare_register pKe, x0 + declare_register pIv, x1 + declare_register pState, x2 + declare_register tag_sz, x3 // Only used in ZUC-256 FUNC_SAVE .if \KEY_SIZE == 128 // Load key and IVs to v16-v23 -.set off, 0 -.set i, 16 -.set j, 20 -.rept 4 - load_key_iv %i, %j, pKe, pIv, off -.set off, (off + 8) -.set i, (i + 1) -.set j, (j + 1) +.set off, 0 +.set i, 16 +.set j, 20 +.rept 4 + load_key_iv %i, %j, pKe, pIv, off +.set off, (off + 8) +.set i, (i + 1) +.set j, (j + 1) .endr // Initialize all LFSR registers -.set off, 0 -.set idx_off, 0 -.rept 4 - adrp xTMP, shuf_mask_key - ldr q24, [xTMP, #:lo12:shuf_mask_key + off] - adrp xTMP, shuf_mask_iv - ldr q25, [xTMP, #:lo12:shuf_mask_iv + off] - adrp xTMP, Ek_d - ldr q26, [xTMP, #:lo12:Ek_d + off] - -.set idx, idx_off -.set i, 16 -.set j, 20 -.rept 4 - INIT_LFSR_128 %i, %j, v24, v25, v26, %idx, v27 -.set idx, (idx + 1) -.set i, (i + 1) -.set j, (j + 1) +.set off, 0 +.set idx_off, 0 +.rept 4 + adrp xTMP, shuf_mask_key + ldr q24, [xTMP, #:lo12:shuf_mask_key + off] + adrp xTMP, shuf_mask_iv + ldr q25, [xTMP, #:lo12:shuf_mask_iv + off] + adrp xTMP, Ek_d + ldr q26, [xTMP, #:lo12:Ek_d + off] + +.set idx, idx_off +.set i, 16 +.set j, 20 +.rept 4 + INIT_LFSR_128 %i, %j, v24, v25, v26, %idx, v27 +.set idx, (idx + 1) +.set i, (i + 1) +.set j, (j + 1) .endr -.set id0, idx_off -.set id1, (id0 + 1) -.set id2, (id1 + 1) -.set id3, (id2 + 1) +.set id0, idx_off +.set id1, (id0 + 1) +.set id2, (id1 + 1) +.set id3, (id2 + 1) // store 4xLFSR registers in memory (reordering first, // so all SX registers are together) - TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 + TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 -.set off, (off + 16) -.set idx_off, (idx_off + 4) +.set off, (off + 16) +.set idx_off, (idx_off + 4) .endr .else // KEY_SIZE == 256 // Get pointer to constants (depending on tag size, this will point at // constants for encryption, authentication with 4-byte, 8-byte or 16-byte tags) adrp xTMP, EK256_d64 - add xTMP, xTMP, #:lo12:EK256_d64 + add xTMP, xTMP, #:lo12:EK256_d64 rbit tag_sz, tag_sz - clz tag_sz, tag_sz - sub tag_sz, tag_sz, #1 - lsl tag_sz, tag_sz, #6 - add x13, xTMP, tag_sz + clz tag_sz, tag_sz + sub tag_sz, tag_sz, #1 + lsl tag_sz, tag_sz, #6 + add x13, xTMP, tag_sz // Initialize all LFSR registers -.set off, 0 -.rept 4 +.set off, 0 +.rept 4 // Load key and IV for each packet - ldr x5, [pKe, off] - //ldr x6, [pIv, off] - add x6, pIv, off*4 + ldr x5, [pKe, off] + add x6, pIv, off*4 // restore x14 - mov x14, x13 + mov x14, x13 // Initialize S0-15 for each packet - INIT_LFSR_256 x5, x6, v0, v1, v2, v3, v4, v5, v6, x11, w11, x14 + INIT_LFSR_256 x5, x6, v0, v1, v2, v3, v4, v5, v6, x11, w11, x14 -.irp idx,0,1,2,3 - str q\idx, [pState, 64*\idx + 2*off] +.irp idx,0,1,2,3 + str q\idx, [pState, 64*\idx + 2*off] .endr -.set off, (off + 8) +.set off, (off + 8) .endr // Read, transpose and store, so all S_X from the 4 packets are in the same register -.set idx_off, 0 -.rept 4 +.set idx_off, 0 +.rept 4 -.set idx, idx_off -.rept 4 - LDR_Q %idx, pState, %(16*idx) -.set idx, (idx + 1) +.set idx, idx_off +.rept 4 + LDR_Q %idx, pState, %(16*idx) +.set idx, (idx + 1) .endr -.set id0, idx_off -.set id1, (id0 + 1) -.set id2, (id1 + 1) -.set id3, (id2 + 1) - TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 +.set id0, idx_off +.set id1, (id0 + 1) +.set id2, (id1 + 1) +.set id3, (id2 + 1) + TRANSPOSE4_U32 %id0, %id1, %id2, %id3, 27, 28, 29, 31 -.set idx_off, (idx_off + 4) +.set idx_off, (idx_off + 4) .endr .endif // KEY_SIZE == 256 // Zero out R1, R2(only lower 128bits) - eor vFR1.16b, vFR1.16b, vFR1.16b - eor vFR2.16b, vFR2.16b, vFR2.16b + eor vFR1.16b, vFR1.16b, vFR1.16b + eor vFR2.16b, vFR2.16b, vFR2.16b -.set init_round_num, 0 -.rept 32 +.set init_round_num, 0 +.rept 32 // Shift LFSR 32-times, update state variables - bits_reorg4 init_round_num, 0, no_reg - nonlin_fun4 1, v29 - ushr v29.4s, v29.4s, #1 // Shift out LSB of W - lfsr_updt4 init_round_num, 1, 29 // W (v0) used in LFSR update - not set to zero -.set init_round_num, (init_round_num + 1) + bits_reorg4 init_round_num, 0, no_reg + nonlin_fun4 1, v29 + ushr v29.4s, v29.4s, #1 // Shift out LSB of W + lfsr_updt4 init_round_num, 1, 29 // W (v0) used in LFSR update - not set to zero +.set init_round_num, (init_round_num + 1) .endr 2: // And once more, initial round from keygen phase = 33 times - bits_reorg4 0, 0, no_reg - nonlin_fun4 0, no_reg - lfsr_updt4 0, 0, no_reg + bits_reorg4 0, 0, no_reg + nonlin_fun4 0, no_reg + lfsr_updt4 0, 0, no_reg - STORE_LFSR_LIST pState, 0 + STORE_LFSR_LIST pState, 0 - str qFR1, [pState, #OFS_R1] - str qFR2, [pState, #OFS_R2] + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + +#ifdef SAFE_DATA + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v8.16b, v8.16b, v8.16b + eor v9.16b, v9.16b, v9.16b + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor v14.16b, v14.16b, v14.16b + eor v15.16b, v15.16b, v15.16b + eor v24.16b, v24.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b +#endif FUNC_RESTORE ret .endm -.macro simd_load_16 DST, ADDR, SIZE +.macro simd_load_16 DST, ADDR, SIZE test \SIZE, #16 b.eq _skip_16 - ld1 {\DST\().16b}, [\ADDR] - b end_load + ld1 {\DST\().16b}, [\ADDR] + b end_load _skip_16: - eor \DST\().16b, \DST\().16b, \DST\().16b - cbz \SIZE, end_load - cmp \SIZE, 1 + eor \DST\().16b, \DST\().16b, \DST\().16b + cbz \SIZE, end_load + cmp \SIZE, 1 b.eq _size_1 - cmp \SIZE, 2 + cmp \SIZE, 2 b.eq _size_2 - cmp \SIZE, 3 + cmp \SIZE, 3 b.eq _size_3 - cmp \SIZE, 4 + cmp \SIZE, 4 b.eq _size_4 - cmp \SIZE, 5 + cmp \SIZE, 5 b.eq _size_5 - cmp \SIZE, 6 + cmp \SIZE, 6 b.eq _size_6 - cmp \SIZE, 7 + cmp \SIZE, 7 b.eq _size_7 - cmp \SIZE, 8 + cmp \SIZE, 8 b.eq _size_8 - cmp \SIZE, 9 + cmp \SIZE, 9 b.eq _size_9 - cmp \SIZE, 10 + cmp \SIZE, 10 b.eq _size_10 - cmp \SIZE, 11 + cmp \SIZE, 11 b.eq _size_11 - cmp \SIZE, 12 + cmp \SIZE, 12 b.eq _size_12 - cmp \SIZE, 13 + cmp \SIZE, 13 b.eq _size_13 - cmp \SIZE, 14 + cmp \SIZE, 14 b.eq _size_14 _size_15: - add xTMP, \ADDR, 14 - ld1 {\DST\().B}[14], [xTMP] + add xTMP, \ADDR, 14 + ld1 {\DST\().B}[14], [xTMP] _size_14: - add xTMP, \ADDR, 13 - ld1 {\DST\().B}[13], [xTMP] + add xTMP, \ADDR, 13 + ld1 {\DST\().B}[13], [xTMP] _size_13: - add xTMP, \ADDR, 12 - ld1 {\DST\().B}[12], [xTMP] + add xTMP, \ADDR, 12 + ld1 {\DST\().B}[12], [xTMP] _size_12: - add xTMP, \ADDR, 11 - ld1 {\DST\().B}[11], [xTMP] + add xTMP, \ADDR, 11 + ld1 {\DST\().B}[11], [xTMP] _size_11: - add xTMP, \ADDR, 10 - ld1 {\DST\().B}[10], [xTMP] + add xTMP, \ADDR, 10 + ld1 {\DST\().B}[10], [xTMP] _size_10: - add xTMP, \ADDR, 9 - ld1 {\DST\().B}[9], [xTMP] + add xTMP, \ADDR, 9 + ld1 {\DST\().B}[9], [xTMP] _size_9: - add xTMP, \ADDR, 8 - ld1 {\DST\().B}[8], [xTMP] + add xTMP, \ADDR, 8 + ld1 {\DST\().B}[8], [xTMP] _size_8: - ld1 {\DST\().D}[0], [ADDR] - b end_load + ld1 {\DST\().D}[0], [ADDR] + b end_load _size_7: - add xTMP, \ADDR, 6 - ld1 {\DST\().B}[6], [xTMP] + add xTMP, \ADDR, 6 + ld1 {\DST\().B}[6], [xTMP] _size_6: - add xTMP, \ADDR, 5 - ld1 {\DST\().B}[5], [xTMP] + add xTMP, \ADDR, 5 + ld1 {\DST\().B}[5], [xTMP] _size_5: - add xTMP, \ADDR, 4 - ld1 {\DST\().B}[4], [xTMP] + add xTMP, \ADDR, 4 + ld1 {\DST\().B}[4], [xTMP] _size_4: - ld1 {\DST\().S}[0], [ADDR] - b end_load + ld1 {\DST\().S}[0], [ADDR] + b end_load _size_3: - add xTMP, \ADDR, 2 - ld1 {\DST\().B}[2], [xTMP] + add xTMP, \ADDR, 2 + ld1 {\DST\().B}[2], [xTMP] _size_2: - ld1 {\DST\().H}[0], [ADDR] - b end_load + ld1 {\DST\().H}[0], [ADDR] + b end_load _size_1: - ld1 {\DST\().B}[0], [ADDR] + ld1 {\DST\().B}[0], [ADDR] end_load: .endm -.macro simd_store_16 DST, SRC, SIZE, OFFSET +.macro simd_store_16 DST, SRC, SIZE, OFFSET - mov x11, \OFFSET - tst \SIZE, 16 + mov x11, \OFFSET + tst \SIZE, 16 b.eq 1f - add x12, \DST, x11 - st1 {\SRC\().16b}, [x12] - b 2f + add x12, \DST, x11 + st1 {\SRC\().16b}, [x12] + b 2f 1: - tst \SIZE, 8 + tst \SIZE, 8 b.eq 1f - add x12, \DST, x11 - st1 {\SRC\().D}[0], [x12] - ext \SRC\().16b, \SRC\().16b, \SRC\().16b, #8 - add x11, x11, #8 + add x12, \DST, x11 + st1 {\SRC\().D}[0], [x12] + ext \SRC\().16b, \SRC\().16b, \SRC\().16b, #8 + add x11, x11, #8 1: - tst \SIZE, 4 + tst \SIZE, 4 b.eq 1f - add x12, \DST, x11 - st1 {\SRC\().S}[0], [x12] + add x12, \DST, x11 + st1 {\SRC\().S}[0], [x12] ushr \SRC\().2d, \SRC\().2d, #32 - add x11, x11, #4 + add x11, x11, #4 1: - tst \SIZE, 2 + tst \SIZE, 2 b.eq 1f - add x12, \DST, x11 - st1 {\SRC\().H}[0], [x12] - ushr \SRC\().2d, \SRC\().2d, #16 - add x11, x11, #2 + add x12, \DST, x11 + st1 {\SRC\().H}[0], [x12] + ushr \SRC\().2d, \SRC\().2d, #16 + add x11, x11, #2 1: - tst \SIZE, 1 + tst \SIZE, 1 b.eq 2f - add x12, \DST, x11 - st1 {\SRC\().B}[0], [x12] + add x12, \DST, x11 + st1 {\SRC\().B}[0], [x12] 2: .endm -.macro eor_vi i, j, vX - eor v\i\().16b, v\i\().16b, \vX\().16b +.macro eor_vi i, j, vX + eor v\i\().16b, v\i\().16b, \vX\().16b .endm -.macro CIPHERNx4B_4 NROUNDS, INITIAL_ROUND, OFFSET, LAST_CALL +.macro CIPHERNx4B_4 NROUNDS, INITIAL_ROUND, OFFSET, LAST_CALL #define TMP1 x8 #define TMP2 x9 // Generate N*4B of keystream in N rounds -.set N, 1 -.set round, (\INITIAL_ROUND + N) -.rept \NROUNDS - bits_reorg4 round, 1, %(N+15) - nonlin_fun4 1, v29 +.set N, 1 +.set round, (\INITIAL_ROUND + N) +.rept \NROUNDS + bits_reorg4 round, 1, %(N+15) + nonlin_fun4 1, v29 // OFS_XR XOR W (v0) - eor_vi %(N+15), %(N+15), v29 - lfsr_updt4 round, 0, no_reg -.set N, (N + 1) -.set round, (round + 1) + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 round, 0, no_reg +.set N, (N + 1) +.set round, (round + 1) .endr - TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 + TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 // XOR Input buffer with keystream in rounds of 16B rev32 v16.16b, v16.16b - ldr q24, [x20, \OFFSET] + ldr q24, [x20, \OFFSET] rev32 v17.16b, v17.16b - ldr q25, [x21, \OFFSET] + ldr q25, [x21, \OFFSET] rev32 v18.16b, v18.16b - ldr q26, [x22, \OFFSET] + ldr q26, [x22, \OFFSET] rev32 v19.16b, v19.16b - ldr q31, [x19, \OFFSET] + ldr q31, [x19, \OFFSET] - eor v16.16b, v16.16b, v24.16b - eor v17.16b, v17.16b, v25.16b - eor v18.16b, v18.16b, v26.16b - eor v19.16b, v19.16b, v31.16b + eor v16.16b, v16.16b, v24.16b + eor v17.16b, v17.16b, v25.16b + eor v18.16b, v18.16b, v26.16b + eor v19.16b, v19.16b, v31.16b .if \LAST_CALL == 1 - umov w25, v30.h[0] - simd_store_16 x26, v16, x25, \OFFSET - umov w25, v30.h[1] - simd_store_16 x27, v17, x25, \OFFSET - umov w25, v30.h[2] - simd_store_16 x28, v18, x25, \OFFSET - umov w25, v30.h[3] - simd_store_16 x29, v19, x25, \OFFSET + umov w25, v30.s[0] + simd_store_16 x26, v16, x25, \OFFSET + umov w25, v30.s[1] + simd_store_16 x27, v17, x25, \OFFSET + umov w25, v30.s[2] + simd_store_16 x28, v18, x25, \OFFSET + umov w25, v30.s[3] + simd_store_16 x29, v19, x25, \OFFSET .else - str q16, [x26, \OFFSET] - str q17, [x27, \OFFSET] - str q18, [x28, \OFFSET] - str q19, [x29, \OFFSET] + str q16, [x26, \OFFSET] + str q17, [x27, \OFFSET] + str q18, [x28, \OFFSET] + str q19, [x29, \OFFSET] .endif .endm -.macro STORE_LFSR_LIST STATE, NUM_ROUNDS -.set round_num, \NUM_ROUNDS -.set offset, 0 -.rept 16 - STR_Q %((round_num) % 16), \STATE\(), offset -.set round_num, (round_num + 1) -.set offset, (offset + 16) +.macro STORE_LFSR_LIST STATE, NUM_ROUNDS +.set round_num, \NUM_ROUNDS +.set offset, 0 +.rept 16 + STR_Q %((round_num) % 16), \STATE\(), offset +.set round_num, (round_num + 1) +.set offset, (offset + 16) .endr .endm -.macro LOAD_LFSR_LIST STATE, NUM_ROUNDS -.set round_num, \NUM_ROUNDS -.set offset, 0 -.rept 16 - LDR_Q %((round_num) % 16), \STATE\(), offset -.set round_num, (round_num + 1) -.set offset, (offset + 16) +.macro LOAD_LFSR_LIST STATE, NUM_ROUNDS +.set round_num, \NUM_ROUNDS +.set offset, 0 +.rept 16 + LDR_Q %((round_num) % 16), \STATE\(), offset +.set round_num, (round_num + 1) +.set offset, (offset + 16) .endr .endm -.macro store_vi_to_keyaddr i, addr1, addr2, addr3, addr4 - st1 {v\i\().S}[0], [\addr1] - st1 {v\i\().S}[1], [\addr2] - st1 {v\i\().S}[2], [\addr3] - st1 {v\i\().S}[3], [\addr4] +.macro store_vi_to_keyaddr i, addr1, addr2, addr3, addr4 + st1 {v\i\().S}[0], [\addr1] + st1 {v\i\().S}[1], [\addr2] + st1 {v\i\().S}[2], [\addr3] + st1 {v\i\().S}[3], [\addr4] .endm -.macro KEYGEN_4_AARCH64 NUM_ROUNDS - declare_register pState x0 - declare_register pKS x1 +.macro KEYGEN_4_AARCH64 NUM_ROUNDS + declare_register pState, x0 + declare_register pKS, x1 FUNC_SAVE - ldr qFR1, [pState, #OFS_R1] - ldr qFR2, [pState, #OFS_R2] - LOAD_LFSR_LIST pState, 0 + ldr qFR1, [pState, #OFS_R1] + ldr qFR2, [pState, #OFS_R2] + LOAD_LFSR_LIST pState, 0 // Generate N*4B of keystream in N rounds -.set N, 1 -.rept \NUM_ROUNDS - bits_reorg4 N, 1, %(N+15) - nonlin_fun4 1, v29 +.set N, 1 +.rept \NUM_ROUNDS + bits_reorg4 N, 1, %(N+15) + nonlin_fun4 1, v29 // OFS_XR XOR W (v0) - eor_vi %(N+15), %(N+15), v29 - lfsr_updt4 N, 0, no_reg -.set N, (N + 1) + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 N, 0, no_reg +.set N, (N + 1) .endr - ldp x10, x11, [pKS] - ldp x12, x13, [pKS, 16] + ldp x10, x11, [pKS] + ldp x12, x13, [pKS, 16] .if \NUM_ROUNDS == 4 TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 - st1 {v16.16b}, [x10] - st1 {v17.16b}, [x11] - st1 {v18.16b}, [x12] - st1 {v19.16b}, [x13] + st1 {v16.16b}, [x10] + st1 {v17.16b}, [x11] + st1 {v18.16b}, [x12] + st1 {v19.16b}, [x13] .else -.set idx, 1 -.rept \NUM_ROUNDS - store_vi_to_keyaddr %(idx+15), x10, x11, x12, x13 - add x10, x10, #4 - add x11, x11, #4 - add x12, x12, #4 - add x13, x13, #4 -.set idx, (idx + 1) +.set idx, 1 +.rept \NUM_ROUNDS + store_vi_to_keyaddr %(idx+15), x10, x11, x12, x13 + add x10, x10, #4 + add x11, x11, #4 + add x12, x12, #4 + add x13, x13, #4 +.set idx, (idx + 1) .endr .endif - STORE_LFSR_LIST pState, \NUM_ROUNDS + STORE_LFSR_LIST pState, \NUM_ROUNDS - str qFR1, [pState, #OFS_R1] - str qFR2, [pState, #OFS_R2] + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] #ifdef SAFE_DATA - eor v0.16b, v0.16b, v0.16b + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v8.16b, v8.16b, v8.16b + eor v9.16b, v9.16b, v9.16b + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor v14.16b, v14.16b, v14.16b + eor v15.16b, v15.16b, v15.16b + eor v24.16b, v24.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b + // clear generated key stream + eor v16.16b, v16.16b, v16.16b + eor v17.16b, v17.16b, v17.16b + eor v18.16b, v18.16b, v18.16b + eor v19.16b, v19.16b, v19.16b #endif - FUNC_RESTORE +.endm + +.macro ROUND LANE vKS0 vKS1 PMULL_ENABLE TAG_SZ T_IN_REG + // read 16 bytes and reverse bits + ld1 {v30.16b}, [pIN\LANE\()] + add pIN\LANE\(), pIN\LANE\(), 16 + rbit v30.16b, v30.16b + + ext v24.16b, \vKS0\().16b, \vKS1\().16b, #8 + tbl \vKS0\().16b, {\vKS0\().16b}, vKS_RO.16b // KS bits [ 63:32 31:0 95:64 63:32] + tbl v24.16b, {v24.16b}, vKS_RO.16b // KS bits [127:96 95:64 159:128 127:96] + + // - set up DATA + eor v25.16b, v25.16b, v25.16b + eor v26.16b, v26.16b, v26.16b + zip1 v25.4s, v30.4s, v25.4s + zip2 v26.4s, v30.4s, v26.4s + + // - carry-less multiplication +.if PMULL_ENABLE == 1 + pmull v27.1q, v25.1d, \vKS0\().1d + pmull2 v28.1q, v25.2d, \vKS0\().2d + pmull v29.1q, v26.1d, v24.1d + pmull2 v30.1q, v26.2d, v24.2d +.else + EMULATE_PMULL v27, v25, \vKS0\() + EMULATE_PMULL2 v28, v25, \vKS0\() + EMULATE_PMULL v29, v26, v24 + EMULATE_PMULL2 v30, v26, v24 +.endif + // - xor the results from 4 32-bit words together + eor v27.16b, v27.16b, v28.16b + eor v29.16b, v29.16b, v30.16b + eor v27.16b, v27.16b, v29.16b + +.if TAG_SZ != 4 // TAG_SZ == 8 or 16 + tbl v0.16b, {\vKS1\().16b}, vKS_RO.16b // KS bits [191:160 159:128 223:192 191:160] + dup v28.2D, \vKS0\().D[1] // KS bits [ 95:64 63:32 95:64 63:32 ] + dup v29.2D, v24.D[0] // KS bits [127:96 95:64 127:96 95:64 ] + dup v30.2D, v24.D[1] // KS bits [159:128 127:96 159:128 127:96 ] + dup v1.2D, v0.D[0] // KS bits [191:160 159:128 191:160 159:128] +.if PMULL_ENABLE == 1 + pmull v28.1q, v25.1d, v28.1d + pmull2 v29.1q, v25.2d, v29.2d + pmull v2.1q, v26.1d, v30.1d + pmull2 v3.1q, v26.2d, v1.2d +.else // PMULL_ENABLE == 0 + EMULATE_PMULL v28, v25, v28 + EMULATE_PMULL2 v29, v25, v29 + EMULATE_PMULL v2, v26, v30 + EMULATE_PMULL2 v3, v26, v1 +.endif // PMULL_ENABLE + // - xor the results from 4 32-bit words together + eor v28.16b, v28.16b, v29.16b + eor v2.16b, v2.16b, v3.16b + eor v28.16b, v28.16b, v2.16b + +.if TAG_SZ == 16 + dup \vKS1\().2D, \vKS1\().D[1] // KS bits [255:224 223:192 255:224 223:192] + rev64 \vKS1\().4s, \vKS1\().4s +.if PMULL_ENABLE == 1 + pmull v29.1q, v25.1d, v24.1d + pmull2 v2.1q, v25.2d, v24.2d + pmull v3.1q, v26.1d, v0.1d + pmull2 v4.1q, v26.2d, v0.2d +.else // PMULL_ENABLE == 0 + EMULATE_PMULL v29, v25, v24 + EMULATE_PMULL2 v2, v25, v24 + EMULATE_PMULL v3, v26, v0 + EMULATE_PMULL2 v4, v26, v0 +.endif // PMULL_ENABLE + eor v29.16b, v29.16b, v2.16b + eor v3.16b, v3.16b, v4.16b + eor v29.16b, v29.16b, v3.16b + dup v0.2D, v0.D[1] +.if PMULL_ENABLE == 1 + pmull v30.1q, v25.1d, v30.1d + pmull2 v1.1q, v25.2d, v1.2d + pmull v0.1q, v26.1d, v0.1d + pmull2 \vKS1\().1q, v26.2d, \vKS1\().2d +.else // PMULL_ENABLE == 0 + EMULATE_PMULL v30, v25, v30 + EMULATE_PMULL2 v1, v25, v1 + EMULATE_PMULL v0, v26, v0 + EMULATE_PMULL2 \vKS1\(), v26, \vKS1\() +.endif // PMULL_ENABLE + eor v30.16b, v30.16b, v1.16b + eor v0.16b, v0.16b, \vKS1\().16b + eor v30.16b, v30.16b, v0.16b +.endif // TAG_SZ == 16 +.endif // TAG_SZ == 8 or 16 + +.if TAG_SZ == 4 + // - update T +.if T_IN_REG == 0 + ldr wT\LANE\(), [pT, #(4*\LANE\())] +.endif // T_IN_REG == 0 + mov wT, v27.s[1] + eor wT\LANE\(), wT, wT\LANE\() +.if T_IN_REG == 0 + str wT\LANE\(), [pT, #(4*\LANE\())] +.endif // T_IN_REG == 0 +.else // TAG_SZ == 8 or 16 +.if TAG_SZ == 8 +.if T_IN_REG == 0 + ldr xT\LANE\(), [pT, #(8*\LANE\())] +.endif // T_IN_REG == 0 + zip1 v27.4s, v27.4s, v28.4s + mov xT, v27.d[1] + eor xT\LANE\(), xT, xT\LANE\() +.if T_IN_REG == 0 + str xT\LANE\(), [pT, #(8*\LANE\())] +.endif // T_IN_REG == 0 +.else // TAG_SZ == 16 + /* + * For 128bit tag, T could not restore in assigned vector register + * because vector registers are used out. + */ + ldr q24, [pT, #(16*\LANE\())] + zip1 v27.4s, v27.4s, v28.4s + zip1 v29.4s, v29.4s, v30.4s + zip2 v27.2d, v27.2d, v29.2d + eor v24.16b, v24.16b, v27.16b + str q24, [pT, #(16*\LANE\())] +.endif +.endif // TAG_SZ == 8 or 16 +.endm + +.macro ZUC_AUTH_4LANE KS0_0 KS1_0 KS0_1 KS1_1 KS0_2 KS1_2 KS0_3 KS1_3 PMULL_ENABLE TAG_SZ + stp q27, q28, [sp] +.if TAG_SZ != 4 + stp q0, q1, [sp, #32] + stp q2, q3, [sp, #64] +.if TAG_SZ == 16 + str q4, [sp, #96] +.endif +.endif + ROUND 0 KS0_0 KS1_0 PMULL_ENABLE TAG_SZ 1 + ROUND 1 KS0_1 KS1_1 PMULL_ENABLE TAG_SZ 1 + ROUND 2 KS0_2 KS1_2 PMULL_ENABLE TAG_SZ 1 + ROUND 3 KS0_3 KS1_3 PMULL_ENABLE TAG_SZ 1 +.if TAG_SZ != 4 + ldp q0, q1, [sp, #32] + ldp q2, q3, [sp, #64] +.if TAG_SZ == 16 + ldr q4, [sp, #96] +.endif +.endif + ldp q27, q28, [sp] +.endm + +.macro GENKS_AND_AUTH N4 KEYLEN +.set OFFSET, 4 +.rept \N4 + // Generate N*4B of keystream in N rounds +.set N, 1 +.rept 4 + bits_reorg4 %(N+OFFSET), 1, %(N+15) + nonlin_fun4 1, v29 + // OFS_XR XOR W (v0) + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 %(N+OFFSET), 0, no_reg +.set N, (N + 1) +.endr // rept 4 + + ldr q20, [pKS0] + ldr q21, [pKS1] + ldr q22, [pKS2] + ldr q23, [pKS3] + TRANSPOSE4_U32 16, 17, 18, 19, 24, 25, 26, 29 + str q16, [pKS0] + str q17, [pKS1] + str q18, [pKS2] + str q19, [pKS3] + str q20, [pKS0, #16] + str q21, [pKS1, #16] + str q22, [pKS2, #16] + str q23, [pKS3, #16] + + /* ZUC authentication part + * - 4x32 data bits + * - set up KS + */ + adrp xTMP, KS_reorder + add xTMP, xTMP, #:lo12:KS_reorder + ld1 {vKS_RO.16b}, [xTMP] + + cmp pmullEnable, 0 + b.ne 1f + +.if KEYLEN == 256 + cmp tagSize, 8 + b.eq 8f + b.gt 16f +.endif + + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 0, 4 + b 2f +8: + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 0, 8 + b 2f +16: + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 0, 16 + b 2f +1: + +.if KEYLEN == 256 + cmp tagSize, 8 + b.eq 8f + b.gt 16f +.endif + + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 1, 4 + b 2f +8: + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 1, 8 + b 2f +16: + ZUC_AUTH_4LANE v20, v16, v21, v17, v22, v18, v23, v19, 1, 16 + b 2f +2: + +.set OFFSET, (OFFSET + 4) +.endr // rept N4 +.endm + +.macro _ZUC_AUTH_4 KEYLEN + declare_register pState, x0 + declare_register pT, x1 + declare_register pIn, x2 + declare_register rounds, x3 + declare_register pKeyStream, x4 + declare_register tagSize, x5 + declare_register pmullEnable, x24 + declare_register wT, w6 + declare_register wT0, w7 + declare_register wT1, w8 + declare_register wT2, w9 + declare_register wT3, w10 + declare_register xT, x6 + declare_register xT0, x7 + declare_register xT1, x8 + declare_register xT2, x9 + declare_register xT3, x10 + declare_register pIN0, x11 + declare_register pIN1, x12 + declare_register pIN2, x13 + declare_register pIN3, x14 + declare_register pKS0, x15 + declare_register pKS1, x16 + declare_register pKS2, x17 + declare_register pKS3, x18 + declare_register vKS_RO, v31 + + FUNC_SAVE + sub sp, sp, 112 + + // - save clobbered registers + mov x19, x0 + mov x20, x1 + mov x21, x2 + + // - tell if pmull is supported + bl cpu_feature_detect + and pmullEnable, x0, IMB_FEATURE_PMULL + + // - restore registers + mov x0, x19 + mov x1, x20 + mov x2, x21 + + ldr qFR1, [pState, #OFS_R1] + ldr qFR2, [pState, #OFS_R2] + LOAD_LFSR_LIST pState, 0 + +.if KEYLEN == 256 + cmp tagSize, 8 + b.eq 8f + b.cc 4f + b 16f +8: + ldp xT0, xT1, [pT] + ldp xT2, xT3, [pT, 16] + b 16f +.endif +4: + ldp wT0, wT1, [pT] + ldp wT2, wT3, [pT, 8] +16: + ldp pIN0, pIN1, [pIn] + ldp pIN2, pIN3, [pIn, 16] + ldp pKS0, pKS1, [pKeyStream] + ldp pKS2, pKS3, [pKeyStream, 16] + + // Generate N*4B of keystream in N rounds +.set N, 1 +.rept 4 + bits_reorg4 N, 1, %(N+15) + nonlin_fun4 1, v29 + // OFS_XR XOR W (v0) + eor_vi %(N+15), %(N+15), v29 + lfsr_updt4 N, 0, no_reg +.set N, (N + 1) +.endr + + TRANSPOSE4_U32 16, 17, 18, 19, 20, 21, 22, 23 + str q16, [pKS0] + str q17, [pKS1] + str q18, [pKS2] + str q19, [pKS3] + +auth_4_loop_zuc\KEYLEN: + cmp rounds, #4 + b.lt exit_auth_4_loop_zuc\KEYLEN + + GENKS_AND_AUTH 4 \KEYLEN + + sub rounds, rounds, 4 + b auth_4_loop_zuc\KEYLEN + +exit_auth_4_loop_zuc\KEYLEN: + cmp rounds, #3 + b.eq auth_3_zuc\KEYLEN + cmp rounds, #2 + b.eq auth_2_zuc\KEYLEN + cmp rounds, #1 + b.eq auth_1_zuc\KEYLEN + cmp rounds, #0 + b.eq auth_0_zuc\KEYLEN + +auth_3_zuc\KEYLEN: + GENKS_AND_AUTH 3 \KEYLEN + STORE_LFSR_LIST pState, (4 + 3 * 4) + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + b auth_finish_zuc\KEYLEN +auth_2_zuc\KEYLEN: + GENKS_AND_AUTH 2 \KEYLEN + STORE_LFSR_LIST pState, (4 + 2 * 4) + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + b auth_finish_zuc\KEYLEN +auth_1_zuc\KEYLEN: + GENKS_AND_AUTH 1 \KEYLEN + STORE_LFSR_LIST pState, (4 + 1 * 4) + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + b auth_finish_zuc\KEYLEN +auth_0_zuc\KEYLEN: + STORE_LFSR_LIST pState, (4 + 0 * 4) + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] + b auth_finish_zuc\KEYLEN + +auth_finish_zuc\KEYLEN: +.if KEYLEN == 256 + cmp tagSize, 8 + b.eq 8f + b.cc 4f + b 16f +8: + stp xT0, xT1, [pT] + stp xT2, xT3, [pT, 16] + b 16f +.endif +4: + stp wT0, wT1, [pT] + stp wT2, wT3, [pT, 8] +16: + stp pIN0, pIN1, [pIn] + stp pIN2, pIN3, [pIn, 16] + +#ifdef SAFE_DATA + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v8.16b, v8.16b, v8.16b + eor v9.16b, v9.16b, v9.16b + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor v14.16b, v14.16b, v14.16b + eor v15.16b, v15.16b, v15.16b + eor v24.16b, v24.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b + // clear generated key stream + eor v16.16b, v16.16b, v16.16b + eor v17.16b, v17.16b, v17.16b + eor v18.16b, v18.16b, v18.16b + eor v19.16b, v19.16b, v19.16b + eor v20.16b, v20.16b, v20.16b + eor v21.16b, v21.16b, v21.16b + eor v22.16b, v22.16b, v22.16b + eor v23.16b, v23.16b, v23.16b + // clear intermediate digest + eor wT, wT, wT + eor wT0, wT0, wT0 + eor wT1, wT1, wT1 + eor wT2, wT2, wT2 + eor wT3, wT3, wT3 +#endif + + add sp, sp, 112 + FUNC_RESTORE + ret .endm /* @@ -968,12 +1432,12 @@ end_load: START_FUNC(ZUC256_INIT) ZUC256_INIT - ret + END_FUNC(ZUC256_INIT) /* - * uint32_t asm_Eia3Round16B_aarch64(uint32_t T, const void *KS, const void *DATA) + * uint32_t asm_Eia3Round16B_aarch64(uint32_t *T, const void *KS, const void *DATA) * Updates authentication tag T based on keystream KS and DATA. * - it processes 16 bytes of DATA * - reads data in 16 byte chunks and bit reverses them @@ -986,73 +1450,88 @@ END_FUNC(ZUC256_INIT) * x2 - DATA */ START_FUNC(ZUC_EIA3ROUND16B) - declare_register T w0 - declare_register KS x1 - declare_register DATA x2 + declare_register pT, x0 + declare_register KS, x1 + declare_register pIN0, x2 + declare_register TAG_SZ, x3 + declare_register wT, w4 + declare_register xT, x4 + declare_register wT0, w5 + declare_register xT0, x5 + declare_register vKS_RO, v31 FUNC_SAVE - // read 16 bytes and reverse bits - ld1 {v0.16b}, [DATA] - rbit v0.16b, v0.16b - - /* ZUC authentication part - * - 4x32 data bits - * - set up KS - */ - ld1 {v1.16b, v2.16b}, [KS] - ext v3.16b, v1.16b, v2.16b, #8 - adrp xTMP, KS_reorder - add xTMP, xTMP, #:lo12:KS_reorder - ld1 {v4.16b}, [xTMP] - tbl v1.16b, {v1.16b}, v4.16b - tbl v2.16b, {v3.16b}, v4.16b - - // - set up DATA - eor v5.16b, v5.16b, v5.16b - eor v6.16b, v6.16b, v6.16b - ins v5.s[0], v0.s[0] - ins v5.s[2], v0.s[1] - ins v6.s[0], v0.s[2] - ins v6.s[2], v0.s[3] - // - save clobbered registers - mov x19, x0 - mov x20, x1 - mov x21, x2 + mov x19, x0 + mov x20, x1 + mov x21, x2 // - tell if pmull is supported - bl cpu_feature_detect - ands x0, x0, IMB_FEATURE_PMULL - b.eq 1f - - // - carry-less multiplication - pmull v7.1q, v5.1d, v1.1d - pmull2 v16.1q, v5.2d, v1.2d - pmull v17.1q, v6.1d, v2.1d - pmull2 v18.1q, v6.2d, v2.2d - b 2f -1: - EMULATE_PMULL v7, v5, v1 - EMULATE_PMULL2 v16, v5, v1 - EMULATE_PMULL v17, v6, v2 - EMULATE_PMULL2 v18, v6, v2 - -2: + bl cpu_feature_detect + ands x24, x0, IMB_FEATURE_PMULL // - restore clobbered registers - mov x0, x19 - mov x1, x20 - mov x2, x21 - - // - xor the results from 4 32-bit words together - eor v7.16b, v7.16b, v16.16b - eor v18.16b, v18.16b, v17.16b - eor v7.16b, v7.16b, v18.16b - - // - update T - mov w3, v7.s[1] - eor T, w3, T + mov x0, x19 + mov x1, x20 + mov x2, x21 + + adrp xTMP, KS_reorder + add xTMP, xTMP, #:lo12:KS_reorder + ld1 {vKS_RO.16b}, [xTMP] + + b.eq PMULL_DISABLE + +PMULL_ENABLE: + cmp TAG_SZ, #8 + b.eq TAG_8B_PMULL + b.cc TAG_4B_PMULL +TAG_16B_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 1, 16, 0 + b ROUND_END +TAG_8B_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 1, 8, 0 + b ROUND_END +TAG_4B_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 1, 4, 0 + b ROUND_END + +PMULL_DISABLE: + cmp TAG_SZ, #8 + b.eq TAG_8B_EMUL_PMULL + b.cc TAG_4B_EMUL_PMULL +TAG_16B_EMUL_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 0, 16, 0 + b ROUND_END +TAG_8B_EMUL_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 0, 8, 0 + b ROUND_END +TAG_4B_EMUL_PMULL: + ld1 {v14.16b, v15.16b}, [KS] + ROUND 0, v14, v15, 0, 4, 0 + b ROUND_END + +ROUND_END: +#ifdef SAFE_DATA + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v16.16b, v16.16b, v16.16b + eor v17.16b, v17.16b, v17.16b + eor v18.16b, v18.16b, v18.16b + eor w3, w3, w3 +#endif FUNC_RESTORE @@ -1070,156 +1549,123 @@ END_FUNC(ZUC_EIA3ROUND16B) */ START_FUNC(ZUC_EIA3REMAINDER) - declare_register KS x3 - declare_register DATA x1 - declare_register N_BITS x2 + declare_register KS, x3 + declare_register DATA, x1 + declare_register N_BITS, x2 FUNC_SAVE - eor v7.16b, v7.16b, v7.16b + eor v7.16b, v7.16b, v7.16b - mov x19, x0 - mov x20, x1 - mov x21, x2 + mov x19, x0 + mov x20, x1 + mov x21, x2 - bl cpu_feature_detect + bl cpu_feature_detect ands x24, x0, IMB_FEATURE_PMULL - mov x3, x19 - mov x1, x20 - mov x2, x21 -.rept 3 - cmp N_BITS, #128 - b.cc Eia3Rounds_dq_end - - // read 16 bytes and reverse bits - ld1 {v0.16b}, [DATA], #16 - rbit v0.16b, v0.16b - - /* ZUC authentication part - * - 4x32 data bits - * - set up KS - */ - ldr q1, [KS], #8 - ldr q2, [KS], #8 - adrp xTMP, KS_reorder - ldr q4, [xTMP, #:lo12:KS_reorder] - tbl v1.16b, {v1.16b}, v4.16b - tbl v2.16b, {v3.16b}, v4.16b - - // - set up DATA - eor v5.16b, v5.16b, v5.16b - eor v6.16b, v6.16b, v6.16b - ins v5.s[0], v0.s[0] - ins v5.s[2], v0.s[1] - ins v6.s[0], v0.s[2] - ins v6.s[2], v0.s[3] - - // - save clobbered registers - mov x19, x0 - mov x20, x1 - mov x21, x2 - mov x22, x3 + mov x3, x19 + mov x1, x20 + mov x2, x21 - // - tell if pmull is supported - cbz x24, 1f - - // - carry-less multiplication - pmull v7.1q, v5.1d, v1.1d - pmull2 v16.1q, v5.2d, v1.2d - pmull v17.1q, v6.1d, v2.1d - pmull2 v18.1q, v6.2d, v2.2d - b 2f -1: - EMULATE_PMULL v7, v5, v1 - EMULATE_PMULL2 v16, v5, v1 - EMULATE_PMULL v17, v6, v2 - EMULATE_PMULL2 v18, v6, v2 -2: - - // -restore clobbered registers - mov x0, x19 - mov x1, x20 - mov x2, x21 - mov x3, x22 - - // - xor the results from 4 32-bit words together - eor v7.16b, v7.16b, v16.16b - eor v18.16b, v18.16b, v17.16b - eor v7.16b, v7.16b, v18.16b - - sub N_BITS, N_BITS, #128 -.endr Eia3Rounds_dq_end: -.rept 3 - cmp N_BITS, #32 - b.cc Eia3Rounds_dw_end +.rept 3 + cmp N_BITS, #32 + b.cc Eia3Rounds_dw_end // swap dwords in KS - ld1 {v1.8b}, [KS] - add KS, KS, #4 - rev64 v1.4s, v1.4s + ld1 {v1.8b}, [KS] + add KS, KS, #4 + rev64 v1.4s, v1.4s // bit-reverse 4 bytes of data - eor v0.16b, v0.16b, v0.16b - ld1 {v0.s}[0], [DATA] - add DATA, DATA, #4 - rbit v0.16b, v0.16b + eor v0.16b, v0.16b, v0.16b + ld1 {v0.s}[0], [DATA] + add DATA, DATA, #4 + rbit v0.16b, v0.16b // rol & xor - cbz x24, 1f - pmull v0.1q, v0.1d, v1.1d - b 2f + cbz x24, 1f + pmull v0.1q, v0.1d, v1.1d + b 2f 1: - EMULATE_PMULL v0, v0, v1 + EMULATE_PMULL v0, v0, v1 2: - eor v7.16b, v0.16b, v7.16b + eor v7.16b, v0.16b, v7.16b - sub N_BITS, N_BITS, #32 + sub N_BITS, N_BITS, #32 .endr Eia3Rounds_dw_end: - mov w0, v7.s[1] - cbz N_BITS, Eia3Rounds_byte_loop_end + mov w0, v7.s[1] + cbz N_BITS, Eia3Rounds_byte_loop_end - ldr KS, [KS] + ldr KS, [KS] Eia3Rounds_byte_loop: - cbz N_BITS, Eia3Rounds_byte_loop_end - cmp N_BITS, #8 - b.cc Eia3Rounds_byte_partial + cbz N_BITS, Eia3Rounds_byte_loop_end + cmp N_BITS, #8 + b.cc Eia3Rounds_byte_partial - ldrb w4, [DATA] - sub N_BITS, N_BITS, #8 - b Eia3Rounds_byte_read + ldrb w4, [DATA] + sub N_BITS, N_BITS, #8 + b Eia3Rounds_byte_read Eia3Rounds_byte_partial: // process remaining bits (up to 7) - adr xTMP, bit_mask_table - ldrb w5, [xTMP, N_BITS] - ldrb w4, [DATA] - and w4, w4, w5 - eor N_BITS, N_BITS, N_BITS + adr xTMP, bit_mask_table + ldrb w5, [xTMP, N_BITS] + ldrb w4, [DATA] + and w4, w4, w5 + eor N_BITS, N_BITS, N_BITS Eia3Rounds_byte_read: -.set DATATEST, 0x80 -.rept 8 - tst x4, DATATEST - csel x5, KS, xzr, ne - eor w0, w0, w5 - ror KS, KS, #63 -.set DATATEST, (DATATEST >> 1) +.set DATATEST, 0x80 +.rept 8 + tst x4, DATATEST + csel x5, KS, xzr, ne + eor w0, w0, w5 + ror KS, KS, #63 +.set DATATEST, (DATATEST >> 1) .endr - add DATA, DATA, #1 - b Eia3Rounds_byte_loop + add DATA, DATA, #1 + b Eia3Rounds_byte_loop Eia3Rounds_byte_loop_end: +#ifdef SAFE_DATA + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v16.16b, v16.16b, v16.16b + eor v17.16b, v17.16b, v17.16b + eor v18.16b, v18.16b, v18.16b + eor x3, x3, x3 + eor x4, x4, x4 + eor x5, x5, x5 +#endif + FUNC_RESTORE ret END_FUNC(ZUC_EIA3REMAINDER) +bit_mask_table: + .byte 0x00 + .byte 0x80 + .byte 0xc0 + .byte 0xe0 + .byte 0xf0 + .byte 0xf8 + .byte 0xfc + .byte 0xfe + START_FUNC(ZUC128_INIT_4) ZUC_INIT_4 128 @@ -1234,187 +1680,207 @@ END_FUNC(ZUC256_INIT_4) START_FUNC(ZUC_CIPHER_4) - declare_register pState x0 - declare_register pIn x1 - declare_register pOut x2 - declare_register lengths x3 - declare_register min_len w4 - declare_register buf_idx x5 + declare_register pState, x0 + declare_register pIn, x1 + declare_register pOut, x2 + declare_register lengths, x3 + declare_register min_len, w4 + declare_register buf_idx, x5 - cbz min_len, exit_cipher + cbz min_len, exit_cipher FUNC_SAVE - ldp x20, x21, [pIn, #0] - ldp x22, x19, [pIn, #16] + ldp x20, x21, [pIn, #0] + ldp x22, x19, [pIn, #16] - ldp x26, x27, [pOut, #0] - ldp x28, x29, [pOut, #16] + ldp x26, x27, [pOut, #0] + ldp x28, x29, [pOut, #16] // Convert all lengths from UINT16_MAX (indicating that lane is not valid) to min length - dup v0.8h, min_len - ld1 {v1.4h}, [lengths] - cmeq v2.8h, v2.8h, v2.8h // Get all ff's in v register - cmeq v3.8h, v1.8h, v2.8h // Mask with FFFF in NULL jobs + dup v0.4s, min_len + ld1 {v1.4s}, [lengths] + cmeq v2.4s, v2.4s, v2.4s // Get all ff's in v register + cmeq v3.4s, v1.4s, v2.4s // Mask with FFFF in NULL jobs - and v4.16b, v3.16b, v0.16b // Length of valid job in all NULL jobs - eor v2.16b, v2.16b, v3.16b // Mask with 0000 in NULL jobs - and v1.16b, v1.16b, v2.16b // Zero out lengths of NULL jobs - orr v1.16b, v1.16b, v4.16b // v1 contains updated lengths + and v4.16b, v3.16b, v0.16b // Length of valid job in all NULL jobs + eor v2.16b, v2.16b, v3.16b // Mask with 0000 in NULL jobs + and v1.16b, v1.16b, v2.16b // Zero out lengths of NULL jobs + orr v1.16b, v1.16b, v4.16b // v1 contains updated lengths // Round up to nearest multiple of 4 bytes - movi v5.8h, #0x3 - mov w6, 0xfffc - dup v6.8h, w6 - add v0.8h, v0.8h, v5.8h - and v0.16b, v0.16b, v6.16b + movi v5.4s, #0x3 + mov w6, 0xfffffffc + dup v6.4s, w6 + add v0.4s, v0.4s, v5.4s + and v0.16b, v0.16b, v6.16b // Calculate remaining bytes to encrypt after function call - sub v2.8h, v1.8h, v0.8h - eor v3.16b, v3.16b, v3.16b - cmgt v4.8h, v2.8h, v3.8h // Mask with FFFF in lenghts > 0 - and v2.16b, v2.16b, v4.16b // Set to zero the lengths of the lanes which are going to be completed (lengths < 0) - st1 {v2.4h}, [lengths] // Update in memory the final updated lengths + sub v2.4s, v1.4s, v0.4s + eor v3.16b, v3.16b, v3.16b + cmgt v4.4s, v2.4s, v3.4s // Mask with FFFF in lenghts > 0 + and v2.16b, v2.16b, v4.16b // Set to zero the lengths of the lanes which are going to be completed (lengths < 0) + st1 {v2.4s}, [lengths] // Update in memory the final updated lengths /* Calculate number of bytes to encrypt after rounds of 16 bytes (up to 15 bytes), * for each lane, and store it in stack to be used in the last round */ - sub v1.8h, v1.8h, v2.8h // Bytes to encrypt in all lanes - movi v5.8h, #0xf - and v1.16b, v1.16b, v5.16b // Number of final bytes (up to 15 bytes) for each lane - cmeq v2.8h, v1.8h, v3.8h // Mask with FFFF in lengths == 0 - movi v5.8h, #0x10 - and v2.16b, v2.16b, v5.16b // 16 in positions where lengths was 0 - orr v30.16b, v1.16b, v2.16b // Number of final bytes (up to 16 bytes) for each lane + sub v1.4s, v1.4s, v2.4s // Bytes to encrypt in all lanes + movi v5.4s, #0xf + and v1.16b, v1.16b, v5.16b // Number of final bytes (up to 15 bytes) for each lane + cmeq v2.4s, v1.4s, v3.4s // Mask with FFFF in lengths == 0 + movi v5.4s, #0x10 + and v2.16b, v2.16b, v5.16b // 16 in positions where lengths was 0 + orr v30.16b, v1.16b, v2.16b // Number of final bytes (up to 16 bytes) for each lane - eor buf_idx, buf_idx, buf_idx + eor buf_idx, buf_idx, buf_idx - LOAD_LFSR_LIST pState, 0 + LOAD_LFSR_LIST pState, 0 - ldr qFR1, [pState, #OFS_R1] - ldr qFR2, [pState, #OFS_R2] + ldr qFR1, [pState, #OFS_R1] + ldr qFR2, [pState, #OFS_R2] loop_cipher64: - cmp min_len, #64 + cmp min_len, #64 b.lt exit_loop_cipher64 -.set round_off, 0 -.rept 4 - CIPHERNx4B_4 4, round_off, buf_idx, 0 +.set round_off, 0 +.rept 4 + CIPHERNx4B_4 4, round_off, buf_idx, 0 - add buf_idx, buf_idx, #16 - sub min_len, min_len, #16 -.set round_off, (round_off + 4) + add buf_idx, buf_idx, #16 + sub min_len, min_len, #16 +.set round_off, (round_off + 4) .endr - b loop_cipher64 + b loop_cipher64 exit_loop_cipher64: // Check if there are more bytes left to encrypt - add w6, min_len, 3 - lsr w6, w6, #2 // number of rounds left (round up length to nearest multiple of 4B) - cbz w6, store_lfsr_and_exit + add w6, min_len, 3 + lsr w6, w6, #2 // number of rounds left (round up length to nearest multiple of 4B) + cbz w6, store_lfsr_and_exit - cmp w6, 8 + cmp w6, 8 b.eq _num_final_rounds_is_8 b.lo _final_rounds_is_1_7 // Final blocks 9-16 - cmp w6, 12 + cmp w6, 12 b.eq _num_final_rounds_is_12 b.hi _final_rounds_is_13_16 // Final blocks 9-11 - cmp w6, 10 + cmp w6, 10 b.eq _num_final_rounds_is_10 b.lo _num_final_rounds_is_9 b.hi _num_final_rounds_is_11 _final_rounds_is_13_16: - cmp w6, 16 + cmp w6, 16 b.eq _num_final_rounds_is_16 - cmp w6, 14 + cmp w6, 14 b.eq _num_final_rounds_is_14 b.lo _num_final_rounds_is_13 b.hi _num_final_rounds_is_15 _final_rounds_is_1_7: - cmp w6, 4 + cmp w6, 4 b.eq _num_final_rounds_is_4 b.lt _final_rounds_is_1_3 // Final blocks 5-7 - cmp w6, 6 + cmp w6, 6 b.eq _num_final_rounds_is_6 b.lo _num_final_rounds_is_5 b.hi _num_final_rounds_is_7 _final_rounds_is_1_3: - cmp w6, 2 + cmp w6, 2 b.eq _num_final_rounds_is_2 b.hi _num_final_rounds_is_3 -.irp I,1,2,3,4 +.irp I,1,2,3,4 _num_final_rounds_is_\I: - CIPHERNx4B_4 \I, 0, buf_idx, 1 - STORE_LFSR_LIST pState, \I - add buf_idx, buf_idx, \I * 4 - b exit_final_rounds + CIPHERNx4B_4 \I, 0, buf_idx, 1 + STORE_LFSR_LIST pState, \I + add buf_idx, buf_idx, \I * 4 + b exit_final_rounds .endr -.irp I,5,6,7,8 +.irp I,5,6,7,8 _num_final_rounds_is_\I: - CIPHERNx4B_4 4, 0, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 (\I-4), 4, buf_idx, 1 - add buf_idx, buf_idx, ((\I-4)*4) - STORE_LFSR_LIST pState, \I - b exit_final_rounds + CIPHERNx4B_4 4, 0, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 (\I-4), 4, buf_idx, 1 + add buf_idx, buf_idx, ((\I-4)*4) + STORE_LFSR_LIST pState, \I + b exit_final_rounds .endr -.irp I,9,10,11,12 +.irp I,9,10,11,12 _num_final_rounds_is_\I: - CIPHERNx4B_4 4, 0, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 4, 4, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 (\I-8), 8, buf_idx, 1 - add buf_idx, buf_idx, ((\I-8)*4) - STORE_LFSR_LIST pState, \I - b exit_final_rounds + CIPHERNx4B_4 4, 0, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 4, 4, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 (\I-8), 8, buf_idx, 1 + add buf_idx, buf_idx, ((\I-8)*4) + STORE_LFSR_LIST pState, \I + b exit_final_rounds .endr -.irp I,13,14,15,16 +.irp I,13,14,15,16 _num_final_rounds_is_\I: - CIPHERNx4B_4 4, 0, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 4, 4, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 4, 8, buf_idx, 0 - add buf_idx, buf_idx, #16 - CIPHERNx4B_4 (\I-12), 12, buf_idx, 1 - add buf_idx, buf_idx, ((\I-12)*4) - STORE_LFSR_LIST pState, \I - b exit_final_rounds + CIPHERNx4B_4 4, 0, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 4, 4, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 4, 8, buf_idx, 0 + add buf_idx, buf_idx, #16 + CIPHERNx4B_4 (\I-12), 12, buf_idx, 1 + add buf_idx, buf_idx, ((\I-12)*4) + STORE_LFSR_LIST pState, \I + b exit_final_rounds .endr store_lfsr_and_exit: - STORE_LFSR_LIST pState, 0 + STORE_LFSR_LIST pState, 0 exit_final_rounds: + str qFR1, [pState, #OFS_R1] + str qFR2, [pState, #OFS_R2] // update in/out pointers - dup v0.2d, buf_idx - ld1 {v1.2d, v2.2d}, [pIn] - add v1.2d, v1.2d, v0.2d - add v2.2d, v2.2d, v0.2d - st1 {v1.2d, v2.2d}, [pIn] - ld1 {v1.2d, v2.2d}, [pOut] - add v1.2d, v1.2d, v0.2d - add v2.2d, v2.2d, v0.2d - st1 {v1.2d, v2.2d}, [pOut] - - str qFR1, [pState, #OFS_R1] - str qFR2, [pState, #OFS_R2] + dup v0.2d, buf_idx + ld1 {v1.2d, v2.2d}, [pIn] + add v1.2d, v1.2d, v0.2d + add v2.2d, v2.2d, v0.2d + st1 {v1.2d, v2.2d}, [pIn] + ld1 {v1.2d, v2.2d}, [pOut] + add v1.2d, v1.2d, v0.2d + add v2.2d, v2.2d, v0.2d + st1 {v1.2d, v2.2d}, [pOut] #ifdef SAFE_DATA - eor v0.16b, v0.16b, v0.16b + // clear intermediate value + eor v0.16b, v0.16b, v0.16b + eor v1.16b, v1.16b, v1.16b + eor v2.16b, v2.16b, v2.16b + eor v3.16b, v3.16b, v3.16b + eor v4.16b, v4.16b, v4.16b + eor v5.16b, v5.16b, v5.16b + eor v6.16b, v6.16b, v6.16b + eor v7.16b, v7.16b, v7.16b + eor v8.16b, v8.16b, v8.16b + eor v9.16b, v9.16b, v9.16b + eor v10.16b, v10.16b, v10.16b + eor v11.16b, v11.16b, v11.16b + eor v12.16b, v12.16b, v12.16b + eor v13.16b, v13.16b, v13.16b + eor v14.16b, v14.16b, v14.16b + eor v15.16b, v15.16b, v15.16b + eor v24.16b, v24.16b, v24.16b + eor v25.16b, v25.16b, v25.16b + eor v26.16b, v26.16b, v26.16b + eor v27.16b, v27.16b, v27.16b + eor v28.16b, v28.16b, v28.16b #endif FUNC_RESTORE @@ -1426,17 +1892,22 @@ END_FUNC(ZUC_CIPHER_4) START_FUNC(ZUC_XORKEYSTREAM16B) - declare_register pIn x0 - declare_register pOut x1 - declare_register pKS x2 - declare_register XKEY v0 - declare_register XIN v1 + declare_register pIn, x0 + declare_register pOut, x1 + declare_register pKS, x2 + declare_register XKEY, v0 + declare_register XIN, v1 - ld1 {XKEY.16b}, [pKS] + ld1 {XKEY.16b}, [pKS] rev32 XKEY.16b, XKEY.16b - ld1 {XIN.16b}, [pIn] - eor XKEY.16b, XKEY.16b, XIN.16b - st1 {XKEY.16b}, [pOut] + ld1 {XIN.16b}, [pIn] + eor XKEY.16b, XKEY.16b, XIN.16b + st1 {XKEY.16b}, [pOut] + +#ifdef SAFE_DATA + eor XKEY.16b, XKEY.16b, XKEY.16b + eor XIN.16b, XIN.16b, XIN.16b +#endif ret @@ -1466,12 +1937,10 @@ START_FUNC(ZUC_KEYGEN4B_4) END_FUNC(ZUC_KEYGEN4B_4) -bit_mask_table: - .byte 0x00 - .byte 0x80 - .byte 0xc0 - .byte 0xe0 - .byte 0xf0 - .byte 0xf8 - .byte 0xfc - .byte 0xfe +START_FUNC(ZUC128_AUTH_4) + _ZUC_AUTH_4 128 +END_FUNC(ZUC128_AUTH_4) + +START_FUNC(ZUC256_AUTH_4) + _ZUC_AUTH_4 256 +END_FUNC(ZUC256_AUTH_4) diff --git a/lib/aarch64/zuc_simd_no_aesni.S b/lib/aarch64/zuc_simd_no_aesni.S index f2eac61e72afd720c498a3afc8a57c66b199ca0a..11e28b1ffb800396a1a036eab0bcecda00585f3d 100644 --- a/lib/aarch64/zuc_simd_no_aesni.S +++ b/lib/aarch64/zuc_simd_no_aesni.S @@ -38,4 +38,7 @@ #define ZUC_EIA3ROUND16B asm_Eia3Round16B_aarch64_no_aesni #define ZUC_EIA3REMAINDER asm_Eia3Remainder_aarch64_no_aesni #define ZUC_XORKEYSTREAM16B asm_XorKeyStream16B_aarch64_no_aesni +#define ZUC128_AUTH_4 asm_ZucAuth_4_aarch64_no_aesni +#define ZUC256_AUTH_4 asm_Zuc256Auth_4_aarch64_no_aesni + #include "aarch64/zuc_simd.S" diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 46e6abb3023996305a3ce49732de2396a2613bb5..62d3f1b70378939cfeee7eb3b8c1389c66c085d2 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -294,7 +294,13 @@ typedef struct { /* ZUC out-of-order scheduler fields */ typedef struct { ZUC_ARGS_x16 args; +#ifdef __aarch64__ + /* 128-EEA3 and 128-EIA3 Specification v1.8 indicates + * message can be between 1 and 2^32 bits in length */ + DECLARE_ALIGNED(uint32_t lens[16], 16); +#else DECLARE_ALIGNED(uint16_t lens[16], 16); +#endif uint64_t unused_lanes; IMB_JOB *job_in_lane[16]; uint64_t num_lanes_inuse; diff --git a/lib/include/zuc_internal.h b/lib/include/zuc_internal.h index 8c5bad213583033a5469b587b4bea357e96bd36d..a8624ee1db87a6093d40a82d864cb504841bcd93 100755 --- a/lib/include/zuc_internal.h +++ b/lib/include/zuc_internal.h @@ -1744,12 +1744,12 @@ void asm_ZucGenKeystream_aarch64_no_aesni(void *pKeystream, uint64_t numRounds); IMB_DLL_LOCAL -uint32_t asm_Eia3Round16B_aarch64(uint32_t T, const void *ks, - const void *data); +void asm_Eia3Round16B_aarch64(uint32_t *T, const void *ks, + const void *data, const uint64_t tag_size); IMB_DLL_LOCAL -uint32_t asm_Eia3Round16B_aarch64_no_aesni(uint32_t T, const void *ks, - const void *data); +void asm_Eia3Round16B_aarch64_no_aesni(uint32_t *T, const void *ks, + const void *data, const uint64_t tag_size); IMB_DLL_LOCAL uint32_t asm_Eia3Remainder_aarch64(const void *ks, const void *data, @@ -1785,15 +1785,45 @@ IMB_DLL_LOCAL void asm_ZucCipher_4_aarch64(ZucState4_t *pState, const uint64_t *pIn[4], uint64_t *pOut[4], - uint16_t lengths[4], - const uint64_t minLength); + uint32_t lengths[4], + const uint32_t minLength); IMB_DLL_LOCAL void asm_ZucCipher_4_aarch64_no_aesni(ZucState4_t *pState, const uint64_t *pIn[4], uint64_t *pOut[4], - uint16_t lengths[4], - const uint64_t minLength); + uint32_t lengths[4], + const uint32_t minLength); + +IMB_DLL_LOCAL +void asm_ZucAuth_4_aarch64(ZucState4_t *pState, + uint32_t *T, + const uint8_t *pIn[4], + uint32_t numKeyStr, + uint32_t ** pKeyStrArr); + +IMB_DLL_LOCAL +void asm_Zuc256Auth_4_aarch64(ZucState4_t *pState, + uint32_t *T, + const uint8_t *pIn[4], + uint32_t numKeyStr, + uint32_t ** pKeyStrArr, + const uint64_t tag_size); + +IMB_DLL_LOCAL +void asm_ZucAuth_4_aarch64_no_aesni(ZucState4_t *pState, + uint32_t *T, + const uint8_t *pIn[4], + uint32_t numKeyStr, + uint32_t ** pKeyStrArr); + +IMB_DLL_LOCAL +void asm_Zuc256Auth_4_aarch64_no_aesni(ZucState4_t *pState, + uint32_t *T, + const uint8_t *pIn[4], + uint32_t numKeyStr, + uint32_t ** pKeyStrArr, + const uint64_t tag_size); IMB_DLL_LOCAL void asm_ZucGenKeystream16B_4_aarch64(ZucState4_t *pState, @@ -1845,10 +1875,19 @@ void zuc_eea3_n_buffer_aarch64(const void * const pKey[], void zuc256_eea3_1_buffer_aarch64(const void *pKey, const void *pIv, + const uint32_t ivLen, const void *pBufferIn, void *pBufferOut, const uint32_t length); +void zuc256_eea3_n_buffer_aarch64(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers); + void zuc_eia3_1_buffer_aarch64(const void *pKey, const void *pIv, const void *pBufferIn, @@ -1870,15 +1909,26 @@ void zuc_eia3_n_buffer_aarch64(const void * const pKey[], void zuc256_eia3_1_buffer_aarch64(const void *pKey, const void *pIv, + const uint32_t ivLen, const void *pBufferIn, const uint32_t lengthInBits, - uint32_t *pMacI); + uint32_t *pMacI, + const uint64_t tag_size); + +void zuc256_eia3_n_buffer_aarch64(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + const uint32_t lengthInBits[], + uint32_t *pMacI[], + const uint64_t tag_size, + const uint32_t numBuffers); void zuc_eia3_4_buffer_job_aarch64(const void * const pKey[4], const uint8_t *ivs, const void * const pBufferIn[4], uint32_t *pMacI[4], - const uint16_t lengthInBits[4], + const uint32_t lengthInBits[4], const void * const job_in_lane[4]); @@ -1886,9 +1936,15 @@ void zuc256_eia3_4_buffer_job_aarch64(const void * const pKey[4], const uint8_t *ivs, const void * const pBufferIn[4], uint32_t *pMacI[4], - const uint16_t lengthInBits[4], - const void * const job_in_lane[4]); + const uint32_t lengthInBits[4], + const void * const job_in_lane[4], + const uint64_t tag_size); /* AARCH64 NO-AESNI*/ +IMB_DLL_LOCAL +void asm_XorKeyStream16B_aarch64_no_aesni(const void *pIn, + void *pOut, + const void *pKey); + void zuc_eea3_1_buffer_aarch64_no_aesni(const void *pKey, const void *pIv, const void *pBufferIn, @@ -1910,10 +1966,19 @@ void zuc_eea3_n_buffer_aarch64_no_aesni(const void * const pKey[], void zuc256_eea3_1_buffer_aarch64_no_aesni(const void *pKey, const void *pIv, + const uint32_t ivLen, const void *pBufferIn, void *pBufferOut, const uint32_t length); +void zuc256_eea3_n_buffer_aarch64_no_aesni(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + void *pBufferOut[], + const uint32_t length[], + const uint32_t numBuffers); + void zuc_eia3_1_buffer_aarch64_no_aesni(const void *pKey, const void *pIv, const void *pBufferIn, @@ -1935,23 +2000,33 @@ void zuc_eia3_n_buffer_aarch64_no_aesni(const void * const pKey[], void zuc256_eia3_1_buffer_aarch64_no_aesni(const void *pKey, const void *pIv, + const uint32_t ivLen, const void *pBufferIn, const uint32_t lengthInBits, - uint32_t *pMacI); + uint32_t *pMacI, + const uint64_t tag_size); + +void zuc256_eia3_n_buffer_aarch64_no_aesni(const void * const pKey[], + const void * const pIv[], + const uint32_t ivLen[], + const void * const pBufferIn[], + const uint32_t lengthInBits[], + uint32_t *pMacI[], + const uint64_t tag_size, + const uint32_t numBuffers); void zuc_eia3_4_buffer_job_aarch64_no_aesni(const void * const pKey[4], const uint8_t *ivs, const void * const pBufferIn[4], uint32_t *pMacI[4], - const uint16_t lengthInBits[4], + const uint32_t lengthInBits[4], const void * const job_in_lane[4]); - void zuc256_eia3_4_buffer_job_aarch64_no_aesni(const void * const pKey[4], const uint8_t *ivs, const void * const pBufferIn[4], uint32_t *pMacI[4], - const uint16_t lengthInBits[4], - const void * const job_in_lane[4]); + const uint32_t lengthInBits[4], + const void * const job_in_lane[4], + const uint64_t tag_size); #endif /* ZUC_INTERNAL_H_ */ - diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 5dfcd65ff17487d9daee767b28496f9f683cb8fb..75bd41f59e691fea3cd051eae168f95777b43f68 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -869,9 +869,6 @@ typedef void (*zuc_eea3_n_buffer_t)(const void * const *, const void * const *, const void * const *, void **, const uint32_t *, const uint32_t); -typedef void (*zuc256_eea3_1_buffer_t)(const void *, const void *, const void *, - void *, const uint32_t); - typedef void (*zuc_eia3_1_buffer_t)(const void *, const void *, const void *, const uint32_t, uint32_t *); @@ -880,8 +877,23 @@ typedef void (*zuc_eia3_n_buffer_t)(const void * const *, const void * const *, const uint32_t *, uint32_t **, const uint32_t); -typedef void (*zuc256_eia3_1_buffer_t)(const void *, const void *, const void *, - const uint32_t, uint32_t *); + +#ifdef __aarch64__ +typedef void (*zuc256_eea3_1_buffer_t)(const void *, const void *, const uint32_t, const void *, + void *, const uint32_t); + +typedef void (*zuc256_eea3_n_buffer_t)(const void * const *, const void * const *, + const uint32_t *, const void * const *, void **, + const uint32_t *, const uint32_t); + +typedef void (*zuc256_eia3_1_buffer_t)(const void *, const void *, const uint32_t, const void *, + const uint32_t, uint32_t *, const uint64_t); + +typedef void (*zuc256_eia3_n_buffer_t)(const void * const *, const void * const *, + const uint32_t *, const void * const *, + const uint32_t *, uint32_t **, + const uint64_t, const uint32_t); +#endif typedef void (*kasumi_f8_1_buffer_t)(const kasumi_key_sched_t *, const uint64_t, const void *, void *, @@ -1157,10 +1169,13 @@ typedef struct IMB_MGR { zuc_eea3_1_buffer_t eea3_1_buffer; zuc_eea3_4_buffer_t eea3_4_buffer; zuc_eea3_n_buffer_t eea3_n_buffer; - zuc256_eea3_1_buffer_t zuc256_eea3_1_buffer; zuc_eia3_1_buffer_t eia3_1_buffer; +#ifdef __aarch64__ + zuc256_eea3_1_buffer_t zuc256_eea3_1_buffer; + zuc256_eea3_n_buffer_t zuc256_eea3_n_buffer; zuc256_eia3_1_buffer_t zuc256_eia3_1_buffer; - + zuc256_eia3_n_buffer_t zuc256_eia3_n_buffer; +#endif kasumi_f8_1_buffer_t f8_1_buffer; kasumi_f8_1_buffer_bit_t f8_1_buffer_bit; kasumi_f8_2_buffer_t f8_2_buffer; @@ -2120,8 +2135,13 @@ IMB_DLL_EXPORT void init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch); ((_mgr)->eea3_4_buffer((_key), (_iv), (_src), (_dst), (_len))) #define IMB_ZUC_EEA3_N_BUFFER(_mgr, _key, _iv, _src, _dst, _len, _count) \ ((_mgr)->eea3_n_buffer((_key), (_iv), (_src), (_dst), (_len), (_count))) -#define IMB_ZUC256_EEA3_1_BUFFER(_mgr, _key, _iv, _src, _dst, _len) \ - ((_mgr)->zuc256_eea3_1_buffer((_key), (_iv), (_src), (_dst), (_len))) + +#ifdef __aarch64__ +#define IMB_ZUC256_EEA3_1_BUFFER(_mgr, _key, _iv, _iv_len, _src, _dst, _len) \ + ((_mgr)->zuc256_eea3_1_buffer((_key), (_iv), (_iv_len), (_src), (_dst), (_len))) +#define IMB_ZUC256_EEA3_N_BUFFER(_mgr, _key, _iv, _iv_len, _src, _dst, _len, _count) \ + ((_mgr)->zuc256_eea3_n_buffer((_key), (_iv), (_iv_len), (_src), (_dst), (_len), (_count))) +#endif /** * @brief ZUC EIA3 Integrity function @@ -2137,9 +2157,13 @@ IMB_DLL_EXPORT void init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch); ((_mgr)->eia3_1_buffer((_key), (_iv), (_src), (_len), (_tag))) #define IMB_ZUC_EIA3_N_BUFFER(_mgr, _key, _iv, _src, _len, _tag, _count) \ ((_mgr)->eia3_n_buffer((_key), (_iv), (_src), (_len), (_tag), (_count))) -#define IMB_ZUC256_EIA3_1_BUFFER(_mgr, _key, _iv, _src, _len, _tag) \ - ((_mgr)->zuc256_eia3_1_buffer((_key), (_iv), (_src), (_len), (_tag))) +#ifdef __aarch64__ +#define IMB_ZUC256_EIA3_1_BUFFER(_mgr, _key, _iv, _iv_len, _src, _len, _tag, _tag_sz) \ + ((_mgr)->zuc256_eia3_1_buffer((_key), (_iv), (_iv_len), (_src), (_len), (_tag), (_tag_sz))) +#define IMB_ZUC256_EIA3_N_BUFFER(_mgr, _key, _iv, _iv_len, _src, _len, _tag, _tag_sz, _count) \ + ((_mgr)->zuc256_eia3_n_buffer((_key), (_iv), (_iv_len), (_src), (_len), (_tag), (_tag_sz), (_count))) +#endif /* KASUMI F8/F9 functions */ diff --git a/test/zuc_test.c b/test/zuc_test.c index f83b3864af5946e5399150c4dc0f0bddf6619390..219bee3c4662534e1af260383aec2c99334c75f3 100644 --- a/test/zuc_test.c +++ b/test/zuc_test.c @@ -75,6 +75,20 @@ int validate_zuc_EIA_n_block(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, uint32_t numBuffs, const unsigned int job_api); +#ifdef __aarch64__ +int validate_zuc256_EEA_1_block(struct IMB_MGR *mb_mgr, uint8_t *pSrcData, + uint8_t *pDstData, uint8_t *pKeys, uint8_t *pIV, + const unsigned int job_api); +int validate_zuc256_EIA_1_block(struct IMB_MGR *mb_mgr, uint8_t *pSrcData, + uint8_t *pDstData, uint8_t *pKeys, uint8_t *pIV, + const unsigned int job_api); +int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, + uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, + uint32_t numBuffs, const unsigned int job_api); +int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, + uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, + uint32_t numBuffs, const unsigned int job_api); +#else int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, uint32_t numBuffs); @@ -82,6 +96,8 @@ int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, uint32_t numBuffs); +#endif + static void byte_hexdump(const char *message, const uint8_t *ptr, int len); /****************************************************************************** @@ -282,6 +298,38 @@ int zuc_test(struct IMB_MGR *mb_mgr) test_suite_update(&eia3_ctx, 1, 0); } +#ifdef __aarch64__ + /* ZUC-EEA3-256 tests */ + if (validate_zuc256_EEA_1_block(mb_mgr, pSrcData[0], pDstData[0], pKeys[0], + pIV[0], 0)) + test_suite_update(&eea3_256_ctx, 0, 1); + else + test_suite_update(&eea3_256_ctx, 1, 0); + + for (i = 0; i < DIM(numBuffs); i++) { + if (validate_zuc256_EEA3(mb_mgr, pSrcData, pDstData, pKeys, + pIV, numBuffs[i], 0)) + test_suite_update(&eea3_256_ctx, 0, 1); + else + test_suite_update(&eea3_256_ctx, 1, 0); + } + + /* ZUC-EIA3-256 tests */ + if (validate_zuc256_EIA_1_block(mb_mgr, pSrcData[0], pDstData[0], pKeys[0], + pIV[0], 0)) + test_suite_update(&eia3_256_ctx, 0, 1); + else + test_suite_update(&eia3_256_ctx, 1, 0); + + for (i = 0; i < DIM(numBuffs); i++) { + if (validate_zuc256_EIA3(mb_mgr, pSrcData, pDstData, pKeys, + pIV, numBuffs[i], 0)) + test_suite_update(&eia3_256_ctx, 0, 1); + else + test_suite_update(&eia3_256_ctx, 1, 0); + } +#endif + /* Job API tests */ if (validate_zuc_EEA_1_block(mb_mgr, pSrcData[0], pSrcData[0], pKeys[0], pIV[0], 1)) @@ -316,10 +364,31 @@ int zuc_test(struct IMB_MGR *mb_mgr) test_suite_update(&eia3_ctx, 1, 0); } +#ifdef __aarch64__ + /* ZUC-EEA3-256 tests */ + if (validate_zuc256_EEA_1_block(mb_mgr, pSrcData[0], pDstData[0], pKeys[0], + pIV[0], 1)) + test_suite_update(&eea3_256_ctx, 0, 1); + else + test_suite_update(&eea3_256_ctx, 1, 0); + + /* ZUC-EIA3-256 tests */ + if (validate_zuc256_EIA_1_block(mb_mgr, pSrcData[0], pDstData[0], pKeys[0], + pIV[0], 1)) + test_suite_update(&eia3_256_ctx, 0, 1); + else + test_suite_update(&eia3_256_ctx, 1, 0); +#endif + /* ZUC-EEA3-256 tests */ for (i = 0; i < DIM(numBuffs); i++) { +#ifdef __aarch64__ + if (validate_zuc256_EEA3(mb_mgr, pSrcData, pDstData, pKeys, + pIV, numBuffs[i], 1)) +#else if (validate_zuc256_EEA3(mb_mgr, pSrcData, pDstData, pKeys, pIV, numBuffs[i])) +#endif test_suite_update(&eea3_256_ctx, 0, 1); else test_suite_update(&eea3_256_ctx, 1, 0); @@ -327,8 +396,13 @@ int zuc_test(struct IMB_MGR *mb_mgr) /* ZUC-EIA3-256 tests */ for (i = 0; i < DIM(numBuffs); i++) { +#ifdef __aarch64__ + if (validate_zuc256_EIA3(mb_mgr, pSrcData, pDstData, pKeys, + pIV, numBuffs[i], 1)) +#else if (validate_zuc256_EIA3(mb_mgr, pSrcData, pDstData, pKeys, pIV, numBuffs[i])) +#endif test_suite_update(&eia3_256_ctx, 0, 1); else test_suite_update(&eia3_256_ctx, 1, 0); @@ -643,6 +717,16 @@ submit_and_verify(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, return ret; } +#ifdef __aarch64__ +static int +submit_and_verify_zuc256(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, + uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, + IMB_CIPHER_DIRECTION dir, + const unsigned int var_bufs, + const unsigned int num_buffers, + const uint32_t *buf_idx, + const unsigned int job_api) +#else static int submit_and_verify_zuc256(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, @@ -650,6 +734,7 @@ submit_and_verify_zuc256(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, const unsigned int var_bufs, const unsigned int num_buffers, const uint32_t *buf_idx) +#endif { unsigned int i; uint32_t packetLen[MAXBUFS]; @@ -669,9 +754,26 @@ submit_and_verify_zuc256(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, iv_lens[i] = vector->iv_length; } +#ifdef __aarch64__ + if (job_api) { + submit_eea3_jobs(mb_mgr, pKeys, pIV, pSrcData, + pDstData, packetLen, dir, num_buffers, + ZUC256_KEY_LEN_IN_BYTES, iv_lens); + } else { + IMB_ZUC256_EEA3_N_BUFFER(mb_mgr, + (const void * const *)pKeys, + (const void * const *)pIV, + iv_lens, + (const void * const *)pSrcData, + (void **)pDstData, + packetLen, + num_buffers); + } +#else submit_eea3_jobs(mb_mgr, pKeys, pIV, pSrcData, pDstData, packetLen, dir, num_buffers, ZUC256_KEY_LEN_IN_BYTES, iv_lens); +#endif for (i = 0; i < num_buffers; i++) { uint8_t *pDst8 = (uint8_t *)pDstData[i]; @@ -802,9 +904,59 @@ int validate_zuc_EEA_n_block(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, return ret; }; +#ifdef __aarch64__ +int +validate_zuc256_EEA_1_block(struct IMB_MGR *mb_mgr, uint8_t *pSrcData, + uint8_t *pDstData, uint8_t *pKeys, uint8_t *pIV, + const unsigned int job_api) +{ + uint32_t i; + int ret = 0; + + /* ZUC-256-EEA3 */ + for (i = 0; i < NUM_ZUC_256_EEA3_TESTS; i++) { + char msg[50]; + int retTmp; + uint32_t byteLength; + uint32_t iv_len; + byteLength = (test256EEA3_vectors[i].length_in_bits + 7) / 8; + iv_len = test256EEA3_vectors[i].iv_length; + memcpy(pKeys, test256EEA3_vectors[i].CK, ZUC256_KEY_LEN_IN_BYTES); + memcpy(pIV, test256EEA3_vectors[i].IV, iv_len); + memcpy(pSrcData, test256EEA3_vectors[i].plaintext, byteLength); + + if (job_api) + submit_eea3_jobs(mb_mgr, &pKeys, &pIV, &pSrcData, + &pDstData, &byteLength, + IMB_DIR_ENCRYPT, 1, + ZUC256_KEY_LEN_IN_BYTES, + &iv_len); + else + IMB_ZUC256_EEA3_1_BUFFER(mb_mgr, pKeys, pIV, iv_len, pSrcData, + pDstData, byteLength); + + snprintf(msg, sizeof(msg), + "Validate ZUC-256 1 block test %u (Enc):", i + 1); + retTmp = test_output(pDstData, test256EEA3_vectors[i].ciphertext, + byteLength, + test256EEA3_vectors[i].length_in_bits, msg); + if (retTmp < 0) + ret = retTmp; + } + + return ret; +}; +#endif + +#ifdef __aarch64__ +int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, + uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, + uint32_t numBuffs, const unsigned int job_api) +#else int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, uint32_t numBuffs) +#endif { uint32_t i, j; int ret = 0; @@ -816,15 +968,28 @@ int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, for (j = 0; j < numBuffs; j++) buf_idx[j] = i; + +#ifdef __aarch64__ + retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, + pKeys, pIV, IMB_DIR_ENCRYPT, + 0, numBuffs, buf_idx, job_api); +#else retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, pIV, IMB_DIR_ENCRYPT, 0, numBuffs, buf_idx); +#endif if (retTmp < 0) ret = retTmp; +#ifdef __aarch64__ + retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, + pKeys, pIV, IMB_DIR_DECRYPT, + 0, numBuffs, buf_idx, job_api); +#else retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, pIV, IMB_DIR_DECRYPT, 0, numBuffs, buf_idx); +#endif if (retTmp < 0) ret = retTmp; } @@ -833,15 +998,27 @@ int validate_zuc256_EEA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, for (i = 0; i < numBuffs; i++) buf_idx[i] = i % NUM_ZUC_256_EEA3_TESTS; +#ifdef __aarch64__ + retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, + pIV, IMB_DIR_ENCRYPT, + 1, numBuffs, buf_idx, job_api); +#else retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, pIV, IMB_DIR_ENCRYPT, 1, numBuffs, buf_idx); +#endif if (retTmp < 0) ret = retTmp; +#ifdef __aarch64__ + retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, + pIV, IMB_DIR_DECRYPT, + 1, numBuffs, buf_idx, job_api); +#else retTmp = submit_and_verify_zuc256(mb_mgr, pSrcData, pDstData, pKeys, pIV, IMB_DIR_DECRYPT, 1, numBuffs, buf_idx); +#endif if (retTmp < 0) ret = retTmp; @@ -1065,9 +1242,80 @@ verify_tag_256(void *mac, const struct test256EIA3_vectors_t *vector, return ret; } +#ifdef __aarch64__ +int validate_zuc256_EIA_1_block(struct IMB_MGR *mb_mgr, uint8_t *pSrcData, + uint8_t *pDstData, uint8_t *pKeys, uint8_t *pIV, + const unsigned int job_api) +{ + uint32_t i; + int ret = 0; + uint32_t bitLength, byteLength; + const struct test256EIA3_vectors_t *vector; + unsigned int iv_len; + unsigned tag_sz; + const void *ref_mac = NULL; + + for (i = 0; i < NUM_ZUC_256_EIA3_TESTS; i++) { + vector = &test256EIA3_vectors[i]; + memcpy(pKeys, vector->CK, ZUC256_KEY_LEN_IN_BYTES); + memcpy(pIV, vector->IV, vector->iv_length); + bitLength = vector->length_in_bits; + byteLength = (bitLength + 7) / 8; + memcpy(pSrcData, vector->message, byteLength); + iv_len = vector->iv_length; + for (tag_sz = 4; tag_sz <= 16; tag_sz *= 2) { + if (job_api) + submit_eia3_jobs(mb_mgr, &pKeys, &pIV, + &pSrcData, &pDstData, + &bitLength, 1, + ZUC256_KEY_LEN_IN_BYTES, tag_sz, + &iv_len); + else + IMB_ZUC256_EIA3_1_BUFFER(mb_mgr, pKeys, pIV, iv_len, pSrcData, + bitLength, (uint32_t *)pDstData, tag_sz); + + if (tag_sz == 4) + ref_mac = &vector->mac4; + else if (tag_sz == 8) + ref_mac = &vector->mac8; + else + ref_mac = &vector->mac16; + + const int retTmp = memcmp(pDstData, ref_mac, tag_sz); + + if (retTmp) { + printf("Validate ZUC-256 1 block test %u, " + "(Int - %u bytes): FAIL\n", + i + 1, tag_sz); + byte_hexdump("Expected", + (const uint8_t *)ref_mac, + tag_sz); + byte_hexdump("Found", pDstData, tag_sz); + ret = retTmp; + } +#ifdef DEBUG + else + printf("Validate ZUC-256 1 block test %u, " + "(Int - %u bytes): PASS\n", + i + 1, tag_sz); +#endif + fflush(stdout); + } + } + + return ret; +}; +#endif + +#ifdef __aarch64__ +int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, + uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, + uint32_t numBuffs, const unsigned int job_api) +#else int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, uint8_t **pDstData, uint8_t **pKeys, uint8_t **pIV, uint32_t numBuffs) +#endif { uint32_t i, j; int retTmp, ret = 0; @@ -1087,13 +1335,29 @@ int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, memcpy(pSrcData[j], vector->message, byteLength); iv_lens[j] = vector->iv_length; } - // Todo: tag_sz can be 8 and 16, so far only 4 bytes mac is supported - for (tag_sz = 4; tag_sz <= 4; tag_sz *= 2) { + for (tag_sz = 4; tag_sz <= 16; tag_sz *= 2) { +#ifdef __aarch64__ + if (job_api) + submit_eia3_jobs(mb_mgr, pKeys, pIV, + pSrcData, pDstData, + bitLength, numBuffs, + ZUC256_KEY_LEN_IN_BYTES, tag_sz, + iv_lens); + else + IMB_ZUC256_EIA3_N_BUFFER(mb_mgr, + (const void * const *)pKeys, + (const void * const *)pIV, + iv_lens, + (const void * const *)pSrcData, + bitLength, (uint32_t **)pDstData, + tag_sz, numBuffs); +#else submit_eia3_jobs(mb_mgr, pKeys, pIV, pSrcData, pDstData, bitLength, numBuffs, ZUC256_KEY_LEN_IN_BYTES, tag_sz, iv_lens); +#endif for (j = 0; j < numBuffs; j++) { retTmp = verify_tag_256(pDstData[j], vector, @@ -1117,13 +1381,29 @@ int validate_zuc256_EIA3(struct IMB_MGR *mb_mgr, uint8_t **pSrcData, iv_lens[i] = vector->iv_length; } - // Todo: tag_sz can be 8 and 16, so far only 4 bytes mac is supported - for (tag_sz = 4; tag_sz <= 4; tag_sz *= 2) { + for (tag_sz = 4; tag_sz <= 16; tag_sz *= 2) { +#ifdef __aarch64__ + if (job_api) + submit_eia3_jobs(mb_mgr, pKeys, pIV, + pSrcData, pDstData, + bitLength, numBuffs, + ZUC256_KEY_LEN_IN_BYTES, tag_sz, + iv_lens); + else + IMB_ZUC256_EIA3_N_BUFFER(mb_mgr, + (const void * const *)pKeys, + (const void * const *)pIV, + iv_lens, + (const void * const *)pSrcData, + bitLength, (uint32_t **)pDstData, + tag_sz, numBuffs); +#else submit_eia3_jobs(mb_mgr, pKeys, pIV, pSrcData, pDstData, bitLength, numBuffs, ZUC256_KEY_LEN_IN_BYTES, tag_sz, iv_lens); +#endif for (i = 0; i < numBuffs; i++) { const uint32_t vector_idx = i % NUM_ZUC_256_EIA3_TESTS; diff --git a/test/zuc_test_vectors.h b/test/zuc_test_vectors.h index 8ab1b343043aab8814caf69ecaa83011e1d7fe1d..6e8d399930483084b0e918a5264ec5081fbb9263 100644 --- a/test/zuc_test_vectors.h +++ b/test/zuc_test_vectors.h @@ -33,7 +33,7 @@ #define NUM_ZUC_ALG_TESTS 3 #define NUM_ZUC_EEA3_TESTS 5 #define NUM_ZUC_256_EEA3_TESTS 10 -#define NUM_ZUC_256_EIA3_TESTS 12 +#define NUM_ZUC_256_EIA3_TESTS 15 #define NUM_ZUC_EIA3_TESTS 10 #define ZUC_KEY_LEN_IN_BYTES 16 #define ZUC_IV_LEN_IN_BYTES 16 @@ -1749,5 +1749,120 @@ const struct test256EIA3_vectors_t test256EIA3_vectors[] = { 0x23, 0x48, 0x4b, 0xcf, 0x2e, 0x70, 0xe9, 0x5b }, }, + /* Add 3 cases, whose message length is not N*8 */ + { + /* Test 13 */ + /* Key */ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + }, + /* IV */ + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00 + }, + 399, + /* IV length */ + 25, + {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00 + }, + {0x46, 0x49, 0x70, 0xCC}, + {0xCF, 0x8E, 0x99, 0x98, 0x7A, 0xE5, 0x67, 0x63}, + {0x44, 0x6B, 0xF3, 0xEC, 0x73, 0xD4, 0xE0, 0x4C, + 0x5B, 0x4F, 0x1F, 0x23, 0xD9, 0x26, 0xA8, 0x15 + }, + }, + { + /* Test 14 */ + /* Key */ + {0x8f, 0x8e, 0xf9, 0xd8, 0xfb, 0x0a, 0xce, 0x2b, + 0x23, 0x19, 0x48, 0x42, 0xcb, 0x5c, 0x6d, 0x98, + 0x1e, 0x71, 0x68, 0x74, 0xe1, 0xdf, 0xeb, 0xe0, + 0xf2, 0x46, 0x02, 0x71, 0xbb, 0x69, 0x0d, 0x9e + }, + /* IV */ + {0x2c, 0xe8, 0x87, 0x0f, 0x8c, 0x7f, 0x47, 0x2a, + 0x02, 0x2d, 0x24, 0xcd, 0x23, 0x3f, 0x4d, 0x0a, + 0x40, 0x0d, 0x12, 0xdd, 0xc4, 0x16, 0x26 + }, + 501, + /* IV length */ + 23, + {0x05, 0xa8, 0xc3, 0x4b, 0x70, 0x9c, 0x97, 0x71, + 0x67, 0x70, 0xa5, 0xa3, 0x08, 0x60, 0xca, 0x25, + 0x0a, 0x8b, 0xb5, 0xc1, 0xc9, 0xd5, 0x8c, 0x7d, + 0xfb, 0x00, 0x3b, 0xc0, 0x9d, 0xe1, 0x09, 0x9f, + 0xcc, 0x22, 0x8c, 0xf6, 0x12, 0x6f, 0xb9, 0x1e, + 0xc9, 0x45, 0x43, 0x43, 0x25, 0x7a, 0x2b, 0xba, + 0x64, 0x4b, 0x8c, 0x91, 0x77, 0xc8, 0xfd, 0xce, + 0x01, 0xcf, 0xab, 0x6b, 0xe6, 0xc2, 0x48, 0x80, + 0x82, 0x77, 0xad, 0xb8, 0xb9, 0x8d, 0x1f, 0xd7, + 0x48, 0x0b, 0x73, 0x4d, 0x98, 0x96, 0x12, 0xd5, + 0xf1, 0x86, 0xfd, 0xa1, 0x12, 0x50, 0x9a, 0x38, + 0x07, 0x37, 0xd5, 0xa3, 0xd0, 0x21, 0xfe, 0x55, + 0x7a, 0x8f, 0xff, 0xe0, 0x4f, 0x25, 0x9c, 0x73, + 0x01, 0x06, 0x66, 0xff, 0x10, 0xa4, 0xdd, 0xd4, + 0x2a, 0xbf, 0x0f, 0x5a, 0xa2, 0x29, 0x64, 0xd9, + 0x99, 0xc8, 0x46, 0xe6, 0x46, 0x48, 0x4d, 0x56, + 0xe9, 0x02, 0x17, 0xa8, 0x14, 0x28, 0x13, 0x22, + 0xf0, 0xd4, 0x43, 0xbe, 0xa0, 0x64, 0xd5, 0x28, + 0x99, 0x27, 0x24, 0x5d, 0x7c, 0x25, 0x46, 0xd6, + 0xdf, 0x2c, 0x05, 0x70, 0x5a, 0x55, 0xcd, 0xf6, + 0xe7, 0xdb, 0x3d, 0x94, 0x67, 0xfa, 0x67, 0x15, + 0xe3, 0x84, 0x96, 0x26, 0xee, 0xf4, 0x22, 0xaf, + 0x2f, 0xa4, 0x6e, 0xda, 0x2f, 0x4a, 0xa0, 0xcd, + 0x10, 0x72, 0x85, 0xb6, 0x45, 0x3b, 0x22, 0xb8, + 0x1f, 0xe0, 0x3c, 0xf9, 0x64, 0x29, 0xb4, 0x46 + }, + {0xE2, 0x98, 0x92, 0x88}, + {0xD8, 0xC0, 0xDE, 0xBE, + 0x14, 0xA5, 0x15, 0xD2}, + {0x82, 0xE0, 0x1E, 0xFF, + 0x0F, 0x86, 0xCB, 0xD5, + 0x46, 0xAC, 0xBB, 0x96, + 0xE8, 0xCE, 0xAB, 0x9E}, + }, + { + /* Test 15 */ + /* Key */ + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + }, + /* IV */ + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, 0x3f, + 0x3f + }, + 396, + /* IV length */ + 25, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff + }, + {0x94, 0x19, 0xE9, 0xBE}, + {0x9B, 0x93, 0xA9, 0x40, + 0xAF, 0x55, 0xC2, 0xF9}, + {0xFD, 0xA6, 0x5D, 0xDB, + 0xA7, 0xF0, 0x74, 0x4F, + 0x9F, 0xDA, 0x15, 0x0F, + 0x35, 0x4E, 0xE8, 0x26}, + }, }; #endif