From b227023f1e4f63df2d44a7cedd7e2b15ec067ebe Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Fri, 7 Jun 2024 15:23:04 +0100 Subject: [PATCH 01/24] lib: add API to retrieve minimum burst size for hash-only burst API Function added to retrieve the minimum burst size to be used in the hash-only burst API (IMB_SUBMIT_HASH_BURST) to get optimal performance, based on the implementation used. If a hash algorithm is not supported, it returns an error. Signed-off-by: Pablo de Lara --- lib/CMakeLists.txt | 3 +- lib/Makefile | 6 +- lib/ipsec-mb.h | 22 +++++++ lib/libIPSec_MB.def | 1 + lib/win_x64.mak | 3 +- lib/x86_64/capabilities.c | 117 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 148 insertions(+), 4 deletions(-) create mode 100644 lib/x86_64/capabilities.c diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 42f65ca8..c5cf05f0 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -162,7 +162,8 @@ if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") "${DIR_X86_64}/zuc_iv.c" "${DIR_X86_64}/snow3g_iv.c" "${DIR_X86_64}/snow3g_tables.c" - "${DIR_X86_64}/ooo_mgr_reset.c") + "${DIR_X86_64}/ooo_mgr_reset.c" + "${DIR_X86_64}/capabilities.c") set(SRC_FILES_NO_AESNI "${DIR_NO_AESNI}/aesni_emu.c") else() file(GLOB SRC_FILES_AVX_T1 "${DIR_AVX_T1}/*.c") diff --git a/lib/Makefile b/lib/Makefile index bfe8cb62..364f46b1 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -376,7 +376,8 @@ c_lib_objs := \ cipher_suite_id.o \ ooo_mgr_reset.o \ hmac_ipad_opad_aarch64.o \ - self_test_aarch64.o + self_test_aarch64.o \ + capabilities.o asm_generic_lib_objs := \ lookup_16x8bit_neon.o \ snow3g_impl_aarch64_neon.o \ @@ -439,7 +440,8 @@ c_lib_objs := \ quic_hp_chacha20.o \ quic_chacha20_poly1305.o \ hmac_ipad_opad.o \ - cipher_suite_id.o + cipher_suite_id.o \ + capabilities.o ifeq ($(AESNI_EMU), y) c_lib_objs := $(c_lib_objs) \ diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 37fc407e..40d34c89 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -4011,6 +4011,28 @@ 
imb_self_test_set_cb(IMB_MGR *state, imb_self_test_cb_t cb_fn, void *cb_arg); */ IMB_DLL_EXPORT int imb_self_test_get_cb(IMB_MGR *state, imb_self_test_cb_t *cb_fn, void **cb_arg); + +/** + * @brief Retrieves minimum burst size for good performance on hash algorithms. + * + * Depending on the architecture used, this function returns the minimum + * burst size to be used for good performance on the hash-only burst API. + * The output burst size can be 1 (in case of a synchronous single-buffer implementation + * or 0 if the algorithm is not supported by the API). + * + * @param [in] mb_mgr pointer to IMB MGR structure + * @param [in] algo hash algorithm + * @param [out] out_burst_size pointer to store min burst size + * + * @return operation status. + * @retval 0 success + * @retval IMB_ERR_HASH_ALGO not supported \a algo + * @retval IMB_ERR_NULL_MBMGR invalid \a mb_mgr pointer + * @retval IMB_ERR_NULL_BURST invalid \a out_burst_size pointer + */ +IMB_DLL_EXPORT int +imb_hash_burst_get_size(IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size); + #ifdef __cplusplus } #endif diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index 4f5c283c..66aefd96 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -728,3 +728,4 @@ EXPORTS submit_job_nocheck_avx2_t4 @702 get_next_job_avx2_t4 @703 get_completed_job_avx2_t4 @704 + imb_hash_burst_get_size @705 diff --git a/lib/win_x64.mak b/lib/win_x64.mak index c1e9f55f..3e358a74 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -377,7 +377,8 @@ lib_objs1 = \ $(OBJ_DIR)\quic_chacha20_poly1305.obj \ $(OBJ_DIR)\hmac_ipad_opad.obj \ $(OBJ_DIR)\cipher_suite_id.obj \ - $(OBJ_DIR)\sm4_sse.obj + $(OBJ_DIR)\sm4_sse.obj \ + $(OBJ_DIR)\capabilities.obj lib_objs2 = \ $(OBJ_DIR)\mb_mgr_aes192_cbc_enc_flush_avx.obj \ diff --git a/lib/x86_64/capabilities.c b/lib/x86_64/capabilities.c new file mode 100644 index 00000000..b8e48fca --- /dev/null +++ b/lib/x86_64/capabilities.c @@ -0,0 +1,117 @@ 
+/******************************************************************************* + Copyright (c) 2024, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#include "ipsec-mb.h" +#include "ipsec_ooo_mgr.h" + +int +imb_hash_burst_get_size(IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size) +{ +#ifdef SAFE_PARAM + if (mb_mgr == NULL) + return IMB_ERR_NULL_MBMGR; + + if (out_burst_size == NULL) + return IMB_ERR_NULL_BURST; +#endif + + IMB_ARCH used_arch = (IMB_ARCH) mb_mgr->used_arch; + + switch (algo) { +#ifndef __aarch64__ + case IMB_AUTH_HMAC_SHA_1: + case IMB_AUTH_SHA_1: + switch (used_arch) { + + case IMB_ARCH_NOAESNI: + + case IMB_ARCH_SSE: + *out_burst_size = SSE_NUM_SHA1_LANES; + break; + case IMB_ARCH_AVX: + *out_burst_size = AVX_NUM_SHA1_LANES; + break; + case IMB_ARCH_AVX2: + *out_burst_size = AVX2_NUM_SHA1_LANES; + break; + case IMB_ARCH_AVX512: + + default: + *out_burst_size = AVX2_NUM_SHA1_LANES; + break; + } + break; + case IMB_AUTH_HMAC_SHA_224: + case IMB_AUTH_SHA_224: + case IMB_AUTH_HMAC_SHA_256: + case IMB_AUTH_SHA_256: + switch (used_arch) { + case IMB_ARCH_NOAESNI: + case IMB_ARCH_SSE: + *out_burst_size = SSE_NUM_SHA256_LANES; + break; + case IMB_ARCH_AVX: + *out_burst_size = AVX_NUM_SHA256_LANES; + break; + case IMB_ARCH_AVX2: + *out_burst_size = AVX2_NUM_SHA256_LANES; + break; + case IMB_ARCH_AVX512: + default: + *out_burst_size = AVX2_NUM_SHA256_LANES; + break; + } + break; + case IMB_AUTH_HMAC_SHA_384: + case IMB_AUTH_SHA_384: + case IMB_AUTH_HMAC_SHA_512: + case IMB_AUTH_SHA_512: + switch (used_arch) { + case IMB_ARCH_NOAESNI: + case IMB_ARCH_SSE: + *out_burst_size = SSE_NUM_SHA512_LANES; + break; + case IMB_ARCH_AVX: + *out_burst_size = AVX_NUM_SHA512_LANES; + break; + case IMB_ARCH_AVX2: + *out_burst_size = AVX2_NUM_SHA512_LANES; + break; + case IMB_ARCH_AVX512: + default: + *out_burst_size = AVX2_NUM_SHA512_LANES; + break; + } + break; +#endif /* __aarch64__ */ + default: + *out_burst_size = 0; + return IMB_ERR_HASH_ALGO; + } + + return 0; +} -- GitLab From 
4052a4f6819290729f5e5096ed730e18f5a68b87 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Tue, 18 Jun 2024 07:42:39 +0100 Subject: [PATCH 02/24] perf: use new API retrieving hash algo support in burst API Signed-off-by: Pablo de Lara --- perf/ipsec_perf.c | 282 +++++++++++++++++++++++++++++----------------- 1 file changed, 179 insertions(+), 103 deletions(-) diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index 71bd0572..6ff7cdaa 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -1348,6 +1348,161 @@ translate_cipher_mode(const enum test_cipher_mode_e test_mode) return c_mode; } +/* + * This function translates enum test_hash_alg_e to be used by ipsec_mb + * library + */ +static IMB_HASH_ALG +translate_hash_alg(const enum test_hash_alg_e test_mode) +{ + IMB_HASH_ALG hash_alg = IMB_AUTH_NULL; + + switch (test_mode) { + case TEST_SHA1: + hash_alg = IMB_AUTH_SHA_1; + break; + case TEST_SHA_224: + hash_alg = IMB_AUTH_SHA_224; + break; + case TEST_SHA_256: + hash_alg = IMB_AUTH_SHA_256; + break; + case TEST_SHA_384: + hash_alg = IMB_AUTH_SHA_384; + break; + case TEST_SHA_512: + hash_alg = IMB_AUTH_SHA_512; + break; + case TEST_SHA1_HMAC: + hash_alg = IMB_AUTH_HMAC_SHA_1; + break; + case TEST_SHA_224_HMAC: + hash_alg = IMB_AUTH_HMAC_SHA_224; + break; + case TEST_SHA_256_HMAC: + hash_alg = IMB_AUTH_HMAC_SHA_256; + break; + case TEST_SHA_384_HMAC: + hash_alg = IMB_AUTH_HMAC_SHA_384; + break; + case TEST_SHA_512_HMAC: + hash_alg = IMB_AUTH_HMAC_SHA_512; + break; + case TEST_XCBC: + hash_alg = IMB_AUTH_AES_XCBC; + break; + case TEST_HASH_CCM: + hash_alg = IMB_AUTH_AES_CCM; + break; + case TEST_HASH_GCM: + if (segment_size != 0) + hash_alg = IMB_AUTH_GCM_SGL; + else + hash_alg = IMB_AUTH_AES_GMAC; + break; + case TEST_DOCSIS_CRC32: + hash_alg = IMB_AUTH_DOCSIS_CRC32; + break; + case TEST_NULL_HASH: + hash_alg = IMB_AUTH_NULL; + break; + case TEST_HASH_CMAC: + hash_alg = IMB_AUTH_AES_CMAC; + break; + case TEST_HASH_CMAC_BITLEN: + hash_alg = 
IMB_AUTH_AES_CMAC_BITLEN; + break; + case TEST_HASH_CMAC_256: + hash_alg = IMB_AUTH_AES_CMAC_256; + break; + case TEST_HASH_POLY1305: + hash_alg = IMB_AUTH_POLY1305; + break; + case TEST_AEAD_POLY1305: + if (segment_size != 0) + hash_alg = IMB_AUTH_CHACHA20_POLY1305_SGL; + else + hash_alg = IMB_AUTH_CHACHA20_POLY1305; + break; + case TEST_PON_CRC_BIP: + hash_alg = IMB_AUTH_PON_CRC_BIP; + break; + case TEST_ZUC_EIA3: + hash_alg = IMB_AUTH_ZUC_EIA3_BITLEN; + break; + case TEST_ZUC256_EIA3: + hash_alg = IMB_AUTH_ZUC256_EIA3_BITLEN; + break; + case TEST_SNOW3G_UIA2: + hash_alg = IMB_AUTH_SNOW3G_UIA2_BITLEN; + break; + case TEST_KASUMI_UIA1: + hash_alg = IMB_AUTH_KASUMI_UIA1; + break; + case TEST_AES_GMAC_128: + hash_alg = IMB_AUTH_AES_GMAC_128; + break; + case TEST_AES_GMAC_192: + hash_alg = IMB_AUTH_AES_GMAC_192; + break; + case TEST_AES_GMAC_256: + hash_alg = IMB_AUTH_AES_GMAC_256; + break; + case TEST_AUTH_GHASH: + hash_alg = IMB_AUTH_GHASH; + break; + case TEST_AUTH_SNOW_V_AEAD: + hash_alg = IMB_AUTH_SNOW_V_AEAD; + break; + case TEST_CRC32_ETHERNET_FCS: + hash_alg = IMB_AUTH_CRC32_ETHERNET_FCS; + break; + case TEST_CRC32_SCTP: + hash_alg = IMB_AUTH_CRC32_SCTP; + break; + case TEST_CRC32_WIMAX_OFDMA_DATA: + hash_alg = IMB_AUTH_CRC32_WIMAX_OFDMA_DATA; + break; + case TEST_CRC24_LTE_A: + hash_alg = IMB_AUTH_CRC24_LTE_A; + break; + case TEST_CRC24_LTE_B: + hash_alg = IMB_AUTH_CRC24_LTE_B; + break; + case TEST_CRC16_X25: + hash_alg = IMB_AUTH_CRC16_X25; + break; + case TEST_CRC16_FP_DATA: + hash_alg = IMB_AUTH_CRC16_FP_DATA; + break; + case TEST_CRC11_FP_HEADER: + hash_alg = IMB_AUTH_CRC11_FP_HEADER; + break; + case TEST_CRC10_IUUP_DATA: + hash_alg = IMB_AUTH_CRC10_IUUP_DATA; + break; + case TEST_CRC8_WIMAX_OFDMA_HCS: + hash_alg = IMB_AUTH_CRC8_WIMAX_OFDMA_HCS; + break; + case TEST_CRC7_FP_HEADER: + hash_alg = IMB_AUTH_CRC7_FP_HEADER; + break; + case TEST_CRC6_IUUP_HEADER: + hash_alg = IMB_AUTH_CRC6_IUUP_HEADER; + break; + case TEST_AUTH_SM3: + hash_alg = IMB_AUTH_SM3; 
+ break; + case TEST_SM3_HMAC: + hash_alg = IMB_AUTH_HMAC_SM3; + break; + default: + break; + } + + return hash_alg; +} + static uint32_t get_next_size(const uint32_t index) { @@ -1879,173 +2034,81 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, const uint32_t num_iter, uint8 job_template.auth_tag_output = (uint8_t *) digest; + /* Translating enum to the API's one */ + job_template.hash_alg = translate_hash_alg(params->hash_alg); switch (params->hash_alg) { - case TEST_SHA1: - job_template.hash_alg = IMB_AUTH_SHA_1; - break; - case TEST_SHA_224: - job_template.hash_alg = IMB_AUTH_SHA_224; - break; - case TEST_SHA_256: - job_template.hash_alg = IMB_AUTH_SHA_256; - break; - case TEST_SHA_384: - job_template.hash_alg = IMB_AUTH_SHA_384; - break; - case TEST_SHA_512: - job_template.hash_alg = IMB_AUTH_SHA_512; - break; case TEST_XCBC: job_template.u.XCBC._k1_expanded = k1_expanded; job_template.u.XCBC._k2 = k2; job_template.u.XCBC._k3 = k3; - job_template.hash_alg = IMB_AUTH_AES_XCBC; - break; - case TEST_HASH_CCM: - job_template.hash_alg = IMB_AUTH_AES_CCM; - break; - case TEST_HASH_GCM: - if (segment_size != 0) - job_template.hash_alg = IMB_AUTH_GCM_SGL; - else - job_template.hash_alg = IMB_AUTH_AES_GMAC; - break; - case TEST_DOCSIS_CRC32: - job_template.hash_alg = IMB_AUTH_DOCSIS_CRC32; - break; - case TEST_NULL_HASH: - job_template.hash_alg = IMB_AUTH_NULL; break; case TEST_HASH_CMAC: job_template.u.CMAC._key_expanded = k1_expanded; job_template.u.CMAC._skey1 = k2; job_template.u.CMAC._skey2 = k3; - job_template.hash_alg = IMB_AUTH_AES_CMAC; break; case TEST_HASH_CMAC_BITLEN: job_template.u.CMAC._key_expanded = k1_expanded; job_template.u.CMAC._skey1 = k2; job_template.u.CMAC._skey2 = k3; - job_template.hash_alg = IMB_AUTH_AES_CMAC_BITLEN; break; case TEST_HASH_CMAC_256: job_template.u.CMAC._key_expanded = k1_expanded; job_template.u.CMAC._skey1 = k2; job_template.u.CMAC._skey2 = k3; - job_template.hash_alg = IMB_AUTH_AES_CMAC_256; break; case 
TEST_HASH_POLY1305: job_template.u.POLY1305._key = k1_expanded; - job_template.hash_alg = IMB_AUTH_POLY1305; - break; - case TEST_AEAD_POLY1305: - if (segment_size != 0) - job_template.hash_alg = IMB_AUTH_CHACHA20_POLY1305_SGL; - else - job_template.hash_alg = IMB_AUTH_CHACHA20_POLY1305; break; case TEST_PON_CRC_BIP: - job_template.hash_alg = IMB_AUTH_PON_CRC_BIP; job_template.cipher_start_src_offset_in_bytes = 8; break; case TEST_ZUC_EIA3: - job_template.hash_alg = IMB_AUTH_ZUC_EIA3_BITLEN; job_template.u.ZUC_EIA3._key = k3; job_template.u.ZUC_EIA3._iv = (uint8_t *) &auth_iv; break; case TEST_ZUC256_EIA3: - job_template.hash_alg = IMB_AUTH_ZUC256_EIA3_BITLEN; job_template.u.ZUC_EIA3._key = k3; job_template.u.ZUC_EIA3._iv = (uint8_t *) &auth_iv; break; case TEST_SNOW3G_UIA2: - job_template.hash_alg = IMB_AUTH_SNOW3G_UIA2_BITLEN; job_template.u.SNOW3G_UIA2._key = k3; job_template.u.SNOW3G_UIA2._iv = (uint8_t *) &auth_iv; break; case TEST_KASUMI_UIA1: - job_template.hash_alg = IMB_AUTH_KASUMI_UIA1; job_template.u.KASUMI_UIA1._key = k3; break; case TEST_AES_GMAC_128: - job_template.hash_alg = IMB_AUTH_AES_GMAC_128; IMB_AES128_GCM_PRE(mb_mgr, gcm_key, &gdata_key); job_template.u.GMAC._key = &gdata_key; job_template.u.GMAC._iv = (uint8_t *) &auth_iv; job_template.u.GMAC.iv_len_in_bytes = 12; break; case TEST_AES_GMAC_192: - job_template.hash_alg = IMB_AUTH_AES_GMAC_192; IMB_AES192_GCM_PRE(mb_mgr, gcm_key, &gdata_key); job_template.u.GMAC._key = &gdata_key; job_template.u.GMAC._iv = (uint8_t *) &auth_iv; job_template.u.GMAC.iv_len_in_bytes = 12; break; case TEST_AES_GMAC_256: - job_template.hash_alg = IMB_AUTH_AES_GMAC_256; IMB_AES256_GCM_PRE(mb_mgr, gcm_key, &gdata_key); job_template.u.GMAC._key = &gdata_key; job_template.u.GMAC._iv = (uint8_t *) &auth_iv; job_template.u.GMAC.iv_len_in_bytes = 12; break; case TEST_AUTH_GHASH: - job_template.hash_alg = IMB_AUTH_GHASH; IMB_GHASH_PRE(mb_mgr, gcm_key, &gdata_key); job_template.u.GHASH._key = &gdata_key; 
job_template.u.GHASH._init_tag = (uint8_t *) &auth_iv; break; - case TEST_AUTH_SNOW_V_AEAD: - job_template.hash_alg = IMB_AUTH_SNOW_V_AEAD; - break; - case TEST_CRC32_ETHERNET_FCS: - job_template.hash_alg = IMB_AUTH_CRC32_ETHERNET_FCS; - break; - case TEST_CRC32_SCTP: - job_template.hash_alg = IMB_AUTH_CRC32_SCTP; - break; - case TEST_CRC32_WIMAX_OFDMA_DATA: - job_template.hash_alg = IMB_AUTH_CRC32_WIMAX_OFDMA_DATA; - break; - case TEST_CRC24_LTE_A: - job_template.hash_alg = IMB_AUTH_CRC24_LTE_A; - break; - case TEST_CRC24_LTE_B: - job_template.hash_alg = IMB_AUTH_CRC24_LTE_B; - break; - case TEST_CRC16_X25: - job_template.hash_alg = IMB_AUTH_CRC16_X25; - break; - case TEST_CRC16_FP_DATA: - job_template.hash_alg = IMB_AUTH_CRC16_FP_DATA; - break; - case TEST_CRC11_FP_HEADER: - job_template.hash_alg = IMB_AUTH_CRC11_FP_HEADER; - break; - case TEST_CRC10_IUUP_DATA: - job_template.hash_alg = IMB_AUTH_CRC10_IUUP_DATA; - break; - case TEST_CRC8_WIMAX_OFDMA_HCS: - job_template.hash_alg = IMB_AUTH_CRC8_WIMAX_OFDMA_HCS; - break; - case TEST_CRC7_FP_HEADER: - job_template.hash_alg = IMB_AUTH_CRC7_FP_HEADER; - break; - case TEST_CRC6_IUUP_HEADER: - job_template.hash_alg = IMB_AUTH_CRC6_IUUP_HEADER; - break; - case TEST_AUTH_SM3: - job_template.hash_alg = IMB_AUTH_SM3; - break; case TEST_SM3_HMAC: job_template.u.HMAC._hashed_auth_key_xor_ipad = (uint8_t *) ipad; job_template.u.HMAC._hashed_auth_key_xor_opad = (uint8_t *) opad; - job_template.hash_alg = IMB_AUTH_HMAC_SM3; break; default: /* HMAC hash algorithm */ job_template.u.HMAC._hashed_auth_key_xor_ipad = (uint8_t *) ipad; job_template.u.HMAC._hashed_auth_key_xor_opad = (uint8_t *) opad; - job_template.hash_alg = (IMB_HASH_ALG) params->hash_alg; break; } if (tag_size == 0) @@ -4070,15 +4133,28 @@ main(int argc, char *argv[]) return EXIT_FAILURE; } - /* currently only HMAC-SHAx algs supported by hash-only burst API */ - if (test_api == TEST_API_HASH_BURST && - ((custom_job_params.hash_alg != TEST_SHA1_HMAC) && - 
(custom_job_params.hash_alg != TEST_SHA_224_HMAC) && - (custom_job_params.hash_alg != TEST_SHA_256_HMAC) && - (custom_job_params.hash_alg != TEST_SHA_384_HMAC) && - (custom_job_params.hash_alg != TEST_SHA_512_HMAC))) { - fprintf(stderr, "Unsupported hash-only burst API algorithm selected\n"); - return EXIT_FAILURE; + /* only a few algorithms support the hash-only burst API */ + if (test_api == TEST_API_HASH_BURST) { + uint32_t optim_burst_size; + IMB_MGR *aux_mgr = alloc_mb_mgr(0); + + if (aux_mgr == NULL) { + fprintf(stderr, "Error allocating MB_MGR structure!\n"); + return EXIT_FAILURE; + } + init_mb_mgr_auto(aux_mgr, NULL); + + if (imb_hash_burst_get_size(aux_mgr, translate_hash_alg(custom_job_params.hash_alg), + &optim_burst_size) == IMB_ERR_HASH_ALGO) { + fprintf(stderr, "Unsupported hash-only burst API algorithm selected\n"); + free_mb_mgr(aux_mgr); + return EXIT_FAILURE; + } + + if (optim_burst_size > burst_size) + fprintf(stderr, "NOTE: Burst size is lower than the minimum size for an " + "optimal performance\n"); + free_mb_mgr(aux_mgr); } if (test_api == TEST_API_DIRECT && ((custom_job_params.cipher_mode != TEST_GCM) && -- GitLab From 115aa604343d29ee754c984006cbd02e3c972794 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 19 Jun 2024 10:06:24 +0100 Subject: [PATCH 03/24] lib: add API to retrieve used architecture type Most algorithms implemented in this library has multiple implementations. To help understanding which implementation is used, a new field in IMB_MGR is added to store the architecture subtype, and a new API to retrieve the one selected (e.g. AVX512 Type 2). 
Signed-off-by: Pablo de Lara --- lib/avx2_t1/mb_mgr_avx2_t1.c | 3 ++ lib/avx2_t2/mb_mgr_avx2_t2.c | 3 ++ lib/avx2_t3/mb_mgr_avx2_t3.c | 3 ++ lib/avx2_t4/mb_mgr_avx2_t4.c | 3 ++ lib/avx512_t1/mb_mgr_avx512_t1.c | 3 ++ lib/avx512_t2/mb_mgr_avx512_t2.c | 3 ++ lib/avx_t1/mb_mgr_avx_t1.c | 3 ++ lib/avx_t2/mb_mgr_avx_t2.c | 3 ++ lib/ipsec-mb.h | 25 ++++++++++++-- lib/libIPSec_MB.def | 1 + lib/no-aesni/mb_mgr_sse_no_aesni.c | 3 ++ lib/sse_t1/mb_mgr_sse_t1.c | 3 ++ lib/sse_t2/mb_mgr_sse_t2.c | 3 ++ lib/sse_t3/mb_mgr_sse_t3.c | 3 ++ lib/x86_64/capabilities.c | 55 +++++++++++++++++++++++++++++- perf/ipsec_perf.c | 39 ++++++++++++++++----- 16 files changed, 143 insertions(+), 13 deletions(-) diff --git a/lib/avx2_t1/mb_mgr_avx2_t1.c b/lib/avx2_t1/mb_mgr_avx2_t1.c index 58d6b08f..dc4e599e 100644 --- a/lib/avx2_t1/mb_mgr_avx2_t1.c +++ b/lib/avx2_t1/mb_mgr_avx2_t1.c @@ -346,6 +346,9 @@ init_mb_mgr_avx2_t1_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX2; + /* Set architecture type for future checks */ + state->used_arch_type = 1; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx2_t2/mb_mgr_avx2_t2.c b/lib/avx2_t2/mb_mgr_avx2_t2.c index 5ccb6be4..8b899aa5 100644 --- a/lib/avx2_t2/mb_mgr_avx2_t2.c +++ b/lib/avx2_t2/mb_mgr_avx2_t2.c @@ -349,6 +349,9 @@ init_mb_mgr_avx2_t2_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX2; + /* Set architecture type for future checks */ + state->used_arch_type = 2; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx2_t3/mb_mgr_avx2_t3.c b/lib/avx2_t3/mb_mgr_avx2_t3.c index 884856bc..a9a78f0e 100644 --- a/lib/avx2_t3/mb_mgr_avx2_t3.c +++ b/lib/avx2_t3/mb_mgr_avx2_t3.c @@ -349,6 +349,9 @@ init_mb_mgr_avx2_t3_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX2; + /* Set architecture 
type for future checks */ + state->used_arch_type = 3; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx2_t4/mb_mgr_avx2_t4.c b/lib/avx2_t4/mb_mgr_avx2_t4.c index 16cd470f..aa3779f6 100644 --- a/lib/avx2_t4/mb_mgr_avx2_t4.c +++ b/lib/avx2_t4/mb_mgr_avx2_t4.c @@ -350,6 +350,9 @@ init_mb_mgr_avx2_t4_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX2; + /* Set architecture type for future checks */ + state->used_arch_type = 4; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx512_t1/mb_mgr_avx512_t1.c b/lib/avx512_t1/mb_mgr_avx512_t1.c index 334b016f..cd9625c9 100644 --- a/lib/avx512_t1/mb_mgr_avx512_t1.c +++ b/lib/avx512_t1/mb_mgr_avx512_t1.c @@ -442,6 +442,9 @@ init_mb_mgr_avx512_t1_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX512; + /* Set architecture type for future checks */ + state->used_arch_type = 1; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx512_t2/mb_mgr_avx512_t2.c b/lib/avx512_t2/mb_mgr_avx512_t2.c index bcc66811..b09dbd61 100644 --- a/lib/avx512_t2/mb_mgr_avx512_t2.c +++ b/lib/avx512_t2/mb_mgr_avx512_t2.c @@ -449,6 +449,9 @@ init_mb_mgr_avx512_t2_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX512; + /* Set architecture type for future checks */ + state->used_arch_type = 2; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/avx_t1/mb_mgr_avx_t1.c b/lib/avx_t1/mb_mgr_avx_t1.c index b8f2a617..73b4b33e 100644 --- a/lib/avx_t1/mb_mgr_avx_t1.c +++ b/lib/avx_t1/mb_mgr_avx_t1.c @@ -345,6 +345,9 @@ init_mb_mgr_avx_t1_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX; + /* Set architecture type for future checks */ + state->used_arch_type = 1; + if (reset_mgrs) { reset_ooo_mgrs(state); 
diff --git a/lib/avx_t2/mb_mgr_avx_t2.c b/lib/avx_t2/mb_mgr_avx_t2.c index f198097b..bf2aa688 100644 --- a/lib/avx_t2/mb_mgr_avx_t2.c +++ b/lib/avx_t2/mb_mgr_avx_t2.c @@ -350,6 +350,9 @@ init_mb_mgr_avx_t2_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_AVX; + /* Set architecture type for future checks */ + state->used_arch_type = 2; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 40d34c89..91ca5549 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -998,8 +998,10 @@ typedef struct IMB_MGR { uint64_t flags; /**< passed to alloc_mb_mgr() */ uint64_t features; /**< reflects features of multi-buffer instance */ - uint64_t reserved[5]; /**< reserved for the future */ - uint32_t used_arch; /**< Architecture being used */ + uint64_t reserved[4]; /**< reserved for the future */ + uint8_t reserved2[7]; /**< reserved for the future */ + uint8_t used_arch_type; /**< Architecture type being used */ + uint32_t used_arch; /**< Architecture being used */ int imb_errno; /**< per mb_mgr error status */ @@ -4012,6 +4014,23 @@ imb_self_test_set_cb(IMB_MGR *state, imb_self_test_cb_t cb_fn, void *cb_arg); IMB_DLL_EXPORT int imb_self_test_get_cb(IMB_MGR *state, imb_self_test_cb_t *cb_fn, void **cb_arg); +/** + * @brief API to get a string with the architecture type being used. + * + * init_mb_mgr_XXX() must be called before this function call, + * where XXX is the desired architecture (can be auto). + * + * @param [in] state pointer to IMB_MGR + * @param [out] arch_type string with architecture type + * @param [out] description string with description of the arch type + * + * @return operation status. 
+ * @retval 0 success + * @retval IMB_ERR_NULL_MBMGR invalid \a mb_mgr pointer + */ +IMB_DLL_EXPORT int +imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const char **description); + /** * @brief Retrieves minimum burst size for good performance on hash algorithms. * @@ -4031,7 +4050,7 @@ imb_self_test_get_cb(IMB_MGR *state, imb_self_test_cb_t *cb_fn, void **cb_arg); * @retval IMB_ERR_NULL_BURST invalid \a out_burst_size pointer */ IMB_DLL_EXPORT int -imb_hash_burst_get_size(IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size); +imb_hash_burst_get_size(const IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size); #ifdef __cplusplus } diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index 66aefd96..5ad31e45 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -729,3 +729,4 @@ EXPORTS get_next_job_avx2_t4 @703 get_completed_job_avx2_t4 @704 imb_hash_burst_get_size @705 + imb_get_arch_type_string @706 diff --git a/lib/no-aesni/mb_mgr_sse_no_aesni.c b/lib/no-aesni/mb_mgr_sse_no_aesni.c index 7565ea88..aad7227f 100644 --- a/lib/no-aesni/mb_mgr_sse_no_aesni.c +++ b/lib/no-aesni/mb_mgr_sse_no_aesni.c @@ -329,6 +329,9 @@ init_mb_mgr_sse_no_aesni_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_NOAESNI; + /* Set architecture type for future checks */ + state->used_arch_type = 1; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/sse_t1/mb_mgr_sse_t1.c b/lib/sse_t1/mb_mgr_sse_t1.c index a6bebbac..ec76ba2e 100644 --- a/lib/sse_t1/mb_mgr_sse_t1.c +++ b/lib/sse_t1/mb_mgr_sse_t1.c @@ -351,6 +351,9 @@ init_mb_mgr_sse_t1_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_SSE; + /* Set architecture type for future checks */ + state->used_arch_type = 1; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/sse_t2/mb_mgr_sse_t2.c 
b/lib/sse_t2/mb_mgr_sse_t2.c index 96e30d65..5976cfbc 100644 --- a/lib/sse_t2/mb_mgr_sse_t2.c +++ b/lib/sse_t2/mb_mgr_sse_t2.c @@ -353,6 +353,9 @@ init_mb_mgr_sse_t2_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_SSE; + /* Set architecture type for future checks */ + state->used_arch_type = 2; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/sse_t3/mb_mgr_sse_t3.c b/lib/sse_t3/mb_mgr_sse_t3.c index 55fb57cb..0d3fd02c 100644 --- a/lib/sse_t3/mb_mgr_sse_t3.c +++ b/lib/sse_t3/mb_mgr_sse_t3.c @@ -354,6 +354,9 @@ init_mb_mgr_sse_t3_internal(IMB_MGR *state, const int reset_mgrs) /* Set architecture for future checks */ state->used_arch = (uint32_t) IMB_ARCH_SSE; + /* Set architecture type for future checks */ + state->used_arch_type = 3; + if (reset_mgrs) { reset_ooo_mgrs(state); diff --git a/lib/x86_64/capabilities.c b/lib/x86_64/capabilities.c index b8e48fca..632efde1 100644 --- a/lib/x86_64/capabilities.c +++ b/lib/x86_64/capabilities.c @@ -29,7 +29,7 @@ #include "ipsec_ooo_mgr.h" int -imb_hash_burst_get_size(IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size) +imb_hash_burst_get_size(const IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size) { #ifdef SAFE_PARAM if (mb_mgr == NULL) @@ -115,3 +115,56 @@ imb_hash_burst_get_size(IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_ return 0; } + +int +imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const char **description) +{ +#ifdef SAFE_PARAM + if (state == NULL) + return IMB_ERR_NULL_MBMGR; + if (arch_type == NULL) + return EINVAL; +#endif + struct arch_type_map { + IMB_ARCH arch; + uint8_t type; + const char *arch_type; + const char *description; + }; + + const struct arch_type_map arch_type_mappings[] = { + { IMB_ARCH_NOAESNI, 0, "AESNI Emulation", "CPU ISA: SSE" }, +#ifndef __aarch64__ + { IMB_ARCH_SSE, 1, "SSE Type 1", "CPU ISA: AES, PCLMUL, SSE" }, + { IMB_ARCH_SSE, 
2, "SSE Type 2", "CPU ISA: AES, PCLMUL, SSE, SHA-NI" }, + { IMB_ARCH_SSE, 3, "SSE Type 3", "CPU ISA: AES, PCLMUL, SSE, SHA-NI, GFNI" }, + { IMB_ARCH_AVX, 1, "AVX Type 1", "CPU ISA: AES, PCLMUL, SSE, AVX" }, + { IMB_ARCH_AVX, 2, "AVX Type 2", "CPU ISA: AES, PCLMUL, SSE, AVX, SHA-NI" }, + { IMB_ARCH_AVX2, 1, "AVX2 Type 1", "CPU ISA: AES, PCLMUL, SSE, AVX, AVX2" }, + { IMB_ARCH_AVX2, 2, "AVX2 Type 2", + "CPU ISA: VAES, VPCLMUL, SSE, AVX, AVX2, SHA-NI, GFNI" }, + { IMB_ARCH_AVX2, 3, "AVX2 Type 3", + "CPU ISA: VAES, VPCLMUL, SSE, AVX, AVX2, SHA-NI, GFNI, IFMA" }, + { IMB_ARCH_AVX2, 4, "AVX2 Type 4", + "CPU ISA: VAES, VPCLMUL, SSE, AVX, AVX2, SHA-NI, GFNI, IFMA, SHA512-NI, SM3-NI, " + "SM4-NI" }, + { IMB_ARCH_AVX512, 1, "AVX512 Type 1", + "CPU ISA: AES, PCLMUL, SSE, AVX, AVX2, AVX512" }, + { IMB_ARCH_AVX512, 2, "AVX512 Type 2", + "CPU ISA: VAES, VPCLMUL, SSE, AVX, AVX2, AVX512, GFNI, SHA-NI" }, +#endif /* __aarch64__ */ + }; + + for (unsigned int i = 0; i < IMB_DIM(arch_type_mappings); i++) { + if (arch_type_mappings[i].arch == state->used_arch && + arch_type_mappings[i].type == state->used_arch_type) { + *arch_type = arch_type_mappings[i].arch_type; + if (description != NULL) + *description = arch_type_mappings[i].description; + + break; + } + *arch_type = "Invalid arch type"; + } + return 0; +} diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index 6ff7cdaa..b39cafbb 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -4347,18 +4347,39 @@ main(int argc, char *argv[]) } #ifndef __aarch64__ - if (archs[ARCH_SSE]) { - IMB_MGR *p_mgr = alloc_mb_mgr(flags); + IMB_MGR *p_mgr = alloc_mb_mgr(flags); - if (p_mgr == NULL) { - fprintf(stderr, "Error allocating MB_MGR structure!\n"); - return EXIT_FAILURE; + if (p_mgr == NULL) { + fprintf(stderr, "Error allocating MB_MGR structure!\n"); + return EXIT_FAILURE; + } + + fprintf(stderr, "Testing "); + for (enum arch_type_e arch = ARCH_SSE; arch <= ARCH_AVX512; arch++) { + if (archs[arch] == 0) + continue; + + switch (arch) { + 
case ARCH_SSE: + init_mb_mgr_sse(p_mgr); + break; + case ARCH_AVX: + init_mb_mgr_avx(p_mgr); + break; + case ARCH_AVX2: + init_mb_mgr_avx2(p_mgr); + break; + default: /* ARCH_AV512 */ + init_mb_mgr_avx512(p_mgr); + break; } - init_mb_mgr_sse(p_mgr); - fprintf(stderr, "%s SHA extensions (shani) for SSE arch\n", - (p_mgr->features & IMB_FEATURE_SHANI) ? "Using" : "Not using"); - free_mb_mgr(p_mgr); + const char *arch_type; + + imb_get_arch_type_string(p_mgr, &arch_type, NULL); + fprintf(stderr, "\"%s\" ", arch_type); } + fprintf(stderr, "implementation/s\n"); + free_mb_mgr(p_mgr); #endif /* __aarch64__ */ memset(t_info, 0, sizeof(t_info)); -- GitLab From d37d2868e9c8d0837017b6e5fe609f58af860141 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 19 Jun 2024 11:21:38 +0100 Subject: [PATCH 04/24] lib: store number of lanes in SHA/MD5 OOO managers Signed-off-by: Pablo de Lara --- lib/include/ipsec_ooo_mgr.h | 7 +++++++ lib/include/mb_mgr_datastruct.inc | 4 ++++ lib/x86_64/ooo_mgr_reset.c | 9 +++++++++ 3 files changed, 20 insertions(+) diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index c446099f..1af399e0 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -350,6 +350,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA1_LANES]; uint32_t num_lanes_inuse; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_HMAC_SHA_1_OOO; @@ -359,6 +360,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA1_LANES]; uint32_t num_lanes_inuse; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_SHA_1_OOO; @@ -368,6 +370,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA256_LANES]; uint32_t num_lanes_inuse; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_HMAC_SHA_256_OOO; @@ -377,6 +380,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_SHA256_LANES]; uint32_t num_lanes_inuse; + uint32_t 
total_num_lanes; uint64_t road_block; } MB_MGR_SHA_256_OOO; @@ -385,6 +389,7 @@ typedef struct { DECLARE_ALIGNED(uint16_t lens[8], 16); uint64_t unused_lanes; HMAC_SHA512_LANE_DATA ldata[AVX512_NUM_SHA512_LANES]; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_HMAC_SHA_512_OOO; @@ -394,6 +399,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA512_LANE_DATA ldata[AVX512_NUM_SHA512_LANES]; uint32_t num_lanes_inuse; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_SHA_512_OOO; @@ -408,6 +414,7 @@ typedef struct { uint64_t unused_lanes; HMAC_SHA1_LANE_DATA ldata[AVX512_NUM_MD5_LANES]; uint32_t num_lanes_inuse; + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_HMAC_MD5_OOO; diff --git a/lib/include/mb_mgr_datastruct.inc b/lib/include/mb_mgr_datastruct.inc index da06c842..3f4c0085 100644 --- a/lib/include/mb_mgr_datastruct.inc +++ b/lib/include/mb_mgr_datastruct.inc @@ -321,6 +321,7 @@ FIELD _lens, 32, 32 FIELD _unused_lanes, 8, 8 FIELD _ldata, _HMAC_SHA1_LANE_DATA_size*MAX_SHA1_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_sha1, 4, 4 +FIELD _total_num_lanes_sha1, 4, 4 FIELD _road_block_sha1, 8, 8 END_FIELDS %assign _MB_MGR_HMAC_SHA_1_OOO_size _FIELD_OFFSET @@ -348,6 +349,7 @@ FIELD _lens_sha256, 16*2, 16 FIELD _unused_lanes_sha256, 8, 8 FIELD _ldata_sha256, _HMAC_SHA1_LANE_DATA_size * MAX_SHA256_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_sha256, 4, 4 +FIELD _total_num_lanes_sha256, 4, 4 FIELD _road_block_sha256, 8, 8 END_FIELDS %assign _MB_MGR_HMAC_SHA_256_OOO_size _FIELD_OFFSET @@ -375,6 +377,7 @@ FIELD _args_sha512, _SHA512_ARGS_size, _SHA512_ARGS_align FIELD _lens_sha512, 16, 16 FIELD _unused_lanes_sha512, 8, 8 FIELD _ldata_sha512, _SHA512_LANE_DATA_size * MAX_SHA512_LANES, _SHA512_LANE_DATA_align +FIELD _total_num_lanes_sha512, 4, 4 FIELD _road_block_sha512, 8, 8 END_FIELDS %assign _MB_MGR_HMAC_SHA_512_OOO_size _FIELD_OFFSET @@ -404,6 +407,7 @@ FIELD _lens_md5, MAX_MD5_LANES*2, 16 FIELD _unused_lanes_md5, 8, 8 
FIELD _ldata_md5, _HMAC_SHA1_LANE_DATA_size * MAX_MD5_LANES, _HMAC_SHA1_LANE_DATA_align FIELD _num_lanes_inuse_md5, 4, 8 +FIELD _total_num_lanes_md5, 4, 4 FIELD _road_block_md5, 8, 8 END_FIELDS %assign _MB_MGR_HMAC_MD5_OOO_size _FIELD_OFFSET diff --git a/lib/x86_64/ooo_mgr_reset.c b/lib/x86_64/ooo_mgr_reset.c index f73ca3da..4894fea8 100644 --- a/lib/x86_64/ooo_mgr_reset.c +++ b/lib/x86_64/ooo_mgr_reset.c @@ -134,6 +134,7 @@ ooo_mgr_hmac_sha1_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_SHA_1_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < num_lanes; i++) { p_mgr->ldata[i].extra_block[IMB_SHA1_BLOCK_SIZE] = 0x80; @@ -164,6 +165,7 @@ ooo_mgr_hmac_sha224_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_SHA_256_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < num_lanes; i++) { p_mgr->ldata[i].extra_block[IMB_SHA_256_BLOCK_SIZE] = 0x80; @@ -194,6 +196,7 @@ ooo_mgr_hmac_sha256_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_SHA_256_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < num_lanes; i++) { p_mgr->ldata[i].extra_block[IMB_SHA_256_BLOCK_SIZE] = 0x80; @@ -224,6 +227,7 @@ ooo_mgr_hmac_sha384_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_SHA_512_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < num_lanes; i++) { p_mgr->ldata[i].extra_block[IMB_SHA_384_BLOCK_SIZE] = 0x80; @@ -260,6 +264,7 @@ ooo_mgr_hmac_sha512_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_SHA_512_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < 
num_lanes; i++) { p_mgr->ldata[i].extra_block[IMB_SHA_512_BLOCK_SIZE] = 0x80; @@ -296,6 +301,7 @@ ooo_mgr_hmac_md5_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_HMAC_MD5_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; for (i = 0; i < num_lanes; i++) { p_mgr->ldata[i].extra_block[64] = 0x80; @@ -343,6 +349,7 @@ ooo_mgr_sha1_reset(void *p_ooo_mgr, const unsigned num_lanes) MB_MGR_SHA_1_OOO *p_mgr = (MB_MGR_SHA_1_OOO *) p_ooo_mgr; memset(p_mgr, 0, offsetof(MB_MGR_SHA_1_OOO, road_block)); + p_mgr->total_num_lanes = num_lanes; if (num_lanes == 2) p_mgr->unused_lanes = 0xF10; /* SHANI */ @@ -361,6 +368,7 @@ ooo_mgr_sha256_reset(void *p_ooo_mgr, const unsigned num_lanes) MB_MGR_SHA_256_OOO *p_mgr = (MB_MGR_SHA_256_OOO *) p_ooo_mgr; memset(p_mgr, 0, offsetof(MB_MGR_SHA_256_OOO, road_block)); + p_mgr->total_num_lanes = num_lanes; if (num_lanes == 2) p_mgr->unused_lanes = 0xF10; /* SHANI */ @@ -379,6 +387,7 @@ ooo_mgr_sha512_reset(void *p_ooo_mgr, const unsigned num_lanes) MB_MGR_SHA_512_OOO *p_mgr = (MB_MGR_SHA_512_OOO *) p_ooo_mgr; memset(p_mgr, 0, offsetof(MB_MGR_SHA_512_OOO, road_block)); + p_mgr->total_num_lanes = num_lanes; if (num_lanes == AVX_NUM_SHA512_LANES) p_mgr->unused_lanes = 0xF10; -- GitLab From af31ec65c63e51dacdafaa22cebf92c5c31244df Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 19 Jun 2024 11:22:49 +0100 Subject: [PATCH 05/24] lib: retrieve optimal hash burst size from internal number of lanes Signed-off-by: Pablo de Lara --- lib/ipsec-mb.h | 7 +++- lib/x86_64/capabilities.c | 75 ++++++++++++--------------------------- 2 files changed, 28 insertions(+), 54 deletions(-) diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 91ca5549..3c893707 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -4036,7 +4036,12 @@ imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const cha * * Depending on the architecture used, this function returns the 
minimum * burst size to be used for good performance on the hash-only burst API. - * The output burst size can be 1 (in case of a synchronous single-buffer implementation + * Note that this will not return a value for best performance, but the minimum needed + * to start maximizing the CPU core (i.e. enough buffers to utilize efficiently the CPU core + * resources, taking into account that when buffers have different sizes, a higher burst size is + * recommended). + * + * The output burst size may also be 1 (in case of a synchronous single-buffer implementation * or 0 if the algorithm is not supported by the API). * * @param [in] mb_mgr pointer to IMB MGR structure diff --git a/lib/x86_64/capabilities.c b/lib/x86_64/capabilities.c index 632efde1..a515fe9d 100644 --- a/lib/x86_64/capabilities.c +++ b/lib/x86_64/capabilities.c @@ -39,73 +39,42 @@ imb_hash_burst_get_size(const IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned return IMB_ERR_NULL_BURST; #endif - IMB_ARCH used_arch = (IMB_ARCH) mb_mgr->used_arch; - switch (algo) { #ifndef __aarch64__ case IMB_AUTH_HMAC_SHA_1: + *out_burst_size = + ((MB_MGR_HMAC_SHA_1_OOO *) (mb_mgr->hmac_sha_1_ooo))->total_num_lanes; + break; case IMB_AUTH_SHA_1: - switch (used_arch) { - - case IMB_ARCH_NOAESNI: - - case IMB_ARCH_SSE: - *out_burst_size = SSE_NUM_SHA1_LANES; - break; - case IMB_ARCH_AVX: - *out_burst_size = AVX_NUM_SHA1_LANES; - break; - case IMB_ARCH_AVX2: - *out_burst_size = AVX2_NUM_SHA1_LANES; - break; - case IMB_ARCH_AVX512: - - default: - *out_burst_size = AVX2_NUM_SHA1_LANES; - break; - } + *out_burst_size = ((MB_MGR_SHA_1_OOO *) (mb_mgr->sha_1_ooo))->total_num_lanes; break; case IMB_AUTH_HMAC_SHA_224: + *out_burst_size = + ((MB_MGR_HMAC_SHA_256_OOO *) (mb_mgr->hmac_sha_224_ooo))->total_num_lanes; + break; case IMB_AUTH_SHA_224: + *out_burst_size = ((MB_MGR_SHA_256_OOO *) (mb_mgr->sha_224_ooo))->total_num_lanes; + break; case IMB_AUTH_HMAC_SHA_256: + *out_burst_size = + ((MB_MGR_HMAC_SHA_256_OOO *) 
(mb_mgr->hmac_sha_256_ooo))->total_num_lanes; + break; case IMB_AUTH_SHA_256: - switch (used_arch) { - case IMB_ARCH_NOAESNI: - case IMB_ARCH_SSE: - *out_burst_size = SSE_NUM_SHA256_LANES; - break; - case IMB_ARCH_AVX: - *out_burst_size = AVX_NUM_SHA256_LANES; - break; - case IMB_ARCH_AVX2: - *out_burst_size = AVX2_NUM_SHA256_LANES; - break; - case IMB_ARCH_AVX512: - default: - *out_burst_size = AVX2_NUM_SHA256_LANES; - break; - } + *out_burst_size = ((MB_MGR_SHA_256_OOO *) (mb_mgr->sha_256_ooo))->total_num_lanes; break; case IMB_AUTH_HMAC_SHA_384: + *out_burst_size = + ((MB_MGR_HMAC_SHA_512_OOO *) (mb_mgr->hmac_sha_384_ooo))->total_num_lanes; + break; case IMB_AUTH_SHA_384: + *out_burst_size = ((MB_MGR_SHA_512_OOO *) (mb_mgr->sha_384_ooo))->total_num_lanes; + break; case IMB_AUTH_HMAC_SHA_512: + *out_burst_size = + ((MB_MGR_HMAC_SHA_512_OOO *) (mb_mgr->hmac_sha_512_ooo))->total_num_lanes; + break; case IMB_AUTH_SHA_512: - switch (used_arch) { - case IMB_ARCH_NOAESNI: - case IMB_ARCH_SSE: - *out_burst_size = SSE_NUM_SHA512_LANES; - break; - case IMB_ARCH_AVX: - *out_burst_size = AVX_NUM_SHA512_LANES; - break; - case IMB_ARCH_AVX2: - *out_burst_size = AVX2_NUM_SHA512_LANES; - break; - case IMB_ARCH_AVX512: - default: - *out_burst_size = AVX2_NUM_SHA512_LANES; - break; - } + *out_burst_size = ((MB_MGR_SHA_512_OOO *) (mb_mgr->sha_512_ooo))->total_num_lanes; break; #endif /* __aarch64__ */ default: -- GitLab From f615acc090386cbccc51bff6b6dde86dbda12a73 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Wed, 19 Jun 2024 16:17:44 +0000 Subject: [PATCH 06/24] avx2_t4: [SHA512] add multi-buffer implementation Signed-off-by: Marcel Cornu --- lib/Makefile | 1 + lib/avx2_t1/sha_mb_avx2.c | 8 +- lib/avx2_t4/sha512_x2_ni_avx2.asm | 412 ++++++++++++++++++++++++++++++ lib/avx2_t4/sha_ni_avx2.c | 24 +- lib/avx512_t1/sha_mb_avx512.c | 8 +- lib/avx_t1/sha_mb_avx.c | 8 +- lib/include/arch_avx2_type4.h | 3 + lib/include/constants.inc | 1 + lib/include/sha_mb_mgr.h | 45 +++- 
lib/sse_t1/sha_mb_sse.c | 8 +- lib/win_x64.mak | 1 + 11 files changed, 489 insertions(+), 30 deletions(-) create mode 100644 lib/avx2_t4/sha512_x2_ni_avx2.asm diff --git a/lib/Makefile b/lib/Makefile index 364f46b1..02d7503f 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -779,6 +779,7 @@ asm_avx2_t4_lib_objs := \ sm3_msg_avx2.o \ sm3_hmac_avx2.o \ sha512_x1_ni_avx2.o \ + sha512_x2_ni_avx2.o \ sha512_hmac_ni_avx2.o # diff --git a/lib/avx2_t1/sha_mb_avx2.c b/lib/avx2_t1/sha_mb_avx2.c index eee83390..5777ccbf 100644 --- a/lib/avx2_t1/sha_mb_avx2.c +++ b/lib/avx2_t1/sha_mb_avx2.c @@ -111,7 +111,7 @@ IMB_JOB * submit_job_sha384_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 4, 1, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x4_avx2_from_c); + SHA384_PAD_SIZE, call_sha512_x4_avx2_from_c, 0); } IMB_DLL_LOCAL @@ -119,7 +119,7 @@ IMB_JOB * flush_job_sha384_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 4, 0, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x4_avx2_from_c); + SHA384_PAD_SIZE, call_sha512_x4_avx2_from_c, 0); } /* ========================================================================== */ @@ -132,7 +132,7 @@ IMB_JOB * submit_job_sha512_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 4, 1, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x4_avx2_from_c); + SHA512_PAD_SIZE, call_sha512_x4_avx2_from_c, 0); } IMB_DLL_LOCAL @@ -140,5 +140,5 @@ IMB_JOB * flush_job_sha512_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 4, 0, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x4_avx2_from_c); + SHA512_PAD_SIZE, call_sha512_x4_avx2_from_c, 0); } diff --git a/lib/avx2_t4/sha512_x2_ni_avx2.asm b/lib/avx2_t4/sha512_x2_ni_avx2.asm new file mode 100644 index 00000000..6995a275 --- /dev/null +++ b/lib/avx2_t4/sha512_x2_ni_avx2.asm @@ -0,0 +1,412 @@ +;; 
+;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; =========================================================== +;; NOTE about comment format: +;; +;; xmm = a b c d +;; ^ ^ +;; | | +;; MSB--+ +--LSB +;; +;; a - most significant word in `ymm` +;; d - least significant word in `ymm` +;; =========================================================== + +%use smartalign + +%include "include/os.inc" +%include "include/clear_regs.inc" +%include "include/reg_sizes.inc" +%include "include/mb_mgr_datastruct.inc" + +; resdq = res0 => 16 bytes +struc frame +.ABEF_SAVE resy 1 +.CDGH_SAVE resy 1 +.ABEF_SAVEb resy 1 +.CDGH_SAVEb resy 1 +endstruc + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +%define args arg1 +%define NUM_BLKS arg2 + +%define INP arg3 +%define INPb arg4 + +%define SHA512_CONSTS rax + +%define MSG ymm0 +%define STATE0 ymm1 +%define STATE1 ymm2 +%define MSGTMP0 ymm3 +%define MSGTMP1 ymm4 +%define MSGTMP2 ymm5 + +%define YTMP0 ymm6 +%define YTMP1 ymm7 + +%define STATE0b ymm8 +%define STATE1b ymm9 +%define MSGb ymm10 + +%define YTMP2 ymm11 +%define YTMP3 ymm12 + +%define MSGTMP0b ymm13 +%define MSGTMP1b ymm14 +%define MSGTMP2b ymm15 + +%define GP_STORAGE 6*8 +%ifndef LINUX +%define XMM_STORAGE 10*16 +%else +%define XMM_STORAGE 0 +%endif + +%define VARIABLE_OFFSET XMM_STORAGE + GP_STORAGE +%define GP_OFFSET XMM_STORAGE + +%macro FUNC_SAVE 0 + mov r11, rsp + sub rsp, VARIABLE_OFFSET + and rsp, ~31 ; align rsp to 32 bytes + + mov [rsp + 0*8], rbx + mov [rsp + 1*8], rbp + mov [rsp + 2*8], r12 +%ifndef LINUX + mov [rsp + 3*8], rsi + mov [rsp + 4*8], rdi + vmovdqa [rsp + 3*16], xmm6 + vmovdqa [rsp + 4*16], xmm7 + vmovdqa [rsp + 5*16], xmm8 + vmovdqa [rsp + 6*16], xmm9 + vmovdqa [rsp + 7*16], xmm10 + vmovdqa [rsp + 8*16], xmm11 + vmovdqa [rsp + 9*16], xmm12 + vmovdqa [rsp + 10*16], xmm13 + vmovdqa [rsp + 11*16], xmm14 + vmovdqa [rsp + 12*16], xmm15 +%endif ; LINUX + mov [rsp + 5*8], r11 ;; 
rsp pointer +%endmacro + +%macro FUNC_RESTORE 0 + mov rbx, [rsp + 0*8] + mov rbp, [rsp + 1*8] + mov r12, [rsp + 2*8] +%ifndef LINUX + mov rsi, [rsp + 3*8] + mov rdi, [rsp + 4*8] + vmovdqa xmm6, [rsp + 3*16] + vmovdqa xmm7, [rsp + 4*16] + vmovdqa xmm8, [rsp + 5*16] + vmovdqa xmm9, [rsp + 6*16] + vmovdqa xmm10, [rsp + 7*16] + vmovdqa xmm11, [rsp + 8*16] + vmovdqa xmm12, [rsp + 9*16] + vmovdqa xmm13, [rsp + 10*16] + vmovdqa xmm14, [rsp + 11*16] + vmovdqa xmm15, [rsp + 12*16] +%endif ; LINUX + mov rsp, [rsp + 5*8] ;; rsp pointer +%endmacro + +%macro SHA512ROUNDS4 7 +%define %%Y0 %1 +%define %%Y1 %2 +%define %%Y2 %3 +%define %%Y3 %4 +%define %%Y4 %5 +%define %%Y6 %6 +%define %%I %7 + + vpaddq %%Y0, %%Y3, [SHA512_CONSTS+32*%%I] + vpermq YTMP3, %%Y3, 0x1b + vpermq YTMP1, %%Y6, 0x39 + vpblendd YTMP1, YTMP3, YTMP1, 0x3f + vpaddq %%Y4, %%Y4, YTMP1 + vsha512msg2 %%Y4, %%Y3 + vsha512rnds2 %%Y2, %%Y1, XWORD(%%Y0) + vperm2i128 %%Y0, %%Y0, %%Y0, 0x01 + vsha512rnds2 %%Y1, %%Y2, XWORD(%%Y0) + vsha512msg1 %%Y6, XWORD(%%Y3) +%endmacro + +%macro SHA512ROUNDS4_FINAL 7 +%define %%Y0 %1 +%define %%Y1 %2 +%define %%Y2 %3 +%define %%Y3 %4 +%define %%Y4 %5 +%define %%Y6 %6 +%define %%I %7 + + vpaddq %%Y0, %%Y3, [SHA512_CONSTS+32*%%I] + vpermq YTMP3, %%Y3, 0x1b + vpermq YTMP1, %%Y6, 0x39 + vpblendd YTMP1, YTMP3, YTMP1, 0x3f + vpaddq %%Y4, %%Y4, YTMP1 + vsha512msg2 %%Y4, %%Y3 + vsha512rnds2 %%Y2, %%Y1, XWORD(%%Y0) + vperm2i128 %%Y0, %%Y0, %%Y0, 0x01 + vsha512rnds2 %%Y1, %%Y2, XWORD(%%Y0) +%endmacro + +;; re-use symbols from AVX codebase +extern SHA512_K_AVX + +mksection .rodata +default rel + +align 32 +SHUF_MASK: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +mksection .text +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha512_ni_x2(SHA512_ARGS *args, UINT64 size_in_blocks) +;; arg1 : pointer to args +;; arg2 : size (in blocks) ;; assumed 
to be >= 1 +align 32 +MKGLOBAL(sha512_ni_x2,function,internal) +sha512_ni_x2: + mov r11, rsp + sub rsp, frame_size + and rsp, -32 + + or NUM_BLKS, NUM_BLKS + je .done_hash + + ;; load input pointers + mov INP, [args + _data_ptr_sha512 + 0*PTR_SZ] + mov INPb, [args + _data_ptr_sha512 + 1*PTR_SZ] + + ;; load constants pointer + lea SHA512_CONSTS, [rel SHA512_K_AVX] + + ;; load current hash value and transform + vmovdqu STATE0, [args + 0*SHA512NI_DIGEST_ROW_SIZE] + vmovdqu STATE1, [args + 0*SHA512NI_DIGEST_ROW_SIZE + 32] + vmovdqu STATE0b, [args + 1*SHA512NI_DIGEST_ROW_SIZE] + vmovdqu STATE1b, [args + 1*SHA512NI_DIGEST_ROW_SIZE + 32] + + vperm2i128 YTMP1, STATE0, STATE1, 0x20 + vperm2i128 YTMP0, STATE0b, STATE1b, 0x20 + vperm2i128 STATE1, STATE0, STATE1, 0x31 + vperm2i128 STATE1b, STATE0b, STATE1b, 0x31 + vpermq STATE0, YTMP1, 0x1b + vpermq STATE0b, YTMP0, 0x1b + vpermq STATE1, STATE1, 0x1b + vpermq STATE1b, STATE1b, 0x1b + +align 32 +.block_loop: + ;; Save digests + vmovdqa [rsp + frame.ABEF_SAVE], STATE0 + vmovdqa [rsp + frame.CDGH_SAVE], STATE1 + vmovdqa [rsp + frame.ABEF_SAVEb], STATE0b + vmovdqa [rsp + frame.CDGH_SAVEb], STATE1b + + ;; R0- R3 + vmovdqu MSG, [INP+32*0] + vmovdqu MSGb, [INPb+32*0] + vpshufb MSG, MSG, [SHUF_MASK] + vpshufb MSGb, MSGb, [SHUF_MASK] + vmovdqu MSGTMP0, MSG + vmovdqu MSGTMP0b, MSGb + vpaddq MSG, MSG, [SHA512_CONSTS+32*0] + vpaddq MSGb, MSGb, [SHA512_CONSTS+32*0] + vsha512rnds2 STATE1, STATE0, XWORD(MSG) + vsha512rnds2 STATE1b, STATE0b, XWORD(MSGb) + vperm2i128 MSG, MSG, MSG, 0x01 + vperm2i128 MSGb, MSGb, MSGb, 0x01 + vsha512rnds2 STATE0, STATE1, XWORD(MSG) + vsha512rnds2 STATE0b, STATE1b, XWORD(MSGb) + + ;; R4-7 + vmovdqu MSG, [INP+32*1] + vmovdqu MSGb, [INPb+32*1] + vpshufb MSG, MSG, [SHUF_MASK] + vpshufb MSGb, MSGb, [SHUF_MASK] + vmovdqu MSGTMP1, MSG + vmovdqu MSGTMP1b, MSGb + vpaddq MSG, MSG, [SHA512_CONSTS+32*1] + vpaddq MSGb, MSGb, [SHA512_CONSTS+32*1] + vsha512rnds2 STATE1, STATE0, XWORD(MSG) + vsha512rnds2 STATE1b, STATE0b, 
XWORD(MSGb) + vperm2i128 MSG, MSG, MSG, 0x01 + vperm2i128 MSGb, MSGb, MSGb, 0x01 + vsha512rnds2 STATE0, STATE1, XWORD(MSG) + vsha512rnds2 STATE0b, STATE1b, XWORD(MSGb) + vsha512msg1 MSGTMP0, XWORD(MSGTMP1) + vsha512msg1 MSGTMP0b, XWORD(MSGTMP1b) + + ;; R8-R11 + vmovdqu MSG, [INP+32*2] + vmovdqu MSGb, [INPb+32*2] + vpshufb MSG, MSG, [SHUF_MASK] + vpshufb MSGb, MSGb, [SHUF_MASK] + vmovdqu MSGTMP2, MSG + vmovdqu MSGTMP2b, MSGb + + + vpaddq MSG, MSG, [SHA512_CONSTS+32*2] + vpaddq MSGb, MSGb, [SHA512_CONSTS+32*2] + vsha512rnds2 STATE1, STATE0, XWORD(MSG) + vsha512rnds2 STATE1b, STATE0b, XWORD(MSGb) + vperm2i128 MSG, MSG, MSG, 0x01 + vperm2i128 MSGb, MSGb, MSGb, 0x01 + vsha512rnds2 STATE0, STATE1, XWORD(MSG) + vsha512rnds2 STATE0b, STATE1b, XWORD(MSGb) + vsha512msg1 MSGTMP1, XWORD(MSGTMP2) + vsha512msg1 MSGTMP1b, XWORD(MSGTMP2b) + + ;; R12-15 + vmovdqu MSG, [INP+32*3] + vmovdqu MSGb, [INPb+32*3] + vpshufb MSG, MSG, [SHUF_MASK] + vpshufb MSGb, MSGb, [SHUF_MASK] + vmovdqu YTMP0, MSG + vmovdqu YTMP2, MSGb + + ;; R16-75 + SHA512ROUNDS4 MSG, STATE0, STATE1, YTMP0, MSGTMP0, MSGTMP2, 3 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, YTMP2, MSGTMP0b, MSGTMP2b, 3 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP0, MSGTMP1, YTMP0, 4 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP0b, MSGTMP1b, YTMP2, 4 + + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP1, MSGTMP2, MSGTMP0, 5 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP1b, MSGTMP2b, MSGTMP0b, 5 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP2, YTMP0, MSGTMP1, 6 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP2b, YTMP2, MSGTMP1b, 6 + + SHA512ROUNDS4 MSG, STATE0, STATE1, YTMP0, MSGTMP0, MSGTMP2, 7 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, YTMP2, MSGTMP0b, MSGTMP2b, 7 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP0, MSGTMP1, YTMP0, 8 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP0b, MSGTMP1b, YTMP2, 8 + + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP1, MSGTMP2, MSGTMP0, 9 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP1b, MSGTMP2b, MSGTMP0b, 9 + SHA512ROUNDS4 MSG, 
STATE0, STATE1, MSGTMP2, YTMP0, MSGTMP1, 10 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP2b, YTMP2, MSGTMP1b, 10 + + SHA512ROUNDS4 MSG, STATE0, STATE1, YTMP0, MSGTMP0, MSGTMP2, 11 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, YTMP2, MSGTMP0b, MSGTMP2b, 11 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP0, MSGTMP1, YTMP0, 12 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP0b, MSGTMP1b, YTMP2, 12 + + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP1, MSGTMP2, MSGTMP0, 13 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP1b, MSGTMP2b, MSGTMP0b, 13 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP2, YTMP0, MSGTMP1, 14 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP2b, YTMP2, MSGTMP1b, 14 + + SHA512ROUNDS4 MSG, STATE0, STATE1, YTMP0, MSGTMP0, MSGTMP2, 15 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, YTMP2, MSGTMP0b, MSGTMP2b, 15 + SHA512ROUNDS4 MSG, STATE0, STATE1, MSGTMP0, MSGTMP1, YTMP0, 16 + SHA512ROUNDS4 MSGb, STATE0b, STATE1b, MSGTMP0b, MSGTMP1b, YTMP2, 16 + + SHA512ROUNDS4_FINAL MSG, STATE0, STATE1, MSGTMP1, MSGTMP2, MSGTMP0, 17 + SHA512ROUNDS4_FINAL MSGb, STATE0b, STATE1b, MSGTMP1b, MSGTMP2b, MSGTMP0b, 17 + SHA512ROUNDS4_FINAL MSG, STATE0, STATE1, MSGTMP2, YTMP0, MSGTMP1, 18 + SHA512ROUNDS4_FINAL MSGb, STATE0b, STATE1b, MSGTMP2b, YTMP2, MSGTMP1b, 18 + + ;; R76-79 + vpaddq MSG, YTMP0, [SHA512_CONSTS+32*19] + vpaddq MSGb, YTMP2, [SHA512_CONSTS+32*19] + vsha512rnds2 STATE1, STATE0, XWORD(MSG) + vsha512rnds2 STATE1b, STATE0b, XWORD(MSGb) + vperm2i128 MSG, MSG, MSG, 0x01 + vperm2i128 MSGb, MSGb, MSGb, 0x01 + vsha512rnds2 STATE0, STATE1, XWORD(MSG) + vsha512rnds2 STATE0b, STATE1b, XWORD(MSGb) + + vpaddq STATE0, STATE0, [rsp + frame.ABEF_SAVE] + vpaddq STATE1, STATE1, [rsp + frame.CDGH_SAVE] + vpaddq STATE0b, STATE0b, [rsp + frame.ABEF_SAVEb] + vpaddq STATE1b, STATE1b, [rsp + frame.CDGH_SAVEb] + + lea INP, [INP+128] + lea INPb, [INPb+128] + + dec NUM_BLKS + jne .block_loop + + ;; Update input pointers + mov [args + _data_ptr_sha512 + 0*PTR_SZ], INP + mov [args + _data_ptr_sha512 + 1*PTR_SZ], INPb + + ; 
Reorder and write back the hash value + vperm2i128 MSGTMP0, STATE0, STATE1, 0x31 + vperm2i128 MSGTMP1, STATE0b, STATE1b, 0x31 + vperm2i128 MSGTMP2, STATE0, STATE1, 0x20 + vperm2i128 YTMP0, STATE0b, STATE1b, 0x20 + vpermq STATE0, MSGTMP0, 0xb1 + vpermq STATE1, MSGTMP2, 0xb1 + vpermq STATE0b, MSGTMP1, 0xb1 + vpermq STATE1b, YTMP0, 0xb1 + + ;; update digests + vmovdqu [args + 0*SHA512NI_DIGEST_ROW_SIZE], STATE0 + vmovdqu [args + 0*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1 + vmovdqu [args + 1*SHA512NI_DIGEST_ROW_SIZE], STATE0b + vmovdqu [args + 1*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1b + + vzeroupper + +.done_hash: + + mov rsp, r11 + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; void call_sha512_ni_x2_avx2_from_c(SHA512_ARGS *args, UINT64 size_in_blocks); +MKGLOBAL(call_sha512_ni_x2_avx2_from_c,function,internal) +call_sha512_ni_x2_avx2_from_c: + FUNC_SAVE + call sha512_ni_x2 + FUNC_RESTORE + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/sha_ni_avx2.c b/lib/avx2_t4/sha_ni_avx2.c index 7cdab60a..1453fc87 100644 --- a/lib/avx2_t4/sha_ni_avx2.c +++ b/lib/avx2_t4/sha_ni_avx2.c @@ -26,6 +26,7 @@ *******************************************************************************/ #include "include/sha_generic.h" +#include "include/sha_mb_mgr.h" #include "include/arch_avx2_type4.h" /* ========================================================================== */ @@ -95,27 +96,30 @@ flush_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) /* ========================================================================== */ /* - * SHA512 API for JOB API + * SHA512 MB API for JOB API */ IMB_JOB * submit_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { - const void *msg = (job->src + job->hash_start_src_offset_in_bytes); - const uint64_t length = job->msg_len_to_hash_in_bytes; - uint64_t tag[8]; - +#ifdef SMX_NI + return 
submit_flush_job_sha_512(state, job, 2, 1, 512, IMB_SHA_512_BLOCK_SIZE, + SHA512_PAD_SIZE, call_sha512_ni_x2_avx2_from_c, 1); +#else (void) state; - - sha512_ni_avx2(msg, length, tag); - memcpy(job->auth_tag_output, tag, job->auth_tag_output_len_in_bytes); - job->status |= IMB_STATUS_COMPLETED_AUTH; - return job; + (void) job; + return NULL; +#endif /* ifdef SMX_NI */ } IMB_JOB * flush_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { +#ifdef SMX_NI + return submit_flush_job_sha_512(state, job, 2, 0, 512, IMB_SHA_512_BLOCK_SIZE, + SHA512_PAD_SIZE, call_sha512_ni_x2_avx2_from_c, 1); +#else (void) state; (void) job; return NULL; +#endif /* ifdef SMX_NI */ } diff --git a/lib/avx512_t1/sha_mb_avx512.c b/lib/avx512_t1/sha_mb_avx512.c index 4c2cfc07..30bc20fa 100644 --- a/lib/avx512_t1/sha_mb_avx512.c +++ b/lib/avx512_t1/sha_mb_avx512.c @@ -111,7 +111,7 @@ IMB_JOB * submit_job_sha384_avx512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 8, 1, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x8_avx512_from_c); + SHA384_PAD_SIZE, call_sha512_x8_avx512_from_c, 0); } IMB_DLL_LOCAL @@ -119,7 +119,7 @@ IMB_JOB * flush_job_sha384_avx512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 8, 0, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x8_avx512_from_c); + SHA384_PAD_SIZE, call_sha512_x8_avx512_from_c, 0); } /* ========================================================================== */ @@ -132,7 +132,7 @@ IMB_JOB * submit_job_sha512_avx512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 8, 1, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x8_avx512_from_c); + SHA512_PAD_SIZE, call_sha512_x8_avx512_from_c, 0); } IMB_DLL_LOCAL @@ -140,5 +140,5 @@ IMB_JOB * flush_job_sha512_avx512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 8, 0, 512, IMB_SHA_512_BLOCK_SIZE, - 
SHA512_PAD_SIZE, call_sha512_x8_avx512_from_c); + SHA512_PAD_SIZE, call_sha512_x8_avx512_from_c, 0); } diff --git a/lib/avx_t1/sha_mb_avx.c b/lib/avx_t1/sha_mb_avx.c index e452f132..4c7f269d 100644 --- a/lib/avx_t1/sha_mb_avx.c +++ b/lib/avx_t1/sha_mb_avx.c @@ -111,7 +111,7 @@ IMB_JOB * submit_job_sha384_avx(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 1, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x2_avx_from_c); + SHA384_PAD_SIZE, call_sha512_x2_avx_from_c, 0); } IMB_DLL_LOCAL @@ -119,7 +119,7 @@ IMB_JOB * flush_job_sha384_avx(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 0, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x2_avx_from_c); + SHA384_PAD_SIZE, call_sha512_x2_avx_from_c, 0); } /* ========================================================================== */ @@ -132,7 +132,7 @@ IMB_JOB * submit_job_sha512_avx(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 1, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x2_avx_from_c); + SHA512_PAD_SIZE, call_sha512_x2_avx_from_c, 0); } IMB_DLL_LOCAL @@ -140,5 +140,5 @@ IMB_JOB * flush_job_sha512_avx(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 0, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x2_avx_from_c); + SHA512_PAD_SIZE, call_sha512_x2_avx_from_c, 0); } diff --git a/lib/include/arch_avx2_type4.h b/lib/include/arch_avx2_type4.h index 8b9628ce..44e1f7a4 100644 --- a/lib/include/arch_avx2_type4.h +++ b/lib/include/arch_avx2_type4.h @@ -90,4 +90,7 @@ submit_job_hmac_sha_384_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job); IMB_JOB * submit_job_hmac_sha_512_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job); +void +call_sha512_ni_x2_avx2_from_c(SHA512_ARGS *args, uint64_t size_in_blocks); + #endif /* IMB_ASM_AVX2_T4_H */ diff --git a/lib/include/constants.inc 
b/lib/include/constants.inc index 152c1ca3..a259ea0a 100644 --- a/lib/include/constants.inc +++ b/lib/include/constants.inc @@ -59,6 +59,7 @@ ;; Sanity checks to fail build if not satisfied %define SHA1NI_DIGEST_ROW_SIZE (NUM_SHA1_DIGEST_WORDS * SHA1_DIGEST_WORD_SIZE) %define SHA256NI_DIGEST_ROW_SIZE (NUM_SHA256_DIGEST_WORDS * SHA256_DIGEST_WORD_SIZE) +%define SHA512NI_DIGEST_ROW_SIZE (NUM_SHA512_DIGEST_WORDS * SHA512_DIGEST_WORD_SIZE) %define MD5_BLK_SZ 128 ; in bytes %define SHA1_BLK_SZ 64 ; in bytes diff --git a/lib/include/sha_mb_mgr.h b/lib/include/sha_mb_mgr.h index 7d1ebbee..0584df27 100644 --- a/lib/include/sha_mb_mgr.h +++ b/lib/include/sha_mb_mgr.h @@ -70,6 +70,18 @@ copy_bswap8_array_mb(void *dst, const void *src, const size_t num, const size_t outp[i] = bswap8(inp[lane + i * offset]); } +__forceinline void +copy_bswap8_array_mb_ni(void *dst, const void *src, const size_t num, const unsigned lane, + const int digest_row_sz) +{ + uint64_t *outp = (uint64_t *) dst; + const uint64_t *inp = (const uint64_t *) src; + size_t i; + + for (i = 0; i < num; i++) + outp[i] = bswap8(inp[digest_row_sz * lane + i]); +} + __forceinline void sha1_mb_init_digest(uint32_t *digest, const unsigned lane) { @@ -168,6 +180,19 @@ sha512_mb_init_digest(uint64_t *digest, const unsigned lane) digest[lane + 7 * 8] = SHA512_H7; } +__forceinline void +sha512_ni_mb_init_digest(uint64_t *digest, const unsigned lane) +{ + digest[8 * lane + 0] = SHA512_H0; + digest[8 * lane + 1] = SHA512_H1; + digest[8 * lane + 2] = SHA512_H2; + digest[8 * lane + 3] = SHA512_H3; + digest[8 * lane + 4] = SHA512_H4; + digest[8 * lane + 5] = SHA512_H5; + digest[8 * lane + 6] = SHA512_H6; + digest[8 * lane + 7] = SHA512_H7; +} + __forceinline void sha_mb_generic_init(void *digest, const int sha_type, const unsigned lane) { @@ -192,6 +217,8 @@ sha_ni_mb_generic_init(void *digest, const int sha_type, const unsigned lane) sha224_ni_mb_init_digest(digest, lane); else if (sha_type == 256) 
sha256_ni_mb_init_digest(digest, lane); + else if (sha_type == 512) + sha512_ni_mb_init_digest(digest, lane); } __forceinline void @@ -219,6 +246,8 @@ sha_ni_mb_generic_write_digest(void *dst, const void *src, const int sha_type, c copy_bswap4_array_mb_ni(dst, src, NUM_SHA_224_DIGEST_WORDS, lane, 8); else if (sha_type == 256) copy_bswap4_array_mb_ni(dst, src, NUM_SHA_256_DIGEST_WORDS, lane, 8); + else if (sha_type == 512) + copy_bswap8_array_mb_ni(dst, src, NUM_SHA_512_DIGEST_WORDS, lane, 8); } __forceinline void @@ -545,7 +574,8 @@ submit_flush_job_sha_256(MB_MGR_SHA_256_OOO *state, IMB_JOB *job, const unsigned __forceinline IMB_JOB * submit_flush_job_sha_512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job, const unsigned max_jobs, const int is_submit, const int sha_type, const uint64_t blk_size, - const uint64_t pad_size, void (*fn)(SHA512_ARGS *, uint64_t)) + const uint64_t pad_size, void (*fn)(SHA512_ARGS *, uint64_t), + const int shani) { unsigned lane, min_idx; IMB_JOB *ret_job = NULL; @@ -561,7 +591,10 @@ submit_flush_job_sha_512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job, const unsigned state->num_lanes_inuse++; state->args.data_ptr[lane] = job->src + job->hash_start_src_offset_in_bytes; - sha_mb_generic_init(state->args.digest, sha_type, lane); + if (shani) + sha_ni_mb_generic_init(state->args.digest, sha_type, lane); + else + sha_mb_generic_init(state->args.digest, sha_type, lane); /* copy job data in and set up initial blocks */ state->ldata[lane].job_in_lane = job; @@ -656,8 +689,12 @@ submit_flush_job_sha_512(MB_MGR_SHA_512_OOO *state, IMB_JOB *job, const unsigned /* put back processed packet into unused lanes, set job as complete */ state->unused_lanes = (state->unused_lanes << 4) | min_idx; state->num_lanes_inuse--; - sha_mb_generic_write_digest(ret_job->auth_tag_output, state->args.digest, sha_type, 8, - min_idx); + if (shani) + sha_ni_mb_generic_write_digest(ret_job->auth_tag_output, state->args.digest, + sha_type, min_idx); + else + 
sha_mb_generic_write_digest(ret_job->auth_tag_output, state->args.digest, sha_type, + 8, min_idx); ret_job->status |= IMB_STATUS_COMPLETED_AUTH; state->ldata[min_idx].job_in_lane = NULL; return ret_job; diff --git a/lib/sse_t1/sha_mb_sse.c b/lib/sse_t1/sha_mb_sse.c index c8dcb904..841717d5 100644 --- a/lib/sse_t1/sha_mb_sse.c +++ b/lib/sse_t1/sha_mb_sse.c @@ -101,7 +101,7 @@ IMB_JOB * submit_job_sha384_sse(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 1, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x2_sse_from_c); + SHA384_PAD_SIZE, call_sha512_x2_sse_from_c, 0); } IMB_DLL_LOCAL @@ -109,7 +109,7 @@ IMB_JOB * flush_job_sha384_sse(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 0, 384, IMB_SHA_512_BLOCK_SIZE, - SHA384_PAD_SIZE, call_sha512_x2_sse_from_c); + SHA384_PAD_SIZE, call_sha512_x2_sse_from_c, 0); } /* ========================================================================== */ @@ -122,7 +122,7 @@ IMB_JOB * submit_job_sha512_sse(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 1, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x2_sse_from_c); + SHA512_PAD_SIZE, call_sha512_x2_sse_from_c, 0); } IMB_DLL_LOCAL @@ -130,5 +130,5 @@ IMB_JOB * flush_job_sha512_sse(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { return submit_flush_job_sha_512(state, job, 2, 0, 512, IMB_SHA_512_BLOCK_SIZE, - SHA512_PAD_SIZE, call_sha512_x2_sse_from_c); + SHA512_PAD_SIZE, call_sha512_x2_sse_from_c, 0); } diff --git a/lib/win_x64.mak b/lib/win_x64.mak index 3e358a74..49214766 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -597,6 +597,7 @@ avx2_t4_objs = \ $(OBJ_DIR)\sm3_msg_avx2.obj \ $(OBJ_DIR)\sm3_hmac_avx2.obj \ $(OBJ_DIR)\sha512_x1_ni_avx2.obj \ + $(OBJ_DIR)\sha512_x2_ni_avx2.obj \ $(OBJ_DIR)\sha_ni_avx2.obj \ $(OBJ_DIR)\sha512_hmac_ni_avx2.obj -- GitLab From f502ff4f33a192167147e3c36f3058fb09139688 Mon Sep 17 00:00:00 2001 
From: Marcel Cornu Date: Fri, 21 Jun 2024 16:28:34 +0100 Subject: [PATCH 07/24] avx2_t4: remove duplicate function prototypes Signed-off-by: Marcel Cornu --- lib/include/arch_avx2_type4.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/lib/include/arch_avx2_type4.h b/lib/include/arch_avx2_type4.h index 44e1f7a4..ba24878c 100644 --- a/lib/include/arch_avx2_type4.h +++ b/lib/include/arch_avx2_type4.h @@ -75,11 +75,6 @@ flush_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); IMB_JOB * flush_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); -IMB_JOB * -submit_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); -IMB_JOB * -submit_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); - IMB_JOB * flush_job_hmac_sha_384_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state); IMB_JOB * -- GitLab From 91c5a22d2d4c132310ca30e1a80f7e2d2f2a9ca9 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Fri, 21 Jun 2024 16:50:58 +0100 Subject: [PATCH 08/24] avx2_t4: [SHA384] add multi-buffer implementation Signed-off-by: Marcel Cornu --- lib/avx2_t4/sha_ni_avx2.c | 23 +++++++++++++---------- lib/include/sha_mb_mgr.h | 17 +++++++++++++++++ 2 files changed, 30 insertions(+), 10 deletions(-) diff --git a/lib/avx2_t4/sha_ni_avx2.c b/lib/avx2_t4/sha_ni_avx2.c index 1453fc87..2a08e992 100644 --- a/lib/avx2_t4/sha_ni_avx2.c +++ b/lib/avx2_t4/sha_ni_avx2.c @@ -69,29 +69,32 @@ sha512_ni_avx2(const void *data, const uint64_t length, void *digest) /* ========================================================================== */ /* - * SHA384 API for JOB API + * SHA384 MB API for JOB API */ IMB_JOB * submit_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { - const void *msg = (job->src + job->hash_start_src_offset_in_bytes); - const uint64_t length = job->msg_len_to_hash_in_bytes; - uint64_t tag[8]; - +#ifdef SMX_NI + return submit_flush_job_sha_512(state, job, 2, 1, 384, IMB_SHA_384_BLOCK_SIZE, + SHA384_PAD_SIZE, call_sha512_ni_x2_avx2_from_c, 1); +#else 
(void) state; - - sha384_ni_avx2(msg, length, tag); - memcpy(job->auth_tag_output, tag, job->auth_tag_output_len_in_bytes); - job->status |= IMB_STATUS_COMPLETED_AUTH; - return job; + (void) job; + return NULL; +#endif /* ifdef SMX_NI */ } IMB_JOB * flush_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) { +#ifdef SMX_NI + return submit_flush_job_sha_512(state, job, 2, 0, 384, IMB_SHA_384_BLOCK_SIZE, + SHA384_PAD_SIZE, call_sha512_ni_x2_avx2_from_c, 1); +#else (void) state; (void) job; return NULL; +#endif /* ifdef SMX_NI */ } /* ========================================================================== */ diff --git a/lib/include/sha_mb_mgr.h b/lib/include/sha_mb_mgr.h index 0584df27..702dc371 100644 --- a/lib/include/sha_mb_mgr.h +++ b/lib/include/sha_mb_mgr.h @@ -167,6 +167,19 @@ sha384_mb_init_digest(uint64_t *digest, const unsigned lane) digest[lane + 7 * 8] = SHA384_H7; } +__forceinline void +sha384_ni_mb_init_digest(uint64_t *digest, const unsigned lane) +{ + digest[8 * lane + 0] = SHA384_H0; + digest[8 * lane + 1] = SHA384_H1; + digest[8 * lane + 2] = SHA384_H2; + digest[8 * lane + 3] = SHA384_H3; + digest[8 * lane + 4] = SHA384_H4; + digest[8 * lane + 5] = SHA384_H5; + digest[8 * lane + 6] = SHA384_H6; + digest[8 * lane + 7] = SHA384_H7; +} + __forceinline void sha512_mb_init_digest(uint64_t *digest, const unsigned lane) { @@ -217,6 +230,8 @@ sha_ni_mb_generic_init(void *digest, const int sha_type, const unsigned lane) sha224_ni_mb_init_digest(digest, lane); else if (sha_type == 256) sha256_ni_mb_init_digest(digest, lane); + else if (sha_type == 384) + sha384_ni_mb_init_digest(digest, lane); else if (sha_type == 512) sha512_ni_mb_init_digest(digest, lane); } @@ -246,6 +261,8 @@ sha_ni_mb_generic_write_digest(void *dst, const void *src, const int sha_type, c copy_bswap4_array_mb_ni(dst, src, NUM_SHA_224_DIGEST_WORDS, lane, 8); else if (sha_type == 256) copy_bswap4_array_mb_ni(dst, src, NUM_SHA_256_DIGEST_WORDS, lane, 8); + else if (sha_type == 
384) + copy_bswap8_array_mb_ni(dst, src, NUM_SHA_384_DIGEST_WORDS, lane, 8); else if (sha_type == 512) copy_bswap8_array_mb_ni(dst, src, NUM_SHA_512_DIGEST_WORDS, lane, 8); } -- GitLab From e0a89e33a77cf53533e77d3635e22a22a4796047 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 25 Jun 2024 14:33:54 +0100 Subject: [PATCH 09/24] test: [mp-app] check if shared memory already exists in the primary process Extra messages added on system call errors. --- test/mp-app/mp_shared_mem.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/test/mp-app/mp_shared_mem.c b/test/mp-app/mp_shared_mem.c index b996335c..816cafdf 100644 --- a/test/mp-app/mp_shared_mem.c +++ b/test/mp-app/mp_shared_mem.c @@ -86,13 +86,17 @@ shm_destroy(struct shared_memory *sm, const int is_pri) int ret = 0; if (!is_pri) - if (munmap(sm->ptr, sm->size) != 0) + if (munmap(sm->ptr, sm->size) != 0) { + perror("shm_destroy()"); ret = -1; + } sm->ptr = NULL; if (is_pri) - if (shm_unlink(sm->name) != 0) + if (shm_unlink(sm->name) != 0) { + perror("shm_destroy()"); ret = -1; + } sm->name = NULL; sm->size = 0; @@ -110,17 +114,27 @@ shm_create(struct shared_memory *sm, const int is_pri, const char *name, const s sm->ptr = MAP_FAILED; /* create the shared memory object */ - if (is_pri) + if (is_pri) { + fd = shm_open(sm->name, O_RDWR, 0666); + if (fd != -1) { + printf("shm_open(): %s already exists!\n", sm->name); + close(fd); + return -1; + } fd = shm_open(sm->name, O_CREAT | O_RDWR, 0666); - else + } else { fd = shm_open(sm->name, O_RDWR, 0666); + } - if (fd == -1) + if (fd == -1) { + perror("shm_create()"); return -1; + } /* configure the size of the shared memory object */ if (is_pri) { if (ftruncate(fd, sm->size) != 0) { + perror("shm_create()"); (void) shm_destroy(sm, is_pri); close(fd); return -1; @@ -170,6 +184,7 @@ shm_create(struct shared_memory *sm, const int is_pri, const char *name, const s close(fd); if (sm->ptr == MAP_FAILED) { + 
perror("shm_create()"); fprintf(stderr, "!mmap() of %s shared memory error\n", sm->name); (void) shm_destroy(sm, is_pri); return -1; -- GitLab From 5c352f22c9b4fbb4ad0c67f331809607efd652e5 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 25 Jun 2024 14:36:39 +0100 Subject: [PATCH 10/24] test: [mp-app] randomize shared memory names Primary process randomizes name of shared memory region so that multiple instances of the test can run in parallel. Randomized names are passed on from the primary process to the secondary process as command line arguments. --- test/mp-app/imb-mp-primary.c | 126 +++++++++++++++++++++++++++++++-- test/mp-app/imb-mp-secondary.c | 18 +++-- 2 files changed, 132 insertions(+), 12 deletions(-) diff --git a/test/mp-app/imb-mp-primary.c b/test/mp-app/imb-mp-primary.c index d44913a3..b45c78c7 100644 --- a/test/mp-app/imb-mp-primary.c +++ b/test/mp-app/imb-mp-primary.c @@ -50,6 +50,11 @@ mp_primary(const char *name2) #if defined(__linux__) || defined(__FreeBSD__) #include +#include /* close() and unlink() */ +#endif + +#ifdef _WIN32 +#include /* _mktemp() */ #endif /* @@ -136,25 +141,87 @@ prepare_reference_output(struct info_context *ctx, const int is_pri) return 0; } +static char * +randomize_shm_name(const char *name) +{ + if (name == NULL) + return NULL; + + char temp[8]; + + memset(temp, 0, sizeof(temp)); + strncpy(temp, "XXXXXX", sizeof(temp) - 1); + +#if defined(__linux__) || defined(__FreeBSD__) + int fd = mkstemp(temp); + + if (fd == -1) + return NULL; + + close(fd); + unlink(temp); +#endif + +#ifdef _WIN32 + (void) _mktemp(temp); +#endif + + const size_t name_len = strlen(name); + const size_t temp_len = strlen(temp); + const size_t new_len = name_len + temp_len + 1; + char *new_name = malloc(new_len); + + if (new_name == NULL) + return NULL; + + const int ret_len = snprintf(new_name, new_len, "%s%s", name, temp); + + if (ret_len >= (int) new_len || ret_len < 0) { + free(new_name); + return NULL; + } + + return new_name; +} + 
static int mp_primary(const char *name2) { const int is_pri = 1; + + char *shm_info_uname = randomize_shm_name(SHM_INFO_NAME); + + if (shm_info_uname == NULL) + return -1; + + char *shm_data_uname = randomize_shm_name(SHM_DATA_NAME); + + if (shm_data_uname == NULL) { + free(shm_info_uname); + return -1; + } + + fprintf(stdout, "PRIMARY: init start %p, %s, %s\n", (void *) imb_get_errno, shm_info_uname, + shm_data_uname); + struct shared_memory app_shm, info_shm; struct info_context *ctx = NULL; struct allocator app_alloc; - fprintf(stdout, "PRIMARY: init start %p\n", (void *) imb_get_errno); - - if (shm_create(&info_shm, is_pri, SHM_INFO_NAME, SHM_INFO_SIZE, NULL) != 0) + if (shm_create(&info_shm, is_pri, shm_info_uname, SHM_INFO_SIZE, NULL) != 0) { + free(shm_info_uname); + free(shm_data_uname); return -1; + } /* cast info shared memory onto info context structure */ ctx = (struct info_context *) info_shm.ptr; memset(ctx, 0, sizeof(*ctx)); - if (shm_create(&app_shm, is_pri, SHM_DATA_NAME, SHM_DATA_SIZE, NULL) != 0) { + if (shm_create(&app_shm, is_pri, shm_data_uname, SHM_DATA_SIZE, NULL) != 0) { (void) shm_destroy(&info_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -169,6 +236,8 @@ mp_primary(const char *name2) if (ctx->mb_mgr == NULL) { (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -178,6 +247,8 @@ mp_primary(const char *name2) if (alloc_crypto_op_data(ctx, &app_alloc, is_pri) != 0) { (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -185,6 +256,8 @@ mp_primary(const char *name2) if (prepare_reference_output(ctx, is_pri) != 0) { (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -196,6 +269,8 @@ mp_primary(const char *name2) &ctx->jobs_sent, ctx->exp_enc_key, 
ctx->iv, buffer_size) != 0) { (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -205,6 +280,8 @@ mp_primary(const char *name2) if (ctx->jobs_sent != IMB_DIM(ctx->buffer_table_in_out)) { (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -215,7 +292,35 @@ mp_primary(const char *name2) */ fprintf(stdout, "PRIMARY: starting SECONDARY process now\n"); - const int status = system(name2); + const size_t cmd_length = + strlen(name2) + 1 + strlen(shm_info_uname) + 1 + strlen(shm_data_uname) + 1; + char *cmd = malloc(cmd_length); + + if (cmd == NULL) { + (void) shm_destroy(&info_shm, is_pri); + (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); + return -1; + } + + memset(cmd, 0, cmd_length); + + const int cmd_length_ret = + snprintf(cmd, cmd_length, "%s %s %s", name2, shm_info_uname, shm_data_uname); + + if (cmd_length_ret >= (int) cmd_length || cmd_length_ret < 0) { + (void) shm_destroy(&info_shm, is_pri); + (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); + free(cmd); + return -1; + } + + const int status = system(cmd); + + free(cmd); #ifdef _WIN32 const int err = (status != EXIT_SUCCESS); @@ -230,6 +335,8 @@ mp_primary(const char *name2) fprintf(stdout, "MULTI-PROCESS TEST: FAILED\n"); (void) shm_destroy(&info_shm, is_pri); (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } @@ -251,11 +358,18 @@ mp_primary(const char *name2) /* clean up and exit */ if (shm_destroy(&info_shm, is_pri) != 0) { (void) shm_destroy(&app_shm, is_pri); + free(shm_info_uname); + free(shm_data_uname); return -1; } - if (shm_destroy(&app_shm, is_pri) != 0) + if (shm_destroy(&app_shm, is_pri) != 0) { + free(shm_info_uname); + free(shm_data_uname); return -1; + } + free(shm_info_uname); + 
free(shm_data_uname); return 0; } #endif /* _WIN32 || __linux__ || __FreeBSD__ */ diff --git a/test/mp-app/imb-mp-secondary.c b/test/mp-app/imb-mp-secondary.c index 7df2ff86..8d32f305 100644 --- a/test/mp-app/imb-mp-secondary.c +++ b/test/mp-app/imb-mp-secondary.c @@ -38,8 +38,10 @@ #if defined(__MINGW32__) static int -mp_secondary(void) +mp_secondary(const char *shm_info_uname, const char *shm_data_uname) { + (void) shm_info_uname; + (void) shm_data_uname; printf("Multi-Process test not executed.\n"); return 0; } @@ -53,15 +55,16 @@ mp_secondary(void) */ static int -mp_secondary(void) +mp_secondary(const char *shm_info_uname, const char *shm_data_uname) { const int is_pri = 0; struct shared_memory app_shm, info_shm; struct info_context *ctx = NULL; - fprintf(stdout, "SECONDARY: init start %p\n", (void *) imb_get_errno); + fprintf(stdout, "SECONDARY: init start %p, %s, %s\n", (void *) imb_get_errno, + shm_info_uname, shm_data_uname); - if (shm_create(&info_shm, is_pri, SHM_INFO_NAME, SHM_INFO_SIZE, NULL) != 0) + if (shm_create(&info_shm, is_pri, shm_info_uname, SHM_INFO_SIZE, NULL) != 0) return -1; /* cast info shared memory onto info context structure */ @@ -73,7 +76,7 @@ mp_secondary(void) return -1; } - if (shm_create(&app_shm, is_pri, SHM_DATA_NAME, SHM_DATA_SIZE, ctx->app_mmap) != 0) { + if (shm_create(&app_shm, is_pri, shm_data_uname, SHM_DATA_SIZE, ctx->app_mmap) != 0) { (void) shm_destroy(&info_shm, is_pri); return -1; } @@ -130,10 +133,13 @@ mp_secondary(void) int main(int argc, char **argv) { + int ret = -1; + (void) argc; (void) argv; - const int ret = mp_secondary(); + if (argc == 3) + ret = mp_secondary(argv[1], argv[2]); return (ret == 0) ? 
EXIT_SUCCESS : EXIT_FAILURE; } -- GitLab From 3b8083d40d58528c5c66b666684553c708d23992 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Wed, 26 Jun 2024 13:27:22 +0100 Subject: [PATCH 11/24] sse_t1: [SM3] fix incorrect register name define Signed-off-by: Marcel Cornu --- lib/sse_t1/sm3_base_hmac_sse.asm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sse_t1/sm3_base_hmac_sse.asm b/lib/sse_t1/sm3_base_hmac_sse.asm index 74882807..7af3693c 100644 --- a/lib/sse_t1/sm3_base_hmac_sse.asm +++ b/lib/sse_t1/sm3_base_hmac_sse.asm @@ -80,7 +80,7 @@ extern sm3_base_update %xdefine t1 gp1 %xdefine t2 gp2 %xdefine t3 gp3 -%xdefine t4 gp3 +%xdefine t4 gp4 %xdefine r1 gp12 %xdefine r2 gp11 -- GitLab From 5d1bac3e42527201ff1339321ef3caf9336758b5 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Mon, 1 Jul 2024 12:49:59 +0100 Subject: [PATCH 12/24] test: [mp-app] fix for ctest run on Windows with msbuild --- test/mp-app/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/mp-app/CMakeLists.txt b/test/mp-app/CMakeLists.txt index 839abc71..79a3af90 100644 --- a/test/mp-app/CMakeLists.txt +++ b/test/mp-app/CMakeLists.txt @@ -96,6 +96,12 @@ else() set(TEST_APP_BIN_DIR "${CMAKE_CURRENT_BINARY_DIR}") endif() +# append config type for multi-config generators +get_property(multi_config_gen GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) +if (multi_config_gen) + STRING(APPEND TEST_APP_BIN_DIR "/$,Debug,Release>") +endif() + add_test(NAME MULTIPROCESS COMMAND ${MP_APP_PRI} ${TEST_APP_BIN_DIR}/${MP_APP_SEC} WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) -- GitLab From 1d1437dea03330cd70a26aecbcf394e2f0a644fb Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Thu, 6 Jun 2024 14:44:00 +0000 Subject: [PATCH 13/24] lib: add CCM support to new AEAD burst API Signed-off-by: Pablo de Lara --- lib/avx2_t1/mb_mgr_avx2_t1.c | 4 + lib/avx2_t2/mb_mgr_avx2_t2.c | 4 + lib/avx2_t3/mb_mgr_avx2_t3.c | 4 + lib/avx2_t4/mb_mgr_avx2_t4.c | 4 + lib/avx512_t1/mb_mgr_avx512_t1.c | 
4 + lib/avx512_t2/mb_mgr_avx512_t2.c | 4 + lib/avx_t1/mb_mgr_avx_t1.c | 4 + lib/avx_t2/mb_mgr_avx_t2.c | 4 + lib/include/ipsec_ooo_mgr.h | 90 +++++++++++++++++++ lib/include/mb_mgr_burst.h | 135 +++++++++++++++++++++++++++++ lib/include/noaesni.h | 10 +++ lib/ipsec-mb.h | 35 ++++++++ lib/libIPSec_MB.def | 24 +++++ lib/no-aesni/mb_mgr_sse_no_aesni.c | 4 + lib/sse_t1/mb_mgr_sse_t1.c | 4 + lib/sse_t2/mb_mgr_sse_t2.c | 4 + lib/sse_t3/mb_mgr_sse_t3.c | 4 + 17 files changed, 342 insertions(+) diff --git a/lib/avx2_t1/mb_mgr_avx2_t1.c b/lib/avx2_t1/mb_mgr_avx2_t1.c index dc4e599e..5c8b0a3c 100644 --- a/lib/avx2_t1/mb_mgr_avx2_t1.c +++ b/lib/avx2_t1/mb_mgr_avx2_t1.c @@ -71,6 +71,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx2_t1 #define SUBMIT_HASH_BURST submit_hash_burst_avx2_t1 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx2_t1 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx2_t1 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx2_t1 #define SET_SUITE_ID_FN set_suite_id_avx2_t1 /* Hash */ @@ -372,6 +374,8 @@ init_mb_mgr_avx2_t1_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx2; diff --git a/lib/avx2_t2/mb_mgr_avx2_t2.c b/lib/avx2_t2/mb_mgr_avx2_t2.c index 8b899aa5..3cb93029 100644 --- a/lib/avx2_t2/mb_mgr_avx2_t2.c +++ b/lib/avx2_t2/mb_mgr_avx2_t2.c @@ -73,6 +73,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx2_t2 #define SUBMIT_HASH_BURST submit_hash_burst_avx2_t2 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx2_t2 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx2_t2 +#define SUBMIT_AEAD_BURST_NOCHECK 
submit_aead_burst_nocheck_avx2_t2 #define SET_SUITE_ID_FN set_suite_id_avx2_t2 /* Hash */ @@ -375,6 +377,8 @@ init_mb_mgr_avx2_t2_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx2; diff --git a/lib/avx2_t3/mb_mgr_avx2_t3.c b/lib/avx2_t3/mb_mgr_avx2_t3.c index a9a78f0e..16b6d20c 100644 --- a/lib/avx2_t3/mb_mgr_avx2_t3.c +++ b/lib/avx2_t3/mb_mgr_avx2_t3.c @@ -74,6 +74,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx2_t3 #define SUBMIT_HASH_BURST submit_hash_burst_avx2_t3 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx2_t3 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx2_t3 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx2_t3 #define SET_SUITE_ID_FN set_suite_id_avx2_t3 /* Hash */ @@ -375,6 +377,8 @@ init_mb_mgr_avx2_t3_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx2; diff --git a/lib/avx2_t4/mb_mgr_avx2_t4.c b/lib/avx2_t4/mb_mgr_avx2_t4.c index aa3779f6..77a6b791 100644 --- a/lib/avx2_t4/mb_mgr_avx2_t4.c +++ b/lib/avx2_t4/mb_mgr_avx2_t4.c @@ -75,6 +75,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx2_t4 #define SUBMIT_HASH_BURST submit_hash_burst_avx2_t4 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx2_t4 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx2_t4 
+#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx2_t4 #define SET_SUITE_ID_FN set_suite_id_avx2_t4 /* Hash */ @@ -376,6 +378,8 @@ init_mb_mgr_avx2_t4_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx2; diff --git a/lib/avx512_t1/mb_mgr_avx512_t1.c b/lib/avx512_t1/mb_mgr_avx512_t1.c index cd9625c9..813d1100 100644 --- a/lib/avx512_t1/mb_mgr_avx512_t1.c +++ b/lib/avx512_t1/mb_mgr_avx512_t1.c @@ -74,6 +74,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx512_t1 #define SUBMIT_HASH_BURST submit_hash_burst_avx512_t1 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx512_t1 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx512_t1 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx512_t1 #define SET_SUITE_ID_FN set_suite_id_avx512_t1 /* Hash */ @@ -468,6 +470,8 @@ init_mb_mgr_avx512_t1_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx512; diff --git a/lib/avx512_t2/mb_mgr_avx512_t2.c b/lib/avx512_t2/mb_mgr_avx512_t2.c index b09dbd61..ac375077 100644 --- a/lib/avx512_t2/mb_mgr_avx512_t2.c +++ b/lib/avx512_t2/mb_mgr_avx512_t2.c @@ -73,6 +73,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx512_t2 #define SUBMIT_HASH_BURST submit_hash_burst_avx512_t2 #define SUBMIT_HASH_BURST_NOCHECK 
submit_hash_burst_nocheck_avx512_t2 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx512_t2 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx512_t2 #define GET_NEXT_BURST get_next_burst_avx512_t2 #define SUBMIT_BURST submit_burst_avx512_t2 #define SUBMIT_BURST_NOCHECK submit_burst_nocheck_avx512_t2 @@ -475,6 +477,8 @@ init_mb_mgr_avx512_t2_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx512; diff --git a/lib/avx_t1/mb_mgr_avx_t1.c b/lib/avx_t1/mb_mgr_avx_t1.c index 73b4b33e..52ab3f6e 100644 --- a/lib/avx_t1/mb_mgr_avx_t1.c +++ b/lib/avx_t1/mb_mgr_avx_t1.c @@ -66,6 +66,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx_t1 #define SUBMIT_HASH_BURST submit_hash_burst_avx_t1 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx_t1 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx_t1 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx_t1 #define SET_SUITE_ID_FN set_suite_id_avx_t1 /* Hash */ @@ -371,6 +373,8 @@ init_mb_mgr_avx_t1_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx; diff --git a/lib/avx_t2/mb_mgr_avx_t2.c b/lib/avx_t2/mb_mgr_avx_t2.c index bf2aa688..3899e995 100644 --- a/lib/avx_t2/mb_mgr_avx_t2.c +++ b/lib/avx_t2/mb_mgr_avx_t2.c @@ -71,6 +71,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK 
submit_cipher_burst_nocheck_avx_t2 #define SUBMIT_HASH_BURST submit_hash_burst_avx_t2 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx_t2 +#define SUBMIT_AEAD_BURST submit_aead_burst_avx_t2 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_avx_t2 #define SET_SUITE_ID_FN set_suite_id_avx_t2 /* Hash */ @@ -376,6 +378,8 @@ init_mb_mgr_avx_t2_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_avx; diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 1af399e0..37e5948d 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -716,6 +716,96 @@ IMB_DLL_EXPORT uint32_t submit_hash_burst_nocheck_avx512_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_sse_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_sse_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_sse_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx_t2(IMB_MGR 
*state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx2_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx2_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_avx512_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); + +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_sse_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_sse_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_sse_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const 
IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx2_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx2_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_avx512_t2(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); + /* SSE TYPE1 manager functions */ IMB_DLL_EXPORT IMB_JOB * submit_job_sse_t1(IMB_MGR *state); diff --git a/lib/include/mb_mgr_burst.h b/lib/include/mb_mgr_burst.h index da761536..7d870920 100644 --- 
a/lib/include/mb_mgr_burst.h +++ b/lib/include/mb_mgr_burst.h @@ -36,6 +36,102 @@ #ifndef __aarch64__ __forceinline uint32_t +submit_aes_ccm_burst(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_KEY_SIZE_BYTES key_size, const int run_check, + const IMB_CIPHER_DIRECTION dir) +{ + uint32_t completed_jobs = 0; + MB_MGR_CCM_OOO *aes_ccm_ooo; + typedef IMB_JOB *(*submit_ccm_t)(MB_MGR_CCM_OOO *state, IMB_JOB *job); + submit_ccm_t submit_auth_ccm_fn; + typedef IMB_JOB *(*flush_ccm_t)(MB_MGR_CCM_OOO *state); + flush_ccm_t flush_auth_ccm_fn; + typedef IMB_JOB *(*aes_cntr_ccm_t)(IMB_JOB *job); + aes_cntr_ccm_t cntr_ccm_fn; + uint32_t i; + + if (key_size == IMB_KEY_128_BYTES) { + aes_ccm_ooo = state->aes_ccm_ooo; + submit_auth_ccm_fn = SUBMIT_JOB_AES128_CCM_AUTH; + flush_auth_ccm_fn = FLUSH_JOB_AES128_CCM_AUTH; + cntr_ccm_fn = AES_CNTR_CCM_128; + } else { + aes_ccm_ooo = state->aes256_ccm_ooo; + submit_auth_ccm_fn = SUBMIT_JOB_AES256_CCM_AUTH; + flush_auth_ccm_fn = FLUSH_JOB_AES256_CCM_AUTH; + cntr_ccm_fn = AES_CNTR_CCM_256; + } + + if (run_check) { + + /* validate jobs */ + for (i = 0; i < n_jobs; i++) { + IMB_JOB *job = &jobs[i]; + + /* validate job */ + if (is_job_invalid(state, job, IMB_CIPHER_CCM, IMB_AUTH_AES_CCM, dir, + key_size)) { + job->status = IMB_STATUS_INVALID_ARGS; + return 0; + } + } + } + + if (dir == IMB_DIR_ENCRYPT) { + /* First authenticate with AES-CMAC */ + /* submit all jobs */ + for (i = 0; i < n_jobs; i++) { + IMB_JOB *job = &jobs[i]; + + job = submit_auth_ccm_fn(aes_ccm_ooo, job); + if (job != NULL) + completed_jobs++; + } + /* flush any outstanding jobs */ + if (completed_jobs != n_jobs) + while (flush_auth_ccm_fn(aes_ccm_ooo) != NULL) + completed_jobs++; + + /* Then encrypt with AES-CTR */ + for (i = 0; i < n_jobs; i++) { + IMB_JOB *job = &jobs[i]; + + cntr_ccm_fn(job); + job->status = IMB_STATUS_COMPLETED; + } + } else { + /* First decrypt with AES-CTR */ + for (i = 0; i < n_jobs; i++) { + IMB_JOB *job = &jobs[i]; + + 
cntr_ccm_fn(job); + } + + /* Then authenticate with AES-CMAC */ + /* submit all jobs */ + for (i = 0; i < n_jobs; i++) { + IMB_JOB *job = &jobs[i]; + + job = submit_auth_ccm_fn(aes_ccm_ooo, job); + if (job != NULL) { + job->status = IMB_STATUS_COMPLETED; + completed_jobs++; + } + } + /* flush any outstanding jobs */ + if (completed_jobs != n_jobs) { + IMB_JOB *job = NULL; + + while ((job = flush_auth_ccm_fn(aes_ccm_ooo)) != NULL) { + job->status = IMB_STATUS_COMPLETED; + completed_jobs++; + } + } + } + + return completed_jobs; +} +__forceinline uint32_t submit_aes_cbc_burst_enc(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_KEY_SIZE_BYTES key_size, const int run_check) { @@ -432,6 +528,45 @@ SUBMIT_CIPHER_BURST_NOCHECK(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs } #ifndef __aarch64__ +__forceinline uint32_t +submit_aead_burst_and_check(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size, const int run_check) +{ + /* reset error status */ + imb_set_errno(state, 0); + + if (run_check) + if (jobs == NULL) { + imb_set_errno(state, IMB_ERR_NULL_BURST); + return 0; + } + + if (cipher == IMB_CIPHER_CCM) + return submit_aes_ccm_burst(state, jobs, n_jobs, key_size, run_check, dir); + + /* unsupported cipher mode */ + imb_set_errno(state, IMB_ERR_CIPH_MODE); + + return 0; +} + +uint32_t +SUBMIT_AEAD_BURST(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size) +{ + return submit_aead_burst_and_check(state, jobs, n_jobs, cipher, dir, key_size, 1); +} + +uint32_t +SUBMIT_AEAD_BURST_NOCHECK(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size) +{ + return submit_aead_burst_and_check(state, jobs, n_jobs, cipher, dir, key_size, 0); +} + __forceinline 
uint32_t submit_burst_hmac_sha_x(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const int run_check, const IMB_HASH_ALG hash_alg) diff --git a/lib/include/noaesni.h b/lib/include/noaesni.h index 326c72c1..2c8635ac 100644 --- a/lib/include/noaesni.h +++ b/lib/include/noaesni.h @@ -86,6 +86,16 @@ submit_hash_burst_sse_no_aesni(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_j IMB_DLL_EXPORT uint32_t submit_hash_burst_nocheck_sse_no_aesni(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); + +IMB_DLL_EXPORT uint32_t +submit_aead_burst_sse_no_aesni(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t +submit_aead_burst_nocheck_sse_no_aesni(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); + IMB_DLL_EXPORT void aes_keyexp_128_sse_no_aesni(const void *key, void *enc_exp_keys, void *dec_exp_keys); IMB_DLL_EXPORT void diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 3c893707..f54458db 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -1162,6 +1162,9 @@ typedef struct IMB_MGR { imb_self_test_cb_t self_test_cb_fn; void *self_test_cb_arg; + submit_cipher_burst_t submit_aead_burst; + submit_cipher_burst_t submit_aead_burst_nocheck; + /* in-order scheduler fields */ int earliest_job; /**< byte offset, -1 if none */ int next_job; /**< byte offset */ @@ -1741,6 +1744,38 @@ init_mb_mgr_auto(IMB_MGR *state, IMB_ARCH *arch); #define IMB_SUBMIT_HASH_BURST_NOCHECK(_mgr, _jobs, _n_jobs, _hash) \ ((_mgr)->submit_hash_burst_nocheck((_mgr), (_jobs), (_n_jobs), (_hash))) +/** + * Submit multiple cipher jobs to be processed synchronously after validating. 
+ * + * @param [in] _mgr Pointer to initialized IMB_MGR structure + * @param [in,out] _jobs Pointer to array of IMB_JOB structures + * @param [in] _n_jobs Number of jobs to process + * @param [in] _cipher Cipher algorithm of type #IMB_CIPHER_MODE + * @param [in] _dir Cipher direction of type #IMB_CIPHER_DIRECTION + * @param [in] _key_size Key size in bytes of type #IMB_KEY_SIZE_BYTES + * + * @return Number of completed jobs + */ +#define IMB_SUBMIT_AEAD_BURST(_mgr, _jobs, _n_jobs, _cipher, _dir, _key_size) \ + ((_mgr)->submit_aead_burst((_mgr), (_jobs), (_n_jobs), (_cipher), (_dir), (_key_size))) +/** + * Submit multiple cipher jobs to be processed synchronously without validating. + * + * This is more performant but less secure than IMB_SUBMIT_AEAD_BURST(). + * + * @param [in] _mgr Pointer to initialized IMB_MGR structure + * @param [in,out] _jobs Pointer to array of IMB_JOB structures + * @param [in] _n_jobs Number of jobs to process + * @param [in] _cipher Cipher algorithm of type #IMB_CIPHER_MODE + * @param [in] _dir Cipher direction of type #IMB_CIPHER_DIRECTION + * @param [in] _key_size Key size in bytes of type #IMB_KEY_SIZE_BYTES + * + * @return Number of completed jobs + */ +#define IMB_SUBMIT_AEAD_BURST_NOCHECK(_mgr, _jobs, _n_jobs, _cipher, _dir, _key_size) \ + ((_mgr)->submit_aead_burst_nocheck((_mgr), (_jobs), (_n_jobs), (_cipher), (_dir), \ + (_key_size))) + /* Key expansion and generation API's */ /** diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index 5ad31e45..af21809c 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -730,3 +730,27 @@ EXPORTS get_completed_job_avx2_t4 @704 imb_hash_burst_get_size @705 imb_get_arch_type_string @706 + submit_aead_burst_sse_t1 @707 + submit_aead_burst_sse_t2 @708 + submit_aead_burst_sse_t3 @709 + submit_aead_burst_sse_no_aesni @710 + submit_aead_burst_avx_t1 @711 + submit_aead_burst_avx_t2 @712 + submit_aead_burst_avx2_t1 @713 + submit_aead_burst_avx2_t2 @714 + submit_aead_burst_avx2_t3 @715 + 
submit_aead_burst_avx2_t4 @716 + submit_aead_burst_avx512_t1 @717 + submit_aead_burst_avx512_t2 @718 + submit_aead_burst_nocheck_sse_t1 @719 + submit_aead_burst_nocheck_sse_t2 @720 + submit_aead_burst_nocheck_sse_t3 @721 + submit_aead_burst_nocheck_sse_no_aesni @722 + submit_aead_burst_nocheck_avx_t1 @723 + submit_aead_burst_nocheck_avx_t2 @724 + submit_aead_burst_nocheck_avx2_t1 @725 + submit_aead_burst_nocheck_avx2_t2 @726 + submit_aead_burst_nocheck_avx2_t3 @727 + submit_aead_burst_nocheck_avx2_t4 @728 + submit_aead_burst_nocheck_avx512_t1 @729 + submit_aead_burst_nocheck_avx512_t2 @730 diff --git a/lib/no-aesni/mb_mgr_sse_no_aesni.c b/lib/no-aesni/mb_mgr_sse_no_aesni.c index aad7227f..364f2103 100644 --- a/lib/no-aesni/mb_mgr_sse_no_aesni.c +++ b/lib/no-aesni/mb_mgr_sse_no_aesni.c @@ -166,6 +166,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_sse_no_aesni #define SUBMIT_HASH_BURST submit_hash_burst_sse_no_aesni #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_sse_no_aesni +#define SUBMIT_AEAD_BURST submit_aead_burst_sse_no_aesni +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_sse_no_aesni #define SET_SUITE_ID_FN set_suite_id_sse_no_aesni #define SUBMIT_JOB_AES128_DEC submit_job_aes128_dec_sse_no_aesni @@ -351,6 +353,8 @@ init_mb_mgr_sse_no_aesni_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = submit_cipher_burst_nocheck_sse_no_aesni; state->submit_hash_burst = submit_hash_burst_sse_no_aesni; state->submit_hash_burst_nocheck = submit_hash_burst_nocheck_sse_no_aesni; + state->submit_aead_burst = submit_aead_burst_sse_no_aesni; + state->submit_aead_burst_nocheck = submit_aead_burst_nocheck_sse_no_aesni; state->submit_job_nocheck = submit_job_nocheck_sse_no_aesni; state->get_completed_job = get_completed_job_sse_no_aesni; state->flush_job = flush_job_sse_no_aesni; diff --git a/lib/sse_t1/mb_mgr_sse_t1.c b/lib/sse_t1/mb_mgr_sse_t1.c index ec76ba2e..1a1b38cd 100644 --- 
a/lib/sse_t1/mb_mgr_sse_t1.c +++ b/lib/sse_t1/mb_mgr_sse_t1.c @@ -69,6 +69,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_sse_t1 #define SUBMIT_HASH_BURST submit_hash_burst_sse_t1 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_sse_t1 +#define SUBMIT_AEAD_BURST submit_aead_burst_sse_t1 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_sse_t1 #define SET_SUITE_ID_FN set_suite_id_sse_t1 @@ -375,6 +377,8 @@ init_mb_mgr_sse_t1_internal(IMB_MGR *state, const int reset_mgrs) state->flush_burst = FLUSH_BURST; state->submit_cipher_burst = SUBMIT_CIPHER_BURST; state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; diff --git a/lib/sse_t2/mb_mgr_sse_t2.c b/lib/sse_t2/mb_mgr_sse_t2.c index 5976cfbc..2b5825a9 100644 --- a/lib/sse_t2/mb_mgr_sse_t2.c +++ b/lib/sse_t2/mb_mgr_sse_t2.c @@ -70,6 +70,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_sse_t2 #define SUBMIT_HASH_BURST submit_hash_burst_sse_t2 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_sse_t2 +#define SUBMIT_AEAD_BURST submit_aead_burst_sse_t2 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_sse_t2 #define SET_SUITE_ID_FN set_suite_id_sse_t2 /* Hash */ @@ -379,6 +381,8 @@ init_mb_mgr_sse_t2_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_sse; diff --git a/lib/sse_t3/mb_mgr_sse_t3.c b/lib/sse_t3/mb_mgr_sse_t3.c 
index 0d3fd02c..ea21e91e 100644 --- a/lib/sse_t3/mb_mgr_sse_t3.c +++ b/lib/sse_t3/mb_mgr_sse_t3.c @@ -71,6 +71,8 @@ #define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_sse_t3 #define SUBMIT_HASH_BURST submit_hash_burst_sse_t3 #define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_sse_t3 +#define SUBMIT_AEAD_BURST submit_aead_burst_sse_t3 +#define SUBMIT_AEAD_BURST_NOCHECK submit_aead_burst_nocheck_sse_t3 #define SET_SUITE_ID_FN set_suite_id_sse_t3 /* Hash */ @@ -380,6 +382,8 @@ init_mb_mgr_sse_t3_internal(IMB_MGR *state, const int reset_mgrs) state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; state->submit_hash_burst = SUBMIT_HASH_BURST; state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->submit_aead_burst = SUBMIT_AEAD_BURST; + state->submit_aead_burst_nocheck = SUBMIT_AEAD_BURST_NOCHECK; state->set_suite_id = SET_SUITE_ID_FN; state->keyexp_128 = aes_keyexp_128_sse; -- GitLab From 39398de20c8517f59277fd20e7708800caf10bb1 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 12 Jun 2024 13:38:25 +0100 Subject: [PATCH 14/24] test: extend CCM test to cover AEAD burst API Signed-off-by: Pablo de Lara --- test/kat-app/ccm_test.c | 203 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) diff --git a/test/kat-app/ccm_test.c b/test/kat-app/ccm_test.c index ca3a347d..4e570908 100644 --- a/test/kat-app/ccm_test.c +++ b/test/kat-app/ccm_test.c @@ -144,6 +144,145 @@ ccm_job_ok(const struct aead_test *vec, const struct IMB_JOB *job, const uint8_t return 1; } +static int +test_ccm_aead_burst(struct IMB_MGR *mb_mgr, const struct aead_test *vec, const int dir, + const int in_place, const int num_jobs, const uint64_t key_length) +{ + DECLARE_ALIGNED(uint32_t expkey[4 * 15], 16); + DECLARE_ALIGNED(uint32_t dust[4 * 15], 16); + struct IMB_JOB *job, jobs[IMB_MAX_BURST_SIZE]; + uint8_t padding[16]; + uint8_t **targets = malloc(num_jobs * sizeof(void *)); + uint8_t **auths = malloc(num_jobs * sizeof(void *)); + 
int i, completed_jobs, jobs_rx = 0, ret = -1; + const int order = (dir == IMB_DIR_ENCRYPT) ? IMB_ORDER_HASH_CIPHER : IMB_ORDER_CIPHER_HASH; + + if (targets == NULL || auths == NULL) { + fprintf(stderr, "Can't allocate buffer memory\n"); + goto end2; + } + + memset(padding, -1, sizeof(padding)); + memset(targets, 0, num_jobs * sizeof(void *)); + memset(auths, 0, num_jobs * sizeof(void *)); + + for (i = 0; i < num_jobs; i++) { + targets[i] = malloc(vec->msgSize / 8 + (sizeof(padding) * 2)); + auths[i] = malloc(16 + (sizeof(padding) * 2)); + if (targets[i] == NULL || auths[i] == NULL) { + fprintf(stderr, "Can't allocate buffer memory\n"); + goto end; + } + + memset(targets[i], -1, vec->msgSize / 8 + (sizeof(padding) * 2)); + memset(auths[i], -1, 16 + (sizeof(padding) * 2)); + + if (in_place) { + if (dir == IMB_DIR_ENCRYPT) + memcpy(targets[i] + sizeof(padding), (const void *) vec->msg, + vec->msgSize / 8); + else + memcpy(targets[i] + sizeof(padding), (const void *) vec->ct, + vec->msgSize / 8); + } + } + + if (key_length == 16) + IMB_AES_KEYEXP_128(mb_mgr, vec->key, expkey, dust); + else + IMB_AES_KEYEXP_256(mb_mgr, vec->key, expkey, dust); + + for (i = 0; i < num_jobs; i++) { + job = &jobs[i]; + job->cipher_direction = dir; + job->chain_order = order; + if (in_place) { + job->dst = targets[i] + sizeof(padding) + vec->aadSize / 8; + job->src = targets[i] + sizeof(padding); + } else { + if (dir == IMB_DIR_ENCRYPT) { + job->dst = targets[i] + sizeof(padding); + job->src = (const void *) vec->msg; + } else { + job->dst = targets[i] + sizeof(padding); + job->src = (const void *) vec->ct; + } + } + job->cipher_mode = IMB_CIPHER_CCM; + job->enc_keys = expkey; + job->dec_keys = expkey; + job->key_len_in_bytes = key_length; + job->iv = (const void *) vec->iv; + job->iv_len_in_bytes = vec->ivSize / 8; + job->cipher_start_src_offset_in_bytes = vec->aadSize / 8; + job->msg_len_to_cipher_in_bytes = vec->msgSize / 8 - vec->aadSize / 8; + + job->hash_alg = IMB_AUTH_AES_CCM; + 
job->hash_start_src_offset_in_bytes = vec->aadSize / 8; + job->msg_len_to_hash_in_bytes = vec->msgSize / 8 - vec->aadSize / 8; + job->auth_tag_output = auths[i] + sizeof(padding); + job->auth_tag_output_len_in_bytes = vec->tagSize / 8; + + job->u.CCM.aad_len_in_bytes = vec->aadSize / 8; + job->u.CCM.aad = job->src; + + job->user_data = targets[i]; + job->user_data2 = auths[i]; + } + + completed_jobs = + IMB_SUBMIT_AEAD_BURST(mb_mgr, jobs, num_jobs, IMB_CIPHER_CCM, dir, key_length); + if (completed_jobs != num_jobs) { + int err = imb_get_errno(mb_mgr); + + if (err != 0) { + printf("submit_burst error %d : '%s'\n", err, imb_get_strerror(err)); + goto end; + } else { + printf("submit_burst error: not enough " + "jobs returned!\n"); + goto end; + } + } + + for (i = 0; i < num_jobs; i++) { + job = &jobs[i]; + + if (job->status != IMB_STATUS_COMPLETED) { + printf("job %d status not complete!\n", i + 1); + goto end; + } + + jobs_rx++; + if (!ccm_job_ok(vec, job, job->user_data, padding, job->user_data2, sizeof(padding), + dir, in_place)) + goto end; + } + + if (jobs_rx != num_jobs) { + printf("Expected %d jobs, received %d\n", num_jobs, jobs_rx); + goto end; + } + ret = 0; + +end: + for (i = 0; i < num_jobs; i++) { + if (targets[i] != NULL) + free(targets[i]); + if (auths[i] != NULL) + free(auths[i]); + } + +end2: + if (targets != NULL) + free(targets); + + if (auths != NULL) + free(auths); + + return ret; +} + static int test_ccm(struct IMB_MGR *mb_mgr, const struct aead_test *vec, const int dir, const int in_place, const int num_jobs, const uint64_t key_length) @@ -327,6 +466,38 @@ test_ccm_128_std_vectors(struct IMB_MGR *mb_mgr, struct test_suite_context *ctx, } else { test_suite_update(ctx, 1, 0); } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_ENCRYPT, 1, num_jobs, + IMB_KEY_128_BYTES)) { + printf("error #%zu encrypt in-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, 
v, IMB_DIR_DECRYPT, 1, num_jobs, + IMB_KEY_128_BYTES)) { + printf("error #%zu decrypt in-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_ENCRYPT, 0, num_jobs, + IMB_KEY_128_BYTES)) { + printf("error #%zu encrypt out-of-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_DECRYPT, 0, num_jobs, + IMB_KEY_128_BYTES)) { + printf("error #%zu decrypt out-of-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } } if (!quiet_mode) printf("\n"); @@ -379,6 +550,38 @@ test_ccm_256_std_vectors(struct IMB_MGR *mb_mgr, struct test_suite_context *ctx, } else { test_suite_update(ctx, 1, 0); } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_ENCRYPT, 1, num_jobs, + IMB_KEY_256_BYTES)) { + printf("error #%zu encrypt in-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_DECRYPT, 1, num_jobs, + IMB_KEY_256_BYTES)) { + printf("error #%zu decrypt in-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_ENCRYPT, 0, num_jobs, + IMB_KEY_256_BYTES)) { + printf("error #%zu encrypt out-of-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } + + if (test_ccm_aead_burst(mb_mgr, v, IMB_DIR_DECRYPT, 0, num_jobs, + IMB_KEY_256_BYTES)) { + printf("error #%zu decrypt out-of-place (aead burst)\n", v->tcId); + test_suite_update(ctx, 0, 1); + } else { + test_suite_update(ctx, 1, 0); + } } if (!quiet_mode) printf("\n"); -- GitLab From ec379286afe8d808e4ba98b7a2acdc56af2ce815 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 12 Jun 2024 13:40:33 +0100 Subject: 
[PATCH 15/24] lib: add API to retrieve minimum burst size for cipher-only burst API Function added to retrieve the minimum burst size to be used in the cipher-only burst API (IMB_SUBMIT_CIPHER_BURST) to get optimal performance, based on the implementation used. If a cipher algorithm is not supported, it returns an error. Signed-off-by: Pablo de Lara --- lib/include/ipsec_ooo_mgr.h | 1 + lib/include/mb_mgr_datastruct.inc | 1 + lib/ipsec-mb.h | 22 ++++++++++++++++++++++ lib/libIPSec_MB.def | 1 + lib/x86_64/capabilities.c | 20 ++++++++++++++++++++ lib/x86_64/ooo_mgr_reset.c | 2 ++ 6 files changed, 47 insertions(+) diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 37e5948d..65e40fec 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -215,6 +215,7 @@ typedef struct { IMB_JOB *job_in_lane[16]; uint64_t num_lanes_inuse; DECLARE_ALIGNED(uint64_t lens64[16], 64); + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_AES_OOO; diff --git a/lib/include/mb_mgr_datastruct.inc b/lib/include/mb_mgr_datastruct.inc index 3f4c0085..1b836c8d 100644 --- a/lib/include/mb_mgr_datastruct.inc +++ b/lib/include/mb_mgr_datastruct.inc @@ -62,6 +62,7 @@ FIELD _aes_unused_lanes, 8, 8 FIELD _aes_job_in_lane, 16*8, 8 FIELD _aes_lanes_in_use, 8, 8 FIELD _aes_lens_64, 16*8, 64 +FIELD _aes_total_num_lanes, 4, 4 FIELD _aes_road_block, 8, 8 END_FIELDS %assign _MB_MGR_AES_OOO_size _FIELD_OFFSET diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index f54458db..ddbb3f89 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -4092,6 +4092,28 @@ imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const cha IMB_DLL_EXPORT int imb_hash_burst_get_size(const IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned *out_burst_size); +/** + * @brief Retrieves minimum burst size for good performance on cipher algorithms. 
+ * + * Depending on the architecture used, this function returns the minimum + * burst size to be used for good performance on the cipher-only burst API. + * The output burst size can be 1 (in case of a synchronous single-buffer implementation) + * or 0 if the algorithm is not supported by the API. + * + * @param [in] mb_mgr pointer to IMB MGR structure + * @param [in] cipher_mode cipher mode + * @param [out] out_burst_size pointer to store min burst size + * + * @return operation status. + * @retval 0 success + * @retval IMB_ERR_CIPH_MODE not supported \a cipher_mode + * @retval IMB_ERR_NULL_MBMGR invalid \a mb_mgr pointer + * @retval IMB_ERR_NULL_BURST invalid \a out_burst_size pointer + */ +IMB_DLL_EXPORT int +imb_cipher_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mode, + unsigned *out_burst_size); + #ifdef __cplusplus } #endif diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index af21809c..9ca7091c 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -754,3 +754,4 @@ EXPORTS submit_aead_burst_nocheck_avx2_t4 @728 submit_aead_burst_nocheck_avx512_t1 @729 submit_aead_burst_nocheck_avx512_t2 @730 + imb_cipher_burst_get_size @731 diff --git a/lib/x86_64/capabilities.c b/lib/x86_64/capabilities.c index a515fe9d..df97686f 100644 --- a/lib/x86_64/capabilities.c +++ b/lib/x86_64/capabilities.c @@ -85,6 +85,26 @@ imb_hash_burst_get_size(const IMB_MGR *mb_mgr, const IMB_HASH_ALG algo, unsigned return 0; } +int +imb_cipher_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mode, + unsigned *out_burst_size) +{ + switch (cipher_mode) { + case IMB_CIPHER_ECB: + case IMB_CIPHER_CNTR: + *out_burst_size = 1; + break; + case IMB_CIPHER_CBC: + *out_burst_size = ((MB_MGR_AES_OOO *) (mb_mgr->aes128_ooo))->total_num_lanes; + break; + default: + *out_burst_size = 0; + return IMB_ERR_CIPH_MODE; + } + + return 0; +} + int imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const char **description) { diff --git 
a/lib/x86_64/ooo_mgr_reset.c index 4894fea8..262a3a04 100644 --- a/lib/x86_64/ooo_mgr_reset.c +++ b/lib/x86_64/ooo_mgr_reset.c @@ -41,6 +41,8 @@ ooo_mgr_aes_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_AES_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; + if (num_lanes == 4) p_mgr->unused_lanes = 0xF3210; else if (num_lanes == 8) -- GitLab From 6bab3ce5bd4dd4858e49ce8d4e6d9a76a1e85914 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 12 Jun 2024 13:43:23 +0100 Subject: [PATCH 16/24] lib: add API to retrieve support of aead burst API Function added to return if an AEAD algorithm is supported by the AEAD burst API. Signed-off-by: Pablo de Lara --- lib/include/ipsec_ooo_mgr.h | 1 + lib/include/mb_mgr_datastruct.inc | 1 + lib/ipsec-mb.h | 22 ++++++++++++++++++++++ lib/libIPSec_MB.def | 1 + lib/x86_64/capabilities.c | 13 +++++++++++++ lib/x86_64/ooo_mgr_reset.c | 2 ++ 6 files changed, 40 insertions(+) diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 65e40fec..4b67cadd 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -266,6 +266,7 @@ typedef struct { DECLARE_ALIGNED(IMB_JOB *job_in_lane[16], 16); uint64_t num_lanes_inuse; DECLARE_ALIGNED(uint8_t init_blocks[16 * (4 * 16)], 64); + uint32_t total_num_lanes; uint64_t road_block; } MB_MGR_CCM_OOO; diff --git a/lib/include/mb_mgr_datastruct.inc b/lib/include/mb_mgr_datastruct.inc index 1b836c8d..800fbcce 100644 --- a/lib/include/mb_mgr_datastruct.inc +++ b/lib/include/mb_mgr_datastruct.inc @@ -179,6 +179,7 @@ FIELD _aes_ccm_unused_lanes, 8, 8 FIELD _aes_ccm_job_in_lane, 16*8, 16 FIELD _aes_ccm_num_lanes_inuse, 8, 8 FIELD _aes_ccm_init_blocks, 16*4*16, 64 +FIELD _aes_ccm_total_num_lanes, 4, 4 FIELD _aes_ccm_road_block, 8, 8 END_FIELDS %assign _MB_MGR_CCM_OOO_size _FIELD_OFFSET diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index ddbb3f89..6985c7a9 
100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -4114,6 +4114,28 @@ IMB_DLL_EXPORT int imb_cipher_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mode, unsigned *out_burst_size); +/** + * @brief Retrieves minimum burst size for good performance on AEAD algorithms. + * + * Depending on the architecture used, this function returns the minimum + * burst size to be used for good performance on the AEAD burst API. + * The output burst size can be 1 (in case of a synchronous single-buffer implementation) + * or 0 if the algorithm is not supported by the API. + * + * @param [in] mb_mgr pointer to IMB MGR structure + * @param [in] cipher_mode cipher mode + * @param [out] out_burst_size pointer to store min burst size + * + * @return operation status. + * @retval 0 success + * @retval IMB_ERR_CIPH_MODE not supported \a cipher_mode + * @retval IMB_ERR_NULL_MBMGR invalid \a mb_mgr pointer + * @retval IMB_ERR_NULL_BURST invalid \a out_burst_size pointer + */ +IMB_DLL_EXPORT int +imb_aead_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mode, + unsigned *out_burst_size); + #ifdef __cplusplus } #endif diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index 9ca7091c..9aa9b86b 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -755,3 +755,4 @@ EXPORTS submit_aead_burst_nocheck_avx512_t1 @729 submit_aead_burst_nocheck_avx512_t2 @730 imb_cipher_burst_get_size @731 + imb_aead_burst_get_size @732 diff --git a/lib/x86_64/capabilities.c b/lib/x86_64/capabilities.c index df97686f..740824dc 100644 --- a/lib/x86_64/capabilities.c +++ b/lib/x86_64/capabilities.c @@ -105,6 +105,19 @@ imb_cipher_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mo return 0; } +int +imb_aead_burst_get_size(const IMB_MGR *mb_mgr, const IMB_CIPHER_MODE cipher_mode, + unsigned *out_burst_size) +{ + if (cipher_mode == IMB_CIPHER_CCM) { + *out_burst_size = ((MB_MGR_CCM_OOO *) (mb_mgr->aes_ccm_ooo))->total_num_lanes; + return 0; + } else { + 
*out_burst_size = 0; + return IMB_ERR_CIPH_MODE; + } +} + int imb_get_arch_type_string(const IMB_MGR *state, const char **arch_type, const char **description) { diff --git a/lib/x86_64/ooo_mgr_reset.c b/lib/x86_64/ooo_mgr_reset.c index 262a3a04..098d0504 100644 --- a/lib/x86_64/ooo_mgr_reset.c +++ b/lib/x86_64/ooo_mgr_reset.c @@ -98,6 +98,8 @@ ooo_mgr_ccm_reset(void *p_ooo_mgr, const unsigned num_lanes) memset(p_mgr, 0, offsetof(MB_MGR_CCM_OOO, road_block)); memset(p_mgr->lens, 0xff, sizeof(p_mgr->lens)); + p_mgr->total_num_lanes = num_lanes; + if (num_lanes == 4) p_mgr->unused_lanes = 0xF3210; else if (num_lanes == 8) -- GitLab From 3d4d567bb7c85539e1079b5fea15967b37504fc5 Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 12 Jun 2024 13:52:39 +0100 Subject: [PATCH 17/24] perf: use new API retrieving cipher algo support in burst API Signed-off-by: Pablo de Lara --- perf/ipsec_perf.c | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index b39cafbb..6f4df75b 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -4125,16 +4125,8 @@ main(int argc, char *argv[]) if (test_api != TEST_API_JOB && burst_size == 0) burst_size = DEFAULT_BURST_SIZE; - /* currently only AES-CBC & CTR supported by cipher-only burst API */ - if (test_api == TEST_API_CIPHER_BURST && (custom_job_params.cipher_mode != TEST_CBC && - custom_job_params.cipher_mode != TEST_CNTR)) { - fprintf(stderr, "Unsupported cipher-only burst " - "API algorithm selected\n"); - return EXIT_FAILURE; - } - - /* only a few algorithms support the hash-only burst API */ - if (test_api == TEST_API_HASH_BURST) { + /* only a few algorithms support the hash/cipher-only burst API */ + if (test_api == TEST_API_HASH_BURST || test_api == TEST_API_CIPHER_BURST) { uint32_t optim_burst_size; IMB_MGR *aux_mgr = alloc_mb_mgr(0); @@ -4144,11 +4136,24 @@ main(int argc, char *argv[]) } init_mb_mgr_auto(aux_mgr, NULL); - if 
(imb_hash_burst_get_size(aux_mgr, translate_hash_alg(custom_job_params.hash_alg), - &optim_burst_size) == IMB_ERR_HASH_ALGO) { - fprintf(stderr, "Unsupported hash-only burst API algorithm selected\n"); - free_mb_mgr(aux_mgr); - return EXIT_FAILURE; + if (test_api == TEST_API_HASH_BURST) { + if (imb_hash_burst_get_size(aux_mgr, + translate_hash_alg(custom_job_params.hash_alg), + &optim_burst_size) == IMB_ERR_HASH_ALGO) { + fprintf(stderr, + "Unsupported hash-only burst API algorithm selected\n"); + free_mb_mgr(aux_mgr); + return EXIT_FAILURE; + } + } else if (test_api == TEST_API_CIPHER_BURST) { + if (imb_cipher_burst_get_size( + aux_mgr, translate_cipher_mode(custom_job_params.cipher_mode), + &optim_burst_size) == IMB_ERR_CIPH_MODE) { + fprintf(stderr, + "Unsupported cipher-only burst API algorithm selected\n"); + free_mb_mgr(aux_mgr); + return EXIT_FAILURE; + } } if (optim_burst_size > burst_size) -- GitLab From 558d45b058cfd891c259a4234441579a2d2358bd Mon Sep 17 00:00:00 2001 From: Pablo de Lara Date: Wed, 12 Jun 2024 14:23:12 +0100 Subject: [PATCH 18/24] perf: add AEAD-only burst API support Signed-off-by: Pablo de Lara --- perf/ipsec_perf.c | 89 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 4 deletions(-) diff --git a/perf/ipsec_perf.c b/perf/ipsec_perf.c index 6f4df75b..2819bcfe 100644 --- a/perf/ipsec_perf.c +++ b/perf/ipsec_perf.c @@ -771,6 +771,7 @@ typedef enum { TEST_API_BURST, TEST_API_CIPHER_BURST, TEST_API_HASH_BURST, + TEST_API_AEAD_BURST, TEST_API_DIRECT, TEST_API_QUIC, TEST_API_NUMOF @@ -2444,6 +2445,75 @@ do_test(IMB_MGR *mb_mgr, struct params_s *params, const uint32_t num_iter, uint8 } jobs_done = num_iter - num_jobs; + /* test AEAD burst api */ + } else if (test_api == TEST_API_AEAD_BURST) { + IMB_JOB jobs[MAX_BURST_SIZE]; + IMB_JOB *jt = &job_template; + uint32_t num_jobs = num_iter; + uint32_t list_idx; + + while (num_jobs && timebox_on) { + uint32_t n_jobs = (num_jobs / burst_size) ? 
burst_size : num_jobs; + + /* set all job params */ + for (i = 0; i < n_jobs; i++) { + job = &jobs[i]; + + /* If IMIX testing is being done, set the buffer + * size to cipher going through the + * list of sizes precalculated */ + if (imix_list_count != 0) { + list_idx = i & (JOB_SIZE_IMIX_LIST - 1); + job->msg_len_to_cipher_in_bytes = + cipher_size_list[list_idx]; + } else + job->msg_len_to_cipher_in_bytes = + jt->msg_len_to_cipher_in_bytes; + + job->src = get_src_buffer(index, p_buffer); + job->dst = get_dst_buffer(index, p_buffer); + job->enc_keys = job->dec_keys = + (const uint32_t *) get_key_pointer(index, p_keys); + job->cipher_start_src_offset_in_bytes = + jt->cipher_start_src_offset_in_bytes; + job->iv = jt->iv; + job->iv_len_in_bytes = jt->iv_len_in_bytes; + job->msg_len_to_hash_in_bytes = jt->msg_len_to_hash_in_bytes; + job->hash_start_src_offset_in_bytes = + jt->hash_start_src_offset_in_bytes; + job->auth_tag_output_len_in_bytes = + jt->auth_tag_output_len_in_bytes; + job->auth_tag_output = jt->auth_tag_output; + if (jt->cipher_mode == IMB_CIPHER_CCM) { + job->u.CCM.aad_len_in_bytes = aad_size; + job->u.CCM.aad = job->src; + } + + index = get_next_index(index); + } + /* submit AEAD burst */ +#ifdef DEBUG + const uint32_t completed_jobs = + IMB_SUBMIT_AEAD_BURST(mb_mgr, jobs, n_jobs, jt->cipher_mode, + jt->cipher_direction, jt->key_len_in_bytes); + + if (completed_jobs != n_jobs) { + const int err = imb_get_errno(mb_mgr); + + if (err != 0) { + printf("submit_aead_burst error " + "%d : '%s'\n", + err, imb_get_strerror(err)); + } + } +#else + IMB_SUBMIT_AEAD_BURST_NOCHECK(mb_mgr, jobs, n_jobs, jt->cipher_mode, + jt->cipher_direction, jt->key_len_in_bytes); +#endif + num_jobs -= n_jobs; + } + jobs_done = num_iter - num_jobs; + } else { /* TEST_API_JOB */ imb_set_session(mb_mgr, &job_template); @@ -4076,6 +4146,8 @@ main(int argc, char *argv[]) test_api = TEST_API_CIPHER_BURST; } else if (strcmp(argv[i], "--hash-burst-api") == 0) { test_api = 
TEST_API_HASH_BURST; + } else if (strcmp(argv[i], "--aead-burst-api") == 0) { + test_api = TEST_API_AEAD_BURST; } else if (strcmp(argv[i], "--burst-size") == 0) { i = get_next_num_arg((const char *const *) argv, i, argc, &burst_size, sizeof(burst_size)); @@ -4117,16 +4189,17 @@ main(int argc, char *argv[]) if (burst_size != 0 && test_api == TEST_API_JOB) { fprintf(stderr, "--burst-size can only be used with " - "--burst-api, --cipher-burst-api or " - "--hash-burst-api options\n"); + "--burst-api, --cipher-burst-api, " + "--hash-burst-api or --aead-burst-api options\n"); return EXIT_FAILURE; } if (test_api != TEST_API_JOB && burst_size == 0) burst_size = DEFAULT_BURST_SIZE; - /* only a few algorithms support the hash/cipher-only burst API */ - if (test_api == TEST_API_HASH_BURST || test_api == TEST_API_CIPHER_BURST) { + /* only a few algorithms support the hash-only/cipher-only/AEAD burst API */ + if (test_api == TEST_API_HASH_BURST || test_api == TEST_API_CIPHER_BURST || + test_api == TEST_API_AEAD_BURST) { uint32_t optim_burst_size; IMB_MGR *aux_mgr = alloc_mb_mgr(0); @@ -4154,6 +4227,14 @@ main(int argc, char *argv[]) free_mb_mgr(aux_mgr); return EXIT_FAILURE; } + } else { /* AEAD */ + if (imb_aead_burst_get_size( + aux_mgr, translate_cipher_mode(custom_job_params.cipher_mode), + &optim_burst_size) == IMB_ERR_CIPH_MODE) { + fprintf(stderr, "Unsupported AEAD burst API algorithm selected\n"); + free_mb_mgr(aux_mgr); + return EXIT_FAILURE; + } } if (optim_burst_size > burst_size) -- GitLab From 1d036ba6551e1882f5ad3c912bf3777a14c70568 Mon Sep 17 00:00:00 2001 From: "Lipinska, Kamila" Date: Tue, 16 Jul 2024 14:33:01 +0200 Subject: [PATCH 19/24] test: Rename AES-CFB single block tests --- test/kat-app/Makefile | 4 ++-- .../{aes_cfb_test.c => aes_cfb_one_block_test.c} | 16 ++++++++-------- ...test.json.c => aes_cfb_one_block_test.json.c} | 2 +- test/kat-app/main.c | 4 ++-- test/kat-app/win_x64.mak | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) rename 
test/kat-app/{aes_cfb_test.c => aes_cfb_one_block_test.c} (92%) rename test/kat-app/{aes_cfb_test.json.c => aes_cfb_one_block_test.json.c} (99%) diff --git a/test/kat-app/Makefile b/test/kat-app/Makefile index 0729b40a..9b9f77ec 100644 --- a/test/kat-app/Makefile +++ b/test/kat-app/Makefile @@ -40,8 +40,8 @@ SOURCES := main.c gcm_test.c ctr_test.c customop_test.c des_test.c ccm_test.c \ chacha20_poly1305_test.c null_test.c snow_v_test.c direct_api_param_test.c quic_ecb_test.c \ hmac_sha1.json.c hmac_sha224.json.c hmac_sha256.json.c hmac_sha384.json.c hmac_sha512.json.c \ hmac_md5.json.c gmac_test.json.c ghash_test.c ghash_test.json.c poly1305_test.json.c \ - cmac_test.json.c xcbc_test.json.c sha_test.json.c aes_cbcs_test.json.c gmac_test.c aes_cfb_test.c \ - ecb_test.json.c aes_cfb_test.json.c aes_cbc_test.c aes_cbc_test.json.c ctr_test.json.c \ + cmac_test.json.c xcbc_test.json.c sha_test.json.c aes_cbcs_test.json.c gmac_test.c aes_cfb_one_block_test.c \ + ecb_test.json.c aes_cfb_one_block_test.json.c aes_cbc_test.c aes_cbc_test.json.c ctr_test.json.c \ des_test.json.c chacha_test.json.c gcm_test.json.c ccm_test.json.c quic_chacha20_test.c \ chacha20_poly1305_test.json.c snow3g_test_f8_vectors.json.c snow3g_test_f9_vectors.json.c \ sm4_ecb_test.c sm4_ecb_test.json.c sm4_cbc_test.c sm4_cbc_test.json.c sm3_test.c \ diff --git a/test/kat-app/aes_cfb_test.c b/test/kat-app/aes_cfb_one_block_test.c similarity index 92% rename from test/kat-app/aes_cfb_test.c rename to test/kat-app/aes_cfb_one_block_test.c index 53b860c8..9466eff2 100644 --- a/test/kat-app/aes_cfb_test.c +++ b/test/kat-app/aes_cfb_one_block_test.c @@ -37,16 +37,16 @@ #include "cipher_test.h" int -cfb_test(struct IMB_MGR *mb_mgr); +cfb_one_block_test(struct IMB_MGR *mb_mgr); -extern const struct cipher_test cfb_test_json[]; +extern const struct cipher_test cfb_one_block_test_json[]; static int cfb_validate_ok(const uint8_t *output, const uint8_t *in_text, const size_t plen, const uint32_t klen, const 
unsigned i, const unsigned is_enc, const int in_place) { if (memcmp(output, in_text, plen) != 0) { - printf("\nAES-CFB%s standard test vector %u %s (%s): fail\n", + printf("\nAES-CFB-ONE%s standard test vector %u %s (%s): fail\n", (klen == 16) ? "128" : "256", i + 1, (is_enc) ? "encrypt" : "decrypt", (in_place) ? "in-place" : "out-of-place"); return 0; @@ -129,14 +129,14 @@ static void cfb_test_vectors(struct IMB_MGR *mb_mgr, struct test_suite_context *ctx128, struct test_suite_context *ctx256) { - const struct cipher_test *v = cfb_test_json; + const struct cipher_test *v = cfb_one_block_test_json; for (; v->msg != NULL; v++) { struct test_suite_context *ctx; if (!quiet_mode) { #ifdef DEBUG - printf("AES-CFB Test Case %zu key_len:%zu\n", v->tcId, v->keySize); + printf("AES-CFB-ONE Test Case %zu key_len:%zu\n", v->tcId, v->keySize); #else printf("."); #endif @@ -156,14 +156,14 @@ cfb_test_vectors(struct IMB_MGR *mb_mgr, struct test_suite_context *ctx128, } int -cfb_test(struct IMB_MGR *mb_mgr) +cfb_one_block_test(struct IMB_MGR *mb_mgr) { int errors = 0; struct test_suite_context ctx128; struct test_suite_context ctx256; - test_suite_start(&ctx128, "AES-CFB-128"); - test_suite_start(&ctx256, "AES-CFB-256"); + test_suite_start(&ctx128, "AES-CFB-128 ONE-BLOCK"); + test_suite_start(&ctx256, "AES-CFB-256 ONE-BLOCK"); cfb_test_vectors(mb_mgr, &ctx128, &ctx256); errors += test_suite_end(&ctx128); errors += test_suite_end(&ctx256); diff --git a/test/kat-app/aes_cfb_test.json.c b/test/kat-app/aes_cfb_one_block_test.json.c similarity index 99% rename from test/kat-app/aes_cfb_test.json.c rename to test/kat-app/aes_cfb_one_block_test.json.c index 3ee167cc..342dfb32 100644 --- a/test/kat-app/aes_cfb_test.json.c +++ b/test/kat-app/aes_cfb_one_block_test.json.c @@ -27,7 +27,7 @@ /* CFB */ #include "cipher_test.h" -const struct cipher_test cfb_test_json[] = { +const struct cipher_test cfb_one_block_test_json[] = { /* Vectors from CM-SP-SECv3.1-I06-160602 section I.10.2 */ { 128, 
128, 1, "\x01\x23\x45\x67\x89\xab\xcd\xef\x01\x23\x45\x67\x89\xab\xcd\xef", "\x12\x34\x56\x78\x90\xab\xcd\xef\x12\x34\x56\x78\x90\xab\xcd\xef", diff --git a/test/kat-app/main.c b/test/kat-app/main.c index a8ab2452..e2d3fdf5 100644 --- a/test/kat-app/main.c +++ b/test/kat-app/main.c @@ -102,7 +102,7 @@ ghash_test(struct IMB_MGR *mb_mgr); extern int cbc_test(struct IMB_MGR *mb_mgr); extern int -cfb_test(struct IMB_MGR *mb_mgr); +cfb_one_block_test(struct IMB_MGR *mb_mgr); extern int ctr_test(struct IMB_MGR *mb_mgr); extern int @@ -135,7 +135,7 @@ struct imb_test tests[] = { { .str = "KAT", .fn = known_answer_test, .enabled = 1 }, { .str = "DO_TEST", .fn = do_test, .enabled = 1 }, { .str = "CBC", .fn = cbc_test, .enabled = 1 }, - { .str = "CFB", .fn = cfb_test, .enabled = 1 }, + { .str = "CFB", .fn = cfb_one_block_test, .enabled = 1 }, { .str = "CTR", .fn = ctr_test, .enabled = 1 }, { .str = "PON", .fn = pon_test, .enabled = 1 }, { .str = "XCBC", .fn = xcbc_test, .enabled = 1 }, diff --git a/test/kat-app/win_x64.mak b/test/kat-app/win_x64.mak index 6156f7c0..e09eecfd 100644 --- a/test/kat-app/win_x64.mak +++ b/test/kat-app/win_x64.mak @@ -29,7 +29,7 @@ APP = imb-kat include ..\common\win_x64_common.mk -TEST_OBJS = utils.obj main.obj gcm_test.obj ctr_test.obj customop_test.obj des_test.obj ccm_test.obj cmac_test.obj hmac_sha1_test.obj hmac_sha256_sha512_test.obj hmac_md5_test.obj aes_test.obj sha_test.obj chained_test.obj api_test.obj pon_test.obj ecb_test.obj zuc_eea3_test.obj zuc_eia3_test.obj kasumi_test.obj snow3g_test.obj direct_api_test.obj clear_mem_test.obj hec_test.obj xcbc_test.obj aes_cbcs_test.obj crc_test.obj chacha_test.obj poly1305_test.obj chacha20_poly1305_test.obj null_test.obj snow_v_test.obj direct_api_param_test.obj quic_ecb_test.obj hmac_sha1.json.obj hmac_sha224.json.obj hmac_sha256.json.obj hmac_sha384.json.obj hmac_sha512.json.obj hmac_md5.json.obj gmac_test.obj gmac_test.json.obj ghash_test.obj ghash_test.json.obj poly1305_test.json.obj 
cmac_test.json.obj xcbc_test.json.obj sha_test.json.obj aes_cfb_test.obj aes_cfb_test.json.obj aes_cbcs_test.json.obj aes_cbc_test.obj aes_cbc_test.json.obj ecb_test.json.obj ctr_test.json.obj chacha_test.json.obj des_test.json.obj gcm_test.json.obj quic_chacha20_test.obj chacha20_poly1305_test.json.obj ccm_test.json.obj snow3g_test_f8_vectors.json.obj snow3g_test_f9_vectors.json.obj sm4_ecb_test.obj sm4_ecb_test.json.obj sm4_cbc_test.obj sm4_cbc_test.json.obj sm3_test.obj sm3_test.json.obj zuc_eia3_128.json.obj zuc_eia3_256.json.obj zuc_eea3_128.json.obj zuc_eea3_256.json.obj kasumi_f8.json.obj kasumi_f9.json.obj snow_v_test.json.obj hmac_sm3_test.obj hmac_sm3.json.obj snow_v_aead.json.obj +TEST_OBJS = utils.obj main.obj gcm_test.obj ctr_test.obj customop_test.obj des_test.obj ccm_test.obj cmac_test.obj hmac_sha1_test.obj hmac_sha256_sha512_test.obj hmac_md5_test.obj aes_test.obj sha_test.obj chained_test.obj api_test.obj pon_test.obj ecb_test.obj zuc_eea3_test.obj zuc_eia3_test.obj kasumi_test.obj snow3g_test.obj direct_api_test.obj clear_mem_test.obj hec_test.obj xcbc_test.obj aes_cbcs_test.obj crc_test.obj chacha_test.obj poly1305_test.obj chacha20_poly1305_test.obj null_test.obj snow_v_test.obj direct_api_param_test.obj quic_ecb_test.obj hmac_sha1.json.obj hmac_sha224.json.obj hmac_sha256.json.obj hmac_sha384.json.obj hmac_sha512.json.obj hmac_md5.json.obj gmac_test.obj gmac_test.json.obj ghash_test.obj ghash_test.json.obj poly1305_test.json.obj cmac_test.json.obj xcbc_test.json.obj sha_test.json.obj aes_cfb_one_block_test.obj aes_cfb_one_block_test.json.obj aes_cbcs_test.json.obj aes_cbc_test.obj aes_cbc_test.json.obj ecb_test.json.obj ctr_test.json.obj chacha_test.json.obj des_test.json.obj gcm_test.json.obj quic_chacha20_test.obj chacha20_poly1305_test.json.obj ccm_test.json.obj snow3g_test_f8_vectors.json.obj snow3g_test_f9_vectors.json.obj sm4_ecb_test.obj sm4_ecb_test.json.obj sm4_cbc_test.obj sm4_cbc_test.json.obj sm3_test.obj sm3_test.json.obj 
zuc_eia3_128.json.obj zuc_eia3_256.json.obj zuc_eea3_128.json.obj zuc_eea3_256.json.obj kasumi_f8.json.obj kasumi_f9.json.obj snow_v_test.json.obj hmac_sm3_test.obj hmac_sm3.json.obj snow_v_aead.json.obj TEST_LFLAGS = /out:$(APP).exe $(DLFLAGS) -- GitLab From 0524b3aa7413cdee8beee8120e1abf32aeb13516 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Wed, 17 Jul 2024 12:56:40 +0100 Subject: [PATCH 20/24] lib: [cpu feature] XSAVE feature detection and check to fix issue #153 This code change adds XSAVE feature detection and makes AVX architecture depend on it. Note that AVX2 and AVX512 architectures also depend on all AVX features and consequently depend on XSAVE feature, too. --- lib/ipsec-mb.h | 3 ++- lib/x86_64/cpu_feature.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 6985c7a9..13d54c5c 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -949,6 +949,7 @@ typedef int (*imb_self_test_cb_t)(void *cb_arg, const IMB_SELF_TEST_CALLBACK_DAT #define IMB_FEATURE_SM3NI (1ULL << 24) #define IMB_FEATURE_SM4NI (1ULL << 25) #define IMB_FEATURE_SHA512NI (1ULL << 26) +#define IMB_FEATURE_XSAVE (1ULL << 27) /** * Self test defines @@ -969,7 +970,7 @@ typedef int (*imb_self_test_cb_t)(void *cb_arg, const IMB_SELF_TEST_CALLBACK_DAT #define IMB_CPUFLAGS_SSE (IMB_CPUFLAGS_NO_AESNI | IMB_FEATURE_AESNI | IMB_FEATURE_PCLMULQDQ) #define IMB_CPUFLAGS_SSE_T2 (IMB_CPUFLAGS_SSE | IMB_FEATURE_SHANI) #define IMB_CPUFLAGS_SSE_T3 (IMB_CPUFLAGS_SSE_T2 | IMB_FEATURE_GFNI) -#define IMB_CPUFLAGS_AVX (IMB_CPUFLAGS_SSE | IMB_FEATURE_AVX) +#define IMB_CPUFLAGS_AVX (IMB_CPUFLAGS_SSE | IMB_FEATURE_AVX | IMB_FEATURE_XSAVE) #define IMB_CPUFLAGS_AVX2 (IMB_CPUFLAGS_AVX | IMB_FEATURE_AVX2 | IMB_FEATURE_BMI2) #define IMB_CPUFLAGS_AVX512 (IMB_CPUFLAGS_AVX2 | IMB_FEATURE_AVX512_SKX) #define IMB_CPUFLAGS_AVX512_T2 \ diff --git a/lib/x86_64/cpu_feature.c b/lib/x86_64/cpu_feature.c index 3de9acbf..ed385a16 100644 --- 
a/lib/x86_64/cpu_feature.c +++ b/lib/x86_64/cpu_feature.c @@ -221,6 +221,13 @@ detect_sm4ni(void) #endif } +static uint32_t +detect_xsave(void) +{ + /* Check presence of XSAVE - bit 26 of ECX */ + return (cpuid_1_0.ecx & (1UL << 26)); +} + uint64_t cpu_feature_detect(void) { @@ -249,7 +256,8 @@ cpu_feature_detect(void) { 7, IMB_FEATURE_HYBRID, detect_hybrid }, { 7, IMB_FEATURE_SM3NI, detect_sm3ni }, { 7, IMB_FEATURE_SM4NI, detect_sm4ni }, - { 7, IMB_FEATURE_SHA512NI, detect_sha512ni } }; + { 7, IMB_FEATURE_SHA512NI, detect_sha512ni }, + { 1, IMB_FEATURE_XSAVE, detect_xsave } }; struct cpuid_regs r; unsigned hi_leaf_number = 0; uint64_t features = 0; -- GitLab From 7f09bbdb66c4d3abdf8cabb3ee79bcc702f77f53 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Fri, 28 Jun 2024 17:28:43 +0100 Subject: [PATCH 21/24] avx2_t4: rename SHA-512 MB function - Rename function - Add correct digest offset name Signed-off-by: Marcel Cornu --- lib/avx2_t4/sha512_x2_ni_avx2.asm | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/avx2_t4/sha512_x2_ni_avx2.asm b/lib/avx2_t4/sha512_x2_ni_avx2.asm index 6995a275..43618286 100644 --- a/lib/avx2_t4/sha512_x2_ni_avx2.asm +++ b/lib/avx2_t4/sha512_x2_ni_avx2.asm @@ -204,12 +204,12 @@ SHUF_MASK: mksection .text ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha512_ni_x2(SHA512_ARGS *args, UINT64 size_in_blocks) +;; void sha512_ni_x2_avx2(SHA512_ARGS *args, UINT64 size_in_blocks) ;; arg1 : pointer to args ;; arg2 : size (in blocks) ;; assumed to be >= 1 align 32 -MKGLOBAL(sha512_ni_x2,function,internal) -sha512_ni_x2: +MKGLOBAL(sha512_ni_x2_avx2,function,internal) +sha512_ni_x2_avx2: mov r11, rsp sub rsp, frame_size and rsp, -32 @@ -225,10 +225,10 @@ sha512_ni_x2: lea SHA512_CONSTS, [rel SHA512_K_AVX] ;; load current hash value and transform - vmovdqu STATE0, [args + 0*SHA512NI_DIGEST_ROW_SIZE] - 
vmovdqu STATE1, [args + 0*SHA512NI_DIGEST_ROW_SIZE + 32] - vmovdqu STATE0b, [args + 1*SHA512NI_DIGEST_ROW_SIZE] - vmovdqu STATE1b, [args + 1*SHA512NI_DIGEST_ROW_SIZE + 32] + vmovdqu STATE0, [args + _args_digest_sha512 + 0*SHA512NI_DIGEST_ROW_SIZE] + vmovdqu STATE1, [args + _args_digest_sha512 + 0*SHA512NI_DIGEST_ROW_SIZE + 32] + vmovdqu STATE0b, [args + _args_digest_sha512 + 1*SHA512NI_DIGEST_ROW_SIZE] + vmovdqu STATE1b, [args + _args_digest_sha512 + 1*SHA512NI_DIGEST_ROW_SIZE + 32] vperm2i128 YTMP1, STATE0, STATE1, 0x20 vperm2i128 YTMP0, STATE0b, STATE1b, 0x20 @@ -386,10 +386,10 @@ align 32 vpermq STATE1b, YTMP0, 0xb1 ;; update digests - vmovdqu [args + 0*SHA512NI_DIGEST_ROW_SIZE], STATE0 - vmovdqu [args + 0*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1 - vmovdqu [args + 1*SHA512NI_DIGEST_ROW_SIZE], STATE0b - vmovdqu [args + 1*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1b + vmovdqu [args + _args_digest_sha512 + 0*SHA512NI_DIGEST_ROW_SIZE], STATE0 + vmovdqu [args + _args_digest_sha512 + 0*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1 + vmovdqu [args + _args_digest_sha512 + 1*SHA512NI_DIGEST_ROW_SIZE], STATE0b + vmovdqu [args + _args_digest_sha512 + 1*SHA512NI_DIGEST_ROW_SIZE + 32], STATE1b vzeroupper @@ -405,7 +405,7 @@ align 32 MKGLOBAL(call_sha512_ni_x2_avx2_from_c,function,internal) call_sha512_ni_x2_avx2_from_c: FUNC_SAVE - call sha512_ni_x2 + call sha512_ni_x2_avx2 FUNC_RESTORE ret -- GitLab From d07c24377ebd7a199cda988343c62d4d2c9dc1e8 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Fri, 5 Jul 2024 10:24:54 +0100 Subject: [PATCH 22/24] avx2_t4: [HMAC-SHA512] add multi-buffer implementation Signed-off-by: Marcel Cornu --- lib/Makefile | 4 +- lib/avx2_t4/mb_mgr_avx2_t4.c | 2 +- .../mb_mgr_hmac_sha512_flush_ni_avx2.asm | 310 ++++++++++++++ .../mb_mgr_hmac_sha512_submit_ni_avx2.asm | 389 ++++++++++++++++++ lib/avx2_t4/sha512_hmac_ni_avx2.asm | 10 - lib/win_x64.mak | 4 +- 6 files changed, 706 insertions(+), 13 deletions(-) create mode 100644 
lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm create mode 100644 lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm diff --git a/lib/Makefile b/lib/Makefile index 02d7503f..e5b01003 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -780,7 +780,9 @@ asm_avx2_t4_lib_objs := \ sm3_hmac_avx2.o \ sha512_x1_ni_avx2.o \ sha512_x2_ni_avx2.o \ - sha512_hmac_ni_avx2.o + sha512_hmac_ni_avx2.o \ + mb_mgr_hmac_sha512_flush_ni_avx2.o \ + mb_mgr_hmac_sha512_submit_ni_avx2.o # # List of ASM modules (avx512 directory) diff --git a/lib/avx2_t4/mb_mgr_avx2_t4.c b/lib/avx2_t4/mb_mgr_avx2_t4.c index 77a6b791..11df31e9 100644 --- a/lib/avx2_t4/mb_mgr_avx2_t4.c +++ b/lib/avx2_t4/mb_mgr_avx2_t4.c @@ -299,7 +299,7 @@ reset_ooo_mgrs(IMB_MGR *state) ooo_mgr_hmac_sha384_reset(state->hmac_sha_384_ooo, AVX2_NUM_SHA512_LANES); /* Init HMAC/SHA512 out-of-order fields */ - ooo_mgr_hmac_sha512_reset(state->hmac_sha_512_ooo, AVX2_NUM_SHA512_LANES); + ooo_mgr_hmac_sha512_reset(state->hmac_sha_512_ooo, 2); /* Init HMAC/MD5 out-of-order fields */ ooo_mgr_hmac_md5_reset(state->hmac_md5_ooo, AVX2_NUM_MD5_LANES); diff --git a/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm new file mode 100644 index 00000000..c6ccdc07 --- /dev/null +++ b/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm @@ -0,0 +1,310 @@ +;; +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.inc" +%include "include/imb_job.inc" +%include "include/mb_mgr_datastruct.inc" +%include "include/reg_sizes.inc" + +%define SHA512_FUNC sha512_ni_x2_avx2 +%define FUNC flush_job_hmac_sha_512_ni_avx2 +%define SHA_X_DIGEST_SIZE 512 + +extern SHA512_FUNC + +mksection .rodata +default rel + +align 16 +byteswap: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x0001020304050607, 0x08090a0b0c0d0e0f +len_masks: + dq 0xFFFFFFFF0000FFFF, 0xFFFFFFFFFFFFFFFF + dq 0xFFFFFFFFFFFF0000, 0xFFFFFFFFFFFFFFFF +lane_1: dq 1 + +mksection .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%else +%define arg1 rcx +%define arg2 rdx +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in rbp, r15 +%define idx rbp + +%define unused_lanes rbx +%define lane_data rbx +%define tmp2 rbx + +%define job_rax rax +%define tmp1 rax +%define size_offset rax +%define tmp rax +%define start_offset rax + +%define tmp3 arg1 + +%define extra_blocks 
arg2 +%define p arg2 + +%define tmp4 r8 + +%define tmp5 r9 + +%define tmp6 r10 + +%endif + +struc STACK +_gpr_save: resq 3 +_rsp_save: resq 1 +endstruc + +%define APPEND(a,b) a %+ b + +; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) +; arg 1 : rcx : state +MKGLOBAL(FUNC,function,internal) +FUNC: + mov rax, rsp + sub rsp, STACK_size + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + bt unused_lanes, 16+7 + jc return_null + + ; find a lane with a non-null job + xor idx, idx + cmp qword [state + _ldata_sha512 + 1 * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + cmovne idx, [rel lane_1] + +copy_lane_data: + ; copy good lane (idx) to empty lanes + vmovdqa xmm0, [state + _lens_sha512] + mov tmp, [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*idx] + +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + I * _SHA512_LANE_DATA_size + _job_in_lane_sha512], 0 + jne APPEND(skip_,I) + mov [state + _args_sha512 + _data_ptr_sha512 + PTR_SZ*I], tmp + vpor xmm0, xmm0, [rel len_masks + 16*I] +APPEND(skip_,I): +%assign I (I+1) +%endrep + vmovdqa [state + _lens_sha512], xmm0 + + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...3) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0x00 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call SHA512_FUNC + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), 
[lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + mov word [state + _lens_sha512 + 2*idx], 1 + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + + ; move digest into data location + lea tmp5, [idx*8] ;; scale up to SHA512_DIGEST_ROW_SIZE (8*8) + vmovdqu ymm0, [state + _args_digest_sha512 + tmp5*8] + vmovdqu ymm1, [state + _args_digest_sha512 + tmp5*8 + 32] + vpshufb ymm0, [rel byteswap] + vpshufb ymm1, [rel byteswap] + vmovdqu [lane_data + _outer_block_sha512], ymm0 + vmovdqu [lane_data + _outer_block_sha512+32], ymm1 + + ; move the opad key into digest + mov tmp, [job + _auth_key_xor_opad] + + vmovdqu ymm0, [tmp] + vmovdqu ymm1, [tmp + 32] + vmovdqu [state + _args_digest_sha512 + tmp5*8], ymm0 + vmovdqu [state + _args_digest_sha512 + tmp5*8 + 32], ymm1 + + jmp copy_lane_data + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + mov [state + _lens_sha512 + 2*idx], WORD(extra_blocks) + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp copy_lane_data + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], IMB_STATUS_COMPLETED_AUTH + mov unused_lanes, [state + _unused_lanes_sha512] + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ;; scale idx*64 + shl idx, 6 + +%if (SHA_X_DIGEST_SIZE != 384) + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 + jne copy_full_digest +%endif + +%if (SHA_X_DIGEST_SIZE != 384) + ;; copy 32 bytes for SHA512 / 
24 bytes for SHA384 + vmovdqu ymm0, [state + _args_digest_sha512 + idx] + vpshufb ymm0, [rel byteswap] + vmovdqu [p], ymm0 +%else + mov QWORD(tmp2), [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + idx + 1*SHA512_DIGEST_WORD_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + idx + 2*SHA512_DIGEST_WORD_SIZE] + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) +%endif + jmp clear_ret + +copy_full_digest: + ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 +%if (SHA_X_DIGEST_SIZE != 384) + vmovdqu ymm0, [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + vmovdqu ymm1, [state + _args_digest_sha512 + idx + 4*SHA512_DIGEST_WORD_SIZE] + vpshufb ymm0, [rel byteswap] + vpshufb ymm1, [rel byteswap] + vmovdqu [p], ymm0 + vmovdqu [p + 32], ymm1 +%else + vmovdqu ymm0, [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + vmovdqu xmm1, [state + _args_digest_sha512 + idx + 4*SHA512_DIGEST_WORD_SIZE] + vpshufb ymm0, [rel byteswap] + vpshufb xmm1, [rel byteswap] + vmovdqu [p], ymm0 + vmovdqu [p + 32], xmm1 +%endif + +clear_ret: +%ifdef SAFE_DATA + vpxor ymm0, ymm0 + + ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job +%assign I 0 +%rep 2 + cmp qword [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size) + _job_in_lane_sha512], 0 + jne APPEND(skip_clear_,I) + + ;; Clear digest (48 bytes for SHA-384, 64 bytes for SHA-512 bytes) + vmovdqa [state + _args_digest_sha512 + I*64], ymm0 +%if (SHA_X_DIGEST_SIZE == 384) + vmovdqa [state + _args_digest_sha512 + I*64 + 32], xmm0 +%else + vmovdqa [state + _args_digest_sha512 + I*64 + 32], ymm0 +%endif + + lea lane_data, [state + _ldata_sha512 + (I*_SHA512_LANE_DATA_size)] + ;; Clear first 128 bytes of extra_block +%assign offset 0 +%rep 4 + vmovdqa [lane_data + _extra_block + offset], ymm0 +%assign offset (offset + 32) 
+%endrep + + ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block + vmovdqu [lane_data + _outer_block], ymm0 +%if (SHA_X_DIGEST_SIZE == 384) + vmovdqa [lane_data + _outer_block + 32], xmm0 +%else + vmovdqu [lane_data + _outer_block + 32], ymm0 +%endif + +APPEND(skip_clear_,I): +%assign I (I+1) +%endrep + +%endif ;; SAFE_DATA + +return: + vzeroupper + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] + mov rsp, [rsp + _rsp_save] ; original SP + + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm new file mode 100644 index 00000000..47e5e00b --- /dev/null +++ b/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm @@ -0,0 +1,389 @@ +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +%include "include/os.inc" +%include "include/imb_job.inc" +%include "include/mb_mgr_datastruct.inc" +%include "include/reg_sizes.inc" +%include "include/memcpy.inc" +%include "include/const.inc" + +%define SHA512_FUNC sha512_ni_x2_avx2 +%define FUNC submit_job_hmac_sha_512_ni_avx2 +%define SHA_X_DIGEST_SIZE 512 + +extern SHA512_FUNC + +mksection .rodata +default rel +align 16 +byteswap: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +unused_lane_lens: + dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF + +mksection .text + +%if 1 +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define reg3 rcx +%define reg4 rdx +%else +%define arg1 rcx +%define arg2 rdx +%define reg3 rdi +%define reg4 rsi +%endif + +%define state arg1 +%define job arg2 +%define len2 arg2 + +; idx needs to be in rbp, r13, r14, r16 +%define last_len rbp +%define idx rbp + +%define p r11 +%define start_offset r11 + +%define unused_lanes rbx +%define tmp4 rbx + +%define job_rax rax +%define len rax + +%define size_offset reg3 +%define tmp2 reg3 + +%define lane reg4 +%define tmp3 reg4 + +%define extra_blocks r8 + +%define tmp r9 +%define p2 r9 + +%define lane_data r10 + +%endif + +; Define stack usage +struc STACK +_gpr_save: resq 5 +_rsp_save: resq 1 +endstruc + +; JOB* FUNC(MB_MGR_HMAC_sha_512_OOO *state, IMB_JOB *job) +; arg 1 : rcx : state +; arg 2 : rdx : job +MKGLOBAL(FUNC,function,internal) +FUNC: + mov rax, rsp + sub rsp, 
STACK_size + mov [rsp + _gpr_save + 8*0], rbx + mov [rsp + _gpr_save + 8*1], rbp + mov [rsp + _gpr_save + 8*2], r12 +%ifndef LINUX + mov [rsp + _gpr_save + 8*3], rsi + mov [rsp + _gpr_save + 8*4], rdi +%endif + mov [rsp + _rsp_save], rax ; original SP + + mov unused_lanes, [state + _unused_lanes_sha512] + movzx lane, BYTE(unused_lanes) + shr unused_lanes, 8 + imul lane_data, lane, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov [state + _unused_lanes_sha512], unused_lanes + mov len, [job + _msg_len_to_hash_in_bytes] + mov tmp, len + shr tmp, 7 ; divide by 128, len in terms of blocks + + mov [lane_data + _job_in_lane_sha512], job + mov dword [lane_data + _outer_done_sha512], 0 + + vmovdqa xmm0, [state + _lens_sha512] + XVPINSRW xmm0, xmm1, extra_blocks, lane, tmp, scale_x16 + ;; reset unused lanes to UINT16_MAX before storing + vpor xmm0, [rel unused_lane_lens] + vmovdqa [state + _lens_sha512], xmm0 + + mov last_len, len + and last_len, 127 + lea extra_blocks, [last_len + 17 + 127] + shr extra_blocks, 7 + mov [lane_data + _extra_blocks_sha512], DWORD(extra_blocks) + + mov p, [job + _src] + add p, [job + _hash_start_src_offset_in_bytes] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], p + + cmp len, 128 + jb copy_lt128 + +fast_copy: + add p, len + vmovdqu ymm0, [p - 128 + 0*32] + vmovdqu ymm1, [p - 128 + 1*32] + vmovdqu ymm2, [p - 128 + 2*32] + vmovdqu ymm3, [p - 128 + 3*32] + vmovdqu [lane_data + _extra_block_sha512 + 0*32], ymm0 + vmovdqu [lane_data + _extra_block_sha512 + 1*32], ymm1 + vmovdqu [lane_data + _extra_block_sha512 + 2*32], ymm2 + vmovdqu [lane_data + _extra_block_sha512 + 3*32], ymm3 +end_fast_copy: + + mov size_offset, extra_blocks + shl size_offset, 7 + sub size_offset, last_len + add size_offset, 128-8 + mov [lane_data + _size_offset_sha512], DWORD(size_offset) + mov start_offset, 128 + sub start_offset, last_len + mov [lane_data + _start_offset_sha512], DWORD(start_offset) + + lea tmp, [8*128 + 8*len] + bswap 
tmp + mov [lane_data + _extra_block_sha512 + size_offset], tmp + + mov tmp, [job + _auth_key_xor_ipad] + + vmovdqu ymm0, [tmp] + vmovdqu ymm1, [tmp + 32] + lea tmp, [lane*8] + vmovdqu [state + _args_digest_sha512 + tmp*8], ymm0 + vmovdqu [state + _args_digest_sha512 + tmp*8 + 32], ymm1 + + test len, ~127 + jnz ge128_bytes + +lt128_bytes: + vmovdqa xmm0, [state + _lens_sha512] + XVPINSRW xmm0, xmm1, tmp, lane, extra_blocks, scale_x16 + vmovdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*lane], tmp ;; 8 to hold a UINT8 + mov dword [lane_data + _extra_blocks_sha512], 0 + +ge128_bytes: + cmp unused_lanes, 0xff + jne return_null + jmp start_loop + + align 16 +start_loop: + ; Find min length + vmovdqa xmm0, [state + _lens_sha512] + vphminposuw xmm1, xmm0 + vpextrw DWORD(len2), xmm1, 0 ; min value + vpextrw DWORD(idx), xmm1, 1 ; min index (0...1) + cmp len2, 0 + je len_is_0 + + vpshuflw xmm1, xmm1, 0x00 + vpsubw xmm0, xmm0, xmm1 + vmovdqa [state + _lens_sha512], xmm0 + + ; "state" and "args" are the same address, arg1 + ; len is arg2 + call SHA512_FUNC + ; state and idx are intact + +len_is_0: + ; process completed job "idx" + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + mov DWORD(extra_blocks), [lane_data + _extra_blocks_sha512] + cmp extra_blocks, 0 + jne proc_extra_blocks + cmp dword [lane_data + _outer_done_sha512], 0 + jne end_loop + +proc_outer: + mov dword [lane_data + _outer_done_sha512], 1 + mov DWORD(size_offset), [lane_data + _size_offset_sha512] + mov qword [lane_data + _extra_block_sha512 + size_offset], 0 + + vmovdqa xmm0, [state + _lens_sha512] + XVPINSRW xmm0, xmm1, tmp, idx, 1, scale_x16 + vmovdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _outer_block_sha512] + mov job, [lane_data + _job_in_lane_sha512] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp + + lea idx, [idx*8] ;; scale up to 
SHA512_DIGEST_ROW_SIZE (8*8) + vmovdqu ymm0, [state + _args_digest_sha512 + idx*8] + vmovdqu ymm1, [state + _args_digest_sha512 + idx*8 + 32] + vpshufb ymm0, [rel byteswap] + vpshufb ymm1, [rel byteswap] + vmovdqu [lane_data + _outer_block_sha512], ymm0 + vmovdqu [lane_data + _outer_block_sha512+32], ymm1 + + mov tmp, [job + _auth_key_xor_opad] + vmovdqu ymm0, [tmp] + vmovdqu ymm1, [tmp + 32] + vmovdqu [state + _args_digest_sha512 + idx*8], ymm0 + vmovdqu [state + _args_digest_sha512 + idx*8 + 32], ymm1 + + jmp start_loop + + align 16 +proc_extra_blocks: + mov DWORD(start_offset), [lane_data + _start_offset_sha512] + + vmovdqa xmm0, [state + _lens_sha512] + XVPINSRW xmm0, xmm1, tmp, idx, extra_blocks, scale_x16 + vmovdqa [state + _lens_sha512], xmm0 + + lea tmp, [lane_data + _extra_block_sha512 + start_offset] + mov [state + _args_data_ptr_sha512 + PTR_SZ*idx], tmp ;; idx is index of shortest length message + mov dword [lane_data + _extra_blocks_sha512], 0 + jmp start_loop + + align 16 +copy_lt128: + ;; less than one message block of data + ;; destination extra block but backwards by len from where 0x80 pre-populated + lea p2, [lane_data + _extra_block + 128] + sub p2, len + memcpy_avx2_128_1 p2, p, len, tmp4, tmp2, ymm0, ymm1, ymm2, ymm3 + mov unused_lanes, [state + _unused_lanes_sha512] + jmp end_fast_copy + +return_null: + xor job_rax, job_rax + jmp return + + align 16 +end_loop: + mov job_rax, [lane_data + _job_in_lane_sha512] + mov unused_lanes, [state + _unused_lanes_sha512] + mov qword [lane_data + _job_in_lane_sha512], 0 + or dword [job_rax + _status], IMB_STATUS_COMPLETED_AUTH + shl unused_lanes, 8 + or unused_lanes, idx + mov [state + _unused_lanes_sha512], unused_lanes + + mov p, [job_rax + _auth_tag_output] + + ;; scale idx*64 + shl idx, 6 + +%if (SHA_X_DIGEST_SIZE != 384) + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 32 + jne copy_full_digest +%else + cmp qword [job_rax + _auth_tag_output_len_in_bytes], 24 + jne copy_full_digest +%endif + +%if 
(SHA_X_DIGEST_SIZE != 384) + ;; copy 32 bytes for SHA512 / 24 bytes for SHA384 + vmovdqu ymm0, [state + _args_digest_sha512 + idx] + vpshufb ymm0, [rel byteswap] + vmovdqu [p], ymm0 +%else + mov QWORD(tmp2), [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + mov QWORD(tmp4), [state + _args_digest_sha512 + idx + 1*SHA512_DIGEST_WORD_SIZE] + mov QWORD(tmp6), [state + _args_digest_sha512 + idx + 2*SHA512_DIGEST_WORD_SIZE] + bswap QWORD(tmp2) + bswap QWORD(tmp4) + bswap QWORD(tmp6) + mov [p + 0*8], QWORD(tmp2) + mov [p + 1*8], QWORD(tmp4) + mov [p + 2*8], QWORD(tmp6) +%endif + jmp clear_ret + +copy_full_digest: + ;; copy 64 bytes for SHA512 / 48 bytes for SHA384 +%if (SHA_X_DIGEST_SIZE != 384) + vmovdqu ymm0, [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + vmovdqu ymm1, [state + _args_digest_sha512 + idx + 4*SHA512_DIGEST_WORD_SIZE] + vpshufb ymm0, [rel byteswap] + vpshufb ymm1, [rel byteswap] + vmovdqu [p], ymm0 + vmovdqu [p + 32], ymm1 +%else + vmovdqu ymm0, [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] + vmovdqu xmm1, [state + _args_digest_sha512 + idx + 4*SHA512_DIGEST_WORD_SIZE] + vpshufb ymm0, [rel byteswap] + vpshufb xmm1, [rel byteswap] + vmovdqu [p], ymm0 + vmovdqu [p + 32], xmm1 +%endif + +clear_ret: +%ifdef SAFE_DATA + ;; Clear digest (48B/64B), outer_block (48B/64B) and extra_block (128B) of returned job + vpxor ymm0, ymm0 + vmovdqa [state + _args_digest_sha512 + idx], ymm0 +%if (SHA_X_DIGEST_SIZE == 384) + vmovdqa [state + _args_digest_sha512 + idx + 32], xmm0 +%else + vmovdqa [state + _args_digest_sha512 + idx + 32], ymm0 +%endif + + shr idx, 6 ;; Restore lane idx to 0 or 1 + imul lane_data, idx, _SHA512_LANE_DATA_size + lea lane_data, [state + _ldata_sha512 + lane_data] + + ;; Clear first 128 bytes of extra_block +%assign offset 0 +%rep 4 + vmovdqa [lane_data + _extra_block + offset], ymm0 +%assign offset (offset + 32) +%endrep + + ;; Clear first 48 bytes (SHA-384) or 64 bytes (SHA-512) of outer_block + 
vmovdqu [lane_data + _outer_block], ymm0 +%if (SHA_X_DIGEST_SIZE == 384) + vmovdqa [lane_data + _outer_block + 32], xmm0 +%else + vmovdqu [lane_data + _outer_block + 32], ymm0 +%endif +%endif ;; SAFE_DATA + +return: + vzeroupper + + mov rbx, [rsp + _gpr_save + 8*0] + mov rbp, [rsp + _gpr_save + 8*1] + mov r12, [rsp + _gpr_save + 8*2] +%ifndef LINUX + mov rsi, [rsp + _gpr_save + 8*3] + mov rdi, [rsp + _gpr_save + 8*4] +%endif + mov rsp, [rsp + _rsp_save] ; original SP + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/sha512_hmac_ni_avx2.asm b/lib/avx2_t4/sha512_hmac_ni_avx2.asm index 41c21df8..c912f92a 100644 --- a/lib/avx2_t4/sha512_hmac_ni_avx2.asm +++ b/lib/avx2_t4/sha512_hmac_ni_avx2.asm @@ -407,14 +407,6 @@ sha512_384_hmac_submit_ni_avx2: FUNC_END ret -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; IMB_JOB *submit_job_hmac_sha_512_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job) -align 32 -MKGLOBAL(submit_job_hmac_sha_512_ni_avx2,function,internal) -submit_job_hmac_sha_512_ni_avx2: - mov DWORD(arg1), 512 - jmp sha512_384_hmac_submit_ni_avx2 - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; IMB_JOB *submit_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state, IMB_JOB *job) align 32 @@ -427,9 +419,7 @@ submit_job_hmac_sha_384_ni_avx2: ;; IMB_JOB *flush_job_hmac_sha_512_ni_avx2(MB_MGR_SHA512_OOO *state) ;; IMB_JOB *flush_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state) align 32 -MKGLOBAL(flush_job_hmac_sha_512_ni_avx2,function,internal) MKGLOBAL(flush_job_hmac_sha_384_ni_avx2,function,internal) -flush_job_hmac_sha_512_ni_avx2: flush_job_hmac_sha_384_ni_avx2: xor rax, rax ret diff --git a/lib/win_x64.mak b/lib/win_x64.mak index 49214766..c9dcc992 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -599,7 +599,9 @@ avx2_t4_objs = \ $(OBJ_DIR)\sha512_x1_ni_avx2.obj \ $(OBJ_DIR)\sha512_x2_ni_avx2.obj \ $(OBJ_DIR)\sha_ni_avx2.obj \ - $(OBJ_DIR)\sha512_hmac_ni_avx2.obj + 
$(OBJ_DIR)\sha512_hmac_ni_avx2.obj \ + $(OBJ_DIR)\mb_mgr_hmac_sha512_submit_ni_avx2.obj \ + $(OBJ_DIR)\mb_mgr_hmac_sha512_flush_ni_avx2.obj !if "$(AESNI_EMU)" == "y" all_objs = $(lib_objs1) $(lib_objs2) $(gcm_objs) $(no_aesni_objs) -- GitLab From 0a5c2c9f2375f21aec88428d835bd32a2c75a47b Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Fri, 5 Jul 2024 14:07:10 +0100 Subject: [PATCH 23/24] avx2_t4: [HMAC-SHA384] add multi-buffer implementation Signed-off-by: Marcel Cornu --- lib/Makefile | 5 +- lib/avx2_t4/mb_mgr_avx2_t4.c | 2 +- .../mb_mgr_hmac_sha384_flush_ni_avx2.asm | 31 ++ .../mb_mgr_hmac_sha384_submit_ni_avx2.asm | 30 ++ .../mb_mgr_hmac_sha512_flush_ni_avx2.asm | 33 +- .../mb_mgr_hmac_sha512_submit_ni_avx2.asm | 24 +- lib/avx2_t4/sha512_hmac_ni_avx2.asm | 427 ------------------ lib/win_x64.mak | 5 +- 8 files changed, 95 insertions(+), 462 deletions(-) create mode 100644 lib/avx2_t4/mb_mgr_hmac_sha384_flush_ni_avx2.asm create mode 100644 lib/avx2_t4/mb_mgr_hmac_sha384_submit_ni_avx2.asm delete mode 100644 lib/avx2_t4/sha512_hmac_ni_avx2.asm diff --git a/lib/Makefile b/lib/Makefile index e5b01003..f6fd7e7e 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -780,9 +780,10 @@ asm_avx2_t4_lib_objs := \ sm3_hmac_avx2.o \ sha512_x1_ni_avx2.o \ sha512_x2_ni_avx2.o \ - sha512_hmac_ni_avx2.o \ mb_mgr_hmac_sha512_flush_ni_avx2.o \ - mb_mgr_hmac_sha512_submit_ni_avx2.o + mb_mgr_hmac_sha512_submit_ni_avx2.o \ + mb_mgr_hmac_sha384_flush_ni_avx2.o \ + mb_mgr_hmac_sha384_submit_ni_avx2.o # # List of ASM modules (avx512 directory) diff --git a/lib/avx2_t4/mb_mgr_avx2_t4.c b/lib/avx2_t4/mb_mgr_avx2_t4.c index 11df31e9..cc9583d9 100644 --- a/lib/avx2_t4/mb_mgr_avx2_t4.c +++ b/lib/avx2_t4/mb_mgr_avx2_t4.c @@ -296,7 +296,7 @@ reset_ooo_mgrs(IMB_MGR *state) ooo_mgr_hmac_sha256_reset(state->hmac_sha_256_ooo, 2); /* Init HMAC/SHA384 out-of-order fields */ - ooo_mgr_hmac_sha384_reset(state->hmac_sha_384_ooo, AVX2_NUM_SHA512_LANES); + ooo_mgr_hmac_sha384_reset(state->hmac_sha_384_ooo, 2); 
/* Init HMAC/SHA512 out-of-order fields */ ooo_mgr_hmac_sha512_reset(state->hmac_sha_512_ooo, 2); diff --git a/lib/avx2_t4/mb_mgr_hmac_sha384_flush_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha384_flush_ni_avx2.asm new file mode 100644 index 00000000..bb2178de --- /dev/null +++ b/lib/avx2_t4/mb_mgr_hmac_sha384_flush_ni_avx2.asm @@ -0,0 +1,31 @@ +;; +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%define FUNC flush_job_hmac_sha_384_ni_avx2 +%define SHA_X_DIGEST_SIZE 384 + +%include "avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm" diff --git a/lib/avx2_t4/mb_mgr_hmac_sha384_submit_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha384_submit_ni_avx2.asm new file mode 100644 index 00000000..daca35c2 --- /dev/null +++ b/lib/avx2_t4/mb_mgr_hmac_sha384_submit_ni_avx2.asm @@ -0,0 +1,30 @@ +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%define FUNC submit_job_hmac_sha_384_ni_avx2 +%define SHA_X_DIGEST_SIZE 384 + +%include "avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm" diff --git a/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm index c6ccdc07..8c94253d 100644 --- a/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm +++ b/lib/avx2_t4/mb_mgr_hmac_sha512_flush_ni_avx2.asm @@ -30,11 +30,12 @@ %include "include/mb_mgr_datastruct.inc" %include "include/reg_sizes.inc" -%define SHA512_FUNC sha512_ni_x2_avx2 +%ifndef FUNC %define FUNC flush_job_hmac_sha_512_ni_avx2 %define SHA_X_DIGEST_SIZE 512 +%endif -extern SHA512_FUNC +extern sha512_ni_x2_avx2 mksection .rodata default rel @@ -50,7 +51,6 @@ lane_1: dq 1 mksection .text -%if 1 %ifdef LINUX %define arg1 rdi %define arg2 rsi @@ -71,24 +71,18 @@ mksection .text %define tmp2 rbx %define job_rax rax -%define tmp1 rax %define size_offset rax %define tmp rax %define start_offset rax %define tmp3 arg1 - %define extra_blocks arg2 %define p arg2 %define tmp4 r8 - %define tmp5 r9 - %define tmp6 r10 -%endif - struc STACK _gpr_save: resq 3 _rsp_save: resq 1 @@ -97,7 +91,7 @@ endstruc %define APPEND(a,b) a %+ b ; JOB* FUNC(MB_MGR_HMAC_SHA_512_OOO *state) -; arg 1 : rcx : state +; arg 1 : state MKGLOBAL(FUNC,function,internal) FUNC: mov rax, rsp @@ -144,7 +138,7 @@ APPEND(skip_,I): ; "state" and "args" are the same address, arg1 ; len is arg2 - call SHA512_FUNC + call sha512_ni_x2_avx2 ; state and idx are intact len_is_0: @@ -173,7 +167,11 @@ proc_outer: vpshufb ymm0, [rel byteswap] vpshufb ymm1, [rel byteswap] vmovdqu [lane_data + _outer_block_sha512], ymm0 +%if (SHA_X_DIGEST_SIZE != 384) vmovdqu [lane_data + _outer_block_sha512+32], ymm1 +%else + vmovdqu [lane_data + _outer_block_sha512+32], xmm1 +%endif ; move the opad key into digest mov tmp, [job + _auth_key_xor_opad] @@ -227,15 +225,12 @@ end_loop: vpshufb ymm0, [rel byteswap] vmovdqu [p], ymm0 %else - mov QWORD(tmp2), [state + _args_digest_sha512 + idx 
+ 0*SHA512_DIGEST_WORD_SIZE] - mov QWORD(tmp4), [state + _args_digest_sha512 + idx + 1*SHA512_DIGEST_WORD_SIZE] - mov QWORD(tmp6), [state + _args_digest_sha512 + idx + 2*SHA512_DIGEST_WORD_SIZE] + vmovdqu xmm0, [state + _args_digest_sha512 + idx] + vpshufb xmm0, [rel byteswap] + mov QWORD(tmp2), [state + _args_digest_sha512 + idx + 16] bswap QWORD(tmp2) - bswap QWORD(tmp4) - bswap QWORD(tmp6) - mov [p + 0*8], QWORD(tmp2) - mov [p + 1*8], QWORD(tmp4) - mov [p + 2*8], QWORD(tmp6) + vmovdqu [p], xmm0 + mov [p + 16], QWORD(tmp2) %endif jmp clear_ret diff --git a/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm b/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm index 47e5e00b..d4bd4450 100644 --- a/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm +++ b/lib/avx2_t4/mb_mgr_hmac_sha512_submit_ni_avx2.asm @@ -31,11 +31,12 @@ %include "include/memcpy.inc" %include "include/const.inc" -%define SHA512_FUNC sha512_ni_x2_avx2 +%ifndef FUNC %define FUNC submit_job_hmac_sha_512_ni_avx2 %define SHA_X_DIGEST_SIZE 512 +%endif -extern SHA512_FUNC +extern sha512_ni_x2_avx2 mksection .rodata default rel @@ -214,7 +215,7 @@ start_loop: ; "state" and "args" are the same address, arg1 ; len is arg2 - call SHA512_FUNC + call sha512_ni_x2_avx2 ; state and idx are intact len_is_0: @@ -246,7 +247,11 @@ proc_outer: vpshufb ymm0, [rel byteswap] vpshufb ymm1, [rel byteswap] vmovdqu [lane_data + _outer_block_sha512], ymm0 +%if (SHA_X_DIGEST_SIZE != 384) vmovdqu [lane_data + _outer_block_sha512+32], ymm1 +%else + vmovdqu [lane_data + _outer_block_sha512+32], xmm1 +%endif mov tmp, [job + _auth_key_xor_opad] vmovdqu ymm0, [tmp] @@ -312,15 +317,12 @@ end_loop: vpshufb ymm0, [rel byteswap] vmovdqu [p], ymm0 %else - mov QWORD(tmp2), [state + _args_digest_sha512 + idx + 0*SHA512_DIGEST_WORD_SIZE] - mov QWORD(tmp4), [state + _args_digest_sha512 + idx + 1*SHA512_DIGEST_WORD_SIZE] - mov QWORD(tmp6), [state + _args_digest_sha512 + idx + 2*SHA512_DIGEST_WORD_SIZE] + vmovdqu xmm0, [state + 
_args_digest_sha512 + idx] + vpshufb xmm0, [rel byteswap] + mov QWORD(tmp2), [state + _args_digest_sha512 + idx + 16] bswap QWORD(tmp2) - bswap QWORD(tmp4) - bswap QWORD(tmp6) - mov [p + 0*8], QWORD(tmp2) - mov [p + 1*8], QWORD(tmp4) - mov [p + 2*8], QWORD(tmp6) + vmovdqu [p], xmm0 + mov [p + 16], QWORD(tmp2) %endif jmp clear_ret diff --git a/lib/avx2_t4/sha512_hmac_ni_avx2.asm b/lib/avx2_t4/sha512_hmac_ni_avx2.asm deleted file mode 100644 index c912f92a..00000000 --- a/lib/avx2_t4/sha512_hmac_ni_avx2.asm +++ /dev/null @@ -1,427 +0,0 @@ -;; -;; Copyright (c) 2023-2024, Intel Corporation -;; -;; Redistribution and use in source and binary forms, with or without -;; modification, are permitted provided that the following conditions are met: -;; -;; * Redistributions of source code must retain the above copyright notice, -;; this list of conditions and the following disclaimer. -;; * Redistributions in binary form must reproduce the above copyright -;; notice, this list of conditions and the following disclaimer in the -;; documentation and/or other materials provided with the distribution. -;; * Neither the name of Intel Corporation nor the names of its contributors -;; may be used to endorse or promote products derived from this software -;; without specific prior written permission. -;; -;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;; - -;; FIPS PUB 180-4, FEDERAL INFORMATION PROCESSING STANDARDS PUBLICATION, Secure Hash Standard (SHS) -;; https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf - -extern sha512_update_ni_x1 - -%include "include/os.inc" -%include "include/constants.inc" -%include "include/reg_sizes.inc" -%include "include/imb_job.inc" -%include "include/memcpy.inc" - -%ifdef LINUX - -%define arg1 rdi -%define arg2 rsi -%define arg3 rdx -%define arg4 rcx - -%define gp1 rax -%define gp2 r8 -%define gp3 r9 -%define gp4 r10 -%define gp5 r11 -%define gp6 arg4 -%define gp7 r12 -%define gp8 r13 -%define gp9 r14 -%define gp10 r15 -%define gp11 rbx -%define gp12 rbp - -%else - -%define arg1 rcx -%define arg2 rdx -%define arg3 r8 -%define arg4 r9 - -%define gp1 rax -%define gp2 r10 -%define gp3 r11 -%define gp4 arg4 -%define gp5 rdi -%define gp6 rsi -%define gp7 r12 -%define gp8 r13 -%define gp9 r14 -%define gp10 r15 -%define gp11 rbx -%define gp12 rbp - -%endif - -%xdefine t1 gp1 -%xdefine t2 gp2 -%xdefine t3 gp3 -%xdefine t4 gp4 - -%xdefine r1 gp12 -%xdefine r2 gp11 -%xdefine r3 gp10 -%xdefine r4 gp9 - -%define arg_job r1 -%define arg_msg r2 -%define arg_msg_length r3 -%define arg_sha_type r4 - -;; HMAC-SHA512/384 stack frame -struc STACK -_B: resb SHA512_BLK_SZ ; two SHA512 blocks (aligned to 16) -_D: resb SHA512_DIGEST_SIZE ; digest -_gpr_save: resq 8 ; space for GPR's -_rsp_save: resq 1 ; space for rsp pointer -endstruc - -mksection .rodata - -align 
32 -SHUFF_MASK: - dq 0x0001020304050607, 0x08090a0b0c0d0e0f - dq 0x0001020304050607, 0x08090a0b0c0d0e0f - -;; End-of-Message pattern -align 32 -EOM_32BYTES: - db 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - -;; PAD BLOCKS are used for OPAD where digest of IPAD + message is put into the block. -;; The blocks below fill up top 32 bytes of the block, -;; low 64/48 bytes get filled with the digest followed by EOM. -align 32 -SHA512_OPAD_LENGTH: - ;; last two qwords has to encode length in bits of: BLOCK size + DIGEST size - ;; (128 + 64) * 8 = 1536 = 0x600 in hex - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00 - -align 32 -SHA384_OPAD_LENGTH: - ;; last two qwords has to encode length in bits of: BLOCK size + DIGEST size - ;; (128 + 48) * 8 = 1408 = 0x580 in hex - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 - db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x80 - -mksection .text - -;; ============================================================================= -;; Save registers on the stack and create stack frame -;; ============================================================================= - -%macro FUNC_START 0 - mov rax, rsp - sub rsp, STACK_size - and rsp, -32 - mov [rsp + _rsp_save], rax - mov [rsp + _gpr_save + 0*8], rbx - mov [rsp + _gpr_save + 1*8], rbp - mov [rsp + _gpr_save + 2*8], r12 - mov [rsp + _gpr_save + 3*8], r13 - mov [rsp + _gpr_save + 4*8], r14 - mov [rsp + _gpr_save + 5*8], r15 -%ifidn __OUTPUT_FORMAT__, win64 - mov [rsp + _gpr_save + 6*8], rdi - mov [rsp + _gpr_save + 7*8], rsi -%endif -%endmacro - -;; 
============================================================================= -;; Restore registers from the stack -;; ============================================================================= - -%macro FUNC_END 0 - mov rbx, [rsp + _gpr_save + 0*8] - mov rbp, [rsp + _gpr_save + 1*8] - mov r12, [rsp + _gpr_save + 2*8] - mov r13, [rsp + _gpr_save + 3*8] - mov r14, [rsp + _gpr_save + 4*8] - mov r15, [rsp + _gpr_save + 5*8] -%ifidn __OUTPUT_FORMAT__, win64 - mov rdi, [rsp + _gpr_save + 6*8] - mov rsi, [rsp + _gpr_save + 7*8] -%endif - mov rsp, [rsp + _rsp_save] -%endmacro - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; void sha512_tag_store(void *tag_ptr, uint64_t tag_len, ymm1:ymm0 tag) -align 32 -MKGLOBAL(sha512_tag_store,function,internal) -sha512_tag_store: - cmp arg2, 16 - jb .tag_store_1_15 - je .tag_store_16 - - cmp arg2, 32 - je .tag_store_32 - jb .tag_store_17_31 - - cmp arg2, 48 - je .tag_store_48 - jb .tag_store_33_47 - - cmp arg2, 64 - je .tag_store_64 - -.tag_store_49_63: - vmovdqu [arg1 + 0*32], ymm0 - vmovdqu [arg1 + 1*32], xmm1 - vextracti128 xmm0, ymm1, 1 - lea arg1, [arg1 + 48] - sub arg2, 48 - jmp .tag_store_1_15 - -.tag_store_33_47: - vmovdqu [arg1 + 0*32], ymm0 - lea arg1, [arg1 + 32] - vmovdqa ymm0, ymm1 - sub arg2, 32 - jmp .tag_store_1_15 - -.tag_store_17_31: - vmovdqu [arg1 + 0*16], xmm0 - vextracti128 xmm0, ymm0, 1 - lea arg1, [arg1 + 16] - sub arg2, 16 - ;; fall through to store remaining tag bytes - -.tag_store_1_15: - simd_store_avx arg1, xmm0, arg2, t1, t2 - jmp .tag_store_end - -.tag_store_16: - vmovdqu [arg1 + 0*16], xmm0 - jmp .tag_store_end - -.tag_store_32: - vmovdqu [arg1 + 0*32], ymm0 - jmp .tag_store_end - -.tag_store_48: - vmovdqu [arg1 + 0*32], ymm0 - vmovdqu [arg1 + 1*32], xmm1 - jmp .tag_store_end - -.tag_store_64: - vmovdqu [arg1 + 0*32], ymm0 - vmovdqu [arg1 + 1*32], ymm1 - -.tag_store_end: - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; 
-;; IMB_JOB *sha512_384_hmac_submit_ni_avx2(const unsigned sha_type, IMB_JOB *job) -align 32 -MKGLOBAL(sha512_384_hmac_submit_ni_avx2,function,internal) -sha512_384_hmac_submit_ni_avx2: - FUNC_START - - ;; save input arguments - mov arg_job, arg2 - mov arg_sha_type, arg1 - - ;; init the digest with IPAD - mov t1, [arg_job + _auth_key_xor_ipad] - vmovdqu ymm0, [t1 + 0*32] - vmovdqu ymm1, [t1 + 1*32] - vmovdqa [rsp + _D + 0*32], ymm0 - vmovdqa [rsp + _D + 1*32], ymm1 - - ;; update digest for full number of blocks - lea arg1, [rsp + _D] - mov arg2, [arg_job + _src] - add arg2, [arg_job + _hash_start_src_offset] - mov arg_msg, arg2 - mov arg_msg_length, [arg_job + _msg_len_to_hash_in_bytes] - mov arg3, arg_msg_length - shr arg3, 7 ;; msg_length / SHA512_BLK_SZ - call sha512_update_ni_x1 - - ;; prepare partial block - mov DWORD(arg3), SHA512_BLK_SZ - 1 - not arg3 - and arg3, arg_msg_length ;; number of bytes processed already - add arg_msg, arg3 ;; move message pointer to start of the partial block - mov t2, arg_msg_length - sub t2, arg3 ;; t2 = number of bytes left - - xor DWORD(arg1), DWORD(arg1) -.partial_block_copy: - cmp DWORD(arg1), DWORD(t2) - je .partial_block_copy_exit - mov BYTE(t1), [arg_msg + arg1] - mov [rsp + _B + arg1], BYTE(t1) - inc DWORD(arg1) - jmp .partial_block_copy - -.partial_block_copy_exit: - ;; put end of message marker - mov BYTE [rsp + _B + arg1], 0x80 - inc DWORD(arg1) - - xor DWORD(t1), DWORD(t1) -.partial_block_zero: - cmp DWORD(arg1), SHA512_BLK_SZ - je .partial_block_zero_exit - mov [rsp + _B + arg1], BYTE(t1) - inc DWORD(arg1) - jmp .partial_block_zero - -.partial_block_zero_exit: - cmp DWORD(t2), SHA512_BLK_SZ - 16 - jb .add_msg_length - - ;; if length field doesn't fit into this partial block - ;; - compute digest on the current block - ;; - clear the block for the length to be put into it next - lea arg1, [rsp + _D] - lea arg2, [rsp + _B] - mov DWORD(arg3), 1 - call sha512_update_ni_x1 - - ;; clear the block - vpxor xmm0, xmm0, xmm0 
- vmovdqa [rsp + _B + 0*32], ymm0 - vmovdqa [rsp + _B + 1*32], ymm0 - vmovdqa [rsp + _B + 2*32], ymm0 - vmovdqa [rsp + _B + 3*32], xmm0 ;; the last 16 bytes will be set below - -.add_msg_length: - lea arg2, [arg_msg_length + SHA512_BLK_SZ] ;; original message length + IPAD block - lea arg1, [arg2 * 8] ;; length in bits - shr arg2, 61 - movbe [rsp + _B + SHA512_BLK_SZ - 2*8], arg2 - movbe [rsp + _B + SHA512_BLK_SZ - 1*8], arg1 - - lea arg1, [rsp + _D] - lea arg2, [rsp + _B] - mov DWORD(arg3), 1 - call sha512_update_ni_x1 - -.process_opad: - cmp DWORD(arg_sha_type), 512 - jne .opad_hmac_sha384 - -.opad_hmac_sha512: - vmovdqa ymm0, [rsp + _D + 0*32] - vmovdqa ymm1, [rsp + _D + 1*32] - vpshufb ymm0, ymm0, [rel SHUFF_MASK] - vpshufb ymm1, ymm1, [rel SHUFF_MASK] - vmovdqa ymm2, [rel EOM_32BYTES] - vmovdqa ymm3, [rel SHA512_OPAD_LENGTH] - vmovdqa [rsp + _B + 0*32], ymm0 - vmovdqa [rsp + _B + 1*32], ymm1 - vmovdqa [rsp + _B + 2*32], ymm2 - vmovdqa [rsp + _B + 3*32], ymm3 - jmp .opad_update - -.opad_hmac_sha384: - vmovdqa ymm0, [rsp + _D + 0*32] - vmovdqa xmm1, [rsp + _D + 1*32] - vpshufb ymm0, ymm0, [rel SHUFF_MASK] - vpshufb xmm1, xmm1, [rel SHUFF_MASK] - vinserti128 ymm1, [rel EOM_32BYTES], 1 - vpxor xmm2, xmm2, xmm2 - vmovdqa ymm3, [rel SHA384_OPAD_LENGTH] - vmovdqa [rsp + _B + 0*32], ymm0 - vmovdqa [rsp + _B + 1*32], ymm1 - vmovdqa [rsp + _B + 2*32], ymm2 - vmovdqa [rsp + _B + 3*32], ymm3 - -.opad_update: - ;; init the digest with OPAD - mov t1, [arg_job + _auth_key_xor_opad] - vmovdqu ymm0, [t1 + 0*32] - vmovdqu ymm1, [t1 + 1*32] - vmovdqa [rsp + _D + 0*32], ymm0 - vmovdqa [rsp + _D + 1*32], ymm1 - - lea arg1, [rsp + _D] - lea arg2, [rsp + _B] - mov DWORD(arg3), 1 - call sha512_update_ni_x1 - -.tag_store_start: - ;; byte swap the digest and write it back - lea arg1, [rsp + _D] - vmovdqa ymm0, [arg1 + 0*32] - vmovdqa ymm1, [arg1 + 1*32] - vpshufb ymm0, ymm0, [rel SHUFF_MASK] - vpshufb ymm1, ymm1, [rel SHUFF_MASK] - - mov arg1, [arg_job + _auth_tag_output] - mov arg2, 
[arg_job + _auth_tag_output_len_in_bytes] - call sha512_tag_store - -%ifdef SAFE_DATA - vpxor xmm0, xmm0, xmm0 - vpxor xmm1, xmm1, xmm1 - vpxor xmm2, xmm2, xmm2 - vpxor xmm3, xmm3, xmm3 - - vmovdqu [rsp + _B + 0*32], ymm0 - vmovdqu [rsp + _B + 1*32], ymm0 - vmovdqu [rsp + _B + 2*32], ymm0 - vmovdqu [rsp + _B + 3*32], ymm0 -%endif - vzeroupper - - mov rax, arg_job - or dword [arg_job + _status], IMB_STATUS_COMPLETED_AUTH - FUNC_END - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; IMB_JOB *submit_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state, IMB_JOB *job) -align 32 -MKGLOBAL(submit_job_hmac_sha_384_ni_avx2,function,internal) -submit_job_hmac_sha_384_ni_avx2: - mov DWORD(arg1), 384 - jmp sha512_384_hmac_submit_ni_avx2 - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; IMB_JOB *flush_job_hmac_sha_512_ni_avx2(MB_MGR_SHA512_OOO *state) -;; IMB_JOB *flush_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state) -align 32 -MKGLOBAL(flush_job_hmac_sha_384_ni_avx2,function,internal) -flush_job_hmac_sha_384_ni_avx2: - xor rax, rax - ret - -mksection stack-noexec diff --git a/lib/win_x64.mak b/lib/win_x64.mak index c9dcc992..530c0004 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -599,9 +599,10 @@ avx2_t4_objs = \ $(OBJ_DIR)\sha512_x1_ni_avx2.obj \ $(OBJ_DIR)\sha512_x2_ni_avx2.obj \ $(OBJ_DIR)\sha_ni_avx2.obj \ - $(OBJ_DIR)\sha512_hmac_ni_avx2.obj \ $(OBJ_DIR)\mb_mgr_hmac_sha512_submit_ni_avx2.obj \ - $(OBJ_DIR)\mb_mgr_hmac_sha512_flush_ni_avx2.obj + $(OBJ_DIR)\mb_mgr_hmac_sha512_flush_ni_avx2.obj \ + $(OBJ_DIR)\mb_mgr_hmac_sha384_submit_ni_avx2.obj \ + $(OBJ_DIR)\mb_mgr_hmac_sha384_flush_ni_avx2.obj !if "$(AESNI_EMU)" == "y" all_objs = $(lib_objs1) $(lib_objs2) $(gcm_objs) $(no_aesni_objs) -- GitLab From 698be713d480a909f925417efba93ce5fbc6c258 Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Fri, 19 Jul 2024 10:19:56 +0100 Subject: [PATCH 24/24] lib: [AES-CCM] add required endbranch 
instructions Signed-off-by: Marcel Cornu --- lib/avx512_t2/aes_cntr_ccm_api_by16_vaes_avx512.asm | 3 +++ .../mb_mgr_aes128_ccm_auth_submit_flush_x16_vaes_avx512.asm | 3 +++ lib/avx_t1/aes128_cntr_by8_avx.asm | 2 ++ lib/avx_t1/aes256_cntr_by8_avx.asm | 2 ++ lib/avx_t1/mb_mgr_aes128_ccm_auth_submit_flush_x8_avx.asm | 3 +++ lib/sse_t1/aes128_cntr_by8_sse.asm | 2 ++ lib/sse_t1/aes256_cntr_by8_sse.asm | 2 ++ lib/sse_t1/mb_mgr_aes128_ccm_auth_submit_flush_x4_sse.asm | 3 +++ 8 files changed, 20 insertions(+) diff --git a/lib/avx512_t2/aes_cntr_ccm_api_by16_vaes_avx512.asm b/lib/avx512_t2/aes_cntr_ccm_api_by16_vaes_avx512.asm index 5e81d4f9..175d76ce 100644 --- a/lib/avx512_t2/aes_cntr_ccm_api_by16_vaes_avx512.asm +++ b/lib/avx512_t2/aes_cntr_ccm_api_by16_vaes_avx512.asm @@ -28,12 +28,14 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %include "include/aes_cntr_by16_vaes_avx512.inc" +%include "include/cet.inc" ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;IMB_JOB * aes_cntr_ccm_128_vaes_avx512(IMB_JOB *job) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(aes_cntr_ccm_128_vaes_avx512,function,internal) aes_cntr_ccm_128_vaes_avx512: + endbranch64 FUNC_SAVE CNTR ;; arg1 - [in] job ;; arg2 - [in] NROUNDS @@ -48,6 +50,7 @@ aes_cntr_ccm_128_vaes_avx512: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; MKGLOBAL(aes_cntr_ccm_256_vaes_avx512,function,internal) aes_cntr_ccm_256_vaes_avx512: + endbranch64 FUNC_SAVE CNTR ;; arg1 - [in] job ;; arg2 - [in] NROUNDS diff --git a/lib/avx512_t2/mb_mgr_aes128_ccm_auth_submit_flush_x16_vaes_avx512.asm b/lib/avx512_t2/mb_mgr_aes128_ccm_auth_submit_flush_x16_vaes_avx512.asm index f04a6d81..1e30b69f 100644 --- a/lib/avx512_t2/mb_mgr_aes128_ccm_auth_submit_flush_x16_vaes_avx512.asm +++ b/lib/avx512_t2/mb_mgr_aes128_ccm_auth_submit_flush_x16_vaes_avx512.asm @@ -32,6 +32,7 @@ %include "include/const.inc" %include 
"include/memcpy.inc" %include "include/clear_regs.inc" +%include "include/cet.inc" %ifndef AES_CBC_MAC %define AES_CBC_MAC aes128_cbc_mac_vaes_avx512 @@ -726,12 +727,14 @@ align 64 ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal) SUBMIT_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT ; IMB_JOB * flush_job_aes128/256_ccm_auth_vaes_avx512(MB_MGR_CCM_OOO *state) ; arg 1 : state MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal) FLUSH_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH mksection stack-noexec diff --git a/lib/avx_t1/aes128_cntr_by8_avx.asm b/lib/avx_t1/aes128_cntr_by8_avx.asm index af1407bf..b0d80ace 100644 --- a/lib/avx_t1/aes128_cntr_by8_avx.asm +++ b/lib/avx_t1/aes128_cntr_by8_avx.asm @@ -31,6 +31,7 @@ %include "include/const.inc" %include "include/reg_sizes.inc" %include "include/clear_regs.inc" +%include "include/cet.inc" ; routine to do AES128 CNTR enc/decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level @@ -593,6 +594,7 @@ align 32 ; arg 1 : job MKGLOBAL(aes_cntr_ccm_128_avx,function,internal) aes_cntr_ccm_128_avx: + endbranch64 DO_CNTR CCM %else ;; aes_cntr_128_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, diff --git a/lib/avx_t1/aes256_cntr_by8_avx.asm b/lib/avx_t1/aes256_cntr_by8_avx.asm index 61502bdf..237f9c45 100644 --- a/lib/avx_t1/aes256_cntr_by8_avx.asm +++ b/lib/avx_t1/aes256_cntr_by8_avx.asm @@ -31,6 +31,7 @@ %include "include/const.inc" %include "include/reg_sizes.inc" %include "include/clear_regs.inc" +%include "include/cet.inc" ; routine to do AES256 CNTR enc/decrypt "by8" ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level @@ -587,6 +588,7 @@ align 32 ; arg 1 : job MKGLOBAL(aes_cntr_ccm_256_avx,function,internal) aes_cntr_ccm_256_avx: + endbranch64 DO_CNTR CCM %else ;; aes_cntr_256_avx(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, diff --git a/lib/avx_t1/mb_mgr_aes128_ccm_auth_submit_flush_x8_avx.asm b/lib/avx_t1/mb_mgr_aes128_ccm_auth_submit_flush_x8_avx.asm index 2f43a83f..33064e70 100644 --- a/lib/avx_t1/mb_mgr_aes128_ccm_auth_submit_flush_x8_avx.asm +++ b/lib/avx_t1/mb_mgr_aes128_ccm_auth_submit_flush_x8_avx.asm @@ -31,6 +31,7 @@ %include "include/reg_sizes.inc" %include "include/const.inc" %include "include/memcpy.inc" +%include "include/cet.inc" %ifndef AES_CBC_MAC @@ -595,12 +596,14 @@ align 64 ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal) SUBMIT_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX SUBMIT ; IMB_JOB * flush_job_aes128/256_ccm_auth_avx(MB_MGR_CCM_OOO *state) ; arg 1 : state MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal) FLUSH_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_AVX FLUSH mksection stack-noexec diff --git a/lib/sse_t1/aes128_cntr_by8_sse.asm b/lib/sse_t1/aes128_cntr_by8_sse.asm index 84879607..c680f4d0 100644 --- a/lib/sse_t1/aes128_cntr_by8_sse.asm +++ b/lib/sse_t1/aes128_cntr_by8_sse.asm @@ -31,6 +31,7 @@ %include "include/const.inc" %include "include/reg_sizes.inc" %include "include/clear_regs.inc" +%include "include/cet.inc" ; routine to do AES128 CNTR enc/decrypt "by8" ; XMM registers are clobbered. 
Saving/restoring must be done at a higher level @@ -566,6 +567,7 @@ align 32 align 32 MKGLOBAL(AES_CNTR_CCM_128,function,internal) AES_CNTR_CCM_128: + endbranch64 DO_CNTR CCM %else ;; aes_cntr_128_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) diff --git a/lib/sse_t1/aes256_cntr_by8_sse.asm b/lib/sse_t1/aes256_cntr_by8_sse.asm index f3e304bb..5c346438 100644 --- a/lib/sse_t1/aes256_cntr_by8_sse.asm +++ b/lib/sse_t1/aes256_cntr_by8_sse.asm @@ -31,6 +31,7 @@ %include "include/const.inc" %include "include/reg_sizes.inc" %include "include/clear_regs.inc" +%include "include/cet.inc" ; routine to do AES256 CNTR enc/decrypt "by8" ; XMM registers are clobbered. Saving/restoring must be done at a higher level @@ -594,6 +595,7 @@ align 32 align 32 MKGLOBAL(AES_CNTR_CCM_256,function,internal) AES_CNTR_CCM_256: + endbranch64 DO_CNTR CCM %else ;; aes_cntr_256_sse(void *in, void *IV, void *keys, void *out, UINT64 num_bytes, UINT64 iv_len) diff --git a/lib/sse_t1/mb_mgr_aes128_ccm_auth_submit_flush_x4_sse.asm b/lib/sse_t1/mb_mgr_aes128_ccm_auth_submit_flush_x4_sse.asm index 638f50b8..970bf332 100644 --- a/lib/sse_t1/mb_mgr_aes128_ccm_auth_submit_flush_x4_sse.asm +++ b/lib/sse_t1/mb_mgr_aes128_ccm_auth_submit_flush_x4_sse.asm @@ -31,6 +31,7 @@ %include "include/reg_sizes.inc" %include "include/const.inc" %include "include/memcpy.inc" +%include "include/cet.inc" %ifndef NUM_LANES %define NUM_LANES 4 @@ -614,12 +615,14 @@ align 64 ; arg 2 : job MKGLOBAL(SUBMIT_JOB_AES_CCM_AUTH,function,internal) SUBMIT_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE SUBMIT ; IMB_JOB * flush_job_aes_ccm_auth_sse(MB_MGR_CCM_OOO *state) ; arg 1 : state MKGLOBAL(FLUSH_JOB_AES_CCM_AUTH,function,internal) FLUSH_JOB_AES_CCM_AUTH: + endbranch64 GENERIC_SUBMIT_FLUSH_JOB_AES_CCM_AUTH_SSE FLUSH mksection stack-noexec -- GitLab