diff --git a/INSTALL.md b/INSTALL.md
index 7ba1cb2a7651eca83ec5e3da4295743aa6832291..9a35f18e01273d67d2b85b40e2de4af63042c67a 100644
--- a/INSTALL.md
+++ b/INSTALL.md
@@ -133,7 +133,7 @@ For more build options and their explanation run:
 
 ### Building with CMake (experimental)
 
-Minimum CMake version: 3.16
+Minimum CMake version: 3.18
 
 Create build directory:
 ```
diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt
index 7fb957c2841ea5de4074950155715ac5bd269905..e28100959d34497d4bfafcfd59d553c094eb6d21 100644
--- a/lib/CMakeLists.txt
+++ b/lib/CMakeLists.txt
@@ -49,12 +49,13 @@ endif()
 set(NASM_VERSION_REQUIRED "2.14")
 set(NASM_VERSION_AVX_IFMA "2.16")
+set(NASM_VERSION_SMX_NI "2.16.02")
 
 execute_process(
   COMMAND ${CMAKE_ASM_NASM_COMPILER} -v
   OUTPUT_VARIABLE NASM_VERSION_OUTPUT
   OUTPUT_STRIP_TRAILING_WHITESPACE)
 
-string(REGEX MATCH "NASM version ([0-9]*.[0-9]*)" NASM_VERSION
+string(REGEX MATCH "NASM version ([0-9]*.[0-9]*.[0-9]*)" NASM_VERSION
       "${NASM_VERSION_OUTPUT}")
 if(NASM_VERSION)
   if(NASM_VERSION_REQUIRED VERSION_GREATER ${CMAKE_MATCH_1})
@@ -64,13 +65,22 @@ if(NASM_VERSION)
   message(STATUS "NASM version: ${CMAKE_MATCH_1}")
   if(NASM_VERSION_AVX_IFMA VERSION_GREATER ${CMAKE_MATCH_1})
     message(
-      WARNING
+      NOTICE
       "Minimum required NASM version for AVX-IFMA: ${NASM_VERSION_AVX_IFMA}. AVX-IFMA code not compiled - update NASM."
     )
   else()
     # AVX IFMA supported by NASM
     set(AVX_IFMA 1)
   endif()
+  if(NASM_VERSION_SMX_NI VERSION_GREATER ${CMAKE_MATCH_1})
+    message(
+      NOTICE
+      "Minimum required NASM version for SM3/SM4/SHA512-NI: ${NASM_VERSION_SMX_NI}. SM3/SM4/SHA512-NI code not compiled - update NASM."
+    )
+  else()
+    # SM3/SM4/SHA512-NI supported by NASM
+    set(SMX_NI 1)
+  endif()
 else()
   message(
     WARNING
@@ -105,7 +115,9 @@ else()
     set(DIR_AVX2_T3 ${CMAKE_CURRENT_SOURCE_DIR}/avx2_t3)
   endif()
 endif()
-
+if(SMX_NI)
+  set(DIR_AVX2_T4 ${CMAKE_CURRENT_SOURCE_DIR}/avx2_t4)
+endif()
 ########################################
 # create list of all source directories
 ########################################
@@ -121,6 +133,7 @@ else()
       ${DIR_AVX_T2}
       ${DIR_AVX2_T1}
       ${DIR_AVX2_T2}
+      ${DIR_AVX2_T4}
       ${DIR_AVX512_T1}
       ${DIR_AVX512_T2}
       ${DIR_SSE_T1}
@@ -157,6 +170,7 @@ else()
   file(GLOB SRC_FILES_AVX2_T1 "${DIR_AVX2_T1}/*.c")
   file(GLOB SRC_FILES_AVX2_T2 "${DIR_AVX2_T2}/*.c")
   file(GLOB SRC_FILES_AVX2_T3 "${DIR_AVX2_T3}/*.c")
+  file(GLOB SRC_FILES_AVX2_T4 "${DIR_AVX2_T4}/*.c")
   file(GLOB SRC_FILES_AVX512_T1 "${DIR_AVX512_T1}/*.c")
   file(GLOB SRC_FILES_AVX512_T2 "${DIR_AVX512_T2}/*.c")
   file(GLOB SRC_FILES_SSE_T1 "${DIR_SSE_T1}/*.c")
@@ -192,8 +206,12 @@ else()
     if(AVX_IFMA)
         list(APPEND SRC_FILES_C ${SRC_FILES_AVX2_T3})
     endif()
+    if(SMX_NI)
+        list(APPEND SRC_FILES_C ${SRC_FILES_AVX2_T4})
+    endif()
 endif()
+
 list(SORT SRC_FILES_C)
 
 ########################################
diff --git a/lib/Makefile b/lib/Makefile
index c3c64b62bbba19b4bd438cc37c6e3aaa53bfdb80..bfe8cb627417f195aab805d4853c3eb34375fbb7 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -76,14 +76,18 @@ NASM_VERSION = $(shell $(NASM) -v | cut -d " " -f 3)
 NASM_MAJOR_REQ = 2
 NASM_MINOR_REQ = 14
-NASM_MIN_REQ = $(shell expr $(NASM_MAJOR_REQ) $(MULT) 100 + $(NASM_MINOR_REQ) )
+NASM_MIN_REQ = $(shell expr $(NASM_MAJOR_REQ) $(MULT) 10000 + $(NASM_MINOR_REQ) $(MULT) 100)
 
 ifeq ($(NASM_VERSION),)
 $(error "NASM is not installed! 
Minimum required version: $(NASM_MAJOR_REQ).$(NASM_MINOR_REQ)") else NASM_MAJOR_VER = $(shell echo $(NASM_VERSION) | cut -d "." -f 1) NASM_MINOR_VER = $(shell echo $(NASM_VERSION) | cut -d "." -f 2 | cut -c 1-2) -NASM_VER = $(shell expr $(NASM_MAJOR_VER) $(MULT) 100 + $(NASM_MINOR_VER) ) +NASM_REV_VER = $(shell echo $(NASM_VERSION) | cut -d "." -f 3 | cut -c 1-2) +ifeq ($(NASM_REV_VER),) +NASM_REV_VER = 0 +endif +NASM_VER = $(shell expr $(NASM_MAJOR_VER) $(MULT) 10000 + $(NASM_MINOR_VER) $(MULT) 100 + $(NASM_REV_VER)) NASM_GE_REQ = $(shell [ $(NASM_VER) -ge $(NASM_MIN_REQ) ] && echo true) @@ -98,7 +102,7 @@ endif # NASM_VERSION AVX_IFMA := y NASM_MAJOR_AVX_IFMA = 2 NASM_MINOR_AVX_IFMA = 16 -NASM_MIN_IFMA_REQ = $(shell expr $(NASM_MAJOR_AVX_IFMA) $(MULT) 100 + $(NASM_MINOR_AVX_IFMA) ) +NASM_MIN_IFMA_REQ = $(shell expr $(NASM_MAJOR_AVX_IFMA) $(MULT) 10000 + $(NASM_MINOR_AVX_IFMA) $(MULT) 100) NASM_IFMA_GE_REQ = $(shell [ $(NASM_VER) -ge $(NASM_MIN_IFMA_REQ) ] && echo true) ifneq ($(NASM_IFMA_GE_REQ),true) @@ -107,6 +111,19 @@ AVX_IFMA := n endif # NASM_AVX_IFMA_GET_REQ endif # x86_64 +# Minimum version of NASM with SM3/SM4/SHA512-NI support: 2.16.02 +SMX_NI := y +NASM_MAJOR_SMX_NI = 2 +NASM_MINOR_SMX_NI = 16 +NASM_REV_SMX_NI = 02 +NASM_MIN_SMX_REQ = $(shell expr $(NASM_MAJOR_SMX_NI) $(MULT) 10000 + $(NASM_MINOR_SMX_NI) $(MULT) 100 + $(NASM_REV_SMX_NI)) + +NASM_SMX_GE_REQ = $(shell [ $(NASM_VER) -ge $(NASM_MIN_SMX_REQ) ] && echo true) +ifneq ($(NASM_SMX_GE_REQ),true) +$(warning Minimum required NASM version for SM3/SM4/SHA512-NI: $(NASM_MAJOR_SMX_NI).$(NASM_MINOR_SMX_NI).$(NASM_REV_SMX_NI) SM3/SM4/SHA512-NI code not compiled - update NASM.) +SMX_NI := n +endif # NASM_SMX_NI_GET_REQ + INCLUDE_DIRS := include . no-aesni INCLUDES := $(foreach i,$(INCLUDE_DIRS),-I $i) @@ -160,6 +177,10 @@ ifeq ($(AVX_IFMA), y) CFLAGS += -DAVX_IFMA endif +ifeq ($(SMX_NI), y) +CFLAGS += -DSMX_NI +endif + ASM_INCLUDE_DIRS := . 
NASM_INCLUDES := $(foreach i,$(ASM_INCLUDE_DIRS),-I$i/) @@ -387,6 +408,7 @@ c_lib_objs := \ sha_avx.o \ sha_avx2.o \ sha_avx512.o \ + sha_ni_avx2.o \ sha_mb_avx.o \ sha_mb_avx2.o \ sha_mb_avx512.o \ @@ -431,6 +453,12 @@ ifeq ($(AVX_IFMA), y) c_lib_objs := $(c_lib_objs) \ mb_mgr_avx2_t3.o endif + +ifeq ($(SMX_NI), y) +c_lib_objs := $(c_lib_objs) \ + mb_mgr_avx2_t4.o +endif + # # List of ASM modules (root directory/common) # @@ -451,8 +479,7 @@ asm_generic_lib_objs := \ poly1305.o \ chacha20_poly1305.o \ mbcpuid.o \ - atomic.o \ - sm3.o + atomic.o # # List of ASM modules (no-aesni directory) @@ -619,7 +646,12 @@ asm_sse_lib_objs := \ memcpy_sse.o \ snow_v_sse.o \ snow3g_uia2_by4_sse.o \ - sm4_sse.o + sm4_sse.o \ + sm3_base_init_sse.o \ + sm3_base_update_sse.o \ + sm3_base_one_block_sse.o \ + sm3_base_msg_sse.o \ + sm3_base_hmac_sse.o # # List of ASM modules (avx directory) @@ -736,6 +768,17 @@ asm_avx2_lib_objs := \ asm_avx2_ifma_lib_objs := \ poly_fma_avx2.o +# +# List of ASM modules (avx2_t4 directory) +# +asm_avx2_t4_lib_objs := \ + sm4_ni_avx2.o \ + sm3_ni_x1_avx2.o \ + sm3_msg_avx2.o \ + sm3_hmac_avx2.o \ + sha512_x1_ni_avx2.o \ + sha512_hmac_ni_avx2.o + # # List of ASM modules (avx512 directory) # @@ -853,6 +896,9 @@ endif ifeq ($(AVX_IFMA), y) asm_obj_files := $(asm_obj_files) $(asm_avx2_ifma_lib_objs) endif +ifeq ($(SMX_NI), y) +asm_obj_files := $(asm_obj_files) $(asm_avx2_t4_lib_objs) +endif c_obj_files := $(c_lib_objs) $(c_gcm_objs) endif # aarch64 @@ -883,16 +929,25 @@ all: $(LIB_DIR)/$(LIBNAME) STR_FILTER = "" ifneq ($(AESNI_EMU),y) -ifneq ($(AVX_IFMA),y) -STR_FILTER = "_no_aesni\|_avx2_t3" -else + +ifneq ($(AVX_IFMA),y) # No AESNI_EMU, AVX2 Type 3 and Type 4 +STR_FILTER = "_no_aesni\|_avx2_t3\|_avx2_t4" +else # AVX_IFMA +ifneq ($(SMX_NI),y) # No AESNI_EMU and AVX2 Type4 +STR_FILTER = "_no_aesni\|_avx2_t4" +else # No AESNI EMU only STR_FILTER = "_no_aesni" -endif -else -ifneq ($(AVX_IFMA),y) -STR_FILTER = "_avx2_t3" -endif -endif +endif # SMX_NI +endif # AVX_IFMA + +else # AESNI_EMU = y +ifneq ($(AVX_IFMA),y) # No AVX2 Type 3 and Type 4 +STR_FILTER = "_avx2_t3\|_avx2_t4" +else # No AVX2 Type 4 +STR_FILTER = "_avx2_t4" +endif # AVX_IFMA + +endif # AESNI_EMU $(LIB)_lnk.def: $(LIB).def ifneq ($(STR_FILTER), "") @@ -1036,6 +1091,16 @@ $(OBJ_DIR)/%.o:avx2_t3/%.c $(OBJ_DIR)/%.o:avx2_t3/%.asm $(NASM) -MD $(@:.o=.d) -MT $@ -o $@ $(NASM_FLAGS) $< +$(OBJ_DIR)/%.o:avx2_t4/%.c + $(CC) -MMD $(OPT_AVX2) -c $(CFLAGS) $< -o $@ + +$(OBJ_DIR)/%.o:avx2_t4/%.asm +ifeq ($(USE_YASM),y) + $(YASM) $(YASM_FLAGS) $< -o $@ +else + $(NASM) -MD $(@:.o=.d) -MT $@ -o $@ $(NASM_FLAGS) $< +endif + $(OBJ_DIR)/%.o:avx512_t1/%.c $(CC) -MMD $(OPT_AVX512) -c $(CFLAGS) $< -o $@ diff --git a/lib/aarch64/zuc_common.inc b/lib/aarch64/zuc_common.inc index 6fb5b97e2ce9d9a0550fa8a9776bf59e43af9006..c16ba45ebbed4078d41c76680b9d75ead0a1542a 100644 --- a/lib/aarch64/zuc_common.inc +++ b/lib/aarch64/zuc_common.inc @@ -273,10 +273,10 @@ declare_register pD, x22 lsr xTMP, xTMP, #31 add xW, xW, xTMP - mov xTMP, xW - mov xTMP1, 0x7FFFFFFF - subs xTMP, xTMP, xTMP1 - csel xW, xTMP, xW, cs + mov xTMP, xW + and xW, xW, #0x7FFFFFFF + lsr xTMP, xTMP, #31 + add xW, xW, xTMP str wW, [pState, ((0 + \N) % 16)*4] .endm diff --git a/lib/avx2_t1/mb_mgr_avx2.c b/lib/avx2_t1/mb_mgr_avx2.c index 28508d51056ddd6e209842ae05216e3d1ff7e475..7e7c291da50f0d17f25a38ced0a3b396578ebd5f 100644 --- a/lib/avx2_t1/mb_mgr_avx2.c +++ b/lib/avx2_t1/mb_mgr_avx2.c @@ -52,6 +52,12 @@ init_mb_mgr_avx2_internal(IMB_MGR *state, const int reset_mgrs) state->features = 
cpu_feature_adjust(state->flags, cpu_feature_detect()); +#ifdef SMX_NI + if ((state->features & IMB_CPUFLAGS_AVX2_T4) == IMB_CPUFLAGS_AVX2_T4) { + init_mb_mgr_avx2_t4_internal(state, reset_mgrs); + return; + } +#endif #ifdef AVX_IFMA if ((state->features & IMB_CPUFLAGS_AVX2_T3) == IMB_CPUFLAGS_AVX2_T3) { init_mb_mgr_avx2_t3_internal(state, reset_mgrs); diff --git a/lib/avx2_t1/mb_mgr_avx2_t1.c b/lib/avx2_t1/mb_mgr_avx2_t1.c index b6294df32a84d9610c12f7ddd864a35aa94b4ff1..58d6b08fb94f032ed0964b522f672409c17fbeff 100644 --- a/lib/avx2_t1/mb_mgr_avx2_t1.c +++ b/lib/avx2_t1/mb_mgr_avx2_t1.c @@ -250,6 +250,12 @@ flush_snow3g_uea2_job_avx2_t1(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/avx2_t2/mb_mgr_avx2_t2.c b/lib/avx2_t2/mb_mgr_avx2_t2.c index fa4d7821f13a2696607225b1f404214ddfe92fa4..5ccb6be4bc0978e65ae74652e343e98d15b4b5c0 100644 --- a/lib/avx2_t2/mb_mgr_avx2_t2.c +++ b/lib/avx2_t2/mb_mgr_avx2_t2.c @@ -253,6 +253,12 @@ flush_snow3g_uea2_job_avx2_t2(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/avx2_t3/mb_mgr_avx2_t3.c b/lib/avx2_t3/mb_mgr_avx2_t3.c index 826573e598f7b4bea58895bca485a70c13be0e09..884856bcb6a447fa8912d366f3cbe6e3fd3fe1bb 100644 --- a/lib/avx2_t3/mb_mgr_avx2_t3.c +++ b/lib/avx2_t3/mb_mgr_avx2_t3.c @@ -253,6 +253,12 @@ flush_snow3g_uea2_job_avx2_t2(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/avx2_t4/README b/lib/avx2_t4/README new file mode 100644 index 0000000000000000000000000000000000000000..bb3a2e767b84f1963f174b06549b40243ce76e64 --- /dev/null +++ b/lib/avx2_t4/README @@ -0,0 +1,3 @@ +AVX2 TYPE4: +- AVX2 TYPE3: AVX2, BMI2, AESNI, PCLMULQDQ, CMOV, VAES, VPCLMULQDQ, SHANI, GFNI, AVXIFMA +- SM3NI, SM4NI, SHA512NI diff --git a/lib/avx2_t4/mb_mgr_avx2_t4.c b/lib/avx2_t4/mb_mgr_avx2_t4.c new file mode 100644 index 0000000000000000000000000000000000000000..16cd470fd9a67714eea080ca79ca749939a97d83 --- /dev/null +++ b/lib/avx2_t4/mb_mgr_avx2_t4.c @@ -0,0 +1,509 @@ +/******************************************************************************* + Copyright (c) 2023-2024, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+
+#include <stdint.h>
+#include <string.h>
+#include <stdio.h>
+
+#define AVX2
+
+#include "ipsec-mb.h"
+#include "include/ipsec_ooo_mgr.h"
+#include "include/kasumi_interface.h"
+#include "include/zuc_internal.h"
+#include "include/snow3g.h"
+#include "include/snow3g_submit.h"
+#include "include/gcm.h"
+#include "include/chacha20_poly1305.h"
+
+#include "include/save_xmms.h"
+#include "include/des.h"
+#include "include/cpu_feature.h"
+#include "include/noaesni.h"
+#include "include/aesni_emu.h"
+#include "include/error.h"
+
+#include "include/arch_sse_type1.h" /* poly1305, snow3g */
+#include "include/arch_sse_type2.h" /* shani */
+#include "include/arch_avx_type1.h"
+#include "include/arch_avx2_type1.h"
+#include "include/arch_avx2_type2.h"
+#include "include/arch_avx2_type3.h"
+#include "include/arch_avx2_type4.h"
+
+#include "include/ooo_mgr_reset.h"
+
+#define SAVE_XMMS save_xmms_avx
+#define RESTORE_XMMS restore_xmms_avx
+
+/* JOB API */
+#define SUBMIT_JOB submit_job_avx2_t4
+#define FLUSH_JOB flush_job_avx2_t4
+#define QUEUE_SIZE queue_size_avx2_t4
+#define SUBMIT_JOB_NOCHECK submit_job_nocheck_avx2_t4
+#define GET_NEXT_JOB get_next_job_avx2_t4
+#define GET_COMPLETED_JOB get_completed_job_avx2_t4
+#define GET_NEXT_BURST get_next_burst_avx2_t4
+#define SUBMIT_BURST submit_burst_avx2_t4
+#define SUBMIT_BURST_NOCHECK submit_burst_nocheck_avx2_t4
+#define FLUSH_BURST flush_burst_avx2_t4
+#define SUBMIT_CIPHER_BURST submit_cipher_burst_avx2_t4
+#define SUBMIT_CIPHER_BURST_NOCHECK submit_cipher_burst_nocheck_avx2_t4
+#define SUBMIT_HASH_BURST submit_hash_burst_avx2_t4
+#define SUBMIT_HASH_BURST_NOCHECK submit_hash_burst_nocheck_avx2_t4
+#define SET_SUITE_ID_FN set_suite_id_avx2_t4
+
+/* Hash */
+#define SUBMIT_JOB_HASH SUBMIT_JOB_HASH_AVX2
+#define FLUSH_JOB_HASH FLUSH_JOB_HASH_AVX2
+
+/* Cipher encrypt / decrypt */
+#define SUBMIT_JOB_CIPHER_ENC SUBMIT_JOB_CIPHER_ENC_AVX2
+#define FLUSH_JOB_CIPHER_ENC FLUSH_JOB_CIPHER_ENC_AVX2
+#define SUBMIT_JOB_CIPHER_DEC SUBMIT_JOB_CIPHER_DEC_AVX2
+
+/* AES-GCM */
+#define AES_GCM_DEC_IV_128 aes_gcm_dec_var_iv_128_vaes_avx2
+#define AES_GCM_ENC_IV_128 aes_gcm_enc_var_iv_128_vaes_avx2
+#define AES_GCM_DEC_IV_192 aes_gcm_dec_var_iv_192_vaes_avx2
+#define AES_GCM_ENC_IV_192 aes_gcm_enc_var_iv_192_vaes_avx2
+#define AES_GCM_DEC_IV_256 aes_gcm_dec_var_iv_256_vaes_avx2
+#define AES_GCM_ENC_IV_256 aes_gcm_enc_var_iv_256_vaes_avx2
+
+#define SUBMIT_JOB_AES_GCM_DEC submit_job_gcm_dec_avx2
+#define SUBMIT_JOB_AES_GCM_ENC submit_job_gcm_enc_avx2
+
+/* AES-CBC */
+#define SUBMIT_JOB_AES_CBC_128_ENC submit_job_aes128_enc_avx
+#define SUBMIT_JOB_AES_CBC_128_DEC submit_job_aes128_dec_avx
+#define FLUSH_JOB_AES_CBC_128_ENC flush_job_aes128_enc_avx
+
+#define SUBMIT_JOB_AES_CBC_192_ENC submit_job_aes192_enc_avx +#define SUBMIT_JOB_AES_CBC_192_DEC submit_job_aes192_dec_avx +#define FLUSH_JOB_AES_CBC_192_ENC flush_job_aes192_enc_avx + +#define SUBMIT_JOB_AES_CBC_256_ENC submit_job_aes256_enc_avx +#define SUBMIT_JOB_AES_CBC_256_DEC submit_job_aes256_dec_avx +#define FLUSH_JOB_AES_CBC_256_ENC flush_job_aes256_enc_avx + +#define AES_CBC_DEC_128 aes_cbc_dec_128_avx +#define AES_CBC_DEC_192 aes_cbc_dec_192_avx +#define AES_CBC_DEC_256 aes_cbc_dec_256_avx + +/* AES-CBCS */ +#define SUBMIT_JOB_AES128_CBCS_1_9_ENC submit_job_aes128_cbcs_1_9_enc_avx +#define FLUSH_JOB_AES128_CBCS_1_9_ENC flush_job_aes128_cbcs_1_9_enc_avx +#define SUBMIT_JOB_AES128_CBCS_1_9_DEC submit_job_aes128_cbcs_1_9_dec_avx +#define AES_CBCS_1_9_DEC_128 aes_cbcs_1_9_dec_128_avx + +/* AES-ECB */ +#define SUBMIT_JOB_AES_ECB_128_ENC submit_job_aes_ecb_128_enc_vaes_avx2 +#define SUBMIT_JOB_AES_ECB_128_DEC submit_job_aes_ecb_128_dec_vaes_avx2 +#define SUBMIT_JOB_AES_ECB_192_ENC submit_job_aes_ecb_192_enc_vaes_avx2 +#define SUBMIT_JOB_AES_ECB_192_DEC submit_job_aes_ecb_192_dec_vaes_avx2 +#define SUBMIT_JOB_AES_ECB_256_ENC submit_job_aes_ecb_256_enc_vaes_avx2 +#define SUBMIT_JOB_AES_ECB_256_DEC submit_job_aes_ecb_256_dec_vaes_avx2 + +#define AES_ECB_ENC_128 aes_ecb_enc_128_vaes_avx2 +#define AES_ECB_ENC_192 aes_ecb_enc_192_vaes_avx2 +#define AES_ECB_ENC_256 aes_ecb_enc_256_vaes_avx2 +#define AES_ECB_DEC_128 aes_ecb_dec_128_vaes_avx2 +#define AES_ECB_DEC_192 aes_ecb_dec_192_vaes_avx2 +#define AES_ECB_DEC_256 aes_ecb_dec_256_vaes_avx2 + +/* AES-CTR */ +#define AES_CTR_128 aes_cntr_128_vaes_avx2 +#define AES_CTR_192 aes_cntr_192_vaes_avx2 +#define AES_CTR_256 aes_cntr_256_vaes_avx2 +#define AES_CTR_128_BIT aes_cntr_bit_128_avx +#define AES_CTR_192_BIT aes_cntr_bit_192_avx +#define AES_CTR_256_BIT aes_cntr_bit_256_avx + +/* AES-CCM */ +#define AES_CNTR_CCM_128 aes_cntr_ccm_128_avx +#define AES_CNTR_CCM_256 aes_cntr_ccm_256_avx + +#define FLUSH_JOB_AES128_CCM_AUTH flush_job_aes128_ccm_auth_avx +#define SUBMIT_JOB_AES128_CCM_AUTH submit_job_aes128_ccm_auth_avx + +#define FLUSH_JOB_AES256_CCM_AUTH flush_job_aes256_ccm_auth_avx +#define SUBMIT_JOB_AES256_CCM_AUTH submit_job_aes256_ccm_auth_avx + +/* AES-CMAC */ +#define FLUSH_JOB_AES128_CMAC_AUTH flush_job_aes128_cmac_auth_avx +#define SUBMIT_JOB_AES128_CMAC_AUTH submit_job_aes128_cmac_auth_avx + +#define FLUSH_JOB_AES256_CMAC_AUTH flush_job_aes256_cmac_auth_avx +#define SUBMIT_JOB_AES256_CMAC_AUTH submit_job_aes256_cmac_auth_avx + +/* AES-CFB */ +#define AES_CFB_128_ONE aes_cfb_128_one_avx2 +#define AES_CFB_256_ONE aes_cfb_256_one_avx2 + +/* AES-XCBC */ +#define SUBMIT_JOB_AES_XCBC submit_job_aes_xcbc_avx +#define FLUSH_JOB_AES_XCBC flush_job_aes_xcbc_avx + +/* PON */ +#define SUBMIT_JOB_PON_ENC submit_job_pon_enc_avx +#define SUBMIT_JOB_PON_DEC submit_job_pon_dec_avx +#define SUBMIT_JOB_PON_ENC_NO_CTR submit_job_pon_enc_no_ctr_avx +#define SUBMIT_JOB_PON_DEC_NO_CTR submit_job_pon_dec_no_ctr_avx + +/* SHA1/224/256/384/512 */ +#define SUBMIT_JOB_SHA1 submit_job_sha1_ni_sse +#define FLUSH_JOB_SHA1 flush_job_sha1_ni_sse +#define SUBMIT_JOB_SHA224 submit_job_sha224_ni_sse +#define FLUSH_JOB_SHA224 flush_job_sha224_ni_sse +#define SUBMIT_JOB_SHA256 submit_job_sha256_ni_sse +#define FLUSH_JOB_SHA256 flush_job_sha256_ni_sse +#define SUBMIT_JOB_SHA384 submit_job_sha384_ni_avx2 +#define FLUSH_JOB_SHA384 flush_job_sha384_ni_avx2 +#define SUBMIT_JOB_SHA512 submit_job_sha512_ni_avx2 +#define FLUSH_JOB_SHA512 flush_job_sha512_ni_avx2 + +/* 
HMAC-SHA1/224/256/384/512 */ +#define SUBMIT_JOB_HMAC submit_job_hmac_ni_sse +#define FLUSH_JOB_HMAC flush_job_hmac_ni_sse +#define SUBMIT_JOB_HMAC_SHA_224 submit_job_hmac_sha_224_ni_sse +#define FLUSH_JOB_HMAC_SHA_224 flush_job_hmac_sha_224_ni_sse +#define SUBMIT_JOB_HMAC_SHA_256 submit_job_hmac_sha_256_ni_sse +#define FLUSH_JOB_HMAC_SHA_256 flush_job_hmac_sha_256_ni_sse +#define SUBMIT_JOB_HMAC_SHA_384 submit_job_hmac_sha_384_ni_avx2 +#define FLUSH_JOB_HMAC_SHA_384 flush_job_hmac_sha_384_ni_avx2 +#define SUBMIT_JOB_HMAC_SHA_512 submit_job_hmac_sha_512_ni_avx2 +#define FLUSH_JOB_HMAC_SHA_512 flush_job_hmac_sha_512_ni_avx2 +#define SUBMIT_JOB_HMAC_MD5 submit_job_hmac_md5_avx2 +#define FLUSH_JOB_HMAC_MD5 flush_job_hmac_md5_avx2 + +/* CHACHA20 & POLY1305 */ +#define SUBMIT_JOB_CHACHA20_ENC_DEC submit_job_chacha20_enc_dec_avx2 +#define SUBMIT_JOB_CHACHA20_POLY1305 aead_chacha20_poly1305_avx2 +#define SUBMIT_JOB_CHACHA20_POLY1305_SGL aead_chacha20_poly1305_sgl_avx2 +#define POLY1305_MAC poly1305_mac_fma_avx2 + +/* ZUC EEA3 & EIA3 */ +#define SUBMIT_JOB_ZUC_EEA3 submit_job_zuc_eea3_gfni_avx2 +#define FLUSH_JOB_ZUC_EEA3 flush_job_zuc_eea3_gfni_avx2 +#define SUBMIT_JOB_ZUC_EIA3 submit_job_zuc_eia3_gfni_avx2 +#define FLUSH_JOB_ZUC_EIA3 flush_job_zuc_eia3_gfni_avx2 +#define SUBMIT_JOB_ZUC256_EEA3 submit_job_zuc256_eea3_gfni_avx2 +#define FLUSH_JOB_ZUC256_EEA3 flush_job_zuc256_eea3_gfni_avx2 +#define SUBMIT_JOB_ZUC256_EIA3 submit_job_zuc256_eia3_gfni_avx2 +#define FLUSH_JOB_ZUC256_EIA3 flush_job_zuc256_eia3_gfni_avx2 + +/* SNOW-V */ +#define SUBMIT_JOB_SNOW_V snow_v_avx +#define SUBMIT_JOB_SNOW_V_AEAD snow_v_aead_init_avx + +/* SNOW3G UEA2 & UIA2 */ +static IMB_JOB * +submit_snow3g_uea2_job_avx2_t2(IMB_MGR *state, IMB_JOB *job) +{ + MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; + + if ((job->msg_len_to_cipher_in_bits & 7) || (job->cipher_start_offset_in_bits & 7)) + return def_submit_snow3g_uea2_job(state, job); + + return submit_job_snow3g_uea2_sse(snow3g_uea2_ooo, job); +} + +static IMB_JOB * +flush_snow3g_uea2_job_avx2_t2(IMB_MGR *state) +{ + MB_MGR_SNOW3G_OOO *snow3g_uea2_ooo = state->snow3g_uea2_ooo; + + return flush_job_snow3g_uea2_sse(snow3g_uea2_ooo); +} + +#define SUBMIT_JOB_SNOW3G_UEA2 submit_snow3g_uea2_job_avx2_t2 +#define FLUSH_JOB_SNOW3G_UEA2 flush_snow3g_uea2_job_avx2_t2 + +#define SUBMIT_JOB_SNOW3G_UIA2 submit_job_snow3g_uia2_sse +#define FLUSH_JOB_SNOW3G_UIA2 flush_job_snow3g_uia2_sse + +/* AES-DOCSIS */ +#define ETHERNET_FCS ethernet_fcs_avx_local + +/* SM4 */ +#define SM4_ECB sm4_ecb_ni_avx2 +#define SM4_CBC_ENC sm4_cbc_enc_sse +#define SM4_CBC_DEC sm4_cbc_dec_sse + +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_ni_avx2 +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_ni_avx2 +#define FLUSH_JOB_HMAC_SM3 unused + +static void +reset_ooo_mgrs(IMB_MGR *state) +{ + /* Init AES out-of-order fields */ + ooo_mgr_aes_reset(state->aes128_ooo, 8); + ooo_mgr_aes_reset(state->aes192_ooo, 8); + ooo_mgr_aes_reset(state->aes256_ooo, 8); + + /* DOCSIS SEC BPI (AES CBC + AES CFB for partial block) + * uses same settings as AES CBC. 
+ */ + ooo_mgr_docsis_aes_reset(state->docsis128_sec_ooo, 8); + ooo_mgr_docsis_aes_reset(state->docsis128_crc32_sec_ooo, 8); + ooo_mgr_docsis_aes_reset(state->docsis256_sec_ooo, 8); + ooo_mgr_docsis_aes_reset(state->docsis256_crc32_sec_ooo, 8); + + /* Init ZUC out-of-order fields */ + ooo_mgr_zuc_reset(state->zuc_eea3_ooo, 8); + ooo_mgr_zuc_reset(state->zuc_eia3_ooo, 8); + ooo_mgr_zuc_reset(state->zuc256_eea3_ooo, 8); + ooo_mgr_zuc_reset(state->zuc256_eia3_ooo, 8); + ooo_mgr_zuc_reset(state->zuc256_eia3_8B_ooo, 8); + ooo_mgr_zuc_reset(state->zuc256_eia3_16B_ooo, 8); + + /* Init HMAC/SHA1 out-of-order fields */ + ooo_mgr_hmac_sha1_reset(state->hmac_sha_1_ooo, 2); + + /* Init HMAC/SHA224 out-of-order fields */ + ooo_mgr_hmac_sha224_reset(state->hmac_sha_224_ooo, 2); + + /* Init HMAC/SHA_256 out-of-order fields */ + ooo_mgr_hmac_sha256_reset(state->hmac_sha_256_ooo, 2); + + /* Init HMAC/SHA384 out-of-order fields */ + ooo_mgr_hmac_sha384_reset(state->hmac_sha_384_ooo, AVX2_NUM_SHA512_LANES); + + /* Init HMAC/SHA512 out-of-order fields */ + ooo_mgr_hmac_sha512_reset(state->hmac_sha_512_ooo, AVX2_NUM_SHA512_LANES); + + /* Init HMAC/MD5 out-of-order fields */ + ooo_mgr_hmac_md5_reset(state->hmac_md5_ooo, AVX2_NUM_MD5_LANES); + + /* Init AES/XCBC OOO fields */ + ooo_mgr_aes_xcbc_reset(state->aes_xcbc_ooo, 8); + + /* Init AES-CCM auth out-of-order fields */ + ooo_mgr_ccm_reset(state->aes_ccm_ooo, 8); + ooo_mgr_ccm_reset(state->aes256_ccm_ooo, 8); + + /* Init AES-CMAC auth out-of-order fields */ + ooo_mgr_cmac_reset(state->aes_cmac_ooo, 8); + ooo_mgr_cmac_reset(state->aes256_cmac_ooo, 8); + + /* Init AES CBC-S out-of-order fields */ + ooo_mgr_aes_reset(state->aes128_cbcs_ooo, 8); + + /* Init SHA1 out-of-order fields */ + ooo_mgr_sha1_reset(state->sha_1_ooo, AVX2_NUM_SHA1_LANES); + + /* Init SHA224 out-of-order fields */ + ooo_mgr_sha256_reset(state->sha_224_ooo, 2); + + /* Init SHA256 out-of-order fields */ + ooo_mgr_sha256_reset(state->sha_256_ooo, 2); + + /* Init SHA384 out-of-order fields */ + ooo_mgr_sha512_reset(state->sha_384_ooo, AVX2_NUM_SHA512_LANES); + + /* Init SHA512 out-of-order fields */ + ooo_mgr_sha512_reset(state->sha_512_ooo, AVX2_NUM_SHA512_LANES); + + /* Init SNOW3G-UEA out-of-order fields */ + ooo_mgr_snow3g_reset(state->snow3g_uea2_ooo, 4); + + /* Init SNOW3G-UIA out-of-order fields */ + ooo_mgr_snow3g_reset(state->snow3g_uia2_ooo, 4); +} + +IMB_DLL_LOCAL void +init_mb_mgr_avx2_t4_internal(IMB_MGR *state, const int reset_mgrs) +{ + /* Check if CPU flags needed for AVX2 interface are present */ + if ((state->features & IMB_CPUFLAGS_AVX2) != IMB_CPUFLAGS_AVX2) { + imb_set_errno(state, IMB_ERR_MISSING_CPUFLAGS_INIT_MGR); + return; + } + + /* Set architecture for future checks */ + state->used_arch = (uint32_t) IMB_ARCH_AVX2; + + if (reset_mgrs) { + reset_ooo_mgrs(state); + + /* Init "in order" components */ + state->next_job = 0; + state->earliest_job = -1; + } + + /* set handlers */ + state->get_next_job = GET_NEXT_JOB; + state->submit_job = SUBMIT_JOB; + state->submit_job_nocheck = SUBMIT_JOB_NOCHECK; + state->get_completed_job = GET_COMPLETED_JOB; + state->flush_job = FLUSH_JOB; + state->queue_size = QUEUE_SIZE; + state->get_next_burst = GET_NEXT_BURST; + state->submit_burst = SUBMIT_BURST; + state->submit_burst_nocheck = SUBMIT_BURST_NOCHECK; + state->flush_burst = FLUSH_BURST; + state->submit_cipher_burst = SUBMIT_CIPHER_BURST; + state->submit_cipher_burst_nocheck = SUBMIT_CIPHER_BURST_NOCHECK; + state->submit_hash_burst = SUBMIT_HASH_BURST; + 
state->submit_hash_burst_nocheck = SUBMIT_HASH_BURST_NOCHECK; + state->set_suite_id = SET_SUITE_ID_FN; + + state->keyexp_128 = aes_keyexp_128_avx2; + state->keyexp_192 = aes_keyexp_192_avx2; + state->keyexp_256 = aes_keyexp_256_avx2; + + state->cmac_subkey_gen_128 = aes_cmac_subkey_gen_avx2; + state->cmac_subkey_gen_256 = aes_cmac_256_subkey_gen_avx2; + + state->xcbc_keyexp = aes_xcbc_expand_key_avx2; + state->des_key_sched = des_key_schedule; + + state->sha1_one_block = sha1_one_block_sse_shani; + state->sha1 = sha1_sse_shani; + state->sha224_one_block = sha224_one_block_sse_shani; + state->sha224 = sha224_sse_shani; + state->sha256_one_block = sha256_one_block_sse_shani; + state->sha256 = sha256_sse_shani; + state->sha384_one_block = sha384_one_block_avx2; + state->sha384 = sha384_avx2; + state->sha512_one_block = sha512_one_block_avx2; + state->sha512 = sha512_avx2; + state->md5_one_block = md5_one_block_avx2; + + state->aes128_cfb_one = aes_cfb_128_one_avx2; + state->aes256_cfb_one = aes_cfb_256_one_avx2; + + state->eea3_1_buffer = zuc_eea3_1_buffer_avx2; + state->eea3_4_buffer = zuc_eea3_4_buffer_avx; + state->eea3_n_buffer = zuc_eea3_n_buffer_avx2; + state->eia3_1_buffer = zuc_eia3_1_buffer_avx2; + state->eia3_n_buffer = zuc_eia3_n_buffer_avx2; + + state->f8_1_buffer = kasumi_f8_1_buffer_avx; + state->f8_1_buffer_bit = kasumi_f8_1_buffer_bit_avx; + state->f8_2_buffer = kasumi_f8_2_buffer_avx; + state->f8_3_buffer = kasumi_f8_3_buffer_avx; + state->f8_4_buffer = kasumi_f8_4_buffer_avx; + state->f8_n_buffer = kasumi_f8_n_buffer_avx; + state->f9_1_buffer = kasumi_f9_1_buffer_avx; + state->f9_1_buffer_user = kasumi_f9_1_buffer_user_avx; + state->kasumi_init_f8_key_sched = kasumi_init_f8_key_sched_avx; + state->kasumi_init_f9_key_sched = kasumi_init_f9_key_sched_avx; + state->kasumi_key_sched_size = kasumi_key_sched_size_avx; + + state->snow3g_f8_1_buffer_bit = snow3g_f8_1_buffer_bit_avx2; + state->snow3g_f8_1_buffer = snow3g_f8_1_buffer_avx2; + state->snow3g_f8_2_buffer = snow3g_f8_2_buffer_avx2; + state->snow3g_f8_4_buffer = snow3g_f8_4_buffer_avx2; + state->snow3g_f8_8_buffer = snow3g_f8_8_buffer_avx2; + state->snow3g_f8_n_buffer = snow3g_f8_n_buffer_avx2; + state->snow3g_f8_8_buffer_multikey = snow3g_f8_8_buffer_multikey_avx2; + state->snow3g_f8_n_buffer_multikey = snow3g_f8_n_buffer_multikey_avx2; + state->snow3g_f9_1_buffer = snow3g_f9_1_buffer_avx2; + state->snow3g_init_key_sched = snow3g_init_key_sched_avx2; + state->snow3g_key_sched_size = snow3g_key_sched_size_avx2; + + state->hec_32 = hec_32_avx; + state->hec_64 = hec_64_avx; + + state->crc32_ethernet_fcs = ethernet_fcs_avx; + state->crc16_x25 = crc16_x25_avx; + state->crc32_sctp = crc32_sctp_avx; + state->crc24_lte_a = crc24_lte_a_avx; + state->crc24_lte_b = crc24_lte_b_avx; + state->crc16_fp_data = crc16_fp_data_avx; + state->crc11_fp_header = crc11_fp_header_avx; + state->crc7_fp_header = crc7_fp_header_avx; + state->crc10_iuup_data = crc10_iuup_data_avx; + state->crc6_iuup_header = crc6_iuup_header_avx; + state->crc32_wimax_ofdma_data = crc32_wimax_ofdma_data_avx; + state->crc8_wimax_ofdma_hcs = crc8_wimax_ofdma_hcs_avx; + +#ifdef AVX_IFMA + state->chacha20_poly1305_init = init_chacha20_poly1305_fma_avx2; + state->chacha20_poly1305_enc_update = update_enc_chacha20_poly1305_fma_avx2; + state->chacha20_poly1305_dec_update = update_dec_chacha20_poly1305_fma_avx2; + state->chacha20_poly1305_finalize = finalize_chacha20_poly1305_fma_avx2; +#endif + + state->gcm128_enc = aes_gcm_enc_128_vaes_avx2; + state->gcm192_enc = 
aes_gcm_enc_192_vaes_avx2; + state->gcm256_enc = aes_gcm_enc_256_vaes_avx2; + state->gcm128_dec = aes_gcm_dec_128_vaes_avx2; + state->gcm192_dec = aes_gcm_dec_192_vaes_avx2; + state->gcm256_dec = aes_gcm_dec_256_vaes_avx2; + state->gcm128_init = aes_gcm_init_128_vaes_avx2; + state->gcm192_init = aes_gcm_init_192_vaes_avx2; + state->gcm256_init = aes_gcm_init_256_vaes_avx2; + state->gcm128_init_var_iv = aes_gcm_init_var_iv_128_vaes_avx2; + state->gcm192_init_var_iv = aes_gcm_init_var_iv_192_vaes_avx2; + state->gcm256_init_var_iv = aes_gcm_init_var_iv_256_vaes_avx2; + state->gcm128_enc_update = aes_gcm_enc_128_update_vaes_avx2; + state->gcm192_enc_update = aes_gcm_enc_192_update_vaes_avx2; + state->gcm256_enc_update = aes_gcm_enc_256_update_vaes_avx2; + state->gcm128_dec_update = aes_gcm_dec_128_update_vaes_avx2; + state->gcm192_dec_update = aes_gcm_dec_192_update_vaes_avx2; + state->gcm256_dec_update = aes_gcm_dec_256_update_vaes_avx2; + state->gcm128_enc_finalize = aes_gcm_enc_128_finalize_vaes_avx2; + state->gcm192_enc_finalize = aes_gcm_enc_192_finalize_vaes_avx2; + state->gcm256_enc_finalize = aes_gcm_enc_256_finalize_vaes_avx2; + state->gcm128_dec_finalize = aes_gcm_dec_128_finalize_vaes_avx2; + state->gcm192_dec_finalize = aes_gcm_dec_192_finalize_vaes_avx2; + state->gcm256_dec_finalize = aes_gcm_dec_256_finalize_vaes_avx2; + state->gcm128_precomp = aes_gcm_precomp_128_vaes_avx2; + state->gcm192_precomp = aes_gcm_precomp_192_vaes_avx2; + state->gcm256_precomp = aes_gcm_precomp_256_vaes_avx2; + state->gcm128_pre = aes_gcm_pre_128_vaes_avx2; + state->gcm192_pre = aes_gcm_pre_192_vaes_avx2; + state->gcm256_pre = aes_gcm_pre_256_vaes_avx2; + + state->ghash = ghash_vaes_avx2; + state->ghash_pre = ghash_pre_vaes_avx2; + + state->gmac128_init = imb_aes_gmac_init_128_vaes_avx2; + state->gmac192_init = imb_aes_gmac_init_192_vaes_avx2; + state->gmac256_init = imb_aes_gmac_init_256_vaes_avx2; + state->gmac128_update = imb_aes_gmac_update_128_vaes_avx2; + state->gmac192_update = imb_aes_gmac_update_192_vaes_avx2; + state->gmac256_update = imb_aes_gmac_update_256_vaes_avx2; + state->gmac128_finalize = imb_aes_gmac_finalize_128_vaes_avx2; + state->gmac192_finalize = imb_aes_gmac_finalize_192_vaes_avx2; + state->gmac256_finalize = imb_aes_gmac_finalize_256_vaes_avx2; + + state->aes_ecb_128_quic = aes_ecb_quic_enc_128_avx; + state->aes_ecb_256_quic = aes_ecb_quic_enc_256_avx; + + state->chacha20_poly1305_quic = aead_chacha20_poly1305_avx2; + state->chacha20_hp_quic = quic_hp_chacha20_avx2; + + state->sm4_keyexp = sm4_set_key_ni_avx2; +} + +#include "mb_mgr_code.h" diff --git a/lib/avx2_t4/sha512_hmac_ni_avx2.asm b/lib/avx2_t4/sha512_hmac_ni_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..41c21df84ff82bf26dfebabc72bc409906692d75 --- /dev/null +++ b/lib/avx2_t4/sha512_hmac_ni_avx2.asm @@ -0,0 +1,437 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. 
+;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; FIPS PUB 180-4, FEDERAL INFORMATION PROCESSING STANDARDS PUBLICATION, Secure Hash Standard (SHS) +;; https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf + +extern sha512_update_ni_x1 + +%include "include/os.inc" +%include "include/constants.inc" +%include "include/reg_sizes.inc" +%include "include/imb_job.inc" +%include "include/memcpy.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define gp1 rax +%define gp2 r8 +%define gp3 r9 +%define gp4 r10 +%define gp5 r11 +%define gp6 arg4 +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define gp1 rax +%define gp2 r10 +%define gp3 r11 +%define gp4 arg4 +%define gp5 rdi +%define gp6 rsi +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%endif + +%xdefine t1 gp1 +%xdefine t2 gp2 +%xdefine t3 gp3 +%xdefine t4 gp4 + +%xdefine r1 gp12 +%xdefine r2 gp11 +%xdefine r3 gp10 +%xdefine r4 gp9 + +%define arg_job r1 +%define arg_msg r2 +%define arg_msg_length r3 +%define arg_sha_type r4 + +;; HMAC-SHA512/384 stack frame +struc STACK +_B: resb SHA512_BLK_SZ ; two SHA512 blocks (aligned to 16) +_D: resb SHA512_DIGEST_SIZE ; digest +_gpr_save: resq 8 ; space for GPR's +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 32 +SHUFF_MASK: + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + dq 0x0001020304050607, 0x08090a0b0c0d0e0f + +;; End-of-Message pattern +align 32 +EOM_32BYTES: + db 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + +;; PAD BLOCKS are used for OPAD where digest of IPAD + message is put into the block. +;; The blocks below fill up top 32 bytes of the block, +;; low 64/48 bytes get filled with the digest followed by EOM. 
+align 32 +SHA512_OPAD_LENGTH: + ;; last two qwords has to encode length in bits of: BLOCK size + DIGEST size + ;; (128 + 64) * 8 = 1536 = 0x600 in hex + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x06, 0x00 + +align 32 +SHA384_OPAD_LENGTH: + ;; last two qwords has to encode length in bits of: BLOCK size + DIGEST size + ;; (128 + 48) * 8 = 1408 = 0x580 in hex + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x05, 0x80 + +mksection .text + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -32 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save + 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha512_tag_store(void *tag_ptr, uint64_t tag_len, ymm1:ymm0 tag) +align 32 +MKGLOBAL(sha512_tag_store,function,internal) +sha512_tag_store: + cmp arg2, 16 + jb .tag_store_1_15 + je .tag_store_16 + + cmp arg2, 32 + je .tag_store_32 + jb .tag_store_17_31 + + cmp arg2, 48 + je .tag_store_48 + jb .tag_store_33_47 + + cmp arg2, 64 + je .tag_store_64 + +.tag_store_49_63: + vmovdqu [arg1 + 0*32], ymm0 + vmovdqu [arg1 + 1*32], xmm1 + vextracti128 xmm0, ymm1, 1 + lea arg1, [arg1 + 48] + sub arg2, 48 + jmp .tag_store_1_15 + +.tag_store_33_47: + vmovdqu [arg1 + 0*32], ymm0 + lea arg1, [arg1 + 32] + vmovdqa ymm0, ymm1 + sub arg2, 32 + jmp .tag_store_1_15 + +.tag_store_17_31: + vmovdqu [arg1 + 0*16], xmm0 + vextracti128 xmm0, ymm0, 1 + lea arg1, [arg1 + 16] + sub arg2, 16 + ;; fall through to store remaining tag bytes + +.tag_store_1_15: + simd_store_avx arg1, xmm0, arg2, t1, t2 + jmp .tag_store_end + +.tag_store_16: + vmovdqu [arg1 + 0*16], xmm0 + jmp .tag_store_end + +.tag_store_32: + vmovdqu [arg1 + 0*32], ymm0 + jmp .tag_store_end + +.tag_store_48: + vmovdqu [arg1 + 0*32], ymm0 + vmovdqu [arg1 + 1*32], xmm1 + jmp .tag_store_end + +.tag_store_64: + vmovdqu [arg1 + 0*32], ymm0 + vmovdqu [arg1 + 1*32], ymm1 + +.tag_store_end: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *sha512_384_hmac_submit_ni_avx2(const unsigned sha_type, IMB_JOB *job) +align 32 +MKGLOBAL(sha512_384_hmac_submit_ni_avx2,function,internal) 
+sha512_384_hmac_submit_ni_avx2: + FUNC_START + + ;; save input arguments + mov arg_job, arg2 + mov arg_sha_type, arg1 + + ;; init the digest with IPAD + mov t1, [arg_job + _auth_key_xor_ipad] + vmovdqu ymm0, [t1 + 0*32] + vmovdqu ymm1, [t1 + 1*32] + vmovdqa [rsp + _D + 0*32], ymm0 + vmovdqa [rsp + _D + 1*32], ymm1 + + ;; update digest for full number of blocks + lea arg1, [rsp + _D] + mov arg2, [arg_job + _src] + add arg2, [arg_job + _hash_start_src_offset] + mov arg_msg, arg2 + mov arg_msg_length, [arg_job + _msg_len_to_hash_in_bytes] + mov arg3, arg_msg_length + shr arg3, 7 ;; msg_length / SHA512_BLK_SZ + call sha512_update_ni_x1 + + ;; prepare partial block + mov DWORD(arg3), SHA512_BLK_SZ - 1 + not arg3 + and arg3, arg_msg_length ;; number of bytes processed already + add arg_msg, arg3 ;; move message pointer to start of the partial block + mov t2, arg_msg_length + sub t2, arg3 ;; t2 = number of bytes left + + xor DWORD(arg1), DWORD(arg1) +.partial_block_copy: + cmp DWORD(arg1), DWORD(t2) + je .partial_block_copy_exit + mov BYTE(t1), [arg_msg + arg1] + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_copy + +.partial_block_copy_exit: + ;; put end of message marker + mov BYTE [rsp + _B + arg1], 0x80 + inc DWORD(arg1) + + xor DWORD(t1), DWORD(t1) +.partial_block_zero: + cmp DWORD(arg1), SHA512_BLK_SZ + je .partial_block_zero_exit + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_zero + +.partial_block_zero_exit: + cmp DWORD(t2), SHA512_BLK_SZ - 16 + jb .add_msg_length + + ;; if length field doesn't fit into this partial block + ;; - compute digest on the current block + ;; - clear the block for the length to be put into it next + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sha512_update_ni_x1 + + ;; clear the block + vpxor xmm0, xmm0, xmm0 + vmovdqa [rsp + _B + 0*32], ymm0 + vmovdqa [rsp + _B + 1*32], ymm0 + vmovdqa [rsp + _B + 2*32], ymm0 + vmovdqa [rsp + _B + 3*32], xmm0 ;; the last 16 bytes will be set below + +.add_msg_length: + lea arg2, [arg_msg_length + SHA512_BLK_SZ] ;; original message length + IPAD block + lea arg1, [arg2 * 8] ;; length in bits + shr arg2, 61 + movbe [rsp + _B + SHA512_BLK_SZ - 2*8], arg2 + movbe [rsp + _B + SHA512_BLK_SZ - 1*8], arg1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sha512_update_ni_x1 + +.process_opad: + cmp DWORD(arg_sha_type), 512 + jne .opad_hmac_sha384 + +.opad_hmac_sha512: + vmovdqa ymm0, [rsp + _D + 0*32] + vmovdqa ymm1, [rsp + _D + 1*32] + vpshufb ymm0, ymm0, [rel SHUFF_MASK] + vpshufb ymm1, ymm1, [rel SHUFF_MASK] + vmovdqa ymm2, [rel EOM_32BYTES] + vmovdqa ymm3, [rel SHA512_OPAD_LENGTH] + vmovdqa [rsp + _B + 0*32], ymm0 + vmovdqa [rsp + _B + 1*32], ymm1 + vmovdqa [rsp + _B + 2*32], ymm2 + vmovdqa [rsp + _B + 3*32], ymm3 + jmp .opad_update + +.opad_hmac_sha384: + vmovdqa ymm0, [rsp + _D + 0*32] + vmovdqa xmm1, [rsp + _D + 1*32] + vpshufb ymm0, ymm0, [rel SHUFF_MASK] + vpshufb xmm1, xmm1, [rel SHUFF_MASK] + vinserti128 ymm1, [rel EOM_32BYTES], 1 + vpxor xmm2, xmm2, xmm2 + vmovdqa ymm3, [rel SHA384_OPAD_LENGTH] + vmovdqa [rsp + _B + 0*32], ymm0 + vmovdqa [rsp + _B + 1*32], ymm1 + vmovdqa [rsp + _B + 2*32], ymm2 + vmovdqa [rsp + _B + 3*32], ymm3 + +.opad_update: + ;; init the digest with OPAD + mov t1, [arg_job + _auth_key_xor_opad] + vmovdqu ymm0, [t1 + 0*32] + vmovdqu ymm1, [t1 + 1*32] + vmovdqa [rsp + _D + 0*32], ymm0 + vmovdqa [rsp + _D + 1*32], ymm1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call 
sha512_update_ni_x1 + +.tag_store_start: + ;; byte swap the digest and write it back + lea arg1, [rsp + _D] + vmovdqa ymm0, [arg1 + 0*32] + vmovdqa ymm1, [arg1 + 1*32] + vpshufb ymm0, ymm0, [rel SHUFF_MASK] + vpshufb ymm1, ymm1, [rel SHUFF_MASK] + + mov arg1, [arg_job + _auth_tag_output] + mov arg2, [arg_job + _auth_tag_output_len_in_bytes] + call sha512_tag_store + +%ifdef SAFE_DATA + vpxor xmm0, xmm0, xmm0 + vpxor xmm1, xmm1, xmm1 + vpxor xmm2, xmm2, xmm2 + vpxor xmm3, xmm3, xmm3 + + vmovdqu [rsp + _B + 0*32], ymm0 + vmovdqu [rsp + _B + 1*32], ymm0 + vmovdqu [rsp + _B + 2*32], ymm0 + vmovdqu [rsp + _B + 3*32], ymm0 +%endif + vzeroupper + + mov rax, arg_job + or dword [arg_job + _status], IMB_STATUS_COMPLETED_AUTH + FUNC_END + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *submit_job_hmac_sha_512_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job) +align 32 +MKGLOBAL(submit_job_hmac_sha_512_ni_avx2,function,internal) +submit_job_hmac_sha_512_ni_avx2: + mov DWORD(arg1), 512 + jmp sha512_384_hmac_submit_ni_avx2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *submit_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state, IMB_JOB *job) +align 32 +MKGLOBAL(submit_job_hmac_sha_384_ni_avx2,function,internal) +submit_job_hmac_sha_384_ni_avx2: + mov DWORD(arg1), 384 + jmp sha512_384_hmac_submit_ni_avx2 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *flush_job_hmac_sha_512_ni_avx2(MB_MGR_SHA512_OOO *state) +;; IMB_JOB *flush_job_hmac_sha_384_ni_avx2(MB_MGR_SHA384_OOO *state) +align 32 +MKGLOBAL(flush_job_hmac_sha_512_ni_avx2,function,internal) +MKGLOBAL(flush_job_hmac_sha_384_ni_avx2,function,internal) +flush_job_hmac_sha_512_ni_avx2: +flush_job_hmac_sha_384_ni_avx2: + xor rax, rax + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/sha512_x1_ni_avx2.asm b/lib/avx2_t4/sha512_x1_ni_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..9ea0ea93f5fbf96241541e2c9bd2b2f9a5ef594d --- /dev/null +++ b/lib/avx2_t4/sha512_x1_ni_avx2.asm @@ -0,0 +1,303 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; =========================================================== +;; NOTE about comment format: +;; +;; xmm = a b c d +;; ^ ^ +;; | | +;; MSB--+ +--LSB +;; +;; a - most significant word in `ymm` +;; d - least significant word in `ymm` +;; =========================================================== + +%use smartalign + +%include "include/os.inc" +%include "include/clear_regs.inc" +%include "include/reg_sizes.inc" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +%define arg_hash arg1 +%define arg_msg arg2 +%define arg_num_blks arg3 + +;; re-use symbols from AVX codebase +extern SHA512_K_AVX +extern SHA512_SHUFF_MASK_AVX + +mksection .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha512_update_ni_x1(uint64_t digest[8], const void *input, uint64_t num_blocks) +;; arg1 : [in/out] pointer to hash value +;; arg2 : [in] message pointer +;; arg3 : [in] number of blocks to process + +align 32 +MKGLOBAL(sha512_update_ni_x1,function,internal) +sha512_update_ni_x1: + or arg_num_blks, arg_num_blks + je .done_hash + +%ifidn __OUTPUT_FORMAT__, win64 + ;; xmm6:xmm15 need to be maintained for Windows + sub rsp, 10*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 + vmovdqu [rsp + 2*16], xmm8 + vmovdqu [rsp + 3*16], xmm9 + vmovdqu [rsp + 4*16], xmm10 + vmovdqu [rsp + 5*16], xmm11 + vmovdqu [rsp + 6*16], xmm12 + vmovdqu [rsp + 7*16], xmm13 + vmovdqu [rsp + 8*16], xmm14 + vmovdqu [rsp + 9*16], xmm15 +%endif + vbroadcasti128 ymm15, [rel SHA512_SHUFF_MASK_AVX] + + ;; load current hash value and transform + vmovdqu ymm0, [arg_hash] + vmovdqu ymm1, [arg_hash + 32] + ;; ymm0 = D C B A, ymm1 = H G F E + vperm2i128 ymm2, ymm0, ymm1, 0x20 + vperm2i128 ymm3, ymm0, ymm1, 0x31 + ;; ymm2 = F E B A, ymm3 = H G D C + vpermq ymm13, ymm2, 0x1b + vpermq ymm14, ymm3, 0x1b + ;; ymm13 = A B E F, ymm14 = C D G H + + lea rax, [rel SHA512_K_AVX] +align 32 +.block_loop: + vmovdqa ymm11, ymm13 ;; ABEF + vmovdqa ymm12, ymm14 ;; CDGH + + ;; R0 - R3 + vmovdqu ymm0, [arg_msg + 0 * 32] + vpshufb ymm3, ymm0, ymm15 ;; ymm0/ymm3 = W[0..3] + vpaddq ymm0, ymm3, [rax + 0 * 32] + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + + ;; R4 - R7 + vmovdqu ymm0, [arg_msg + 1 * 32] + vpshufb ymm4, ymm0, ymm15 ;; ymm0/ymm4 = W[4..7] + vpaddq ymm0, ymm4, [rax + 1 * 32] + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm3, xmm4 ;; ymm3 = W[0..3] + S0(W[1..4]) + + ;; R8 - R11 + vmovdqu ymm0, [arg_msg + 2 * 32] + vpshufb ymm5, ymm0, ymm15 ;; ymm0/ymm5 = W[8..11] + vpaddq ymm0, ymm5, [rax + 2 * 32] + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm4, xmm5 ;; ymm4 = W[4..7] + 
S0(W[5..8]) + + ;; R12 - R15 + vmovdqu ymm0, [arg_msg + 3 * 32] + vpshufb ymm6, ymm0, ymm15 ;; ymm0/ymm6 = W[12..15] + vpaddq ymm0, ymm6, [rax + 3 * 32] + vpermq ymm8, ymm6, 0x1b ;; ymm8 = W[12] W[13] W[14] W[15] + vpermq ymm9, ymm5, 0x39 ;; ymm9 = W[8] W[11] W[10] W[9] + vpblendd ymm8, ymm8, ymm9, 0x3f ;; ymm8 = W[12] W[11] W[10] W[9] + vpaddq ymm3, ymm3, ymm8 + vsha512msg2 ymm3, ymm6 ;; W[16..19] = ymm3 + W[9..12] + S1(W[14..17]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm5, xmm6 ;; ymm5 = W[8..11] + S0(W[9..12]) + +%assign I 4 + +%rep 3 + ;; R16 - R19, R32 - R35, R48 - R51 + vpaddq ymm0, ymm3, [rax + I * 32] + vpermq ymm8, ymm3, 0x1b ;; ymm8 = W[16] W[17] W[18] W[19] + vpermq ymm9, ymm6, 0x39 ;; ymm9 = W[12] W[15] W[14] W[13] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[16] W[15] W[14] W[13] + vpaddq ymm4, ymm4, ymm7 ;; ymm4 = W[4..7] + S0(W[5..8]) + W[13..16] + vsha512msg2 ymm4, ymm3 ;; ymm4 += S1(W[14..17]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm6, xmm3 ;; ymm6 = W[12..15] + S0(W[13..16]) +%assign I (I + 1) + + ;; R20 - R23, R36 - R39, R52 - R55 + vpaddq ymm0, ymm4, [rax + I * 32] + vpermq ymm8, ymm4, 0x1b ;; ymm8 = W[20] W[21] W[22] W[23] + vpermq ymm9, ymm3, 0x39 ;; ymm9 = W[16] W[19] W[18] W[17] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[20] W[19] W[18] W[17] + vpaddq ymm5, ymm5, ymm7 ;; ymm5 = W[8..11] + S0(W[9..12]) + W[17..20] + vsha512msg2 ymm5, ymm4 ;; ymm5 += S1(W[18..21]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm3, xmm4 ;; ymm3 = W[16..19] + S0(W[17..20]) +%assign I (I + 1) + + ;; R24 - R27, R40 - R43, R56 - R59 + vpaddq ymm0, ymm5, [rax + I * 32] + vpermq ymm8, ymm5, 0x1b ;; ymm8 = W[24] W[25] W[26] W[27] + vpermq ymm9, ymm4, 0x39 ;; ymm9 = W[20] W[23] W[22] W[21] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[24] W[23] W[22] W[21] + vpaddq ymm6, ymm6, ymm7 ;; ymm6 = W[12..15] + S0(W[13..16]) + W[21..24] + vsha512msg2 ymm6, ymm5 ;; ymm6 += S1(W[22..25]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm4, xmm5 ;; ymm4 = W[20..23] + S0(W[21..24]) +%assign I (I + 1) + + ;; R28 - R31, R44 - R47, R60 - R63 + vpaddq ymm0, ymm6, [rax + I * 32] + vpermq ymm8, ymm6, 0x1b ;; ymm8 = W[28] W[29] W[30] W[31] + vpermq ymm9, ymm5, 0x39 ;; ymm9 = W[24] W[27] W[26] W[25] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[28] W[27] W[26] W[25] + vpaddq ymm3, ymm3, ymm7 ;; ymm3 = W[16..19] + S0(W[17..20]) + W[25..28] + vsha512msg2 ymm3, ymm6 ;; ymm3 += S1(W[26..29]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm5, xmm6 ;; ymm5 = W[24..27] + S0(W[25..28]) +%assign I (I + 1) +%endrep + + ;; R64 - R67 + vpaddq ymm0, ymm3, [rax + 16 * 32] + vpermq ymm8, ymm3, 0x1b ;; ymm8 = W[64] W[65] W[66] W[67] + vpermq ymm9, ymm6, 0x39 ;; ymm9 = W[60] W[63] W[62] W[61] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[64] W[63] W[62] W[61] + vpaddq ymm4, ymm4, ymm7 ;; ymm4 = W[52..55] + S0(W[53..56]) + W[61..64] + vsha512msg2 ymm4, ymm3 ;; ymm4 += S1(W[62..65]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + vsha512msg1 ymm6, xmm3 ;; ymm6 = W[60..63] + S0(W[61..64]) + + ;; R68 - R71 + vpaddq ymm0, ymm4, [rax + 17 * 32] + vpermq ymm8, ymm4, 0x1b ;; ymm8 = W[68] W[69] W[70] W[71] 
+ vpermq ymm9, ymm3, 0x39 ;; ymm9 = W[64] W[67] W[66] W[65] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[68] W[67] W[66] W[65] + vpaddq ymm5, ymm5, ymm7 ;; ymm5 = W[56..59] + S0(W[57..60]) + W[65..68] + vsha512msg2 ymm5, ymm4 ;; ymm5 += S1(W[66..69]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + + ;; R72 - R75 + vpaddq ymm0, ymm5, [rax + 18 * 32] + vpermq ymm8, ymm5, 0x1b ;; ymm8 = W[72] W[73] W[74] W[75] + vpermq ymm9, ymm4, 0x39 ;; ymm9 = W[68] W[71] W[70] W[69] + vpblendd ymm7, ymm8, ymm9, 0x3f ;; ymm7 = W[72] W[71] W[70] W[69] + vpaddq ymm6, ymm6, ymm7 ;; ymm6 = W[60..63] + S0(W[61..64]) + W[69..72] + vsha512msg2 ymm6, ymm5 ;; ymm6 += S1(W[70..73]) + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + + ;; R76 - R79 + vpaddq ymm0, ymm6, [rax + 19 * 32] + vsha512rnds2 ymm12, ymm11, xmm0 + vperm2i128 ymm0, ymm0, ymm0, 0x01 + vsha512rnds2 ymm11, ymm12, xmm0 + + ;; update hash value + vpaddq ymm14, ymm14, ymm12 + vpaddq ymm13, ymm13, ymm11 + add arg_msg, 4 * 32 + dec arg_num_blks + jnz .block_loop + + ;; store the hash value back in memory + ;; ymm13 = ABEF + ;; ymm14 = CDGH + vperm2i128 ymm1, ymm13, ymm14, 0x31 + vperm2i128 ymm2, ymm13, ymm14, 0x20 + vpermq ymm1, ymm1, 0xb1 ;; ymm1 = D C B A + vpermq ymm2, ymm2, 0xb1 ;; ymm2 = H G F E + vmovdqu [arg_hash + 0*32], ymm1 + vmovdqu [arg_hash + 1*32], ymm2 + + vzeroupper + +%ifidn __OUTPUT_FORMAT__, win64 + ;; xmm6:xmm15 need to be maintained for Windows + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm9, [rsp + 3*16] + vmovdqu xmm10, [rsp + 4*16] + vmovdqu xmm11, [rsp + 5*16] + vmovdqu xmm12, [rsp + 6*16] + vmovdqu xmm13, [rsp + 7*16] + vmovdqu xmm14, [rsp + 8*16] + vmovdqu xmm15, [rsp + 9*16] + add rsp, 10*16 +%endif + +.done_hash: + + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sha512_ni_block_avx2(const void *input, uint64_t digest[8]) +;; arg1 : [in] message pointer +;; arg2 : [in/out] pointer to hash value + +align 32 +MKGLOBAL(sha512_ni_block_avx2,function,internal) +sha512_ni_block_avx2: + mov rax, arg1 + mov arg1, arg2 + mov arg2, rax + mov DWORD(arg3), 1 + jmp sha512_update_ni_x1 + +mksection stack-noexec diff --git a/lib/avx2_t4/sha_ni_avx2.c b/lib/avx2_t4/sha_ni_avx2.c new file mode 100644 index 0000000000000000000000000000000000000000..7cdab60a7383591dbb54374b49053c16ac1b2e95 --- /dev/null +++ b/lib/avx2_t4/sha_ni_avx2.c @@ -0,0 +1,121 @@ +/******************************************************************************* + Copyright (c) 2020-2024, Intel Corporation + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "include/sha_generic.h" +#include "include/arch_avx2_type4.h" + +/* ========================================================================== */ +/* One block SHA384 computation for IPAD / OPAD usage only */ +void +sha384_one_block_ni_avx2(const void *data, void *digest) +{ + sha_generic_1block(data, digest, ARCH_AVX2_SHANI, 384 /* SHA384 */); +} + +/* ========================================================================== */ +/* + * SHA384 API for use in HMAC-SHA384 when key is longer than the block size + */ +void +sha384_ni_avx2(const void *data, const uint64_t length, void *digest) +{ + sha_generic(data, length, digest, ARCH_AVX2_SHANI, 384, IMB_SHA_384_BLOCK_SIZE, + SHA384_PAD_SIZE); +} + +/* ========================================================================== */ +/* One block SHA512 computation for IPAD / OPAD usage only */ +void +sha512_one_block_ni_avx2(const void *data, void *digest) +{ + sha_generic_1block(data, digest, ARCH_AVX2_SHANI, 512 /* SHA512 */); +} + +/* ========================================================================== */ +/* + * SHA512 API for use in HMAC-SHA512 when key is longer than the block size + */ +void +sha512_ni_avx2(const void *data, const uint64_t length, void *digest) +{ + sha_generic(data, length, digest, ARCH_AVX2_SHANI, 512, IMB_SHA_512_BLOCK_SIZE, + SHA512_PAD_SIZE); +} + +/* ========================================================================== */ +/* + * SHA384 API for JOB API + */ +IMB_JOB * +submit_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) +{ + const void *msg = (job->src + job->hash_start_src_offset_in_bytes); + const uint64_t length = job->msg_len_to_hash_in_bytes; + uint64_t tag[8]; + + (void) state; + + sha384_ni_avx2(msg, length, tag); + memcpy(job->auth_tag_output, tag, job->auth_tag_output_len_in_bytes); + job->status |= IMB_STATUS_COMPLETED_AUTH; + return job; +} + +IMB_JOB * +flush_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) +{ + (void) state; + (void) job; + return NULL; +} + +/* ========================================================================== */ +/* + * SHA512 API for JOB API + */ +IMB_JOB * +submit_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) +{ + const void *msg = (job->src + job->hash_start_src_offset_in_bytes); + const uint64_t length = job->msg_len_to_hash_in_bytes; + uint64_t tag[8]; + + (void) state; + + sha512_ni_avx2(msg, length, tag); + memcpy(job->auth_tag_output, tag, job->auth_tag_output_len_in_bytes); + job->status |= IMB_STATUS_COMPLETED_AUTH; + return job; +} + +IMB_JOB * +flush_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job) +{ + (void) state; + (void) job; + return NULL; +} 
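The wrappers above route the new AVX2 SHA-NI code through the existing sha_generic() helpers, so the single-buffer entry points keep the same calling convention as the older sha512_sse()/sha512_avx() variants. Below is a minimal, hypothetical usage sketch in C, not a definitive example: it assumes the internal prototype from lib/include/arch_avx2_type4.h is visible, that the CPU supports the AVX2 SHA-512 extension (not checked here), and that calling the function directly is done only for illustration - in normal use it is reached through the IMB_MGR job or burst API (e.g. submit_job_sha512_ni_avx2() above).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* internal prototype added by this patch (lib/include/arch_avx2_type4.h) */
void sha512_ni_avx2(const void *data, const uint64_t length, void *digest);

int main(void)
{
        const char msg[] = "abc";
        uint8_t digest[64]; /* SHA-512 digest is 64 bytes */

        /* hash the message with the new SHA512-NI AVX2 single-buffer path */
        sha512_ni_avx2(msg, strlen(msg), digest);

        for (size_t i = 0; i < sizeof(digest); i++)
                printf("%02x", digest[i]);
        printf("\n");
        return 0;
}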
diff --git a/lib/avx2_t4/sm3_hmac_avx2.asm b/lib/avx2_t4/sm3_hmac_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..b70a1a72e164cd3629951ed9c6d427992cdf1094 --- /dev/null +++ b/lib/avx2_t4/sm3_hmac_avx2.asm @@ -0,0 +1,311 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +extern sm3_base_init +extern sm3_update_ni_x1 +extern sm3_tag_store_avx + +%include "include/os.inc" +%include "include/reg_sizes.inc" +%include "include/imb_job.inc" +%include "include/memcpy.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define gp1 rax +%define gp2 r8 +%define gp3 r9 +%define gp4 r10 +%define gp5 r11 +%define gp6 arg4 +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define gp1 rax +%define gp2 r10 +%define gp3 r11 +%define gp4 arg4 +%define gp5 rdi +%define gp6 rsi +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%endif + +%xdefine t1 gp1 +%xdefine t2 gp2 +%xdefine t3 gp3 +%xdefine t4 gp3 + +%xdefine r1 gp12 +%xdefine r2 gp11 +%xdefine r3 gp10 + +%define arg_job r1 +%define arg_msg r2 +%define arg_msg_length r3 + +;; HMAC-SM3 stack frame +struc STACK +_B: resb 64 ; two SM3 blocks (aligned to 16) +_D: resd 8 ; digest +_gpr_save: resq 8 ; space for GPR's +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +;; PAD BLOCKS are used for OPAD where digest of IPAD + message is put into the block. +;; The blocks below fill up top 32 bytes of the block, +;; low 32 bytes get filled with the digest. 
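+;; Resulting 64-byte OPAD block layout (assembled in .process_opad below):
+;;   bytes  0..31 : inner digest, byte-swapped to big-endian
+;;   byte  32     : 0x80 end-of-message marker (from PAD_BLOCK1)
+;;   bytes 33..55 : zero padding
+;;   bytes 56..63 : big-endian bit length 0x300 = (64 + 32) * 8 (from PAD_BLOCK2)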
+align 16 +PAD_BLOCK1: + db 0x80, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + +align 16 +PAD_BLOCK2: + ;; last qword has to encode length in bits of: BLOCK size + DIGEST size + ;; (64 + 32) * 8 = 768 = 0x300 in hex + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x03, 0x00 + +mksection .text + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save + 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *sm3_hmac_submit_ni_avx2(IMB_JOB *job) +align 32 +MKGLOBAL(sm3_hmac_submit_ni_avx2,function,internal) +sm3_hmac_submit_ni_avx2: + FUNC_START + + ;; save input arguments + mov arg_job, arg1 + + ;; init the digest with IPAD + mov t1, [arg_job + _auth_key_xor_ipad] + vmovdqu xmm0, [t1 + 0*16] + vmovdqu xmm1, [t1 + 1*16] + vmovdqa [rsp + _D + 0*16], xmm0 + vmovdqa [rsp + _D + 1*16], xmm1 + + ;; update digest for full number of blocks + lea arg1, [rsp + _D] + mov arg2, [arg_job + _src] + add arg2, [arg_job + _hash_start_src_offset] + mov arg_msg, arg2 + mov arg_msg_length, [arg_job + _msg_len_to_hash_in_bytes] + mov arg3, arg_msg_length + shr arg3, 6 ;; msg_length / SM3_BLOCK_SIZE + call sm3_update_ni_x1 + + ;; prepare partial block + mov DWORD(arg3), 63 + not arg3 + and arg3, arg_msg_length ;; number of bytes processed already + add arg_msg, arg3 ;; move message pointer to start of the partial block + mov t2, arg_msg_length + sub t2, arg3 ;; t2 = number of bytes left + + xor DWORD(arg1), DWORD(arg1) +.partial_block_copy: + cmp DWORD(arg1), DWORD(t2) + je .partial_block_copy_exit + mov BYTE(t1), [arg_msg + arg1] + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_copy + +.partial_block_copy_exit: + ;; put end of message marker + mov BYTE [rsp + _B + arg1], 0x80 + inc DWORD(arg1) + + xor DWORD(t1), DWORD(t1) +.partial_block_zero: + cmp DWORD(arg1), 64 + je .partial_block_zero_exit + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_zero + +.partial_block_zero_exit: + cmp DWORD(t2), 64 - 8 + jb .add_msg_length + + ;; if length field doesn't fit into this partial block + ;; - compute digest on the current block + ;; - clear the block for the length to be put into it next + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call 
sm3_update_ni_x1 + + xor DWORD(t1), DWORD(t1) + mov [rsp + _B + 0*8], t1 + mov [rsp + _B + 1*8], t1 + mov [rsp + _B + 2*8], t1 + mov [rsp + _B + 3*8], t1 + mov [rsp + _B + 4*8], t1 + mov [rsp + _B + 5*8], t1 + mov [rsp + _B + 6*8], t1 + +.add_msg_length: + lea t1, [arg_msg_length*8 + 64*8] ;; original message length in bits + 1 IPAD block + bswap t1 + mov [rsp + _B + 7*8], t1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_update_ni_x1 + +.process_opad: + vmovdqa xmm0, [rsp + _D + 0*16] + vmovdqa xmm1, [rsp + _D + 1*16] + vpshufb xmm0, xmm0, [rel SHUFF_MASK] + vpshufb xmm1, xmm1, [rel SHUFF_MASK] + vmovdqa xmm2, [rel PAD_BLOCK1] + vmovdqa xmm3, [rel PAD_BLOCK2] + vmovdqa [rsp + _B + 0*16], xmm0 + vmovdqa [rsp + _B + 1*16], xmm1 + vmovdqa [rsp + _B + 2*16], xmm2 + vmovdqa [rsp + _B + 3*16], xmm3 + + ;; init the digest with OPAD + mov t1, [arg_job + _auth_key_xor_opad] + vmovdqu xmm0, [t1 + 0*16] + vmovdqu xmm1, [t1 + 1*16] + vmovdqa [rsp + _D + 0*16], xmm0 + vmovdqa [rsp + _D + 1*16], xmm1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_update_ni_x1 + +.tag_store_start: + ;; byte swap the digest and write it back + vmovdqa xmm0, [rsp + _D + 0*16] + vmovdqa xmm1, [rsp + _D + 1*16] + vpshufb xmm0, xmm0, [rel SHUFF_MASK] + vpshufb xmm1, xmm1, [rel SHUFF_MASK] + + mov arg1, [arg_job + _auth_tag_output] + mov arg2, [arg_job + _auth_tag_output_len_in_bytes] + call sm3_tag_store_avx + +%ifdef SAFE_DATA + vpxor xmm0, xmm0, xmm0 + vpxor xmm1, xmm1, xmm1 + vpxor xmm2, xmm2, xmm2 + vpxor xmm3, xmm3, xmm3 + + ;; update uses xmm10 and xmm11 and in some case fragment of the key can be found there + vpxor xmm10, xmm10, xmm10 + vpxor xmm11, xmm11, xmm11 + + vmovdqu [rsp + _B + 0*16], ymm0 + vmovdqu [rsp + _B + 2*16], ymm0 +%endif + + mov rax, arg_job + or dword [arg_job + _status], IMB_STATUS_COMPLETED_AUTH + FUNC_END + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/sm3_msg_avx2.asm b/lib/avx2_t4/sm3_msg_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..0a0d942ad1f6efd2aa9d39e790283cfc54fd760f --- /dev/null +++ b/lib/avx2_t4/sm3_msg_avx2.asm @@ -0,0 +1,304 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +extern sm3_base_init +extern sm3_update_ni_x1 + +%include "include/os.inc" +%include "include/reg_sizes.inc" +%include "include/memcpy.inc" +%include "include/imb_job.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define t1 rax +%define t2 r8 +%define t3 r9 +%define t4 r10 +%define t5 r11 +%define t6 r12 +%define t7 r13 +%define t8 r14 +%define t9 r15 +%define t10 rbx +%define t11 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define t1 rax +%define t2 r10 +%define t3 r11 +%define t4 rdi +%define t5 rsi +%define t6 r12 +%define t7 r13 +%define t8 r14 +%define t9 r15 +%define t10 rbx +%define t11 rbp + +%endif + +%xdefine r1 t6 +%xdefine r2 t7 +%xdefine r3 t8 +%xdefine r4 t9 +%xdefine r5 t10 +%xdefine r6 t11 + +%define arg_tag r1 +%define arg_tag_length r2 +%define arg_msg r3 +%define arg_msg_length r4 + +;; SM3 stack frame +struc STACK +_B: resb 64 ; one SM3 block (aligned to 16) +_D: resd 8 ; digest +_gpr_save: resq 8 ; space for GPR's +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +mksection .text + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save + 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_tag_store_avx(void *tag_ptr, const uint64_t tag_length, xmm1:xmm0 tag) +;; NOTE: may clobber t1 & t2 (rax & r10) +align 32 +MKGLOBAL(sm3_tag_store_avx,function,internal) +sm3_tag_store_avx: + cmp arg2, 32 + je .tag_store_32 + + cmp arg2, 16 + jb .tag_store_1_15 + je .tag_store_16 + +.tag_store_16_31: + vmovdqu [arg1 + 0*16], xmm0 + lea arg1, [arg1 + 16] + vmovdqa xmm0, xmm1 + sub 
arg2, 16 + ;; fall through to store remaining tag bytes + +.tag_store_1_15: + simd_store_avx arg1, xmm0, arg2, t1, t2 + jmp .tag_store_end + +.tag_store_32: + vmovdqu [arg1 + 1*16], xmm1 + ;; fall through to store 1st 16 bytes + +.tag_store_16: + vmovdqu [arg1 + 0*16], xmm0 + ;; fall through + +.tag_store_end: + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_msg_ni_avx2(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length) +align 32 +MKGLOBAL(sm3_msg_ni_avx2,function,internal) +sm3_msg_ni_avx2: + FUNC_START + + ;; save input arguments + mov arg_tag, arg1 + mov arg_tag_length, arg2 + mov arg_msg, arg3 + mov arg_msg_length, arg4 + + ;; init the digest + lea arg1, [rsp + _D] + call sm3_base_init + + ;; update digest for full number of blocks + ;; - arg1 stays unchanged + mov arg2, arg_msg + mov arg3, arg_msg_length + shr arg3, 6 ;; msg_length / SM3_BLOCK_SIZE + call sm3_update_ni_x1 + + ;; prepare partial block + mov DWORD(arg3), 63 + not arg3 + and arg3, arg_msg_length ;; number of bytes processed already + add arg_msg, arg3 ;; move message pointer to start of the partial block + mov r5, arg_msg_length + sub r5, arg3 ;; r5 = number of bytes left + + xor DWORD(arg1), DWORD(arg1) +.partial_block_copy: + cmp DWORD(arg1), DWORD(r5) + je .partial_block_copy_exit + mov BYTE(t1), [arg_msg + arg1] + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_copy + +.partial_block_copy_exit: + ;; put end of message marker + mov BYTE [rsp + _B + arg1], 0x80 + inc DWORD(arg1) + + xor DWORD(t1), DWORD(t1) +.partial_block_zero: + cmp DWORD(arg1), 64 + je .partial_block_zero_exit + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_zero + +.partial_block_zero_exit: + cmp DWORD(r5), 64 - 8 + jb .add_msg_length + + ;; if length field doesn't fit into this partial block + ;; - compute digest on the current block + ;; - clear the block for the length to be put into it next + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_update_ni_x1 + + xor DWORD(t1), DWORD(t1) + mov [rsp + _B + 0*8], t1 + mov [rsp + _B + 1*8], t1 + mov [rsp + _B + 2*8], t1 + mov [rsp + _B + 3*8], t1 + mov [rsp + _B + 4*8], t1 + mov [rsp + _B + 5*8], t1 + mov [rsp + _B + 6*8], t1 + +.add_msg_length: + lea t1, [arg_msg_length*8] ;; original message length in bits + movbe [rsp + _B + 7*8], t1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_update_ni_x1 + +.tag_store_start: + ;; byte swap the digest and write it back + vmovdqa xmm0, [rsp + _D + 0*16] + vmovdqa xmm1, [rsp + _D + 1*16] + vpshufb xmm0, xmm0, [rel SHUFF_MASK] + vpshufb xmm1, xmm1, [rel SHUFF_MASK] + + mov arg1, arg_tag + mov arg2, arg_tag_length + call sm3_tag_store_avx + +%ifdef SAFE_DATA + vpxor xmm0, xmm0, xmm0 + vpxor xmm1, xmm1, xmm1 + + vmovdqu [rsp + _B + 0*16], ymm0 + vmovdqu [rsp + _B + 2*16], ymm0 +%endif + FUNC_END + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *sm3_msg_submit_ni_avx2(IMB_JOB *) +align 32 +MKGLOBAL(sm3_msg_submit_ni_avx2,function,internal) +sm3_msg_submit_ni_avx2: + push arg1 + + mov arg4, [arg1 + _msg_len_to_hash_in_bytes] + mov arg3, [arg1 + _src] + add arg3, [arg1 + _hash_start_src_offset] + mov arg2, [arg1 + _auth_tag_output_len_in_bytes] + mov arg1, [arg1 + _auth_tag_output] + call sm3_msg_ni_avx2 + + pop rax + or dword [rax + _status], IMB_STATUS_COMPLETED_AUTH + ret + +mksection stack-noexec diff --git 
a/lib/avx2_t4/sm3_ni_x1_avx2.asm b/lib/avx2_t4/sm3_ni_x1_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..ecd6d8fab58447d39cd4b39358d2dafb5fddd249 --- /dev/null +++ b/lib/avx2_t4/sm3_ni_x1_avx2.asm @@ -0,0 +1,267 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; =========================================================== +;; NOTE about comment format: +;; +;; xmm = a b c d +;; ^ ^ +;; | | +;; MSB--+ +--LSB +;; +;; a - most significant word in `xmm` +;; d - least significant word in `xmm` +;; =========================================================== + +%use smartalign + +%include "include/os.inc" +%include "include/clear_regs.inc" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +%define arg_hash arg1 +%define arg_msg arg2 +%define arg_num_blks arg3 + +mksection .rodata +default rel + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +mksection .text + +;; *************************************************************************** +;; Create 4 x 32-bit new words of message schedule W[] using SM3-NI ISA +;; *************************************************************************** +%macro SM3MSG 7 +%define %%W03_00 %1 ;; [in] XMM register with W[0..3] +%define %%W07_04 %2 ;; [in] XMM register with W[4..7] +%define %%W11_08 %3 ;; [in] XMM register with W[8..11] +%define %%W15_12 %4 ;; [in] XMM register with W[12..15] +%define %%W19_16 %5 ;; [out] XMM register with W[19..16] +%define %%T1 %6 ;; [clobbered] XMM register +%define %%T2 %7 ;; [clobbered] XMM register + +%define %%T3 %%W19_16 + + vpalignr %%T3, %%W11_08, %%W07_04, 3*4 ;; xmm8 = W10 W9 W8 W7 + vpsrldq %%T1, %%W15_12, 4 ;; xmm9 = 0 W15 W14 W13 + vsm3msg1 %%T3, %%T1, %%W03_00 ;; xmm8 = WTMP3 WTMP2 WTMP1 WTMP0 + vpalignr %%T1, %%W07_04, %%W03_00, 3*4 ;; xmm9 = W6 W5 W4 W3 + vpalignr %%T2, %%W15_12, %%W11_08, 2*4 ;; xmm1 = W13 W12 W11 W10 + vsm3msg2 %%T3, %%T1, %%T2 ;; xmm8 = W19 W18 W17 W16 +%endmacro + +;; *************************************************************************** +;; Performs 4 rounds of SM3 algorithm +;; - consumes 4 words of message schedule W[] +;; - updates SM3 state registers: ABEF and CDGH +;; *************************************************************************** +%macro SM3ROUNDS4 6 +%define %%ABEF %1 ;; [in/out] XMM register with ABEF registers +%define %%CDGH %2 ;; [in/out] XMM register with CDGH registers +%define %%W03_00 %3 ;; [in] XMM register with W[8..11] +%define %%W07_04 %4 ;; [in] XMM register with W[12..15] +%define %%T1 %5 ;; [clobbered] XMM register +%define %%R %6 ;; [in] round number + + vpunpcklqdq %%T1, %%W03_00, %%W07_04 ;; T1 = W5 W4 W1 W0 + vsm3rnds2 %%CDGH, %%ABEF, %%T1, %%R ;; CDGH = updated ABEF // 2 rounds + vpunpckhqdq %%T1, %%W03_00, %%W07_04 ;; T1 = W7 W6 W3 W2 + vsm3rnds2 %%ABEF, %%CDGH, %%T1, (%%R + 2) ;; ABEF = updated CDGH // 2 rounds +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_update(uint32_t digest[8], const void *input, uint64_t num_blocks) +;; arg1 : [in/out] pointer to hash value +;; arg2 : [in] message pointer +;; arg3 : [in] number of blocks to process + +align 32 +MKGLOBAL(sm3_update_ni_x1,function,internal) +sm3_update_ni_x1: + or arg_num_blks, arg_num_blks + je done_hash + +%ifidn __OUTPUT_FORMAT__, win64 + ;; xmm6:xmm12 need to be maintained for Windows + sub rsp, 7*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 + vmovdqu [rsp + 2*16], xmm8 + vmovdqu [rsp + 3*16], xmm9 + vmovdqu [rsp + 4*16], xmm10 + vmovdqu [rsp + 5*16], xmm11 + vmovdqu [rsp + 6*16], xmm12 +%endif + + ;; load current hash value and transform + 
vmovdqu xmm6, [arg_hash] + vmovdqu xmm7, [arg_hash + 16] + ;; xmm6 = D C B A, xmm7 = H G F E + + vpshufd xmm0, xmm6, 0x1B ;; xmm0 = A B C D + vpshufd xmm1, xmm7, 0x1B ;; xmm1 = E F G H + vpunpckhqdq xmm6, xmm1, xmm0 ;; xmm6 = A B E F + vpunpcklqdq xmm7, xmm1, xmm0 ;; xmm7 = C D G H + vpsrld xmm2, xmm7, 9 + vpslld xmm3, xmm7, 23 + vpxor xmm1, xmm2, xmm3 ;; xmm1 = xmm2 ^ xmm3 = ROL32(CDGH, 23) + vpsrld xmm4, xmm7, 19 + vpslld xmm5, xmm7, 13 + vpxor xmm0, xmm4, xmm5 ;; xmm0 = xmm2 ^ xmm3 = ROL32(CDGH, 13) + vpblendd xmm7, xmm1, xmm0, 0x3 ;; xmm7 = ROL32(C, 23) ROL32(D, 23) ROL32(G, 13) ROL32(H, 13) + + vmovdqa xmm12, [rel SHUFF_MASK] +align 32 +block_loop: + vmovdqa xmm10, xmm6 + vmovdqa xmm11, xmm7 + + ;; prepare W[0..15] - read and shuffle the data + vmovdqu xmm2, [arg_msg + 0*16] + vmovdqu xmm3, [arg_msg + 1*16] + vmovdqu xmm4, [arg_msg + 2*16] + vmovdqu xmm5, [arg_msg + 3*16] + vpshufb xmm2, xmm2, xmm12 ;; xmm2 = W03 W02 W01 W00 + vpshufb xmm3, xmm3, xmm12 ;; xmm3 = W07 W06 W05 W04 + vpshufb xmm4, xmm4, xmm12 ;; xmm4 = W11 W10 W09 W08 + vpshufb xmm5, xmm5, xmm12 ;; xmm5 = W15 W14 W13 W12 + + SM3MSG xmm2, xmm3, xmm4, xmm5, xmm8, xmm9, xmm1 ;; xmm8 = W19 W18 W17 W16 + SM3ROUNDS4 xmm6, xmm7, xmm2, xmm3, xmm1, 0 + + vmovdqa xmm2, xmm8 + SM3MSG xmm3, xmm4, xmm5, xmm2, xmm8, xmm9, xmm1 ;; xmm8 = W23 W22 W21 W20 + SM3ROUNDS4 xmm6, xmm7, xmm3, xmm4, xmm1, 4 + + vmovdqa xmm3, xmm8 + SM3MSG xmm4, xmm5, xmm2, xmm3, xmm8, xmm9, xmm1 ;; xmm8 = W27 W26 W25 W24 + SM3ROUNDS4 xmm6, xmm7, xmm4, xmm5, xmm1, 8 + + vmovdqa xmm4, xmm8 + SM3MSG xmm5, xmm2, xmm3, xmm4, xmm8, xmm9, xmm1 ;; xmm8 = W31 W30 W29 W28 + SM3ROUNDS4 xmm6, xmm7, xmm5, xmm2, xmm1, 12 + + vmovdqa xmm5, xmm8 + SM3MSG xmm2, xmm3, xmm4, xmm5, xmm8, xmm9, xmm1 ;; xmm8 = W35 W34 W33 W32 + SM3ROUNDS4 xmm6, xmm7, xmm2, xmm3, xmm1, 16 + + vmovdqa xmm2, xmm8 + SM3MSG xmm3, xmm4, xmm5, xmm2, xmm8, xmm9, xmm1 ;; xmm8 = W39 W38 W37 W36 + SM3ROUNDS4 xmm6, xmm7, xmm3, xmm4, xmm1, 20 + + vmovdqa xmm3, xmm8 + SM3MSG xmm4, xmm5, xmm2, xmm3, xmm8, xmm9, xmm1 ;; xmm8 = W43 W42 W41 W40 + SM3ROUNDS4 xmm6, xmm7, xmm4, xmm5, xmm1, 24 + + vmovdqa xmm4, xmm8 + SM3MSG xmm5, xmm2, xmm3, xmm4, xmm8, xmm9, xmm1 ;; xmm8 = W47 W46 W45 W44 + SM3ROUNDS4 xmm6, xmm7, xmm5, xmm2, xmm1, 28 + + vmovdqa xmm5, xmm8 + SM3MSG xmm2, xmm3, xmm4, xmm5, xmm8, xmm9, xmm1 ;; xmm8 = W51 W50 W49 W48 + SM3ROUNDS4 xmm6, xmm7, xmm2, xmm3, xmm1, 32 + + vmovdqa xmm2, xmm8 + SM3MSG xmm3, xmm4, xmm5, xmm2, xmm8, xmm9, xmm1 ;; xmm8 = W55 W54 W53 W52 + SM3ROUNDS4 xmm6, xmm7, xmm3, xmm4, xmm1, 36 + + vmovdqa xmm3, xmm8 + SM3MSG xmm4, xmm5, xmm2, xmm3, xmm8, xmm9, xmm1 ;; xmm8 = W59 W58 W57 W56 + SM3ROUNDS4 xmm6, xmm7, xmm4, xmm5, xmm1, 40 + + vmovdqa xmm4, xmm8 + SM3MSG xmm5, xmm2, xmm3, xmm4, xmm8, xmm9, xmm1 ;; xmm8 = W63 W62 W61 W60 + SM3ROUNDS4 xmm6, xmm7, xmm5, xmm2, xmm1, 44 + + vmovdqa xmm5, xmm8 + SM3MSG xmm2, xmm3, xmm4, xmm5, xmm8, xmm9, xmm1 ;; xmm8 = W67 W66 W65 W64 + SM3ROUNDS4 xmm6, xmm7, xmm2, xmm3, xmm1, 48 + + vmovdqa xmm2, xmm8 + SM3ROUNDS4 xmm6, xmm7, xmm3, xmm4, xmm1, 52 + + SM3ROUNDS4 xmm6, xmm7, xmm4, xmm5, xmm1, 56 + + SM3ROUNDS4 xmm6, xmm7, xmm5, xmm2, xmm1, 60 + + ;; update hash value + vpxor xmm6, xmm6, xmm10 + vpxor xmm7, xmm7, xmm11 + add arg_msg, 64 + dec arg_num_blks + jnz block_loop + + ;; store the hash value back in memory + vpslld xmm2, xmm7, 9 + vpsrld xmm3, xmm7, 23 + vpxor xmm1, xmm2, xmm3 ;; xmm1 = xmm2 ^ xmm3 = ROL32(CDGH, 9) + vpslld xmm4, xmm7, 19 + vpsrld xmm5, xmm7, 13 + vpxor xmm0, xmm4, xmm5 ;; xmm0 = xmm2 ^ xmm3 = ROL32(CDGH, 19) + vpblendd xmm7, xmm1, 
xmm0, 0x3 ;; xmm7 = ROL32(C, 9) ROL32(D, 9) ROL32(G, 19) ROL32(H, 19) + vpshufd xmm0, xmm6, 0x1B ;; xmm0 = F E B A + vpshufd xmm1, xmm7, 0x1B ;; xmm1 = H G D C + + vpunpcklqdq xmm6, xmm0, xmm1 ;; xmm6 = D C B A + vpunpckhqdq xmm7, xmm0, xmm1 ;; xmm7 = H G F E + + vmovdqu [arg_hash], xmm6 + vmovdqu [arg_hash + 16], xmm7 + +%ifidn __OUTPUT_FORMAT__, win64 + ;; xmm6:xmm12 need to be maintained for Windows + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm9, [rsp + 3*16] + vmovdqu xmm10, [rsp + 4*16] + vmovdqu xmm11, [rsp + 5*16] + vmovdqu xmm12, [rsp + 6*16] + add rsp, 7*16 +%endif + +done_hash: + + ret + +mksection stack-noexec diff --git a/lib/avx2_t4/sm4_ni_avx2.asm b/lib/avx2_t4/sm4_ni_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..7cc550bdafdc9766c957ae90d5c7bc372d63e905 --- /dev/null +++ b/lib/avx2_t4/sm4_ni_avx2.asm @@ -0,0 +1,174 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +%include "include/os.inc" +%include "include/clear_regs.inc" +%include "include/cet.inc" +%include "include/error.inc" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +mksection .rodata +default rel + +align 16 +constants: +dd 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc, +dd 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269, +dd 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9, +dd 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249, +dd 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9, +dd 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229, +dd 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299, +dd 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209, +dd 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 + +in_shufb: +db 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04 +db 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c + +out_shufb: +db 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 +db 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 + +mksection .text + +align 32 +MKGLOBAL(sm4_ecb_ni_avx2,function,internal) +sm4_ecb_ni_avx2: + +%define IN arg1 +%define OUT arg2 +%define SIZE arg3 +%define KEY_EXP arg4 + +%define IDX r10 +%define XDATA0 xmm0 + + xor IDX, IDX +main_loop: + or SIZE, SIZE + jz done + + vmovdqu XDATA0, [IN + IDX] + vpshufb XDATA0, XDATA0, [rel in_shufb] + +%assign i 0 +%rep 8 + vsm4rnds4 XDATA0, XDATA0, [KEY_EXP + 16*i] +%assign i (i + 1) +%endrep + vpshufb XDATA0, [rel out_shufb] + + vmovdqu [OUT + IDX], XDATA0 + + add IDX, 16 + sub SIZE, 16 + jmp main_loop + +done: + +%ifdef SAFE_DATA + clear_all_ymms_asm +%else + vzeroupper +%endif + ret + +;; +;;void sm4_set_key_sse(const void *key, const uint32_t *exp_enc_keys, +;; const uint32_t *exp_dec_keys) +;; +; arg 1: KEY: pointer to 128-bit key +; arg 2: EXP_ENC_KEYS: pointer to expanded encryption keys +; arg 3: EXP_DEC_KEYS: pointer to expanded decryption keys +; +align 32 +MKGLOBAL(sm4_set_key_ni_avx2,function,internal) +sm4_set_key_ni_avx2: + +%define KEY arg1 +%define ENC_KEY_EXP arg2 +%define DEC_KEY_EXP arg3 + + endbranch64 +%ifdef SAFE_PARAM + IMB_ERR_CHECK_RESET + + cmp KEY, 0 + jz error_set_key_ni_avx2 + cmp ENC_KEY_EXP, 0 + jz error_set_key_ni_avx2 + cmp DEC_KEY_EXP, 0 + jz error_set_key_ni_avx2 +%endif + + vmovdqu xmm0, [KEY] + vpshufb xmm0, xmm0, [rel in_shufb] + vpxor xmm0, [rel constants] + +%assign i 1 +%rep 8 + vsm4key4 xmm0, xmm0, [rel constants + 16*i] + vmovdqu [ENC_KEY_EXP + 16*(i-1)], xmm0 + vpshufd xmm1, xmm0, 0x1B + vmovdqu [DEC_KEY_EXP + 16*(7-i+1)], xmm1 + +%assign i (i + 1) +%endrep + +sm4_set_key_ni_avx2_return: + +%ifdef SAFE_DATA + clear_all_ymms_asm +%else + vzeroupper +%endif + ret + +%ifdef SAFE_PARAM +error_set_key_ni_avx2: + IMB_ERR_CHECK_START rax + IMB_ERR_CHECK_NULL KEY, rax, IMB_ERR_NULL_KEY + IMB_ERR_CHECK_NULL ENC_KEY_EXP, rax, IMB_ERR_NULL_EXP_KEY + IMB_ERR_CHECK_NULL DEC_KEY_EXP, rax, IMB_ERR_NULL_EXP_KEY + IMB_ERR_CHECK_END rax + + ret +%endif + +mksection stack-noexec diff --git a/lib/avx512_t1/mb_mgr_avx512_t1.c b/lib/avx512_t1/mb_mgr_avx512_t1.c index d20f90597a30a802819926132d9d4ac47af45797..334b016f9b6fbd843039077712f6f5bae2bd2dc3 100644 --- a/lib/avx512_t1/mb_mgr_avx512_t1.c +++ b/lib/avx512_t1/mb_mgr_avx512_t1.c @@ -326,11 +326,6 @@ submit_aes_docsis256_dec_crc32_avx512(MB_MGR_DOCSIS_AES_OOO *state, IMB_JOB *job return job; } -/* SM4 */ -#define SM4_ECB sm4_ecb_sse -#define SM4_CBC_ENC sm4_cbc_enc_sse -#define SM4_CBC_DEC sm4_cbc_dec_sse - #define 
SUBMIT_JOB_DOCSIS128_SEC_CRC_ENC submit_job_aes_docsis128_enc_crc32_avx512 #define SUBMIT_JOB_DOCSIS256_SEC_CRC_ENC submit_job_aes_docsis256_enc_crc32_avx512 #define FLUSH_JOB_DOCSIS128_SEC_CRC_ENC flush_job_aes_docsis128_enc_crc32_avx512 @@ -338,6 +333,17 @@ submit_aes_docsis256_dec_crc32_avx512(MB_MGR_DOCSIS_AES_OOO *state, IMB_JOB *job #define SUBMIT_JOB_DOCSIS128_SEC_CRC_DEC submit_aes_docsis128_dec_crc32_avx512 #define SUBMIT_JOB_DOCSIS256_SEC_CRC_DEC submit_aes_docsis256_dec_crc32_avx512 +/* SM4 */ +#define SM4_ECB sm4_ecb_sse +#define SM4_CBC_ENC sm4_cbc_enc_sse +#define SM4_CBC_DEC sm4_cbc_dec_sse + +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + /* ====================================================================== */ static void diff --git a/lib/avx512_t2/ghash_api_vaes_avx512.asm b/lib/avx512_t2/ghash_api_vaes_avx512.asm index a0c5c0fb22761d748c80abbcf9703c43c0e6a9ec..2cb0a9215ae6123781a85ff3b26c39763d51edb5 100644 --- a/lib/avx512_t2/ghash_api_vaes_avx512.asm +++ b/lib/avx512_t2/ghash_api_vaes_avx512.asm @@ -175,7 +175,7 @@ ghash_vaes_avx512: vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap simd_store_avx arg4, xmm0, arg5, r12, rax %ifdef SAFE_DATA - clear_zmms_avx512 xmm0, xmm3, xmm4, xmm5, xmm6, xmm15, xmm16, xmm9, xmm19 + clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm6, xmm15, xmm16, xmm9, xmm19 %endif exit_ghash: FUNC_RESTORE diff --git a/lib/avx512_t2/mb_mgr_avx512_t2.c b/lib/avx512_t2/mb_mgr_avx512_t2.c index e8cc7ea4f92bfabe1e8ae5e8857c513342abb891..bcc66811d12b9460f6966acc21f9a25e886cde93 100644 --- a/lib/avx512_t2/mb_mgr_avx512_t2.c +++ b/lib/avx512_t2/mb_mgr_avx512_t2.c @@ -333,11 +333,6 @@ submit_job_docsis256_sec_crc_dec_vaes_avx512(MB_MGR_DOCSIS_AES_OOO *state, IMB_J return job; } -/* SM4 */ -#define SM4_ECB sm4_ecb_sse -#define SM4_CBC_ENC sm4_cbc_enc_sse -#define SM4_CBC_DEC sm4_cbc_dec_sse - #define SUBMIT_JOB_DOCSIS128_SEC_CRC_ENC submit_job_aes_docsis128_enc_crc32_vaes_avx512 #define SUBMIT_JOB_DOCSIS256_SEC_CRC_ENC submit_job_aes_docsis256_enc_crc32_vaes_avx512 #define FLUSH_JOB_DOCSIS128_SEC_CRC_ENC flush_job_aes_docsis128_enc_crc32_vaes_avx512 @@ -345,6 +340,17 @@ submit_job_docsis256_sec_crc_dec_vaes_avx512(MB_MGR_DOCSIS_AES_OOO *state, IMB_J #define SUBMIT_JOB_DOCSIS128_SEC_CRC_DEC submit_job_docsis128_sec_crc_dec_vaes_avx512 #define SUBMIT_JOB_DOCSIS256_SEC_CRC_DEC submit_job_docsis256_sec_crc_dec_vaes_avx512 +/* SM4 */ +#define SM4_ECB sm4_ecb_sse +#define SM4_CBC_ENC sm4_cbc_enc_sse +#define SM4_CBC_DEC sm4_cbc_dec_sse + +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + /* ====================================================================== */ static void diff --git a/lib/avx_t1/mb_mgr_avx_t1.c b/lib/avx_t1/mb_mgr_avx_t1.c index e18b5d6625f39bee7597aabd1008f19533a18dec..b8f2a617edd216ad78affafedec124837d506a2a 100644 --- a/lib/avx_t1/mb_mgr_avx_t1.c +++ b/lib/avx_t1/mb_mgr_avx_t1.c @@ -249,6 +249,12 @@ flush_snow3g_uea2_job_avx_t1(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/avx_t1/sha512_one_block_avx.asm 
b/lib/avx_t1/sha512_one_block_avx.asm index 592ebc59001b212ab48a8b8b2730deafb43f654e..b8e46724f04c0514bd51d50b617ac5531890fc2d 100644 --- a/lib/avx_t1/sha512_one_block_avx.asm +++ b/lib/avx_t1/sha512_one_block_avx.asm @@ -31,9 +31,11 @@ %include "include/clear_regs.inc" %define VMOVDQ vmovdqu ;; assume buffers not aligned +%define EXPORT_DATA 0 %ifndef FUNC %define FUNC sha512_block_avx +%define EXPORT_DATA 1 %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Define Macros @@ -260,10 +262,13 @@ rotate_Xs ROTATE_ARGS %endm +%if EXPORT_DATA != 0 mksection .rodata default rel + +MKGLOBAL(SHA512_K_AVX,data,internal) align 64 -K512: +SHA512_K_AVX: dq 0x428a2f98d728ae22,0x7137449123ef65cd dq 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc dq 0x3956c25bf348b538,0x59f111f1b605d019 @@ -305,10 +310,19 @@ K512: dq 0x4cc5d4becb3e42b6,0x597f299cfc657e2a dq 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +MKGLOBAL(SHA512_SHUFF_MASK_AVX,data,internal) align 16 -PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x08090a0b0c0d0e0f0001020304050607 +SHA512_SHUFF_MASK_AVX: dq 0x0001020304050607, 0x08090a0b0c0d0e0f +%else + +extern SHA512_K_AVX +extern SHA512_SHUFF_MASK_AVX + +%endif + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; void FUNC(void *input_data, UINT64 digest[8]) @@ -350,9 +364,9 @@ FUNC: mov g, [8*6 + CTX] mov h, [8*7 + CTX] - vmovdqa BYTE_FLIP_MASK, [rel PSHUFFLE_BYTE_FLIP_MASK] + vmovdqa BYTE_FLIP_MASK, [rel SHA512_SHUFF_MASK_AVX] - lea TBL,[rel K512] + lea TBL,[rel SHA512_K_AVX] ;; byte swap first 16 qwords COPY_XMM_AND_BSWAP X0, [INP + 0*16], BYTE_FLIP_MASK diff --git a/lib/avx_t2/mb_mgr_avx_t2.c b/lib/avx_t2/mb_mgr_avx_t2.c index 2d49fda06f793599ee63076232c39a1d037fb0cb..f198097bb9eacf650426ec5597ea6950f3e665f5 100644 --- a/lib/avx_t2/mb_mgr_avx_t2.c +++ b/lib/avx_t2/mb_mgr_avx_t2.c @@ -254,6 +254,12 @@ flush_snow3g_uea2_job_avx_t2(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/cmake/windows-mingw.cmake b/lib/cmake/windows-mingw.cmake index 3e34dbbbb2de9d9be0203fbca06411cb7370dd0a..e5d8a5cb9b5d26f2f462df2be73b56173fd215f6 100644 --- a/lib/cmake/windows-mingw.cmake +++ b/lib/cmake/windows-mingw.cmake @@ -89,6 +89,9 @@ endif() if(NOT AVX_IFMA) set(STR_FILTER "${STR_FILTER} /c:_avx2_t3") endif() +if(NOT SMX_NI) + set(STR_FILTER "${STR_FILTER} /c:_avx2_t4") +endif() # filter unused symbol exports if(NOT STR_FILTER) diff --git a/lib/cmake/windows.cmake b/lib/cmake/windows.cmake index f4760f5252e6c08a8ab1c2d365dac859a6f23050..67e3a7edf84fdf0d784a5cdc391dd448b080b7ff 100644 --- a/lib/cmake/windows.cmake +++ b/lib/cmake/windows.cmake @@ -78,6 +78,9 @@ endif() if(NOT AVX_IFMA) set(STR_FILTER "${STR_FILTER} /c:_avx2_t3") endif() +if(NOT SMX_NI) + set(STR_FILTER "${STR_FILTER} /c:_avx2_t4") +endif() # filter unused symbol exports if(NOT STR_FILTER) diff --git a/lib/include/arch_avx2_type4.h b/lib/include/arch_avx2_type4.h new file mode 100644 index 0000000000000000000000000000000000000000..8b9628ce213091f08f29ac961773de527d021401 --- /dev/null +++ b/lib/include/arch_avx2_type4.h @@ -0,0 +1,93 @@ +/******************************************************************************* + Copyright (c) 2023, Intel Corporation + + Redistribution and use in source and binary forms, with or 
without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE + FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef IMB_ASM_AVX2_T4_H +#define IMB_ASM_AVX2_T4_H + +#include "ipsec-mb.h" +#include "ipsec_ooo_mgr.h" + +IMB_DLL_EXPORT void +set_suite_id_avx2_t4(IMB_MGR *state, IMB_JOB *job); + +/* SM4 */ +void +sm4_ecb_ni_avx2(const void *in, void *out, const int size, const void *exp_keys); + +void +sm4_set_key_ni_avx2(const void *pKey, void *exp_enc_keys, void *exp_dec_keys); + +/* SM3 */ +void +sm3_msg_ni_avx2(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length); +IMB_JOB * +sm3_hmac_submit_ni_avx2(IMB_JOB *job); +IMB_JOB * +sm3_msg_submit_ni_avx2(IMB_JOB *job); + +/* SHA512 */ +IMB_DLL_EXPORT void +sha384_ni_avx2(const void *data, const uint64_t length, void *digest); +IMB_DLL_EXPORT void +sha384_one_block_ni_avx2(const void *data, void *digest); + +IMB_DLL_EXPORT void +sha512_one_block_ni_avx2(const void *data, void *digest); +IMB_DLL_EXPORT void +sha512_ni_avx2(const void *data, const uint64_t length, void *digest); + +void +sha512_ni_block_avx2(const void *input, void *); +void +sha512_update_ni_x1(uint64_t digest[8], const void *input, uint64_t num_blocks); + +IMB_JOB * +submit_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); +IMB_JOB * +submit_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); + +IMB_JOB * +flush_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); +IMB_JOB * +flush_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); + +IMB_JOB * +submit_job_sha384_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); +IMB_JOB * +submit_job_sha512_ni_avx2(MB_MGR_SHA_512_OOO *state, IMB_JOB *job); + +IMB_JOB * +flush_job_hmac_sha_384_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state); +IMB_JOB * +flush_job_hmac_sha_512_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state); + +IMB_JOB * +submit_job_hmac_sha_384_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job); +IMB_JOB * +submit_job_hmac_sha_512_ni_avx2(MB_MGR_HMAC_SHA_512_OOO *state, IMB_JOB *job); + +#endif /* IMB_ASM_AVX2_T4_H */ diff --git a/lib/include/arch_avx_type1.h b/lib/include/arch_avx_type1.h index 
b621c31e8780ad9381be4f57d7b75b7d451fc65a..e75cdb5291a2ee9ebd46ea8928b084b4b4cde007 100644 --- a/lib/include/arch_avx_type1.h +++ b/lib/include/arch_avx_type1.h @@ -310,6 +310,17 @@ call_sha_256_mult_avx_from_c(SHA256_ARGS *args, uint32_t size_in_blocks); void call_sha512_x2_avx_from_c(SHA512_ARGS *args, uint64_t size_in_blocks); +void +sha1_block_avx(const void *, void *); +void +sha224_block_avx(const void *, void *); +void +sha256_block_avx(const void *, void *); +void +sha384_block_avx(const void *, void *); +void +sha512_block_avx(const void *, void *); + IMB_DLL_EXPORT void sha1_avx(const void *data, const uint64_t length, void *digest); IMB_DLL_EXPORT void diff --git a/lib/include/arch_sse_type1.h b/lib/include/arch_sse_type1.h index 73b863c9a9215a209a7f255a649a49ac36a3efaf..4d4cd8ba572e8d8b059bb91005e406ff6c3fb34d 100644 --- a/lib/include/arch_sse_type1.h +++ b/lib/include/arch_sse_type1.h @@ -162,6 +162,17 @@ call_sha_256_mult_sse_from_c(SHA256_ARGS *args, uint32_t size_in_blocks); void call_sha512_x2_sse_from_c(SHA512_ARGS *args, uint64_t size_in_blocks); +void +sha1_block_sse(const void *, void *); +void +sha224_block_sse(const void *, void *); +void +sha256_block_sse(const void *, void *); +void +sha384_block_sse(const void *, void *); +void +sha512_block_sse(const void *, void *); + IMB_DLL_EXPORT void sha1_sse(const void *data, const uint64_t length, void *digest); IMB_DLL_EXPORT void @@ -348,6 +359,17 @@ sm4_cbc_dec_sse(const void *in, void *out, const int size, const void *exp_dec_k void sm4_set_key_sse(const void *pKey, void *exp_enc_keys, void *exp_dec_keys); +/* SM3 */ +void +sm3_one_block_sse(void *tag, const void *msg); +void +sm3_msg_sse(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length); +IMB_JOB * +sm3_hmac_submit_sse(IMB_JOB *job); +IMB_JOB * +sm3_msg_submit_sse(IMB_JOB *job); + +/* suite id */ IMB_DLL_EXPORT void set_suite_id_sse_t1(IMB_MGR *state, IMB_JOB *job); diff --git a/lib/include/arch_sse_type2.h b/lib/include/arch_sse_type2.h index b291f9a96adbd16af39758ba6574377a696616a0..adcf15cdaddb7eeead79fe29400f3fb4ae357b7f 100644 --- a/lib/include/arch_sse_type2.h +++ b/lib/include/arch_sse_type2.h @@ -41,6 +41,11 @@ call_sha224_ni_x2_sse_from_c(SHA256_ARGS *args, uint32_t size_in_blocks); void call_sha256_ni_x2_sse_from_c(SHA256_ARGS *args, uint32_t size_in_blocks); +void +sha1_ni_block_sse(const void *, void *); +void +sha256_ni_block_sse(const void *, void *); + IMB_DLL_EXPORT void sha1_sse_shani(const void *data, const uint64_t length, void *digest); IMB_DLL_EXPORT void diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index e3fc9cdde5239bf186c10f95fa38ddd999c7f063..1da6d1380eddb990280819b971b52180f5808aa8 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -458,7 +458,7 @@ align 32 %ifdef SAFE_DATA clear_zmms_avx512 xmm0, xmm1, xmm2, xmm7, xmm8, xmm9, xmm11, xmm10, xmm14, \ - xmm15, xmm16, xmm17, xmm18, xmm20, xmm21 + xmm15, xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26 %endif .exit_enc: @@ -526,7 +526,8 @@ align 32 DEC, {qword [arg2 + _iv_len_in_bytes]} %ifdef SAFE_DATA - clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm9, xmm15, xmm16, xmm17, xmm18, xmm19, xmm20, xmm21 + clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm8, xmm9, xmm14, xmm15, \ + xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26, xmm27 %endif .exit_dec: ;; mark job complete diff --git a/lib/include/ipsec_ooo_mgr.h b/lib/include/ipsec_ooo_mgr.h index 
ed427f79430b288066c86681059ab1e2c2e977ac..c446099feae2b3e9af3c017c8063af53b407f2d9 100644 --- a/lib/include/ipsec_ooo_mgr.h +++ b/lib/include/ipsec_ooo_mgr.h @@ -451,6 +451,8 @@ init_mb_mgr_avx2_t2_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_avx2_t3_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void +init_mb_mgr_avx2_t4_internal(IMB_MGR *state, const int reset_mgrs); +IMB_DLL_LOCAL void init_mb_mgr_avx512_internal(IMB_MGR *state, const int reset_mgrs); IMB_DLL_LOCAL void init_mb_mgr_avx512_t1_internal(IMB_MGR *state, const int reset_mgrs); @@ -474,6 +476,8 @@ get_next_burst_avx2_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t get_next_burst_avx2_t3(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t +get_next_burst_avx2_t4(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); +IMB_DLL_EXPORT uint32_t get_next_burst_avx512_t1(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t get_next_burst_avx512_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); @@ -495,6 +499,8 @@ submit_burst_avx2_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t submit_burst_avx2_t3(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t +submit_burst_avx2_t4(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); +IMB_DLL_EXPORT uint32_t submit_burst_avx512_t1(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t submit_burst_avx512_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); @@ -516,6 +522,8 @@ submit_burst_nocheck_avx2_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jo IMB_DLL_EXPORT uint32_t submit_burst_nocheck_avx2_t3(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t +submit_burst_nocheck_avx2_t4(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); +IMB_DLL_EXPORT uint32_t submit_burst_nocheck_avx512_t1(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t submit_burst_nocheck_avx512_t2(IMB_MGR *state, const uint32_t n_jobs, IMB_JOB **jobs); @@ -537,6 +545,8 @@ flush_burst_avx2_t2(IMB_MGR *state, const uint32_t max_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t flush_burst_avx2_t3(IMB_MGR *state, const uint32_t max_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t +flush_burst_avx2_t4(IMB_MGR *state, const uint32_t max_jobs, IMB_JOB **jobs); +IMB_DLL_EXPORT uint32_t flush_burst_avx512_t1(IMB_MGR *state, const uint32_t max_jobs, IMB_JOB **jobs); IMB_DLL_EXPORT uint32_t flush_burst_avx512_t2(IMB_MGR *state, const uint32_t max_jobs, IMB_JOB **jobs); @@ -574,6 +584,10 @@ submit_cipher_burst_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, const IMB_KEY_SIZE_BYTES key_size); IMB_DLL_EXPORT uint32_t +submit_cipher_burst_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t submit_cipher_burst_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, const IMB_KEY_SIZE_BYTES key_size); @@ -615,6 +629,10 @@ submit_cipher_burst_nocheck_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_ const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, const IMB_KEY_SIZE_BYTES key_size); IMB_DLL_EXPORT uint32_t 
+submit_cipher_burst_nocheck_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, + const IMB_KEY_SIZE_BYTES key_size); +IMB_DLL_EXPORT uint32_t submit_cipher_burst_nocheck_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_CIPHER_MODE cipher, const IMB_CIPHER_DIRECTION dir, const IMB_KEY_SIZE_BYTES key_size); @@ -648,6 +666,9 @@ IMB_DLL_EXPORT uint32_t submit_hash_burst_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); IMB_DLL_EXPORT uint32_t +submit_hash_burst_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_HASH_ALG hash); +IMB_DLL_EXPORT uint32_t submit_hash_burst_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); IMB_DLL_EXPORT uint32_t @@ -679,6 +700,9 @@ IMB_DLL_EXPORT uint32_t submit_hash_burst_nocheck_avx2_t3(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); IMB_DLL_EXPORT uint32_t +submit_hash_burst_nocheck_avx2_t4(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, + const IMB_HASH_ALG hash); +IMB_DLL_EXPORT uint32_t submit_hash_burst_nocheck_avx512_t1(IMB_MGR *state, IMB_JOB *jobs, const uint32_t n_jobs, const IMB_HASH_ALG hash); IMB_DLL_EXPORT uint32_t @@ -797,6 +821,20 @@ get_next_job_avx2_t3(IMB_MGR *state); IMB_DLL_EXPORT IMB_JOB * get_completed_job_avx2_t3(IMB_MGR *state); +/* AVX2 TYPE4 manager functions */ +IMB_DLL_EXPORT IMB_JOB * +submit_job_avx2_t4(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB * +flush_job_avx2_t4(IMB_MGR *state); +IMB_DLL_EXPORT uint32_t +queue_size_avx2_t4(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB * +submit_job_nocheck_avx2_t4(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB * +get_next_job_avx2_t4(IMB_MGR *state); +IMB_DLL_EXPORT IMB_JOB * +get_completed_job_avx2_t4(IMB_MGR *state); + /* AVX512 TYPE1 manager functions */ IMB_DLL_EXPORT IMB_JOB * submit_job_avx512_t1(IMB_MGR *state); diff --git a/lib/include/mb_mgr_job_api.h b/lib/include/mb_mgr_job_api.h index 3e49b54a59a45aa8f2c8625a4d52ca317ace6a0d..bacb4e99be42061ecb5eab7ee51d5fc7adeb27fe 100644 --- a/lib/include/mb_mgr_job_api.h +++ b/lib/include/mb_mgr_job_api.h @@ -56,7 +56,6 @@ #include "include/job_api_kasumi.h" #endif /* __aarch64__ */ #include "include/mb_mgr_job_check.h" /* is_job_invalid() */ -#include "include/sm3.h" #ifndef __aarch64__ #define CRC(func, state, job) \ @@ -2692,12 +2691,7 @@ SUBMIT_JOB_HASH_EX(IMB_MGR *state, IMB_JOB *job, const IMB_HASH_ALG hash_alg) case IMB_AUTH_HMAC_SHA_512: return SUBMIT_JOB_HMAC_SHA_512(hmac_sha_512_ooo, job); case IMB_AUTH_HMAC_SM3: - sm3_hmac_msg(job->auth_tag_output, job->auth_tag_output_len_in_bytes, - job->src + job->hash_start_src_offset_in_bytes, - job->msg_len_to_hash_in_bytes, job->u.HMAC._hashed_auth_key_xor_ipad, - job->u.HMAC._hashed_auth_key_xor_opad); - job->status |= IMB_STATUS_COMPLETED_AUTH; - return job; + return SUBMIT_JOB_HMAC_SM3(job); case IMB_AUTH_AES_XCBC: return SUBMIT_JOB_AES_XCBC(aes_xcbc_ooo, job); case IMB_AUTH_MD5: @@ -2838,11 +2832,7 @@ SUBMIT_JOB_HASH_EX(IMB_MGR *state, IMB_JOB *job, const IMB_HASH_ALG hash_alg) case IMB_AUTH_GHASH: return process_ghash(state, job); case IMB_AUTH_SM3: - sm3_msg(job->auth_tag_output, job->auth_tag_output_len_in_bytes, - job->src + job->hash_start_src_offset_in_bytes, - job->msg_len_to_hash_in_bytes); - job->status |= IMB_STATUS_COMPLETED_AUTH; - return job; + return SUBMIT_JOB_SM3(job); #endif /* __aarch64__ */ default: /** diff --git a/lib/include/reg_sizes.inc 
b/lib/include/reg_sizes.inc index 42f14e2db84917d65f4130660f6a7bac564d495c..60a4ed75c6ca774e7086de60c8954872e7c09b9b 100644 --- a/lib/include/reg_sizes.inc +++ b/lib/include/reg_sizes.inc @@ -30,34 +30,66 @@ %ifndef _REG_SIZES_INC_ %define _REG_SIZES_INC_ +%define raxq rax +%define eaxq rax %define raxd eax %define raxw ax %define raxb al +%define rbxq rbx +%define ebxq rbx %define rbxd ebx %define rbxw bx %define rbxb bl +%define rcxq rcx +%define ecxq rcx %define rcxd ecx %define rcxw cx %define rcxb cl +%define rdxq rdx +%define edxq rdx %define rdxd edx %define rdxw dx %define rdxb dl +%define rsiq rsi +%define esiq rsi %define rsid esi %define rsiw si %define rsib sil +%define rdiq rdi +%define ediq rdi %define rdid edi %define rdiw di %define rdib dil +%define rbpq rbp +%define ebpq rbp %define rbpd ebp %define rbpw bp %define rbpb bpl +%define r8dq r8 +%define r9dq r9 +%define r10dq r10 +%define r11dq r11 +%define r12dq r12 +%define r13dq r13 +%define r14dq r14 +%define r15dq r15 + +%define r8q r8 +%define r9q r9 +%define r10q r10 +%define r11q r11 +%define r12q r12 +%define r13q r13 +%define r14q r14 +%define r15q r15 + %xdefine zmm0x xmm0 %xdefine zmm1x xmm1 %xdefine zmm2x xmm2 @@ -355,6 +387,7 @@ %xdefine zmm30z zmm30 %xdefine zmm31z zmm31 +%define QWORD(reg) reg %+ q %define DWORD(reg) reg %+ d %define WORD(reg) reg %+ w %define BYTE(reg) reg %+ b diff --git a/lib/include/sha_generic.h b/lib/include/sha_generic.h index c1d68d6ace0fe49ac77955436a4628a8ce7e3b3a..85c917baaee6eb17f7334511d27b8ed3aa53e315 100644 --- a/lib/include/sha_generic.h +++ b/lib/include/sha_generic.h @@ -36,57 +36,35 @@ #include "constants.h" #include "include/clear_regs_mem.h" #include "include/error.h" +#include "include/arch_sse_type1.h" +#include "include/arch_sse_type2.h" +#include "include/arch_avx_type1.h" +#include "include/arch_avx2_type4.h" -extern void -sha1_block_sse(const void *, void *); -extern void -sha1_block_avx(const void *, void *); -extern void -sha1_ni_block_sse(const void *, void *); - -extern void -sha224_block_sse(const void *, void *); -extern void -sha224_block_avx(const void *, void *); - -extern void -sha256_block_sse(const void *, void *); -extern void -sha256_block_avx(const void *, void *); -extern void -sha256_ni_block_sse(const void *, void *); - -extern void -sha384_block_sse(const void *, void *); -extern void -sha384_block_avx(const void *, void *); - -extern void -sha512_block_sse(const void *, void *); -extern void -sha512_block_avx(const void *, void *); - -enum arch_type { ARCH_SSE = 0, ARCH_SSE_SHANI, ARCH_AVX }; +enum arch_type { ARCH_SSE = 0, ARCH_SSE_SHANI, ARCH_AVX, ARCH_AVX2_SHANI }; /* ========================================================================== */ /* * Various utility functions for SHA API */ - __forceinline uint32_t bswap4(const uint32_t val) { - return ((val >> 24) | /**< A*/ - ((val & 0xff0000) >> 8) | /**< B*/ - ((val & 0xff00) << 8) | /**< C*/ - (val << 24)); /**< D*/ +#ifdef LINUX + return __builtin_bswap32(val); +#else + return _byteswap_ulong(val); +#endif } __forceinline uint64_t bswap8(const uint64_t val) { - return (((uint64_t) bswap4((uint32_t) val)) << 32) | - (((uint64_t) bswap4((uint32_t) (val >> 32)))); +#ifdef LINUX + return __builtin_bswap64(val); +#else + return _byteswap_uint64(val); +#endif } __forceinline void @@ -132,6 +110,7 @@ __forceinline void sha_generic_one_block(const void *inp, void *digest, const enum arch_type arch, const int sha_type) { if (sha_type == 1) { + IMB_ASSERT(arch != ARCH_AVX2_SHANI); if (arch == 
ARCH_AVX) sha1_block_avx(inp, digest); else if (arch == ARCH_SSE) @@ -139,14 +118,15 @@ sha_generic_one_block(const void *inp, void *digest, const enum arch_type arch, else /* arch == ARCH_SSE_SHANI */ sha1_ni_block_sse(inp, digest); } else if (sha_type == 224) { + IMB_ASSERT(arch != ARCH_AVX2_SHANI); if (arch == ARCH_AVX) sha224_block_avx(inp, digest); else if (arch == ARCH_SSE) sha224_block_sse(inp, digest); else /* arch == ARCH_SSE_SHANI */ - /* Same as SHA-224 */ sha256_ni_block_sse(inp, digest); } else if (sha_type == 256) { + IMB_ASSERT(arch != ARCH_AVX2_SHANI); if (arch == ARCH_AVX) sha256_block_avx(inp, digest); else if (arch == ARCH_SSE) @@ -154,15 +134,31 @@ sha_generic_one_block(const void *inp, void *digest, const enum arch_type arch, else /* arch == ARCH_SSE_SHANI */ sha256_ni_block_sse(inp, digest); } else if (sha_type == 384) { + IMB_ASSERT(arch != ARCH_SSE_SHANI); if (arch == ARCH_AVX) sha384_block_avx(inp, digest); +#ifdef SMX_NI + else if (arch == ARCH_SSE) + sha384_block_sse(inp, digest); + else /* arch == ARCH_AVX2_SHANI */ + sha512_ni_block_avx2(inp, digest); +#else else sha384_block_sse(inp, digest); +#endif } else if (sha_type == 512) { + IMB_ASSERT(arch != ARCH_SSE_SHANI); if (arch == ARCH_AVX) sha512_block_avx(inp, digest); +#ifdef SMX_NI + else if (arch == ARCH_SSE) + sha512_block_sse(inp, digest); + else /* arch == ARCH_AVX2_SHANI */ + sha512_ni_block_avx2(inp, digest); +#else else sha512_block_sse(inp, digest); +#endif } } @@ -318,7 +314,7 @@ sha_generic(const void *data, const uint64_t length, void *digest, const enum ar clear_mem(cb, sizeof(cb)); clear_mem(&local_digest, sizeof(local_digest)); clear_scratch_gps(); - if (arch == ARCH_AVX) + if (arch == ARCH_AVX || arch == ARCH_AVX2_SHANI) clear_scratch_xmms_avx(); else clear_scratch_xmms_sse(); @@ -343,7 +339,7 @@ sha_generic_1block(const void *data, void *digest, const enum arch_type arch, co sha_generic_one_block(data, digest, arch, sha_type); #ifdef SAFE_DATA clear_scratch_gps(); - if (arch == ARCH_AVX) + if (arch == ARCH_AVX || arch == ARCH_AVX2_SHANI) clear_scratch_xmms_avx(); else clear_scratch_xmms_sse(); diff --git a/lib/include/sm3.h b/lib/include/sm3.h deleted file mode 100644 index ca6e82731e29d138269997c4df6677714583777a..0000000000000000000000000000000000000000 --- a/lib/include/sm3.h +++ /dev/null @@ -1,67 +0,0 @@ -/******************************************************************************* - Copyright (c) 2023, Intel Corporation - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#include "ipsec-mb.h" - -#ifndef SM3_H -#define SM3_H - -/** - * @brief Authenticates message with SM3 hash algorithm - * - * @param tag pointer to store computed SM3 digest - * @param tag_length output tag length in bytes - * @param msg pointer to input message to compute digest over - * @param msg_length length of the message in bytes - */ -IMB_DLL_LOCAL void -sm3_msg(void *tag, const uint64_t tag_len, const void *msg, const uint64_t msg_length); - -/** - * @brief Authenticates one block with SM3 hash algorithm - * - * @param tag pointer to store computed SM3 digest - * @param msg pointer to input block compute digest over - */ -IMB_DLL_LOCAL void -sm3_one_block(void *tag, const void *msg); - -/** - * @brief Authenticates message with SM3-HMAC hash algorithm - * - * @param tag pointer to store computed SM3 digest - * @param tag_length output tag length in bytes - * @param msg pointer to input message to compute digest over - * @param msg_length length of the message in bytes - * @param ipad_hash block-sized inner padding - * @param opad_hash block-sized outer padding - */ -IMB_DLL_LOCAL void -sm3_hmac_msg(void *tag, const uint64_t tag_len, const void *msg, const uint64_t msg_length, - const void *ipad_hash, const void *opad_hash); - -#endif /* SM3_H */ diff --git a/lib/ipsec-mb.h b/lib/ipsec-mb.h index 3ea415c6f6adc28d5c7aaf886df16cb85c33fb8c..94c6c96f64e552703214a6009948e92b21e1dfa0 100644 --- a/lib/ipsec-mb.h +++ b/lib/ipsec-mb.h @@ -946,6 +946,9 @@ typedef int (*imb_self_test_cb_t)(void *cb_arg, const IMB_SELF_TEST_CALLBACK_DAT #define IMB_FEATURE_SELF_TEST_PASS (1ULL << 21) /* self-test passed */ #define IMB_FEATURE_AVX_IFMA (1ULL << 22) #define IMB_FEATURE_HYBRID (1ULL << 23) /* Hybrid core */ +#define IMB_FEATURE_SM3NI (1ULL << 24) +#define IMB_FEATURE_SM4NI (1ULL << 25) +#define IMB_FEATURE_SHA512NI (1ULL << 26) /** * Self test defines @@ -977,6 +980,8 @@ typedef int (*imb_self_test_cb_t)(void *cb_arg, const IMB_SELF_TEST_CALLBACK_DAT IMB_FEATURE_GFNI) #define IMB_CPUFLAGS_AVX_T2 (IMB_CPUFLAGS_AVX | IMB_FEATURE_SHANI | IMB_FEATURE_GFNI) #define IMB_CPUFLAGS_AVX2_T3 (IMB_CPUFLAGS_AVX2_T2 | IMB_FEATURE_AVX_IFMA) +#define IMB_CPUFLAGS_AVX2_T4 \ + (IMB_CPUFLAGS_AVX2_T3 | IMB_FEATURE_SM3NI | IMB_FEATURE_SM4NI | IMB_FEATURE_SHA512NI) #define IMB_FEATURE_AARCH64 (1ULL << 32) #define IMB_FEATURE_ASIMD (1ULL << 33) diff --git a/lib/libIPSec_MB.def b/lib/libIPSec_MB.def index 5cb115d9961a2e14e01791c57612539df2599c53..4f5c283c8aaa10e8ee8b5a7bdeb77451fa4bc08e 100644 --- a/lib/libIPSec_MB.def +++ b/lib/libIPSec_MB.def @@ -714,3 +714,17 @@ EXPORTS imb_quic_hp_chacha20 @688 imb_self_test_set_cb @689 imb_self_test_get_cb @690 + get_next_burst_avx2_t4 @691 + submit_burst_avx2_t4 @692 + submit_burst_nocheck_avx2_t4 @693 + flush_burst_avx2_t4 @694 + submit_cipher_burst_avx2_t4 @695 + submit_cipher_burst_nocheck_avx2_t4 @696 + submit_hash_burst_avx2_t4 @697 + submit_hash_burst_nocheck_avx2_t4 @698 + 
flush_job_avx2_t4 @699 + queue_size_avx2_t4 @700 + submit_job_avx2_t4 @701 + submit_job_nocheck_avx2_t4 @702 + get_next_job_avx2_t4 @703 + get_completed_job_avx2_t4 @704 diff --git a/lib/no-aesni/mb_mgr_sse_no_aesni.c b/lib/no-aesni/mb_mgr_sse_no_aesni.c index f02c1673c6507a1463236180a491e6026684582b..7565ea887ed254648609e208eaa066b2bc84af0e 100644 --- a/lib/no-aesni/mb_mgr_sse_no_aesni.c +++ b/lib/no-aesni/mb_mgr_sse_no_aesni.c @@ -225,6 +225,12 @@ #define SM4_CBC_ENC sm4_cbc_enc_sse_no_aesni #define SM4_CBC_DEC sm4_cbc_dec_sse_no_aesni +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + static void reset_ooo_mgrs(IMB_MGR *state) { diff --git a/lib/sse_t1/mb_mgr_sse_t1.c b/lib/sse_t1/mb_mgr_sse_t1.c index 3adfd9513de5988ad8eefa11a0f7a375fd5c7c60..a6bebbac26208c663a2a91a5a1631542cd94f1bc 100644 --- a/lib/sse_t1/mb_mgr_sse_t1.c +++ b/lib/sse_t1/mb_mgr_sse_t1.c @@ -255,6 +255,12 @@ flush_snow3g_uea2_job_sse(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + /* ====================================================================== */ static void diff --git a/lib/sse_t1/sm3_base_hmac_sse.asm b/lib/sse_t1/sm3_base_hmac_sse.asm new file mode 100644 index 0000000000000000000000000000000000000000..748828070452384ec2ecf1d59b670c646e6d2fe4 --- /dev/null +++ b/lib/sse_t1/sm3_base_hmac_sse.asm @@ -0,0 +1,335 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +extern sm3_base_init +extern sm3_base_update + +%include "include/os.inc" +%include "include/reg_sizes.inc" +%include "include/imb_job.inc" +%include "include/memcpy.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define gp1 rax +%define gp2 r8 +%define gp3 r9 +%define gp4 r10 +%define gp5 r11 +%define gp6 arg4 +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define gp1 rax +%define gp2 r10 +%define gp3 r11 +%define gp4 arg4 +%define gp5 rdi +%define gp6 rsi +%define gp7 r12 +%define gp8 r13 +%define gp9 r14 +%define gp10 r15 +%define gp11 rbx +%define gp12 rbp + +%endif + +%xdefine t1 gp1 +%xdefine t2 gp2 +%xdefine t3 gp3 +%xdefine t4 gp3 + +%xdefine r1 gp12 +%xdefine r2 gp11 +%xdefine r3 gp10 + +%define arg_job r1 +%define arg_msg r2 +%define arg_msg_length r3 + +;; HMAC-SM3 stack frame +struc STACK +_B: resb 64 ; two SM3 blocks (aligned to 16) +_D: resd 8 ; digest +_gpr_save: resq 8 ; space for GPR's +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +;; PAD BLOCKS are used for OPAD where digest of IPAD + message is put into the block. +;; The blocks below fill up top 32 bytes of the block, +;; low 32 bytes get filled with the digest. +align 16 +PAD_BLOCK1: + db 0x80, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + +align 16 +PAD_BLOCK2: + ;; last qword has to encode length in bits of: BLOCK size + DIGEST size + ;; (64 + 32) * 8 = 768 = 0x300 in hex + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x00, 0x00 + db 0x00, 0x00, 0x03, 0x00 + +mksection .text + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save + 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *sm3_hmac_submit_sse(IMB_JOB *job) +align 32 +MKGLOBAL(sm3_hmac_submit_sse,function,internal) +sm3_hmac_submit_sse: + FUNC_START + + ;; save input arguments + mov arg_job, arg1 + + ;; init the digest with IPAD + mov t1, [arg_job + _auth_key_xor_ipad] + movdqu xmm0, [t1 + 0*16] + 
movdqu xmm1, [t1 + 1*16] + movdqa [rsp + _D + 0*16], xmm0 + movdqa [rsp + _D + 1*16], xmm1 + + ;; update digest for full number of blocks + lea arg1, [rsp + _D] + mov arg2, [arg_job + _src] + add arg2, [arg_job + _hash_start_src_offset] + mov arg_msg, arg2 + mov arg_msg_length, [arg_job + _msg_len_to_hash_in_bytes] + mov arg3, arg_msg_length + shr arg3, 6 ;; msg_length / SM3_BLOCK_SIZE + call sm3_base_update + + ;; prepare partial block + mov DWORD(arg3), 63 + not arg3 + and arg3, arg_msg_length ;; number of bytes processed already + add arg_msg, arg3 ;; move message pointer to start of the partial block + mov t2, arg_msg_length + sub t2, arg3 ;; t2 = number of bytes left + + xor DWORD(arg1), DWORD(arg1) +.partial_block_copy: + cmp DWORD(arg1), DWORD(t2) + je .partial_block_copy_exit + mov BYTE(t1), [arg_msg + arg1] + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_copy + +.partial_block_copy_exit: + ;; put end of message marker + mov BYTE [rsp + _B + arg1], 0x80 + inc DWORD(arg1) + + xor DWORD(t1), DWORD(t1) +.partial_block_zero: + cmp DWORD(arg1), 64 + je .partial_block_zero_exit + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_zero + +.partial_block_zero_exit: + cmp DWORD(t2), 64 - 8 + jb .add_msg_length + + ;; if length field doesn't fit into this partial block + ;; - compute digest on the current block + ;; - clear the block for the length to be put into it next + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_base_update + + xor DWORD(t1), DWORD(t1) + mov [rsp + _B + 0*8], t1 + mov [rsp + _B + 1*8], t1 + mov [rsp + _B + 2*8], t1 + mov [rsp + _B + 3*8], t1 + mov [rsp + _B + 4*8], t1 + mov [rsp + _B + 5*8], t1 + mov [rsp + _B + 6*8], t1 + +.add_msg_length: + lea t1, [arg_msg_length*8 + 64*8] ;; original message length in bits + 1 IPAD block + bswap t1 + mov [rsp + _B + 7*8], t1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_base_update + +.process_opad: + movdqa xmm0, [rsp + _D + 0*16] + movdqa xmm1, [rsp + _D + 1*16] + movdqa xmm2, [rel PAD_BLOCK1] + movdqa xmm3, [rel PAD_BLOCK2] + pshufb xmm0, [rel SHUFF_MASK] + pshufb xmm1, [rel SHUFF_MASK] + movdqa [rsp + _B + 0*16], xmm0 + movdqa [rsp + _B + 1*16], xmm1 + movdqa [rsp + _B + 2*16], xmm2 + movdqa [rsp + _B + 3*16], xmm3 + + ;; init the digest with OPAD + mov t1, [arg_job + _auth_key_xor_opad] + movdqu xmm0, [t1 + 0*16] + movdqu xmm1, [t1 + 1*16] + movdqa [rsp + _D + 0*16], xmm0 + movdqa [rsp + _D + 1*16], xmm1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_base_update + +.tag_store_start: + ;; byte swap the digest and write it back + lea arg1, [rsp + _D] + movdqa xmm0, [arg1 + 0*16] + movdqa xmm1, [arg1 + 1*16] + pshufb xmm0, [rel SHUFF_MASK] + pshufb xmm1, [rel SHUFF_MASK] + + mov t1, [arg_job + _auth_tag_output] + mov t2, [arg_job + _auth_tag_output_len_in_bytes] + cmp t2, 32 + je .tag_store_32 + + cmp t2, 16 + jb .tag_store_1_15 + je .tag_store_16 + +.tag_store_16_31: + movdqu [t1 + 0*16], xmm0 + lea t1, [t1 + 16] + movdqa xmm1, xmm0 + sub t2, 16 + ;; fall through to store remaining tag bytes + +.tag_store_1_15: + simd_store_sse t1, xmm0, t2, t3, t4 + jmp .tag_store_end + +.tag_store_32: + movdqu [t1 + 1*16], xmm1 + ;; fall through to store 1st 16 bytes + +.tag_store_16: + movdqu [t1 + 0*16], xmm0 + ;; fall through + +.tag_store_end: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + pxor xmm1, xmm1 + pxor xmm2, xmm2 + pxor xmm3, xmm3 + + movdqu [rsp + _B + 0*16], xmm0 + movdqu [rsp + _B + 1*16], xmm0 
+ movdqu [rsp + _B + 2*16], xmm0 + movdqu [rsp + _B + 3*16], xmm0 +%endif + + mov rax, arg_job + or dword [arg_job + _status], IMB_STATUS_COMPLETED_AUTH + FUNC_END + ret + +mksection stack-noexec diff --git a/lib/sse_t1/sm3_base_init_sse.asm b/lib/sse_t1/sm3_base_init_sse.asm new file mode 100644 index 0000000000000000000000000000000000000000..3da7ed87847a745c0be90286d782c9cc2d737165 --- /dev/null +++ b/lib/sse_t1/sm3_base_init_sse.asm @@ -0,0 +1,67 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +%use smartalign +%include "include/os.inc" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +mksection .rodata +default rel + +align 16 +I_const: + dd 0x7380166f, 0x4914b2b9, 0x172442d7, 0xda8a0600 + dd 0xa96f30bc, 0x163138aa, 0xe38dee4d, 0xb0fb0e4e + +mksection .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_base_init(uint32_t digest[8]) +align 32 +MKGLOBAL(sm3_base_init,function,internal) +sm3_base_init: + movdqu xmm0, [rel I_const + 0*16] + movdqu xmm1, [rel I_const + 1*16] + movdqu [arg1 + 0*16], xmm0 + movdqu [arg1 + 1*16], xmm1 + ret + + +mksection stack-noexec diff --git a/lib/sse_t1/sm3_base_msg_sse.asm b/lib/sse_t1/sm3_base_msg_sse.asm new file mode 100644 index 0000000000000000000000000000000000000000..2cb3f5438103d013871e453e723b62ee8b9eb9dd --- /dev/null +++ b/lib/sse_t1/sm3_base_msg_sse.asm @@ -0,0 +1,297 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +extern sm3_base_init +extern sm3_base_update + +%include "include/os.inc" +%include "include/reg_sizes.inc" +%include "include/memcpy.inc" +%include "include/imb_job.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define t1 rax +%define t2 r8 +%define t3 r9 +%define t4 r10 +%define t5 r11 +%define t6 r12 +%define t7 r13 +%define t8 r14 +%define t9 r15 +%define t10 rbx +%define t11 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define t1 rax +%define t2 r10 +%define t3 r11 +%define t4 rdi +%define t5 rsi +%define t6 r12 +%define t7 r13 +%define t8 r14 +%define t9 r15 +%define t10 rbx +%define t11 rbp + +%endif + +%xdefine r1 t6 +%xdefine r2 t7 +%xdefine r3 t8 +%xdefine r4 t9 +%xdefine r5 t10 +%xdefine r6 t11 + +%define arg_tag r1 +%define arg_tag_length r2 +%define arg_msg r3 +%define arg_msg_length r4 + +;; SM3 stack frame +struc STACK +_B: resb 64 ; one SM3 block (aligned to 16) +_D: resd 8 ; digest +_gpr_save: resq 8 ; space for GPR's +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +mksection .text + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save 
+ 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_msg_sse(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length) +align 32 +MKGLOBAL(sm3_msg_sse,function,internal) +sm3_msg_sse: + FUNC_START + + ;; save input arguments + mov arg_tag, arg1 + mov arg_tag_length, arg2 + mov arg_msg, arg3 + mov arg_msg_length, arg4 + + ;; init the digest + lea arg1, [rsp + _D] + call sm3_base_init + + ;; update digest for full number of blocks + ;; - arg1 stays unchanged + mov arg2, arg_msg + mov arg3, arg_msg_length + shr arg3, 6 ;; msg_length / SM3_BLOCK_SIZE + call sm3_base_update + + ;; prepare partial block + mov DWORD(arg3), 63 + not arg3 + and arg3, arg_msg_length ;; number of bytes processed already + add arg_msg, arg3 ;; move message pointer to start of the partial block + mov r5, arg_msg_length + sub r5, arg3 ;; r5 = number of bytes left + + xor DWORD(arg1), DWORD(arg1) +.partial_block_copy: + cmp DWORD(arg1), DWORD(r5) + je .partial_block_copy_exit + mov BYTE(t1), [arg_msg + arg1] + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_copy + +.partial_block_copy_exit: + ;; put end of message marker + mov BYTE [rsp + _B + arg1], 0x80 + inc DWORD(arg1) + + xor DWORD(t1), DWORD(t1) +.partial_block_zero: + cmp DWORD(arg1), 64 + je .partial_block_zero_exit + mov [rsp + _B + arg1], BYTE(t1) + inc DWORD(arg1) + jmp .partial_block_zero + +.partial_block_zero_exit: + cmp DWORD(r5), 64 - 8 + jb .add_msg_length + + ;; if length field doesn't fit into this partial block + ;; - compute digest on the current block + ;; - clear the block for the length to be put into it next + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_base_update + + xor DWORD(t1), DWORD(t1) + mov [rsp + _B + 0*8], t1 + mov [rsp + _B + 1*8], t1 + mov [rsp + _B + 2*8], t1 + mov [rsp + _B + 3*8], t1 + mov [rsp + _B + 4*8], t1 + mov [rsp + _B + 5*8], t1 + mov [rsp + _B + 6*8], t1 + +.add_msg_length: + lea t1, [arg_msg_length*8] ;; original message length in bits + bswap t1 + mov [rsp + _B + 7*8], t1 + + lea arg1, [rsp + _D] + lea arg2, [rsp + _B] + mov DWORD(arg3), 1 + call sm3_base_update + +.tag_store_start: + ;; byte swap the digest and write it back + lea arg1, [rsp + _D] + movdqa xmm0, [arg1 + 0*16] + movdqa xmm1, [arg1 + 1*16] + pshufb xmm0, [rel SHUFF_MASK] + pshufb xmm1, [rel SHUFF_MASK] + + cmp arg_tag_length, 32 + je .tag_store_32 + + cmp arg_tag_length, 16 + jb .tag_store_1_15 + je .tag_store_16 + +.tag_store_16_31: + movdqu [arg_tag + 0*16], xmm0 + lea arg_tag, [arg_tag + 16] + movdqa xmm1, xmm0 + sub arg_tag_length, 16 + ;; fall through to store remaining tag bytes + +.tag_store_1_15: + simd_store_sse arg_tag, xmm0, arg_tag_length, r5, t1 + jmp .tag_store_end + +.tag_store_32: + movdqu [arg_tag + 1*16], xmm1 + ;; fall through to store 1st 16 bytes + +.tag_store_16: + movdqu [arg_tag + 0*16], xmm0 + ;; fall through + +.tag_store_end: + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + movdqu [rsp + _B + 0*16], xmm0 + movdqu [rsp + _B + 1*16], xmm0 + movdqu [rsp + _B + 2*16], xmm0 + movdqu [rsp + _B + 3*16], xmm0 +%endif + FUNC_END + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; IMB_JOB *sm3_msg_submit_sse(IMB_JOB *) +align 32 
+MKGLOBAL(sm3_msg_submit_sse,function,internal) +sm3_msg_submit_sse: + push arg1 + + mov arg4, [arg1 + _msg_len_to_hash_in_bytes] + mov arg3, [arg1 + _src] + add arg3, [arg1 + _hash_start_src_offset] + mov arg2, [arg1 + _auth_tag_output_len_in_bytes] + mov arg1, [arg1 + _auth_tag_output] + call sm3_msg_sse + + pop rax + or dword [rax + _status], IMB_STATUS_COMPLETED_AUTH + ret + +mksection stack-noexec diff --git a/lib/sse_t1/sm3_base_one_block_sse.asm b/lib/sse_t1/sm3_base_one_block_sse.asm new file mode 100644 index 0000000000000000000000000000000000000000..4c1413e2d0124396a15ca3f0f66e642ab2f1593e --- /dev/null +++ b/lib/sse_t1/sm3_base_one_block_sse.asm @@ -0,0 +1,65 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +extern sm3_base_init +extern sm3_base_update + +%include "include/os.inc" +%include "include/reg_sizes.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%endif + + +mksection .text + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_one_block_sm3(void *tag, const void *msg) +align 32 +MKGLOBAL(sm3_one_block_sse,function,internal) +sm3_one_block_sse: + call sm3_base_init + mov DWORD(arg3), 1 + call sm3_base_update + ret + +mksection stack-noexec diff --git a/lib/sse_t1/sm3_base_update_sse.asm b/lib/sse_t1/sm3_base_update_sse.asm new file mode 100644 index 0000000000000000000000000000000000000000..9a32991a011c6b511d772b3f5e5529ec35e3d989 --- /dev/null +++ b/lib/sse_t1/sm3_base_update_sse.asm @@ -0,0 +1,429 @@ +;; +;; Copyright (c) 2023-2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash + +%include "include/os.inc" +%include "include/reg_sizes.inc" + +%ifdef LINUX + +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx + +%define t1 rax +%define t2 r8 +%define t3 r9 +%define t4 r10 +%define t5 r11 +%define t6 arg4 +%define t7 r12 +%define t8 r13 +%define t9 r14 +%define t10 r15 +%define t11 rbx +%define t12 rbp + +%else + +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 + +%define t1 rax +%define t2 rdi +%define t3 rsi +%define t4 r10 +%define t5 r11 +%define t6 arg4 +%define t7 r12 +%define t8 r13 +%define t9 r14 +%define t10 r15 +%define t11 rbx +%define t12 rbp + +%endif + +%define A DWORD(t1) +%define B DWORD(t2) +%define C DWORD(t3) +%define D DWORD(t4) +%define E DWORD(t5) +%define F DWORD(t6) +%define G DWORD(t7) +%define H DWORD(t8) + +;; SM3 stack frame +struc STACK +_W: resd 68 ; expanded message W[] +_TT2: resd 1 +%ifidn __OUTPUT_FORMAT__, win64 +_gpr_save: resq 8 ; space for 8 GPR's +%else +_gpr_save: resq 6 ; space for 6 GPR's +%endif +_rsp_save: resq 1 ; space for rsp pointer +endstruc + +mksection .rodata + +align 16 +K_const: + dd 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb, 0x9cc45197, 0x3988a32f, 0x7311465e, + dd 0xe6228cbc, 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce, 0xc451979c, 0x88a32f39, + dd 0x11465e73, 0x228cbce6, 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, + dd 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, + dd 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5, 0x7a879d8a, 0xf50f3b14, 0xea1e7629, + dd 0xd43cec53, 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d, 0x879d8a7a, 0x0f3b14f5, + dd 0x1e7629ea, 0x3cec53d4, 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43, 0x9d8a7a87, + dd 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, + dd 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, + dd 0x3d43cec5 + +align 16 +SHUFF_MASK: + db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 + +mksection .text + +;; ============================================================================= +;; FF0(x, y, z) = x ^ y ^ z +;; ============================================================================= +%macro FF0 3 +%define %%X %1 ;; [in/out] 
32-bit GPR +%define %%Y %2 ;; [in] 32-bit GPR +%define %%Z %3 ;; [in] 32-bit GPR + + xor %%X, %%Y + xor %%X, %%Z +%endmacro + +;; ============================================================================= +;; GG0(x, y, z) = x ^ y ^ z +;; ============================================================================= +%macro GG0 3 +%define %%X %1 ;; [in/out] 32-bit GPR +%define %%Y %2 ;; [in] 32-bit GPR +%define %%Z %3 ;; [in] 32-bit GPR + + xor %%X, %%Y + xor %%X, %%Z +%endmacro + +;; ============================================================================= +;; FF1(x, y, z) = (x & y) | ((x | y) & z) +;; ============================================================================= +%macro FF1 4 +%define %%X %1 ;; [in/out] 32-bit GPR +%define %%Y %2 ;; [in] 32-bit GPR +%define %%Z %3 ;; [in] 32-bit GPR +%define %%T %4 ;; [clobbered] temporary GPR + + mov %%T, %%X + and %%X, %%Y + or %%T, %%Y + and %%T, %%Z + or %%X, %%T +%endmacro + +;; ============================================================================= +;; GG1(x, y, z) = z ^ (x & (y ^ z)) +;; ============================================================================= +%macro GG1 4 +%define %%X %1 ;; [in/out] 32-bit GPR +%define %%Y %2 ;; [in] 32-bit GPR +%define %%Z %3 ;; [in] 32-bit GPR +%define %%T %4 ;; [clobbered] temporary GPR + + mov %%T, %%Z + xor %%T, %%Y + and %%X, %%T + xor %%X, %%Z +%endmacro + +;; ============================================================================= +;; P0(x) = x ^ ROL32(x, 9) ^ ROL32(x, 17) +;; ============================================================================= +%macro P0 3 +%define %%X %1 ;; [in/out] 32-bit GPR +%define %%T1 %2 ;; [clobbered] temporary GPR +%define %%T2 %3 ;; [clobbered] temporary GPR + + mov %%T1, %%X + mov %%T2, %%X + rol %%T1, 9 + rol %%T2, 17 + xor %%X, %%T1 + xor %%X, %%T2 +%endmacro + +;; ============================================================================= +;; P1(x) = x ^ ROL32(x, 15) ^ ROL32(x, 23) +;; ============================================================================= +%macro P1 3 +%define %%X %1 ;; [in/out] 32-bit GPR +%define %%T1 %2 ;; [clobbered] temporary GPR +%define %%T2 %3 ;; [clobbered] temporary GPR + + mov %%T1, %%X + mov %%T2, %%X + rol %%T1, 15 + rol %%T2, 23 + xor %%X, %%T1 + xor %%X, %%T2 +%endmacro + +;; ============================================================================= +;; Compress macro +;; SS1 = ROL32((ROL32(A, 12) + E + K[i]), 7); +;; SS2 = SS1 ^ ROL32(A, 12); +;; TT1 = (i < 16) ? FF0(A, B, C) + D + SS2 + (W[i] ^ W[i + 4]) : +;; FF1(A, B, C) + D + SS2 + (W[i] ^ W[i + 4]); +;; TT2 = (i < 16) ? 
GG0(E, F, G) + H + SS1 + W[i] : +;; GG1(E, F, G) + H + SS1 + W[i]; +;; +;; D = C; +;; C = ROL32(B, 9); +;; B = A; +;; A = TT1; +;; H = G; +;; G = ROL32(F, 19); +;; F = E; +;; E = P0(TT2); +;; +;; Updates registers A, B, C, D, E, F, G and H +;; ============================================================================= +%macro SM3_COMPRESS 5 +%define %%IDX %1 ;; [in] GPR with current index to W[] +%define %%I %2 ;; [in] immediate value: 0 -> 0 <= index < 16, 1 -> 16 <= index < 64 +%define %%T1 %3 ;; [clobbered] temporary 32-bit GPR +%define %%T2 %4 ;; [clobbered] temporary 32-bit GPR +%define %%T3 %5 ;; [clobbered] temporary 32-bit GPR + + ;; calculate SS1 and SS2 + mov %%T1, A + rol %%T1, 12 + mov %%T2, %%T1 ;; T1 = T2 = ROL32(A, 12) + + add %%T1, E + lea QWORD(%%T3), [rel K_const] + add %%T1, [QWORD(%%T3) + %%IDX*4] + ;; T1 = ROL32(A, 12) + E + K[i] + rol %%T1, 7 + ;; T1 = SS1 = ROL32(ROL32(A, 12) + E + K[i], 7) + xor %%T2, %%T1 + ;; T2 = SS2 = SS1 ^ ROL32(A, 12) + + ;; calculate TT1 and TT2 + add %%T1, [rsp + _W + %%IDX*4] ;; SS1 += W[i] + add %%T1, H ;; SS1 += H + mov [rsp + _TT2], %%T1 ;; TT2 = H + SS1 + W[i] + mov %%T1, E +%if %%I == 0 + GG0 %%T1, F, G +%else + GG1 %%T1, F, G, %%T3 +%endif + add [rsp + _TT2], %%T1 ;; TT2 += GGx(E, F, G) + + add %%T2, D ;; SS2 += D + mov %%T1, [rsp + _W + %%IDX*4] ;; T1 = W[i] + xor %%T1, [rsp + _W + %%IDX*4 + 4*4];; T1 ^= W[i + 4] + add %%T2, %%T1 ;; TT1 = D + SS2 + (W[i] ^ W[i + 4]) + mov %%T1, A +%if %%I == 0 + FF0 %%T1, B, C +%else + FF1 %%T1, B, C, %%T3 +%endif + add %%T2, %%T1 ;; TT1 += FFx(A, B, C) + ;; T2 = TT1 + + ;; update state registers + mov D, C ;; D = C + mov C, B + rol C, 9 ;; C = ROL32(B, 9) + mov B, A ;; B = A + mov A, %%T2 ;; A = TT1 + mov H, G ;; H = G + mov G, F + rol G, 19 ;; G = ROL32(F, 19) + mov F, E ;; F = E + mov E, [rsp + _TT2] + P0 E, %%T1, %%T2 ;; E = P0(TT2) +%endmacro + +;; ============================================================================= +;; Save registers on the stack and create stack frame +;; ============================================================================= + +%macro FUNC_START 0 + mov rax, rsp + sub rsp, STACK_size + and rsp, -16 + mov [rsp + _rsp_save], rax + mov [rsp + _gpr_save + 0*8], rbx + mov [rsp + _gpr_save + 1*8], rbp + mov [rsp + _gpr_save + 2*8], r12 + mov [rsp + _gpr_save + 3*8], r13 + mov [rsp + _gpr_save + 4*8], r14 + mov [rsp + _gpr_save + 5*8], r15 +%ifidn __OUTPUT_FORMAT__, win64 + mov [rsp + _gpr_save + 6*8], rdi + mov [rsp + _gpr_save + 7*8], rsi +%endif +%endmacro + +;; ============================================================================= +;; Restore registers from the stack +;; ============================================================================= + +%macro FUNC_END 0 + mov rbx, [rsp + _gpr_save + 0*8] + mov rbp, [rsp + _gpr_save + 1*8] + mov r12, [rsp + _gpr_save + 2*8] + mov r13, [rsp + _gpr_save + 3*8] + mov r14, [rsp + _gpr_save + 4*8] + mov r15, [rsp + _gpr_save + 5*8] +%ifidn __OUTPUT_FORMAT__, win64 + mov rdi, [rsp + _gpr_save + 6*8] + mov rsi, [rsp + _gpr_save + 7*8] +%endif + mov rsp, [rsp + _rsp_save] +%endmacro + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; void sm3_base_update(uint32_t digest[8], const void *input, uint64_t num_blocks) +align 32 +MKGLOBAL(sm3_base_update,function,internal) +sm3_base_update: + or arg3, arg3 + jz sm3_base_update_end + + FUNC_START + +sm3_base_loop: + ;; W[0..15]: load and shuffle 16 bytes of message + movdqu xmm0, [arg2 + 0*16] + movdqu xmm1, [arg2 + 1*16] + pshufb xmm0, 
[rel SHUFF_MASK] + pshufb xmm1, [rel SHUFF_MASK] + movdqu [rsp + _W + 0*16], xmm0 + movdqu [rsp + _W + 1*16], xmm1 + + movdqu xmm0, [arg2 + 2*16] + movdqu xmm1, [arg2 + 3*16] + pshufb xmm0, [rel SHUFF_MASK] + pshufb xmm1, [rel SHUFF_MASK] + movdqu [rsp + _W + 2*16], xmm0 + movdqu [rsp + _W + 3*16], xmm1 + + ;; W[16..67]: expand W[] + lea t9, [rsp + _W] + mov DWORD(t10), 16 + +align 32 +sm3_base_W_expand: + ;; W[i] = P1(W[i - 16] ^ W[i - 9] ^ ROL32(W[i - 3], 15)) ^ + ;; ROL32(W[i - 13], 7) ^ W[i - 6] + + mov DWORD(t1), [t9 + 13*4] ;; W[i - 3] + rol DWORD(t1), 15 + xor DWORD(t1), [t9 + 0*4] ;; W[i - 16] + xor DWORD(t1), [t9 + 7*4] ;; W[i - 9] + ;; t1 = W[i - 16] ^ W[i - 9] ^ ROL32(W[i - 3], 15) + P1 DWORD(t1), DWORD(t2), DWORD(t3) + ;; t1 = P1(W[i - 16] ^ W[i - 9] ^ ROL32(W[i - 3], 15)) + xor DWORD(t1), [t9 + 10*4] ;; W[i - 6] + mov DWORD(t2), [t9 + 3*4] ;; W[i - 13] + rol DWORD(t2), 7 + xor DWORD(t1), DWORD(t2) + mov [rsp + _W + t10*4], DWORD(t1) + add t9, 4 + inc t10 + cmp DWORD(t10), 68 + jne sm3_base_W_expand + + ;; read digest + mov A, [arg1 + 0*4] + mov B, [arg1 + 1*4] + mov C, [arg1 + 2*4] + mov D, [arg1 + 3*4] + mov E, [arg1 + 4*4] + mov F, [arg1 + 5*4] + mov G, [arg1 + 6*4] + mov H, [arg1 + 7*4] + + ;; compress + xor DWORD(t10), DWORD(t10) + +align 32 +sm3_base_compress_0_15: + SM3_COMPRESS t10, 0, DWORD(t9), DWORD(t11), DWORD(t12) + inc DWORD(t10) + cmp DWORD(t10), 16 + jne sm3_base_compress_0_15 + +align 32 +sm3_base_compress_16_63: + SM3_COMPRESS t10, 1, DWORD(t9), DWORD(t11), DWORD(t12) + inc DWORD(t10) + cmp DWORD(t10), 64 + jne sm3_base_compress_16_63 + + ;; update digest + xor [arg1 + 0*4], A + xor [arg1 + 1*4], B + xor [arg1 + 2*4], C + xor [arg1 + 3*4], D + xor [arg1 + 4*4], E + xor [arg1 + 5*4], F + xor [arg1 + 6*4], G + xor [arg1 + 7*4], H + + add arg2, 64 + dec arg3 + jnz sm3_base_loop + +%ifdef SAFE_DATA + pxor xmm0, xmm0 + pxor xmm1, xmm1 + + movdqu [rsp + _W + 0*4], xmm0 + movdqu [rsp + _W + 4*4], xmm0 + movdqu [rsp + _W + 8*4], xmm0 + movdqu [rsp + _W + 12*4], xmm0 +%endif + FUNC_END +sm3_base_update_end: + ret + +mksection stack-noexec diff --git a/lib/sse_t2/mb_mgr_sse_t2.c b/lib/sse_t2/mb_mgr_sse_t2.c index 54097df384a51e49c839cdffe15825d372230612..96e30d65f089e72813ad695e9445bc320e9afafd 100644 --- a/lib/sse_t2/mb_mgr_sse_t2.c +++ b/lib/sse_t2/mb_mgr_sse_t2.c @@ -257,6 +257,12 @@ flush_snow3g_uea2_job_sse(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + /* ====================================================================== */ static void diff --git a/lib/sse_t3/mb_mgr_sse_t3.c b/lib/sse_t3/mb_mgr_sse_t3.c index f333918531afb17693719a1ba122ca7336850db3..55fb57cb424e06ca9748a971ba6ce5af4f3c1882 100644 --- a/lib/sse_t3/mb_mgr_sse_t3.c +++ b/lib/sse_t3/mb_mgr_sse_t3.c @@ -258,6 +258,12 @@ flush_snow3g_uea2_job_sse(IMB_MGR *state) #define SM4_CBC_ENC sm4_cbc_enc_sse #define SM4_CBC_DEC sm4_cbc_dec_sse +/* SM3 */ +#define SUBMIT_JOB_SM3 sm3_msg_submit_sse +#define FLUSH_JOB_SM3 unused +#define SUBMIT_JOB_HMAC_SM3 sm3_hmac_submit_sse +#define FLUSH_JOB_HMAC_SM3 unused + /* ====================================================================== */ static void diff --git a/lib/win_x64.mak b/lib/win_x64.mak index 2828abb14b015272e87381c8433d8e7662a52d77..c1e9f55fca6267cee33a02f76027a5637f5beb80 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -224,7 +224,11 @@ 
lib_objs1 = \ $(OBJ_DIR)\save_xmms.obj \ $(OBJ_DIR)\mbcpuid.obj \ $(OBJ_DIR)\atomic.obj \ - $(OBJ_DIR)\sm3.obj \ + $(OBJ_DIR)\sm3_base_init_sse.obj \ + $(OBJ_DIR)\sm3_base_update_sse.obj \ + $(OBJ_DIR)\sm3_base_one_block_sse.obj \ + $(OBJ_DIR)\sm3_base_msg_sse.obj \ + $(OBJ_DIR)\sm3_base_hmac_sse.obj \ $(OBJ_DIR)\clear_regs_mem_fns.obj \ $(OBJ_DIR)\sha1_x4_avx.obj \ $(OBJ_DIR)\sha1_x4_sse.obj \ @@ -585,6 +589,16 @@ gcm_objs = \ $(OBJ_DIR)\gcm256_sgl_api_by8_sse.obj \ $(OBJ_DIR)\gcm256_gmac_api_by8_sse.obj +avx2_t4_objs = \ + $(OBJ_DIR)\mb_mgr_avx2_t4.obj \ + $(OBJ_DIR)\sm4_ni_avx2.obj \ + $(OBJ_DIR)\sm3_ni_x1_avx2.obj \ + $(OBJ_DIR)\sm3_msg_avx2.obj \ + $(OBJ_DIR)\sm3_hmac_avx2.obj \ + $(OBJ_DIR)\sha512_x1_ni_avx2.obj \ + $(OBJ_DIR)\sha_ni_avx2.obj \ + $(OBJ_DIR)\sha512_hmac_ni_avx2.obj + !if "$(AESNI_EMU)" == "y" all_objs = $(lib_objs1) $(lib_objs2) $(gcm_objs) $(no_aesni_objs) !else @@ -596,6 +610,11 @@ all_objs = $(all_objs) $(OBJ_DIR)\mb_mgr_avx2_t3.obj $(OBJ_DIR)\poly_fma_avx2.ob DCFLAGS = $(DCFLAGS) /DAVX_IFMA !endif +!if "$(SMX_NI)" == "y" +all_objs = $(all_objs) $(avx2_t4_objs) +DCFLAGS = $(DCFLAGS) /DSMX_NI +!endif + all: $(LIB_DIR)\$(LIBNAME) $(DEPALL) $(LIB_DIR)\$(LIBNAME): $(all_objs) $(LIBBASE)_lnk.def @@ -622,15 +641,32 @@ $(LIB_DIR)\$(LIBNAME): $(all_objs) $(LIBBASE)_lnk.def STR_FILTER = "" !if "$(AESNI_EMU)" != "y" !if "$(AVX_IFMA)" != "y" +!if "$(SMX_NI)" != "y" +STR_FILTER = "_no_aesni _avx2_t3 _avx2_t4" +!else # SMX_NI = y STR_FILTER = "_no_aesni _avx2_t3" -!else -STR_FILTER = "_no_aesni" !endif -!else +!else # AVX_IFMA = y +!if "$(SMX_NI)" != "y" +STR_FILTER = "_no_aesni _avx2_t4" +!else # SMX_NI = y +STR_FILTER = "_no_aesni" +!endif # SMX_NI +!endif # AVX_IFMA + +!else # AESNI_EMU = y !if "$(AVX_IFMA)" != "y" +!if "$(SMX_NI)" != "y" +STR_FILTER = "_avx2_t3 _avx2_t4" +!else # SMX_NI = y STR_FILTER = "_avx2_t3" !endif -!endif +!else # AVX_IFMA = y +!if "$(SMX_NI)" != "y" +STR_FILTER = "_avx2_t4" +!endif # SMX_NI +!endif # AVX_IFMA +!endif # AESNI_EMU $(all_objs): $(OBJ_DIR) $(LIB_DIR) @@ -707,6 +743,13 @@ $(DEPALL): $(all_objs) {avx2_t3\}.asm{$(OBJ_DIR)}.obj: $(AS) -MD $@.dep -o $@ $(AFLAGS) $< +{avx2_t4\}.c{$(OBJ_DIR)}.obj: + $(CC) /arch:AVX /Fo$@ /c $(CFLAGS) $< + $(DEPTOOL) $< $@ "$(DEPFLAGS)" > $@.dep + +{avx2_t4\}.asm{$(OBJ_DIR)}.obj: + $(AS) -MD $@.dep -o $@ $(AFLAGS) $< + {avx512_t1\}.c{$(OBJ_DIR)}.obj: $(CC) /arch:AVX /Fo$@ /c $(CFLAGS) $< $(DEPTOOL) $< $@ "$(DEPFLAGS)" > $@.dep diff --git a/lib/x86_64/cpu_feature.c b/lib/x86_64/cpu_feature.c index 8b5f4f7dd886388b6013bb896efcd4a45f2dc890..3de9acbf5c86909e42ee3d2e0e6aaf6301ff00e6 100644 --- a/lib/x86_64/cpu_feature.c +++ b/lib/x86_64/cpu_feature.c @@ -188,6 +188,39 @@ detect_hybrid(void) return (cpuid_7_0.edx & (1UL << 15)); } +static uint32_t +detect_sha512ni(void) +{ +#ifdef SMX_NI + /* Check presence of SHA512NI - bit 0 of EAX */ + return (cpuid_7_1.eax & (1UL << 0)); +#else + return 0; +#endif +} + +static uint32_t +detect_sm3ni(void) +{ +#ifdef SMX_NI + /* Check presence of SM3NI - bit 1 of EAX */ + return (cpuid_7_1.eax & (1UL << 1)); +#else + return 0; +#endif +} + +static uint32_t +detect_sm4ni(void) +{ +#ifdef SMX_NI + /* Check presence of SM3NI - bit 2 of EAX */ + return (cpuid_7_1.eax & (1UL << 2)); +#else + return 0; +#endif +} + uint64_t cpu_feature_detect(void) { @@ -195,27 +228,28 @@ cpu_feature_detect(void) unsigned req_leaf_number; uint64_t feat; uint32_t (*detect_fn)(void); - } feat_tab[] = { - { 7, IMB_FEATURE_SHANI, detect_shani }, - { 1, IMB_FEATURE_AESNI, detect_aesni }, - { 1, 
IMB_FEATURE_PCLMULQDQ, detect_pclmulqdq }, - { 1, IMB_FEATURE_CMOV, detect_cmov }, - { 1, IMB_FEATURE_SSE4_2, detect_sse42 }, - { 1, IMB_FEATURE_AVX, detect_avx }, - { 7, IMB_FEATURE_AVX2, detect_avx2 }, - { 7, IMB_FEATURE_AVX512F, detect_avx512f }, - { 7, IMB_FEATURE_AVX512DQ, detect_avx512dq }, - { 7, IMB_FEATURE_AVX512CD, detect_avx512cd }, - { 7, IMB_FEATURE_AVX512BW, detect_avx512bw }, - { 7, IMB_FEATURE_AVX512VL, detect_avx512vl }, - { 7, IMB_FEATURE_VAES, detect_vaes }, - { 7, IMB_FEATURE_VPCLMULQDQ, detect_vpclmulqdq }, - { 7, IMB_FEATURE_GFNI, detect_gfni }, - { 7, IMB_FEATURE_AVX512_IFMA, detect_avx512_ifma }, - { 7, IMB_FEATURE_BMI2, detect_bmi2 }, - { 7, IMB_FEATURE_AVX_IFMA, detect_avx_ifma }, - { 7, IMB_FEATURE_HYBRID, detect_hybrid }, - }; + } feat_tab[] = { { 7, IMB_FEATURE_SHANI, detect_shani }, + { 1, IMB_FEATURE_AESNI, detect_aesni }, + { 1, IMB_FEATURE_PCLMULQDQ, detect_pclmulqdq }, + { 1, IMB_FEATURE_CMOV, detect_cmov }, + { 1, IMB_FEATURE_SSE4_2, detect_sse42 }, + { 1, IMB_FEATURE_AVX, detect_avx }, + { 7, IMB_FEATURE_AVX2, detect_avx2 }, + { 7, IMB_FEATURE_AVX512F, detect_avx512f }, + { 7, IMB_FEATURE_AVX512DQ, detect_avx512dq }, + { 7, IMB_FEATURE_AVX512CD, detect_avx512cd }, + { 7, IMB_FEATURE_AVX512BW, detect_avx512bw }, + { 7, IMB_FEATURE_AVX512VL, detect_avx512vl }, + { 7, IMB_FEATURE_VAES, detect_vaes }, + { 7, IMB_FEATURE_VPCLMULQDQ, detect_vpclmulqdq }, + { 7, IMB_FEATURE_GFNI, detect_gfni }, + { 7, IMB_FEATURE_AVX512_IFMA, detect_avx512_ifma }, + { 7, IMB_FEATURE_BMI2, detect_bmi2 }, + { 7, IMB_FEATURE_AVX_IFMA, detect_avx_ifma }, + { 7, IMB_FEATURE_HYBRID, detect_hybrid }, + { 7, IMB_FEATURE_SM3NI, detect_sm3ni }, + { 7, IMB_FEATURE_SM4NI, detect_sm4ni }, + { 7, IMB_FEATURE_SHA512NI, detect_sha512ni } }; struct cpuid_regs r; unsigned hi_leaf_number = 0; uint64_t features = 0; diff --git a/lib/x86_64/hmac_ipad_opad.c b/lib/x86_64/hmac_ipad_opad.c index 4df54d0dcb72c8fd4c4ff374e7d000e8ea17924e..f92e1ae45043974e87c0113a9fca2e03abc51c39 100644 --- a/lib/x86_64/hmac_ipad_opad.c +++ b/lib/x86_64/hmac_ipad_opad.c @@ -32,8 +32,8 @@ #include #include "include/error.h" -#include "include/sm3.h" #include "include/memcpy.h" +#include "include/arch_sse_type1.h" /* sm3_one_block_sse(), sm3_msg_sse() */ IMB_DLL_EXPORT void @@ -119,7 +119,7 @@ imb_hmac_ipad_opad(IMB_MGR *mb_mgr, const IMB_HASH_ALG sha_type, const void *pke IMB_SHA384(mb_mgr, pkey, key_len, key); break; case IMB_AUTH_HMAC_SM3: - sm3_msg(key, IMB_SM3_DIGEST_SIZE, pkey, key_len); + sm3_msg_sse(key, IMB_SM3_DIGEST_SIZE, pkey, key_len); break; default: /* For SHA-512 */ IMB_SHA512(mb_mgr, pkey, key_len, key); @@ -147,7 +147,7 @@ imb_hmac_ipad_opad(IMB_MGR *mb_mgr, const IMB_HASH_ALG sha_type, const void *pke IMB_SHA512_ONE_BLOCK(mb_mgr, buf, ipad_hash); break; case IMB_AUTH_HMAC_SM3: - sm3_one_block(ipad_hash, buf); + sm3_one_block_sse(ipad_hash, buf); break; default: /* For MD5*/ IMB_MD5_ONE_BLOCK(mb_mgr, buf, ipad_hash); @@ -176,7 +176,7 @@ imb_hmac_ipad_opad(IMB_MGR *mb_mgr, const IMB_HASH_ALG sha_type, const void *pke IMB_SHA512_ONE_BLOCK(mb_mgr, buf, opad_hash); break; case IMB_AUTH_HMAC_SM3: - sm3_one_block(opad_hash, buf); + sm3_one_block_sse(opad_hash, buf); break; default: /* For MD5 */ IMB_MD5_ONE_BLOCK(mb_mgr, buf, opad_hash); diff --git a/lib/x86_64/sm3.c b/lib/x86_64/sm3.c deleted file mode 100644 index 21324e5d2773dfab973208aae82cbf42bf0eb029..0000000000000000000000000000000000000000 --- a/lib/x86_64/sm3.c +++ /dev/null @@ -1,323 +0,0 @@ 
-/******************************************************************************* - Copyright (c) 2023, Intel Corporation - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, - this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of Intel Corporation nor the names of its contributors - may be used to endorse or promote products derived from this software - without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#include "ipsec-mb.h" -#include -#include "include/error.h" -#include "include/sm3.h" -#include "include/clear_regs_mem.h" - -/* https://datatracker.ietf.org/doc/html/draft-shen-sm3-hash */ - -#ifdef LINUX -#define BSWAP32 __builtin_bswap32 -#define BSWAP64 __builtin_bswap64 -#else -#define BSWAP32 _byteswap_ulong -#define BSWAP64 _byteswap_uint64 -#endif - -/** - * @note \a outp needs to be of volatile type to avoid the operation being - * optimized out in some scenarios - */ -__forceinline void -store8_be(volatile void *outp, const uint64_t val) -{ - *((volatile uint64_t *) outp) = BSWAP64(val); -} - -__forceinline uint32_t -XOR3(const uint32_t x, const uint32_t y, const uint32_t z) -{ - return x ^ y ^ z; -} - -__forceinline uint32_t -FF0(const uint32_t x, const uint32_t y, const uint32_t z) -{ - return XOR3(x, y, z); -} - -__forceinline uint32_t -GG0(const uint32_t x, const uint32_t y, const uint32_t z) -{ - return XOR3(x, y, z); -} - -__forceinline uint32_t -FF1(const uint32_t x, const uint32_t y, const uint32_t z) -{ - return (x & y) | ((x | y) & z); -} - -__forceinline uint32_t -GG1(const uint32_t x, const uint32_t y, const uint32_t z) -{ - return z ^ (x & (y ^ z)); -} - -__forceinline uint32_t -ROL32(const uint32_t a, const unsigned b) -{ - return (a << b) | (a >> (32 - b)); -} - -__forceinline uint32_t -P0(const uint32_t x) -{ - return x ^ ROL32(x, 9) ^ ROL32(x, 17); -} - -__forceinline uint32_t -P1(const uint32_t x) -{ - return x ^ ROL32(x, 15) ^ ROL32(x, 23); -} - -static const uint32_t K[64] = { - 0x79cc4519, 0xf3988a32, 0xe7311465, 0xce6228cb, 0x9cc45197, 0x3988a32f, 0x7311465e, - 0xe6228cbc, 0xcc451979, 0x988a32f3, 0x311465e7, 0x6228cbce, 0xc451979c, 0x88a32f39, - 0x11465e73, 0x228cbce6, 0x9d8a7a87, 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, - 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, - 
0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, 0x3d43cec5, 0x7a879d8a, 0xf50f3b14, 0xea1e7629, - 0xd43cec53, 0xa879d8a7, 0x50f3b14f, 0xa1e7629e, 0x43cec53d, 0x879d8a7a, 0x0f3b14f5, - 0x1e7629ea, 0x3cec53d4, 0x79d8a7a8, 0xf3b14f50, 0xe7629ea1, 0xcec53d43, 0x9d8a7a87, - 0x3b14f50f, 0x7629ea1e, 0xec53d43c, 0xd8a7a879, 0xb14f50f3, 0x629ea1e7, 0xc53d43ce, - 0x8a7a879d, 0x14f50f3b, 0x29ea1e76, 0x53d43cec, 0xa7a879d8, 0x4f50f3b1, 0x9ea1e762, - 0x3d43cec5 -}; - -static void -sm3_init(uint32_t digest[8]) -{ - digest[0] = 0x7380166f; - digest[1] = 0x4914b2b9; - digest[2] = 0x172442d7; - digest[3] = 0xda8a0600; - digest[4] = 0xa96f30bc; - digest[5] = 0x163138aa; - digest[6] = 0xe38dee4d; - digest[7] = 0xb0fb0e4e; -} - -static void -sm3_update(uint32_t digest[8], const void *input, uint64_t num_blocks) -{ - const uint32_t *data = (const uint32_t *) input; - volatile uint32_t W[68]; - - while (num_blocks--) { - /* prepare W[] - read data first */ - for (int i = 0; i < 16; i++) - W[i] = BSWAP32(data[i]); - - /* expand W[] */ - for (int i = 16; i < 68; i++) - W[i] = P1(W[i - 16] ^ W[i - 9] ^ ROL32(W[i - 3], 15)) ^ - ROL32(W[i - 13], 7) ^ W[i - 6]; - - /* read current digest */ - register uint32_t A = digest[0]; - register uint32_t B = digest[1]; - register uint32_t C = digest[2]; - register uint32_t D = digest[3]; - register uint32_t E = digest[4]; - register uint32_t F = digest[5]; - register uint32_t G = digest[6]; - register uint32_t H = digest[7]; - - /* compress */ - for (int i = 0; i < 16; i++) { - const uint32_t SS1 = ROL32((ROL32(A, 12) + E + K[i]), 7); - const uint32_t SS2 = SS1 ^ ROL32(A, 12); - const uint32_t TT1 = FF0(A, B, C) + D + SS2 + (W[i] ^ W[i + 4]); - const uint32_t TT2 = GG0(E, F, G) + H + SS1 + W[i]; - - D = C; - C = ROL32(B, 9); - B = A; - A = TT1; - H = G; - G = ROL32(F, 19); - F = E; - E = P0(TT2); - } - - for (int i = 16; i < 64; i++) { - const uint32_t SS1 = ROL32((ROL32(A, 12) + E + K[i]), 7); - const uint32_t SS2 = SS1 ^ ROL32(A, 12); - const uint32_t TT1 = FF1(A, B, C) + D + SS2 + (W[i] ^ W[i + 4]); - const uint32_t TT2 = GG1(E, F, G) + H + SS1 + W[i]; - - D = C; - C = ROL32(B, 9); - B = A; - A = TT1; - H = G; - G = ROL32(F, 19); - F = E; - E = P0(TT2); - } - - /* update digest and move data pointer */ - digest[0] ^= A; - digest[1] ^= B; - digest[2] ^= C; - digest[3] ^= D; - digest[4] ^= E; - digest[5] ^= F; - digest[6] ^= G; - digest[7] ^= H; - - data += (IMB_SM3_BLOCK_SIZE / sizeof(uint32_t)); - } - -#ifdef SAFE_DATA - force_memset_zero_vol(W, sizeof(W)); -#endif -} - -void -sm3_msg(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length) -{ - uint32_t digest[8]; - uint8_t block[IMB_SM3_BLOCK_SIZE]; - - sm3_init(digest); - sm3_update(digest, msg, msg_length / IMB_SM3_BLOCK_SIZE); - - const uint64_t partial_bytes = msg_length % IMB_SM3_BLOCK_SIZE; - const uint8_t *trail = &((const uint8_t *) msg)[msg_length - partial_bytes]; - - memset(block, 0, sizeof(block)); - memcpy(block, trail, partial_bytes); - block[partial_bytes] = 0x80; - - if (partial_bytes >= (IMB_SM3_BLOCK_SIZE - 8)) { - /* - * length field doesn't fit into this block - * - compute digest on the current block - * - clear the block for the length to be put into it next - */ - sm3_update(digest, block, 1); - memset(block, 0, sizeof(block)); - } - - store8_be(&block[IMB_SM3_BLOCK_SIZE - 8], msg_length * 8 /* bit length */); - - sm3_update(digest, block, 1); - - for (unsigned i = 0; i < IMB_DIM(digest); i++) - digest[i] = BSWAP32(digest[i]); - - memcpy(tag, digest, tag_length); - -#ifdef SAFE_DATA - 
clear_scratch_xmms_sse(); - clear_mem(block, sizeof(block)); -#endif -} - -void -sm3_one_block(void *tag, const void *msg) -{ - uint32_t digest[8]; - - sm3_init(digest); - sm3_update(digest, msg, 1); - - memcpy(tag, digest, IMB_SM3_DIGEST_SIZE); - -#ifdef SAFE_DATA - clear_mem(digest, sizeof(digest)); - clear_scratch_xmms_sse(); -#endif -} - -void -sm3_hmac_msg(void *tag, const uint64_t tag_length, const void *msg, const uint64_t msg_length, - const void *ipad, const void *opad) -{ - uint32_t digest[8]; - uint8_t block[IMB_SM3_BLOCK_SIZE]; - uint32_t *block32 = (uint32_t *) block; - - /* Initialize internal digest with IPAD */ - memcpy(digest, ipad, IMB_SM3_DIGEST_SIZE); - - /* Digest full blocks */ - sm3_update(digest, msg, msg_length / IMB_SM3_BLOCK_SIZE); - - const uint64_t partial_bytes = msg_length % IMB_SM3_BLOCK_SIZE; - const uint8_t *trail = &((const uint8_t *) msg)[msg_length - partial_bytes]; - - /* Prepare last one or two blocks (depending on size of last partial block) */ - memset(block, 0, sizeof(block)); - memcpy(block, trail, partial_bytes); - block[partial_bytes] = 0x80; - - if (partial_bytes >= (IMB_SM3_BLOCK_SIZE - 8)) { - /* - * length field doesn't fit into this block - * - compute digest on the current block - * - clear the block for the length to be put into it next - */ - sm3_update(digest, block, 1); - memset(block, 0, sizeof(block)); - } - - /* Store message length plus block size (from IPAD) at the end of the block */ - store8_be(&block[IMB_SM3_BLOCK_SIZE - 8], - (IMB_SM3_BLOCK_SIZE + msg_length) * 8 /* bit length */); - - sm3_update(digest, block, 1); - - memset(block, 0, sizeof(block)); - for (unsigned i = 0; i < IMB_DIM(digest); i++) - block32[i] = BSWAP32(digest[i]); - - block[IMB_SM3_DIGEST_SIZE] = 0x80; - /* Store length of inner hash plus block size (from OPAD) at the end of the block */ - store8_be(&block[IMB_SM3_BLOCK_SIZE - 8], - (IMB_SM3_BLOCK_SIZE + IMB_SM3_DIGEST_SIZE) * 8 /* bit length */); - - /* Initialize internal digest with OPAD */ - memcpy(digest, opad, IMB_SM3_DIGEST_SIZE); - - sm3_update(digest, block, 1); - - for (unsigned i = 0; i < IMB_DIM(digest); i++) - digest[i] = BSWAP32(digest[i]); - - memcpy(tag, digest, tag_length); - -#ifdef SAFE_DATA - clear_scratch_xmms_sse(); - clear_mem(block, sizeof(block)); -#endif -} diff --git a/lib/x86_64/zuc_common.asm b/lib/x86_64/zuc_common.asm index e811ae683edae2da7f99846fc54123d1f17d73fd..dc6249d29a5775f4117b8aff8f152dc4a509aa44 100644 --- a/lib/x86_64/zuc_common.asm +++ b/lib/x86_64/zuc_common.asm @@ -235,9 +235,10 @@ mksection .text shr rbx, 31 add rax, rbx - mov rbx, rax - sub rbx, 0x7FFFFFFF - cmovns rax, rbx + mov rbx, rax + and rax, 0x7FFFFFFF + shr rbx, 31 + add rax, rbx ; LFSR_S16 = (LFSR_S15++) = eax mov [rsi + (( 0 + %1) % 16)*4], eax diff --git a/test/kat-app/CMakeLists.txt b/test/kat-app/CMakeLists.txt index 9b6766bc836e486e19adadd8fed13a53104d1e39..a86e6d345fa0e813304e46bbd18c8c9a8b806520 100644 --- a/test/kat-app/CMakeLists.txt +++ b/test/kat-app/CMakeLists.txt @@ -119,16 +119,13 @@ else() endif() if("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "aarch64") - add_test(NAME KAT-AARCH64 COMMAND ${TEST_APP} ${ARCH_AARCH64} WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) add_test(NAME KAT-SVE256 COMMAND ${TEST_APP} ${ARCH_SVE256} WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) - else() - add_test(NAME KAT-SSE-T1 COMMAND ${TEST_APP} ${ARCH_SSE} --shani-off WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) @@ -160,7 +157,10 @@ else() # SDE tests if(SDE) add_test(NAME KAT-AVX2-T3 - COMMAND ${SDE} -cmt -- ./${TEST_APP} ${ARCH_AVX2} 
+ COMMAND ${SDE} -srf -- ./${TEST_APP} ${ARCH_AVX2} + WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) + add_test(NAME KAT-AVX2-T4 + COMMAND ${SDE} -arl -- ./${TEST_APP} ${ARCH_AVX2} WORKING_DIRECTORY ${TEST_APP_BIN_DIR}) endif() diff --git a/test/kat-app/main.c b/test/kat-app/main.c index 3ba5cf229fdc1a1f1f3e0435d24eca424526052e..a8ab2452635864f56212e32f647e561a4664c79c 100644 --- a/test/kat-app/main.c +++ b/test/kat-app/main.c @@ -264,8 +264,9 @@ print_hw_features(void) { IMB_FEATURE_VAES, "VAES" }, { IMB_FEATURE_VPCLMULQDQ, "VPCLMULQDQ" }, { IMB_FEATURE_GFNI, "GFNI" }, { IMB_FEATURE_AVX512_IFMA, "AVX512-IFMA" }, { IMB_FEATURE_AVX_IFMA, "AVX-IFMA" }, { IMB_FEATURE_BMI2, "BMI2" }, - { IMB_FEATURE_HYBRID, "HYBRID-CORE" }, { IMB_FEATURE_AARCH64, "AARCH64" }, { IMB_FEATURE_SVE256, "SVE256" }, + { IMB_FEATURE_HYBRID, "HYBRID-CORE" }, { IMB_FEATURE_SM3NI, "SM3NI" }, + { IMB_FEATURE_SM4NI, "SM4NI" }, { IMB_FEATURE_SHA512NI, "SHA512NI" } }; IMB_MGR *p_mgr = NULL; unsigned i; diff --git a/test/kat-app/zuc_eea3_128.json.c b/test/kat-app/zuc_eea3_128.json.c index f148c43b8d0c2bb38b941c8cae3ebf74da046bac..a45c46624ac20400127fc90b56a0008fae8db4ff 100644 --- a/test/kat-app/zuc_eea3_128.json.c +++ b/test/kat-app/zuc_eea3_128.json.c @@ -180,5 +180,17 @@ const struct cipher_test zuc_eea3_128_test_json[] = { { 128, 128, 9, "\x4d\x32\x0b\xfa\xd4\xc2\x85\xbf\xd6\xb8\xbd\x00\xf3\x9d\x8b\x41", "\x52\x95\x9d\xab\xa0\xbf\x17\x6e\xce\x2d\xc3\x15\x04\x9e\xb5\x74", "\x00\x00\x00\x00\x00\x00\x00\x00", "\xed\x44\x00\xe7\x06\x33\xe5\xc5", 1, 64 }, + /* + * Extra vector to test corner case of LFSR update + */ + { 48, 128, 10, "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f", + "\x4b\x72\x50\xcf\x00\x00", + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" + "\x00\x00\x00\x00\x00", + "\xF5\x55\x33\x65\x01\x31\x2E\xD7\x72\x08\xC8\xFC\x30\xB5\xA4\x4A" + "\x7d\x09\x7d\x6e\x74\x4e\x10\x40\x07\x5f\x47\x85\x12\x69\x99\xd5" + "\x6e\xb2\x3b\x97\x2a", + 1, 37 * 8 }, { 0, 0, 0, NULL, NULL, NULL, NULL, 0, 0 } }; diff --git a/test/xvalid-app/ipsec_xvalid.c b/test/xvalid-app/ipsec_xvalid.c index 5909a0368425262adbd3f5901d83ce7d29d12f99..f03c62cd00667ca1afe21700d5fe4369949e5415 100644 --- a/test/xvalid-app/ipsec_xvalid.c +++ b/test/xvalid-app/ipsec_xvalid.c @@ -100,9 +100,9 @@ static int pattern_auth_key; static int pattern_cipher_key; static int pattern_plain_text; -static uint64_t pattern8_auth_key; -static uint64_t pattern8_cipher_key; -static uint64_t pattern8_plain_text; +uint64_t pattern8_auth_key; +uint64_t pattern8_cipher_key; +uint64_t pattern8_plain_text; #define MAX_OOO_MGR_SIZE 8192 @@ -702,29 +702,47 @@ clear_data(struct data *data) imb_clear_mem(&data->dec_keys, sizeof(struct cipher_auth_keys)); } -/** Generate random fill patterns */ +/** + * Generate fill patterns + * - make sure each patterns are different + * - do not return zero pattern + * - make sure it takes as long as possible before pattern is re-used again + */ +static int +get_pattern_seed(void) +{ + static int pattern_seed = 0; + + if (pattern_seed == 0) + pattern_seed = (pattern_seed + 1) & 255; + + const int ret_seed = pattern_seed; + + pattern_seed = (pattern_seed + 1) & 255; + return ret_seed; +} + static void generate_patterns(void) { - /* randomize fill values - make sure they are unique and non-zero */ - do { - pattern_auth_key = rand() & 255; - pattern_cipher_key = rand() & 255; - pattern_plain_text = rand() & 255; - } while (pattern_auth_key == 
pattern_cipher_key || pattern_auth_key == pattern_plain_text || - pattern_cipher_key == pattern_plain_text || pattern_auth_key == 0 || - pattern_cipher_key == 0 || pattern_plain_text == 0); + pattern_auth_key = get_pattern_seed(); + pattern_cipher_key = get_pattern_seed(); + pattern_plain_text = get_pattern_seed(); nosimd_memset(&pattern8_auth_key, pattern_auth_key, sizeof(pattern8_auth_key)); nosimd_memset(&pattern8_cipher_key, pattern_cipher_key, sizeof(pattern8_cipher_key)); nosimd_memset(&pattern8_plain_text, pattern_plain_text, sizeof(pattern8_plain_text)); +} +static void +print_patterns(void) +{ printf(">>> Patterns: AUTH_KEY = 0x%02x, CIPHER_KEY = 0x%02x, " "PLAIN_TEXT = 0x%02x\n", pattern_auth_key, pattern_cipher_key, pattern_plain_text); } -/* +/** * @brief Searches across a block of memory if a pattern is present * (indicating there is some left over sensitive data) * @@ -735,17 +753,12 @@ generate_patterns(void) * @retval FOUND_TEXT fragment of TEXT found */ static int -search_patterns_ex(const void *ptr, const size_t mem_size, size_t *offset) +search_patterns(const void *ptr, const size_t mem_size, size_t *offset) { const uint8_t *ptr8 = (const uint8_t *) ptr; const size_t limit = mem_size - sizeof(uint64_t); - if (mem_size < sizeof(uint64_t) || offset == NULL) - return 0; - - *offset = 0; - - for (size_t i = 0; i <= limit; i++) { + for (size_t i = *offset; i <= limit; i++) { const uint64_t string = *((const uint64_t *) &ptr8[i]); if (string == pattern8_cipher_key) { @@ -767,6 +780,211 @@ search_patterns_ex(const void *ptr, const size_t mem_size, size_t *offset) return 0; } +#ifndef __aarch64__ +/** + * @brief Tests memory pattern search function for specific buffer size + * + * @param [in] cb_size size of the test buffer + * @param [in] pattern byte pattern to be used in the test + * + * @return Test status + * @retval 0 OK + * @retval -1 Test case 1 failed + * @retval -2 Test case 2 failed + * @retval -3 Test case 3 failed + * @retval -100 Buffer allocation error + */ +static int +mem_search_avx2_test_case(const size_t cb_size, const int pattern) +{ + uint8_t *cb = malloc(cb_size); + int ret = 0; + + if (cb == NULL) + return -100; + + size_t i = 0; + + /* test 1: pattern shrinks from start to the end */ + for (i = 0; i < cb_size; i++) { + const size_t current_sz = cb_size - i; + uint8_t *p = &cb[i]; + + if (i != 0) + nosimd_memset(cb, 0, i); + nosimd_memset(p, pattern, current_sz); + + const uint64_t r1 = mem_search_avx2(cb, cb_size); + + if (current_sz >= sizeof(uint64_t) && r1 == 0ULL) { + ret = -1; + break; + } + + const uint64_t r2 = mem_search_avx2(p, current_sz); + + if (current_sz >= sizeof(uint64_t) && r2 == 0ULL) { + ret = -1; + break; + } + } + + /* test 2: pattern grows from end to start */ + for (i = 0; (ret == 0) && (i < cb_size); i++) { + const size_t current_sz = cb_size - i; + uint8_t *p = &cb[current_sz]; + + nosimd_memset(cb, 0, current_sz); + if (i != 0) + nosimd_memset(p, pattern, i); + + const uint64_t r1 = mem_search_avx2(cb, cb_size); + + if (i >= sizeof(uint64_t) && r1 == 0ULL) { + ret = -2; + break; + } + + const uint64_t r2 = mem_search_avx2(p, i); + + if (i >= sizeof(uint64_t) && r2 == 0ULL) { + ret = -2; + break; + } + } + + /* test 3: moving and growing pattern */ + for (i = 0; (ret == 0) && (i < cb_size); i++) { + const size_t current_sz = cb_size - i; + uint8_t *p = &cb[i]; + + for (size_t j = 1; (ret == 0) && (j < current_sz); j++) { + if ((i + j) > cb_size) + break; + + nosimd_memset(cb, 0, cb_size); + nosimd_memset(p, pattern, j); + + const 
uint64_t r1 = mem_search_avx2(cb, cb_size); + + if (j >= sizeof(uint64_t) && r1 == 0ULL) { + ret = -3; + break; + } + + const uint64_t r2 = mem_search_avx2(p, current_sz); + + if (j >= sizeof(uint64_t) && r2 == 0ULL) { + ret = -3; + break; + } + } + } + + free(cb); + return ret; +} + +/* + * @brief Tests memory pattern search function for range of memory buffer sizes + * + * @return Test status + * @retval 0 OK + * @retval -1 Test case 1 failed + * @retval -2 Test case 2 failed + * @retval -3 Test case 3 failed + * @retval -4 Negative test case 4 failed + * @retval -100 Buffer allocation error + */ +static int +mem_search_avx2_test(void) +{ + const int pattern_tab[3] = { pattern_cipher_key, pattern_auth_key, pattern_plain_text }; + int ret = 0; + + /* positive tests */ + for (size_t i = 8; (ret == 0) && (i <= 128); i++) + for (size_t n = 0; (ret == 0) && (n < IMB_DIM(pattern_tab)); n++) + ret = mem_search_avx2_test_case(i, pattern_tab[n]); + + /* negative test */ + if (ret == 0) { + int negative_pattern = 0; + + for (negative_pattern = 1; negative_pattern < 256; negative_pattern++) { + size_t n = 0; + + for (n = 0; n < IMB_DIM(pattern_tab); n++) + if (negative_pattern == pattern_tab[n]) + break; + + /* there was no match against existing patterns */ + if (n >= IMB_DIM(pattern_tab)) + break; + } + + if (mem_search_avx2_test_case(128, negative_pattern) == 0) + ret = -4; + } + + return ret; +} +#endif /* __aarch64__ */ + +/** + * @brief Searches across a block of memory if a pattern is present + * (indicating there is some left over sensitive data) + * + * @return search status + * @retval 0 nothing found + * @retval FOUND_CIPHER_KEY fragment of CIPHER_KEY found + * @retval FOUND_AUTH_KEY fragment of AUTH_KEY found + * @retval FOUND_TEXT fragment of TEXT found + */ +static int +search_patterns_ex(const void *ptr, const size_t mem_size, size_t *offset) +{ +#ifndef __aarch64__ + static uint32_t avx2_check = UINT32_MAX; +#endif /* __aarch64__ */ + + if (mem_size < sizeof(uint64_t) || offset == NULL) + return 0; + + if (ptr == NULL) + return 0; + + *offset = 0; + +#ifndef __aarch64__ + if (avx2_check == UINT32_MAX) { + /* Check presence of AVX2 - bit 5 of EBX, leaf 7, subleaf 0 */ + struct misc_cpuid_regs r = { 0 }; + + misc_cpuid(7, 0, &r); + avx2_check = r.ebx & (1UL << 5); + + /* run test of mem_search_avx2() function */ + if (avx2_check && (mem_search_avx2_test() != 0)) { + printf("ERROR: test_mem_search_avx2() test failed!\n"); + avx2_check = 0; + } + } + + if (avx2_check) + if (mem_search_avx2(ptr, mem_size) == 0ULL) + return 0; +#endif /* __aarch64__ */ + + /* + * If AVX2 fast search reports a problem then run the slow check + * - also run slow check if AVX2 not available + */ + const size_t limit = mem_size - sizeof(uint64_t); + + return search_patterns(ptr, limit, offset); +} + struct safe_check_ctx { int key_exp_phase; @@ -1350,6 +1568,8 @@ prepare_keys(IMB_MGR *mb_mgr, struct cipher_auth_keys *keys, const uint8_t *ciph #ifndef __aarch64__ case IMB_AUTH_AES_XCBC: nosimd_memset(k1_expanded, pattern_auth_key, sizeof(keys->k1_expanded)); + nosimd_memset(k2, pattern_auth_key, sizeof(keys->k2)); + nosimd_memset(k3, pattern_auth_key, sizeof(keys->k3)); break; case IMB_AUTH_AES_CMAC: case IMB_AUTH_AES_CMAC_BITLEN: @@ -2116,9 +2336,8 @@ print_fail_context(IMB_MGR *enc_mb_mgr, const IMB_ARCH enc_arch, IMB_MGR *dec_mb print_algo_info(params); printf("\nEncrypting "); print_tested_arch(enc_mb_mgr->features, enc_arch); - printf("\nDecrypting "); + printf("Decrypting "); 
print_tested_arch(dec_mb_mgr->features, dec_arch); - printf("\n"); /* * Print buffer size info if the failure was caused by an actual job, * where "idx" indicates the index of the job failing @@ -2414,8 +2633,14 @@ exit: num_jobs, i, job_ctx_tab, NULL); } else if (ret == -2) { if (p_safe_check != NULL) { - p_safe_check->job_idx = i; - p_safe_check->job_size = job_ctx_tab[i].buf_size; + /* + * Only set job info if the error is coming from an actual job, + * and not something else like key expansion + */ + if (i < num_jobs) { + p_safe_check->job_idx = i; + p_safe_check->job_size = job_ctx_tab[i].buf_size; + } } } @@ -2519,8 +2744,8 @@ test_single(IMB_MGR *enc_mgr, const IMB_ARCH enc_arch, IMB_MGR *dec_mgr, const I if (result2 == -2 && compare_match(&safe_ctx1, &safe_ctx2) == 0) { - if (verbose) - printf("FAIL\n"); + printf("FAIL\n"); + print_patterns(); print_fail_context(enc_mgr, enc_arch, dec_mgr, dec_arch, params, variant_data, 0, 1, 0, NULL, &safe_ctx2); @@ -2579,8 +2804,7 @@ process_variant(IMB_MGR *enc_mgr, const IMB_ARCH enc_arch, IMB_MGR *dec_mgr, for (j = 0; j < IMIX_ITER; j++) { if (do_test(enc_mgr, enc_arch, dec_mgr, dec_arch, params, variant_data, 0, 1, i) < 0) { - if (verbose) - printf("FAIL\n"); + printf("FAIL\n"); exit(EXIT_FAILURE); } } @@ -3286,5 +3510,7 @@ main(int argc, char *argv[]) run_tests(safe_check); + fprintf(stdout, "All tests passed\n"); + return EXIT_SUCCESS; } diff --git a/test/xvalid-app/misc.asm b/test/xvalid-app/misc.asm index eb0358a4bddab96b5ef2a44dfa3834f5b16bc32c..fd0e50524c094e981e380c27f7e3edc6034d1dc7 100644 --- a/test/xvalid-app/misc.asm +++ b/test/xvalid-app/misc.asm @@ -27,6 +27,7 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%use smartalign %ifdef LINUX ;;; macro to declare global symbols @@ -66,6 +67,22 @@ %define arg4d r9d %endif +;; External symbols +extern pattern8_cipher_key +extern pattern8_auth_key +extern pattern8_plain_text + +;; Data section +section .data +default rel + +align 16 + db 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +shiftr: + db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + db 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff + + section .bss default rel @@ -79,6 +96,291 @@ simd_regs: resb 32*64 section .text +;; ymm0 [in] pattern 1 +;; ymm1 [in] pattern 2 +;; ymm2 [in] pattern 3 +;; xmm11 [in] data block (old) +;; xmm12 [in] data block (new) +;; ymm8 [in/out] - mask for pattern 1 matches +;; ymm9 [in/out] - mask for pattern 2 matches +;; ymm10 [in/out] - mask for pattern 3 matches +;; clobbers ymm3-ymm6 +align 32 +mem_search_helper_avx: + vpalignr xmm3, xmm12, xmm11, 9 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 10 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 11 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 12 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 13 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor 
ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 14 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr xmm3, xmm12, xmm11, 15 + vpcmpeqq xmm4, xmm0, xmm3 + vpcmpeqq xmm5, xmm1, xmm3 + vpcmpeqq xmm6, xmm2, xmm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpcmpeqq xmm4, xmm0, xmm12 + vpcmpeqq xmm5, xmm1, xmm12 + vpcmpeqq xmm6, xmm2, xmm12 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + ret + +;; Loads 0 to 8 bytes (arg2) from arg1 location +;; arg1 [in] current data pointer +;; arg2 [in] number of bytes to load +;; r15 [clobbered] temporary register +;; xmm5 [out] read data block (1 to 7 bytes) +;; xmm6 [clobbered] temporary read +align 32 +mem_search_load_0_to_8_bytes: + ;; read the rest of the bytes in the buffer + ;; - read 8 from the end and remove overlapping bytes + ;; - it is safe to do this read because message length is + ;; guaranteed to be >= 8 bytes + lea r15, [arg1 + arg2] + vmovq xmm5, [r15 - 8] + + lea r15, [shiftr] + sub r15, arg2 + vmovdqu xmm6, [r15] + vpshufb xmm5, xmm5, xmm6 + ret + +;; uint64_t mem_search_avx2(const void *mem, const size_t size) +MKGLOBAL(mem_search_avx2,function,) +align 32 +mem_search_avx2: + push r12 + push r13 + push r14 + push r15 + +%ifdef WIN_ABI + sub rsp, 7 * 16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm7 + vmovdqu [rsp + 2*16], xmm8 + vmovdqu [rsp + 3*16], xmm9 + vmovdqu [rsp + 4*16], xmm10 + vmovdqu [rsp + 5*16], xmm11 + vmovdqu [rsp + 6*16], xmm12 +%endif + ;; clear result registers first; this is to return 0 if length is < 8 + vpxor ymm8, ymm8, ymm8 + vpxor ymm9, ymm9, ymm9 + vpxor ymm10, ymm10, ymm10 + + ;; quick length check + cmp arg2, 8 + jb .exit + + ;; prepare data for the main loop + vpxor xmm11, xmm11, xmm11 ;; clear the data block (old) + vpxor xmm12, xmm12, xmm12 ;; clear the data block (new) + + vpbroadcastq ymm0, [pattern8_cipher_key] + vpbroadcastq ymm1, [pattern8_auth_key] + vpbroadcastq ymm2, [pattern8_plain_text] + + cmp arg2, 32 + 8 + jb .loop16 + +align 32 +.loop32: + vmovdqu ymm11, [arg1] + + vextracti128 xmm12, ymm11, 1 + vmovq xmm3, [arg1 + 32] + vinserti128 ymm12, xmm3, 1 + + vpcmpeqq ymm4, ymm0, ymm11 + vpcmpeqq ymm5, ymm1, ymm11 + vpcmpeqq ymm6, ymm2, ymm11 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 1 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 2 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 3 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 4 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 5 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 6 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq 
ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + vpalignr ymm3, ymm12, ymm11, 7 + vpcmpeqq ymm4, ymm0, ymm3 + vpcmpeqq ymm5, ymm1, ymm3 + vpcmpeqq ymm6, ymm2, ymm3 + vpor ymm8, ymm8, ymm4 + vpor ymm9, ymm9, ymm5 + vpor ymm10, ymm10, ymm6 + + add arg1, 32 + sub arg2, 32 + cmp arg2, 32 + 8 + jae .loop32 + + vmovdqu xmm11, [arg1 - 16] + +.loop16: + cmp arg2, 16 + jb .process_below_16bytes + vmovdqu xmm12, [arg1] + call mem_search_helper_avx + vmovdqa xmm11, xmm12 + add arg1, 16 + sub arg2, 16 + jmp .loop16 + +.process_below_16bytes: + or arg2, arg2 + jz .exit + + cmp arg2, 8 + jb .process_below_8bytes + + ;; load 8 bytes + vmovq xmm4, [arg1] + add arg1, 8 + sub arg2, 8 + ;; xmm4 = MSB [ ZERO 64-bit | full 64-bit data block ] LSB + jz .run_final_check + ;; load bytes 9 to 15 + call mem_search_load_0_to_8_bytes + vpunpcklqdq xmm4, xmm4, xmm5 + ;; xmm4 = MSB [ partial 64-bit data block | full 64-bit data block ] LSB + jmp .run_final_check + +.process_below_8bytes: + call mem_search_load_0_to_8_bytes + vmovdqa xmm4, xmm5 + ;; xmm4 = MSB [ ZERO 64-bits | partial 64-bit data block ] LSB + ;; fall through to run the final check + +.run_final_check: + vmovdqa xmm12, xmm4 + call mem_search_helper_avx + +.exit: + ;; fold the result masks to get the return status + vpmovmskb eax, ymm8 + vpmovmskb r12d, ymm9 + vpmovmskb r13d, ymm10 + or eax, r12d + or eax, r13d + + ;; clear the patterns + vpxor xmm0, xmm0, xmm0 + vpxor xmm1, xmm1, xmm1 + vpxor xmm2, xmm2, xmm2 + + vzeroupper + + ;; rax == 0 OK + ;; rax != 0 match found (RAX = address to start precise scalar check) +%ifdef WIN_ABI + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm7, [rsp + 1*16] + vmovdqu xmm8, [rsp + 2*16] + vmovdqu xmm9, [rsp + 3*16] + vmovdqu xmm10, [rsp + 4*16] + vmovdqu xmm11, [rsp + 5*16] + vmovdqu xmm12, [rsp + 6*16] + add rsp, 7 * 16 +%endif + pop r15 + pop r14 + pop r13 + pop r12 + ret + ;; uint32_t avx_sse_transition_check(void) MKGLOBAL(avx_sse_transition_check,function,) align 16 diff --git a/test/xvalid-app/misc.h b/test/xvalid-app/misc.h index 760e2a5e71466a80f7f4c49be92b4f74d9455add..2456c10bfbb05946f57bdde17fc00616e3c43108 100644 --- a/test/xvalid-app/misc.h +++ b/test/xvalid-app/misc.h @@ -83,6 +83,12 @@ nosimd_memcpy(void *dst, const void *src, size_t n); uint32_t avx_sse_transition_check(void); +/* + * Quick search for patterns in memory (AVX2) + */ +uint64_t +mem_search_avx2(const void *ptr, const size_t size); + #define MISC_AVX_SSE_YMM0_15_ISSUE (1 << 2) #define MISC_AVX_SSE_ZMM0_15_ISSUE (1 << 6) #define MISC_AVX_SSE_ISSUE (MISC_AVX_SSE_YMM0_15_ISSUE | MISC_AVX_SSE_ZMM0_15_ISSUE)
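The misc.h declaration above exposes mem_search_avx2(), the AVX2 fast path that search_patterns_ex() in ipsec_xvalid.c now tries before falling back to the precise scalar search_patterns(). Below is a minimal C sketch of the reference behaviour both paths are expected to agree on, assuming the same style of 8-byte fill patterns as the pattern8_* globals; mem_search_ref() and its patterns[] parameter are illustrative names only and are not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Scalar reference for the leftover-data scan: returns non-zero if any of
 * the three 8-byte fill patterns occurs at any byte offset of the buffer.
 * This mirrors what search_patterns() checks and what mem_search_avx2()
 * accelerates.
 */
static int
mem_search_ref(const void *ptr, const size_t mem_size, const uint64_t patterns[3])
{
        const uint8_t *ptr8 = (const uint8_t *) ptr;

        if (ptr == NULL || mem_size < sizeof(uint64_t))
                return 0;

        for (size_t i = 0; i <= mem_size - sizeof(uint64_t); i++) {
                uint64_t v;

                /* byte-wise copy keeps the unaligned read portable */
                memcpy(&v, &ptr8[i], sizeof(v));

                if (v == patterns[0] || v == patterns[1] || v == patterns[2])
                        return 1; /* possible sensitive data left over */
        }

        return 0; /* buffer looks clean */
}

The new mem_search_avx2_test() cases in ipsec_xvalid.c exercise exactly this kind of agreement for buffer sizes from 8 to 128 bytes, with the pattern shrunk, grown and moved across every offset of the test buffer.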
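The zuc_common.asm change replaces the conditional subtract with a second carry fold when reducing the LFSR feedback sum modulo 2^31 - 1 (the corner case covered by the extra vector added to zuc_eea3_128.json.c). The replacement and/shr/add sequence corresponds to the fold sketched below; the input range guaranteed by the surrounding assembly is not reproduced here, so this only illustrates why adding the bits above bit 30 back into the low 31 bits performs the reduction.

#include <stdint.h>

/*
 * One fold step for arithmetic modulo p = 2^31 - 1.
 * Writing x = hi * 2^31 + lo and using 2^31 = p + 1 gives x = hi + lo (mod p),
 * which is what the new "and rax, 0x7FFFFFFF / shr rbx, 31 / add rax, rbx"
 * sequence computes on the 64-bit running sum.
 */
static uint64_t
fold_mod_p31(const uint64_t x)
{
        return (x & 0x7FFFFFFFULL) + (x >> 31);
}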