From a06a7153702d203a4c6a992633f0e3f01ebb9d4a Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Tue, 3 Dec 2024 17:01:18 +0000 Subject: [PATCH 01/30] cmake: update CET check check if both compiler and linker support CET before adding flags --- CMakeLists.txt | 5 +++-- cmake/utils.cmake | 12 +++++++++++- lib/cmake/unix.cmake | 2 +- perf/cmake/unix.cmake | 2 +- test/cmake/unix.cmake | 2 +- 5 files changed, 17 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f9ab0776..1547747a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +cmake_minimum_required(VERSION 3.18) +cmake_policy(VERSION 3.18) + include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/utils.cmake") # get version from public header file @@ -31,8 +34,6 @@ set(IMB_HDR "${CMAKE_CURRENT_SOURCE_DIR}/lib/ipsec-mb.h") imb_get_version(${IMB_HDR}) message(STATUS "Project Version: ${IPSEC_MB_VERSION_FULL}") -cmake_minimum_required(VERSION 3.16) - # set default project values imb_set_proj_defaults() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15a32455..28a54c0d 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -24,6 +24,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. include(CheckCCompilerFlag) +include(CheckLinkerFlag) # extract library version from header file macro(imb_get_version IMB_HDR_FILE) @@ -133,7 +134,16 @@ macro(imb_compiler_check) (CMAKE_C_COMPILER_VERSION VERSION_LESS 5.0)) message(FATAL_ERROR "GNU C Compiler version must be 5.0 or higher") endif() - check_c_compiler_flag("-fcf-protection" CC_HAS_CET) + + # enable CET if supported by both compiler and linker + check_c_compiler_flag("-fcf-protection=full" CC_CET_CHECK) + check_linker_flag("C" "-z ibt" LD_IBT_CHECK) + if(CC_CET_CHECK AND LD_IBT_CHECK) + set(CET_SUPPORT YES) + else() + set(CET_SUPPORT NO) + endif() + message(STATUS "CET SUPPORT... 
${CET_SUPPORT}") endmacro() # add uninstall target diff --git a/lib/cmake/unix.cmake b/lib/cmake/unix.cmake index 3cdf3dbe..fd93288a 100644 --- a/lib/cmake/unix.cmake +++ b/lib/cmake/unix.cmake @@ -64,7 +64,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif() diff --git a/perf/cmake/unix.cmake b/perf/cmake/unix.cmake index 9c9d897e..4ce2e632 100644 --- a/perf/cmake/unix.cmake +++ b/perf/cmake/unix.cmake @@ -61,7 +61,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif() diff --git a/test/cmake/unix.cmake b/test/cmake/unix.cmake index c8949fc3..02f17675 100644 --- a/test/cmake/unix.cmake +++ b/test/cmake/unix.cmake @@ -62,7 +62,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif() -- GitLab From 0d1864e36dac1e6ab0839f6c69a63de3aba826eb Mon Sep 17 00:00:00 2001 From: Marcel Cornu Date: Tue, 3 Dec 2024 16:13:53 +0000 Subject: [PATCH 02/30] workflows: add freebsd cmake builds --- .github/workflows/freebsd.yml | 50 +++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index 56119607..aa106f34 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -46,3 +46,53 @@ jobs: run: | freebsd-version gmake CC=gcc -j 4 + + # CMake release build with Clang + release-cmake-clang: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + repository: 'intel/intel-ipsec-mb' + + - name: Clang Release Build + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + mem: 8192 + prepare: pkg install -y curl nasm gmake cmake + run: | + echo ">>> CMAKE CONFIGURE" + cmake -B ./build -DCMAKE_BUILD_TYPE=Release + echo ">>> CMAKE BUILD" + cd ./build + cmake --build . --config Release -j4 -v + ctest -j 5 -C Release + echo ">>> CMAKE INSTALL" + cmake --install . + + # CMake release build with GCC + release-cmake-gcc: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + repository: 'intel/intel-ipsec-mb' + + - name: Release build with GCC + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + mem: 8192 + prepare: pkg install -y curl nasm gmake cmake gcc + run: | + echo ">>> CMAKE CONFIGURE" + cmake -B ./build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc + echo ">>> CMAKE BUILD" + cd ./build + cmake --build . 
--config Release -j4 -v
+          ctest -j 5 -C Release
-- 
GitLab


From 9a54fd4d0457aad61931086bca4eeecc994a2a5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20K=C5=82oczko?=
Date: Wed, 15 Nov 2023 08:43:42 +0000
Subject: [PATCH 03/30] fix man pages install path on unix platform
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On Unix platforms the man pages install path is $(prefix)/share/man

Signed-off-by: Tomasz Kłoczko
---
 lib/cmake/unix.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/cmake/unix.cmake b/lib/cmake/unix.cmake
index fd93288a..6f118743 100644
--- a/lib/cmake/unix.cmake
+++ b/lib/cmake/unix.cmake
@@ -128,7 +128,7 @@ if(NOT INCLUDE_INSTALL_DIR)
   set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include")
 endif()
 if(NOT MAN_INSTALL_DIR)
-  set(MAN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/man/man7")
+  set(MAN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/man/man7")
 endif()
 
 message(STATUS "LIB_INSTALL_DIR... ${LIB_INSTALL_DIR}")
-- 
GitLab


From 60af211caacea53c3387763db4efa14860f7fb04 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tomasz=20K=C5=82oczko?=
Date: Wed, 15 Nov 2023 08:45:16 +0000
Subject: [PATCH 04/30] add cmake project() LANGUAGES and HOMEPAGE_URL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

By default, when LANGUAGES is not specified, cmake also enables a C++
compiler check, but there are no C++ files in the source tree. Change
LANGUAGES to C and ASM to omit checking for a C++ compiler.
Added HOMEPAGE_URL as well.

Signed-off-by: Tomasz Kłoczko
---
 CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 1547747a..521a1f10 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,8 +37,13 @@ message(STATUS "Project Version: ${IPSEC_MB_VERSION_FULL}")
 # set default project values
 imb_set_proj_defaults()
 
-project(ipsec-mb VERSION ${IPSEC_MB_VERSION}
-        DESCRIPTION "IPsec Multi-Buffer library")
+project(
+  ipsec-mb
+  VERSION ${IPSEC_MB_VERSION}
+  DESCRIPTION "IPsec Multi-Buffer library"
+  LANGUAGES C ASM
+  HOMEPAGE_URL https://git.gitlab.arm.com/arm-reference-solutions/ipsec-mb/
+)
 
 # add testing support
 include(CTest)
-- 
GitLab


From abc0eace3af3aa4d81c25059ff608b44e466bd59 Mon Sep 17 00:00:00 2001
From: Marcel Cornu
Date: Thu, 14 Dec 2023 12:06:24 +0000
Subject: [PATCH 05/30] lib: [cmake] use GNUInstallDirs module to set install
 path

Signed-off-by: Marcel Cornu
---
 lib/cmake/unix.cmake | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/lib/cmake/unix.cmake b/lib/cmake/unix.cmake
index 6f118743..27ce99b1 100644
--- a/lib/cmake/unix.cmake
+++ b/lib/cmake/unix.cmake
@@ -26,6 +26,7 @@
 # ##############################################################################
 # IPSec_MB library CMake Unix config
 # ##############################################################################
+include(GNUInstallDirs)
 
 set(LIB IPSec_MB) # 'lib' prefix assumed on Linux
 
@@ -122,13 +123,13 @@ if(NOT CMAKE_INSTALL_PREFIX)
       CACHE STRING "Set default installation directory" FORCE)
 endif()
 if(NOT LIB_INSTALL_DIR)
-  set(LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib")
+  set(LIB_INSTALL_DIR "${CMAKE_INSTALL_FULL_LIBDIR}")
 endif()
 if(NOT INCLUDE_INSTALL_DIR)
-  set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include")
+  set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_FULL_INCLUDEDIR}")
 endif()
 if(NOT MAN_INSTALL_DIR)
-  set(MAN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/man/man7")
+  set(MAN_INSTALL_DIR "${CMAKE_INSTALL_FULL_MANDIR}/man7")
 endif()
 
 message(STATUS "LIB_INSTALL_DIR... ${LIB_INSTALL_DIR}")
-- 
GitLab
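
For reference on the GNUInstallDirs change above: the module computes
distribution-appropriate install subdirectories, so the hard-coded
"lib"/"include" paths stop breaking on multilib (lib64) and Debian-style
layouts. A minimal sketch of what the module provides (the resolved values
in the comments are typical examples only and vary by platform):

    include(GNUInstallDirs)
    # Relative directories, e.g. "lib", "lib64" or "lib/x86_64-linux-gnu":
    #   CMAKE_INSTALL_LIBDIR, CMAKE_INSTALL_INCLUDEDIR, CMAKE_INSTALL_MANDIR
    # Absolute variants prepend CMAKE_INSTALL_PREFIX to the relative ones:
    message(STATUS "libs -> ${CMAKE_INSTALL_FULL_LIBDIR}")      # e.g. /usr/local/lib
    message(STATUS "man7 -> ${CMAKE_INSTALL_FULL_MANDIR}/man7") # e.g. /usr/local/share/man/man7
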
"LIB_INSTALL_DIR... ${LIB_INSTALL_DIR}") -- GitLab From 609d39aefa08503fe4dadebb4d299d768c5c824c Mon Sep 17 00:00:00 2001 From: Sankaranarayanan Venkatasubramanian Date: Fri, 12 Jan 2024 15:34:07 +0530 Subject: [PATCH 06/30] Remove printf in lib code that causes linker issue in environments like SGX Signed-off-by: Sankaranarayanan Venkatasubramanian --- lib/include/kasumi_internal.h | 1 - lib/include/snow3g_common.h | 3 --- lib/include/zuc_internal.h | 11 ----------- 3 files changed, 15 deletions(-) diff --git a/lib/include/kasumi_internal.h b/lib/include/kasumi_internal.h index 7b84526b..4e76bc11 100644 --- a/lib/include/kasumi_internal.h +++ b/lib/include/kasumi_internal.h @@ -1334,7 +1334,6 @@ kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[], if (bufCount > 16) { pOut[0] = NULL; - printf("dataCount too high (%u)\n", (unsigned) bufCount); return; } diff --git a/lib/include/snow3g_common.h b/lib/include/snow3g_common.h index a58a5ef9..8dbfd17e 100644 --- a/lib/include/snow3g_common.h +++ b/lib/include/snow3g_common.h @@ -35,7 +35,6 @@ #ifndef SNOW3G_COMMON_H #define SNOW3G_COMMON_H -#include /* printf() */ #include /* memset(), memcpy() */ #include @@ -3103,7 +3102,6 @@ SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, const void *const IV[], if (packetCount > NUM_PACKETS_16) { pBufferOut[0] = NULL; - printf("packetCount too high (%u)\n", (unsigned) packetCount); return; } @@ -3267,7 +3265,6 @@ SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t *const pCtx[], const voi if (packetCount > NUM_PACKETS_16) { pBufferOut[0] = NULL; - printf("packetCount too high (%u)\n", (unsigned) packetCount); return; } diff --git a/lib/include/zuc_internal.h b/lib/include/zuc_internal.h index e93062a5..9c962453 100644 --- a/lib/include/zuc_internal.h +++ b/lib/include/zuc_internal.h @@ -61,17 +61,6 @@ #define ZUC_MIN_BYTELEN 1 #define ZUC_MAX_BYTELEN (ZUC_MAX_BITLEN / 8) -#ifdef DEBUG -#ifdef _WIN32 -#define DEBUG_PRINT(_fmt, ...) \ - fprintf(stderr, "%s()::%d " _fmt, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define DEBUG_PRINT(_fmt, ...) fprintf(stderr, "%s()::%d " _fmt, __func__, __LINE__, __VA_ARGS__) -#endif -#else -#define DEBUG_PRINT(_fmt, ...) 
From d5b232e03f3ccf15d8a91b6637aec956e7283126 Mon Sep 17 00:00:00 2001
From: Pablo de Lara
Date: Fri, 12 Jan 2024 10:02:41 +0000
Subject: [PATCH 07/30] lib: [GCM] remove unneeded store

Signed-off-by: Pablo de Lara
---
 lib/include/gcm_common_avx2_avx512.inc | 1 -
 lib/include/gcm_sse.inc                | 1 -
 2 files changed, 2 deletions(-)

diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc
index 0c39e1b3..ab51bf21 100644
--- a/lib/include/gcm_common_avx2_avx512.inc
+++ b/lib/include/gcm_common_avx2_avx512.inc
@@ -2793,7 +2793,6 @@ align 32
         je      %%_partial_done
 
         GHASH_MUL2 xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
-        vmovdqu [%%GDATA_CTX + AadHash], xmm14
 
 %%_partial_done:
 
diff --git a/lib/include/gcm_sse.inc b/lib/include/gcm_sse.inc
index f1b58ac4..7e60c9e3 100644
--- a/lib/include/gcm_sse.inc
+++ b/lib/include/gcm_sse.inc
@@ -2015,7 +2015,6 @@
         je      %%_partial_done
 
         GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block
-        movdqu  [%%GDATA_CTX + AadHash], xmm14
 
 %%_partial_done:
 
-- 
GitLab


From f6d86429faa14258d0686b26e13243e3d11e9d7c Mon Sep 17 00:00:00 2001
From: Tomasz Kantecki
Date: Thu, 11 Jan 2024 10:24:49 +0000
Subject: [PATCH 08/30] vaes-avx512: [ghash] add internal GHASH compute
 function

- move GHASH implementations into a separate module
---
 lib/Makefile                             |   1 +
 lib/avx512_t2/ghash_api_vaes_avx512.asm  | 208 +++++++++++++++++++++++
 lib/include/gcm_gmac_api_vaes_avx512.inc | 152 -----------------
 lib/win_x64.mak                          |   1 +
 4 files changed, 210 insertions(+), 152 deletions(-)
 create mode 100644 lib/avx512_t2/ghash_api_vaes_avx512.asm

diff --git a/lib/Makefile b/lib/Makefile
index eecb22fb..779c67fa 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -837,6 +837,7 @@ asm_avx2_gcm_objs := \
 asm_avx512_gcm_objs := \
 	aes128_gcm_api_vaes_avx512.o aes192_gcm_api_vaes_avx512.o aes256_gcm_api_vaes_avx512.o \
 	aes128_gcm_sgl_api_vaes_avx512.o aes192_gcm_sgl_api_vaes_avx512.o aes256_gcm_sgl_api_vaes_avx512.o \
+	ghash_api_vaes_avx512.o \
 	aes128_gmac_api_vaes_avx512.o aes192_gmac_api_vaes_avx512.o aes256_gmac_api_vaes_avx512.o \
 	aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o
 
diff --git a/lib/avx512_t2/ghash_api_vaes_avx512.asm b/lib/avx512_t2/ghash_api_vaes_avx512.asm
new file mode 100644
index 00000000..e3952888
--- /dev/null
+++ b/lib/avx512_t2/ghash_api_vaes_avx512.asm
@@ -0,0 +1,208 @@
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Copyright(c) 2024, Intel Corporation All rights reserved.
+;
+; Redistribution and use in source and binary forms, with or without
+; modification, are permitted provided that the following conditions
+; are met:
+;     * Redistributions of source code must retain the above copyright
+;       notice, this list of conditions and the following disclaimer.
+;     * Redistributions in binary form must reproduce the above copyright
+;       notice, this list of conditions and the following disclaimer in
+;       the documentation and/or other materials provided with the
+;       distribution.
+;     * Neither the name of Intel Corporation nor the names of its
+;       contributors may be used to endorse or promote products derived
+;       from this software without specific prior written permission.
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE +%include "include/gcm_vaes_avx512.inc" + +%include "include/error.inc" +%include "include/clear_regs.inc" + +mksection .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_pre_vaes_avx512 +; (const void *key, struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_pre_vaes_avx512,function,) +ghash_pre_vaes_avx512: + endbranch64 +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key != NULL + cmp arg1, 0 + jz error_ghash_pre + + ;; Check key_data != NULL + cmp arg2, 0 + jz error_ghash_pre +%endif + + FUNC_SAVE small_frame + + vmovdqu xmm6, [arg1] + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg2 + HashKey_1], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7, xmm8 +%ifdef SAFE_DATA + clear_zmms_avx512 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 +%endif + FUNC_RESTORE +exit_ghash_pre: + + ret + +%ifdef SAFE_PARAM +error_ghash_pre: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_ghash_pre +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; ghash_internal_vaes_avx512() +; arg2 [in] message pointer +; arg3 [in] message length +; xmm0 [in/out] ghash value +; arg1 [in] pointer to key structure +; clobbers: zmm1-zmm19, r10-r12, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_internal_vaes_avx512,function,internal) +ghash_internal_vaes_avx512: + CALC_AAD_HASH arg2, arg3, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ + zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ + zmm15, zmm16, zmm17, zmm18, zmm19, r10, r11, r12, k1 + ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text + ;; **zmm13, zmm15, zmm18 and zmm8 may contain hash key + ret + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_vaes_avx512 +; const struct gcm_key_data *key_data, +; const void *in, +; const u64 in_len, +; void *io_tag, +; const u64 tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_vaes_avx512,function,) +ghash_vaes_avx512: + endbranch64 + FUNC_SAVE small_frame + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + cmp arg1, 0 + jz error_ghash + + ;; Check in != NULL + cmp arg2, 0 + jz error_ghash + + ;; Check in_len != 0 + cmp arg3, 0 + jz error_ghash + + ;; Check tag != NULL + cmp arg4, 0 + jz error_ghash + + ;; Check tag_len != 0 + cmp arg5, 0 + jz error_ghash +%endif + + ;; copy tag to xmm0 + vmovdqu xmm0, [arg4] + vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + ;; arg1 [in] pointer to key structure + ;; arg2 [in] message pointer + ;; arg3 [in] message length + ;; xmm0 [in/out] ghash value + call ghash_internal_vaes_avx512 + + vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + simd_store_avx arg4, xmm0, arg5, r12, rax +%ifdef SAFE_DATA + clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm13, xmm15, xmm8, xmm18 +%endif +exit_ghash: + FUNC_RESTORE + ret + +%ifdef SAFE_PARAM +error_ghash: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check in != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC + + ;; Check in_len != 0 + IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN + + ;; Check tag != NULL + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH + + ;; Check tag_len != 0 + IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_ghash +%endif + +mksection stack-noexec diff --git a/lib/include/gcm_gmac_api_vaes_avx512.inc b/lib/include/gcm_gmac_api_vaes_avx512.inc index b82e86da..ab7c657f 100644 --- a/lib/include/gcm_gmac_api_vaes_avx512.inc +++ b/lib/include/gcm_gmac_api_vaes_avx512.inc @@ -37,158 +37,6 @@ mksection .text default rel -%ifdef GCM128_MODE -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_pre_vaes_avx512 -; (const void *key, struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(ghash_pre_vaes_avx512,function,) -ghash_pre_vaes_avx512: - endbranch64 -;; Parameter is passed through register -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key != NULL - cmp arg1, 0 - jz error_ghash_pre - - ;; Check key_data != NULL - cmp arg2, 0 - jz error_ghash_pre -%endif - - FUNC_SAVE small_frame - - vmovdqu xmm6, [arg1] - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg2 + HashKey_1], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm7, xmm8 -%ifdef SAFE_DATA - clear_zmms_avx512 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8 -%endif - FUNC_RESTORE 
-exit_ghash_pre: - - ret - -%ifdef SAFE_PARAM -error_ghash_pre: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_ghash_pre -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_vaes_avx512 -; const struct gcm_key_data *key_data, -; const void *in, -; const u64 in_len, -; void *io_tag, -; const u64 tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(ghash_vaes_avx512,function,) -ghash_vaes_avx512: - endbranch64 - FUNC_SAVE small_frame - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - cmp arg1, 0 - jz error_ghash - - ;; Check in != NULL - cmp arg2, 0 - jz error_ghash - - ;; Check in_len != 0 - cmp arg3, 0 - jz error_ghash - - ;; Check tag != NULL - cmp arg4, 0 - jz error_ghash - - ;; Check tag_len != 0 - cmp arg5, 0 - jz error_ghash -%endif - - ;; copy tag to xmm0 - vmovdqu xmm0, [arg4] - vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - - CALC_AAD_HASH arg2, arg3, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ - zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ - zmm15, zmm16, zmm17, zmm18, zmm19, r10, r11, r12, k1 - ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text - ;; **zmm13, zmm15, zmm18 and zmm8 may contain authentication key - - vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - simd_store_avx arg4, xmm0, arg5, r12, rax -%ifdef SAFE_DATA - clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm13, xmm15, xmm8, xmm18 -%endif -exit_ghash: - FUNC_RESTORE - ret - -%ifdef SAFE_PARAM -error_ghash: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check in != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC - - ;; Check in_len != 0 - IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN - - ;; Check tag != NULL - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH - - ;; Check tag_len != 0 - IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_ghash -%endif - -%endif ;; GCM128_MODE - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void imb_aes_gmac_update_128_vaes_avx512 / ; imb_aes_gmac_update_192_vaes_avx512 / diff --git a/lib/win_x64.mak b/lib/win_x64.mak index e226dbdf..04020865 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -564,6 +564,7 @@ gcm_objs = \ $(OBJ_DIR)\aes128_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes128_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes128_gcm_sgl_api_vaes_avx512.obj \ + $(OBJ_DIR)\ghash_api_vaes_avx512.obj \ $(OBJ_DIR)\aes128_gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes192_gcm_vaes_avx2.obj \ -- GitLab From f06fed532dc98b8ba7f509854bc0116f8e14574a Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 13:41:54 +0000 Subject: [PATCH 09/30] vaes-avx512: [gmac] use internal GHASH compute function in AES-GMAC update functions --- lib/include/gcm_gmac_api_vaes_avx512.inc | 41 ++++++++++++++++-------- 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/lib/include/gcm_gmac_api_vaes_avx512.inc b/lib/include/gcm_gmac_api_vaes_avx512.inc index ab7c657f..4629415f 100644 --- 
a/lib/include/gcm_gmac_api_vaes_avx512.inc +++ b/lib/include/gcm_gmac_api_vaes_avx512.inc @@ -34,6 +34,8 @@ %ifndef GCM_GMAC_API_VAES_AVX512_INC %define GCM_GMAC_API_VAES_AVX512_INC +extern ghash_internal_vaes_avx512 + mksection .text default rel @@ -78,28 +80,41 @@ GMAC_FN_NAME(update): ;; Deal with previous partial block xor r11, r11 - vmovdqu64 xmm8, [arg2 + AadHash] + vmovdqu64 xmm0, [arg2 + AadHash] - PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm8, r10, r12, rax, \ - zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, k1 + PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm0, r10, r12, rax, \ + zmm8, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, k1 %ifdef SAFE_DATA - clear_zmms_avx512 xmm0 + clear_zmms_avx512 xmm8 %endif ; CALC_AAD_HASH needs to deal with multiple of 16 bytes sub arg4, r11 add arg3, r11 - vmovq xmm14, arg4 ; Save remaining length + vmovq xmm21, arg4 ; Save remaining length and arg4, -16 ; Get multiple of 16 bytes or arg4, arg4 jz no_full_blocks ;; Calculate GHASH of this segment - CALC_AAD_HASH arg3, arg4, xmm8, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ - zmm6, zmm7, zmm9, zmm10, zmm11, zmm12, zmm13, zmm15, \ - zmm16, zmm17, zmm18, zmm19, zmm20, r10, r11, r12, k1 - vmovdqu64 [arg2 + AadHash], xmm8 ; ctx_data.aad hash = aad_hash + + ;; arg1 [in] pointer to key structure - arg1 here + ;; arg2 [in] message pointer - arg3 here + ;; arg3 [in] message length - arg4 here + mov r15, arg2 + mov r13, arg3 + mov arg2, arg3 + mov arg3, arg4 + + ;; xmm0 [in/out] ghash value + call ghash_internal_vaes_avx512 + + ;; restore original arguments + mov arg2, r15 + mov arg3, r13 + + vmovdqu64 [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text ;; **zmm19 may contain authentication key @@ -110,7 +125,7 @@ GMAC_FN_NAME(update): no_full_blocks: add arg3, arg4 ; Point at partial block - vmovq arg4, xmm14 ; Restore original remaining length + vmovq arg4, xmm21 ; Restore original remaining length and arg4, 15 jz exit_gmac_update @@ -118,11 +133,11 @@ no_full_blocks: mov [arg2 + PBlockLen], arg4 READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 vpshufb xmm1, xmm1, [rel SHUF_MASK] - vpxorq xmm8, xmm8, xmm1 - vmovdqu64 [arg2 + AadHash], xmm8 + vpxorq xmm0, xmm0, xmm1 + vmovdqu64 [arg2 + AadHash], xmm0 %ifdef SAFE_DATA ;; **xmm1 and xmm8 may contain some clear text - clear_zmms_avx512 xmm1, xmm8 + clear_zmms_avx512 xmm1, xmm0 %endif exit_gmac_update: FUNC_RESTORE -- GitLab From 2c42444ca15bdf57ef9da50b74c1c5ffeed058ff Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 14:08:45 +0000 Subject: [PATCH 10/30] vaes-avx512: [gmac] consolidate AES-GMAC update API's --- lib/Makefile | 2 +- lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm | 31 -------------- lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm | 31 -------------- lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm | 31 -------------- .../gmac_api_vaes_avx512.asm} | 42 ++++++++++--------- lib/win_x64.mak | 4 +- 6 files changed, 24 insertions(+), 117 deletions(-) delete mode 100644 lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm delete mode 100644 lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm delete mode 100644 lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm rename lib/{include/gcm_gmac_api_vaes_avx512.inc => avx512_t2/gmac_api_vaes_avx512.asm} (86%) diff --git a/lib/Makefile b/lib/Makefile index 779c67fa..b52707e7 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -838,7 +838,7 @@ asm_avx512_gcm_objs := \ aes128_gcm_api_vaes_avx512.o aes192_gcm_api_vaes_avx512.o aes256_gcm_api_vaes_avx512.o 
\ aes128_gcm_sgl_api_vaes_avx512.o aes192_gcm_sgl_api_vaes_avx512.o aes256_gcm_sgl_api_vaes_avx512.o \ ghash_api_vaes_avx512.o \ - aes128_gmac_api_vaes_avx512.o aes192_gmac_api_vaes_avx512.o aes256_gmac_api_vaes_avx512.o \ + gmac_api_vaes_avx512.o \ aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o endif # aarch64 diff --git a/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm deleted file mode 100644 index 099484ca..00000000 --- a/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM128_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm deleted file mode 100644 index d82a5e20..00000000 --- a/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. 
-; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM192_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm deleted file mode 100644 index 422e2ba4..00000000 --- a/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM256_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/include/gcm_gmac_api_vaes_avx512.inc b/lib/avx512_t2/gmac_api_vaes_avx512.asm similarity index 86% rename from lib/include/gcm_gmac_api_vaes_avx512.inc rename to lib/avx512_t2/gmac_api_vaes_avx512.asm index 4629415f..df91c2c4 100644 --- a/lib/include/gcm_gmac_api_vaes_avx512.inc +++ b/lib/avx512_t2/gmac_api_vaes_avx512.asm @@ -1,5 +1,5 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -27,13 +27,11 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM128_MODE 1 %include "include/gcm_vaes_avx512.inc" %include "include/error.inc" %include "include/clear_regs.inc" -%ifndef GCM_GMAC_API_VAES_AVX512_INC -%define GCM_GMAC_API_VAES_AVX512_INC - extern ghash_internal_vaes_avx512 mksection .text @@ -48,8 +46,13 @@ default rel ; const u8 *in, ; const u64 msg_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(GMAC_FN_NAME(update),function,) -GMAC_FN_NAME(update): +align 32 +MKGLOBAL(imb_aes_gmac_update_128_vaes_avx512,function,) +MKGLOBAL(imb_aes_gmac_update_192_vaes_avx512,function,) +MKGLOBAL(imb_aes_gmac_update_256_vaes_avx512,function,) +imb_aes_gmac_update_128_vaes_avx512: +imb_aes_gmac_update_192_vaes_avx512: +imb_aes_gmac_update_256_vaes_avx512: endbranch64 FUNC_SAVE small_frame @@ -59,20 +62,20 @@ GMAC_FN_NAME(update): %endif ;; Check if msg_len == 0 cmp arg4, 0 - je exit_gmac_update + je .exit_gmac_update %ifdef SAFE_PARAM ;; Check key_data != NULL cmp arg1, 0 - jz error_gmac_update + jz .error_gmac_update ;; Check context_data != NULL cmp arg2, 0 - jz error_gmac_update + jz .error_gmac_update ;; Check in != NULL (msg_len != 0) cmp arg3, 0 - jz error_gmac_update + jz .error_gmac_update %endif ; Increment size of "AAD length" for GMAC @@ -95,7 +98,7 @@ GMAC_FN_NAME(update): and arg4, -16 ; Get multiple of 16 bytes or arg4, arg4 - jz no_full_blocks + jz .no_full_blocks ;; Calculate GHASH of this segment @@ -119,15 +122,15 @@ GMAC_FN_NAME(update): ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text ;; **zmm19 may contain authentication key %ifdef SAFE_DATA - clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm18, xmm19 + clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm18, xmm8 %endif -no_full_blocks: +.no_full_blocks: add arg3, arg4 ; Point at partial block vmovq arg4, xmm21 ; Restore original remaining length and arg4, 15 - jz exit_gmac_update + jz .exit_gmac_update ; Save next partial block mov [arg2 + PBlockLen], arg4 @@ -136,15 +139,15 @@ no_full_blocks: vpxorq xmm0, xmm0, xmm1 vmovdqu64 [arg2 + AadHash], xmm0 %ifdef SAFE_DATA - ;; **xmm1 and xmm8 may contain some clear text + ;; **xmm1 and xmm0 may contain some clear text clear_zmms_avx512 xmm1, xmm0 %endif -exit_gmac_update: +.exit_gmac_update: FUNC_RESTORE ret %ifdef SAFE_PARAM -error_gmac_update: +.error_gmac_update: ;; Clear reg and imb_errno IMB_ERR_CHECK_START rax @@ -159,9 +162,8 @@ error_gmac_update: ;; Set imb_errno IMB_ERR_CHECK_END rax - jmp exit_gmac_update + jmp .exit_gmac_update %endif mksection stack-noexec -%endif ; GCM_GMAC_API_VAES_AVX512_INC diff --git a/lib/win_x64.mak b/lib/win_x64.mak index 04020865..71d33702 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -565,19 +565,17 @@ gcm_objs = \ $(OBJ_DIR)\aes128_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes128_gcm_sgl_api_vaes_avx512.obj \ $(OBJ_DIR)\ghash_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes128_gmac_api_vaes_avx512.obj \ + $(OBJ_DIR)\gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes192_gcm_vaes_avx2.obj \ $(OBJ_DIR)\aes192_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes192_gcm_api_vaes_avx512.obj \ 
$(OBJ_DIR)\aes192_gcm_sgl_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes192_gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes256_gcm_vaes_avx2.obj \ $(OBJ_DIR)\aes256_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes256_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_sgl_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes256_gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\gcm128_api_by8_sse.obj \ $(OBJ_DIR)\gcm128_sgl_api_by8_sse.obj \ $(OBJ_DIR)\gcm128_gmac_api_by8_sse.obj \ -- GitLab From 2fcd9b661766ce739db2534c21768122fb943b03 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 14:42:33 +0000 Subject: [PATCH 11/30] vaes-avx512: [ghash] improve register usage for internal GHASH API --- lib/avx512_t2/ghash_api_vaes_avx512.asm | 20 ++-- lib/avx512_t2/gmac_api_vaes_avx512.asm | 24 ++-- lib/include/gcm_vaes_avx512.inc | 150 ++++++++++++++++-------- 3 files changed, 117 insertions(+), 77 deletions(-) diff --git a/lib/avx512_t2/ghash_api_vaes_avx512.asm b/lib/avx512_t2/ghash_api_vaes_avx512.asm index e3952888..1fe559d9 100644 --- a/lib/avx512_t2/ghash_api_vaes_avx512.asm +++ b/lib/avx512_t2/ghash_api_vaes_avx512.asm @@ -105,18 +105,18 @@ error_ghash_pre: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; ghash_internal_vaes_avx512() -; arg2 [in] message pointer -; arg3 [in] message length +; r12 [in/clobbered] message pointer +; r13 [in/clobbered] message length ; xmm0 [in/out] ghash value ; arg1 [in] pointer to key structure -; clobbers: zmm1-zmm19, r10-r12, k1 +; clobbers: zmm1-zmm19, rax, k1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(ghash_internal_vaes_avx512,function,internal) ghash_internal_vaes_avx512: - CALC_AAD_HASH arg2, arg3, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ - zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ - zmm15, zmm16, zmm17, zmm18, zmm19, r10, r11, r12, k1 + CALC_GHASH r12, r13, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ + zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ + zmm15, zmm16, zmm17, zmm18, zmm19, rax, k1 ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text ;; **zmm13, zmm15, zmm18 and zmm8 may contain hash key ret @@ -164,10 +164,12 @@ ghash_vaes_avx512: vmovdqu xmm0, [arg4] vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - ;; arg1 [in] pointer to key structure - ;; arg2 [in] message pointer - ;; arg3 [in] message length + ;; arg1 [in] pointer to key structure => arg1 + ;; r12 [in] message pointer => arg2 + ;; r13 [in] message length => arg3 ;; xmm0 [in/out] ghash value + mov r12, arg2 + mov r13, arg3 call ghash_internal_vaes_avx512 vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap diff --git a/lib/avx512_t2/gmac_api_vaes_avx512.asm b/lib/avx512_t2/gmac_api_vaes_avx512.asm index df91c2c4..d4d8c950 100644 --- a/lib/avx512_t2/gmac_api_vaes_avx512.asm +++ b/lib/avx512_t2/gmac_api_vaes_avx512.asm @@ -94,29 +94,21 @@ imb_aes_gmac_update_256_vaes_avx512: sub arg4, r11 add arg3, r11 - vmovq xmm21, arg4 ; Save remaining length - and arg4, -16 ; Get multiple of 16 bytes - - or arg4, arg4 + mov r10, arg4 ; Save remaining length + and arg4, -16 ; Get multiple of 16 bytes jz .no_full_blocks ;; Calculate GHASH of this segment ;; arg1 [in] pointer to key structure - arg1 here - ;; arg2 [in] message pointer - arg3 here - ;; arg3 [in] message length - arg4 here - mov r15, arg2 - mov r13, arg3 - mov arg2, arg3 - mov arg3, arg4 + ;; r12 [in] message pointer - arg3 here + ;; r13 [in] message length - arg4 here + 
mov r12, arg3 + mov r13, arg4 ;; xmm0 [in/out] ghash value call ghash_internal_vaes_avx512 - ;; restore original arguments - mov arg2, r15 - mov arg3, r13 - vmovdqu64 [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text @@ -126,9 +118,9 @@ imb_aes_gmac_update_256_vaes_avx512: %endif .no_full_blocks: - add arg3, arg4 ; Point at partial block + add arg3, arg4 ; Point at partial block - vmovq arg4, xmm21 ; Restore original remaining length + mov arg4, r10 ; Restore original remaining length and arg4, 15 jz .exit_gmac_update diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index ae944dfe..bc2d2132 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -824,13 +824,13 @@ %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; CALC_GHASH: Calculates the hash of the data which will not be encrypted. ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ; Output: The hash of the data (AAD_HASH). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_AAD_HASH 26 -%define %%A_IN %1 ; [in] AAD text pointer -%define %%A_LEN %2 ; [in] AAD length +%macro CALC_GHASH 24 +%define %%A_IN %1 ; [in/clobbered] AAD text pointer +%define %%A_LEN %2 ; [in/clobbered] AAD length %define %%AAD_HASH %3 ; [in/out] xmm ghash value %define %%GDATA_KEY %4 ; [in] pointer to keys %define %%ZT0 %5 ; [clobbered] ZMM register @@ -851,89 +851,88 @@ %define %%ZT15 %20 ; [clobbered] ZMM register %define %%ZT16 %21 ; [clobbered] ZMM register %define %%ZT17 %22 ; [clobbered] ZMM register -%define %%T1 %23 ; [clobbered] GP register -%define %%T2 %24 ; [clobbered] GP register -%define %%T3 %25 ; [clobbered] GP register -%define %%MASKREG %26 ; [clobbered] mask register +%define %%T3 %23 ; [clobbered] GP register +%define %%MASKREG %24 ; [clobbered] mask register %define %%SHFMSK %%ZT13 - mov %%T1, %%A_IN ; T1 = AAD - mov %%T2, %%A_LEN ; T2 = aadLen - - cmp %%T2, (16*16) + cmp %%A_LEN, (16*16) jb %%_less_than_16x16 vmovdqa64 %%SHFMSK, [rel SHUF_MASK] +align 32 %%_get_AAD_loop2x32x16: - cmp %%T2, (2*32*16) + cmp %%A_LEN, (2*32*16) jb %%_get_AAD_loop32x16 GHASH_16 start, hk_bcast, %%ZT5, %%ZT6, \ - %%T1, (0*16*16), 0, \ + %%A_IN, (0*16*16), 0, \ %%GDATA_KEY, HashKey_32, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK GHASH_16 end_reduce_no_hxor, hk_bcast, %%ZT5, %%ZT6, \ - %%T1, (1*16*16), 0, \ + %%A_IN, (1*16*16), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - add %%T1, (32*16) - sub %%T2, (32*16) + add %%A_IN, (32*16) + sub %%A_LEN, (32*16) jmp %%_get_AAD_loop2x32x16 +align 32 %%_get_AAD_loop32x16: - cmp %%T2, (32*16) + cmp %%A_LEN, (32*16) jb %%_exit_AAD_loop32x16 GHASH_16 start, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*16*16), 0, \ + %%A_IN, (0*16*16), 0, \ %%GDATA_KEY, HashKey_32, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK GHASH_16 end_reduce, hk_load, %%ZT5, %%ZT6, \ - %%T1, (1*16*16), 0, \ + %%A_IN, (1*16*16), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, 
\ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - sub %%T2, (32*16) + sub %%A_LEN, (32*16) je %%_CALC_AAD_done - add %%T1, (32*16) + add %%A_IN, (32*16) jmp %%_get_AAD_loop32x16 +align 32 %%_exit_AAD_loop32x16: ; Less than 32x16 bytes remaining - cmp %%T2, (16*16) + cmp %%A_LEN, (16*16) jb %%_less_than_16x16 je %%_equal_16x16 %%_less_than_32x16: ;; calculate offset to hash key to start with - lea %%T3, [%%T2 + 15] + lea %%T3, [%%A_LEN + 15] and %%T3, ~15 neg %%T3 add %%T3, HashKey_1 + 16 GHASH_16 start, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*64), 0, \ + %%A_IN, (0*64), 0, \ %%GDATA_KEY, %%T3, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - sub %%T2, (16*16) - add %%T1, (16*16) + sub %%A_LEN, (16*16) + add %%A_IN, (16*16) jmp %%_less_than_16x16_remain +align 32 %%_equal_16x16: GHASH_16 start_reduce, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*64), 0, \ + %%A_IN, (0*64), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK @@ -941,38 +940,39 @@ jmp %%_CALC_AAD_done ; Less than 16x16 bytes remaining +align 32 %%_less_than_16x16_remain: ;; ZT5 (H), ZT6 (L) contain ghash sums ;; prep mask source address lea %%T3, [rel byte64_len_to_mask_table] - lea %%T3, [%%T3 + %%T2*8] + lea %%T3, [%%T3 + %%A_LEN*8] ;; calculate number of blocks to ghash (including partial bytes) - add DWORD(%%T2), 15 - shr DWORD(%%T2), 4 + add DWORD(%%A_LEN), 15 + shr DWORD(%%A_LEN), 4 jz %%_CALC_AAD_done ;; catch zero length - cmp DWORD(%%T2), 2 + cmp DWORD(%%A_LEN), 2 jb %%_AAD_blocks_cont_1 je %%_AAD_blocks_cont_2 - cmp DWORD(%%T2), 4 + cmp DWORD(%%A_LEN), 4 jb %%_AAD_blocks_cont_3 je %%_AAD_blocks_cont_4 - cmp DWORD(%%T2), 6 + cmp DWORD(%%A_LEN), 6 jb %%_AAD_blocks_cont_5 je %%_AAD_blocks_cont_6 - cmp DWORD(%%T2), 8 + cmp DWORD(%%A_LEN), 8 jb %%_AAD_blocks_cont_7 je %%_AAD_blocks_cont_8 - cmp DWORD(%%T2), 10 + cmp DWORD(%%A_LEN), 10 jb %%_AAD_blocks_cont_9 je %%_AAD_blocks_cont_10 - cmp DWORD(%%T2), 12 + cmp DWORD(%%A_LEN), 12 jb %%_AAD_blocks_cont_11 je %%_AAD_blocks_cont_12 - cmp DWORD(%%T2), 14 + cmp DWORD(%%A_LEN), 14 jb %%_AAD_blocks_cont_13 je %%_AAD_blocks_cont_14 - cmp DWORD(%%T2), 15 + cmp DWORD(%%A_LEN), 15 je %%_AAD_blocks_cont_15 ;; fall through for 16 blocks @@ -987,6 +987,7 @@ ;; generate all 16 cases using preprocessor %rep 16 +align 32 %%_AAD_blocks_cont_ %+ I: %if I > 12 sub %%T3, 12 * 16 * 8 @@ -998,7 +999,7 @@ kmovq %%MASKREG, [%%T3] ZMM_LOAD_MASKED_BLOCKS_0_16 \ - I, %%T1, 0, \ + I, %%A_IN, 0, \ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 \ @@ -1020,37 +1021,38 @@ %endrep ; Less than 16x16 bytes +align 32 %%_less_than_16x16: ;; prep mask source address lea %%T3, [rel byte64_len_to_mask_table] - lea %%T3, [%%T3 + %%T2*8] + lea %%T3, [%%T3 + %%A_LEN*8] ;; calculate number of blocks to ghash (including partial bytes) - add DWORD(%%T2), 15 - shr DWORD(%%T2), 4 + add DWORD(%%A_LEN), 15 + shr DWORD(%%A_LEN), 4 jz %%_CALC_AAD_done ;; catch zero length - cmp DWORD(%%T2), 2 + cmp DWORD(%%A_LEN), 2 jb %%_AAD_blocks_1 je %%_AAD_blocks_2 - cmp DWORD(%%T2), 4 + cmp DWORD(%%A_LEN), 4 jb %%_AAD_blocks_3 je %%_AAD_blocks_4 - cmp DWORD(%%T2), 6 + cmp DWORD(%%A_LEN), 6 jb %%_AAD_blocks_5 je %%_AAD_blocks_6 - cmp DWORD(%%T2), 8 + cmp DWORD(%%A_LEN), 8 jb %%_AAD_blocks_7 je %%_AAD_blocks_8 - cmp 
DWORD(%%T2), 10 + cmp DWORD(%%A_LEN), 10 jb %%_AAD_blocks_9 je %%_AAD_blocks_10 - cmp DWORD(%%T2), 12 + cmp DWORD(%%A_LEN), 12 jb %%_AAD_blocks_11 je %%_AAD_blocks_12 - cmp DWORD(%%T2), 14 + cmp DWORD(%%A_LEN), 14 jb %%_AAD_blocks_13 je %%_AAD_blocks_14 - cmp DWORD(%%T2), 15 + cmp DWORD(%%A_LEN), 15 je %%_AAD_blocks_15 ;; fall through for 16 blocks @@ -1065,6 +1067,7 @@ ;; generate all 16 cases using preprocessor %rep 16 +align 32 %%_AAD_blocks_ %+ I: %if I >= 3 vmovdqa64 %%SHFMSK, [rel SHUF_MASK] @@ -1084,7 +1087,7 @@ kmovq %%MASKREG, [%%T3] ZMM_LOAD_MASKED_BLOCKS_0_16 \ - I, %%T1, 0, \ + I, %%A_IN, 0, \ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 \ @@ -1109,6 +1112,49 @@ %%_CALC_AAD_done: ;; result in AAD_HASH +%endmacro ; CALC_GHASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 26 +%define %%A_IN %1 ; [in] AAD text pointer +%define %%A_LEN %2 ; [in] AAD length +%define %%AAD_HASH %3 ; [in/out] xmm ghash value +%define %%GDATA_KEY %4 ; [in] pointer to keys +%define %%ZT0 %5 ; [clobbered] ZMM register +%define %%ZT1 %6 ; [clobbered**] ZMM register +%define %%ZT2 %7 ; [clobbered**] ZMM register +%define %%ZT3 %8 ; [clobbered**] ZMM register +%define %%ZT4 %9 ; [clobbered**] ZMM register +%define %%ZT5 %10 ; [clobbered] ZMM register +%define %%ZT6 %11 ; [clobbered] ZMM register +%define %%ZT7 %12 ; [clobbered] ZMM register +%define %%ZT8 %13 ; [clobbered] ZMM register +%define %%ZT9 %14 ; [clobbered] ZMM register +%define %%ZT10 %15 ; [clobbered] ZMM register +%define %%ZT11 %16 ; [clobbered] ZMM register +%define %%ZT12 %17 ; [clobbered] ZMM register +%define %%ZT13 %18 ; [clobbered] ZMM register +%define %%ZT14 %19 ; [clobbered] ZMM register +%define %%ZT15 %20 ; [clobbered] ZMM register +%define %%ZT16 %21 ; [clobbered] ZMM register +%define %%ZT17 %22 ; [clobbered] ZMM register +%define %%T1 %23 ; [clobbered] GP register +%define %%T2 %24 ; [clobbered] GP register +%define %%T3 %25 ; [clobbered] GP register +%define %%MASKREG %26 ; [clobbered] mask register + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, %%ZT14, \ + %%ZT15, %%ZT16, %%ZT17, %%T3, %%MASKREG + %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -- GitLab From efe6575bba5d374e7f8cc8de3520825111f53c3d Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 15:10:41 +0000 Subject: [PATCH 12/30] vaes-avx512: [gcm] move check for IV length == 12 inside the GCM_INIT macro --- lib/include/gcm_api_vaes_avx512.inc | 26 +------------------------ lib/include/gcm_sgl_api_vaes_avx512.inc | 10 ---------- lib/include/gcm_vaes_avx512.inc | 22 ++++++++++++++------- 3 files changed, 16 insertions(+), 42 deletions(-) diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index a897c13c..3b4a97ce 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -393,25 +393,13 @@ FN_NAME(enc_var_iv,_): mov arg1, [arg2 + _enc_keys] - cmp qword [arg2 
+ _iv_len_in_bytes], 12 - je iv_len_12_enc_IV - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call, \ - {[arg2 + _iv_len_in_bytes]} - jmp skip_iv_len_12_enc_IV - -iv_len_12_enc_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ - r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call + {qword [arg2 + _iv_len_in_bytes]} -skip_iv_len_12_enc_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] @@ -448,25 +436,13 @@ FN_NAME(dec_var_iv,_): mov arg1, [arg2 + _dec_keys] - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_dec_IV - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call, \ {qword [arg2 + _iv_len_in_bytes]} - jmp skip_iv_len_12_dec_IV - -iv_len_12_dec_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ - r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call -skip_iv_len_12_dec_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] diff --git a/lib/include/gcm_sgl_api_vaes_avx512.inc b/lib/include/gcm_sgl_api_vaes_avx512.inc index ecdcf4cc..b653ee37 100644 --- a/lib/include/gcm_sgl_api_vaes_avx512.inc +++ b/lib/include/gcm_sgl_api_vaes_avx512.inc @@ -159,20 +159,10 @@ FN_NAME(init_var_iv,_): skip_aad_check_init_IV: %endif - cmp arg4, 12 - je iv_len_12_init_IV - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, multi_call, arg4 - jmp skip_iv_len_12_init_IV - -iv_len_12_init_IV: - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, multi_call -skip_iv_len_12_init_IV: ;; SAFE_DATA covered in FUNC_RESTORE() exit_init_IV: diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index bc2d2132..3e11acc4 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -2952,19 +2952,30 @@ align 32 %define %%IV_LEN %31 ; [in] IV length ;; prepare IV -%if %0 == 31 ;; IV is different than 12 bytes +%if %0 == 31 ;; IV may different than 12 bytes + cmp %%IV_LEN, 12 + je .iv_length_is_12_bytes + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT, \ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, \ %%ZT14, %%ZT15, %%ZT16, %%ZT17, %%GPR1, %%GPR2, %%GPR3, %%MASKREG -%else ;; IV is 12 bytes + jmp .iv_prep_is_done + +%endif ;; IV is 12 bytes + +.iv_length_is_12_bytes: ;; read 12 IV bytes and pad with 0x00000001 vmovdqa64 %%CUR_COUNT, [rel ONEf] mov %%GPR2, %%IV mov DWORD(%%GPR1), 0x0000_0fff kmovd %%MASKREG, DWORD(%%GPR1) vmovdqu8 
%%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1 -%endif + +.iv_prep_is_done: + vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv + vpshufb %%CUR_COUNT, %%CUR_COUNT, [rel SHUF_MASK] + vmovdqu64 [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv (LE format) ;; calculate AAD hash cmp %%A_LEN, 12 @@ -2988,7 +2999,7 @@ align 32 jmp %%_aad_compute_done %%_aad_is_not_12_bytes: - vpxor %%AAD_HASH, %%AAD_HASH + vpxor %%AAD_HASH, %%AAD_HASH, %%AAD_HASH CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \ %%ZT10, %%ZT11, %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, %%ZT17, \ @@ -3007,9 +3018,6 @@ align 32 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0 %endif - vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv - vpshufb %%CUR_COUNT, [rel SHUF_MASK] - vmovdqu64 [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv (LE format) %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -- GitLab From fd748e91dfa71e5d8fc9ccc3172d4b769a041ca2 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 15:53:00 +0000 Subject: [PATCH 13/30] vaes-avx512: [gcm] use internal GHASH API in GCM_INIT for AAD != 12 bytes code path - tune internal GHASH API SIMD register usage to avoid data clobbering in GCM_INIT --- lib/avx512_t2/ghash_api_vaes_avx512.asm | 12 +++++----- lib/avx512_t2/gmac_api_vaes_avx512.asm | 4 +--- lib/include/gcm_api_vaes_avx512.inc | 5 ++++ lib/include/gcm_sgl_api_vaes_avx512.inc | 5 ++++ lib/include/gcm_vaes_avx512.inc | 32 ++++++++++++++----------- 5 files changed, 35 insertions(+), 23 deletions(-) diff --git a/lib/avx512_t2/ghash_api_vaes_avx512.asm b/lib/avx512_t2/ghash_api_vaes_avx512.asm index 1fe559d9..a0c5c0fb 100644 --- a/lib/avx512_t2/ghash_api_vaes_avx512.asm +++ b/lib/avx512_t2/ghash_api_vaes_avx512.asm @@ -109,16 +109,16 @@ error_ghash_pre: ; r13 [in/clobbered] message length ; xmm0 [in/out] ghash value ; arg1 [in] pointer to key structure -; clobbers: zmm1-zmm19, rax, k1 +; clobbers: zmm1, zmm3-zmm13, zmm15-zmm20, rax, k1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(ghash_internal_vaes_avx512,function,internal) ghash_internal_vaes_avx512: - CALC_GHASH r12, r13, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ + CALC_GHASH r12, r13, xmm0, arg1, zmm1, zmm3, zmm4, zmm5, \ zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ - zmm15, zmm16, zmm17, zmm18, zmm19, rax, k1 - ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text - ;; **zmm13, zmm15, zmm18 and zmm8 may contain hash key + zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, rax, k1 + ;; **zmm3, zmm4, zmm5 and zmm6 may contain clear text + ;; **zmm15, zmm16, zmm19 and zmm9 may contain hash key ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -175,7 +175,7 @@ ghash_vaes_avx512: vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap simd_store_avx arg4, xmm0, arg5, r12, rax %ifdef SAFE_DATA - clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm13, xmm15, xmm8, xmm18 + clear_zmms_avx512 xmm0, xmm3, xmm4, xmm5, xmm6, xmm15, xmm16, xmm9, xmm19 %endif exit_ghash: FUNC_RESTORE diff --git a/lib/avx512_t2/gmac_api_vaes_avx512.asm b/lib/avx512_t2/gmac_api_vaes_avx512.asm index d4d8c950..0a57335c 100644 --- a/lib/avx512_t2/gmac_api_vaes_avx512.asm +++ b/lib/avx512_t2/gmac_api_vaes_avx512.asm @@ -111,10 +111,8 @@ 
imb_aes_gmac_update_256_vaes_avx512: vmovdqu64 [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash - ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text - ;; **zmm19 may contain authentication key %ifdef SAFE_DATA - clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm18, xmm8 + clear_zmms_avx512 xmm3, xmm4, xmm5, xmm6, xmm19, xmm9 %endif .no_full_blocks: diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index 3b4a97ce..1a24b96e 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -36,6 +36,11 @@ %ifndef GCM_API_VAES_AVX512_INC %define GCM_API_VAES_AVX512_INC +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; External symbols +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +extern ghash_internal_vaes_avx512 + mksection .text default rel diff --git a/lib/include/gcm_sgl_api_vaes_avx512.inc b/lib/include/gcm_sgl_api_vaes_avx512.inc index b653ee37..f10d1d38 100644 --- a/lib/include/gcm_sgl_api_vaes_avx512.inc +++ b/lib/include/gcm_sgl_api_vaes_avx512.inc @@ -35,6 +35,11 @@ %ifndef GCM_SGL_API_VAES_AVX512_INC %define GCM_SGL_API_VAES_AVX512_INC +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; External symbols +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +extern ghash_internal_vaes_avx512 + mksection .text default rel diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index 3e11acc4..6a4351ba 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -2952,19 +2952,19 @@ align 32 %define %%IV_LEN %31 ; [in] IV length ;; prepare IV -%if %0 == 31 ;; IV may different than 12 bytes +%if %0 == 31 ;; IV may be different than 12 bytes cmp %%IV_LEN, 12 - je .iv_length_is_12_bytes + je %%_iv_length_is_12_bytes CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT, \ %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, \ %%ZT14, %%ZT15, %%ZT16, %%ZT17, %%GPR1, %%GPR2, %%GPR3, %%MASKREG - jmp .iv_prep_is_done + jmp %%_iv_prep_is_done -%endif ;; IV is 12 bytes +%endif -.iv_length_is_12_bytes: +%%_iv_length_is_12_bytes: ;; read 12 IV bytes and pad with 0x00000001 vmovdqa64 %%CUR_COUNT, [rel ONEf] mov %%GPR2, %%IV @@ -2972,7 +2972,7 @@ align 32 kmovd %%MASKREG, DWORD(%%GPR1) vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1 -.iv_prep_is_done: +%%_iv_prep_is_done: vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv vpshufb %%CUR_COUNT, %%CUR_COUNT, [rel SHUF_MASK] vmovdqu64 [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv (LE format) @@ -2982,7 +2982,7 @@ align 32 jne %%_aad_is_not_12_bytes ;; load 12 bytes of AAD -%if %0 == 31 ;; IV is different than 12 bytes +%if %0 == 31 ;; IV may be different than 12 bytes mov DWORD(%%GPR1), 0x0000_0fff kmovd %%MASKREG, DWORD(%%GPR1) %endif @@ -2990,7 +2990,7 @@ align 32 vmovdqu8 XWORD(%%AAD_HASH){%%MASKREG}{z}, [%%GPR1] vmovdqu8 XWORD(%%ZT0), [%%GDATA_KEY + HashKey_1] vmovdqu8 XWORD(%%ZT5), [%%GDATA_KEY + HashKey_1 + HKeyGap] - vpshufb XWORD(%%AAD_HASH), [rel SHUF_MASK] + vpshufb XWORD(%%AAD_HASH), XWORD(%%AAD_HASH), [rel SHUF_MASK] ;; GHASH 12 bytes of AAD GHASH_MUL2 XWORD(%%AAD_HASH), XWORD(%%ZT0), XWORD(%%ZT5), \ @@ -2999,12 +2999,16 @@ align 32 jmp %%_aad_compute_done %%_aad_is_not_12_bytes: - vpxor %%AAD_HASH, %%AAD_HASH, %%AAD_HASH - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ - %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \ - %%ZT10, %%ZT11, %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, %%ZT17, \ - %%GPR1, %%GPR2, %%GPR3, %%MASKREG - ;; **ZT1, ZT2, ZT3 may contain AAD 
but AAD is not considered sensitive + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; xmm0 - hash in/out + mov r12, %%A_IN + mov r13, %%A_LEN + call ghash_internal_vaes_avx512 + vmovdqa %%AAD_HASH, xmm0 + %%_aad_compute_done: ;; set up context fields -- GitLab From 94eb2304fed391252bbcf76aa1b8a3a9914faeb1 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 11 Jan 2024 17:13:38 +0000 Subject: [PATCH 14/30] vaes-avx512: [gcm] use internal GHASH API for J0 calculation --- lib/include/gcm_vaes_avx512.inc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index 6a4351ba..8802f73d 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -2889,19 +2889,21 @@ align 32 ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ;; Calculate GHASH of (IV || 0s) - vpxor %%J0, %%J0 - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ - %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, %%ZT10, %%ZT11, \ - %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, %%ZT17, \ - %%T1, %%T2, %%T3, %%MASKREG - ;; **ZT1, ZT2, ZT3 may contain sensitive data + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; xmm0 - hash in/out + mov r12, %%IV + mov r13, %%IV_LEN + call ghash_internal_vaes_avx512 + vmovdqa64 %%J0, xmm0 ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) mov %%T1, %%IV_LEN shl %%T1, 3 ;; IV length in bits vmovq XWORD(%%ZT2), %%T1 - ;; Might need shuffle of ZT2 vpxorq %%J0, XWORD(%%ZT2), %%J0 vmovdqu64 XWORD(%%ZT0), [%%KEY + HashKey_1] @@ -2909,7 +2911,7 @@ align 32 GHASH_MUL2 %%J0, XWORD(%%ZT0), XWORD(%%ZT5), XWORD(%%ZT1), XWORD(%%ZT2), XWORD(%%ZT3), XWORD(%%ZT4) ;; **ZT1, ZT2, ZT3 overwritten with ghash products - vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap + vpshufb %%J0, %%J0, [rel SHUF_MASK] ; perform a 16Byte swap %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -- GitLab From c9e9513e4ad8cee182e0fe92bf97a49132b165dd Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Mon, 8 Jan 2024 15:23:29 +0000 Subject: [PATCH 15/30] build: [aes-gcm] remove API embodiment from the common include module API's compiled in two groups: - GHASH API's - internal GHASH API added (GHASH API uses it) - GCM, GCM SGL and GMAC API's - GMAC API's use new internal GHASH API --- lib/Makefile | 4 +- lib/avx2_t1/ghash_by8_avx2.asm | 33 + lib/avx512_t1/ghash_by8_avx512.asm | 33 + lib/include/gcm_api_avx2_avx512.inc | 1179 +++++++++++++++++++ lib/include/gcm_avx512.inc | 4 +- lib/include/gcm_avx_gen4.inc | 4 +- lib/include/gcm_common_avx2_avx512.inc | 1371 +--------------------- lib/include/ghash_common_avx2_avx512.inc | 212 ++++ lib/win_x64.mak | 2 + 9 files changed, 1507 insertions(+), 1335 deletions(-) create mode 100644 lib/avx2_t1/ghash_by8_avx2.asm create mode 100644 lib/avx512_t1/ghash_by8_avx512.asm create mode 100644 lib/include/gcm_api_avx2_avx512.inc create mode 100644 lib/include/ghash_common_avx2_avx512.inc diff --git a/lib/Makefile b/lib/Makefile index b52707e7..d85a7506 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -831,7 +831,7 @@ asm_sse_gcm_objs := \ asm_avx_gcm_objs := asm_avx2_gcm_objs := \ - aes128_gcm_by8_avx2.o aes192_gcm_by8_avx2.o aes256_gcm_by8_avx2.o \ + ghash_by8_avx2.o aes128_gcm_by8_avx2.o aes192_gcm_by8_avx2.o aes256_gcm_by8_avx2.o \ aes128_gcm_vaes_avx2.o aes192_gcm_vaes_avx2.o aes256_gcm_vaes_avx2.o 
asm_avx512_gcm_objs := \ @@ -839,7 +839,7 @@ asm_avx512_gcm_objs := \ aes128_gcm_sgl_api_vaes_avx512.o aes192_gcm_sgl_api_vaes_avx512.o aes256_gcm_sgl_api_vaes_avx512.o \ ghash_api_vaes_avx512.o \ gmac_api_vaes_avx512.o \ - aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o + ghash_by8_avx512.o aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o endif # aarch64 diff --git a/lib/avx2_t1/ghash_by8_avx2.asm b/lib/avx2_t1/ghash_by8_avx2.asm new file mode 100644 index 00000000..fe10e497 --- /dev/null +++ b/lib/avx2_t1/ghash_by8_avx2.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define IS_AVX2_GCM +%define GCM128_MODE +%include "include/ghash_common_avx2_avx512.inc" + diff --git a/lib/avx512_t1/ghash_by8_avx512.asm b/lib/avx512_t1/ghash_by8_avx512.asm new file mode 100644 index 00000000..e7c37300 --- /dev/null +++ b/lib/avx512_t1/ghash_by8_avx512.asm @@ -0,0 +1,33 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. 
+; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define IS_AVX512_GCM +%define GCM128_MODE +%include "include/ghash_common_avx2_avx512.inc" + diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc new file mode 100644 index 00000000..155092b8 --- /dev/null +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -0,0 +1,1179 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%use smartalign + +%include "include/gcm_common_avx2_avx512.inc" + +mksection .text +default rel + +extern GHASH_FN_NAME(ghash_internal) + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen4 / +; aes_gcm_precomp_192_avx_gen4 / +; aes_gcm_precomp_256_avx_gen4 / +; aes_gcm_precomp_128_avx512 / +; aes_gcm_precomp_192_avx512 / +; aes_gcm_precomp_256_avx512 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(precomp,_),function,) +FN_NAME(precomp,_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_precomp +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + sub rsp, 1*16 + ; only xmm6 needs to be maintained + vmovdqu [rsp + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 +%endif + +exit_precomp: + + ret + +%ifdef SAFE_PARAM +error_precomp: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_precomp +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 / +; aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(init,_),function,) +FN_NAME(init,_): + endbranch64 + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + lea r14, [rsp + 4*8] + ; xmm6 needs to be maintained for Windows + sub rsp, 1*16 + vmovdqu [rsp + 0*16], xmm6 +%endif + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_init + + ;; Check context_data != NULL + or arg2, arg2 + jz error_init + + ;; Check IV != NULL + or arg3, arg3 + jz error_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + or arg4, arg4 + jz error_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +exit_init: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 1*16 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + +%ifdef SAFE_PARAM 
+error_init: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_error_init + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_init: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_init +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 / +; aes_gcm_init_var_iv_256_avx_gen4 +; aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 / +; aes_gcm_init_var_iv_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u64 iv_len, +; const u8 *aad, +; const u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(init_var_iv,_),function,) +FN_NAME(init_var_iv,_): + endbranch64 + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + lea r14, [rsp + 4*8] + ; xmm6 & xmm14 need to be maintained for Windows + sub rsp, 2*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm14 +%endif + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_init_IV + + ;; Check context_data != NULL + or arg2, arg2 + jz error_init_IV + + ;; Check IV != NULL + or arg3, arg3 + jz error_init_IV + + ;; Check iv_len != 0 + or arg4, arg4 + jz error_init_IV + + ;; Check if aad_len == 0 + cmp arg6, 0 + jz skip_aad_check_init_IV + + ;; Check aad != NULL (aad_len != 0) + cmp arg5, 0 + jz error_init_IV + +skip_aad_check_init_IV: +%endif + cmp arg4, 12 + je iv_len_12_init_IV + + GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4 + jmp skip_iv_len_12_init_IV + +iv_len_12_init_IV: + GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12 + +skip_iv_len_12_init_IV: +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +exit_init_IV: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm14, [rsp + 1*16] + add rsp, 2*16 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + +%ifdef SAFE_PARAM +error_init_IV: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV + + ;; Check iv_len != 0 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_IV_LEN + + ;; Check if aad_len == 0 + cmp arg6, 0 + jz skip_aad_check_error_init_IV + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg5, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_init_IV: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_init_IV +%endif + + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / +; aes_gcm_enc_128_update_avx_gen4 / +; aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 / +; aes_gcm_enc_256_update_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len); 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_update_),function,) +FN_NAME(enc,_update_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_update_enc + + ;; Check context_data != NULL + or arg2, arg2 + jz error_update_enc + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz error_update_enc + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_update_enc + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_update_enc + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_update_enc +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + +exit_update_enc: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_update_enc: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_update_enc + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_update_enc: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_update_enc +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / +; aes_gcm_dec_256_update_avx_gen4 / +; aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 / +; aes_gcm_dec_256_update_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_update_),function,) +FN_NAME(dec,_update_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_update_dec + + ;; Check context_data != NULL + or arg2, arg2 + jz error_update_dec + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz error_update_dec + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_update_dec + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_update_dec + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_update_dec +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + +exit_update_dec: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_update_dec: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_update_dec + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (plaintext_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + 
+skip_in_out_check_error_update_dec: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_update_dec +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / +; aes_gcm_enc_256_finalize_avx_gen4 / +; aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 / +; aes_gcm_enc_256_finalize_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_enc_fin + + ;; Check context_data != NULL + or arg2, arg2 + jz error_enc_fin + + ;; Check auth_tag != NULL + or arg3, arg3 + jz error_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + or arg4, arg4 + jz error_enc_fin + + cmp arg4, 16 + ja error_enc_fin +%endif + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 7*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm10 + vmovdqu [rsp + 3*16], xmm11 + vmovdqu [rsp + 4*16], xmm13 + vmovdqu [rsp + 5*16], xmm14 + vmovdqu [rsp + 6*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 6*16] + vmovdqu xmm14, [rsp + 5*16] + vmovdqu xmm13, [rsp + 4*16] + vmovdqu xmm11, [rsp + 3*16] + vmovdqu xmm10, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + add rsp, 7*16 +%endif + pop r12 +exit_enc_fin: + ret + +%ifdef SAFE_PARAM +error_enc_fin: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_enc_fin +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 +; aes_gcm_dec_256_finalize_avx_gen4 / +; aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 +; aes_gcm_dec_256_finalize_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_dec_fin + + ;; Check context_data != NULL + or arg2, arg2 + jz error_dec_fin + + ;; Check auth_tag != NULL + or arg3, arg3 + jz error_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + or arg4, arg4 + jz error_dec_fin + + cmp arg4, 16 + ja error_dec_fin +%endif + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub 
rsp, 7*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm10 + vmovdqu [rsp + 3*16], xmm11 + vmovdqu [rsp + 4*16], xmm13 + vmovdqu [rsp + 5*16], xmm14 + vmovdqu [rsp + 6*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 6*16] + vmovdqu xmm14, [rsp + 5*16] + vmovdqu xmm13, [rsp + 4*16] + vmovdqu xmm11, [rsp + 3*16] + vmovdqu xmm10, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + add rsp, 7*16 +%endif + + pop r12 + +exit_dec_fin: + ret + +%ifdef SAFE_PARAM +error_dec_fin: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_dec_fin +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 / +; aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_),function,) +FN_NAME(enc,_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_enc + + ;; Check context_data != NULL + or arg2, arg2 + jz error_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz error_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz error_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz error_enc + + cmp arg10, 16 + ja error_enc + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_enc + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_enc + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz error_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, single_call + +exit_enc: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_enc: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg10, 16, rax, 
IMB_ERR_AUTH_TAG_LEN + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_enc + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_error_enc + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_enc: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_enc +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 / +; aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_),function,) +FN_NAME(dec,_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_dec + + ;; Check context_data != NULL + or arg2, arg2 + jz error_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz error_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz error_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz error_dec + + cmp arg10, 16 + ja error_dec + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_dec + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_dec + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz error_dec + +skip_aad_check_dec: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, single_call + +exit_dec: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_dec: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_dec + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_error_dec + + ;; Check aad != NULL (aad_len != 0) + 
IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_dec: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_dec +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK +; +;IMB_JOB * aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 / +; aes_gcm_enc_var_iv_256_avx_gen4 / +; aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 / +; aes_gcm_enc_var_iv_256_avx512 +; (IMB_MGR *state, IMB_JOB *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) +FN_NAME(enc_var_iv,_): + endbranch64 + FUNC_SAVE alloc_context + + mov arg1, [arg2 + _enc_keys] + + cmp qword [arg2 + _iv_len_in_bytes], 12 + je iv_len_12_enc_IV + + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + r10, r11, r12, {[arg2 + _iv_len_in_bytes]} + + jmp skip_iv_len_12_enc_IV + +iv_len_12_enc_IV: + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + r10, r11, r12 + +skip_iv_len_12_enc_IV: + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, [arg2 + _dst] + mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer + mov arg2, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, ENC, single_call + + mov arg2, [rsp + GP_OFFSET + 5*8] + GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + single_call + + ;; mark job complete + mov dword [arg2 + _status], IMB_STATUS_COMPLETED + + mov rax, arg2 ;; return the job + + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK +; +;IMB_JOB *aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 / +; aes_gcm_dec_var_iv_256_avx_gen4 / +; aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 / +; aes_gcm_dec_var_iv_256_avx512 +; (IMB_MGR *state, IMB_JOB *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) +FN_NAME(dec_var_iv,_): + endbranch64 + FUNC_SAVE alloc_context + + mov arg1, [arg2 + _dec_keys] + + cmp qword [arg2 + _iv_len_in_bytes], 12 + je iv_len_12_dec_IV + + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + r10, r11, r12, {[arg2 + _iv_len_in_bytes]} + + jmp skip_iv_len_12_dec_IV + +iv_len_12_dec_IV: + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + r10, r11, r12 + +skip_iv_len_12_dec_IV: + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, [arg2 + _dst] + mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer + mov arg2, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, DEC, single_call + + mov arg2, [rsp + GP_OFFSET + 5*8] + GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + single_call + + ;; mark job complete + mov dword [arg2 + _status], IMB_STATUS_COMPLETED + + mov rax, arg2 ;; return the job + + FUNC_RESTORE + ret + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 / +; imb_aes_gmac_update_256_avx_gen4 +; imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 / +; imb_aes_gmac_update_256_avx512 ( +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; const u8 *in, +; const u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GMAC_FN_NAME(update),function,) +GMAC_FN_NAME(update): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET +%endif + ;; Check if msg_len == 0 + or arg4, arg4 + je exit_gmac_update + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + or arg1, arg1 + jz error_gmac_update + + ;; Check context_data != NULL + or arg2, arg2 + jz error_gmac_update + + ;; Check in != NULL (msg_len != 0) + or arg3, arg3 + jz error_gmac_update +%endif + + ; Increment size of "AAD length" for GMAC + add [arg2 + AadLen], arg4 + + ;; Deal with previous partial block + xor r11, r11 + vmovdqu xmm13, [arg1 + HashKey] + vmovdqu xmm0, [arg2 + AadHash] + + PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 + + ; CALC_AAD_HASH needs to deal with multiple of 16 bytes + sub arg4, r11 + add arg3, r11 + + vmovq xmm7, arg4 ; Save remaining length + and arg4, -16 ; Get multiple of 16 bytes + + or arg4, arg4 + jz no_full_blocks + + ;; Calculate GHASH of this segment + mov r12, arg3 + mov r13, arg4 + ;; arg1 = key + ;; xmm0 = hash in/out + call GHASH_FN_NAME(ghash_internal) + + vmovdqu [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash + +no_full_blocks: + add arg3, arg4 ; Point at partial block + + vmovq arg4, xmm7 ; Restore original remaining length + and arg4, 15 + jz exit_gmac_update + + ; Save next partial block + mov [arg2 + PBlockLen], arg4 +%ifdef IS_AVX2_GCM + READ_SMALL_DATA_INPUT_AVX xmm1, arg3, arg4, r11 +%else + READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 +%endif + vpshufb xmm1, xmm1, [rel SHUF_MASK] + vpxor xmm0, xmm0, xmm1 + vmovdqu [arg2 + AadHash], xmm0 + +exit_gmac_update: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_gmac_update: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_gmac_update +%endif + +mksection stack-noexec diff --git a/lib/include/gcm_avx512.inc b/lib/include/gcm_avx512.inc index 2c972596..79bd5295 100644 --- a/lib/include/gcm_avx512.inc +++ b/lib/include/gcm_avx512.inc @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. +; Copyright(c) 2018-2024, Intel Corporation All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define IS_AVX512_GCM -%include "include/gcm_common_avx2_avx512.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/include/gcm_avx_gen4.inc b/lib/include/gcm_avx_gen4.inc index 12f596c4..86983099 100644 --- a/lib/include/gcm_avx_gen4.inc +++ b/lib/include/gcm_avx_gen4.inc @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2023, Intel Corporation All rights reserved. +; Copyright(c) 2011-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define IS_AVX2_GCM -%include "include/gcm_common_avx2_avx512.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index ab51bf21..81ba3adb 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2023, Intel Corporation All rights reserved. +; Copyright(c) 2011-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -127,14 +127,14 @@ %ifndef GCM128_MODE %ifndef GCM192_MODE %ifndef GCM256_MODE -%error "No GCM key size selected for gcm_avx2_avx512.inc!" +%error "No GCM key size selected for gcm_common_avx2_avx512.inc!" %endif %endif %endif %ifndef IS_AVX2_GCM %ifndef IS_AVX512_GCM -%error "No GCM AVX2 or AVX512 selection made for gcm_avx2_avx512.inc!" +%error "No GCM AVX2 or AVX512 selection made for gcm_common_avx2_avx512.inc!" 
%endif %endif @@ -167,11 +167,8 @@ %define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX %define GHASH_FN_NAME(x) x %+ _ %+ GCM_API_POSTFIX -mksection .text -default rel - ; need to store 5 GP registers on stack (align to 16 bytes) -%define GP_STORAGE 8*6 +%define GP_STORAGE 8*8 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) %define TMP3 16*1 ; Temporary storage for AES State 3 @@ -2248,6 +2245,9 @@ align 32 mov [rsp + GP_OFFSET + 2*8], r13 mov [rsp + GP_OFFSET + 3*8], r14 mov [rsp + GP_OFFSET + 4*8], r15 + + mov [rsp + GP_OFFSET + 6*8], rbx + mov r14, rax %ifidn __OUTPUT_FORMAT__, win64 @@ -2288,9 +2288,14 @@ align 32 mov r13, [rsp + GP_OFFSET + 2*8] mov r14, [rsp + GP_OFFSET + 3*8] mov r15, [rsp + GP_OFFSET + 4*8] + + mov rbx, [rsp + GP_OFFSET + 6*8] + mov rsp, [rsp + GP_OFFSET + 0*8] %endmacro +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + %macro CALC_J0 15 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV @@ -2854,1210 +2859,30 @@ align 32 %endif %endmacro ; GCM_COMPLETE -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_precomp_128_avx_gen4 / -; aes_gcm_precomp_192_avx_gen4 / -; aes_gcm_precomp_256_avx_gen4 / -; aes_gcm_precomp_128_avx512 / -; aes_gcm_precomp_192_avx512 / -; aes_gcm_precomp_256_avx512 -; (struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(FN_NAME(precomp,_),function,) -FN_NAME(precomp,_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_precomp -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - sub rsp, 1*16 - ; only xmm6 needs to be maintained - vmovdqu [rsp + 0*16],xmm6 -%endif - - vpxor xmm6, xmm6 - ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey - - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - add rsp, 1*16 -%endif - -exit_precomp: - - ret - -%ifdef SAFE_PARAM -error_precomp: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_precomp -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 / -; aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *iv, -; const u8 *aad, -; u64 aad_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(FN_NAME(init,_),function,) -FN_NAME(init,_): 
- endbranch64 - push r12 - push r13 -%ifidn __OUTPUT_FORMAT__, win64 - push r14 - push r15 - lea r14, [rsp + 4*8] - ; xmm6 needs to be maintained for Windows - sub rsp, 1*16 - vmovdqu [rsp + 0*16], xmm6 -%endif - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_init - - ;; Check context_data != NULL - or arg2, arg2 - jz error_init - - ;; Check IV != NULL - or arg3, arg3 - jz error_init - - ;; Check if aad_len == 0 - cmp arg5, 0 - jz skip_aad_check_init - - ;; Check aad != NULL (aad_len != 0) - or arg4, arg4 - jz error_init - -skip_aad_check_init: -%endif - GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -exit_init: - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6 , [rsp + 0*16] - add rsp, 1*16 - pop r15 - pop r14 -%endif - pop r13 - pop r12 - ret - -%ifdef SAFE_PARAM -error_init: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV - - ;; Check if aad_len == 0 - cmp arg5, 0 - jz skip_aad_check_error_init - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_init: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_init -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 / -; aes_gcm_init_var_iv_256_avx_gen4 -; aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 / -; aes_gcm_init_var_iv_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *iv, -; const u64 iv_len, -; const u8 *aad, -; const u64 aad_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(init_var_iv,_),function,) -FN_NAME(init_var_iv,_): - endbranch64 - push r12 - push r13 -%ifidn __OUTPUT_FORMAT__, win64 - push r14 - push r15 - lea r14, [rsp + 4*8] - ; xmm6 & xmm14 need to be maintained for Windows - sub rsp, 2*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm14 -%endif - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_init_IV - - ;; Check context_data != NULL - or arg2, arg2 - jz error_init_IV - - ;; Check IV != NULL - or arg3, arg3 - jz error_init_IV - - ;; Check iv_len != 0 - or arg4, arg4 - jz error_init_IV - - ;; Check if aad_len == 0 - cmp arg6, 0 - jz skip_aad_check_init_IV - - ;; Check aad != NULL (aad_len != 0) - cmp arg5, 0 - jz error_init_IV - -skip_aad_check_init_IV: -%endif - cmp arg4, 12 - je iv_len_12_init_IV - - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4 - jmp skip_iv_len_12_init_IV - -iv_len_12_init_IV: - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12 - -skip_iv_len_12_init_IV: -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -exit_init_IV: - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - vmovdqu xmm14, [rsp + 1*16] - add rsp, 2*16 - pop r15 - pop r14 -%endif - pop r13 - pop r12 - ret - -%ifdef SAFE_PARAM -error_init_IV: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL 
- IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV - - ;; Check iv_len != 0 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_IV_LEN - - ;; Check if aad_len == 0 - cmp arg6, 0 - jz skip_aad_check_error_init_IV - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg5, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_init_IV: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_init_IV -%endif - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / -; aes_gcm_enc_128_update_avx_gen4 / -; aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 / -; aes_gcm_enc_256_update_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_update_),function,) -FN_NAME(enc,_update_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_update_enc - - ;; Check context_data != NULL - or arg2, arg2 - jz error_update_enc - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz error_update_enc - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_update_enc - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_update_enc - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_update_enc -%endif - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call - -exit_update_enc: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_update_enc: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check if plaintext_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_update_enc - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_update_enc: - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_update_enc -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / -; aes_gcm_dec_256_update_avx_gen4 / -; aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 / -; aes_gcm_dec_256_update_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_update_),function,) -FN_NAME(dec,_update_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_update_dec - - ;; Check context_data != NULL - or arg2, arg2 - jz error_update_dec - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz error_update_dec - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH 
- ja error_update_dec - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_update_dec - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_update_dec -%endif - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call - -exit_update_dec: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_update_dec: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check if plaintext_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_update_dec - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (plaintext_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_update_dec: - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_update_dec -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / -; aes_gcm_enc_256_finalize_avx_gen4 / -; aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 / -; aes_gcm_enc_256_finalize_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_finalize_),function,) -FN_NAME(enc,_finalize_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_enc_fin - - ;; Check context_data != NULL - or arg2, arg2 - jz error_enc_fin - - ;; Check auth_tag != NULL - or arg3, arg3 - jz error_enc_fin - - ;; Check auth_tag_len == 0 or > 16 - or arg4, arg4 - jz error_enc_fin - - cmp arg4, 16 - ja error_enc_fin -%endif - push r12 - -%ifidn __OUTPUT_FORMAT__, win64 - ; xmm6:xmm15 need to be maintained for Windows - sub rsp, 7*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm9 - vmovdqu [rsp + 2*16], xmm10 - vmovdqu [rsp + 3*16], xmm11 - vmovdqu [rsp + 4*16], xmm13 - vmovdqu [rsp + 5*16], xmm14 - vmovdqu [rsp + 6*16], xmm15 -%endif - GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm15, [rsp + 6*16] - vmovdqu xmm14, [rsp + 5*16] - vmovdqu xmm13, [rsp + 4*16] - vmovdqu xmm11, [rsp + 3*16] - vmovdqu xmm10, [rsp + 2*16] - vmovdqu xmm9, [rsp + 1*16] - vmovdqu xmm6, [rsp + 0*16] - add rsp, 7*16 -%endif - pop r12 -exit_enc_fin: - ret - -%ifdef SAFE_PARAM -error_enc_fin: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_enc_fin -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 -; aes_gcm_dec_256_finalize_avx_gen4 
/ -; aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 -; aes_gcm_dec_256_finalize_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_finalize_),function,) -FN_NAME(dec,_finalize_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_dec_fin - - ;; Check context_data != NULL - or arg2, arg2 - jz error_dec_fin - - ;; Check auth_tag != NULL - or arg3, arg3 - jz error_dec_fin - - ;; Check auth_tag_len == 0 or > 16 - or arg4, arg4 - jz error_dec_fin - - cmp arg4, 16 - ja error_dec_fin -%endif - - push r12 - -%ifidn __OUTPUT_FORMAT__, win64 - ; xmm6:xmm15 need to be maintained for Windows - sub rsp, 7*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm9 - vmovdqu [rsp + 2*16], xmm10 - vmovdqu [rsp + 3*16], xmm11 - vmovdqu [rsp + 4*16], xmm13 - vmovdqu [rsp + 5*16], xmm14 - vmovdqu [rsp + 6*16], xmm15 -%endif - GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm15, [rsp + 6*16] - vmovdqu xmm14, [rsp + 5*16] - vmovdqu xmm13, [rsp + 4*16] - vmovdqu xmm11, [rsp + 3*16] - vmovdqu xmm10, [rsp + 2*16] - vmovdqu xmm9, [rsp + 1*16] - vmovdqu xmm6, [rsp + 0*16] - add rsp, 7*16 -%endif - - pop r12 - -exit_dec_fin: - ret - -%ifdef SAFE_PARAM -error_dec_fin: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_dec_fin -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 / -; aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len, -; u8 *iv, -; const u8 *aad, -; u64 aad_len, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_),function,) -FN_NAME(enc,_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_enc - - ;; Check context_data != NULL - or arg2, arg2 - jz error_enc - - ;; Check IV != NULL - cmp arg6, 0 - jz error_enc - - ;; Check auth_tag != NULL - cmp arg9, 0 - jz error_enc - - ;; Check auth_tag_len == 0 or > 16 - cmp arg10, 0 - jz error_enc - - cmp arg10, 16 - ja error_enc - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_enc - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_enc - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_enc - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_enc - -skip_in_out_check_enc: - ;; 
Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_enc - - ;; Check aad != NULL (aad_len != 0) - cmp arg7, 0 - jz error_enc - -skip_aad_check_enc: -%endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call - - GCM_COMPLETE arg1, arg2, arg9, arg10, single_call - -exit_enc: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_enc: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_enc - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_enc: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_error_enc - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_enc: - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_enc -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 / -; aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len, -; u8 *iv, -; const u8 *aad, -; u64 aad_len, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_),function,) -FN_NAME(dec,_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_dec - - ;; Check context_data != NULL - or arg2, arg2 - jz error_dec - - ;; Check IV != NULL - cmp arg6, 0 - jz error_dec - - ;; Check auth_tag != NULL - cmp arg9, 0 - jz error_dec - - ;; Check auth_tag_len == 0 or > 16 - cmp arg10, 0 - jz error_dec - - cmp arg10, 16 - ja error_dec - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_dec - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_dec - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_dec - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_dec - -skip_in_out_check_dec: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_dec - - ;; Check aad != NULL (aad_len != 0) - cmp arg7, 0 - jz error_dec - -skip_aad_check_dec: -%endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call - - GCM_COMPLETE arg1, arg2, arg9, arg10, single_call - -exit_dec: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_dec: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, 
IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_dec - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_dec: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_error_dec - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_dec: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_dec -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK -; -;IMB_JOB * aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 / -; aes_gcm_enc_var_iv_256_avx_gen4 / -; aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 / -; aes_gcm_enc_var_iv_256_avx512 -; (IMB_MGR *state, IMB_JOB *job) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) -FN_NAME(enc_var_iv,_): - endbranch64 - FUNC_SAVE alloc_context - - mov arg1, [arg2 + _enc_keys] - - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_enc_IV - - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} - - jmp skip_iv_len_12_enc_IV - -iv_len_12_enc_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 - -skip_iv_len_12_enc_IV: - mov arg3, [arg2 + _src] - add arg3, [arg2 + _cipher_start_src_offset] - mov arg4, [arg2 + _dst] - mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer - mov arg2, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, ENC, single_call - - mov arg2, [rsp + GP_OFFSET + 5*8] - GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ - {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ - single_call - - ;; mark job complete - mov dword [arg2 + _status], IMB_STATUS_COMPLETED - - mov rax, arg2 ;; return the job - - FUNC_RESTORE - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK -; -;IMB_JOB *aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 / -; aes_gcm_dec_var_iv_256_avx_gen4 / -; aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 / -; aes_gcm_dec_var_iv_256_avx512 -; (IMB_MGR *state, IMB_JOB *job) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) -FN_NAME(dec_var_iv,_): - endbranch64 - FUNC_SAVE alloc_context - - mov arg1, [arg2 + _dec_keys] - - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_dec_IV - - GCM_INIT arg1, {rsp + 
CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} - - jmp skip_iv_len_12_dec_IV - -iv_len_12_dec_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 - -skip_iv_len_12_dec_IV: - mov arg3, [arg2 + _src] - add arg3, [arg2 + _cipher_start_src_offset] - mov arg4, [arg2 + _dst] - mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer - mov arg2, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, DEC, single_call - - mov arg2, [rsp + GP_OFFSET + 5*8] - GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ - {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ - single_call - - ;; mark job complete - mov dword [arg2 + _status], IMB_STATUS_COMPLETED - - mov rax, arg2 ;; return the job - - FUNC_RESTORE - ret - -%ifdef GCM128_MODE -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_pre_avx_gen4 / ghash_pre_avx512 -; (const void *key, struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash_pre),function,) -GHASH_FN_NAME(ghash_pre): - endbranch64 -;; Parameter is passed through register -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key != NULL - cmp arg1, 0 - jz error_ghash_pre - - ;; Check key_data != NULL - cmp arg2, 0 - jz error_ghash_pre -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - sub rsp, 1*16 - - ; only xmm6 needs to be maintained - vmovdqu [rsp + 0*16], xmm6 -%endif - vmovdqu xmm6, [arg1] - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - add rsp, 1*16 -%endif -exit_ghash_pre: - ret - -%ifdef SAFE_PARAM -error_ghash_pre: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_ghash_pre -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_avx_gen4 / ghash_avx512 ( -; const struct gcm_key_data *key_data, -; const void *in, -; const u64 in_len, -; void *io_tag, -; const u64 tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash),function,) -GHASH_FN_NAME(ghash): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_ghash - - ;; Check in != NULL - or arg2, arg2 - jz error_ghash - - ;; Check in_len != 0 - or arg3, arg3 - jz error_ghash - - ;; Check tag != NULL - or arg4, arg4 - jz error_ghash 
- - ;; Check tag_len != 0 - cmp arg5, 0 - jz error_ghash -%endif - - ;; copy tag to xmm0 - vmovdqu xmm0, [arg4] - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - -%ifdef IS_AVX2_GCM - CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12, r13, rax -%else - CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12 -%endif - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - - simd_store_avx arg4, xmm0, arg5, r12, rax - -exit_ghash: - FUNC_RESTORE - ret - -%ifdef SAFE_PARAM -error_ghash: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check in != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC - - ;; Check in_len != 0 - IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN - - ;; Check tag != NULL - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH - - ;; Check tag_len != 0 - IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_ghash -%endif - -%endif ;; GCM128_MODE - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK_GMAC: Handles the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. ; Input: gcm_context_data (GDATA_CTX), input text (PLAIN_IN), hash subkey (HASH_SUBKEY) ; input text length (PLAIN_LEN). ; Output: Updated GDATA_CTX -; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11 +; Clobbers rax, r10, r12, r13, r15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro PARTIAL_BLOCK_GMAC 6 -%define %%GDATA_CTX %1 -%define %%PLAIN_IN %2 -%define %%PLAIN_LEN %3 -%define %%DATA_OFFSET %4 -%define %%AAD_HASH %5 -%define %%HASH_SUBKEY %6 +%macro PARTIAL_BLOCK_GMAC 15 +%define %%GDATA_CTX %1 ;; [in/out] GPR pointer to GCM context +%define %%PLAIN_IN %2 ;; [in] GPR pointer to plain/cipher text +%define %%PLAIN_LEN %3 ;; [in] text length in bytes, GPR or memory location (win64) +%define %%DATA_OFFSET %4 ;; [out] GPR data offset +%define %%AAD_HASH %5 ;; [in/out] xmm with hash value +%define %%HASH_SUBKEY %6 ;; [in] hash key +%define %%XMM0 %7 ;; [clobbered] xmm register +%define %%XMM1 %8 ;; [clobbered] xmm register +%define %%XMM2 %9 ;; [clobbered] xmm register +%define %%XMM3 %10 ;; [clobbered] xmm register +%define %%XMM5 %11 ;; [clobbered] xmm register +%define %%XMM6 %12 ;; [clobbered] xmm register +%define %%XMM9 %13 ;; [clobbered] xmm register +%define %%XMM10 %14 ;; [clobbered] xmm register +%define %%XMM11 %15 ;; [clobbered] xmm register mov r13, [%%GDATA_CTX + PBlockLen] or r13, r13 @@ -4069,15 +2894,15 @@ error_ghash: cmp %%PLAIN_LEN, 16 jl %%_fewer_than_16_bytes ; If more than 16 bytes of data, just fill the xmm register - VXLDR xmm1, [%%PLAIN_IN] + VXLDR %%XMM1, [%%PLAIN_IN] jmp %%_data_read %%_fewer_than_16_bytes: lea r10, [%%PLAIN_IN] - READ_SMALL_DATA_INPUT_AVX xmm1, r10, %%PLAIN_LEN, rax + READ_SMALL_DATA_INPUT_AVX %%XMM1, r10, %%PLAIN_LEN, rax %else ; Read in input data without over reading - READ_SMALL_DATA_INPUT_LEN_BT16_AVX512 xmm1, %%PLAIN_IN, %%PLAIN_LEN, r12, rax, k1 + READ_SMALL_DATA_INPUT_LEN_BT16_AVX512 %%XMM1, %%PLAIN_IN, %%PLAIN_LEN, r12, rax, k1 %endif ; Finished reading in data %%_data_read: @@ -4087,8 +2912,8 @@ error_ghash: ; (16-r13 is the number of bytes in plaintext mod 16) add r12, r13 ; Get the appropriate shuffle mask - vmovdqu xmm2, [r12] - vmovdqa 
xmm3, xmm1 + vmovdqu %%XMM2, [r12] + vmovdqa %%XMM3, %%XMM1 mov r15, %%PLAIN_LEN add r15, r13 @@ -4099,19 +2924,19 @@ error_ghash: sub r12, r15 %%_no_extra_mask_1: - ; Get the appropriate mask to mask out bottom r13 bytes of xmm3 - vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] + ; Get the appropriate mask to mask out bottom r13 bytes of %%XMM3 + vmovdqu %%XMM1, [r12 + ALL_F-SHIFT_MASK] - vpand xmm3, xmm1 - vpshufb xmm3, [rel SHUF_MASK] - vpshufb xmm3, xmm2 - vpxor %%AAD_HASH, xmm3 + vpand %%XMM3, %%XMM3, %%XMM1 + vpshufb %%XMM3, %%XMM3, [rel SHUF_MASK] + vpshufb %%XMM3, %%XMM3, %%XMM2 + vpxor %%AAD_HASH, %%AAD_HASH, %%XMM3 - cmp r15,0 + cmp r15, 0 jl %%_partial_incomplete_1 ; GHASH computation for the last <16 Byte block - GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6 + GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, %%XMM0, %%XMM10, %%XMM11, %%XMM5, %%XMM6 xor rax, rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_ghash_done @@ -4139,115 +2964,3 @@ error_ghash: mov %%DATA_OFFSET, r12 %%_partial_block_done: %endmacro ; PARTIAL_BLOCK_GMAC - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 / -; imb_aes_gmac_update_256_avx_gen4 -; imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 / -; imb_aes_gmac_update_256_avx512 ( -; const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; const u8 *in, -; const u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GMAC_FN_NAME(update),function,) -GMAC_FN_NAME(update): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET -%endif - ;; Check if msg_len == 0 - or arg4, arg4 - je exit_gmac_update - -%ifdef SAFE_PARAM - ;; Check key_data != NULL - or arg1, arg1 - jz error_gmac_update - - ;; Check context_data != NULL - or arg2, arg2 - jz error_gmac_update - - ;; Check in != NULL (msg_len != 0) - or arg3, arg3 - jz error_gmac_update -%endif - - ; Increment size of "AAD length" for GMAC - add [arg2 + AadLen], arg4 - - ;; Deal with previous partial block - xor r11, r11 - vmovdqu xmm13, [arg1 + HashKey] - vmovdqu xmm8, [arg2 + AadHash] - - PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm8, xmm13 - - ; CALC_AAD_HASH needs to deal with multiple of 16 bytes - sub arg4, r11 - add arg3, r11 - - vmovq xmm7, arg4 ; Save remaining length - and arg4, -16 ; Get multiple of 16 bytes - - or arg4, arg4 - jz no_full_blocks - - ;; Calculate GHASH of this segment -%ifdef IS_AVX2_GCM - CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12, r13, rax -%else - CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12 -%endif - vmovdqu [arg2 + AadHash], xmm8 ; ctx_data.aad hash = aad_hash - -no_full_blocks: - add arg3, arg4 ; Point at partial block - - vmovq arg4, xmm7 ; Restore original remaining length - and arg4, 15 - jz exit_gmac_update - - ; Save next partial block - mov [arg2 + PBlockLen], arg4 -%ifdef IS_AVX2_GCM - READ_SMALL_DATA_INPUT_AVX xmm1, arg3, arg4, r11 -%else - READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 -%endif - vpshufb xmm1, [rel SHUF_MASK] - vpxor xmm8, xmm1 - vmovdqu [arg2 + AadHash], xmm8 - -exit_gmac_update: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_gmac_update: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, 
IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_gmac_update -%endif - -mksection stack-noexec diff --git a/lib/include/ghash_common_avx2_avx512.inc b/lib/include/ghash_common_avx2_avx512.inc new file mode 100644 index 00000000..a0b96891 --- /dev/null +++ b/lib/include/ghash_common_avx2_avx512.inc @@ -0,0 +1,212 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%use smartalign + +%include "include/gcm_common_avx2_avx512.inc" + +mksection .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_pre_avx_gen4 / ghash_pre_avx512 +; (const void *key, struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GHASH_FN_NAME(ghash_pre),function,) +GHASH_FN_NAME(ghash_pre): + endbranch64 +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key != NULL + cmp arg1, 0 + jz error_ghash_pre + + ;; Check key_data != NULL + cmp arg2, 0 + jz error_ghash_pre +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + sub rsp, 1*16 + + ; only xmm6 needs to be maintained + vmovdqu [rsp + 0*16], xmm6 +%endif + vmovdqu xmm6, [arg1] + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 +%endif +exit_ghash_pre: + ret + +%ifdef SAFE_PARAM +error_ghash_pre: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_ghash_pre +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ghash_internal_avx_gen4 / ghash_internal_avx512 +;; [in] r12 = A_IN +;; [in] r13 = A_LEN +;; [in] arg1 = GDATA_KEY +;; [in/out] xmm0 = hash in/out +;; [clobbered] xmm1-xmm6 +;; [clobbered] r10, r11, rax, [r15, rbx] +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GHASH_FN_NAME(ghash_internal),function,internal) +GHASH_FN_NAME(ghash_internal): +%ifdef IS_AVX2_GCM + CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ + r10, r11, rax, r15, rbx +%else + CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ + r10, r11, rax +%endif + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_avx_gen4 / ghash_avx512 ( +; const struct gcm_key_data *key_data, +; const void *in, +; const u64 in_len, +; void *io_tag, +; const u64 tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GHASH_FN_NAME(ghash),function,) +GHASH_FN_NAME(ghash): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_ghash + + ;; Check in != NULL + or arg2, arg2 + jz error_ghash + + ;; Check in_len != 0 + or arg3, arg3 + jz error_ghash + + ;; Check tag != NULL + or arg4, arg4 + jz error_ghash + + 
;; Check tag_len != 0 + cmp arg5, 0 + jz error_ghash +%endif + + ;; copy tag to xmm0 + vmovdqu xmm0, [arg4] + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + mov r12, arg2 + mov r13, arg3 + call GHASH_FN_NAME(ghash_internal) + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + simd_store_avx arg4, xmm0, arg5, r12, rax + +exit_ghash: + FUNC_RESTORE + ret + +%ifdef SAFE_PARAM +error_ghash: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check in != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC + + ;; Check in_len != 0 + IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN + + ;; Check tag != NULL + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH + + ;; Check tag_len != 0 + IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_ghash +%endif + +mksection stack-noexec diff --git a/lib/win_x64.mak b/lib/win_x64.mak index 71d33702..ebf69590 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -559,6 +559,8 @@ no_aesni_objs = \ gcm_objs = \ $(OBJ_DIR)\gcm.obj \ + $(OBJ_DIR)\ghash_by8_avx2.obj \ + $(OBJ_DIR)\ghash_by8_avx512.obj \ $(OBJ_DIR)\aes128_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes128_gcm_vaes_avx2.obj \ $(OBJ_DIR)\aes128_gcm_by8_avx512.obj \ -- GitLab From 98b9b383dc485822c0a45c92f6293b32c2dcebf8 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 9 Jan 2024 14:34:32 +0000 Subject: [PATCH 16/30] avx2/avx512: [gcm] reduce GPR usage in CALC_AAD_HASH and CALC_J0 macros - add new CALC_GHASH macro that requires two GPR's less that CALC_AAD_HASH - remove RBX save/restore in function entry and exit --- lib/include/gcm_common_avx2_avx512.inc | 155 +++++++++++++---------- lib/include/ghash_common_avx2_avx512.inc | 5 - 2 files changed, 89 insertions(+), 71 deletions(-) diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index 81ba3adb..2c2b3cfd 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -168,7 +168,8 @@ %define GHASH_FN_NAME(x) x %+ _ %+ GCM_API_POSTFIX ; need to store 5 GP registers on stack (align to 16 bytes) -%define GP_STORAGE 8*8 +; @note: the last, 8-byte slot is used in JOB API to save/restore a register +%define GP_STORAGE 8*6 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) %define TMP3 16*1 ; Temporary storage for AES State 3 @@ -355,13 +356,13 @@ %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. -; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; CALC_GHASH: Calculates the hash of selected data +; Input: The input data (A_IN), that data's length (A_LEN), input hash value (AAD_HASH) ; Output: The hash of the data (AAD_HASH). 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_AAD_HASH 13-15 -%define %%A_IN %1 ;; [in] message pointer -%define %%A_LEN %2 ;; [in] message length +%macro CALC_GHASH 11-12 +%define %%A_IN %1 ;; [in/clobbered] message pointer +%define %%A_LEN %2 ;; [in/clobbered] message length %define %%AAD_HASH %3 ;; [in] input hash value (XMM) %define %%GDATA_KEY %4 ;; [in] pointer to GCM key data %define %%XTMP0 %5 ;; [clobbered] temporary XMM @@ -370,35 +371,33 @@ %define %%XTMP3 %8 ;; [clobbered] temporary XMM %define %%XTMP4 %9 ;; [clobbered] temporary XMM %define %%XTMP5 %10 ;; [clobbered] temporary XMM -%define %%T1 %11 ;; [clobbered] temporary GP register -%define %%T2 %12 ;; [clobbered] temporary GP register -%define %%T3 %13 ;; [clobbered] temporary GP register -%define %%T4 %14 ;; [clobbered] temporary GP register (obsolete with avx512) -%define %%T5 %15 ;; [clobbered] temporary GP register (obsolete with avx512) +%define %%T3 %11 ;; [clobbered] temporary GP register +%define %%MASKREG %12 ;; [clobbered] mask register %ifdef IS_AVX2_GCM -%if %0 != 15 -%error "AVX2 CALC_AAD_HASH needs 15 arguments!" +%if %0 != 11 +%error "AVX2 CALC_GHASH needs 11 arguments!" %endif %endif %ifdef IS_AVX512_GCM -%if %0 != 13 -%error "AVX512 CALC_AAD_HASH needs 13 arguments!" +%if %0 != 12 +%error "AVX512 CALC_GHASH needs 12 arguments!" %endif %endif - mov %%T1, %%A_IN ; T1 = AAD - mov %%T2, %%A_LEN ; T2 = aadLen + cmp %%A_LEN, 16 + jb %%_get_small_AAD_block + align 32 %%_get_AAD_loop128: - cmp %%T2, 128 - jl %%_exit_AAD_loop128 + cmp %%A_LEN, 128 + jb %%_exit_AAD_loop128 - vmovdqu %%XTMP0, [%%T1 + 16*0] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN + 16*0] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] - vpxor %%XTMP0, %%AAD_HASH + vpxor %%XTMP0, %%XTMP0, %%AAD_HASH vmovdqa %%XTMP5, [%%GDATA_KEY + HashKeyK_8] vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -412,8 +411,8 @@ align 32 %assign i 1 %assign j 7 %rep 7 - vmovdqu %%XTMP0, [%%T1 + 16*i] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN + 16*i] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] vmovdqa %%XTMP5, [%%GDATA_KEY + HashKeyK_ %+ j] vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -437,25 +436,25 @@ align 32 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP2 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP3 ; the result is in %%AAD_HASH - sub %%T2, 128 + sub %%A_LEN, 128 je %%_CALC_AAD_done - add %%T1, 128 + add %%A_IN, 128 jmp %%_get_AAD_loop128 %%_exit_AAD_loop128: - cmp %%T2, 16 + cmp %%A_LEN, 16 jb %%_get_small_AAD_block ;; calculate hash_key position to start with - mov %%T3, %%T2 + mov %%T3, %%A_LEN and %%T3, -16 ; 1 to 7 blocks possible here neg %%T3 add %%T3, HashKey_1 + 16 lea %%T3, [%%GDATA_KEY + %%T3] - vmovdqu %%XTMP0, [%%T1] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] vpxor %%XTMP0, %%XTMP0, %%AAD_HASH @@ -469,15 +468,15 @@ align 32 vpxor %%XTMP2, %%XTMP2, %%XTMP4 ; XTMP2 = XTMP2 + XTMP4 add %%T3, 16 ; move to next hashkey - add %%T1, 16 ; move to next data block - sub %%T2, 16 - cmp %%T2, 16 + add %%A_IN, 16 ; move to next data block + sub %%A_LEN, 16 + cmp %%A_LEN, 16 jb %%_AAD_reduce align 32 %%_AAD_blocks: - vmovdqu %%XTMP0, [%%T1] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] vmovdqa %%XTMP5, [%%T3 + HKeyGap] vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -491,9 +490,9 @@ align 32 vpxor %%XTMP2, 
%%XTMP2, %%XTMP4 add %%T3, 16 ; move to next hashkey - add %%T1, 16 - sub %%T2, 16 - cmp %%T2, 16 + add %%A_IN, 16 + sub %%A_LEN, 16 + cmp %%A_LEN, 16 jae %%_AAD_blocks %%_AAD_reduce: @@ -505,15 +504,15 @@ align 32 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP3 ; the result is in %%AAD_HASH %%_get_small_AAD_block: - or %%T2, %%T2 + or %%A_LEN, %%A_LEN je %%_CALC_AAD_done vmovdqa %%XTMP0, [%%GDATA_KEY + HashKey_1] vmovdqa %%XTMP1, [%%GDATA_KEY + HashKeyK_1] %ifdef IS_AVX2_GCM - READ_SMALL_DATA_INPUT_AVX %%XTMP2, %%T1, %%T2, %%T3 + READ_SMALL_DATA_INPUT_AVX %%XTMP2, %%A_IN, %%A_LEN, %%T3 %else - READ_SMALL_DATA_INPUT_AVX512 %%XTMP2, %%T1, %%T2, %%T3, k1 + READ_SMALL_DATA_INPUT_AVX512 %%XTMP2, %%A_IN, %%A_LEN, %%T3, %%MASKREG %endif ;byte-reflect the AAD data vpshufb %%XTMP2, %%XTMP2, [rel SHUF_MASK] @@ -524,6 +523,44 @@ align 32 %endmacro ; CALC_AAD_HASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of selected data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and input hash (AAD_HASH) +; Output: The hash of the data (AAD_HASH). +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 13 +%define %%A_IN %1 ;; [in] message pointer +%define %%A_LEN %2 ;; [in] message length +%define %%AAD_HASH %3 ;; [in] input hash value (XMM) +%define %%GDATA_KEY %4 ;; [in] pointer to GCM key data +%define %%XTMP0 %5 ;; [clobbered] temporary XMM +%define %%XTMP1 %6 ;; [clobbered] temporary XMM +%define %%XTMP2 %7 ;; [clobbered] temporary XMM +%define %%XTMP3 %8 ;; [clobbered] temporary XMM +%define %%XTMP4 %9 ;; [clobbered] temporary XMM +%define %%XTMP5 %10 ;; [clobbered] temporary XMM +%define %%T1 %11 ;; [clobbered] temporary GP register +%define %%T2 %12 ;; [clobbered] temporary GP register +%define %%T3 %13 ;; [clobbered] temporary GP register + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + +%ifdef IS_AVX2_GCM + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5, \ + %%T3 +%endif + +%ifdef IS_AVX512_GCM + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5, \ + %%T3, k1 +%endif + +%endmacro ; CALC_AAD_HASH + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK: Handles encryption/decryption and the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. 
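;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;; Usage sketch for the CALC_GHASH / CALC_AAD_HASH split above (illustrative only):
;; CALC_AAD_HASH copies its pointer/length operands into scratch GPRs and delegates
;; to CALC_GHASH, which consumes them in place. The register choices here (r12/r13
;; scratch copies, r10 temporary, xmm14 accumulator, arg1 = gcm_key_data pointer)
;; are assumptions for the sketch; k1 is only consumed on the AVX512 build.
;;
;;         mov     r12, rsi                    ; A_IN: data pointer (assumed in rsi)
;;         mov     r13, rdx                    ; A_LEN: length in bytes (assumed in rdx)
;;         vpxor   xmm14, xmm14, xmm14         ; start from an all-zero hash value
;; %ifdef IS_AVX2_GCM
;;         CALC_GHASH r12, r13, xmm14, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10
;; %else
;;         CALC_GHASH r12, r13, xmm14, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, r10, k1
;; %endif
;;         ;; xmm14 = GHASH(data); r12/r13 (A_IN/A_LEN) are clobbered on return
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;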
@@ -2246,8 +2283,6 @@ align 32 mov [rsp + GP_OFFSET + 3*8], r14 mov [rsp + GP_OFFSET + 4*8], r15 - mov [rsp + GP_OFFSET + 6*8], rbx - mov r14, rax %ifidn __OUTPUT_FORMAT__, win64 @@ -2288,15 +2323,12 @@ align 32 mov r13, [rsp + GP_OFFSET + 2*8] mov r14, [rsp + GP_OFFSET + 3*8] mov r15, [rsp + GP_OFFSET + 4*8] - - mov rbx, [rsp + GP_OFFSET + 6*8] - mov rsp, [rsp + GP_OFFSET + 0*8] %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_J0 15 +%macro CALC_J0 13 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV %define %%IV_LEN %3 ;; [in] IV length @@ -2304,27 +2336,20 @@ align 32 %define %%TMP0 %5 ;; [clobbered] Temporary GP reg %define %%TMP1 %6 ;; [clobbered] Temporary GP reg %define %%TMP2 %7 ;; [clobbered] Temporary GP reg -%define %%TMP3 %8 ;; [clobbered] Temporary GP reg (unused with AVX512) -%define %%TMP4 %9 ;; [clobbered] Temporary GP reg (unused with AVX512) -%define %%XTMP0 %10 ;; [clobbered] Temporary XMM reg -%define %%XTMP1 %11 ;; [clobbered] Temporary XMM reg -%define %%XTMP2 %12 ;; [clobbered] Temporary XMM reg -%define %%XTMP3 %13 ;; [clobbered] Temporary XMM reg -%define %%XTMP4 %14 ;; [clobbered] Temporary XMM reg -%define %%XTMP5 %15 ;; [clobbered] Temporary XMM reg +%define %%XTMP0 %8 ;; [clobbered] Temporary XMM reg +%define %%XTMP1 %9 ;; [clobbered] Temporary XMM reg +%define %%XTMP2 %10 ;; [clobbered] Temporary XMM reg +%define %%XTMP3 %11 ;; [clobbered] Temporary XMM reg +%define %%XTMP4 %12 ;; [clobbered] Temporary XMM reg +%define %%XTMP5 %13 ;; [clobbered] Temporary XMM reg ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ;; Calculate GHASH of (IV || 0s) vpxor %%J0, %%J0, %%J0 -%ifdef IS_AVX2_GCM - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ - %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2, %%TMP3, %%TMP4 -%else CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2 -%endif ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) vmovdqu %%XTMP0, [%%KEY + HashKey_1] @@ -2359,11 +2384,9 @@ align 32 %define %%AAD_HASH xmm14 vpxor %%AAD_HASH, %%AAD_HASH -%ifdef IS_AVX2_GCM - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3, r13, rax -%else - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 -%endif + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 + mov %%GPR1, %%A_LEN vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length @@ -2373,7 +2396,7 @@ align 32 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0 %if %0 == 9 ;; IV is different than 12 bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, r13, rax, xmm0, xmm1, \ + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, xmm0, xmm1, \ xmm3, xmm4, xmm5, xmm6 %else ;; IV is 12 bytes ;; read 12 IV bytes and pad with 0x00000001 diff --git a/lib/include/ghash_common_avx2_avx512.inc b/lib/include/ghash_common_avx2_avx512.inc index a0b96891..8c354a15 100644 --- a/lib/include/ghash_common_avx2_avx512.inc +++ b/lib/include/ghash_common_avx2_avx512.inc @@ -120,13 +120,8 @@ error_ghash_pre: align 32 MKGLOBAL(GHASH_FN_NAME(ghash_internal),function,internal) 
GHASH_FN_NAME(ghash_internal): -%ifdef IS_AVX2_GCM - CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, rax, r15, rbx -%else CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ r10, r11, rax -%endif ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -- GitLab From d5249248085f2158490c069eeb7a04a9767db1ba Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 9 Jan 2024 16:03:56 +0000 Subject: [PATCH 17/30] avx2/avx512: [gcm] change CALC_J0 macro to call ghash_internal function for GHASH calculation instead of expanding CALC_AAD_HASH --- lib/include/gcm_api_avx2_avx512.inc | 20 +++---- lib/include/gcm_common_avx2_avx512.inc | 71 ++++++++++++------------ lib/include/ghash_common_avx2_avx512.inc | 2 +- 3 files changed, 47 insertions(+), 46 deletions(-) diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc index 155092b8..16412c39 100644 --- a/lib/include/gcm_api_avx2_avx512.inc +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -163,7 +163,7 @@ FN_NAME(init,_): skip_aad_check_init: %endif - GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12 + GCM_INIT arg1, arg2, arg3, arg4, arg5 %ifdef SAFE_DATA clear_scratch_xmms_avx_asm @@ -269,11 +269,11 @@ skip_aad_check_init_IV: cmp arg4, 12 je iv_len_12_init_IV - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4 + GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4 jmp skip_iv_len_12_init_IV iv_len_12_init_IV: - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12 + GCM_INIT arg1, arg2, arg3, arg5, arg6 skip_iv_len_12_init_IV: %ifdef SAFE_DATA @@ -762,7 +762,7 @@ skip_in_out_check_enc: skip_aad_check_enc: %endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 + GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call @@ -899,7 +899,7 @@ skip_in_out_check_dec: skip_aad_check_dec: %endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 + GCM_INIT arg1, arg2, arg6, arg7, arg8 GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call @@ -983,14 +983,13 @@ FN_NAME(enc_var_iv,_): GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} + {[arg2 + _iv_len_in_bytes]} jmp skip_iv_len_12_enc_IV iv_len_12_enc_IV: GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]} skip_iv_len_12_enc_IV: mov arg3, [arg2 + _src] @@ -1036,14 +1035,13 @@ FN_NAME(dec_var_iv,_): GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} + {[arg2 + _iv_len_in_bytes]} jmp skip_iv_len_12_dec_IV iv_len_12_dec_IV: GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 + {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]} skip_iv_len_12_dec_IV: mov arg3, [arg2 + _src] diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index 2c2b3cfd..46f68243 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -521,8 +521,7 @@ align 32 %%_CALC_AAD_done: -%endmacro ; CALC_AAD_HASH - +%endmacro ; CALC_GHASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; CALC_AAD_HASH: Calculates the hash of selected data which will not be 
encrypted. @@ -2328,35 +2327,35 @@ align 32 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_J0 13 +%macro CALC_J0 3 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV %define %%IV_LEN %3 ;; [in] IV length -%define %%J0 %4 ;; [out] XMM reg to contain J0 -%define %%TMP0 %5 ;; [clobbered] Temporary GP reg -%define %%TMP1 %6 ;; [clobbered] Temporary GP reg -%define %%TMP2 %7 ;; [clobbered] Temporary GP reg -%define %%XTMP0 %8 ;; [clobbered] Temporary XMM reg -%define %%XTMP1 %9 ;; [clobbered] Temporary XMM reg -%define %%XTMP2 %10 ;; [clobbered] Temporary XMM reg -%define %%XTMP3 %11 ;; [clobbered] Temporary XMM reg -%define %%XTMP4 %12 ;; [clobbered] Temporary XMM reg -%define %%XTMP5 %13 ;; [clobbered] Temporary XMM reg + +%define %%J0 xmm0 ;; [out] XMM reg to contain J0 + +%define %%XTMP0 xmm1 ;; [clobbered] Temporary XMM reg +%define %%XTMP1 xmm2 ;; [clobbered] Temporary XMM reg +%define %%XTMP2 xmm3 ;; [clobbered] Temporary XMM reg +%define %%XTMP3 xmm4 ;; [clobbered] Temporary XMM reg +%define %%XTMP4 xmm5 ;; [clobbered] Temporary XMM reg +%define %%XTMP5 xmm6 ;; [clobbered] Temporary XMM reg ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ;; Calculate GHASH of (IV || 0s) vpxor %%J0, %%J0, %%J0 - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ - %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2 + ;; arg1 = key pointer + mov r12, %%IV + mov r13, %%IV_LEN + call GHASH_FN_NAME(ghash_internal) ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) + vmovq %%XTMP2, %%IV_LEN + vpsllq %%XTMP2, %%XTMP2, 3 ;; IV length in bits vmovdqu %%XTMP0, [%%KEY + HashKey_1] vmovdqu %%XTMP1, [%%KEY + HashKeyK_1] - mov %%TMP2, %%IV_LEN - shl %%TMP2, 3 ;; IV length in bits - vmovq %%XTMP2, %%TMP2 vpxor %%J0, %%J0, %%XTMP2 GHASH_MUL2 %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 @@ -2367,23 +2366,26 @@ align 32 ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. ; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, IV_LEN, ; Additional Authentication data (A_IN), Additional Data length (A_LEN). -; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. -; Clobbers rax, r10-r13 and xmm0-xmm6 +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash=xmm14) and +; initialized other parts of GDATA. 
+; xmm2 - holds counter block (LE format) +; Clobbers: rax, r10-r13 and xmm0-xmm6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro GCM_INIT 8-9 +%macro GCM_INIT 5-6 %define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer %define %%GDATA_CTX %2 ; [in] GCM context pointer %define %%IV %3 ; [in] IV pointer %define %%A_IN %4 ; [in] AAD pointer %define %%A_LEN %5 ; [in] AAD length in bytes -%define %%GPR1 %6 ; temp GPR -%define %%GPR2 %7 ; temp GPR -%define %%GPR3 %8 ; temp GPR -%define %%IV_LEN %9 ; [in] IV length +%define %%IV_LEN %6 ; [in] IV length + +%define %%GPR1 r10 ; temp GPR +%define %%GPR2 r11 ; temp GPR +%define %%GPR3 rax ; temp GPR %define %%AAD_HASH xmm14 - vpxor %%AAD_HASH, %%AAD_HASH + vpxor %%AAD_HASH, %%AAD_HASH, %%AAD_HASH CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 @@ -2395,21 +2397,22 @@ align 32 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0 -%if %0 == 9 ;; IV is different than 12 bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, xmm0, xmm1, \ - xmm3, xmm4, xmm5, xmm6 +%if %0 == 6 ;; IV is different than 12 bytes + ;; uses xmm0-xmm6, r10-r13, rax + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN %else ;; IV is 12 bytes ;; read 12 IV bytes and pad with 0x00000001 mov %%GPR2, %%IV - vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 - vpinsrq xmm2, [%%GPR2], 0 - vpinsrd xmm2, [%%GPR2 + 8], 2 + vmovdqa xmm0, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 + vpinsrq xmm0, [%%GPR2], 0 + vpinsrd xmm0, [%%GPR2 + 8], 2 %endif - vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + vmovdqu [%%GDATA_CTX + OrigIV], xmm0 ; ctx_data.orig_IV = iv ;; store IV as counter in LE format - vpshufb xmm2, [rel SHUF_MASK] + vpshufb xmm2, xmm0, [rel SHUF_MASK] vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv + ;; @note: xmm2 - needs to return counter block %endmacro %macro GCM_ENC_DEC_SMALL 12 diff --git a/lib/include/ghash_common_avx2_avx512.inc b/lib/include/ghash_common_avx2_avx512.inc index 8c354a15..39e094a0 100644 --- a/lib/include/ghash_common_avx2_avx512.inc +++ b/lib/include/ghash_common_avx2_avx512.inc @@ -115,7 +115,7 @@ error_ghash_pre: ;; [in] arg1 = GDATA_KEY ;; [in/out] xmm0 = hash in/out ;; [clobbered] xmm1-xmm6 -;; [clobbered] r10, r11, rax, [r15, rbx] +;; [clobbered] r10, r11, rax ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(GHASH_FN_NAME(ghash_internal),function,internal) -- GitLab From 3c317f8b0d2f26e528518868124f017977fac4e4 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 9 Jan 2024 16:22:00 +0000 Subject: [PATCH 18/30] avx2/avx512: [gcm] move check for IV_LEN==12 into GCM_INIT macro --- lib/include/gcm_api_avx2_avx512.inc | 33 +++----------------------- lib/include/gcm_common_avx2_avx512.inc | 16 ++++++++++--- 2 files changed, 16 insertions(+), 33 deletions(-) diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc index 16412c39..59ee5e5c 100644 --- a/lib/include/gcm_api_avx2_avx512.inc +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -266,16 +266,9 @@ FN_NAME(init_var_iv,_): skip_aad_check_init_IV: %endif - cmp arg4, 12 - je iv_len_12_init_IV - GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4 - jmp skip_iv_len_12_init_IV + GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4 -iv_len_12_init_IV: - GCM_INIT arg1, arg2, 
arg3, arg5, arg6 - -skip_iv_len_12_init_IV: %ifdef SAFE_DATA clear_scratch_xmms_avx_asm %endif @@ -978,20 +971,10 @@ FN_NAME(enc_var_iv,_): mov arg1, [arg2 + _enc_keys] - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_enc_IV - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - {[arg2 + _iv_len_in_bytes]} - - jmp skip_iv_len_12_enc_IV - -iv_len_12_enc_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]} + {qword [arg2 + _iv_len_in_bytes]} -skip_iv_len_12_enc_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] @@ -1030,20 +1013,10 @@ FN_NAME(dec_var_iv,_): mov arg1, [arg2 + _dec_keys] - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_dec_IV - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - {[arg2 + _iv_len_in_bytes]} - - jmp skip_iv_len_12_dec_IV - -iv_len_12_dec_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]} + {qword [arg2 + _iv_len_in_bytes]} -skip_iv_len_12_dec_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index 46f68243..e0a47bfb 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -2397,16 +2397,26 @@ align 32 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0 -%if %0 == 6 ;; IV is different than 12 bytes +%if %0 == 6 + ;; IV may be different than 12 bytes + cmp %%IV_LEN, 12 + je %%_iv_len_is_12 + ;; uses xmm0-xmm6, r10-r13, rax CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN -%else ;; IV is 12 bytes + jmp %%_iv_is_done + +%%_iv_len_is_12: +%endif + + ;; IV is 12 bytes ;; read 12 IV bytes and pad with 0x00000001 mov %%GPR2, %%IV vmovdqa xmm0, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 vpinsrq xmm0, [%%GPR2], 0 vpinsrd xmm0, [%%GPR2 + 8], 2 -%endif + +%%_iv_is_done: vmovdqu [%%GDATA_CTX + OrigIV], xmm0 ; ctx_data.orig_IV = iv ;; store IV as counter in LE format -- GitLab From 5910d591e62950c1100f12b436b02c7c6ac2d9eb Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 9 Jan 2024 17:04:37 +0000 Subject: [PATCH 19/30] avx2/avx512: [gcm] add optimized code path for 12 bytes AAD in GCM_INIT --- lib/include/gcm_api_avx2_avx512.inc | 4 ++-- lib/include/gcm_common_avx2_avx512.inc | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc index 59ee5e5c..8a64a7f1 100644 --- a/lib/include/gcm_api_avx2_avx512.inc +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -972,7 +972,7 @@ FN_NAME(enc_var_iv,_): mov arg1, [arg2 + _enc_keys] GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ {qword [arg2 + _iv_len_in_bytes]} mov arg3, [arg2 + _src] @@ -1014,7 +1014,7 @@ FN_NAME(dec_var_iv,_): mov arg1, [arg2 + _dec_keys] GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ {qword [arg2 + _iv_len_in_bytes]} mov arg3, [arg2 + _src] diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index 
e0a47bfb..7967bc70 100644
--- a/lib/include/gcm_common_avx2_avx512.inc
+++ b/lib/include/gcm_common_avx2_avx512.inc
@@ -2385,10 +2385,27 @@ align 32
 
 %define %%AAD_HASH xmm14
 
+        ;; AAD may be different than 12 bytes
+        cmp     %%A_LEN, 12
+        je      %%_aad_len_is_12
+
         vpxor   %%AAD_HASH, %%AAD_HASH, %%AAD_HASH
         CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \
                 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3
+        jmp     %%_aad_is_done
+
+%%_aad_len_is_12:
+        ;; GHASH 12 bytes of AAD
+        mov     %%GPR1, %%A_IN
+        vmovq   %%AAD_HASH, [%%GPR1]
+        vpinsrd %%AAD_HASH, [%%GPR1 + 8], 2
+        vmovdqa xmm1, [%%GDATA_KEY + HashKey_1]
+        vmovdqa xmm2, [%%GDATA_KEY + HashKey_1 + HKeyGap]
+        vpshufb %%AAD_HASH, %%AAD_HASH, [rel SHUF_MASK]
+
+        GHASH_MUL2 %%AAD_HASH, xmm1, xmm2, xmm6, xmm5, xmm4, xmm3
+%%_aad_is_done:
 
         mov     %%GPR1, %%A_LEN
         vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash
         mov     [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length
@@ -2412,9 +2429,9 @@ align 32
         ;; IV is 12 bytes
         ;; read 12 IV bytes and pad with 0x00000001
         mov     %%GPR2, %%IV
-        vmovdqa xmm0, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001
-        vpinsrq xmm0, [%%GPR2], 0
+        vmovq   xmm0, [%%GPR2]
         vpinsrd xmm0, [%%GPR2 + 8], 2
+        vpinsrd xmm0, [rel ONEf + 12], 3 ; read 12 IV bytes and pad with 0x00000001
 
 %%_iv_is_done:
         vmovdqu [%%GDATA_CTX + OrigIV], xmm0 ; ctx_data.orig_IV = iv
--
GitLab

From a0165c0a7fc5467cf1eaa551d6501e910d1e2148 Mon Sep 17 00:00:00 2001
From: Tomasz Kantecki
Date: Wed, 10 Jan 2024 11:40:33 +0000
Subject: [PATCH 20/30] avx2/avx512: [gmac] use better-optimized GHASH_MUL2 in
 PARTIAL_BLOCK_GMAC

---
 lib/include/gcm_api_avx2_avx512.inc    |  5 +++--
 lib/include/gcm_common_avx2_avx512.inc | 30 ++++++++++++++------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc
index 8a64a7f1..7239e240 100644
--- a/lib/include/gcm_api_avx2_avx512.inc
+++ b/lib/include/gcm_api_avx2_avx512.inc
@@ -1080,10 +1080,11 @@ GMAC_FN_NAME(update):
 
         ;; Deal with previous partial block
         xor     r11, r11
-        vmovdqu xmm13, [arg1 + HashKey]
+        vmovdqu xmm13, [arg1 + HashKey_1]
+        vmovdqu xmm14, [arg1 + HashKeyK_1]
         vmovdqu xmm0, [arg2 + AadHash]
 
-        PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, \
+        PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \
                 xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10
 
         ; CALC_AAD_HASH needs to deal with multiple of 16 bytes
diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc
index 7967bc70..ad31adb3 100644
--- a/lib/include/gcm_common_avx2_avx512.inc
+++ b/lib/include/gcm_common_avx2_avx512.inc
@@ -2920,28 +2920,30 @@ align 32
 ; Output: Updated GDATA_CTX
 ; Clobbers rax, r10, r12, r13, r15
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-%macro PARTIAL_BLOCK_GMAC 15
+%macro PARTIAL_BLOCK_GMAC 16
 %define %%GDATA_CTX %1 ;; [in/out] GPR pointer to GCM context
 %define %%PLAIN_IN %2 ;; [in] GPR pointer to plain/cipher text
 %define %%PLAIN_LEN %3 ;; [in] text length in bytes, GPR or memory location (win64)
 %define %%DATA_OFFSET %4 ;; [out] GPR data offset
 %define %%AAD_HASH %5 ;; [in/out] xmm with hash value
 %define %%HASH_SUBKEY %6 ;; [in] hash key
-%define %%XMM0 %7 ;; [clobbered] xmm register
-%define %%XMM1 %8 ;; [clobbered] xmm register
-%define %%XMM2 %9 ;; [clobbered] xmm register
-%define %%XMM3 %10 ;; [clobbered] xmm register
-%define %%XMM5 %11 ;; [clobbered] xmm register
-%define %%XMM6 %12 ;; [clobbered] xmm register
-%define %%XMM9 %13 ;; [clobbered] xmm register -%define %%XMM10 %14 ;; [clobbered] xmm register -%define %%XMM11 %15 ;; [clobbered] xmm register - - mov r13, [%%GDATA_CTX + PBlockLen] - or r13, r13 +%define %%HASHK_SUBKEY %7 ;; [in] hash-K key +%define %%XMM0 %8 ;; [clobbered] xmm register +%define %%XMM1 %9 ;; [clobbered] xmm register +%define %%XMM2 %10 ;; [clobbered] xmm register +%define %%XMM3 %11 ;; [clobbered] xmm register +%define %%XMM5 %12 ;; [clobbered] xmm register +%define %%XMM6 %13 ;; [clobbered] xmm register +%define %%XMM9 %14 ;; [clobbered] xmm register +%define %%XMM10 %15 ;; [clobbered] xmm register +%define %%XMM11 %16 ;; [clobbered] xmm register + ; Leave Macro if no partial blocks + cmp qword [%%GDATA_CTX + PBlockLen], 0 je %%_partial_block_done + mov r13, [%%GDATA_CTX + PBlockLen] + ; Read in input data without over reading %ifdef IS_AVX2_GCM cmp %%PLAIN_LEN, 16 @@ -2989,7 +2991,7 @@ align 32 jl %%_partial_incomplete_1 ; GHASH computation for the last <16 Byte block - GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, %%XMM0, %%XMM10, %%XMM11, %%XMM5, %%XMM6 + GHASH_MUL2 %%AAD_HASH, %%HASH_SUBKEY, %%HASHK_SUBKEY, %%XMM0, %%XMM10, %%XMM11, %%XMM5 xor rax, rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_ghash_done -- GitLab From 459931668ae48752b1c2cddd1158b015102506e1 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Wed, 10 Jan 2024 14:38:46 +0000 Subject: [PATCH 21/30] avx2/avx512: [gmac] move handling of GMAC partial block into an internal common function --- lib/include/gcm_api_avx2_avx512.inc | 39 +++++++++++++++--------- lib/include/gcm_common_avx2_avx512.inc | 7 ++--- lib/include/ghash_common_avx2_avx512.inc | 19 ++++++++++++ 3 files changed, 45 insertions(+), 20 deletions(-) diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc index 7239e240..3933c098 100644 --- a/lib/include/gcm_api_avx2_avx512.inc +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -35,6 +35,7 @@ mksection .text default rel extern GHASH_FN_NAME(ghash_internal) +extern GHASH_FN_NAME(partial_block_gmac) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_precomp_128_avx_gen4 / @@ -1059,43 +1060,51 @@ GMAC_FN_NAME(update): %endif ;; Check if msg_len == 0 or arg4, arg4 - je exit_gmac_update + je .exit_gmac_update %ifdef SAFE_PARAM ;; Check key_data != NULL or arg1, arg1 - jz error_gmac_update + jz .error_gmac_update ;; Check context_data != NULL or arg2, arg2 - jz error_gmac_update + jz .error_gmac_update ;; Check in != NULL (msg_len != 0) or arg3, arg3 - jz error_gmac_update + jz .error_gmac_update %endif ; Increment size of "AAD length" for GMAC add [arg2 + AadLen], arg4 + vmovdqu xmm0, [arg2 + AadHash] + + cmp qword [arg2 + PBlockLen], 0 + je .partial_block_is_zero_len + ;; Deal with previous partial block - xor r11, r11 vmovdqu xmm13, [arg1 + HashKey_1] vmovdqu xmm14, [arg1 + HashKeyK_1] - vmovdqu xmm0, [arg2 + AadHash] - - PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \ - xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 + ;; arg2 = [in] context + ;; arg3 = [in] message pointer + ;; arg4 = [in] message length + ;; xmm0 = [in/out] hash + ;; xmm13/xmm14 = [in] hash keys + call GHASH_FN_NAME(partial_block_gmac) + ;; r11 = bytes processed ; CALC_AAD_HASH needs to deal with multiple of 16 bytes sub arg4, r11 add arg3, r11 +.partial_block_is_zero_len: vmovq xmm7, arg4 ; Save remaining length and arg4, -16 ; Get multiple of 16 bytes or arg4, arg4 - jz no_full_blocks + jz .no_full_blocks ;; Calculate GHASH of this segment mov r12, arg3 @@ 
-1106,12 +1115,12 @@ GMAC_FN_NAME(update): vmovdqu [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash -no_full_blocks: +.no_full_blocks: add arg3, arg4 ; Point at partial block vmovq arg4, xmm7 ; Restore original remaining length and arg4, 15 - jz exit_gmac_update + jz .exit_gmac_update ; Save next partial block mov [arg2 + PBlockLen], arg4 @@ -1124,13 +1133,13 @@ no_full_blocks: vpxor xmm0, xmm0, xmm1 vmovdqu [arg2 + AadHash], xmm0 -exit_gmac_update: +.exit_gmac_update: FUNC_RESTORE ret %ifdef SAFE_PARAM -error_gmac_update: +.error_gmac_update: ;; Clear reg and imb_errno IMB_ERR_CHECK_START rax @@ -1145,7 +1154,7 @@ error_gmac_update: ;; Set imb_errno IMB_ERR_CHECK_END rax - jmp exit_gmac_update + jmp .exit_gmac_update %endif mksection stack-noexec diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index ad31adb3..dbff90e0 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -2938,10 +2938,7 @@ align 32 %define %%XMM10 %15 ;; [clobbered] xmm register %define %%XMM11 %16 ;; [clobbered] xmm register - ; Leave Macro if no partial blocks - cmp qword [%%GDATA_CTX + PBlockLen], 0 - je %%_partial_block_done - + ;; @note PBlockLen must not be zero mov r13, [%%GDATA_CTX + PBlockLen] ; Read in input data without over reading @@ -3017,5 +3014,5 @@ align 32 mov r12, %%PLAIN_LEN %%offset_set: mov %%DATA_OFFSET, r12 -%%_partial_block_done: + %endmacro ; PARTIAL_BLOCK_GMAC diff --git a/lib/include/ghash_common_avx2_avx512.inc b/lib/include/ghash_common_avx2_avx512.inc index 39e094a0..869c3c5e 100644 --- a/lib/include/ghash_common_avx2_avx512.inc +++ b/lib/include/ghash_common_avx2_avx512.inc @@ -124,6 +124,25 @@ GHASH_FN_NAME(ghash_internal): r10, r11, rax ret +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; partial_block_gmac_avx_gen4 / partial_block_gmac_avx512 +;; [in] arg2 = GDATA_CTX +;; [in] arg3 = PLAIN_IN +;; [in] arg4 = PLAIN_LEN +;; [out] r11 = DATA_OFFSET +;; [in/out] xmm0 = hash in/out +;; [in] xmm13 = hash key +;; [in] xmm14 = hash-K key +;; [clobbered] xmm1-xmm6, xmm8, xmm9, xmm10 +;; [clobbered] r10, r12, r13, r15, rax +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GHASH_FN_NAME(partial_block_gmac),function,internal) +GHASH_FN_NAME(partial_block_gmac): + PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 + ret + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void ghash_avx_gen4 / ghash_avx512 ( ; const struct gcm_key_data *key_data, -- GitLab From ab5ccf9eb23584b8d435f60dee32117561ff5c44 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Fri, 19 Jan 2024 16:54:26 +0000 Subject: [PATCH 22/30] avx512: [gcm] remove this implementation and use AVX2 one - remove AVX512 type 1 AES-GCM implementation and use AVX2 type 1 instead - both deliver identical performance - AVX512 AES-GCM API symbols are retained (map onto AVX2 ones) --- lib/Makefile | 3 +- lib/avx2_t1/aes128_gcm_by8_avx2.asm | 4 +- lib/avx2_t1/aes192_gcm_by8_avx2.asm | 4 +- lib/avx2_t1/aes256_gcm_by8_avx2.asm | 4 +- lib/avx2_t1/ghash_by8_avx2.asm | 204 +++++++++++++++++++- lib/avx512_t1/aes128_gcm_by8_avx512.asm | 31 ---- lib/avx512_t1/aes192_gcm_by8_avx512.asm | 31 ---- lib/avx512_t1/aes256_gcm_by8_avx512.asm | 31 ---- lib/avx512_t1/ghash_by8_avx512.asm | 33 ---- 
lib/include/gcm_api_avx2_avx512.inc | 33 +++- lib/include/gcm_avx_gen4.inc | 31 ---- lib/include/gcm_common_avx2_avx512.inc | 23 +-- lib/include/ghash_common_avx2_avx512.inc | 226 ----------------------- lib/win_x64.mak | 4 - 14 files changed, 249 insertions(+), 413 deletions(-) delete mode 100644 lib/avx512_t1/aes128_gcm_by8_avx512.asm delete mode 100644 lib/avx512_t1/aes192_gcm_by8_avx512.asm delete mode 100644 lib/avx512_t1/aes256_gcm_by8_avx512.asm delete mode 100644 lib/avx512_t1/ghash_by8_avx512.asm delete mode 100644 lib/include/gcm_avx_gen4.inc delete mode 100644 lib/include/ghash_common_avx2_avx512.inc diff --git a/lib/Makefile b/lib/Makefile index d85a7506..d9e7b9a8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -838,8 +838,7 @@ asm_avx512_gcm_objs := \ aes128_gcm_api_vaes_avx512.o aes192_gcm_api_vaes_avx512.o aes256_gcm_api_vaes_avx512.o \ aes128_gcm_sgl_api_vaes_avx512.o aes192_gcm_sgl_api_vaes_avx512.o aes256_gcm_sgl_api_vaes_avx512.o \ ghash_api_vaes_avx512.o \ - gmac_api_vaes_avx512.o \ - ghash_by8_avx512.o aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o + gmac_api_vaes_avx512.o endif # aarch64 diff --git a/lib/avx2_t1/aes128_gcm_by8_avx2.asm b/lib/avx2_t1/aes128_gcm_by8_avx2.asm index be3a4d15..1b7efabf 100644 --- a/lib/avx2_t1/aes128_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes128_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM128_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/aes192_gcm_by8_avx2.asm b/lib/avx2_t1/aes192_gcm_by8_avx2.asm index 4d28c0d6..58737ae6 100644 --- a/lib/avx2_t1/aes192_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes192_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM192_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/aes256_gcm_by8_avx2.asm b/lib/avx2_t1/aes256_gcm_by8_avx2.asm index 63c87273..eb4ea60c 100644 --- a/lib/avx2_t1/aes256_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes256_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM256_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/ghash_by8_avx2.asm b/lib/avx2_t1/ghash_by8_avx2.asm index fe10e497..72f3f3d7 100644 --- a/lib/avx2_t1/ghash_by8_avx2.asm +++ b/lib/avx2_t1/ghash_by8_avx2.asm @@ -27,7 +27,207 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%define IS_AVX2_GCM %define GCM128_MODE -%include "include/ghash_common_avx2_avx512.inc" + +%use smartalign + +%include "include/gcm_common_avx2_avx512.inc" + +mksection .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_pre_avx_gen4 / ghash_pre_avx512 +; (const void *key, struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_pre_avx_gen4,function,) +MKGLOBAL(ghash_pre_avx512,function,) +ghash_pre_avx_gen4: +ghash_pre_avx512: + endbranch64 +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key != NULL + cmp arg1, 0 + jz error_ghash_pre + + ;; Check key_data != NULL + cmp arg2, 0 + jz error_ghash_pre +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + sub rsp, 1*16 + + ; only xmm6 needs to be maintained + vmovdqu [rsp + 0*16], xmm6 +%endif + vmovdqu xmm6, [arg1] + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 +%endif +exit_ghash_pre: + ret + +%ifdef SAFE_PARAM +error_ghash_pre: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_ghash_pre +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ghash_internal_avx_gen4 +;; [in] r12 = A_IN +;; [in] r13 = A_LEN +;; [in] arg1 = GDATA_KEY +;; [in/out] xmm0 = hash in/out +;; [clobbered] xmm1-xmm6 +;; [clobbered] r10, r11, rax +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_internal_avx_gen4,function,internal) +ghash_internal_avx_gen4: + CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ + r10, r11, rax + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; partial_block_gmac_avx_gen4 +;; [in] arg2 = GDATA_CTX +;; [in] arg3 = PLAIN_IN +;; [in] arg4 = PLAIN_LEN 
+;; [out] r11 = DATA_OFFSET +;; [in/out] xmm0 = hash in/out +;; [in] xmm13 = hash key +;; [in] xmm14 = hash-K key +;; [clobbered] xmm1-xmm6, xmm8, xmm9, xmm10 +;; [clobbered] r10, r12, r13, r15, rax +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(partial_block_gmac_avx_gen4,function,internal) +partial_block_gmac_avx_gen4: + PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_avx_gen4 / ghash_avx512 ( +; const struct gcm_key_data *key_data, +; const void *in, +; const u64 in_len, +; void *io_tag, +; const u64 tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_avx_gen4,function,) +MKGLOBAL(ghash_avx512,function,) +ghash_avx_gen4: +ghash_avx512: + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_ghash + + ;; Check in != NULL + or arg2, arg2 + jz error_ghash + + ;; Check in_len != 0 + or arg3, arg3 + jz error_ghash + + ;; Check tag != NULL + or arg4, arg4 + jz error_ghash + + ;; Check tag_len != 0 + cmp arg5, 0 + jz error_ghash +%endif + + ;; copy tag to xmm0 + vmovdqu xmm0, [arg4] + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + mov r12, arg2 + mov r13, arg3 + call ghash_internal_avx_gen4 + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + simd_store_avx arg4, xmm0, arg5, r12, rax + +exit_ghash: + FUNC_RESTORE + ret + +%ifdef SAFE_PARAM +error_ghash: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check in != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC + + ;; Check in_len != 0 + IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN + + ;; Check tag != NULL + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH + + ;; Check tag_len != 0 + IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_ghash +%endif + +mksection stack-noexec diff --git a/lib/avx512_t1/aes128_gcm_by8_avx512.asm b/lib/avx512_t1/aes128_gcm_by8_avx512.asm deleted file mode 100644 index 5487a4fe..00000000 --- a/lib/avx512_t1/aes128_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. 
-; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM128_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/aes192_gcm_by8_avx512.asm b/lib/avx512_t1/aes192_gcm_by8_avx512.asm deleted file mode 100644 index 9a1e645f..00000000 --- a/lib/avx512_t1/aes192_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM192_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/aes256_gcm_by8_avx512.asm b/lib/avx512_t1/aes256_gcm_by8_avx512.asm deleted file mode 100644 index ea7728b7..00000000 --- a/lib/avx512_t1/aes256_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. 
-; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM256_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/ghash_by8_avx512.asm b/lib/avx512_t1/ghash_by8_avx512.asm deleted file mode 100644 index e7c37300..00000000 --- a/lib/avx512_t1/ghash_by8_avx512.asm +++ /dev/null @@ -1,33 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2024, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define IS_AVX512_GCM -%define GCM128_MODE -%include "include/ghash_common_avx2_avx512.inc" - diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc index 3933c098..d69ec1f6 100644 --- a/lib/include/gcm_api_avx2_avx512.inc +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -34,8 +34,8 @@ mksection .text default rel -extern GHASH_FN_NAME(ghash_internal) -extern GHASH_FN_NAME(partial_block_gmac) +extern ghash_internal_avx_gen4 +extern partial_block_gmac_avx_gen4 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_precomp_128_avx_gen4 / @@ -48,7 +48,9 @@ extern GHASH_FN_NAME(partial_block_gmac) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(precomp,_),function,) +MKGLOBAL(FN_NAME_AVX512(precomp,_),function,) FN_NAME(precomp,_): +FN_NAME_AVX512(precomp,_): endbranch64 %ifdef SAFE_PARAM ;; Reset imb_errno @@ -125,7 +127,9 @@ error_precomp: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(init,_),function,) +MKGLOBAL(FN_NAME_AVX512(init,_),function,) FN_NAME(init,_): +FN_NAME_AVX512(init,_): endbranch64 push r12 push r13 @@ -223,7 +227,9 @@ skip_aad_check_error_init: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(init_var_iv,_),function,) +MKGLOBAL(FN_NAME_AVX512(init_var_iv,_),function,) FN_NAME(init_var_iv,_): +FN_NAME_AVX512(init_var_iv,_): endbranch64 push r12 push r13 @@ -317,7 +323,6 @@ skip_aad_check_error_init_IV: jmp exit_init_IV %endif - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / ; aes_gcm_enc_128_update_avx_gen4 / @@ -331,7 +336,9 @@ skip_aad_check_error_init_IV: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(enc,_update_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_update_),function,) FN_NAME(enc,_update_): +FN_NAME_AVX512(enc,_update_): endbranch64 FUNC_SAVE @@ -416,7 +423,9 @@ skip_in_out_check_error_update_enc: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(dec,_update_),function,) +MKGLOBAL(FN_NAME_AVX512(dec,_update_),function,) FN_NAME(dec,_update_): +FN_NAME_AVX512(dec,_update_): endbranch64 FUNC_SAVE @@ -501,7 +510,9 @@ skip_in_out_check_error_update_dec: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(enc,_finalize_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_finalize_),function,) FN_NAME(enc,_finalize_): +FN_NAME_AVX512(enc,_finalize_): endbranch64 %ifdef SAFE_PARAM ;; Reset imb_errno @@ -595,7 +606,9 @@ error_enc_fin: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(dec,_finalize_),function,) +MKGLOBAL(FN_NAME_AVX512(dec,_finalize_),function,) FN_NAME(dec,_finalize_): +FN_NAME_AVX512(dec,_finalize_): endbranch64 %ifdef SAFE_PARAM ;; Reset imb_errno @@ -695,7 +708,9 @@ error_dec_fin: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(enc,_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_),function,) FN_NAME(enc,_): +FN_NAME_AVX512(enc,_): endbranch64 
FUNC_SAVE @@ -832,7 +847,9 @@ skip_aad_check_error_enc: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(dec,_),function,) +MKGLOBAL(FN_NAME_AVX512(dec,_),function,) FN_NAME(dec,_): +FN_NAME_AVX512(dec,_): endbranch64 FUNC_SAVE @@ -966,7 +983,9 @@ skip_aad_check_error_dec: ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) +MKGLOBAL(FN_NAME_AVX512(enc_var_iv,_),function,internal) FN_NAME(enc_var_iv,_): +FN_NAME_AVX512(enc_var_iv,_): endbranch64 FUNC_SAVE alloc_context @@ -1008,7 +1027,9 @@ FN_NAME(enc_var_iv,_): ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) +MKGLOBAL(FN_NAME_AVX512(dec_var_iv,_),function,internal) FN_NAME(dec_var_iv,_): +FN_NAME_AVX512(dec_var_iv,_): endbranch64 FUNC_SAVE alloc_context @@ -1050,7 +1071,9 @@ FN_NAME(dec_var_iv,_): ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 32 MKGLOBAL(GMAC_FN_NAME(update),function,) +MKGLOBAL(GMAC_FN_NAME_AVX512(update),function,) GMAC_FN_NAME(update): +GMAC_FN_NAME_AVX512(update): endbranch64 FUNC_SAVE @@ -1092,7 +1115,7 @@ GMAC_FN_NAME(update): ;; arg4 = [in] message length ;; xmm0 = [in/out] hash ;; xmm13/xmm14 = [in] hash keys - call GHASH_FN_NAME(partial_block_gmac) + call partial_block_gmac_avx_gen4 ;; r11 = bytes processed ; CALC_AAD_HASH needs to deal with multiple of 16 bytes @@ -1111,7 +1134,7 @@ GMAC_FN_NAME(update): mov r13, arg4 ;; arg1 = key ;; xmm0 = hash in/out - call GHASH_FN_NAME(ghash_internal) + call ghash_internal_avx_gen4 vmovdqu [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash diff --git a/lib/include/gcm_avx_gen4.inc b/lib/include/gcm_avx_gen4.inc deleted file mode 100644 index 86983099..00000000 --- a/lib/include/gcm_avx_gen4.inc +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2024, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define IS_AVX2_GCM -%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index dbff90e0..31d5ce95 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -132,19 +132,18 @@ %endif %endif -%ifndef IS_AVX2_GCM -%ifndef IS_AVX512_GCM -%error "No GCM AVX2 or AVX512 selection made for gcm_common_avx2_avx512.inc!" -%endif +%ifdef IS_AVX512_GCM +%error "IS_AVX512_GCM: AVX512 variant removed!" %endif %ifdef IS_AVX2_GCM -%xdefine GCM_API_POSTFIX avx_gen4 +%error "IS_AVX2_GCM: Definition not required!" %endif -%ifdef IS_AVX512_GCM -%xdefine GCM_API_POSTFIX avx512 -%endif +%define IS_AVX2_GCM + +%xdefine GCM_API_POSTFIX avx_gen4 +%xdefine GCM_API_POSTFIX_AVX512 avx512 ;; Decide on AES-GCM key size to compile for %ifdef GCM128_MODE @@ -164,11 +163,13 @@ ;; Decide on AES-GCM key size to compile for %define FN_NAME(x,y) aes_gcm_ %+ x %+ GCM_API_KEYSZ %+ y %+ GCM_API_POSTFIX +%define FN_NAME_AVX512(x,y) aes_gcm_ %+ x %+ GCM_API_KEYSZ %+ y %+ GCM_API_POSTFIX_AVX512 + %define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX -%define GHASH_FN_NAME(x) x %+ _ %+ GCM_API_POSTFIX +%define GMAC_FN_NAME_AVX512(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX_AVX512 ; need to store 5 GP registers on stack (align to 16 bytes) -; @note: the last, 8-byte slot is used in JOB API to save/restore a register +; @note: the last 8-byte slot is used in JOB API to save/restore a register %define GP_STORAGE 8*6 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) @@ -2349,7 +2350,7 @@ align 32 ;; arg1 = key pointer mov r12, %%IV mov r13, %%IV_LEN - call GHASH_FN_NAME(ghash_internal) + call ghash_internal_avx_gen4 ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) vmovq %%XTMP2, %%IV_LEN diff --git a/lib/include/ghash_common_avx2_avx512.inc b/lib/include/ghash_common_avx2_avx512.inc deleted file mode 100644 index 869c3c5e..00000000 --- a/lib/include/ghash_common_avx2_avx512.inc +++ /dev/null @@ -1,226 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2024, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. 
-; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%use smartalign - -%include "include/gcm_common_avx2_avx512.inc" - -mksection .text -default rel - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_pre_avx_gen4 / ghash_pre_avx512 -; (const void *key, struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash_pre),function,) -GHASH_FN_NAME(ghash_pre): - endbranch64 -;; Parameter is passed through register -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key != NULL - cmp arg1, 0 - jz error_ghash_pre - - ;; Check key_data != NULL - cmp arg2, 0 - jz error_ghash_pre -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - sub rsp, 1*16 - - ; only xmm6 needs to be maintained - vmovdqu [rsp + 0*16], xmm6 -%endif - vmovdqu xmm6, [arg1] - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - add rsp, 1*16 -%endif -exit_ghash_pre: - ret - -%ifdef SAFE_PARAM -error_ghash_pre: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_ghash_pre -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; ghash_internal_avx_gen4 / ghash_internal_avx512 -;; [in] r12 = A_IN -;; [in] r13 = A_LEN -;; [in] arg1 = GDATA_KEY -;; [in/out] xmm0 = hash in/out -;; [clobbered] xmm1-xmm6 -;; [clobbered] r10, r11, rax -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash_internal),function,internal) -GHASH_FN_NAME(ghash_internal): - CALC_AAD_HASH r12, r13, 
xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, rax - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;; partial_block_gmac_avx_gen4 / partial_block_gmac_avx512 -;; [in] arg2 = GDATA_CTX -;; [in] arg3 = PLAIN_IN -;; [in] arg4 = PLAIN_LEN -;; [out] r11 = DATA_OFFSET -;; [in/out] xmm0 = hash in/out -;; [in] xmm13 = hash key -;; [in] xmm14 = hash-K key -;; [clobbered] xmm1-xmm6, xmm8, xmm9, xmm10 -;; [clobbered] r10, r12, r13, r15, rax -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(partial_block_gmac),function,internal) -GHASH_FN_NAME(partial_block_gmac): - PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \ - xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_avx_gen4 / ghash_avx512 ( -; const struct gcm_key_data *key_data, -; const void *in, -; const u64 in_len, -; void *io_tag, -; const u64 tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash),function,) -GHASH_FN_NAME(ghash): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_ghash - - ;; Check in != NULL - or arg2, arg2 - jz error_ghash - - ;; Check in_len != 0 - or arg3, arg3 - jz error_ghash - - ;; Check tag != NULL - or arg4, arg4 - jz error_ghash - - ;; Check tag_len != 0 - cmp arg5, 0 - jz error_ghash -%endif - - ;; copy tag to xmm0 - vmovdqu xmm0, [arg4] - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - - mov r12, arg2 - mov r13, arg3 - call GHASH_FN_NAME(ghash_internal) - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - - simd_store_avx arg4, xmm0, arg5, r12, rax - -exit_ghash: - FUNC_RESTORE - ret - -%ifdef SAFE_PARAM -error_ghash: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check in != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC - - ;; Check in_len != 0 - IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN - - ;; Check tag != NULL - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH - - ;; Check tag_len != 0 - IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_ghash -%endif - -mksection stack-noexec diff --git a/lib/win_x64.mak b/lib/win_x64.mak index ebf69590..ff8af96e 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -560,22 +560,18 @@ no_aesni_objs = \ gcm_objs = \ $(OBJ_DIR)\gcm.obj \ $(OBJ_DIR)\ghash_by8_avx2.obj \ - $(OBJ_DIR)\ghash_by8_avx512.obj \ $(OBJ_DIR)\aes128_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes128_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes128_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes128_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes128_gcm_sgl_api_vaes_avx512.obj \ $(OBJ_DIR)\ghash_api_vaes_avx512.obj \ $(OBJ_DIR)\gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes192_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes192_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes192_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_sgl_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes256_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes256_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes256_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_sgl_api_vaes_avx512.obj \ 
$(OBJ_DIR)\gcm128_api_by8_sse.obj \ -- GitLab From 85c2d8a14fb0b67c288bc29e5558fe740a4830cc Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Tue, 16 Jan 2024 16:09:45 +0000 Subject: [PATCH 23/30] avx512: [DES] split algorithmic macros from API creation --- lib/avx512_t1/des_x16_avx512.asm | 2095 +---------------------------- lib/include/des_avx512.inc | 2119 ++++++++++++++++++++++++++++++ 2 files changed, 2120 insertions(+), 2094 deletions(-) create mode 100644 lib/include/des_avx512.inc diff --git a/lib/avx512_t1/des_x16_avx512.asm b/lib/avx512_t1/des_x16_avx512.asm index c940dd8d..635e3083 100644 --- a/lib/avx512_t1/des_x16_avx512.asm +++ b/lib/avx512_t1/des_x16_avx512.asm @@ -35,2102 +35,9 @@ ;; Windows x64 ABI ;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15 -;; -;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 -;; ----------------------------------------------------------- -;; Windows clobbers: RAX R8 R9 R10 R11 -;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15 -;; ----------------------------------------------------------- -;; Linux clobbers: RAX RCX RDX R10 R11 -;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15 -;; ----------------------------------------------------------- ;; Clobbers ZMM0-31 and K1 to K7 -%include "include/os.inc" -%include "include/reg_sizes.inc" -%include "include/mb_mgr_datastruct.inc" -%include "include/constants.inc" -;%define DO_DBGPRINT -%include "include/dbgprint.inc" -%include "include/clear_regs.inc" - -%ifdef LINUX -%define arg1 rdi -%define arg2 rsi -%define arg3 rdx -%define arg4 rcx -%else -%define arg1 rcx -%define arg2 rdx -%define arg3 r8 -%define arg4 r9 -%endif - -%define STATE arg1 -%define SIZE arg2 - -%define OFFSET rax - -%define IA0 arg3 -%define IA1 arg4 -%define IA2 r10 - -%define INP0 r11 -%define INP1 r12 -%define INP2 r13 -%define INP3 r14 -%define INP4 r15 - -%define KSOFFSET r11 - -%define ZW0 zmm0 -%define ZW1 zmm1 -%define ZW2 zmm2 -%define ZW3 zmm3 -%define ZW4 zmm4 -%define ZW5 zmm5 -%define ZW6 zmm6 -%define ZW7 zmm7 -%define ZW8 zmm8 -%define ZW9 zmm9 -%define ZW10 zmm10 -%define ZW11 zmm11 -%define ZW12 zmm12 -%define ZW13 zmm13 -%define ZW14 zmm14 -%define ZW15 zmm15 - -%define ZIV0 zmm16 -%define ZIV1 zmm17 - -%define ZTMP0 zmm18 -%define ZTMP1 zmm19 -%define ZTMP2 zmm20 -%define ZTMP3 zmm21 -%define ZTMP4 zmm22 -%define ZTMP5 zmm23 -%define ZTMP6 zmm24 -%define ZTMP7 zmm25 -%define ZTMP8 zmm26 -%define ZTMP9 zmm27 -%define ZTMP10 zmm28 -%define ZTMP11 zmm29 -%define ZTMP12 zmm30 -%define ZTMP13 zmm31 - -struc STACKFRAME -_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_tmp_iv: resq 16 ; 2 x 64 bytes -_tmp_in: resq 16 ; 2 x 64 bytes -_tmp_out: resq 16 ; 2 x 64 bytes -_tmp_mask: resd 16 ; 1 x 64 bytes -_gpr_save: resq 4 ; r12 to r15 -_rsp_save: resq 1 -_mask_save: resq 1 -_size_save: resq 1 -endstruc - -;;; =========================================================================== -;;; =========================================================================== -;;; MACROS -;;; =========================================================================== -;;; =========================================================================== - -;;; =========================================================================== -;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected) -;;; 
=========================================================================== -%macro CLEAR_KEY_SCHEDULE 2 -%define %%ALG %1 ; [in] DES or 3DES -%define %%ZT %2 ; [clobbered] temporary ZMM register - -%ifdef SAFE_DATA - vpxorq %%ZT, %%ZT -%assign rep_num (2048 / 64) -%ifidn %%ALG, 3DES -%assign rep_num (rep_num * 3) -%endif - -%assign offset 0 -%rep rep_num - vmovdqa64 [rsp + _key_sched + offset], %%ZT -%assign offset (offset + 64) -%endrep - -%endif ; SAFE_DATA - -%endmacro - -;;; =========================================================================== -;;; PERMUTE -;;; =========================================================================== -;;; A [in/out] - zmm register -;;; B [in/out] - zmm register -;;; NSHIFT [in] - constant to shift words by -;;; MASK [in] - zmm or m512 with mask -;;; T0 [clobbered] - temporary zmm register -%macro PERMUTE 5 -%define %%A %1 -%define %%B %2 -%define %%NSHIFT %3 -%define %%MASK %4 -%define %%T0 %5 - - vpsrld %%T0, %%A, %%NSHIFT - vpxord %%T0, %%T0, %%B - vpandd %%T0, %%T0, %%MASK - vpxord %%B, %%B, %%T0 - vpslld %%T0, %%T0, %%NSHIFT - vpxord %%A, %%A, %%T0 -%endmacro - -;;; =========================================================================== -;;; INITIAL PERMUTATION -;;; =========================================================================== -;;; L [in/out] - zmm register -;;; R [in/out] - zmm register -;;; T0 [clobbered] - temporary zmm register -%macro IP_Z 3 -%define %%L %1 -%define %%R %2 -%define %%T0 %3 - PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0 - PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0 -%endmacro - -;;; =========================================================================== -;;; FINAL PERMUTATION -;;; =========================================================================== -;;; L [in/out] - zmm register -;;; R [in/out] - zmm register -;;; T0 [clobbered] - temporary zmm register -%macro FP_Z 3 -%define %%L %1 -%define %%R %2 -%define %%T0 %3 - PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0 - PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0 -%endmacro - -;;; =========================================================================== -;;; P PHASE -;;; =========================================================================== -;;; W0 [in/out] - zmm register -;;; in: vector of 16 x 32bits from S phase -;;; out: permuted in vector -;;; T0-T3 [clobbered] - temporary zmm register -%macro P_PHASE 5 -%define %%W0 %1 -%define %%T0 %2 -%define %%T1 %3 -%define %%T2 %4 -%define %%T3 %5 - - vprord %%T0, %%W0, 3 - vpandd %%T0, %%T0, [rel mask_values + 0*64] - vprord %%T1, %%W0, 5 - vpandd %%T1, %%T1, [rel mask_values + 1*64] - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 24 - vpandd %%T1, %%T1, [rel mask_values + 2*64] - vprord %%T2, %%W0, 26 - vpandd %%T2, %%T2, [rel mask_values + 3*64] - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 15 - vpandd %%T1, %%T1, [rel mask_values + 4*64] - vprord %%T2, %%W0, 17 - vpandd %%T2, %%T2, [rel mask_values + 5*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 6 - vpandd %%T2, %%T2, [rel mask_values + 6*64] - vprord %%T3, %%W0, 21 - vpandd %%T3, %%T3, [rel mask_values + 
7*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 12 - vpandd %%T1, %%T1, [rel mask_values + 8*64] - vprord %%T2, %%W0, 14 - vpandd %%T2, %%T2, [rel mask_values + 9*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 4 - vpandd %%T2, %%T2, [rel mask_values + 10*64] - vprord %%T3, %%W0, 11 - vpandd %%T3, %%T3, [rel mask_values + 11*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 16 - vpandd %%T1, %%T1, [rel mask_values + 12*64] - vprord %%T2, %%W0, 22 - vpandd %%T2, %%T2, [rel mask_values + 13*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 19 - vpandd %%T2, %%T2, [rel mask_values + 14*64] - vprord %%T3, %%W0, 10 - vpandd %%T3, %%T3, [rel mask_values + 15*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 9 - vpandd %%T1, %%T1, [rel mask_values + 16*64] - vprord %%T2, %%W0, 13 - vpandd %%T2, %%T2, [rel mask_values + 17*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 25 - vpandd %%T2, %%T2, [rel mask_values + 18*64] - vpord %%T1, %%T1, %%T2 - vpord %%W0, %%T0, %%T1 -%endmacro - -;;; =========================================================================== -;;; E PHASE -;;; =========================================================================== -;;; -;;; Expands 16x32-bit words into 16x48-bit words -;;; plus XOR's result with the key schedule. -;;; The output is adjusted to be friendly as S phase input. -;;; -;;; in [in] - zmm register -;;; out0a [out] - zmm register -;;; out0b [out] - zmm register -;;; out1a [out] - zmm register -;;; out1b [out] - zmm register -;;; k0 [in] - key schedule; zmm or m512 -;;; k1 [in] - key schedule; zmm or m512 -;;; t0-t1 [clobbered] - temporary zmm register -%macro E_PHASE 9 -%define %%IN %1 -%define %%OUT0A %2 -%define %%OUT0B %3 -%define %%OUT1A %4 -%define %%OUT1B %5 -%define %%K0 %6 -%define %%K1 %7 -%define %%T0 %8 -%define %%T1 %9 - - vprord %%T0, %%IN, 31 - vprord %%T1, %%IN, 3 - vpshufb %%T0, %%T0, [rel idx_e] - vpshufb %%T1, %%T1, [rel idx_e] - vpunpcklbw %%OUT0A, %%T0, %%T1 - vpunpckhbw %%OUT1A, %%T0, %%T1 - vpxord %%OUT0A, %%OUT0A, %%K0 - vpxord %%OUT1A, %%OUT1A, %%K1 - vpandd %%OUT0B, %%OUT0A, [rel and_eu] - vpsrlw %%OUT0B, %%OUT0B, 8 - vpandd %%OUT0A, %%OUT0A, [rel and_ed] - vpandd %%OUT1B, %%OUT1A, [rel and_eu] - vpsrlw %%OUT1B, %%OUT1B, 8 - vpandd %%OUT1A, %%OUT1A, [rel and_ed] -%endmacro - -;;; =========================================================================== -;;; S-BOX -;;; =========================================================================== -;;; -;;; NOTE: clobbers k1-k6 OpMask registers -;;; -;;; IN0A [in] - zmm register; output from E-phase -;;; IN0B [in] - zmm register; output from E-phase -;;; IN1A [in] - zmm register; output from E-phase -;;; IN1B [in] - zmm register; output from E-phase -;;; OUT [out] - zmm register; output from E-phase -;;; T0-T5 [clobbered] - temporary zmm register -%macro S_PHASE 11 -%define %%IN0A %1 -%define %%IN0B %2 -%define %%IN1A %3 -%define %%IN1B %4 -%define %%OUT %5 -%define %%T0 %6 -%define %%T1 %7 -%define %%T2 %8 -%define %%T3 %9 -%define %%T4 %10 -%define %%T5 %11 - - vmovdqa64 %%T0, [rel reg_values16bit_7] - vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE - vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE - vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE - vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE - - mov DWORD(IA0), 0x55555555 - kmovd k1, DWORD(IA0) - mov DWORD(IA0), 0xaaaaaaaa - kmovd k2, DWORD(IA0) - - vmovdqa64 %%T0, [rel S_box_flipped + 0*64] - 
vmovdqa64 %%T1, [rel S_box_flipped + 1*64] - vmovdqa64 %%T2, [rel S_box_flipped + 4*64] - vmovdqa64 %%T3, [rel S_box_flipped + 5*64] - vpermw %%T0{k1}{z}, %%IN0A, %%T0 - vpermw %%T1{k1}{z}, %%IN0A, %%T1 - vpermw %%T2{k2}{z}, %%IN0A, %%T2 - vpermw %%T3{k2}{z}, %%IN0A, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%OUT, %%T1, %%T3 - vmovdqu16 %%OUT{k3}, %%T0 - - vmovdqa64 %%T0, [rel S_box_flipped + 2*64] - vmovdqa64 %%T1, [rel S_box_flipped + 3*64] - vmovdqa64 %%T2, [rel S_box_flipped + 6*64] - vmovdqa64 %%T3, [rel S_box_flipped + 7*64] - vpermw %%T0{k1}{z}, %%IN0B, %%T0 - vpermw %%T1{k1}{z}, %%IN0B, %%T1 - vpermw %%T2{k2}{z}, %%IN0B, %%T2 - vpermw %%T3{k2}{z}, %%IN0B, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T3, %%T1, %%T3 - vmovdqu16 %%T3{k4}, %%T0 - vpsllw %%T3, %%T3, 4 - vpxord %%OUT, %%OUT, %%T3 - - vmovdqa64 %%T0, [rel S_box_flipped + 8*64] - vmovdqa64 %%T1, [rel S_box_flipped + 9*64] - vmovdqa64 %%T2, [rel S_box_flipped + 12*64] - vmovdqa64 %%T3, [rel S_box_flipped + 13*64] - vpermw %%T0{k1}{z}, %%IN1A, %%T0 - vpermw %%T1{k1}{z}, %%IN1A, %%T1 - vpermw %%T2{k2}{z}, %%IN1A, %%T2 - vpermw %%T3{k2}{z}, %%IN1A, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T4, %%T1, %%T3 - vmovdqu16 %%T4{k5}, %%T0 - - vmovdqa64 %%T0, [rel S_box_flipped + 10*64] - vmovdqa64 %%T1, [rel S_box_flipped + 11*64] - vmovdqa64 %%T2, [rel S_box_flipped + 14*64] - vmovdqa64 %%T3, [rel S_box_flipped + 15*64] - vpermw %%T0{k1}{z}, %%IN1B, %%T0 - vpermw %%T1{k1}{z}, %%IN1B, %%T1 - vpermw %%T2{k2}{z}, %%IN1B, %%T2 - vpermw %%T3{k2}{z}, %%IN1B, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T5, %%T1, %%T3 - vmovdqu16 %%T5{k6}, %%T0 - vpsllw %%T5, %%T5, 4 - - vpxord %%T4, %%T4, %%T5 - vpsllw %%T4, %%T4, 8 - vpxord %%OUT, %%OUT, %%T4 - vpshufb %%OUT, %%OUT, [rel shuffle_reg] -%endmacro - -;;; =========================================================================== -;;; DES encryption/decryption round -;;; =========================================================================== -;;; -;;; Clobbers k1-k6 OpMask registers -;;; -;;; ENC_DEC [in] - ENC for encryption, DEC for decryption -;;; R [in/out] - zmm register; plain text in & cipher text out -;;; L [in/out] - zmm register; plain text in & cipher text out -;;; KS [in] - pointer to the key schedule -;;; T0-T11 [clobbered] - temporary zmm register -%macro DES_ENC_DEC 16 -%define %%ENC_DEC %1 -%define %%R %2 -%define %%L %3 -%define %%KS %4 -%define %%T0 %5 -%define %%T1 %6 -%define %%T2 %7 -%define %%T3 %8 -%define %%T4 %9 -%define %%T5 %10 -%define %%T6 %11 -%define %%T7 %12 -%define %%T8 %13 -%define %%T9 %14 -%define %%T10 %15 -%define %%T11 %16 - - IP_Z %%R, %%L, %%T0 - -%ifidn %%ENC_DEC, ENC - ;; ENCRYPTION - xor KSOFFSET, KSOFFSET -%%_des_enc_loop: - E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%L, %%L, %%T0 - - E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%R, %%R, %%T0 - - add KSOFFSET, (4*64) - cmp KSOFFSET, (8*(4*64)) - jb %%_des_enc_loop - -%else - ;; DECRYPTION - mov KSOFFSET, (8*(4*64)) -%%_des_dec_loop: - E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, 
%%T2, %%T3, %%T4 - vpxord %%L, %%L, %%T0 - - E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%R, %%R, %%T0 - sub KSOFFSET, (4*64) - jnz %%_des_dec_loop -%endif ; DECRYPTION - - FP_Z %%R, %%L, %%T0 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA INPUT -;;; =========================================================================== -;;; -;;; IN00 - IN15 [in/out]: -;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data -;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15 -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_IN 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN00, %%IN01 - vpunpckhdq %%K1, %%IN00, %%IN01 - vpunpckldq %%T0, %%IN02, %%IN03 - vpunpckhdq %%T1, %%IN02, %%IN03 - - vpunpckldq %%IN00, %%IN04, %%IN05 - vpunpckhdq %%IN01, %%IN04, %%IN05 - vpunpckldq %%IN02, %%IN06, %%IN07 - vpunpckhdq %%IN03, %%IN06, %%IN07 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN08, %%IN09 - vpunpckhdq %%K5, %%IN08, %%IN09 - vpunpckldq %%IN04, %%IN10, %%IN11 - vpunpckhdq %%IN05, %%IN10, %%IN11 - vpunpckldq %%IN06, %%IN12, %%IN13 - vpunpckhdq %%IN07, %%IN12, %%IN13 - vpunpckldq %%IN10, %%IN14, %%IN15 - vpunpckhdq %%IN11, %%IN14, %%IN15 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 - vshufi64x2 %%H3, %%IN14, %%IN02, 0xee - vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 - 
vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T1, 0x44 - vshufi64x2 %%H1, %%T3, %%T1, 0xee - vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 - vshufi64x2 %%H3, %%IN15, %%IN03, 0xee - vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 - vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 - vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA OUTPUT -;;; =========================================================================== -;;; -;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: -;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15 -;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_OUT 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN01, %%IN00 - vpunpckhdq %%K1, %%IN01, %%IN00 - vpunpckldq %%T0, %%IN03, %%IN02 - vpunpckhdq %%T1, %%IN03, %%IN02 - - vpunpckldq %%IN00, %%IN05, %%IN04 - vpunpckhdq %%IN01, %%IN05, %%IN04 - vpunpckldq %%IN02, %%IN07, %%IN06 - vpunpckhdq %%IN03, %%IN07, %%IN06 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN09, %%IN08 - vpunpckhdq %%K5, %%IN09, %%IN08 - vpunpckldq %%IN04, %%IN11, %%IN10 - vpunpckhdq %%IN05, %%IN11, %%IN10 - vpunpckldq %%IN06, %%IN13, %%IN12 - vpunpckhdq %%IN07, %%IN13, %%IN12 - vpunpckldq %%IN10, %%IN15, %%IN14 - vpunpckhdq %%IN11, %%IN15, %%IN14 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 
-                vshufi64x2      %%H2, %%IN14, %%IN02, 0x44
-                vshufi64x2      %%H3, %%IN14, %%IN02, 0xee
-                vshufi64x2      %%IN02, %%H0, %%H2, 0x88 ; R1
-                vshufi64x2      %%IN06, %%H0, %%H2, 0xdd ; R3
-                vshufi64x2      %%IN10, %%H1, %%H3, 0x88 ; R5
-                vshufi64x2      %%IN14, %%H1, %%H3, 0xdd ; R7
-
-                vshufi64x2      %%H0, %%T3, %%T1, 0x44
-                vshufi64x2      %%H1, %%T3, %%T1, 0xee
-                vshufi64x2      %%H2, %%IN15, %%IN03, 0x44
-                vshufi64x2      %%H3, %%IN15, %%IN03, 0xee
-                vshufi64x2      %%IN03, %%H0, %%H2, 0x88 ; L1
-                vshufi64x2      %%IN07, %%H0, %%H2, 0xdd ; L3
-                vshufi64x2      %%IN11, %%H1, %%H3, 0x88 ; L5
-                vshufi64x2      %%IN15, %%H1, %%H3, 0xdd ; L7
-%endmacro
-
-;;; ===========================================================================
-;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT
-;;; ===========================================================================
-;;;
-;;; IN00-IN15 / R0/L0-R7/L7 [in/out]:
-;;;        in:  IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data
-;;;        out: R0 - 16 x word0, L0 - 16 x word1
-;;; T0,T2 [clobbered] - temporary zmm registers
-;;; K0-K4 [clobbered] - temporary zmm registers
-;;; H0,H2 [clobbered] - temporary zmm registers
-%macro TRANSPOSE_IN_ONE 24
-%define %%IN00 %1 ; R0
-%define %%IN01 %2 ; L0
-%define %%IN02 %3 ; R1
-%define %%IN03 %4 ; L1
-%define %%IN04 %5 ; R2
-%define %%IN05 %6 ; L2
-%define %%IN06 %7 ; R3
-%define %%IN07 %8 ; L3
-%define %%IN08 %9 ; R4
-%define %%IN09 %10 ; L4
-%define %%IN10 %11 ; R5
-%define %%IN11 %12 ; L5
-%define %%IN12 %13 ; R6
-%define %%IN13 %14 ; L6
-%define %%IN14 %15 ; R7
-%define %%IN15 %16 ; L7
-%define %%T0 %17
-%define %%T2 %18
-%define %%K0 %19
-%define %%K1 %20
-%define %%K2 %21
-%define %%K4 %22
-%define %%H0 %23
-%define %%H2 %24
-
-                vpunpckldq      %%K0, %%IN00, %%IN01
-                vpunpckhdq      %%K1, %%IN00, %%IN01
-                vpunpckldq      %%T0, %%IN02, %%IN03
-
-                vpunpckldq      %%IN00, %%IN04, %%IN05
-                vpunpckhdq      %%IN01, %%IN04, %%IN05
-                vpunpckldq      %%IN02, %%IN06, %%IN07
-
-                vpunpcklqdq     %%K2, %%K0, %%T0
-                vpunpckhqdq     %%T2, %%K0, %%T0
-
-                vpunpcklqdq     %%K0, %%IN00, %%IN02
-                vpunpckhqdq     %%K1, %%IN00, %%IN02
-
-                vpunpckldq      %%K4, %%IN08, %%IN09
-                vpunpckldq      %%IN04, %%IN10, %%IN11
-                vpunpckldq      %%IN06, %%IN12, %%IN13
-                vpunpckldq      %%IN10, %%IN14, %%IN15
-
-                vpunpcklqdq     %%IN12, %%K4, %%IN04
-                vpunpckhqdq     %%IN13, %%K4, %%IN04
-                vpunpcklqdq     %%IN00, %%IN06, %%IN10
-                vpunpckhqdq     %%IN01, %%IN06, %%IN10
-
-                vshufi64x2      %%H0, %%K2, %%K0, 0x44
-                vshufi64x2      %%H2, %%IN12, %%IN00, 0x44
-                vshufi64x2      %%IN00, %%H0, %%H2, 0x88 ; R0
-
-                vshufi64x2      %%H0, %%T2, %%K1, 0x44
-                vshufi64x2      %%H2, %%IN13, %%IN01, 0x44
-                vshufi64x2      %%IN01, %%H0, %%H2, 0x88 ; L0
-%endmacro
-
-;;; ===========================================================================
-;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT
-;;; ===========================================================================
-;;;
-;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]:
-;;;        in:  R0 - 16 x word0, L0 - 16 x word1
-;;;        out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data
-;;; T0-T3 [clobbered] - temporary zmm registers
-;;; K0-K3 [clobbered] - temporary zmm registers
-;;; H0,H1 [clobbered] - temporary zmm registers
-%macro TRANSPOSE_OUT_ONE 25
-%define %%IN00 %1 ; R0
-%define %%IN01 %2 ; L0
-%define %%IN02 %3 ; R1
-%define %%IN03 %4 ; L1
-%define %%IN04 %5 ; R2
-%define %%IN05 %6 ; L2
-%define %%IN06 %7 ; R3
-%define %%IN07 %8 ; L3
-%define %%IN08 %9 ; R4
-%define %%IN09 %10 ; L4
-%define %%IN10 %11 ; R5
-%define %%IN11 %12 ; L5
-%define %%IN12 %13 ; R6
-%define %%IN13 %14 ; L6
-%define %%IN14 %15 ; R7
-%define %%IN15 %16 ; L7
-%define %%T0 %17
-%define %%T2 %18
-%define %%T3 %19
-%define %%K0 %20
-%define %%K1 %21
-%define %%K2 %22
-%define %%K3 %23
-%define %%H0 %24
-%define %%H1 %25
-
-                vpxord          %%T0, %%T0, %%T0
-
-                vpunpckldq      %%K0, %%IN01, %%IN00
-                vpunpckhdq      %%K1, %%IN01, %%IN00
-
-                vpunpcklqdq     %%K2, %%K0, %%T0
-                vpunpckhqdq     %%T2, %%K0, %%T0
-                vpunpcklqdq     %%K3, %%K1, %%T0
-                vpunpckhqdq     %%T3, %%K1, %%T0
-
-                vshufi64x2      %%H0, %%K2, %%T0, 0x44
-                vshufi64x2      %%H1, %%K2, %%T0, 0xee
-                vshufi64x2      %%IN00, %%H0, %%T0, 0x88 ; R0
-                vshufi64x2      %%IN04, %%H0, %%T0, 0xdd ; R2
-                vshufi64x2      %%IN08, %%H1, %%T0, 0x88 ; R4
-                vshufi64x2      %%IN12, %%H1, %%T0, 0xdd ; R6
-
-                vshufi64x2      %%H0, %%T2, %%T0, 0x44
-                vshufi64x2      %%H1, %%T2, %%T0, 0xee
-                vshufi64x2      %%IN01, %%H0, %%T0, 0x88 ; L0
-                vshufi64x2      %%IN05, %%H0, %%T0, 0xdd ; L2
-                vshufi64x2      %%IN09, %%H1, %%T0, 0x88 ; L4
-                vshufi64x2      %%IN13, %%H1, %%T0, 0xdd ; L6
-
-                vshufi64x2      %%H0, %%K3, %%T0, 0x44
-                vshufi64x2      %%H1, %%K3, %%T0, 0xee
-                vshufi64x2      %%IN02, %%H0, %%T0, 0x88 ; R1
-                vshufi64x2      %%IN06, %%H0, %%T0, 0xdd ; R3
-                vshufi64x2      %%IN10, %%H1, %%T0, 0x88 ; R5
-                vshufi64x2      %%IN14, %%H1, %%T0, 0xdd ; R7
-
-                vshufi64x2      %%H0, %%T3, %%T0, 0x44
-                vshufi64x2      %%H1, %%T3, %%T0, 0xee
-                vshufi64x2      %%IN03, %%H0, %%T0, 0x88 ; L1
-                vshufi64x2      %%IN07, %%H0, %%T0, 0xdd ; L3
-                vshufi64x2      %%IN11, %%H1, %%T0, 0x88 ; L5
-                vshufi64x2      %%IN15, %%H1, %%T0, 0xdd ; L7
-%endmacro
-
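The four macros above are all instances of the same 16x16 32-bit word transpose between the per-lane layout and the word-sliced R0/L0..R7/L7 layout (the ONE variants only materialize word0/word1). As a reference point only, a scalar C model of the full case; the AVX512 code reaches the same result with the unpack/shuffle trees:

    #include <stdint.h>

    /* out[w][lane] = in[lane][w]; rows of 'out' are the 16-lane word
     * vectors (R0, L0, R1, L1, ...) that the DES rounds consume. */
    static void transpose_16x16(uint32_t out[16][16],
                                const uint32_t in[16][16])
    {
            for (int lane = 0; lane < 16; lane++)
                    for (int w = 0; w < 16; w++)
                            out[w][lane] = in[lane][w];
    }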
-;;; ===========================================================================
-;;; DES INITIALIZATION
-;;; key schedule transposition and IV set up
-;;; ===========================================================================
-;;;
-;;; STATE_KEYS [in] - KEYS in DES OOO STATE
-;;; STATE_IV [ in] - IV in DES OOO STATE
-;;; KS [out] - place to store transposed key schedule or NULL
-;;; IV0 [out] - r512; initialization vector
-;;; IV1 [out] - r512; initialization vector
-;;; T0-T27 [clobbered] - temporary r512
-%macro DES_INIT 33
-%define %%STATE_KEYS %1
-%define %%STATE_IV %2
-%define %%KS %3
-%define %%IV0 %4
-%define %%IV1 %5
-%define %%T0 %6
-%define %%T1 %7
-%define %%T2 %8
-%define %%T3 %9
-%define %%T4 %10
-%define %%T5 %11
-%define %%T6 %12
-%define %%T7 %13
-%define %%T8 %14
-%define %%T9 %15
-%define %%T10 %16
-%define %%T11 %17
-%define %%T12 %18
-%define %%T13 %19
-%define %%T14 %20
-%define %%T15 %21
-%define %%T16 %22
-%define %%T17 %23
-%define %%T18 %24
-%define %%T19 %25
-%define %%T20 %26
-%define %%T21 %27
-%define %%T22 %28
-%define %%T23 %29
-%define %%T24 %30
-%define %%T25 %31
-%define %%T26 %32
-%define %%T27 %33
-
-                ;; set up the key schedule
-                ;; - load first half of the keys & transpose
-                ;; - transpose and store
-                ;; note: we can use IV registers as temporary ones here
-%assign IDX 0
-%rep 16
-                mov             IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
-                vmovdqu64       %%T %+ IDX, [IA0]
-%assign IDX (IDX + 1)
-%endrep
-                TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
-%assign IDX 0
-%rep 16
-                vmovdqu64       [%%KS + (IDX * 64)], %%T %+ IDX
-%assign IDX (IDX + 1)
-%endrep
-                ;; - load second half of the keys & transpose
-                ;; - transpose and store
-                ;; note: we can use IV registers as temporary ones here
-%assign IDX 0
-%rep 16
-                mov             IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
-                vmovdqu64       %%T %+ IDX, [IA0 + 64]
-%assign IDX (IDX + 1)
-%endrep
-                TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
-%assign IDX 0
-%rep 16
-                vmovdqu64       [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX
-%assign IDX (IDX + 1)
-%endrep
-
-                ;; set up IV
-                ;; - they are already kept transposed so this is enough to load them
-                vmovdqu64       %%IV0, [%%STATE_IV + (0 * 64)]
-                vmovdqu64       %%IV1, [%%STATE_IV + (1 * 64)]
-%endmacro
-
-;;; ===========================================================================
-;;; 3DES INITIALIZATION
-;;; key schedule transposition and IV set up
-;;; ===========================================================================
-;;;
-;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE
-;;; STATE_IV [ in] - IV in 3DES OOO STATE
-;;; KS1 [out] - place to store transposed key schedule or NULL
-;;; KS2 [out] - place to store transposed key schedule or NULL
-;;; KS3 [out] - place to store transposed key schedule or NULL
-;;; IV0 [out] - r512; initialization vector
-;;; IV1 [out] - r512; initialization vector
-;;; T0-T27 [clobbered] - temporary r512
-;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec)
-%macro DES3_INIT 36
-%define %%STATE_KEYS %1
-%define %%STATE_IV %2
-%define %%KS1 %3
-%define %%KS2 %4
-%define %%KS3 %5
-%define %%IV0 %6
-%define %%IV1 %7
-%define %%T0 %8
-%define %%T1 %9
-%define %%T2 %10
-%define %%T3 %11
-%define %%T4 %12
-%define %%T5 %13
-%define %%T6 %14
-%define %%T7 %15
-%define %%T8 %16
-%define %%T9 %17
-%define %%T10 %18
-%define %%T11 %19
-%define %%T12 %20
-%define %%T13 %21
-%define %%T14 %22
-%define %%T15 %23
-%define %%T16 %24
-%define %%T17 %25
-%define %%T18 %26
-%define %%T19 %27
-%define %%T20 %28
-%define %%T21 %29
-%define %%T22 %30
-%define %%T23 %31
-%define %%T24 %32
-%define %%T25 %33
-%define %%T26 %34
-%define %%T27 %35
-%define %%DIR %36
-
-%ifidn %%DIR, ENC
-%assign KEY_IDX 0
-%else
-%assign KEY_IDX 2
-%endif
-%assign KS_IDX 1
-
-%rep 3
-                ;; set up the key schedule
-                ;; - load first half of the keys & transpose
-                ;; - transpose and store
-                ;; note: we can use IV registers as temporary ones here
-
-%assign IDX 0
-%rep 16
-                mov             IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
-                mov             IA0, [IA0 + (KEY_IDX * PTR_SZ)]
-                vmovdqu64       %%T %+ IDX, [IA0]
-%assign IDX (IDX + 1)
-%endrep
-                TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
-%assign IDX 0
-%rep 16
-                vmovdqu64       [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX
-%assign IDX (IDX + 1)
-%endrep
-                ;; - load second half of the keys & transpose
-                ;; - transpose and store
-                ;; note: we can use IV registers as temporary ones here
-%assign IDX 0
-%rep 16
-                mov             IA0, [%%STATE_KEYS + (IDX*PTR_SZ)]
-                mov             IA0, [IA0 + (KEY_IDX * PTR_SZ)]
-                vmovdqu64       %%T %+ IDX, [IA0 + 64]
-%assign IDX (IDX + 1)
-%endrep
-                TRANSPOSE_IN    %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1
-%assign IDX 0
-%rep 16
-                vmovdqu64       [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX
-%assign IDX (IDX + 1)
-%endrep
-
-%ifidn %%DIR, ENC
-%assign KEY_IDX (KEY_IDX + 1)
-%else
-%assign KEY_IDX (KEY_IDX - 1)
-%endif
-%assign KS_IDX (KS_IDX + 1)
-%endrep ; KEY_IDX / KS_IDX
-
-                ;; set up IV
-                ;; - they are already kept transposed so this is enough to load them
-                vmovdqu64       %%IV0, [%%STATE_IV + (0 * 64)]
-                vmovdqu64       %%IV1, [%%STATE_IV + (1 * 64)]
-
-%endmacro
-
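The KEY_IDX arithmetic above only decides which of a lane's three keys feeds each transposed schedule: EDE encryption consumes them in order 1, 2, 3, while decryption consumes them in order 3, 2, 1. A hedged C sketch of just that selection, assuming a lane carries an array of three schedule pointers:

    /* order[0..2] feed the first, second and third 3DES pass */
    static void pick_3des_order(const void *keys[3], const void *order[3],
                                int decrypt)
    {
            for (int i = 0; i < 3; i++)
                    order[i] = decrypt ? keys[2 - i] : keys[i];
    }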
-;;; ===========================================================================
-;;; DES FINISH
-;;; Update in/out pointers and store IV
-;;; ===========================================================================
-;;;
-;;; Needs: STATE & SIZE
-;;; IV0 [in] - r512; initialization vector
-;;; IV1 [in] - r512; initialization vector
-;;; T0-T4 [clobbered] - temporary r512 registers
-%macro DES_FINISH 7
-%define %%IV0 %1
-%define %%IV1 %2
-%define %%T0 %3
-%define %%T1 %4
-%define %%T2 %5
-%define %%T3 %6
-%define %%T4 %7
-
-                vpbroadcastq    %%T4, SIZE
-                vmovdqu64       %%T0, [STATE + _des_args_in + (0 * PTR_SZ)]
-                vmovdqu64       %%T1, [STATE + _des_args_in + (8 * PTR_SZ)]
-                vmovdqu64       %%T2, [STATE + _des_args_out + (0 * PTR_SZ)]
-                vmovdqu64       %%T3, [STATE + _des_args_out + (8 * PTR_SZ)]
-                vpaddq          %%T0, %%T0, %%T4
-                vpaddq          %%T1, %%T1, %%T4
-                vpaddq          %%T2, %%T2, %%T4
-                vpaddq          %%T3, %%T3, %%T4
-                vmovdqu64       [STATE + _des_args_in + (0 * PTR_SZ)], %%T0
-                vmovdqu64       [STATE + _des_args_in + (8 * PTR_SZ)], %%T1
-                vmovdqu64       [STATE + _des_args_out + (0 * PTR_SZ)], %%T2
-                vmovdqu64       [STATE + _des_args_out + (8 * PTR_SZ)], %%T3
-
-                vmovdqu64       [STATE + _des_args_IV + (0 * 64)], %%IV0
-                vmovdqu64       [STATE + _des_args_IV + (1 * 64)], %%IV1
-%endmacro
-
-;;; ===========================================================================
-;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
-;;; ===========================================================================
-;;;
-;;; Needs: STATE, IA0-IA2
-;;; ENC_DEC [in] - encyrpt (ENC) or decrypt (DEC) selection
-;;; KS [in] - key schedule
-;;; T0-T24 [clobbered] - temporary r512
-;;; T_IN [in] - 16 * 8 byte storage
-;;; T_OUT [in] - 16 * 8 byte storage
-;;; T_MASK [in] - 16 * 4 byte storage
-;;; T_IV [in] - 16 * 8 byte storage
-;;;
-;;; NOTE: clobbers OpMask registers
-%macro DES_CFB_ONE 31
-%define %%ENC_DEC %1
-%define %%KS %2
-%define %%T0 %3
-%define %%T1 %4
-%define %%T2 %5
-%define %%T3 %6
-%define %%T4 %7
-%define %%T5 %8
-%define %%T6 %9
-%define %%T7 %10
-%define %%T8 %11
-%define %%T9 %12
-%define %%T10 %13
-%define %%T11 %14
-%define %%T12 %15
-%define %%T13 %16
-%define %%T14 %17
-%define %%T15 %18
-%define %%T16 %19
-%define %%T17 %20
-%define %%T18 %21
-%define %%T19 %22
-%define %%T20 %23
-%define %%T21 %24
-%define %%T22 %25
-%define %%T23 %26
-%define %%T24 %27
-%define %%T_IN %28
-%define %%T_OUT %29
-%define %%T_IV %30
-%define %%T_MASK %31
-
-                ;; - find mask for non-zero partial lengths
-                vpxord          %%T10, %%T10, %%T10
-                vmovdqu64       %%T0, [STATE + _des_args_PLen]
-                vpcmpd          k3, %%T0, %%T10, 4 ; NEQ
-                kmovw           DWORD(IA0), k3
-                movzx           DWORD(IA0), WORD(IA0)
-                or              DWORD(IA0), DWORD(IA0)
-                jz              %%_des_cfb_one_end ; no non-zero partial lengths
-
-%ifidn %%ENC_DEC, ENC
-                ;; For encyrption case we need to make sure that
-                ;; all full blocks are complete before proceeding
-                ;; with CFB partial block.
-                ;; To do that current out position is compared against
-                ;; calculated last full block position.
-                vmovdqu64       %%T1, [STATE + _des_args_out + (0*8)]
-                vmovdqu64       %%T2, [STATE + _des_args_LOut + (0*8)]
-                vmovdqu64       %%T3, [STATE + _des_args_out + (8*8)]
-                vmovdqu64       %%T4, [STATE + _des_args_LOut + (8*8)]
-                vpcmpq          k4, %%T1, %%T2, 0 ; EQ
-                vpcmpq          k5, %%T3, %%T4, 0 ; EQ
-                kmovw           DWORD(IA1), k4
-                movzx           DWORD(IA1), BYTE(IA1)
-                kmovw           DWORD(IA2), k5
-                movzx           DWORD(IA2), BYTE(IA2)
-                shl             DWORD(IA2), 8
-                or              DWORD(IA2), DWORD(IA1)
-                and             DWORD(IA0), DWORD(IA2)
-                jz              %%_des_cfb_one_end ; no non-zero lengths left
-                kmovw           k3, DWORD(IA0)
-%endif
-                ;; Calculate ((1 << partial_bytes) - 1)
-                ;; in order to get the mask for loads and stores
-                ;; k3 & IA0 - hold valid mask
-                vmovdqa64       %%T1, [rel vec_ones_32b]
-                vpsllvd         %%T2{k3}{z}, %%T1, %%T0
-                vpsubd          %%T2{k3}{z}, %%T2, %%T1
-                vmovdqu64       [%%T_MASK], %%T2
-
-                ;; clear selected partial lens not to do them twice
-                vmovdqu32       [STATE + _des_args_PLen]{k3}, %%T10
-
-                ;; copy IV, in and out pointers
-                vmovdqu64       %%T1, [STATE + _des_args_in + (0*PTR_SZ)]
-                vmovdqu64       %%T2, [STATE + _des_args_in + (8*PTR_SZ)]
-                vmovdqu64       %%T3, [STATE + _des_args_out + (0*PTR_SZ)]
-                vmovdqu64       %%T4, [STATE + _des_args_out + (8*PTR_SZ)]
-                vmovdqu64       %%T5, [STATE + _des_args_IV + (0*64)]
-                vmovdqu64       %%T6, [STATE + _des_args_IV + (1*64)]
-                vmovdqu64       [%%T_IN + (0*PTR_SZ)], %%T1
-                vmovdqu64       [%%T_IN + (8*PTR_SZ)], %%T2
-                vmovdqu64       [%%T_OUT + (0*PTR_SZ)], %%T3
-                vmovdqu64       [%%T_OUT + (8*PTR_SZ)], %%T4
-                vmovdqu64       [%%T_IV + (0*64)], %%T5
-                vmovdqu64       [%%T_IV + (1*64)], %%T6
-
-                ;; calculate last block case mask
-                ;; - first block case requires no modifications to in/out/IV
-                vmovdqu64       %%T1, [STATE + _des_args_BLen]
-                vpcmpd          k2, %%T1, %%T10, 4 ; NEQ
-                kmovw           DWORD(IA1), k2
-                and             DWORD(IA1), DWORD(IA0)
-                jz              %%_des_cfb_one_no_last_blocks
-
-                ;; set up IV, in and out for the last block case
-                ;; - Last block needs in and out to be set differently (decryption only)
-                ;; - IA1 holds the last block mask
-%ifidn %%ENC_DEC, DEC
-                mov             DWORD(IA0), DWORD(IA1)
-                mov             DWORD(IA2), DWORD(IA1)
-                shr             DWORD(IA1), 8
-                and             DWORD(IA2), 0xff
-                kmovw           k4, DWORD(IA2)
-                kmovw           k5, DWORD(IA1)
-                vmovdqu64       %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)]
-                vmovdqu64       %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)]
-                vmovdqu64       %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)]
-                vmovdqu64       %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)]
-                vmovdqu64       [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1
-                vmovdqu64       [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2
-                vmovdqu64       [%%T_IN + (0*PTR_SZ)]{k4}, %%T3
-                vmovdqu64       [%%T_IN + (8*PTR_SZ)]{k5}, %%T4
-%endif ; decryption
-                ;; - IV has to be set differently for CFB as well
-                ;; - IA0 holds the last block mask
-%assign IDX 0
-%rep 16
-                test            DWORD(IA0), (1 << IDX)
-                jz              %%_des_cfb_one_copy_iv_next %+ IDX
-%ifidn %%ENC_DEC, ENC
-                mov             IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)]
-%else
-                mov             IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)]
-%endif
-                mov             IA2, [IA2 - 8]
-                mov             [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2)
-                shr             IA2, 32
-                mov             [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2)
-%%_des_cfb_one_copy_iv_next %+ IDX:
-%assign IDX (IDX + 1)
-%endrep
-
-%%_des_cfb_one_no_last_blocks:
-                ;; Uffff ... finally let's do some DES CFB
-                ;; - let's use T_IN, T_OUT, T_IV and T_MASK
-
-                ;; - load data with the corresponding masks & transpose
-                ;; - T0 to T15 will hold the data
-                xor             IA0, IA0
-%assign IDX 0
-%assign K_IDX 1
-%rep 16
-                mov             IA1, [%%T_IN + (IDX*PTR_SZ)]
-                mov             DWORD(IA0), [%%T_MASK + (IDX*4)]
-                kmovq           k %+ K_IDX, IA0
-                vmovdqu8        %%T %+ IDX{k %+ K_IDX}{z}, [IA1]
-%assign IDX (IDX + 1)
-%assign K_IDX (K_IDX + 1)
-%if K_IDX > 7
-%assign K_IDX 1 ; iterate through K1 to K7
-%endif
-%endrep
-                ;; - transpose the data in T0 to T15, T16 to T23 are clobbered
-                TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23
-
-                ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1
-                vmovdqu64       %%T16, [%%T_IV + (0 * 64)] ;IV0
-                vmovdqu64       %%T17, [%%T_IV + (1 * 64)] ;IV1
-                ;; DES encrypt
-                ;; - R0 - %%T0
-                ;; - L0 - %%T1
-                DES_ENC_DEC     ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13
-                ;; CFB style xor with R0/L0 with IV
-                ;; - IV0 - %%T16
-                ;; - IV1 - %%T17
-                vpxord          %%T2, %%T17, %%T0 ; R0 ^ IV1
-                vpxord          %%T0, %%T16, %%T1 ; L0 ^ IV0
-                vmovdqa64       %%T1, %%T2
-                ;; - new R0 = L0 ^ IV0 (%%T0)
-                ;; - new L0 = R0 ^ IV1 (%%T1)
-
-                ;; Transpose the data out
-                ;; - %%T2 to %%T24 clobbered
-                TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24
-
-                ;; Store the transposed data
-                ;; - T0 to T15 will hold the data
-                xor             IA0, IA0
-%assign IDX 0
-%assign K_IDX 1
-%rep 16
-                mov             IA1, [%%T_OUT + (IDX*PTR_SZ)]
-                mov             DWORD(IA0), [%%T_MASK + (IDX*4)]
-                kmovq           k %+ K_IDX, IA0
-                vmovdqu8        [IA1]{k %+ K_IDX}, %%T %+ IDX
-%assign IDX (IDX + 1)
-%assign K_IDX (K_IDX + 1)
-%if K_IDX > 7
-%assign K_IDX 1 ; iterate through K1 to K7
-%endif
-%endrep
-
-%ifdef SAFE_DATA
-                ;; Clear copied IV's
-                vpxorq          %%T5, %%T5
-                vmovdqu64       [%%T_IV + (0*64)], %%T5
-                vmovdqu64       [%%T_IV + (1*64)], %%T5
-%endif
-
-%%_des_cfb_one_end:
-
-%endmacro
-
-;;; ===========================================================================
-;;; Converts length into mask of DES blocks
-;;; ===========================================================================
-;;;
-;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64)
-;;; USES: IA0, IA1 IA2
-;;; ASSUMES: SIZE - OFFSET < 64
-%macro GET_MASK8 1
-%define %%MASK %1
-
-%ifidn IA1, rcx
-%define myrcx IA1
-%else
-%define myrcx rcx
-                mov             IA1, rcx
-%endif
-                mov             myrcx, SIZE
-                sub             myrcx, OFFSET
-                ;; - myrcx - remaining length
-                ;; - divide by 8 (DES block size)
-                ;; - create bit mask of the result
-                mov             DWORD(%%MASK), 1
-                shr             DWORD(myrcx), 3
-                shl             DWORD(%%MASK), BYTE(myrcx)
-                sub             DWORD(%%MASK), 1
-%ifnidn IA1, rcx
-                mov             rcx, IA1
-%endif
-%endmacro
-
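GET_MASK8 is equivalent to the following C, under the macro's own stated assumption that SIZE - OFFSET < 64 (one mask bit per whole 8-byte DES block left in the tail):

    #include <stdint.h>

    static uint8_t get_mask8(uint64_t size, uint64_t offset)
    {
            /* mov 1; shr by 3; shl; sub 1 -- as in the macro above */
            return (uint8_t)((1u << ((size - offset) >> 3)) - 1u);
    }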
-;;; ===========================================================================
-;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
-;;; ===========================================================================
-;;;
-;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
-;;; DES_KS [in] - pointer to transposed key schedule
-;;;
-;;; NOTE: clobbers OpMask registers
-;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
-%macro GEN_DES_ENC_CIPHER 2
-%define %%NUM_DES_BLOCKS %1
-%define %%DES_KS %2
-
-%assign RN 0
-%assign LN 1
-%assign RNN 2
-%assign LNN 3
-%rep %%NUM_DES_BLOCKS - 1
-                DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
-                vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
-%assign RN (RN + 2)
-%assign LN (LN + 2)
-%assign RNN (RNN + 2)
-%assign LNN (LNN + 2)
-%endrep
-                DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                vmovdqa64       ZIV0, ZW %+ LN ; IV0 = L7
-                vmovdqa64       ZIV1, ZW %+ RN ; IV1 = R7
-%endmacro
-
-;;; ===========================================================================
-;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
-;;; ===========================================================================
-;;;
-;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
-;;; DES_KS [in] - pointer to transposed key schedule
-;;;
-;;; NOTE: clobbers OpMask registers
-;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
-%macro GEN_DES_DEC_CIPHER 2
-%define %%NUM_DES_BLOCKS %1
-%define %%DES_KS %2
-
-%assign RN 0
-%assign LN 1
-%rep %%NUM_DES_BLOCKS
-                vmovdqa64       ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
-                vmovdqa64       ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
-                DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                vpxord          ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
-                vpxord          ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
-                vmovdqa64       ZIV0, ZTMP12
-                vmovdqa64       ZIV1, ZTMP13
-%assign RN (RN + 2)
-%assign LN (LN + 2)
-%endrep
-%endmacro
-
-;;; ===========================================================================
-;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only)
-;;; ===========================================================================
-;;;
-;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
-;;; DES_KS1 [in] - pointer to transposed key schedule 1
-;;; DES_KS2 [in] - pointer to transposed key schedule 2
-;;; DES_KS3 [in] - pointer to transposed key schedule 3
-;;;
-;;; NOTE: clobbers OpMask registers
-;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
-%macro GEN_3DES_ENC_CIPHER 4
-%define %%NUM_DES_BLOCKS %1
-%define %%DES_KS1 %2
-%define %%DES_KS2 %3
-%define %%DES_KS3 %4
-
-%assign RN 0
-%assign LN 1
-%assign RNN 2
-%assign LNN 3
-%rep %%NUM_DES_BLOCKS
-                ;; ENC
-                DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                ;; DEC
-                DES_ENC_DEC     DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                ;; ENC
-                DES_ENC_DEC     ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-%if (RNN < (%%NUM_DES_BLOCKS * 2))
-                vpxord          ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0
-                vpxord          ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0
-%else
-                vmovdqa64       ZIV0, ZW %+ LN ; IV0 = L7
-                vmovdqa64       ZIV1, ZW %+ RN ; IV1 = R7
-%endif
-
-%assign RN (RN + 2)
-%assign LN (LN + 2)
-%assign RNN (RNN + 2)
-%assign LNN (LNN + 2)
-%endrep
-
-%endmacro
-
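The ENC and DEC cipher macros in this family differ exactly where CBC itself does: encryption chains serially through the fresh ciphertext, while decryption keeps the original ciphertext as the next IV before XOR-ing, which is why it stays parallel-friendly. A scalar C sketch of both directions, with des16() as a dummy placeholder for the DES_ENC_DEC core:

    #include <stdint.h>

    typedef struct { uint32_t r, l; } desblk_t;

    /* identity placeholder for the 16-round core */
    static desblk_t des16(desblk_t b, int decrypt) { (void)decrypt; return b; }

    static void cbc(desblk_t *d, int n, desblk_t iv, int decrypt)
    {
            for (int i = 0; i < n; i++) {
                    if (decrypt) {
                            desblk_t ct = d[i];        /* saved as next IV */
                            d[i] = des16(ct, 1);
                            d[i].r ^= iv.r;
                            d[i].l ^= iv.l;
                            iv = ct;
                    } else {
                            d[i].r ^= iv.r;
                            d[i].l ^= iv.l;
                            d[i] = des16(d[i], 0);
                            iv = d[i];                 /* chain ciphertext */
                    }
            }
    }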
-;;; ===========================================================================
-;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only)
-;;; ===========================================================================
-;;;
-;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only
-;;; DES_KS1 [in] - pointer to transposed key schedule 1
-;;; DES_KS2 [in] - pointer to transposed key schedule 2
-;;; DES_KS3 [in] - pointer to transposed key schedule 3
-;;;
-;;; NOTE: clobbers OpMask registers
-;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1
-%macro GEN_3DES_DEC_CIPHER 4
-%define %%NUM_DES_BLOCKS %1
-%define %%DES_KS1 %2
-%define %%DES_KS2 %3
-%define %%DES_KS3 %4
-
-%assign RN 0
-%assign LN 1
-%rep %%NUM_DES_BLOCKS
-                vmovdqa64       ZTMP12, ZW %+ RN ; keep R0 as IV for the next round
-                vmovdqa64       ZTMP13, ZW %+ LN ; keep L0 as IV for the next round
-                ;; DEC
-                DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                ;; ENC
-                DES_ENC_DEC     ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                ;; DEC
-                DES_ENC_DEC     DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-                vpxord          ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1
-                vpxord          ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0
-                vmovdqa64       ZIV0, ZTMP12
-                vmovdqa64       ZIV1, ZTMP13
-
-%assign RN (RN + 2)
-%assign LN (LN + 2)
-%endrep
-
-%endmacro
-
-;;; ===========================================================================
-;;; DES CBC / DOCSIS DES ENCRYPT
-;;; ===========================================================================
-;;;
-;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
-;;;                   3DES (3DES CBC)
-;;;
-;;; NOTE: clobbers OpMask registers
-%macro GENERIC_DES_ENC 1
-%define %%DES_DOCSIS %1
-
-                ;; push the registers and allocate the stack frame
-                mov             rax, rsp
-                sub             rsp, STACKFRAME_size
-                and             rsp, -64
-                mov             [rsp + _rsp_save], rax ; original SP
-                mov             [rsp + _gpr_save + 0*8], r12
-                mov             [rsp + _gpr_save + 1*8], r13
-                mov             [rsp + _gpr_save + 2*8], r14
-                mov             [rsp + _gpr_save + 3*8], r15
-
-%ifnidn %%DES_DOCSIS, 3DES
-                ;; DES and DOCSIS DES
-                DES_INIT        STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-%else
-                ;; 3DES
-                DES3_INIT       STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC
-%endif
-                mov             [rsp + _size_save], SIZE
-                and             SIZE, -64
-                xor             OFFSET, OFFSET
-                ;; This loop processes message in blocks of 64 bytes.
-                ;; Anything smaller than 64 bytes is handled separately after the loop.
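The SIZE bookkeeping above is the usual split of the message into a 64-byte-aligned main loop plus a masked tail; roughly, in C:

    #include <stddef.h>

    static void split(size_t size)
    {
            size_t full = size & ~(size_t)63;   /* and SIZE, -64 */
            size_t off;

            for (off = 0; off < full; off += 64)
                    ; /* one full 64-byte stripe per lane */
            /* the remaining size - full bytes (< 64) take the masked path */
    }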
-%%_gen_des_enc_loop:
-                cmp             OFFSET, SIZE
-                jz              %%_gen_des_enc_loop_end
-                ;; run loads
-                mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
-                vmovdqu64       ZW0, [IA0 + OFFSET]
-                vmovdqu64       ZW1, [IA1 + OFFSET]
-                vmovdqu64       ZW2, [IA2 + OFFSET]
-                vmovdqu64       ZW3, [INP0 + OFFSET]
-                vmovdqu64       ZW4, [INP1 + OFFSET]
-                vmovdqu64       ZW5, [INP2 + OFFSET]
-                vmovdqu64       ZW6, [INP3 + OFFSET]
-                vmovdqu64       ZW7, [INP4 + OFFSET]
-
-                mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
-                vmovdqu64       ZW8, [IA0 + OFFSET]
-                vmovdqu64       ZW9, [IA1 + OFFSET]
-                vmovdqu64       ZW10, [IA2 + OFFSET]
-                vmovdqu64       ZW11, [INP0 + OFFSET]
-                vmovdqu64       ZW12, [INP1 + OFFSET]
-                vmovdqu64       ZW13, [INP2 + OFFSET]
-                vmovdqu64       ZW14, [INP3 + OFFSET]
-                vmovdqu64       ZW15, [INP4 + OFFSET]
-
-                ;; Transpose input
-                TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; DES CBC ENC comes here
-                vpxord          ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
-                vpxord          ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
-
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 8, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-
-                ;; transpose data on output
-                TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-                ;; run stores
-                mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET], ZW0
-                vmovdqu64       [IA1 + OFFSET], ZW1
-                vmovdqu64       [IA2 + OFFSET], ZW2
-                vmovdqu64       [INP0 + OFFSET], ZW3
-                vmovdqu64       [INP1 + OFFSET], ZW4
-                vmovdqu64       [INP2 + OFFSET], ZW5
-                vmovdqu64       [INP3 + OFFSET], ZW6
-                vmovdqu64       [INP4 + OFFSET], ZW7
-
-                mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET], ZW8
-                vmovdqu64       [IA1 + OFFSET], ZW9
-                vmovdqu64       [IA2 + OFFSET], ZW10
-                vmovdqu64       [INP0 + OFFSET], ZW11
-                vmovdqu64       [INP1 + OFFSET], ZW12
-                vmovdqu64       [INP2 + OFFSET], ZW13
-                vmovdqu64       [INP3 + OFFSET], ZW14
-                vmovdqu64       [INP4 + OFFSET], ZW15
-
-                add             OFFSET, 64
-                jmp             %%_gen_des_enc_loop
-%%_gen_des_enc_loop_end:
-                ;; This is where we check if there is anything less than 64 bytes
-                ;; of message left for processing.
-                mov             SIZE, [rsp + _size_save]
-                cmp             OFFSET, SIZE
-                jz              %%_gen_des_enc_part_end
-                ;; calculate min of bytes_left and 64, convert to qword mask
-                GET_MASK8       IA0 ; IA0 = mask
-
-                kmovw           k7, DWORD(IA0)
-                mov             [rsp + _mask_save], IA0
-                ;; run masked loads
-                mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
-                vmovdqu64       ZW0{k7}{z}, [IA0 + OFFSET]
-                vmovdqu64       ZW1{k7}{z}, [IA1 + OFFSET]
-                vmovdqu64       ZW2{k7}{z}, [IA2 + OFFSET]
-                vmovdqu64       ZW3{k7}{z}, [INP0 + OFFSET]
-                vmovdqu64       ZW4{k7}{z}, [INP1 + OFFSET]
-                vmovdqu64       ZW5{k7}{z}, [INP2 + OFFSET]
-                vmovdqu64       ZW6{k7}{z}, [INP3 + OFFSET]
-                vmovdqu64       ZW7{k7}{z}, [INP4 + OFFSET]
-
-                mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
-                vmovdqu64       ZW8{k7}{z}, [IA0 + OFFSET]
-                vmovdqu64       ZW9{k7}{z}, [IA1 + OFFSET]
-                vmovdqu64       ZW10{k7}{z}, [IA2 + OFFSET]
-                vmovdqu64       ZW11{k7}{z}, [INP0 + OFFSET]
-                vmovdqu64       ZW12{k7}{z}, [INP1 + OFFSET]
-                vmovdqu64       ZW13{k7}{z}, [INP2 + OFFSET]
-                vmovdqu64       ZW14{k7}{z}, [INP3 + OFFSET]
-                vmovdqu64       ZW15{k7}{z}, [INP4 + OFFSET]
-
-                ;; Transpose input
-                TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; DES CBC ENC comes here
-                vpxord          ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0
-                vpxord          ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1
-
-                mov             IA0, [rsp + _mask_save]
-                cmp             BYTE(IA0), 0x0f
-                ja              %%_gt_4
-                jz              %%_blocks_4
-
-                cmp             BYTE(IA0), 0x03
-                ja              %%_blocks_3
-                jz              %%_blocks_2
-
-                ;; process one block and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 1, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_2:
-                ;; process two blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 2, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_3:
-                ;; process three blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 3, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_4:
-                ;; process four blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 4, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_gt_4:
-                cmp             BYTE(IA0), 0x3f
-                ja              %%_blocks_7
-                jz              %%_blocks_6
-%%_blocks_5:
-                ;; process five blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 5, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_6:
-                ;; process six blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 6, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_7:
-                ;; process seven blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_ENC_CIPHER 7, rsp + _key_sched
-%else
-                GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-
-%%_transpose_out:
-                ;; transpose data on output
-                TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; run masked stores
-                mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET]{k7}, ZW0
-                vmovdqu64       [IA1 + OFFSET]{k7}, ZW1
-                vmovdqu64       [IA2 + OFFSET]{k7}, ZW2
-                vmovdqu64       [INP0 + OFFSET]{k7}, ZW3
-                vmovdqu64       [INP1 + OFFSET]{k7}, ZW4
-                vmovdqu64       [INP2 + OFFSET]{k7}, ZW5
-                vmovdqu64       [INP3 + OFFSET]{k7}, ZW6
-                vmovdqu64       [INP4 + OFFSET]{k7}, ZW7
-
-                mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET]{k7}, ZW8
-                vmovdqu64       [IA1 + OFFSET]{k7}, ZW9
-                vmovdqu64       [IA2 + OFFSET]{k7}, ZW10
-                vmovdqu64       [INP0 + OFFSET]{k7}, ZW11
-                vmovdqu64       [INP1 + OFFSET]{k7}, ZW12
-                vmovdqu64       [INP2 + OFFSET]{k7}, ZW13
-                vmovdqu64       [INP3 + OFFSET]{k7}, ZW14
-                vmovdqu64       [INP4 + OFFSET]{k7}, ZW15
-%%_gen_des_enc_part_end:
-
-                ;; store IV and update pointers
-                DES_FINISH      ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
-
-                ;; CFB part for DOCSIS
-%ifidn %%DES_DOCSIS, DOCSIS
-                DES_CFB_ONE     ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
-%endif
-
-                CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
-
-                ;; restore stack pointer and registers
-                mov             r12, [rsp + _gpr_save + 0*8]
-                mov             r13, [rsp + _gpr_save + 1*8]
-                mov             r14, [rsp + _gpr_save + 2*8]
-                mov             r15, [rsp + _gpr_save + 3*8]
-                mov             rsp, [rsp + _rsp_save] ; original SP
-
-%ifdef SAFE_DATA
-                clear_all_zmms_asm
-%else
-                vzeroupper
-%endif ;; SAFE_DATA
-
-%endmacro
-
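The compare ladder against 0x03/0x0f/0x3f above recovers the tail block count from the GET_MASK8 mask, which is always of the form (1 << n) - 1. In C terms it is simply a population count of the mask:

    /* mask is a contiguous low mask, so popcount == block count */
    static int blocks_from_mask(unsigned mask)
    {
            int n = 0;

            while (mask) {
                    n += mask & 1;
                    mask >>= 1;
            }
            return n;
    }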
-;;; ===========================================================================
-;;; DES CBC / DOCSIS DES DECRYPT
-;;; ===========================================================================
-;;;
-;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and
-;;;                   3DES (3DES CBC)
-;;;
-;;; NOTE: clobbers OpMask registers
-%macro GENERIC_DES_DEC 1
-%define %%DES_DOCSIS %1
-
-                ;; push the registers and allocate the stack frame
-                mov             rax, rsp
-                sub             rsp, STACKFRAME_size
-                and             rsp, -64
-                mov             [rsp + _rsp_save], rax ; original SP
-                mov             [rsp + _gpr_save + 0*8], r12
-                mov             [rsp + _gpr_save + 1*8], r13
-                mov             [rsp + _gpr_save + 2*8], r14
-                mov             [rsp + _gpr_save + 3*8], r15
-
-%ifnidn %%DES_DOCSIS, 3DES
-                ;; DES and DOCSIS
-                DES_INIT        STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11
-%else
-                ;; 3DES
-                DES3_INIT       STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC
-%endif
-
-                ;; CFB part for DOCSIS
-%ifidn %%DES_DOCSIS, DOCSIS
-                DES_CFB_ONE     DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask
-%endif
-
-                mov             [rsp + _size_save], SIZE
-                and             SIZE, -64
-                xor             OFFSET, OFFSET
-                ;; This loop processes message in blocks of 64 bytes.
-                ;; Anything smaller than 64 bytes is handled separately after the loop.
-%%_gen_des_dec_loop:
-                cmp             OFFSET, SIZE
-                jz              %%_gen_des_dec_loop_end
-                ;; run loads
-                mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
-                vmovdqu64       ZW0, [IA0 + OFFSET]
-                vmovdqu64       ZW1, [IA1 + OFFSET]
-                vmovdqu64       ZW2, [IA2 + OFFSET]
-                vmovdqu64       ZW3, [INP0 + OFFSET]
-                vmovdqu64       ZW4, [INP1 + OFFSET]
-                vmovdqu64       ZW5, [INP2 + OFFSET]
-                vmovdqu64       ZW6, [INP3 + OFFSET]
-                vmovdqu64       ZW7, [INP4 + OFFSET]
-
-                mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
-                vmovdqu64       ZW8, [IA0 + OFFSET]
-                vmovdqu64       ZW9, [IA1 + OFFSET]
-                vmovdqu64       ZW10, [IA2 + OFFSET]
-                vmovdqu64       ZW11, [INP0 + OFFSET]
-                vmovdqu64       ZW12, [INP1 + OFFSET]
-                vmovdqu64       ZW13, [INP2 + OFFSET]
-                vmovdqu64       ZW14, [INP3 + OFFSET]
-                vmovdqu64       ZW15, [INP4 + OFFSET]
-
-                ;; Transpose input
-                TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-%ifnidn %%DES_DOCSIS, 3DES
-                ;; DES CBC DEC comes here
-                GEN_DES_DEC_CIPHER 8, rsp + _key_sched
-%else
-                ;; 3DES CBC DEC comes here
-                GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-
-                ;; transpose data on output
-                TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; run stores
-                mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET], ZW0
-                vmovdqu64       [IA1 + OFFSET], ZW1
-                vmovdqu64       [IA2 + OFFSET], ZW2
-                vmovdqu64       [INP0 + OFFSET], ZW3
-                vmovdqu64       [INP1 + OFFSET], ZW4
-                vmovdqu64       [INP2 + OFFSET], ZW5
-                vmovdqu64       [INP3 + OFFSET], ZW6
-                vmovdqu64       [INP4 + OFFSET], ZW7
-
-                mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET], ZW8
-                vmovdqu64       [IA1 + OFFSET], ZW9
-                vmovdqu64       [IA2 + OFFSET], ZW10
-                vmovdqu64       [INP0 + OFFSET], ZW11
-                vmovdqu64       [INP1 + OFFSET], ZW12
-                vmovdqu64       [INP2 + OFFSET], ZW13
-                vmovdqu64       [INP3 + OFFSET], ZW14
-                vmovdqu64       [INP4 + OFFSET], ZW15
-
-                add             OFFSET, 64
-                jmp             %%_gen_des_dec_loop
-%%_gen_des_dec_loop_end:
-                ;; This is where we check if there is anything less than 64 bytes
-                ;; of message left for processing.
-                mov             SIZE, [rsp + _size_save]
-                cmp             OFFSET, SIZE
-                jz              %%_gen_des_dec_part_end
-                ;; calculate min of bytes_left and 64, convert to qword mask
-                GET_MASK8       IA0 ; IA0 = mask
-
-                kmovw           k7, DWORD(IA0)
-                mov             [rsp + _mask_save], IA0
-                ;; run masked loads
-                mov             IA0, [STATE + _des_args_in + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (7*PTR_SZ)]
-                vmovdqu64       ZW0{k7}{z}, [IA0 + OFFSET]
-                vmovdqu64       ZW1{k7}{z}, [IA1 + OFFSET]
-                vmovdqu64       ZW2{k7}{z}, [IA2 + OFFSET]
-                vmovdqu64       ZW3{k7}{z}, [INP0 + OFFSET]
-                vmovdqu64       ZW4{k7}{z}, [INP1 + OFFSET]
-                vmovdqu64       ZW5{k7}{z}, [INP2 + OFFSET]
-                vmovdqu64       ZW6{k7}{z}, [INP3 + OFFSET]
-                vmovdqu64       ZW7{k7}{z}, [INP4 + OFFSET]
-
-                mov             IA0, [STATE + _des_args_in + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_in + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_in + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_in + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_in + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_in + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_in + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_in + (15*PTR_SZ)]
-                vmovdqu64       ZW8{k7}{z}, [IA0 + OFFSET]
-                vmovdqu64       ZW9{k7}{z}, [IA1 + OFFSET]
-                vmovdqu64       ZW10{k7}{z}, [IA2 + OFFSET]
-                vmovdqu64       ZW11{k7}{z}, [INP0 + OFFSET]
-                vmovdqu64       ZW12{k7}{z}, [INP1 + OFFSET]
-                vmovdqu64       ZW13{k7}{z}, [INP2 + OFFSET]
-                vmovdqu64       ZW14{k7}{z}, [INP3 + OFFSET]
-                vmovdqu64       ZW15{k7}{z}, [INP4 + OFFSET]
-
-                ;; Transpose input
-                TRANSPOSE_IN    ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; DES CBC DEC comes here
-                mov             IA0, [rsp + _mask_save]
-                cmp             BYTE(IA0), 0x0f
-                ja              %%_gt_4
-                jz              %%_blocks_4
-
-                cmp             BYTE(IA0), 0x03
-                ja              %%_blocks_3
-                jz              %%_blocks_2
-                ;; process one block and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 1, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_2:
-                ;; process two blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 2, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_3:
-                ;; process three blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 3, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_4:
-                ;; process four blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 4, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_gt_4:
-                cmp             BYTE(IA0), 0x3f
-                ja              %%_blocks_7
-                jz              %%_blocks_6
-%%_blocks_5:
-                ;; process five blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 5, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_6:
-                ;; process six blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 6, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-                jmp             %%_transpose_out
-
-%%_blocks_7:
-                ;; process seven blocks and move to transpose out
-%ifnidn %%DES_DOCSIS, 3DES
-                GEN_DES_DEC_CIPHER 7, rsp + _key_sched
-%else
-                GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3
-%endif
-
-%%_transpose_out:
-                ;; transpose data on output
-                TRANSPOSE_OUT   ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13
-
-                ;; run masked stores
-                mov             IA0, [STATE + _des_args_out + (0*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (1*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (2*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (3*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (4*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (5*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (6*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (7*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET]{k7}, ZW0
-                vmovdqu64       [IA1 + OFFSET]{k7}, ZW1
-                vmovdqu64       [IA2 + OFFSET]{k7}, ZW2
-                vmovdqu64       [INP0 + OFFSET]{k7}, ZW3
-                vmovdqu64       [INP1 + OFFSET]{k7}, ZW4
-                vmovdqu64       [INP2 + OFFSET]{k7}, ZW5
-                vmovdqu64       [INP3 + OFFSET]{k7}, ZW6
-                vmovdqu64       [INP4 + OFFSET]{k7}, ZW7
-
-                mov             IA0, [STATE + _des_args_out + (8*PTR_SZ)]
-                mov             IA1, [STATE + _des_args_out + (9*PTR_SZ)]
-                mov             IA2, [STATE + _des_args_out + (10*PTR_SZ)]
-                mov             INP0, [STATE + _des_args_out + (11*PTR_SZ)]
-                mov             INP1, [STATE + _des_args_out + (12*PTR_SZ)]
-                mov             INP2, [STATE + _des_args_out + (13*PTR_SZ)]
-                mov             INP3, [STATE + _des_args_out + (14*PTR_SZ)]
-                mov             INP4, [STATE + _des_args_out + (15*PTR_SZ)]
-                vmovdqu64       [IA0 + OFFSET]{k7}, ZW8
-                vmovdqu64       [IA1 + OFFSET]{k7}, ZW9
-                vmovdqu64       [IA2 + OFFSET]{k7}, ZW10
-                vmovdqu64       [INP0 + OFFSET]{k7}, ZW11
-                vmovdqu64       [INP1 + OFFSET]{k7}, ZW12
-                vmovdqu64       [INP2 + OFFSET]{k7}, ZW13
-                vmovdqu64       [INP3 + OFFSET]{k7}, ZW14
-                vmovdqu64       [INP4 + OFFSET]{k7}, ZW15
-%%_gen_des_dec_part_end:
-
-                ;; store IV and update pointers
-                DES_FINISH      ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4
-
-                CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0
-
-                ;; restore stack pointer and registers
-                mov             r12, [rsp + _gpr_save + 0*8]
-                mov             r13, [rsp + _gpr_save + 1*8]
-                mov             r14, [rsp + _gpr_save + 2*8]
-                mov             r15, [rsp + _gpr_save + 3*8]
-                mov             rsp, [rsp + _rsp_save] ; original SP
-
-%ifdef SAFE_DATA
-                clear_all_zmms_asm
-%else
-                vzeroupper
-%endif ;; SAFE_DATA
-
-%endmacro
+%include "include/des_avx512.inc"
 
 ;;; ========================================================
 ;;; DATA
diff --git a/lib/include/des_avx512.inc b/lib/include/des_avx512.inc
new file mode 100644
index 00000000..c9abc881
--- /dev/null
+++ b/lib/include/des_avx512.inc
@@ -0,0 +1,2119 @@
+;;
+;; Copyright (c) 2017-2024, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Authors:
+;;     Shay Gueron (1, 2), Regev Shemy (2), Tomasz Kantecki (2)
+;;     (1) University of Haifa, Israel
+;;     (2) Intel Corporation
+
+%include "include/os.inc"
+%include "include/reg_sizes.inc"
+%include "include/mb_mgr_datastruct.inc"
+%include "include/constants.inc"
+;%define DO_DBGPRINT
+;%include "include/dbgprint.inc"
+%include "include/clear_regs.inc"
+
+%ifdef LINUX
+%define arg1 rdi
+%define arg2 rsi
+%define arg3 rdx
+%define arg4 rcx
+%else
+%define arg1 rcx
+%define arg2 rdx
+%define arg3 r8
+%define arg4 r9
+%endif
+
+%define STATE arg1
+%define SIZE arg2
+
+%define OFFSET rax
+
+%define IA0 arg3
+%define IA1 arg4
+%define IA2 r10
+
+%define INP0 r11
+%define INP1 r12
+%define INP2 r13
+%define INP3 r14
+%define INP4 r15
+
+%define KSOFFSET r11
+
+%define ZW0 zmm0
+%define ZW1 zmm1
+%define ZW2 zmm2
+%define ZW3 zmm3
+%define ZW4 zmm4
+%define ZW5 zmm5
+%define ZW6 zmm6
+%define ZW7 zmm7
+%define ZW8 zmm8
+%define ZW9 zmm9
+%define ZW10 zmm10
+%define ZW11 zmm11
+%define ZW12 zmm12
+%define ZW13 zmm13
+%define ZW14 zmm14
+%define ZW15 zmm15
+
+%define ZIV0 zmm16
+%define ZIV1 zmm17
+
+%define ZTMP0 zmm18
+%define ZTMP1 zmm19
+%define ZTMP2 zmm20
+%define ZTMP3 zmm21
+%define ZTMP4 zmm22
+%define ZTMP5 zmm23
+%define ZTMP6 zmm24
+%define ZTMP7 zmm25
+%define ZTMP8 zmm26
+%define ZTMP9 zmm27
+%define ZTMP10 zmm28
+%define ZTMP11 zmm29
+%define ZTMP12 zmm30
+%define ZTMP13 zmm31
+
+struc STACKFRAME
+_key_sched:     resq    16*16   ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_key_sched2:    resq    16*16   ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_key_sched3:    resq    16*16   ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048
+_tmp_iv:        resq    16      ; 2 x 64 bytes
+_tmp_in:        resq    16      ; 2 x 64 bytes
+_tmp_out:       resq    16      ; 2 x 64 bytes
+_tmp_mask:      resd    16      ; 1 x 64 bytes
+_gpr_save:      resq    4       ; r12 to r15
+_rsp_save:      resq    1
+_mask_save:     resq    1
+_size_save:     resq    1
+endstruc
+
+;;; ===========================================================================
+;;; ===========================================================================
+;;; MACROS
+;;; ===========================================================================
+;;; ===========================================================================
+
+;;; ===========================================================================
+;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected)
+;;; ===========================================================================
+%macro CLEAR_KEY_SCHEDULE 2
+%define %%ALG   %1 ; [in] DES or 3DES
+%define %%ZT    %2 ; [clobbered] temporary ZMM register
+
+%ifdef SAFE_DATA
+                vpxorq          %%ZT, %%ZT
+%assign rep_num (2048 / 64)
+%ifidn %%ALG, 3DES
+%assign rep_num (rep_num * 3)
+%endif
+
+%assign offset 0
+%rep rep_num
+                vmovdqa64       [rsp + _key_sched + offset], %%ZT
+%assign offset (offset + 64)
+%endrep
+
+%endif ; SAFE_DATA
+
+%endmacro
+
+;;; ===========================================================================
+;;; PERMUTE
+;;; ===========================================================================
+;;; A [in/out] - zmm register
+;;; B [in/out] - zmm register
+;;; NSHIFT [in] - constant to shift words by
+;;; MASK [in] - zmm or m512 with mask
+;;; T0 [clobbered] - temporary zmm register
+%macro PERMUTE 5
+%define %%A %1
+%define %%B %2
+%define %%NSHIFT %3
+%define %%MASK %4
+%define %%T0 %5
+
+                vpsrld          %%T0, %%A, %%NSHIFT
+                vpxord          %%T0, %%T0, %%B
+                vpandd          %%T0, %%T0, %%MASK
+                vpxord          %%B, %%B, %%T0
+                vpslld          %%T0, %%T0, %%NSHIFT
+                vpxord          %%A, %%A, %%T0
+%endmacro
+
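PERMUTE is a textbook masked delta-swap: it exchanges the bit positions selected by MASK between B and A-shifted-right-by-NSHIFT. A per-32-bit-word scalar model, assuming nothing beyond the macro's own semantics:

    #include <stdint.h>

    /* swap the bits of (a >> n) and b wherever mask has a 1 */
    static void delta_swap(uint32_t *a, uint32_t *b, int n, uint32_t mask)
    {
            uint32_t t = ((*a >> n) ^ *b) & mask;  /* vpsrld/vpxord/vpandd */

            *b ^= t;                               /* vpxord */
            *a ^= t << n;                          /* vpslld/vpxord */
    }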
+;;; ===========================================================================
+;;; INITIAL PERMUTATION
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register
+%macro IP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+                PERMUTE         %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0
+                PERMUTE         %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0
+                PERMUTE         %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0
+                PERMUTE         %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0
+                PERMUTE         %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; FINAL PERMUTATION
+;;; ===========================================================================
+;;; L [in/out] - zmm register
+;;; R [in/out] - zmm register
+;;; T0 [clobbered] - temporary zmm register
+%macro FP_Z 3
+%define %%L %1
+%define %%R %2
+%define %%T0 %3
+                PERMUTE         %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0
+                PERMUTE         %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0
+                PERMUTE         %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0
+                PERMUTE         %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0
+                PERMUTE         %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0
+%endmacro
+
+;;; ===========================================================================
+;;; P PHASE
+;;; ===========================================================================
+;;; W0 [in/out] - zmm register
+;;;        in: vector of 16 x 32bits from S phase
+;;;        out: permuted in vector
+;;; T0-T3 [clobbered] - temporary zmm register
%macro P_PHASE 5
+%define %%W0 %1
+%define %%T0 %2
+%define %%T1 %3
+%define %%T2 %4
+%define %%T3 %5
+
+                vprord          %%T0, %%W0, 3
+                vpandd          %%T0, %%T0, [rel mask_values + 0*64]
+                vprord          %%T1, %%W0, 5
+                vpandd          %%T1, %%T1, [rel mask_values + 1*64]
+                vpord           %%T0, %%T0, %%T1
+
+                vprord          %%T1, %%W0, 24
+                vpandd          %%T1, %%T1, [rel mask_values + 2*64]
+                vprord          %%T2, %%W0, 26
+                vpandd          %%T2, %%T2, [rel mask_values + 3*64]
+                vpord           %%T1, %%T1, %%T2
+                vpord           %%T0, %%T0, %%T1
+
+                vprord          %%T1, %%W0, 15
+                vpandd          %%T1, %%T1, [rel mask_values + 4*64]
+                vprord          %%T2, %%W0, 17
+                vpandd          %%T2, %%T2, [rel mask_values + 5*64]
+                vpord           %%T1, %%T1, %%T2
+
+                vprord          %%T2, %%W0, 6
+                vpandd          %%T2, %%T2, [rel mask_values + 6*64]
+                vprord          %%T3, %%W0, 21
+                vpandd          %%T3, %%T3, [rel mask_values + 7*64]
+                vpord           %%T2, %%T2, %%T3
+                vpord           %%T1, %%T1, %%T2
+                vpord           %%T0, %%T0, %%T1
+
+                vprord          %%T1, %%W0, 12
+                vpandd          %%T1, %%T1, [rel mask_values + 8*64]
+                vprord          %%T2, %%W0, 14
+                vpandd          %%T2, %%T2, [rel mask_values + 9*64]
+                vpord           %%T1, %%T1, %%T2
+
+                vprord          %%T2, %%W0, 4
+                vpandd          %%T2, %%T2, [rel mask_values + 10*64]
+                vprord          %%T3, %%W0, 11
+                vpandd          %%T3, %%T3, [rel mask_values + 11*64]
+                vpord           %%T2, %%T2, %%T3
+                vpord           %%T1, %%T1, %%T2
+                vpord           %%T0, %%T0, %%T1
+
+                vprord          %%T1, %%W0, 16
+                vpandd          %%T1, %%T1, [rel mask_values + 12*64]
+                vprord          %%T2, %%W0, 22
+                vpandd          %%T2, %%T2, [rel mask_values + 13*64]
+                vpord           %%T1, %%T1, %%T2
+
+                vprord          %%T2, %%W0, 19
+                vpandd          %%T2, %%T2, [rel mask_values + 14*64]
+                vprord          %%T3, %%W0, 10
+                vpandd          %%T3, %%T3, [rel mask_values + 15*64]
+                vpord           %%T2, %%T2, %%T3
+                vpord           %%T1, %%T1, %%T2
+                vpord           %%T0, %%T0, %%T1
+
+                vprord          %%T1, %%W0, 9
+                vpandd          %%T1, %%T1, [rel mask_values + 16*64]
+                vprord          %%T2, %%W0, 13
+                vpandd          %%T2, %%T2, [rel mask_values + 17*64]
+                vpord           %%T1, %%T1, %%T2
+
+                vprord          %%T2, %%W0, 25
+                vpandd          %%T2, %%T2, [rel mask_values + 18*64]
+                vpord           %%T1, %%T1, %%T2
+                vpord           %%W0, %%T0, %%T1
+%endmacro
+
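P_PHASE assembles the permuted word by OR-ing together rotations of the input, each masked to the bits that the rotation places correctly; the rotation amounts and masks are data (the mask_values table), not logic. The scalar pattern, for reference:

    #include <stdint.h>

    static uint32_t ror32(uint32_t x, int r)   /* r in 1..31 here */
    {
            return (x >> r) | (x << (32 - r));
    }

    /* rots[]/masks[] mirror the 19 rotation/mask pairs used above */
    static uint32_t permute_p(uint32_t w, const int *rots,
                              const uint32_t *masks, int n)
    {
            uint32_t out = 0;

            for (int i = 0; i < n; i++)
                    out |= ror32(w, rots[i]) & masks[i];
            return out;
    }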
vpxord %%T0, %%T0, %%T2 + vpxord %%T4, %%T1, %%T3 + vmovdqu16 %%T4{k5}, %%T0 + + vmovdqa64 %%T0, [rel S_box_flipped + 10*64] + vmovdqa64 %%T1, [rel S_box_flipped + 11*64] + vmovdqa64 %%T2, [rel S_box_flipped + 14*64] + vmovdqa64 %%T3, [rel S_box_flipped + 15*64] + vpermw %%T0{k1}{z}, %%IN1B, %%T0 + vpermw %%T1{k1}{z}, %%IN1B, %%T1 + vpermw %%T2{k2}{z}, %%IN1B, %%T2 + vpermw %%T3{k2}{z}, %%IN1B, %%T3 + vpxord %%T0, %%T0, %%T2 + vpxord %%T5, %%T1, %%T3 + vmovdqu16 %%T5{k6}, %%T0 + vpsllw %%T5, %%T5, 4 + + vpxord %%T4, %%T4, %%T5 + vpsllw %%T4, %%T4, 8 + vpxord %%OUT, %%OUT, %%T4 + vpshufb %%OUT, %%OUT, [rel shuffle_reg] +%endmacro + +;;; =========================================================================== +;;; DES encryption/decryption round +;;; =========================================================================== +;;; +;;; Clobbers k1-k6 OpMask registers +;;; +;;; ENC_DEC [in] - ENC for encryption, DEC for decryption +;;; R [in/out] - zmm register; plain text in & cipher text out +;;; L [in/out] - zmm register; plain text in & cipher text out +;;; KS [in] - pointer to the key schedule +;;; T0-T11 [clobbered] - temporary zmm register +%macro DES_ENC_DEC 16 +%define %%ENC_DEC %1 +%define %%R %2 +%define %%L %3 +%define %%KS %4 +%define %%T0 %5 +%define %%T1 %6 +%define %%T2 %7 +%define %%T3 %8 +%define %%T4 %9 +%define %%T5 %10 +%define %%T6 %11 +%define %%T7 %12 +%define %%T8 %13 +%define %%T9 %14 +%define %%T10 %15 +%define %%T11 %16 + + IP_Z %%R, %%L, %%T0 + +%ifidn %%ENC_DEC, ENC + ;; ENCRYPTION + xor KSOFFSET, KSOFFSET +%%_des_enc_loop: + E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%L, %%L, %%T0 + + E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%R, %%R, %%T0 + + add KSOFFSET, (4*64) + cmp KSOFFSET, (8*(4*64)) + jb %%_des_enc_loop + +%else + ;; DECRYPTION + mov KSOFFSET, (8*(4*64)) +%%_des_dec_loop: + E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%L, %%L, %%T0 + + E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%R, %%R, %%T0 + sub KSOFFSET, (4*64) + jnz %%_des_dec_loop +%endif ; DECRYPTION + + FP_Z %%R, %%L, %%T0 + +%endmacro + +;;; =========================================================================== +;;; DATA TRANSPOSITION AT DATA INPUT +;;; =========================================================================== +;;; +;;; IN00 - IN15 [in/out]: +;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data +;;; out: R0 - 16 x word0, L0 - 16 x word1, ... 
L7 - 16 x word15 +;;; T0-T3 [clobbered] - temporary zmm registers +;;; K0-K5 [clobbered] - temporary zmm registers +;;; H0-H3 [clobbered] - temporary zmm registers +%macro TRANSPOSE_IN 30 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T1 %18 +%define %%T2 %19 +%define %%T3 %20 +%define %%K0 %21 +%define %%K1 %22 +%define %%K2 %23 +%define %%K3 %24 +%define %%K4 %25 +%define %%K5 %26 +%define %%H0 %27 +%define %%H1 %28 +%define %%H2 %29 +%define %%H3 %30 + + vpunpckldq %%K0, %%IN00, %%IN01 + vpunpckhdq %%K1, %%IN00, %%IN01 + vpunpckldq %%T0, %%IN02, %%IN03 + vpunpckhdq %%T1, %%IN02, %%IN03 + + vpunpckldq %%IN00, %%IN04, %%IN05 + vpunpckhdq %%IN01, %%IN04, %%IN05 + vpunpckldq %%IN02, %%IN06, %%IN07 + vpunpckhdq %%IN03, %%IN06, %%IN07 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + vpunpcklqdq %%K3, %%K1, %%T1 + vpunpckhqdq %%T3, %%K1, %%T1 + + vpunpcklqdq %%K0, %%IN00, %%IN02 + vpunpckhqdq %%K1, %%IN00, %%IN02 + vpunpcklqdq %%T0, %%IN01, %%IN03 + vpunpckhqdq %%T1, %%IN01, %%IN03 + + vpunpckldq %%K4, %%IN08, %%IN09 + vpunpckhdq %%K5, %%IN08, %%IN09 + vpunpckldq %%IN04, %%IN10, %%IN11 + vpunpckhdq %%IN05, %%IN10, %%IN11 + vpunpckldq %%IN06, %%IN12, %%IN13 + vpunpckhdq %%IN07, %%IN12, %%IN13 + vpunpckldq %%IN10, %%IN14, %%IN15 + vpunpckhdq %%IN11, %%IN14, %%IN15 + + vpunpcklqdq %%IN12, %%K4, %%IN04 + vpunpckhqdq %%IN13, %%K4, %%IN04 + vpunpcklqdq %%IN14, %%K5, %%IN05 + vpunpckhqdq %%IN15, %%K5, %%IN05 + vpunpcklqdq %%IN00, %%IN06, %%IN10 + vpunpckhqdq %%IN01, %%IN06, %%IN10 + vpunpcklqdq %%IN02, %%IN07, %%IN11 + vpunpckhqdq %%IN03, %%IN07, %%IN11 + + vshufi64x2 %%H0, %%K2, %%K0, 0x44 + vshufi64x2 %%H1, %%K2, %%K0, 0xee + vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 + vshufi64x2 %%H3, %%IN12, %%IN00, 0xee + vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 + vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 + vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 + vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 + + vshufi64x2 %%H0, %%T2, %%K1, 0x44 + vshufi64x2 %%H1, %%T2, %%K1, 0xee + vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 + vshufi64x2 %%H3, %%IN13, %%IN01, 0xee + vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 + vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 + vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 + vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 + + vshufi64x2 %%H0, %%K3, %%T0, 0x44 + vshufi64x2 %%H1, %%K3, %%T0, 0xee + vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 + vshufi64x2 %%H3, %%IN14, %%IN02, 0xee + vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 + vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 + vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 + vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 + + vshufi64x2 %%H0, %%T3, %%T1, 0x44 + vshufi64x2 %%H1, %%T3, %%T1, 0xee + vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 + vshufi64x2 %%H3, %%IN15, %%IN03, 0xee + vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 + vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 + vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 + vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 +%endmacro + +;;; =========================================================================== +;;; DATA TRANSPOSITION AT DATA OUTPUT +;;; =========================================================================== +;;; +;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: +;;; in: R0 - 16 x word0, L0 
- 16 x word1, ... L7 - 16 x word15 +;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data +;;; T0-T3 [clobbered] - temporary zmm registers +;;; K0-K5 [clobbered] - temporary zmm registers +;;; H0-H3 [clobbered] - temporary zmm registers +%macro TRANSPOSE_OUT 30 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T1 %18 +%define %%T2 %19 +%define %%T3 %20 +%define %%K0 %21 +%define %%K1 %22 +%define %%K2 %23 +%define %%K3 %24 +%define %%K4 %25 +%define %%K5 %26 +%define %%H0 %27 +%define %%H1 %28 +%define %%H2 %29 +%define %%H3 %30 + + vpunpckldq %%K0, %%IN01, %%IN00 + vpunpckhdq %%K1, %%IN01, %%IN00 + vpunpckldq %%T0, %%IN03, %%IN02 + vpunpckhdq %%T1, %%IN03, %%IN02 + + vpunpckldq %%IN00, %%IN05, %%IN04 + vpunpckhdq %%IN01, %%IN05, %%IN04 + vpunpckldq %%IN02, %%IN07, %%IN06 + vpunpckhdq %%IN03, %%IN07, %%IN06 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + vpunpcklqdq %%K3, %%K1, %%T1 + vpunpckhqdq %%T3, %%K1, %%T1 + + vpunpcklqdq %%K0, %%IN00, %%IN02 + vpunpckhqdq %%K1, %%IN00, %%IN02 + vpunpcklqdq %%T0, %%IN01, %%IN03 + vpunpckhqdq %%T1, %%IN01, %%IN03 + + vpunpckldq %%K4, %%IN09, %%IN08 + vpunpckhdq %%K5, %%IN09, %%IN08 + vpunpckldq %%IN04, %%IN11, %%IN10 + vpunpckhdq %%IN05, %%IN11, %%IN10 + vpunpckldq %%IN06, %%IN13, %%IN12 + vpunpckhdq %%IN07, %%IN13, %%IN12 + vpunpckldq %%IN10, %%IN15, %%IN14 + vpunpckhdq %%IN11, %%IN15, %%IN14 + + vpunpcklqdq %%IN12, %%K4, %%IN04 + vpunpckhqdq %%IN13, %%K4, %%IN04 + vpunpcklqdq %%IN14, %%K5, %%IN05 + vpunpckhqdq %%IN15, %%K5, %%IN05 + vpunpcklqdq %%IN00, %%IN06, %%IN10 + vpunpckhqdq %%IN01, %%IN06, %%IN10 + vpunpcklqdq %%IN02, %%IN07, %%IN11 + vpunpckhqdq %%IN03, %%IN07, %%IN11 + + vshufi64x2 %%H0, %%K2, %%K0, 0x44 + vshufi64x2 %%H1, %%K2, %%K0, 0xee + vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 + vshufi64x2 %%H3, %%IN12, %%IN00, 0xee + vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 + vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 + vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 + vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 + + vshufi64x2 %%H0, %%T2, %%K1, 0x44 + vshufi64x2 %%H1, %%T2, %%K1, 0xee + vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 + vshufi64x2 %%H3, %%IN13, %%IN01, 0xee + vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 + vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 + vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 + vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 + + vshufi64x2 %%H0, %%K3, %%T0, 0x44 + vshufi64x2 %%H1, %%K3, %%T0, 0xee + vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 + vshufi64x2 %%H3, %%IN14, %%IN02, 0xee + vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 + vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 + vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 + vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 + + vshufi64x2 %%H0, %%T3, %%T1, 0x44 + vshufi64x2 %%H1, %%T3, %%T1, 0xee + vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 + vshufi64x2 %%H3, %%IN15, %%IN03, 0xee + vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 + vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 + vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 + vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 +%endmacro + +;;; =========================================================================== +;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT +;;; 
=========================================================================== +;;; +;;; IN00-IN15 / R0/L0-R7/L7 [in/out]: +;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data +;;; out: R0 - 16 x word0, L0 - 16 x word1 +;;; T0,T2 [clobbered] - temporary zmm registers +;;; K0-K4 [clobbered] - temporary zmm registers +;;; H0,H2 [clobbered] - temporary zmm registers +%macro TRANSPOSE_IN_ONE 24 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T2 %18 +%define %%K0 %19 +%define %%K1 %20 +%define %%K2 %21 +%define %%K4 %22 +%define %%H0 %23 +%define %%H2 %24 + + vpunpckldq %%K0, %%IN00, %%IN01 + vpunpckhdq %%K1, %%IN00, %%IN01 + vpunpckldq %%T0, %%IN02, %%IN03 + + vpunpckldq %%IN00, %%IN04, %%IN05 + vpunpckhdq %%IN01, %%IN04, %%IN05 + vpunpckldq %%IN02, %%IN06, %%IN07 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + + vpunpcklqdq %%K0, %%IN00, %%IN02 + vpunpckhqdq %%K1, %%IN00, %%IN02 + + vpunpckldq %%K4, %%IN08, %%IN09 + vpunpckldq %%IN04, %%IN10, %%IN11 + vpunpckldq %%IN06, %%IN12, %%IN13 + vpunpckldq %%IN10, %%IN14, %%IN15 + + vpunpcklqdq %%IN12, %%K4, %%IN04 + vpunpckhqdq %%IN13, %%K4, %%IN04 + vpunpcklqdq %%IN00, %%IN06, %%IN10 + vpunpckhqdq %%IN01, %%IN06, %%IN10 + + vshufi64x2 %%H0, %%K2, %%K0, 0x44 + vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 + vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 + + vshufi64x2 %%H0, %%T2, %%K1, 0x44 + vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 + vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 +%endmacro + +;;; =========================================================================== +;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT +;;; =========================================================================== +;;; +;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: +;;; in: R0 - 16 x word0, L0 - 16 x word1 +;;; out: R0 - lane 0 data, L0 - lane 1 data, ... 
L7 - lane 15 data +;;; T0-T3 [clobbered] - temporary zmm registers +;;; K0-K3 [clobbered] - temporary zmm registers +;;; H0,H1 [clobbered] - temporary zmm registers +%macro TRANSPOSE_OUT_ONE 25 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T2 %18 +%define %%T3 %19 +%define %%K0 %20 +%define %%K1 %21 +%define %%K2 %22 +%define %%K3 %23 +%define %%H0 %24 +%define %%H1 %25 + + vpxord %%T0, %%T0, %%T0 + + vpunpckldq %%K0, %%IN01, %%IN00 + vpunpckhdq %%K1, %%IN01, %%IN00 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + vpunpcklqdq %%K3, %%K1, %%T0 + vpunpckhqdq %%T3, %%K1, %%T0 + + vshufi64x2 %%H0, %%K2, %%T0, 0x44 + vshufi64x2 %%H1, %%K2, %%T0, 0xee + vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0 + vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2 + vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4 + vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6 + + vshufi64x2 %%H0, %%T2, %%T0, 0x44 + vshufi64x2 %%H1, %%T2, %%T0, 0xee + vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0 + vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2 + vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4 + vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6 + + vshufi64x2 %%H0, %%K3, %%T0, 0x44 + vshufi64x2 %%H1, %%K3, %%T0, 0xee + vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1 + vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3 + vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5 + vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7 + + vshufi64x2 %%H0, %%T3, %%T0, 0x44 + vshufi64x2 %%H1, %%T3, %%T0, 0xee + vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1 + vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3 + vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5 + vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7 +%endmacro + +;;; =========================================================================== +;;; DES INITIALIZATION +;;; key schedule transposition and IV set up +;;; =========================================================================== +;;; +;;; STATE_KEYS [in] - KEYS in DES OOO STATE +;;; STATE_IV [ in] - IV in DES OOO STATE +;;; KS [out] - place to store transposed key schedule or NULL +;;; IV0 [out] - r512; initialization vector +;;; IV1 [out] - r512; initialization vector +;;; T0-T27 [clobbered] - temporary r512 +%macro DES_INIT 33 +%define %%STATE_KEYS %1 +%define %%STATE_IV %2 +%define %%KS %3 +%define %%IV0 %4 +%define %%IV1 %5 +%define %%T0 %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 +%define %%T6 %12 +%define %%T7 %13 +%define %%T8 %14 +%define %%T9 %15 +%define %%T10 %16 +%define %%T11 %17 +%define %%T12 %18 +%define %%T13 %19 +%define %%T14 %20 +%define %%T15 %21 +%define %%T16 %22 +%define %%T17 %23 +%define %%T18 %24 +%define %%T19 %25 +%define %%T20 %26 +%define %%T21 %27 +%define %%T22 %28 +%define %%T23 %29 +%define %%T24 %30 +%define %%T25 %31 +%define %%T26 %32 +%define %%T27 %33 + + ;; set up the key schedule + ;; - load first half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, 
%%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + ;; - load second half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0 + 64] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + + ;; set up IV + ;; - they are already kept transposed so this is enough to load them + vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] + vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] +%endmacro + +;;; =========================================================================== +;;; 3DES INITIALIZATION +;;; key schedule transposition and IV set up +;;; =========================================================================== +;;; +;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE +;;; STATE_IV [ in] - IV in 3DES OOO STATE +;;; KS1 [out] - place to store transposed key schedule or NULL +;;; KS2 [out] - place to store transposed key schedule or NULL +;;; KS3 [out] - place to store transposed key schedule or NULL +;;; IV0 [out] - r512; initialization vector +;;; IV1 [out] - r512; initialization vector +;;; T0-T27 [clobbered] - temporary r512 +;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec) +%macro DES3_INIT 36 +%define %%STATE_KEYS %1 +%define %%STATE_IV %2 +%define %%KS1 %3 +%define %%KS2 %4 +%define %%KS3 %5 +%define %%IV0 %6 +%define %%IV1 %7 +%define %%T0 %8 +%define %%T1 %9 +%define %%T2 %10 +%define %%T3 %11 +%define %%T4 %12 +%define %%T5 %13 +%define %%T6 %14 +%define %%T7 %15 +%define %%T8 %16 +%define %%T9 %17 +%define %%T10 %18 +%define %%T11 %19 +%define %%T12 %20 +%define %%T13 %21 +%define %%T14 %22 +%define %%T15 %23 +%define %%T16 %24 +%define %%T17 %25 +%define %%T18 %26 +%define %%T19 %27 +%define %%T20 %28 +%define %%T21 %29 +%define %%T22 %30 +%define %%T23 %31 +%define %%T24 %32 +%define %%T25 %33 +%define %%T26 %34 +%define %%T27 %35 +%define %%DIR %36 + +%ifidn %%DIR, ENC +%assign KEY_IDX 0 +%else +%assign KEY_IDX 2 +%endif +%assign KS_IDX 1 + +%rep 3 + ;; set up the key schedule + ;; - load first half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here + +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + ;; - load second half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0 + 64] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, 
%%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + +%ifidn %%DIR, ENC +%assign KEY_IDX (KEY_IDX + 1) +%else +%assign KEY_IDX (KEY_IDX - 1) +%endif +%assign KS_IDX (KS_IDX + 1) +%endrep ; KEY_IDX / KS_IDX + + ;; set up IV + ;; - they are already kept transposed so this is enough to load them + vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] + vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] + +%endmacro + +;;; =========================================================================== +;;; DES FINISH +;;; Update in/out pointers and store IV +;;; =========================================================================== +;;; +;;; Needs: STATE & SIZE +;;; IV0 [in] - r512; initialization vector +;;; IV1 [in] - r512; initialization vector +;;; T0-T4 [clobbered] - temporary r512 registers +%macro DES_FINISH 7 +%define %%IV0 %1 +%define %%IV1 %2 +%define %%T0 %3 +%define %%T1 %4 +%define %%T2 %5 +%define %%T3 %6 +%define %%T4 %7 + + vpbroadcastq %%T4, SIZE + vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)] + vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)] + vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)] + vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)] + vpaddq %%T0, %%T0, %%T4 + vpaddq %%T1, %%T1, %%T4 + vpaddq %%T2, %%T2, %%T4 + vpaddq %%T3, %%T3, %%T4 + vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0 + vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1 + vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2 + vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3 + + vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0 + vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1 +%endmacro + +;;; =========================================================================== +;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY +;;; =========================================================================== +;;; +;;; Needs: STATE, IA0-IA2 +;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection +;;; KS [in] - key schedule +;;; T0-T24 [clobbered] - temporary r512 +;;; T_IN [in] - 16 * 8 byte storage +;;; T_OUT [in] - 16 * 8 byte storage +;;; T_IV [in] - 16 * 8 byte storage +;;; T_MASK [in] - 16 * 4 byte storage +;;; +;;; NOTE: clobbers OpMask registers +%macro DES_CFB_ONE 31 +%define %%ENC_DEC %1 +%define %%KS %2 +%define %%T0 %3 +%define %%T1 %4 +%define %%T2 %5 +%define %%T3 %6 +%define %%T4 %7 +%define %%T5 %8 +%define %%T6 %9 +%define %%T7 %10 +%define %%T8 %11 +%define %%T9 %12 +%define %%T10 %13 +%define %%T11 %14 +%define %%T12 %15 +%define %%T13 %16 +%define %%T14 %17 +%define %%T15 %18 +%define %%T16 %19 +%define %%T17 %20 +%define %%T18 %21 +%define %%T19 %22 +%define %%T20 %23 +%define %%T21 %24 +%define %%T22 %25 +%define %%T23 %26 +%define %%T24 %27 +%define %%T_IN %28 +%define %%T_OUT %29 +%define %%T_IV %30 +%define %%T_MASK %31 + + ;; - find mask for non-zero partial lengths + vpxord %%T10, %%T10, %%T10 + vmovdqu64 %%T0, [STATE + _des_args_PLen] + vpcmpd k3, %%T0, %%T10, 4 ; NEQ + kmovw DWORD(IA0), k3 + movzx DWORD(IA0), WORD(IA0) + or DWORD(IA0), DWORD(IA0) + jz %%_des_cfb_one_end ; no non-zero partial lengths + +%ifidn %%ENC_DEC, ENC + ;; For the encryption case we need to make sure that + ;; all full blocks are complete before proceeding + ;; with the CFB partial block. + ;; To do that, the current out position is compared + ;; against the calculated last full block position.
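+        ;;
+        ;; A scalar model of the check below (illustrative pseudocode only;
+        ;; the array names are not part of the code):
+        ;;     for (i = 0; i < 16; i++)
+        ;;             lane_ok[i] = (args_out[i] == args_LOut[i]);
+        ;;     partial_mask &= lane_ok; /* keep lanes with all full blocks done */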
+ vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)] + vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)] + vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)] + vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)] + vpcmpq k4, %%T1, %%T2, 0 ; EQ + vpcmpq k5, %%T3, %%T4, 0 ; EQ + kmovw DWORD(IA1), k4 + movzx DWORD(IA1), BYTE(IA1) + kmovw DWORD(IA2), k5 + movzx DWORD(IA2), BYTE(IA2) + shl DWORD(IA2), 8 + or DWORD(IA2), DWORD(IA1) + and DWORD(IA0), DWORD(IA2) + jz %%_des_cfb_one_end ; no non-zero lengths left + kmovw k3, DWORD(IA0) +%endif + ;; Calculate ((1 << partial_bytes) - 1) + ;; in order to get the mask for loads and stores + ;; k3 & IA0 - hold valid mask + vmovdqa64 %%T1, [rel vec_ones_32b] + vpsllvd %%T2{k3}{z}, %%T1, %%T0 + vpsubd %%T2{k3}{z}, %%T2, %%T1 + vmovdqu64 [%%T_MASK], %%T2 + + ;; clear selected partial lens not to do them twice + vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10 + + ;; copy IV, in and out pointers + vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)] + vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)] + vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)] + vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)] + vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)] + vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)] + vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1 + vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2 + vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3 + vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4 + vmovdqu64 [%%T_IV + (0*64)], %%T5 + vmovdqu64 [%%T_IV + (1*64)], %%T6 + + ;; calculate last block case mask + ;; - first block case requires no modifications to in/out/IV + vmovdqu64 %%T1, [STATE + _des_args_BLen] + vpcmpd k2, %%T1, %%T10, 4 ; NEQ + kmovw DWORD(IA1), k2 + and DWORD(IA1), DWORD(IA0) + jz %%_des_cfb_one_no_last_blocks + + ;; set up IV, in and out for the last block case + ;; - Last block needs in and out to be set differently (decryption only) + ;; - IA1 holds the last block mask +%ifidn %%ENC_DEC, DEC + mov DWORD(IA0), DWORD(IA1) + mov DWORD(IA2), DWORD(IA1) + shr DWORD(IA1), 8 + and DWORD(IA2), 0xff + kmovw k4, DWORD(IA2) + kmovw k5, DWORD(IA1) + vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)] + vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)] + vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)] + vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)] + vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1 + vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2 + vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3 + vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4 +%endif ; decryption + ;; - IV has to be set differently for CFB as well + ;; - IA0 holds the last block mask +%assign IDX 0 +%rep 16 + test DWORD(IA0), (1 << IDX) + jz %%_des_cfb_one_copy_iv_next %+ IDX +%ifidn %%ENC_DEC, ENC + mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)] +%else + mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)] +%endif + mov IA2, [IA2 - 8] + mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2) + shr IA2, 32 + mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2) +%%_des_cfb_one_copy_iv_next %+ IDX: +%assign IDX (IDX + 1) +%endrep + +%%_des_cfb_one_no_last_blocks: + ;; Uffff ... 
finally let's do some DES CFB + ;; - let's use T_IN, T_OUT, T_IV and T_MASK + + ;; - load data with the corresponding masks & transpose + ;; - T0 to T15 will hold the data + xor IA0, IA0 +%assign IDX 0 +%assign K_IDX 1 +%rep 16 + mov IA1, [%%T_IN + (IDX*PTR_SZ)] + mov DWORD(IA0), [%%T_MASK + (IDX*4)] + kmovq k %+ K_IDX, IA0 + vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1] +%assign IDX (IDX + 1) +%assign K_IDX (K_IDX + 1) +%if K_IDX > 7 +%assign K_IDX 1 ; iterate through K1 to K7 +%endif +%endrep + ;; - transpose the data in T0 to T15, T16 to T23 are clobbered + TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23 + + ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1 + vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0 + vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1 + ;; DES encrypt + ;; - R0 - %%T0 + ;; - L0 - %%T1 + DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13 + ;; CFB style xor with R0/L0 with IV + ;; - IV0 - %%T16 + ;; - IV1 - %%T17 + vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1 + vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0 + vmovdqa64 %%T1, %%T2 + ;; - new R0 = L0 ^ IV0 (%%T0) + ;; - new L0 = R0 ^ IV1 (%%T1) + + ;; Transpose the data out + ;; - %%T2 to %%T24 clobbered + TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24 + + ;; Store the transposed data + ;; - T0 to T15 will hold the data + xor IA0, IA0 +%assign IDX 0 +%assign K_IDX 1 +%rep 16 + mov IA1, [%%T_OUT + (IDX*PTR_SZ)] + mov DWORD(IA0), [%%T_MASK + (IDX*4)] + kmovq k %+ K_IDX, IA0 + vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX +%assign IDX (IDX + 1) +%assign K_IDX (K_IDX + 1) +%if K_IDX > 7 +%assign K_IDX 1 ; iterate through K1 to K7 +%endif +%endrep + +%ifdef SAFE_DATA + ;; Clear copied IV's + vpxorq %%T5, %%T5 + vmovdqu64 [%%T_IV + (0*64)], %%T5 + vmovdqu64 [%%T_IV + (1*64)], %%T5 +%endif + +%%_des_cfb_one_end: + +%endmacro + +;;; =========================================================================== +;;; Converts length into mask of DES blocks +;;; =========================================================================== +;;; +;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64) +;;; USES: IA0, IA1 IA2 +;;; ASSUMES: SIZE - OFFSET < 64 +%macro GET_MASK8 1 +%define %%MASK %1 + +%ifidn IA1, rcx +%define myrcx IA1 +%else +%define myrcx rcx + mov IA1, rcx +%endif + mov myrcx, SIZE + sub myrcx, OFFSET + ;; - myrcx - remaining length + ;; - divide by 8 (DES block size) + ;; - create bit mask of the result + mov DWORD(%%MASK), 1 + shr DWORD(myrcx), 3 + shl DWORD(%%MASK), BYTE(myrcx) + sub DWORD(%%MASK), 1 +%ifnidn IA1, rcx + mov rcx, IA1 +%endif +%endmacro + +;;; =========================================================================== +;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS [in] - pointer to transposed key schedule +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_DES_ENC_CIPHER 2 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS %2 + +%assign RN 0 +%assign LN 1 +%assign RNN 2 +%assign LNN 3 +%rep %%NUM_DES_BLOCKS - 1 + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, 
ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 + vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 +%assign RN (RN + 2) +%assign LN (LN + 2) +%assign RNN (RNN + 2) +%assign LNN (LNN + 2) +%endrep + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7 + vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 +%endmacro + +;;; =========================================================================== +;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS [in] - pointer to transposed key schedule +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_DES_DEC_CIPHER 2 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS %2 + +%assign RN 0 +%assign LN 1 +%rep %%NUM_DES_BLOCKS + vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round + vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 + vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 + vmovdqa64 ZIV0, ZTMP12 + vmovdqa64 ZIV1, ZTMP13 +%assign RN (RN + 2) +%assign LN (LN + 2) +%endrep +%endmacro + +;;; =========================================================================== +;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS1 [in] - pointer to transposed key schedule 1 +;;; DES_KS2 [in] - pointer to transposed key schedule 2 +;;; DES_KS3 [in] - pointer to transposed key schedule 3 +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_3DES_ENC_CIPHER 4 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS1 %2 +%define %%DES_KS2 %3 +%define %%DES_KS3 %4 + +%assign RN 0 +%assign LN 1 +%assign RNN 2 +%assign LNN 3 +%rep %%NUM_DES_BLOCKS + ;; ENC + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; DEC + DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; ENC + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%if (RNN < (%%NUM_DES_BLOCKS * 2)) + vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 + vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 +%else + vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7 + vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 +%endif + +%assign RN (RN + 2) +%assign LN (LN + 2) +%assign RNN (RNN + 2) +%assign LNN (LNN + 2) +%endrep + +%endmacro + +;;; =========================================================================== +;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS1 [in] - pointer to transposed key schedule 1 +;;; DES_KS2 [in] - pointer to transposed key schedule 2 +;;; DES_KS3 [in] - pointer to transposed 
key schedule 3 +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_3DES_DEC_CIPHER 4 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS1 %2 +%define %%DES_KS2 %3 +%define %%DES_KS3 %4 + +%assign RN 0 +%assign LN 1 +%rep %%NUM_DES_BLOCKS + vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round + vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round + ;; DEC + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; ENC + DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; DEC + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 + vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 + vmovdqa64 ZIV0, ZTMP12 + vmovdqa64 ZIV1, ZTMP13 + +%assign RN (RN + 2) +%assign LN (LN + 2) +%endrep + +%endmacro + +;;; =========================================================================== +;;; DES CBC / DOCSIS DES ENCRYPT +;;; =========================================================================== +;;; +;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and +;;; 3DES (3DES CBC) +;;; +;;; NOTE: clobbers OpMask registers +%macro GENERIC_DES_ENC 1 +%define %%DES_DOCSIS %1 + + ;; push the registers and allocate the stack frame + mov rax, rsp + sub rsp, STACKFRAME_size + and rsp, -64 + mov [rsp + _rsp_save], rax ; original SP + mov [rsp + _gpr_save + 0*8], r12 + mov [rsp + _gpr_save + 1*8], r13 + mov [rsp + _gpr_save + 2*8], r14 + mov [rsp + _gpr_save + 3*8], r15 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES and DOCSIS DES + DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%else + ;; 3DES + DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC +%endif + mov [rsp + _size_save], SIZE + and SIZE, -64 + xor OFFSET, OFFSET + ;; This loop processes message in blocks of 64 bytes. + ;; Anything smaller than 64 bytes is handled separately after the loop. 
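+        ;;
+        ;; Rough scalar model of the processing below (illustrative only):
+        ;;     for (offset = 0; offset != (size & ~63); offset += 64) {
+        ;;             /* per lane: load 64 bytes, transpose, run 8   */
+        ;;             /* chained DES-CBC blocks, transpose back, store */
+        ;;     }
+        ;;     /* remainder (< 64 bytes per lane): masked loads/stores */
+        ;;     /* and a 1 to 7 block dispatch                          */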
+%%_gen_des_enc_loop: + cmp OFFSET, SIZE + jz %%_gen_des_enc_loop_end + ;; run loads + mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0, [IA0 + OFFSET] + vmovdqu64 ZW1, [IA1 + OFFSET] + vmovdqu64 ZW2, [IA2 + OFFSET] + vmovdqu64 ZW3, [INP0 + OFFSET] + vmovdqu64 ZW4, [INP1 + OFFSET] + vmovdqu64 ZW5, [INP2 + OFFSET] + vmovdqu64 ZW6, [INP3 + OFFSET] + vmovdqu64 ZW7, [INP4 + OFFSET] + + mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8, [IA0 + OFFSET] + vmovdqu64 ZW9, [IA1 + OFFSET] + vmovdqu64 ZW10, [IA2 + OFFSET] + vmovdqu64 ZW11, [INP0 + OFFSET] + vmovdqu64 ZW12, [INP1 + OFFSET] + vmovdqu64 ZW13, [INP2 + OFFSET] + vmovdqu64 ZW14, [INP3 + OFFSET] + vmovdqu64 ZW15, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC ENC comes here + vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 + vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 + +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 8, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + + ;; transpose data on output + TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + ;; run stores + mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW0 + vmovdqu64 [IA1 + OFFSET], ZW1 + vmovdqu64 [IA2 + OFFSET], ZW2 + vmovdqu64 [INP0 + OFFSET], ZW3 + vmovdqu64 [INP1 + OFFSET], ZW4 + vmovdqu64 [INP2 + OFFSET], ZW5 + vmovdqu64 [INP3 + OFFSET], ZW6 + vmovdqu64 [INP4 + OFFSET], ZW7 + + mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW8 + vmovdqu64 [IA1 + OFFSET], ZW9 + vmovdqu64 [IA2 + OFFSET], ZW10 + vmovdqu64 [INP0 + OFFSET], ZW11 + vmovdqu64 [INP1 + OFFSET], ZW12 + vmovdqu64 [INP2 + OFFSET], ZW13 + vmovdqu64 [INP3 + OFFSET], ZW14 + vmovdqu64 [INP4 + OFFSET], ZW15 + + add OFFSET, 64 + jmp %%_gen_des_enc_loop +%%_gen_des_enc_loop_end: + ;; This is where we check if 
there is anything less than 64 bytes + ;; of message left for processing. + mov SIZE, [rsp + _size_save] + cmp OFFSET, SIZE + jz %%_gen_des_enc_part_end + ;; calculate min of bytes_left and 64, convert to qword mask + GET_MASK8 IA0 ; IA0 = mask + + kmovw k7, DWORD(IA0) + mov [rsp + _mask_save], IA0 + ;; run masked loads + mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] + + mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC ENC comes here + vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 + vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 + + mov IA0, [rsp + _mask_save] + cmp BYTE(IA0), 0x0f + ja %%_gt_4 + jz %%_blocks_4 + + cmp BYTE(IA0), 0x03 + ja %%_blocks_3 + jz %%_blocks_2 + + ;; process one block and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 1, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_2: + ;; process two blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 2, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_3: + ;; process three blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 3, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_4: + ;; process four blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 4, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_gt_4: + cmp BYTE(IA0), 0x3f + ja %%_blocks_7 + jz %%_blocks_6 +%%_blocks_5: + ;; process five blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 5, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_6: + ;; 
process six blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 6, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_7: + ;; process seven blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 7, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + +%%_transpose_out: + ;; transpose data on output + TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run masked stores + mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 + + mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 +%%_gen_des_enc_part_end: + + ;; store IV and update pointers + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 + + ;; CFB part for DOCSIS +%ifidn %%DES_DOCSIS, DOCSIS + DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask +%endif + + CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 + + ;; restore stack pointer and registers + mov r12, [rsp + _gpr_save + 0*8] + mov r13, [rsp + _gpr_save + 1*8] + mov r14, [rsp + _gpr_save + 2*8] + mov r15, [rsp + _gpr_save + 3*8] + mov rsp, [rsp + _rsp_save] ; original SP + +%ifdef SAFE_DATA + clear_all_zmms_asm +%else + vzeroupper +%endif ;; SAFE_DATA + +%endmacro + +;;; =========================================================================== +;;; DES CBC / DOCSIS DES DECRYPT +;;; =========================================================================== +;;; +;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and +;;; 3DES (3DES CBC) +;;; +;;; NOTE: clobbers OpMask registers +%macro GENERIC_DES_DEC 1 +%define %%DES_DOCSIS %1 + + ;; push the registers and allocate the stack frame + mov rax, rsp + sub rsp, STACKFRAME_size + and rsp, -64 + mov [rsp + _rsp_save], rax ; original SP + mov [rsp + _gpr_save + 0*8], r12 + mov [rsp + _gpr_save + 1*8], r13 + mov [rsp + 
_gpr_save + 2*8], r14 + mov [rsp + _gpr_save + 3*8], r15 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES and DOCSIS + DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%else + ;; 3DES + DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC +%endif + + ;; CFB part for DOCSIS +%ifidn %%DES_DOCSIS, DOCSIS + DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask +%endif + + mov [rsp + _size_save], SIZE + and SIZE, -64 + xor OFFSET, OFFSET + ;; This loop processes message in blocks of 64 bytes. + ;; Anything smaller than 64 bytes is handled separately after the loop. +%%_gen_des_dec_loop: + cmp OFFSET, SIZE + jz %%_gen_des_dec_loop_end + ;; run loads + mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0, [IA0 + OFFSET] + vmovdqu64 ZW1, [IA1 + OFFSET] + vmovdqu64 ZW2, [IA2 + OFFSET] + vmovdqu64 ZW3, [INP0 + OFFSET] + vmovdqu64 ZW4, [INP1 + OFFSET] + vmovdqu64 ZW5, [INP2 + OFFSET] + vmovdqu64 ZW6, [INP3 + OFFSET] + vmovdqu64 ZW7, [INP4 + OFFSET] + + mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8, [IA0 + OFFSET] + vmovdqu64 ZW9, [IA1 + OFFSET] + vmovdqu64 ZW10, [IA2 + OFFSET] + vmovdqu64 ZW11, [INP0 + OFFSET] + vmovdqu64 ZW12, [INP1 + OFFSET] + vmovdqu64 ZW13, [INP2 + OFFSET] + vmovdqu64 ZW14, [INP3 + OFFSET] + vmovdqu64 ZW15, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES CBC DEC comes here + GEN_DES_DEC_CIPHER 8, rsp + _key_sched +%else + ;; 3DES CBC DEC comes here + GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + + ;; transpose data on output + TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run stores + mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [STATE + 
_des_args_out + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW0 + vmovdqu64 [IA1 + OFFSET], ZW1 + vmovdqu64 [IA2 + OFFSET], ZW2 + vmovdqu64 [INP0 + OFFSET], ZW3 + vmovdqu64 [INP1 + OFFSET], ZW4 + vmovdqu64 [INP2 + OFFSET], ZW5 + vmovdqu64 [INP3 + OFFSET], ZW6 + vmovdqu64 [INP4 + OFFSET], ZW7 + + mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW8 + vmovdqu64 [IA1 + OFFSET], ZW9 + vmovdqu64 [IA2 + OFFSET], ZW10 + vmovdqu64 [INP0 + OFFSET], ZW11 + vmovdqu64 [INP1 + OFFSET], ZW12 + vmovdqu64 [INP2 + OFFSET], ZW13 + vmovdqu64 [INP3 + OFFSET], ZW14 + vmovdqu64 [INP4 + OFFSET], ZW15 + + add OFFSET, 64 + jmp %%_gen_des_dec_loop +%%_gen_des_dec_loop_end: + ;; This is where we check if there is anything less than 64 bytes + ;; of message left for processing. + mov SIZE, [rsp + _size_save] + cmp OFFSET, SIZE + jz %%_gen_des_dec_part_end + ;; calculate min of bytes_left and 64, convert to qword mask + GET_MASK8 IA0 ; IA0 = mask + + kmovw k7, DWORD(IA0) + mov [rsp + _mask_save], IA0 + ;; run masked loads + mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] + + mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC DEC comes here + mov IA0, [rsp + _mask_save] + cmp BYTE(IA0), 0x0f + ja %%_gt_4 + jz %%_blocks_4 + + cmp BYTE(IA0), 0x03 + ja %%_blocks_3 + jz %%_blocks_2 + ;; process one block and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 1, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_2: + ;; process two 
blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 2, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_3: + ;; process three blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 3, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_4: + ;; process four blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 4, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_gt_4: + cmp BYTE(IA0), 0x3f + ja %%_blocks_7 + jz %%_blocks_6 +%%_blocks_5: + ;; process five blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 5, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_6: + ;; process six blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 6, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_7: + ;; process seven blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 7, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + +%%_transpose_out: + ;; transpose data on output + TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run masked stores + mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 + + mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 +%%_gen_des_dec_part_end: + + ;; store IV and update pointers + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 + + CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 + + ;; restore stack pointer and registers + mov r12, [rsp + _gpr_save + 0*8] + mov r13, [rsp + _gpr_save + 1*8] + mov r14, [rsp + _gpr_save + 2*8] + mov r15, [rsp + _gpr_save + 3*8] + mov rsp, [rsp + 
_rsp_save]	; original SP
+
+%ifdef SAFE_DATA
+	clear_all_zmms_asm
+%else
+	vzeroupper
+%endif ;; SAFE_DATA
+
+%endmacro
+
-- 
GitLab


From e6e5e61c94108ef021ef59974863898bae1e003c Mon Sep 17 00:00:00 2001
From: Tomasz Kantecki
Date: Wed, 17 Jan 2024 17:00:57 +0000
Subject: [PATCH 24/30] avx512: [des] generate range of encrypt/decrypt
 functions for code re-use between DES, 3DES/TDES and DES-DOCSIS

- add new module with encrypt/decrypt functions that use the input/output
  registers commonly used in the code
- add wrapper macro so that either the code is expanded in place or a call
  is made to a generated/predefined function (much smaller memory footprint)
- rename DES constants so that they are unique and can be exported to
  other library modules
---
 lib/Makefile                        |   1 +
 lib/avx512_t1/des_common_avx512.asm | 221 ++++++++++
 lib/avx512_t1/des_x16_avx512.asm    |  72 +--
 lib/include/des_avx512.inc          | 653 ++++++++++++++++------------
 lib/win_x64.mak                     |   1 +
 5 files changed, 639 insertions(+), 309 deletions(-)
 create mode 100644 lib/avx512_t1/des_common_avx512.asm

diff --git a/lib/Makefile b/lib/Makefile
index d9e7b9a8..98a0986a 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -753,6 +753,7 @@ asm_avx512_lib_objs := \
 	sha256_x16_avx512.o \
 	sha512_x8_avx512.o \
 	des_x16_avx512.o \
+	des_common_avx512.o \
 	aes_ecb_vaes_avx512.o \
 	aes_ecb_quic_vaes_avx512.o \
 	aes_cntr_api_by16_vaes_avx512.o \
diff --git a/lib/avx512_t1/des_common_avx512.asm b/lib/avx512_t1/des_common_avx512.asm
new file mode 100644
index 00000000..d103615b
--- /dev/null
+++ b/lib/avx512_t1/des_common_avx512.asm
@@ -0,0 +1,221 @@
+;
+;; Copyright (c) 2024, Intel Corporation
+;;
+;; Redistribution and use in source and binary forms, with or without
+;; modification, are permitted provided that the following conditions are met:
+;;
+;;     * Redistributions of source code must retain the above copyright notice,
+;;       this list of conditions and the following disclaimer.
+;;     * Redistributions in binary form must reproduce the above copyright
+;;       notice, this list of conditions and the following disclaimer in the
+;;       documentation and/or other materials provided with the distribution.
+;;     * Neither the name of Intel Corporation nor the names of its contributors
+;;       may be used to endorse or promote products derived from this software
+;;       without specific prior written permission.
+;;
+;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+;;
+
+;; Collection of functions generated with the DES_ENC_DEC_EXP macro and preset input/output arguments.
+;; This method reduces code footprint while maintaining identical performance.
+;;
+;; If register usage changes then the generated functions below may need to be corrected.
+;; See the DES_ENC_DEC wrapper macro for more details.
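The scheme above maps naturally onto plain C: the cipher body exists once, and a family of thin, name-addressable wrappers binds it to fixed register pairs, instead of the full body being inlined at every call site. A minimal sketch of that trade-off, in C rather than NASM; des_rounds, DEF_PAIR and the des_enc_N_M names are illustrative inventions, not library code:

#include <stdio.h>

/* Stand-in for the 16-round DES body that DES_ENC_DEC_EXP expands;
 * the rounds below are placeholders, not real DES. */
static void des_rounds(unsigned long *r, unsigned long *l,
                       const unsigned long *ks)
{
        for (int i = 0; i < 16; i++) {
                unsigned long t = *r ^ ks[i];
                *r = *l;
                *l = t;
        }
}

/* One thin wrapper per fixed register pair, mirroring
 * des_enc_zmm0_zmm1_avx512, des_enc_zmm2_zmm3_avx512, ... */
#define DEF_PAIR(a, b)                                                \
        static void des_enc_##a##_##b(unsigned long regs[16],         \
                                      const unsigned long *ks)        \
        {                                                             \
                des_rounds(&regs[a], &regs[b], ks);                   \
        }

DEF_PAIR(0, 1)
DEF_PAIR(2, 3) /* the asm module instantiates eight such pairs per direction */

int main(void)
{
        unsigned long regs[16] = { 1, 2, 3, 4 };
        const unsigned long ks[16] = { 0 };

        des_enc_0_1(regs, ks); /* resolved by name; body emitted only once */
        des_enc_2_3(regs, ks);
        printf("%lu %lu %lu %lu\n", regs[0], regs[1], regs[2], regs[3]);
        return 0;
}

The make_call=0 path of the DES_ENC_DEC wrapper (later in this patch) corresponds to turning des_rounds back into a macro: each call site then carries its own copy of the body, trading footprint for the avoided call/ret.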
+ +%include "include/des_avx512.inc" + +;;; ======================================================== +;;; DATA + +extern des_mask_values_avx512 +extern des_init_perm_consts_avx512 +extern des_S_box_flipped_avx512 +extern des_vec_ones_32b_avx512 +extern des_and_eu_avx512 +extern des_and_ed_avx512 +extern des_idx_e_avx512 +extern des_reg_values16bit_7_avx512 +extern des_shuffle_reg_avx512 + +;;; ======================================================== +;;; CODE +mksection .text + +;;; >>>>>>>>>>>>>> ENCRYPT FUNCTIONS + +;;; r15 : key schedule pointer +;;; zmm0 : [in/out] R +;;; zmm1 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm0_zmm1_avx512,function,internal) +des_enc_zmm0_zmm1_avx512: + DES_ENC_DEC_EXP ENC,zmm0,zmm1,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm2 : [in/out] R +;;; zmm3 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm2_zmm3_avx512,function,internal) +des_enc_zmm2_zmm3_avx512: + DES_ENC_DEC_EXP ENC,zmm2,zmm3,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm4 : [in/out] R +;;; zmm5 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm4_zmm5_avx512,function,internal) +des_enc_zmm4_zmm5_avx512: + DES_ENC_DEC_EXP ENC,zmm4,zmm5,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm6 : [in/out] R +;;; zmm7 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm6_zmm7_avx512,function,internal) +des_enc_zmm6_zmm7_avx512: + DES_ENC_DEC_EXP ENC,zmm6,zmm7,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm8 : [in/out] R +;;; zmm9 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm8_zmm9_avx512,function,internal) +des_enc_zmm8_zmm9_avx512: + DES_ENC_DEC_EXP ENC,zmm8,zmm9,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm10 : [in/out] R +;;; zmm11 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm10_zmm11_avx512,function,internal) +des_enc_zmm10_zmm11_avx512: + DES_ENC_DEC_EXP ENC,zmm10,zmm11,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm12 : [in/out] R +;;; zmm13 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm12_zmm13_avx512,function,internal) +des_enc_zmm12_zmm13_avx512: + DES_ENC_DEC_EXP ENC,zmm12,zmm13,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm14 : [in/out] R +;;; zmm15 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm14_zmm15_avx512,function,internal) +des_enc_zmm14_zmm15_avx512: + DES_ENC_DEC_EXP ENC,zmm14,zmm15,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; CFB ONE use case +;;; r15 : key schedule pointer +;;; zmm18 : [in/out] R +;;; zmm19 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm18_zmm19_avx512,function,internal) +des_enc_zmm18_zmm19_avx512: + DES_ENC_DEC_EXP ENC,zmm18,zmm19,r15,zmm2,zmm3,zmm4,zmm5,zmm6,zmm7,zmm8,zmm9,zmm10,zmm11,zmm12,zmm13 + ret + +;;; >>>>>>>>>>>>>> DECRYPT FUNCTIONS + +;;; r15 : key schedule pointer +;;; zmm0 : [in/out] R +;;; zmm1 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm0_zmm1_avx512,function,internal) +des_dec_zmm0_zmm1_avx512: + DES_ENC_DEC_EXP DEC,zmm0,zmm1,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm2 : [in/out] R +;;; zmm3 : [in/out] L +align 64 
+MKGLOBAL(des_dec_zmm2_zmm3_avx512,function,internal) +des_dec_zmm2_zmm3_avx512: + DES_ENC_DEC_EXP DEC,zmm2,zmm3,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm4 : [in/out] R +;;; zmm5 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm4_zmm5_avx512,function,internal) +des_dec_zmm4_zmm5_avx512: + DES_ENC_DEC_EXP DEC,zmm4,zmm5,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm6 : [in/out] R +;;; zmm7 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm6_zmm7_avx512,function,internal) +des_dec_zmm6_zmm7_avx512: + DES_ENC_DEC_EXP DEC,zmm6,zmm7,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm8 : [in/out] R +;;; zmm9 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm8_zmm9_avx512,function,internal) +des_dec_zmm8_zmm9_avx512: + DES_ENC_DEC_EXP DEC,zmm8,zmm9,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm10 : [in/out] R +;;; zmm11 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm10_zmm11_avx512,function,internal) +des_dec_zmm10_zmm11_avx512: + DES_ENC_DEC_EXP DEC,zmm10,zmm11,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm12 : [in/out] R +;;; zmm13 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm12_zmm13_avx512,function,internal) +des_dec_zmm12_zmm13_avx512: + DES_ENC_DEC_EXP DEC,zmm12,zmm13,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm14 : [in/out] R +;;; zmm15 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm14_zmm15_avx512,function,internal) +des_dec_zmm14_zmm15_avx512: + DES_ENC_DEC_EXP DEC,zmm14,zmm15,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; CFB ONE use case +;;; r15 : key schedule pointer +;;; zmm18 : [in/out] R +;;; zmm19 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm18_zmm19_avx512,function,internal) +des_dec_zmm18_zmm19_avx512: + DES_ENC_DEC_EXP DEC,zmm18,zmm19,r15,zmm2,zmm3,zmm4,zmm5,zmm6,zmm7,zmm8,zmm9,zmm10,zmm11,zmm12,zmm13 + ret + +mksection stack-noexec diff --git a/lib/avx512_t1/des_x16_avx512.asm b/lib/avx512_t1/des_x16_avx512.asm index 635e3083..f1f6323f 100644 --- a/lib/avx512_t1/des_x16_avx512.asm +++ b/lib/avx512_t1/des_x16_avx512.asm @@ -25,19 +25,32 @@ ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;; -;; Authors: -;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2) -;; (1) University of Haifa, Israel -;; (2) Intel Corporation +;; DES, TDES/3DES and DES-DOCSIS API generation -;; In System V AMD64 ABI -;; callee saves: RBX, RBP, R12-R15 -;; Windows x64 ABI -;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +%include "include/des_avx512.inc" + +;;; ======================================================== +;;; External module functions needed here -;; Clobbers ZMM0-31 and K1 to K7 +extern des_enc_zmm0_zmm1_avx512 +extern des_enc_zmm2_zmm3_avx512 +extern des_enc_zmm4_zmm5_avx512 +extern des_enc_zmm6_zmm7_avx512 +extern des_enc_zmm8_zmm9_avx512 +extern des_enc_zmm10_zmm11_avx512 +extern des_enc_zmm12_zmm13_avx512 +extern des_enc_zmm14_zmm15_avx512 +extern des_enc_zmm18_zmm19_avx512 -%include "include/des_avx512.inc" +extern des_dec_zmm0_zmm1_avx512 +extern des_dec_zmm2_zmm3_avx512 +extern des_dec_zmm4_zmm5_avx512 +extern des_dec_zmm6_zmm7_avx512 +extern des_dec_zmm8_zmm9_avx512 +extern des_dec_zmm10_zmm11_avx512 +extern des_dec_zmm12_zmm13_avx512 +extern des_dec_zmm14_zmm15_avx512 +extern des_dec_zmm18_zmm19_avx512 ;;; ======================================================== ;;; DATA @@ -45,7 +58,8 @@ mksection .rodata default rel align 64 -mask_values: +MKGLOBAL(des_mask_values_avx512,data,internal) +des_mask_values_avx512: dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 @@ -124,7 +138,8 @@ mask_values: dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 align 64 -init_perm_consts: +MKGLOBAL(des_init_perm_consts_avx512,data,internal) +des_init_perm_consts_avx512: dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f @@ -148,7 +163,8 @@ init_perm_consts: ;;; S-Box table align 64 -S_box_flipped: +MKGLOBAL(des_S_box_flipped_avx512,data,internal) +des_S_box_flipped_avx512: ;; SBOX0 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a @@ -224,39 +240,45 @@ S_box_flipped: ;;; Used in DOCSIS DES partial block scheduling 16 x 32bit of value 1 align 64 -vec_ones_32b: +MKGLOBAL(des_vec_ones_32b_avx512,data,internal) +des_vec_ones_32b_avx512: dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 align 64 -and_eu: +MKGLOBAL(des_and_eu_avx512,data,internal) +des_and_eu_avx512: dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 align 64 -and_ed: +MKGLOBAL(des_and_ed_avx512,data,internal) +des_and_ed_avx512: dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f align 64 -idx_e: +MKGLOBAL(des_idx_e_avx512,data,internal) +des_idx_e_avx512: dq 0x0d0c090805040100, 0x0f0e0b0a07060302 dq 0x1d1c191815141110, 0x1f1e1b1a17161312 dq 0x2d2c292825242120, 0x2f2e2b2a27262322 dq 0x3d3c393835343130, 0x3f3e3b3a37363332 align 64 -reg_values16bit_7: +MKGLOBAL(des_reg_values16bit_7_avx512,data,internal) +des_reg_values16bit_7_avx512: dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f align 64 -shuffle_reg: +MKGLOBAL(des_shuffle_reg_avx512,data,internal) 
+des_shuffle_reg_avx512: dq 0x0705060403010200, 0x0f0d0e0c0b090a08 dq 0x1715161413111210, 0x1f1d1e1c1b191a18 dq 0x2725262423212220, 0x2f2d2e2c2b292a28 @@ -271,7 +293,7 @@ mksection .text align 64 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal) des_x16_cbc_enc_avx512: - GENERIC_DES_ENC DES + GENERIC_DES_ENC DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -279,7 +301,7 @@ des_x16_cbc_enc_avx512: align 64 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal) des_x16_cbc_dec_avx512: - GENERIC_DES_DEC DES + GENERIC_DES_DEC DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -287,7 +309,7 @@ des_x16_cbc_dec_avx512: align 64 MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal) des3_x16_cbc_enc_avx512: - GENERIC_DES_ENC 3DES + GENERIC_DES_ENC 3DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -295,7 +317,7 @@ des3_x16_cbc_enc_avx512: align 64 MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal) des3_x16_cbc_dec_avx512: - GENERIC_DES_DEC 3DES + GENERIC_DES_DEC 3DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -303,7 +325,7 @@ des3_x16_cbc_dec_avx512: align 64 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal) docsis_des_x16_enc_avx512: - GENERIC_DES_ENC DOCSIS + GENERIC_DES_ENC DOCSIS, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -311,7 +333,7 @@ docsis_des_x16_enc_avx512: align 64 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal) docsis_des_x16_dec_avx512: - GENERIC_DES_DEC DOCSIS + GENERIC_DES_DEC DOCSIS, arg1, arg2 ret mksection stack-noexec diff --git a/lib/include/des_avx512.inc b/lib/include/des_avx512.inc index c9abc881..47ea79c1 100644 --- a/lib/include/des_avx512.inc +++ b/lib/include/des_avx512.inc @@ -1,5 +1,5 @@ ;; -;; Copyright (c) 2017-2024, Intel Corporation +;; Copyright (c) 2024, Intel Corporation ;; ;; Redistribution and use in source and binary forms, with or without ;; modification, are permitted provided that the following conditions are met: @@ -30,6 +30,8 @@ ;; (1) University of Haifa, Israel ;; (2) Intel Corporation +%use smartalign + %include "include/os.inc" %include "include/reg_sizes.inc" %include "include/mb_mgr_datastruct.inc" @@ -50,9 +52,6 @@ %define arg4 r9 %endif -%define STATE arg1 -%define SIZE arg2 - %define OFFSET rax %define IA0 arg3 @@ -179,11 +178,11 @@ endstruc %define %%L %1 %define %%R %2 %define %%T0 %3 - PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0 - PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0 + PERMUTE %%R, %%L, 4, [rel des_init_perm_consts_avx512 + 0*64], %%T0 + PERMUTE %%L, %%R, 16, [rel des_init_perm_consts_avx512 + 1*64], %%T0 + PERMUTE %%R, %%L, 2, [rel des_init_perm_consts_avx512 + 2*64], %%T0 + PERMUTE %%L, %%R, 8, [rel des_init_perm_consts_avx512 + 3*64], %%T0 + PERMUTE %%R, %%L, 1, [rel des_init_perm_consts_avx512 + 4*64], %%T0 %endmacro ;;; =========================================================================== @@ -196,11 +195,11 @@ endstruc %define %%L %1 %define %%R %2 %define %%T0 %3 - PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0 - PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0 + PERMUTE %%L, %%R, 1, [rel des_init_perm_consts_avx512 + 4*64], %%T0 + PERMUTE %%R, %%L, 
8, [rel des_init_perm_consts_avx512 + 3*64], %%T0 + PERMUTE %%L, %%R, 2, [rel des_init_perm_consts_avx512 + 2*64], %%T0 + PERMUTE %%R, %%L, 16, [rel des_init_perm_consts_avx512 + 1*64], %%T0 + PERMUTE %%L, %%R, 4, [rel des_init_perm_consts_avx512 + 0*64], %%T0 %endmacro ;;; =========================================================================== @@ -218,68 +217,68 @@ endstruc %define %%T3 %5 vprord %%T0, %%W0, 3 - vpandd %%T0, %%T0, [rel mask_values + 0*64] + vpandd %%T0, %%T0, [rel des_mask_values_avx512 + 0*64] vprord %%T1, %%W0, 5 - vpandd %%T1, %%T1, [rel mask_values + 1*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 1*64] vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 24 - vpandd %%T1, %%T1, [rel mask_values + 2*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 2*64] vprord %%T2, %%W0, 26 - vpandd %%T2, %%T2, [rel mask_values + 3*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 3*64] vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 15 - vpandd %%T1, %%T1, [rel mask_values + 4*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 4*64] vprord %%T2, %%W0, 17 - vpandd %%T2, %%T2, [rel mask_values + 5*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 5*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 6 - vpandd %%T2, %%T2, [rel mask_values + 6*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 6*64] vprord %%T3, %%W0, 21 - vpandd %%T3, %%T3, [rel mask_values + 7*64] + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 7*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 12 - vpandd %%T1, %%T1, [rel mask_values + 8*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 8*64] vprord %%T2, %%W0, 14 - vpandd %%T2, %%T2, [rel mask_values + 9*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 9*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 4 - vpandd %%T2, %%T2, [rel mask_values + 10*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 10*64] vprord %%T3, %%W0, 11 - vpandd %%T3, %%T3, [rel mask_values + 11*64] + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 11*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 16 - vpandd %%T1, %%T1, [rel mask_values + 12*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 12*64] vprord %%T2, %%W0, 22 - vpandd %%T2, %%T2, [rel mask_values + 13*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 13*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 19 - vpandd %%T2, %%T2, [rel mask_values + 14*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 14*64] vprord %%T3, %%W0, 10 - vpandd %%T3, %%T3, [rel mask_values + 15*64] + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 15*64] vpord %%T2, %%T2, %%T3 vpord %%T1, %%T1, %%T2 vpord %%T0, %%T0, %%T1 vprord %%T1, %%W0, 9 - vpandd %%T1, %%T1, [rel mask_values + 16*64] + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 16*64] vprord %%T2, %%W0, 13 - vpandd %%T2, %%T2, [rel mask_values + 17*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 17*64] vpord %%T1, %%T1, %%T2 vprord %%T2, %%W0, 25 - vpandd %%T2, %%T2, [rel mask_values + 18*64] + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 18*64] vpord %%T1, %%T1, %%T2 vpord %%W0, %%T0, %%T1 %endmacro @@ -313,18 +312,18 @@ endstruc vprord %%T0, %%IN, 31 vprord %%T1, %%IN, 3 - vpshufb %%T0, %%T0, [rel idx_e] - vpshufb %%T1, %%T1, [rel idx_e] + vpshufb %%T0, %%T0, [rel des_idx_e_avx512] + vpshufb %%T1, %%T1, [rel des_idx_e_avx512] vpunpcklbw %%OUT0A, %%T0, %%T1 vpunpckhbw %%OUT1A, %%T0, %%T1 vpxord %%OUT0A, %%OUT0A, %%K0 vpxord %%OUT1A, 
%%OUT1A, %%K1 - vpandd %%OUT0B, %%OUT0A, [rel and_eu] + vpandd %%OUT0B, %%OUT0A, [rel des_and_eu_avx512] vpsrlw %%OUT0B, %%OUT0B, 8 - vpandd %%OUT0A, %%OUT0A, [rel and_ed] - vpandd %%OUT1B, %%OUT1A, [rel and_eu] + vpandd %%OUT0A, %%OUT0A, [rel des_and_ed_avx512] + vpandd %%OUT1B, %%OUT1A, [rel des_and_eu_avx512] vpsrlw %%OUT1B, %%OUT1B, 8 - vpandd %%OUT1A, %%OUT1A, [rel and_ed] + vpandd %%OUT1A, %%OUT1A, [rel des_and_ed_avx512] %endmacro ;;; =========================================================================== @@ -352,7 +351,7 @@ endstruc %define %%T4 %10 %define %%T5 %11 - vmovdqa64 %%T0, [rel reg_values16bit_7] + vmovdqa64 %%T0, [rel des_reg_values16bit_7_avx512] vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE @@ -363,10 +362,10 @@ endstruc mov DWORD(IA0), 0xaaaaaaaa kmovd k2, DWORD(IA0) - vmovdqa64 %%T0, [rel S_box_flipped + 0*64] - vmovdqa64 %%T1, [rel S_box_flipped + 1*64] - vmovdqa64 %%T2, [rel S_box_flipped + 4*64] - vmovdqa64 %%T3, [rel S_box_flipped + 5*64] + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 0*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 1*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 4*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 5*64] vpermw %%T0{k1}{z}, %%IN0A, %%T0 vpermw %%T1{k1}{z}, %%IN0A, %%T1 vpermw %%T2{k2}{z}, %%IN0A, %%T2 @@ -375,10 +374,10 @@ endstruc vpxord %%OUT, %%T1, %%T3 vmovdqu16 %%OUT{k3}, %%T0 - vmovdqa64 %%T0, [rel S_box_flipped + 2*64] - vmovdqa64 %%T1, [rel S_box_flipped + 3*64] - vmovdqa64 %%T2, [rel S_box_flipped + 6*64] - vmovdqa64 %%T3, [rel S_box_flipped + 7*64] + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 2*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 3*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 6*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 7*64] vpermw %%T0{k1}{z}, %%IN0B, %%T0 vpermw %%T1{k1}{z}, %%IN0B, %%T1 vpermw %%T2{k2}{z}, %%IN0B, %%T2 @@ -389,10 +388,10 @@ endstruc vpsllw %%T3, %%T3, 4 vpxord %%OUT, %%OUT, %%T3 - vmovdqa64 %%T0, [rel S_box_flipped + 8*64] - vmovdqa64 %%T1, [rel S_box_flipped + 9*64] - vmovdqa64 %%T2, [rel S_box_flipped + 12*64] - vmovdqa64 %%T3, [rel S_box_flipped + 13*64] + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 8*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 9*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 12*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 13*64] vpermw %%T0{k1}{z}, %%IN1A, %%T0 vpermw %%T1{k1}{z}, %%IN1A, %%T1 vpermw %%T2{k2}{z}, %%IN1A, %%T2 @@ -401,10 +400,10 @@ endstruc vpxord %%T4, %%T1, %%T3 vmovdqu16 %%T4{k5}, %%T0 - vmovdqa64 %%T0, [rel S_box_flipped + 10*64] - vmovdqa64 %%T1, [rel S_box_flipped + 11*64] - vmovdqa64 %%T2, [rel S_box_flipped + 14*64] - vmovdqa64 %%T3, [rel S_box_flipped + 15*64] + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 10*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 11*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 14*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 15*64] vpermw %%T0{k1}{z}, %%IN1B, %%T0 vpermw %%T1{k1}{z}, %%IN1B, %%T1 vpermw %%T2{k2}{z}, %%IN1B, %%T2 @@ -417,7 +416,7 @@ endstruc vpxord %%T4, %%T4, %%T5 vpsllw %%T4, %%T4, 8 vpxord %%OUT, %%OUT, %%T4 - vpshufb %%OUT, %%OUT, [rel shuffle_reg] + vpshufb %%OUT, %%OUT, [rel des_shuffle_reg_avx512] %endmacro ;;; =========================================================================== @@ -431,7 +430,7 @@ endstruc ;;; L [in/out] - zmm register; plain text in & cipher text out ;;; KS [in] - pointer to the key 
schedule
 ;;; T0-T11 [clobbered] - temporary zmm register
-%macro DES_ENC_DEC 16
+%macro DES_ENC_DEC_EXP 16
 %define %%ENC_DEC %1
 %define %%R %2
 %define %%L %3
@@ -449,11 +448,23 @@ endstruc
 %define %%T10 %15
 %define %%T11 %16
 
+
+        ;; Uncomment the section below & compile to see macro invocations in the code (registers & parameters).
+        ;;
+        ;; %define arglist "arglist:"
+        ;; %rep %0
+        ;; %xdefine arglist arglist %+ %1 %+ ,
+        ;; %rotate 1
+        ;; %endrep
+        ;; %warning arglist
+        ;; %undef arglist
+
         IP_Z %%R, %%L, %%T0
 
 %ifidn %%ENC_DEC, ENC
         ;; ENCRYPTION
         xor KSOFFSET, KSOFFSET
+align 32
 %%_des_enc_loop:
         E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7
         S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
@@ -472,6 +483,7 @@ endstruc
 %else
         ;; DECRYPTION
         mov KSOFFSET, (8*(4*64))
+align 32
 %%_des_dec_loop:
         E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7
         S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11
@@ -490,6 +502,93 @@ endstruc
 
 %endmacro
 
+;;; ===========================================================================
+;;; Wrapper macro for DES_ENC_DEC_EXP
+;;; ===========================================================================
+;;;
+%macro DES_ENC_DEC 16
+%define %%ENC_DEC %1
+%define %%R %2
+%define %%L %3
+%define %%KS %4
+
+        ;; Make a call to a function or expand the macro with algorithmic code
+        ;; 0 - expand - use 0 to verify macro invocations & parameters vs functions in des_common_avx512.asm
+        ;; 1 - make call - use it in production (smaller code footprint)
+%assign make_call 1
+
+%if make_call != 0
+
+        ;; Retrieve R and L ZMM register numbers
+        ;; - this fragment could look better (with newer NASM) but it is compatible with NASM 2.14.02
+        ;; - map R and L to strings
+        ;; - get sub-strings with the ZMM number
+        ;; - make a new definition using the sub-strings - it results in a number that can be compared
+%defstr %%RSTR %%R
+%defstr %%LSTR %%L
+%substr %%r_idx %%RSTR 4, -1
+%substr %%l_idx %%LSTR 4, -1
+%define %%RNUM %%r_idx
+%define %%LNUM %%l_idx
+
+        ;; swap input/output zmm's only if the R zmm has a higher number than the L one
+%assign %%swap_zmms 0
+%if %%RNUM > %%LNUM
+%assign %%swap_zmms 1
+%endif
+
+%if %%swap_zmms != 0
+        ;; register names are swapped
+        ;; - meaning there is a function generated for the same pair of zmm's but they are swapped (R with L)
+        ;; - the idea is to re-use the existing function and swap register values before the call (3DES/TDES use case)
+%define %%NEW_R %%L
+%define %%NEW_L %%R
+        vmovdqa64 %5, %%R
+        vmovdqa64 %%R, %%L
+        vmovdqa64 %%L, %5
+%else
+        ;; no swap needed
+%define %%NEW_R %%R
+%define %%NEW_L %%L
+%endif
+
+        ;; construct the name of the function to be called
+        ;; des_<enc|dec>_<R zmm>_<L zmm>_avx512
+%ifidn %%ENC_DEC, ENC
+%define %%fn_name des_enc_ %+ %%NEW_R %+ _ %+ %%NEW_L %+ _avx512
+%else
+%define %%fn_name des_dec_ %+ %%NEW_R %+ _ %+ %%NEW_L %+ _avx512
+%endif
+
+        lea r15, [%%KS] ;; r15 is safe to be used as an input argument
+        call %%fn_name
+
+%if %%swap_zmms != 0
+        ;; register names were swapped, unswap them
+        vmovdqa64 %5, %%R
+        vmovdqa64 %%R, %%L
+        vmovdqa64 %%L, %5
+%endif
+
+        ;; clean up temporary macro definitions
+%undef %%fn_name
+%undef %%NEW_R
+%undef %%NEW_L
+%undef %%r_idx
+%undef %%l_idx
+%undef %%RSTR
+%undef %%LSTR
+%undef %%RNUM
+%undef %%LNUM
+%undef %%swap_zmms
+
+%else
+        ;; Expand the macro in-place
+        DES_ENC_DEC_EXP %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16
+%endif
+
+%endmacro
+
 ;;; 
=========================================================================== ;;; DATA TRANSPOSITION AT DATA INPUT ;;; =========================================================================== @@ -1067,43 +1166,40 @@ endstruc ;;; DES FINISH ;;; Update in/out pointers and store IV ;;; =========================================================================== -;;; -;;; Needs: STATE & SIZE -;;; IV0 [in] - r512; initialization vector -;;; IV1 [in] - r512; initialization vector -;;; T0-T4 [clobbered] - temporary r512 registers -%macro DES_FINISH 7 -%define %%IV0 %1 -%define %%IV1 %2 -%define %%T0 %3 -%define %%T1 %4 -%define %%T2 %5 -%define %%T3 %6 -%define %%T4 %7 - - vpbroadcastq %%T4, SIZE - vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)] - vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)] +%macro DES_FINISH 9 +%define %%IV0 %1 ;; [in] zmm initialization vector (IV) +%define %%IV1 %2 ;; [in] zmm initialization vector (IV) +%define %%T0 %3 ;; [clobbered] temporary zmm +%define %%T1 %4 ;; [clobbered] temporary zmm +%define %%T2 %5 ;; [clobbered] temporary zmm +%define %%T3 %6 ;; [clobbered] temporary zmm +%define %%T4 %7 ;; [clobbered] temporary zmm +%define %%STATE %8 ;; [in] pointer to OOO manager +%define %%SIZE %9 ;; [in] processed message size in bytes + + vpbroadcastq %%T4, %%SIZE + vmovdqu64 %%T0, [%%STATE + _des_args_in + (0 * PTR_SZ)] + vmovdqu64 %%T1, [%%STATE + _des_args_in + (8 * PTR_SZ)] + vmovdqu64 %%T2, [%%STATE + _des_args_out + (0 * PTR_SZ)] + vmovdqu64 %%T3, [%%STATE + _des_args_out + (8 * PTR_SZ)] vpaddq %%T0, %%T0, %%T4 vpaddq %%T1, %%T1, %%T4 vpaddq %%T2, %%T2, %%T4 vpaddq %%T3, %%T3, %%T4 - vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0 - vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1 - vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2 - vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3 + vmovdqu64 [%%STATE + _des_args_in + (0 * PTR_SZ)], %%T0 + vmovdqu64 [%%STATE + _des_args_in + (8 * PTR_SZ)], %%T1 + vmovdqu64 [%%STATE + _des_args_out + (0 * PTR_SZ)], %%T2 + vmovdqu64 [%%STATE + _des_args_out + (8 * PTR_SZ)], %%T3 - vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0 - vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1 + vmovdqu64 [%%STATE + _des_args_IV + (0 * 64)], %%IV0 + vmovdqu64 [%%STATE + _des_args_IV + (1 * 64)], %%IV1 %endmacro ;;; =========================================================================== ;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY ;;; =========================================================================== ;;; -;;; Needs: STATE, IA0-IA2 +;;; Needs: IA0-IA2 ;;; ENC_DEC [in] - encyrpt (ENC) or decrypt (DEC) selection ;;; KS [in] - key schedule ;;; T0-T24 [clobbered] - temporary r512 @@ -1113,7 +1209,7 @@ endstruc ;;; T_IV [in] - 16 * 8 byte storage ;;; ;;; NOTE: clobbers OpMask registers -%macro DES_CFB_ONE 31 +%macro DES_CFB_ONE 32 %define %%ENC_DEC %1 %define %%KS %2 %define %%T0 %3 @@ -1145,10 +1241,11 @@ endstruc %define %%T_OUT %29 %define %%T_IV %30 %define %%T_MASK %31 +%define %%STATE %32 ;; [in] GP with pointer to OOO manager ;; - find mask for non-zero partial lengths vpxord %%T10, %%T10, %%T10 - vmovdqu64 %%T0, [STATE + _des_args_PLen] + vmovdqu64 %%T0, [%%STATE + _des_args_PLen] vpcmpd k3, %%T0, %%T10, 4 ; NEQ kmovw DWORD(IA0), k3 movzx DWORD(IA0), WORD(IA0) @@ -1161,10 +1258,10 @@ endstruc ;; with CFB partial block. 
;; To do that current out position is compared against ;; calculated last full block position. - vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)] - vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)] - vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)] - vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)] + vmovdqu64 %%T1, [%%STATE + _des_args_out + (0*8)] + vmovdqu64 %%T2, [%%STATE + _des_args_LOut + (0*8)] + vmovdqu64 %%T3, [%%STATE + _des_args_out + (8*8)] + vmovdqu64 %%T4, [%%STATE + _des_args_LOut + (8*8)] vpcmpq k4, %%T1, %%T2, 0 ; EQ vpcmpq k5, %%T3, %%T4, 0 ; EQ kmovw DWORD(IA1), k4 @@ -1180,21 +1277,21 @@ endstruc ;; Calculate ((1 << partial_bytes) - 1) ;; in order to get the mask for loads and stores ;; k3 & IA0 - hold valid mask - vmovdqa64 %%T1, [rel vec_ones_32b] + vmovdqa64 %%T1, [rel des_vec_ones_32b_avx512] vpsllvd %%T2{k3}{z}, %%T1, %%T0 vpsubd %%T2{k3}{z}, %%T2, %%T1 vmovdqu64 [%%T_MASK], %%T2 ;; clear selected partial lens not to do them twice - vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10 + vmovdqu32 [%%STATE + _des_args_PLen]{k3}, %%T10 ;; copy IV, in and out pointers - vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)] - vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)] - vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)] - vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)] + vmovdqu64 %%T1, [%%STATE + _des_args_in + (0*PTR_SZ)] + vmovdqu64 %%T2, [%%STATE + _des_args_in + (8*PTR_SZ)] + vmovdqu64 %%T3, [%%STATE + _des_args_out + (0*PTR_SZ)] + vmovdqu64 %%T4, [%%STATE + _des_args_out + (8*PTR_SZ)] + vmovdqu64 %%T5, [%%STATE + _des_args_IV + (0*64)] + vmovdqu64 %%T6, [%%STATE + _des_args_IV + (1*64)] vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1 vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2 vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3 @@ -1204,7 +1301,7 @@ endstruc ;; calculate last block case mask ;; - first block case requires no modifications to in/out/IV - vmovdqu64 %%T1, [STATE + _des_args_BLen] + vmovdqu64 %%T1, [%%STATE + _des_args_BLen] vpcmpd k2, %%T1, %%T10, 4 ; NEQ kmovw DWORD(IA1), k2 and DWORD(IA1), DWORD(IA0) @@ -1220,10 +1317,10 @@ endstruc and DWORD(IA2), 0xff kmovw k4, DWORD(IA2) kmovw k5, DWORD(IA1) - vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)] - vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)] + vmovdqu64 %%T1, [%%STATE + _des_args_LOut + (0*PTR_SZ)] + vmovdqu64 %%T2, [%%STATE + _des_args_LOut + (8*PTR_SZ)] + vmovdqu64 %%T3, [%%STATE + _des_args_LIn + (0*PTR_SZ)] + vmovdqu64 %%T4, [%%STATE + _des_args_LIn + (8*PTR_SZ)] vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1 vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2 vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3 @@ -1236,9 +1333,9 @@ endstruc test DWORD(IA0), (1 << IDX) jz %%_des_cfb_one_copy_iv_next %+ IDX %ifidn %%ENC_DEC, ENC - mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)] + mov IA2, [%%STATE + _des_args_LOut + (IDX*PTR_SZ)] %else - mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)] + mov IA2, [%%STATE + _des_args_LIn + (IDX*PTR_SZ)] %endif mov IA2, [IA2 - 8] mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2) @@ -1323,30 +1420,18 @@ endstruc ;;; Converts length into mask of DES blocks ;;; =========================================================================== ;;; -;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64) -;;; USES: IA0, IA1 IA2 ;;; ASSUMES: SIZE - OFFSET < 64 -%macro GET_MASK8 1 -%define 
%%MASK %1 - -%ifidn IA1, rcx -%define myrcx IA1 -%else -%define myrcx rcx - mov IA1, rcx -%endif - mov myrcx, SIZE - sub myrcx, OFFSET - ;; - myrcx - remaining length - ;; - divide by 8 (DES block size) - ;; - create bit mask of the result - mov DWORD(%%MASK), 1 - shr DWORD(myrcx), 3 - shl DWORD(%%MASK), BYTE(myrcx) - sub DWORD(%%MASK), 1 -%ifnidn IA1, rcx - mov rcx, IA1 -%endif +%macro GET_MASK8 3 +%define %%MASK %1 ;; [out] GP for mask value (load/store) +%define %%SIZE %2 ;; [in] GP with message size in bytes +%define %%TMP %3 ;; [clobbered] temporary GP + + xor %%MASK, %%MASK + mov %%TMP, %%SIZE + sub %%TMP, OFFSET + shr %%TMP, 3 + bts %%MASK, %%TMP + sub %%MASK, 1 %endmacro ;;; =========================================================================== @@ -1499,8 +1584,10 @@ endstruc ;;; 3DES (3DES CBC) ;;; ;;; NOTE: clobbers OpMask registers -%macro GENERIC_DES_ENC 1 -%define %%DES_DOCSIS %1 +%macro GENERIC_DES_ENC 3 +%define %%DES_DOCSIS %1 ;; [in] select between DES (DES CBC), DOCSIS (DOCSIS DES) and 3DES (3DES CBC) +%define %%STATE %2 ;; [in] GP with pointer to OOO manager +%define %%SIZE %3 ;; [in] GP with message size in bytes ;; push the registers and allocate the stack frame mov rax, rsp @@ -1514,28 +1601,28 @@ endstruc %ifnidn %%DES_DOCSIS, 3DES ;; DES and DOCSIS DES - DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + DES_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 %else ;; 3DES - DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC + DES3_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC %endif - mov [rsp + _size_save], SIZE - and SIZE, -64 + mov [rsp + _size_save], %%SIZE + and %%SIZE, -64 xor OFFSET, OFFSET ;; This loop processes message in blocks of 64 bytes. ;; Anything smaller than 64 bytes is handled separately after the loop. 
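For reference, the size arithmetic at work here: "and %%SIZE, -64" rounds the byte count down to whole 64-byte chunks for the loop below, and GET_MASK8 above converts the sub-64-byte remainder into a per-qword mask (one bit per 8-byte DES block) that is loaded into k7 for the masked vmovdqu64 accesses. A hedged C model of that split, with illustrative names only:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Mirrors GET_MASK8: assumes 0 < (size - offset) < 64 and that the length
 * is a whole number of 8-byte DES blocks; one mask bit per remaining block. */
static uint8_t qword_mask(uint64_t size, uint64_t offset)
{
        return (uint8_t)((1u << ((size - offset) >> 3)) - 1);
}

int main(void)
{
        uint64_t size = 200, offset;

        /* main loop: full 64-byte chunks (8 DES blocks per lane) */
        for (offset = 0; offset < (size & ~(uint64_t)63); offset += 64)
                printf("bulk chunk at offset %" PRIu64 "\n", offset);

        /* tail: the remaining (size mod 64) bytes go through masked
         * loads/stores; this is the mask value that lands in k7 */
        if (offset < size)
                printf("tail of %" PRIu64 " bytes -> mask 0x%02x\n",
                       size - offset, qword_mask(size, offset));
        return 0;
}

With size = 200 the sketch prints bulk chunks at offsets 0, 64 and 128, then a tail of 8 bytes with mask 0x01, matching one outstanding DES block.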
%%_gen_des_enc_loop: - cmp OFFSET, SIZE + cmp OFFSET, %%SIZE jz %%_gen_des_enc_loop_end ;; run loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0, [IA0 + OFFSET] vmovdqu64 ZW1, [IA1 + OFFSET] vmovdqu64 ZW2, [IA2 + OFFSET] @@ -1545,14 +1632,14 @@ endstruc vmovdqu64 ZW6, [INP3 + OFFSET] vmovdqu64 ZW7, [INP4 + OFFSET] - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8, [IA0 + OFFSET] vmovdqu64 ZW9, [IA1 + OFFSET] vmovdqu64 ZW10, [IA2 + OFFSET] @@ -1578,14 +1665,14 @@ endstruc ;; transpose data on output TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET], ZW0 vmovdqu64 [IA1 + OFFSET], ZW1 vmovdqu64 [IA2 + OFFSET], ZW2 @@ -1595,14 +1682,14 @@ endstruc vmovdqu64 [INP3 + OFFSET], ZW6 vmovdqu64 [INP4 + OFFSET], ZW7 - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov 
INP4, [STATE + _des_args_out + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET], ZW8 vmovdqu64 [IA1 + OFFSET], ZW9 vmovdqu64 [IA2 + OFFSET], ZW10 @@ -1617,23 +1704,23 @@ endstruc %%_gen_des_enc_loop_end: ;; This is where we check if there is anything less than 64 bytes ;; of message left for processing. - mov SIZE, [rsp + _size_save] - cmp OFFSET, SIZE + mov %%SIZE, [rsp + _size_save] + cmp OFFSET, %%SIZE jz %%_gen_des_enc_part_end ;; calculate min of bytes_left and 64, convert to qword mask - GET_MASK8 IA0 ; IA0 = mask + GET_MASK8 IA0, %%SIZE, IA1 ; IA0 = mask kmovw k7, DWORD(IA0) mov [rsp + _mask_save], IA0 ;; run masked loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] @@ -1643,14 +1730,14 @@ endstruc vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] @@ -1746,14 +1833,14 @@ endstruc TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + 
_des_args_out + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 @@ -1763,14 +1850,14 @@ endstruc vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 @@ -1782,11 +1869,11 @@ endstruc %%_gen_des_enc_part_end: ;; store IV and update pointers - DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, %%STATE, %%SIZE ;; CFB part for DOCSIS %ifidn %%DES_DOCSIS, DOCSIS - DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask + DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask, %%STATE %endif CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 @@ -1810,12 +1897,11 @@ endstruc ;;; DES CBC / DOCSIS DES DECRYPT ;;; =========================================================================== ;;; -;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and -;;; 3DES (3DES CBC) -;;; ;;; NOTE: clobbers OpMask registers -%macro GENERIC_DES_DEC 1 -%define %%DES_DOCSIS %1 +%macro GENERIC_DES_DEC 3 +%define %%DES_DOCSIS %1 ;; [in] select between DES (DES CBC), DOCSIS (DOCSIS DES) and 3DES (3DES CBC) +%define %%STATE %2 ;; [in] GP with pointer to OOO manager +%define %%SIZE %3 ;; [in] GP with message size in bytes ;; push the registers and allocate the stack frame mov rax, rsp @@ -1829,34 +1915,34 @@ endstruc %ifnidn %%DES_DOCSIS, 3DES ;; DES and DOCSIS - DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + DES_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, 
ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 %else ;; 3DES - DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC + DES3_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC %endif ;; CFB part for DOCSIS %ifidn %%DES_DOCSIS, DOCSIS - DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask + DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask, %%STATE %endif - mov [rsp + _size_save], SIZE - and SIZE, -64 + mov [rsp + _size_save], %%SIZE + and %%SIZE, -64 xor OFFSET, OFFSET ;; This loop processes message in blocks of 64 bytes. ;; Anything smaller than 64 bytes is handled separately after the loop. %%_gen_des_dec_loop: - cmp OFFSET, SIZE + cmp OFFSET, %%SIZE jz %%_gen_des_dec_loop_end ;; run loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0, [IA0 + OFFSET] vmovdqu64 ZW1, [IA1 + OFFSET] vmovdqu64 ZW2, [IA2 + OFFSET] @@ -1866,14 +1952,14 @@ endstruc vmovdqu64 ZW6, [INP3 + OFFSET] vmovdqu64 ZW7, [INP4 + OFFSET] - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8, [IA0 + OFFSET] vmovdqu64 ZW9, [IA1 + OFFSET] vmovdqu64 ZW10, [IA2 + OFFSET] @@ -1898,14 +1984,14 @@ endstruc TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, 
ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET], ZW0 vmovdqu64 [IA1 + OFFSET], ZW1 vmovdqu64 [IA2 + OFFSET], ZW2 @@ -1915,14 +2001,14 @@ endstruc vmovdqu64 [INP3 + OFFSET], ZW6 vmovdqu64 [INP4 + OFFSET], ZW7 - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET], ZW8 vmovdqu64 [IA1 + OFFSET], ZW9 vmovdqu64 [IA2 + OFFSET], ZW10 @@ -1937,23 +2023,23 @@ endstruc %%_gen_des_dec_loop_end: ;; This is where we check if there is anything less than 64 bytes ;; of message left for processing. 
- mov SIZE, [rsp + _size_save] - cmp OFFSET, SIZE + mov %%SIZE, [rsp + _size_save] + cmp OFFSET, %%SIZE jz %%_gen_des_dec_part_end ;; calculate min of bytes_left and 64, convert to qword mask - GET_MASK8 IA0 ; IA0 = mask + GET_MASK8 IA0, %%SIZE, IA1 ; IA0 = mask kmovw k7, DWORD(IA0) mov [rsp + _mask_save], IA0 ;; run masked loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] @@ -1963,14 +2049,14 @@ endstruc vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] @@ -2062,14 +2148,14 @@ endstruc TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 @@ -2079,14 +2165,14 @@ endstruc vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + 
_des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 @@ -2098,7 +2184,7 @@ endstruc %%_gen_des_dec_part_end: ;; store IV and update pointers - DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, %%STATE, %%SIZE CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 @@ -2116,4 +2202,3 @@ endstruc %endif ;; SAFE_DATA %endmacro - diff --git a/lib/win_x64.mak b/lib/win_x64.mak index ff8af96e..2828abb1 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -304,6 +304,7 @@ lib_objs1 = \ $(OBJ_DIR)\poly_avx512.obj \ $(OBJ_DIR)\poly_fma_avx512.obj \ $(OBJ_DIR)\des_x16_avx512.obj \ + $(OBJ_DIR)\des_common_avx512.obj \ $(OBJ_DIR)\aes_cntr_api_by16_vaes_avx512.obj \ $(OBJ_DIR)\aes_cntr_bit_api_by16_vaes_avx512.obj \ $(OBJ_DIR)\aes_cntr_ccm_api_by16_vaes_avx512.obj \ -- GitLab From ea21086437abd0316ca0469b7da70c250e194c60 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Mon, 22 Jan 2024 13:47:01 +0000 Subject: [PATCH 25/30] avx512: [des] replace custom TRANSPOSE IN/OUT with standard TRANSPOSE16_U32 macro --- lib/include/des_avx512.inc | 261 ++----------------------------------- 1 file changed, 14 insertions(+), 247 deletions(-) diff --git a/lib/include/des_avx512.inc b/lib/include/des_avx512.inc index 47ea79c1..2a8b2a64 100644 --- a/lib/include/des_avx512.inc +++ b/lib/include/des_avx512.inc @@ -39,6 +39,7 @@ ;%define DO_DBGPRINT ;%include "include/dbgprint.inc" %include "include/clear_regs.inc" +%include "include/transpose_avx512.inc" %ifdef LINUX %define arg1 rdi @@ -582,247 +583,13 @@ align 32 %undef %%LNUM %undef %%swap_zmms -%else +%else ; make_call != 0 ;; Expand the macro in-place DES_ENC_DEC_EXP %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16 %endif %endmacro -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA INPUT -;;; =========================================================================== -;;; -;;; IN00 - IN15 [in/out]: -;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data -;;; out: R0 - 16 x word0, L0 - 16 x word1, ... 
L7 - 16 x word15 -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_IN 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN00, %%IN01 - vpunpckhdq %%K1, %%IN00, %%IN01 - vpunpckldq %%T0, %%IN02, %%IN03 - vpunpckhdq %%T1, %%IN02, %%IN03 - - vpunpckldq %%IN00, %%IN04, %%IN05 - vpunpckhdq %%IN01, %%IN04, %%IN05 - vpunpckldq %%IN02, %%IN06, %%IN07 - vpunpckhdq %%IN03, %%IN06, %%IN07 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN08, %%IN09 - vpunpckhdq %%K5, %%IN08, %%IN09 - vpunpckldq %%IN04, %%IN10, %%IN11 - vpunpckhdq %%IN05, %%IN10, %%IN11 - vpunpckldq %%IN06, %%IN12, %%IN13 - vpunpckhdq %%IN07, %%IN12, %%IN13 - vpunpckldq %%IN10, %%IN14, %%IN15 - vpunpckhdq %%IN11, %%IN14, %%IN15 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 - vshufi64x2 %%H3, %%IN14, %%IN02, 0xee - vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 - vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T1, 0x44 - vshufi64x2 %%H1, %%T3, %%T1, 0xee - vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 - vshufi64x2 %%H3, %%IN15, %%IN03, 0xee - vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 - vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 - vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA OUTPUT -;;; =========================================================================== -;;; -;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: -;;; in: R0 - 16 x word0, L0 
- 16 x word1, ... L7 - 16 x word15 -;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_OUT 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN01, %%IN00 - vpunpckhdq %%K1, %%IN01, %%IN00 - vpunpckldq %%T0, %%IN03, %%IN02 - vpunpckhdq %%T1, %%IN03, %%IN02 - - vpunpckldq %%IN00, %%IN05, %%IN04 - vpunpckhdq %%IN01, %%IN05, %%IN04 - vpunpckldq %%IN02, %%IN07, %%IN06 - vpunpckhdq %%IN03, %%IN07, %%IN06 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN09, %%IN08 - vpunpckhdq %%K5, %%IN09, %%IN08 - vpunpckldq %%IN04, %%IN11, %%IN10 - vpunpckhdq %%IN05, %%IN11, %%IN10 - vpunpckldq %%IN06, %%IN13, %%IN12 - vpunpckhdq %%IN07, %%IN13, %%IN12 - vpunpckldq %%IN10, %%IN15, %%IN14 - vpunpckhdq %%IN11, %%IN15, %%IN14 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 - vshufi64x2 %%H3, %%IN14, %%IN02, 0xee - vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 - vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T1, 0x44 - vshufi64x2 %%H1, %%T3, %%T1, 0xee - vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 - vshufi64x2 %%H3, %%IN15, %%IN03, 0xee - vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 - vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 - vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 -%endmacro - ;;; =========================================================================== ;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT ;;; 
=========================================================================== @@ -1024,7 +791,7 @@ align 32 vmovdqu64 %%T %+ IDX, [IA0] %assign IDX (IDX + 1) %endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX @@ -1039,7 +806,7 @@ align 32 vmovdqu64 %%T %+ IDX, [IA0 + 64] %assign IDX (IDX + 1) %endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX @@ -1124,7 +891,7 @@ align 32 vmovdqu64 %%T %+ IDX, [IA0] %assign IDX (IDX + 1) %endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX @@ -1140,7 +907,7 @@ align 32 vmovdqu64 %%T %+ IDX, [IA0 + 64] %assign IDX (IDX + 1) %endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 %assign IDX 0 %rep 16 vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX @@ -1650,7 +1417,7 @@ align 32 vmovdqu64 ZW15, [INP4 + OFFSET] ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; DES CBC ENC comes here vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 @@ -1663,7 +1430,7 @@ align 32 %endif ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run stores mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] mov 
IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] @@ -1748,7 +1515,7 @@ align 32 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; DES CBC ENC comes here vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 @@ -1830,7 +1597,7 @@ align 32 %%_transpose_out: ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] @@ -1970,7 +1737,7 @@ align 32 vmovdqu64 ZW15, [INP4 + OFFSET] ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 %ifnidn %%DES_DOCSIS, 3DES ;; DES CBC DEC comes here @@ -1981,7 +1748,7 @@ align 32 %endif ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run stores mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] @@ -2067,7 +1834,7 @@ align 32 vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; DES CBC DEC comes here mov IA0, [rsp + _mask_save] @@ -2145,7 +1912,7 @@ align 32 %%_transpose_out: ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 ;; run masked stores mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] -- GitLab From e76c2b849ab29f8f4ab10aff02d04a25cbec57c5 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Mon, 22 Jan 2024 17:50:37 +0000 Subject: [PATCH 26/30] avx512: [des] use smaller stack frame for DES and 
DES-DOCSIS --- lib/include/des_avx512.inc | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/lib/include/des_avx512.inc b/lib/include/des_avx512.inc index 2a8b2a64..0105bbbc 100644 --- a/lib/include/des_avx512.inc +++ b/lib/include/des_avx512.inc @@ -103,17 +103,18 @@ %define ZTMP13 zmm31 struc STACKFRAME -_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_tmp_iv: resq 16 ; 2 x 64 bytes -_tmp_in: resq 16 ; 2 x 64 bytes -_tmp_out: resq 16 ; 2 x 64 bytes -_tmp_mask: resd 16 ; 1 x 64 bytes _gpr_save: resq 4 ; r12 to r15 _rsp_save: resq 1 _mask_save: resq 1 _size_save: resq 1 +_padding: resq 1 +_tmp_iv: resq 16 ; 2 x 64 bytes +_tmp_in: resq 16 ; 2 x 64 bytes +_tmp_out: resq 16 ; 2 x 64 bytes +_tmp_mask: resd 16 ; 1 x 64 bytes +_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 +_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 +_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 endstruc ;;; =========================================================================== @@ -130,10 +131,12 @@ endstruc %define %%ZT %2 ; [clobbered] temporary ZMM register %ifdef SAFE_DATA - vpxorq %%ZT, %%ZT -%assign rep_num (2048 / 64) + vpxorq %%ZT, %%ZT, %%ZT + %ifidn %%ALG, 3DES -%assign rep_num (rep_num * 3) +%assign rep_num ((3 * (16 * 16 * 8)) / 64) +%else +%assign rep_num ((16 * 16 * 8) / 64) %endif %assign offset 0 @@ -1358,7 +1361,11 @@ align 32 ;; push the registers and allocate the stack frame mov rax, rsp +%ifnidn %%DES_DOCSIS, 3DES + sub rsp, _key_sched2 ;; no need for schedule 2 and 3 for non-TDES/3DES algos +%else sub rsp, STACKFRAME_size +%endif and rsp, -64 mov [rsp + _rsp_save], rax ; original SP mov [rsp + _gpr_save + 0*8], r12 @@ -1368,10 +1375,10 @@ align 32 %ifnidn %%DES_DOCSIS, 3DES ;; DES and DOCSIS DES - DES_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + DES_INIT {%%STATE + _des_args_keys}, {%%STATE + _des_args_IV}, {rsp + _key_sched}, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 %else ;; 3DES - DES3_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC + DES3_INIT {%%STATE + _des_args_keys}, {%%STATE + _des_args_IV}, {rsp + _key_sched}, {rsp + _key_sched2}, {rsp + _key_sched3}, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC %endif mov [rsp + _size_save], %%SIZE and %%SIZE, -64 -- GitLab From 14a683009f90f5a8fa613f14ced16209895bbdba Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 21 Dec 2023 09:40:30 +0000 Subject: [PATCH 27/30] vaes-avx512: [gcm] add special code path for aes-gcm encrypt (message size <=256 bytes) Changes vs the original approach: - move AAD calculation after AES-CTR encrypt - move 
original IV encrypt to the message encrypt - for certain message sizes original IV is appended to counter block registers for encryption - optimize number of GHASH reductions --- lib/include/gcm_api_vaes_avx512.inc | 53 ++- lib/include/gcm_vaes_avx512.inc | 583 +++++++++++++++++++++++++++- 2 files changed, 614 insertions(+), 22 deletions(-) diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index 1a24b96e..7f8456b0 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -137,54 +137,58 @@ FN_NAME(enc,_): ;; Check key_data != NULL cmp arg1, 0 - jz error_enc + jz .error_enc ;; Check context_data != NULL cmp arg2, 0 - jz error_enc + jz .error_enc ;; Check IV != NULL cmp arg6, 0 - jz error_enc + jz .error_enc ;; Check auth_tag != NULL cmp arg9, 0 - jz error_enc + jz .error_enc ;; Check auth_tag_len == 0 or > 16 cmp arg10, 0 - jz error_enc + jz .error_enc cmp arg10, 16 - ja error_enc + ja .error_enc ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_enc + jz .skip_in_out_check_enc ;; Check if msg_len > max_len cmp arg5, GCM_MAX_LENGTH - ja error_enc + ja .error_enc ;; Check out != NULL (msg_len != 0) cmp arg3, 0 - jz error_enc + jz .error_enc ;; Check in != NULL (msg_len != 0) cmp arg4, 0 - jz error_enc + jz .error_enc -skip_in_out_check_enc: +.skip_in_out_check_enc: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_enc + jz .skip_aad_check_enc ;; Check aad != NULL (aad_len != 0) cmp arg7, 0 - jz error_enc + jz .error_enc -skip_aad_check_enc: +.skip_aad_check_enc: %endif + ;; Check if msg_len < 256 + cmp arg5, 16 * 16 + jbe .small_packet_path + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call @@ -193,13 +197,20 @@ skip_aad_check_enc: %ifdef SAFE_DATA clear_zmms_avx512 xmm6 %endif + jmp .exit_enc + +.small_packet_path: + GCM_ENC_DEC_VSMALL arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC +%ifdef SAFE_DATA + clear_zmms_avx512 xmm6 +%endif -exit_enc: +.exit_enc: FUNC_RESTORE ret %ifdef SAFE_PARAM -error_enc: +.error_enc: ;; Clear reg and imb_errno IMB_ERR_CHECK_START rax @@ -222,7 +233,7 @@ error_enc: ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_error_enc + jz .skip_in_out_check_error_enc ;; Check if msg_len > max_len IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN @@ -233,18 +244,18 @@ error_enc: ;; Check in != NULL (msg_len != 0) IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC -skip_in_out_check_error_enc: +.skip_in_out_check_error_enc: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_error_enc + jz .skip_aad_check_error_enc ;; Check aad != NULL (aad_len != 0) IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD -skip_aad_check_error_enc: +.skip_aad_check_error_enc: ;; Set imb_errno IMB_ERR_CHECK_END rax - jmp exit_enc + jmp .exit_enc %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index 8802f73d..eb5bc9cd 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -421,9 +421,19 @@ %assign hashk HashKey_ %+ %%NUM_BLOCKS %if %0 == 18 + ;; no GH/GL sums passed so add current HASH value to block 0 vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN %endif +%if %0 == 20 +%ifnum %%AAD_HASH_IN + ;; %%AAD_HASH_IN defines number of extra blocks to add to 
%%NUM_BLOCKS +%assign NB (%%NUM_BLOCKS + %%AAD_HASH_IN) +%assign hashk HashKey_ %+ NB + +%endif +%endif + %if %%NUM_BLOCKS == 16 vmovdqu64 %%HK1, [%%KP + hashk] vmovdqu64 %%HK2, [%%KP + hashk + HKeyGap] @@ -466,6 +476,8 @@ vpternlogq %%TLL1, %%TLL2, %%THL2, 0x96 vpternlogq %%THH1, %%THH2, %%TLH2, 0x96 +%assign hashk (hashk + (4 * 64)) + %elif %%NUM_BLOCKS >= 12 vmovdqu64 %%HK1, [%%KP + hashk] @@ -497,6 +509,8 @@ vpxorq %%TLL1, %%TLL1, %%THL1 vpxorq %%THH1, %%THH1, %%TLH1 +%assign hashk (hashk + (3 * 64)) + %elif %%NUM_BLOCKS >= 8 vmovdqu64 %%HK1, [%%KP + hashk] @@ -519,6 +533,8 @@ vpternlogq %%TLL1, %%TLL2, %%THL2, 0x96 vpternlogq %%THH1, %%THH2, %%TLH2, 0x96 +%assign hashk (hashk + (2 * 64)) + %elif %%NUM_BLOCKS >= 4 vmovdqu64 %%HK1, [%%KP + hashk] @@ -531,6 +547,9 @@ ;; add sums into THH1:TLL1 vpxorq %%TLL1, %%TLL1, %%THL1 vpxorq %%THH1, %%THH1, %%TLH1 + +%assign hashk (hashk + (1 * 64)) + %endif ;; T1H/L/M1/M2 - hold current product sums (provided %%NUM_BLOCKS >= 4) @@ -542,7 +561,7 @@ ;; It may also be that they are the only blocks to process. ;; Set hash key and register index position for the remaining 1 to 3 blocks -%assign hashk HashKey_ %+ blocks_left +;; %assign hashk HashKey_ %+ blocks_left %assign reg_idx (%%NUM_BLOCKS / 4) %xdefine %%REG_IN %%CIPHER_IN %+ reg_idx @@ -3137,6 +3156,568 @@ align 32 %endmacro ; GCM_ENC_DEC_SMALL +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; GCM_ENC_DEC_VSMALL Encodes/Decodes given data. Assumes that the passed gcm_context_data struct +; has been initialized by GCM_INIT +; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. +; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CIPH_IN), +; input text length (PLAIN_CIPH_LEN) and whether encoding or decoding (ENC_DEC). 
+; Output: A cipher of the given plain text (CIPH_PLAIN_OUT), and updated GDATA_CTX +; Clobbers rax, r10-r15, and zmm0-zmm31, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC_VSMALL 11-12 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%GDATA_CTX %2 ; [in] context pointer +%define %%CIPH_PLAIN_OUT %3 ; [in] output buffer pointer +%define %%PLAIN_CIPH_IN %4 ; [in] input buffer pointer +%define %%PLAIN_CIPH_LEN %5 ; [in] buffer length +%define %%IV %6 ; [in] IV pointer +%define %%A_IN %7 ; [in] AAD pointer +%define %%A_LEN %8 ; [in] AAD length in bytes +%define %%AUTH_TAG %9 ; [in] pointer to store auth tag into (GP or mem) +%define %%AUTH_TAG_LEN %10 ; [in] length in bytes of auth tag (GP or mem) +%define %%ENC_DEC %11 ; [in] cipher direction +%define %%IV_LEN %12 ; [in] IV length + +%define %%IA0 r10 +%define %%IA1 r12 +%define %%IA2 r13 +%define %%IA3 r15 +%define %%IA4 r11 +%define %%IA5 rax +%define %%IA6 rbx + +%ifidn __OUTPUT_FORMAT__, win64 +%define %%LENGTH %%IA6 +%endif + +%define %%CTR_BLOCKz zmm0 +%define %%CTR_BLOCKx xmm0 ; hardcoded in GCM_INIT + +%define %%AAD_HASHz zmm1 +%define %%AAD_HASHy ymm1 +%define %%AAD_HASHx xmm1 ; hardcoded in GCM_COMPLETE + +%define %%SHUF_MASK zmm30 +%define %%SHUF_MASKy ymm30 +%define %%SHUF_MASKx xmm30 + +%define %%ORIG_IV zmm31 +%define %%ORIG_IVx xmm31 + +%define %%ZTMP0 zmm2 +%define %%ZTMP1 zmm3 ; **sensitive +%define %%ZTMP2 zmm4 ; **sensitive (small data) +%define %%ZTMP3 zmm5 ; **sensitive (small data) +%define %%ZTMP4 zmm6 +%define %%ZTMP5 zmm7 +%define %%ZTMP6 zmm8 +%define %%ZTMP7 zmm9 +%define %%ZTMP8 zmm10 +%define %%ZTMP9 zmm11 +%define %%ZTMP10 zmm12 +%define %%ZTMP11 zmm13 +%define %%ZTMP12 zmm14 +%define %%ZTMP13 zmm15 +%define %%ZTMP14 zmm16 +%define %%ZTMP15 zmm17 +%define %%ZTMP16 zmm18 +%define %%ZTMP17 zmm19 +%define %%ZTMP18 zmm20 +%define %%ZTMP19 zmm21 +%define %%ZTMP20 zmm22 +%define %%ZTMP21 zmm23 +%define %%ZTMP22 zmm24 +%define %%ZTMP23 zmm25 +%define %%ZTMP24 zmm26 +%define %%ZTMP25 zmm27 +%define %%ZTMP26 zmm28 +%define %%ZTMP27 zmm29 + +%define %%MASKREG k1 + + +;; GCM_INIT START + ;; prepare IV +%if %0 == 12 ;; IV is different than 12 bytes + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%ORIG_IVx, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, \ + %%ZTMP14, %%ZTMP15, %%ZTMP16, %%ZTMP17, %%IA1, %%IA2, %%IA3, %%MASKREG +%else ;; IV is 12 bytes + ;; read 12 IV bytes and pad with 0x00000001 + vmovdqa64 %%ORIG_IVx, [rel ONEf] + mov %%IA2, %%IV + mov DWORD(%%IA1), 0x0000_0fff + kmovd k2, DWORD(%%IA1) + vmovdqu8 %%ORIG_IVx{k2}, [%%IA2] ; ctr = IV | 0x1 +%endif + ;; set up context fields + vpshufb %%CTR_BLOCKx, %%ORIG_IVx, [rel SHUF_MASK] + +;; GCM_INIT END + +;; GCM_ECN_DEC START + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%PLAIN_CIPH_LEN, 0 +%else + or %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN +%endif + je %%_small_initial_num_blocks_is_0 + + ;; Save the amount of data left to process in %%LENGTH +%ifidn __OUTPUT_FORMAT__, win64 + mov %%LENGTH, %%PLAIN_CIPH_LEN +%else +%define %%LENGTH %%PLAIN_CIPH_LEN ;; PLAIN_CIPH_LEN is a register on linux +%endif + +%%_message_below_equal_16_blocks: + ;; Determine how many blocks to process + ;; - process one additional block if there is a partial block +%define %%NUM_BLOCKS %%IA5 + + mov DWORD(%%NUM_BLOCKS), DWORD(%%LENGTH) + add DWORD(%%NUM_BLOCKS), 15 + shr DWORD(%%NUM_BLOCKS), 4 + ;; %%NUM_BLOCKS can be in the range from 0 to 16 + +;; 
%macro GCM_ENC_DEC_SMALL 39 + + cmp DWORD(%%NUM_BLOCKS), 8 + je %%_small_initial_num_blocks_is_8 + jb %%_small_initial_num_blocks_is_7_1 + + cmp DWORD(%%NUM_BLOCKS), 12 + je %%_small_initial_num_blocks_is_12 + jb %%_small_initial_num_blocks_is_11_9 + + ;; 16, 15, 14 or 13 + cmp DWORD(%%NUM_BLOCKS), 15 + ja %%_small_initial_num_blocks_is_16 + je %%_small_initial_num_blocks_is_15 + cmp DWORD(%%NUM_BLOCKS), 14 + je %%_small_initial_num_blocks_is_14 + jmp %%_small_initial_num_blocks_is_13 + +%%_small_initial_num_blocks_is_11_9: + ;; 11, 10 or 9 + cmp DWORD(%%NUM_BLOCKS), 10 + ja %%_small_initial_num_blocks_is_11 + je %%_small_initial_num_blocks_is_10 + jmp %%_small_initial_num_blocks_is_9 + +%%_small_initial_num_blocks_is_7_1: + cmp DWORD(%%NUM_BLOCKS), 4 + je %%_small_initial_num_blocks_is_4 + jb %%_small_initial_num_blocks_is_3_1 + ;; 7, 6 or 5 + cmp DWORD(%%NUM_BLOCKS), 6 + ja %%_small_initial_num_blocks_is_7 + je %%_small_initial_num_blocks_is_6 + jmp %%_small_initial_num_blocks_is_5 + +%%_small_initial_num_blocks_is_3_1: + ;; 3, 2 or 1 + cmp DWORD(%%NUM_BLOCKS), 2 + ja %%_small_initial_num_blocks_is_3 + je %%_small_initial_num_blocks_is_2 + + ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed + + ;; Use rep to generate different block size variants + ;; - one block size has to be the first one + ;; - ZTMP15 - ZTMP22 are free +%assign num_blocks 1 +%rep 16 + +%%_small_initial_num_blocks_is_ %+ num_blocks : + +;; %macro INITIAL_BLOCKS_PARTIAL 31 +%define %%CTR0 %%ZTMP0 +%define %%CTR1 %%ZTMP1 +%define %%CTR2 %%ZTMP2 +%define %%CTR3 %%ZTMP3 +%define %%DAT0 %%ZTMP4 +%define %%DAT1 %%ZTMP5 +%define %%DAT2 %%ZTMP6 +%define %%DAT3 %%ZTMP7 +%define %%LAST_CIPHER_BLK %%ZTMP8 +%define %%LAST_GHASH_BLK %%ZTMP9 + +;; %macro INITIAL_BLOCKS_PARTIAL_CIPHER 25 + +%if num_blocks == 1 + vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK] +%elif num_blocks == 2 + vmovdqa64 %%SHUF_MASKy, [rel SHUF_MASK] +%else + vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] +%endif + vmovd DWORD(%%IA2), %%CTR_BLOCKx + + ;; get load/store mask + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if num_blocks > 12 + sub %%IA1, 3 * 64 +%elif num_blocks > 8 + sub %%IA1, 2 * 64 +%elif num_blocks > 4 + sub %%IA1, 64 +%endif + kmovq %%MASKREG, [%%IA0 + %%IA1*8] + + cmp BYTE(%%IA2), 256 - num_blocks + jae %%_ctr_overflow_ %+ num_blocks + + ;; prepare AES counter blocks (BE format - no overflow) +%if num_blocks == 1 + vpaddd XWORD(%%CTR0), %%ORIG_IVx, [rel ONEf] +%elif num_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%ORIG_IV), YWORD(%%ORIG_IV), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_addbe_1234] +%else + vshufi64x2 %%CTR_BLOCKz, %%ORIG_IV, %%ORIG_IV, 0 + vpaddd %%CTR0, %%CTR_BLOCKz, [rel ddq_addbe_1234] +%if num_blocks > 4 + vpaddd %%CTR1, %%CTR_BLOCKz, [rel ddq_addbe_5678] +%endif +%if num_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_addbe_8888] +%endif +%if num_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_addbe_8888] +%endif +%endif + jmp %%_ctr_ready_ %+ num_blocks + +%%_ctr_overflow_ %+ num_blocks : + ;; prepare AES counter blocks (LE format - overflow) +%if num_blocks == 1 + vpaddd XWORD(%%CTR0), %%CTR_BLOCKx, [rel ONE] +%elif num_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR_BLOCKz), YWORD(%%CTR_BLOCKz), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234] +%else + vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0 + vpaddd %%CTR0, %%CTR_BLOCKz, [rel ddq_add_1234] +%if num_blocks > 4 + vpaddd %%CTR1, %%CTR_BLOCKz, [rel ddq_add_5678] +%endif +%if num_blocks > 8 + vpaddd %%CTR2, %%CTR0, 
[rel ddq_add_8888] +%endif +%if num_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] +%endif +%endif + + ;; shuffle the counters for AES rounds + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK + +%%_ctr_ready_ %+ num_blocks : + + ;; blend original IV into message blocks for AES encryption +%if (num_blocks >= 14) && (num_blocks <= 15) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 %%CTR3, %%ORIG_IVx, num_blocks - 12 +%elif (num_blocks == 13) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 YWORD(%%CTR3), %%ORIG_IVx, num_blocks - 12 +%elif (num_blocks >= 10) && (num_blocks <= 11) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 %%CTR2, %%ORIG_IVx, num_blocks - 8 +%elif (num_blocks == 9) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 YWORD(%%CTR2), %%ORIG_IVx, num_blocks - 8 +%elif (num_blocks >= 6) && (num_blocks <= 7) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 %%CTR1, %%ORIG_IVx, num_blocks - 4 +%elif (num_blocks == 5) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 YWORD(%%CTR1), %%ORIG_IVx, num_blocks - 4 +%elif (num_blocks >= 2) && (num_blocks <= 3) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 %%CTR0, %%ORIG_IVx, num_blocks +%elif (num_blocks == 1) +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + vinserti64x2 YWORD(%%CTR0), %%ORIG_IVx, num_blocks +%else +%assign num_blocks_aes num_blocks +%assign blend_orig_iv_aes 0 +%endif + + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 num_blocks, %%PLAIN_CIPH_IN, 0, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG + + + ;; AES rounds and XOR with plain/cipher text +%assign j 0 + + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vpxorq %%ORIG_IVx, %%ORIG_IVx, XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vpxorq, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 +%assign j (j + 1) + +%rep NROUNDS + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vaesenc %%ORIG_IVx, %%ORIG_IVx, XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vaesenc, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 +%assign j (j + 1) +%endrep + + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vaesenclast %%ORIG_IVx, %%ORIG_IVx, XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vaesenclast, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 + +%if blend_orig_iv_aes != 0 +%if num_blocks >= 12 + vextracti32x4 %%ORIG_IVx, %%CTR3, num_blocks - 12 +%elif num_blocks >= 8 + vextracti32x4 %%ORIG_IVx, %%CTR2, num_blocks - 8 +%elif num_blocks >= 4 + vextracti32x4 %%ORIG_IVx, %%CTR1, num_blocks - 4 +%else + vextracti32x4 %%ORIG_IVx, %%CTR0, num_blocks +%endif +%endif + + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpxorq, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3 + + ;; **DAT0, DAT1, DAT2, DAT3 
may contain clear text + + ;; write cipher/plain text back to output and + ZMM_STORE_MASKED_BLOCKS_0_16 num_blocks, %%CIPH_PLAIN_OUT, 0, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG + ;; **CTR0, CTR1, CTR2, CTR3 may contain clear text + + ;; Shuffle the cipher text blocks for hashing part + ;; ZT5 and ZT6 are expected outputs with blocks for hashing +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + ;; - cipher blocks are in ZT5 & ZT6 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK +%else + ;; Encrypt case + + ;; zero bytes outside the mask before hashing +%if num_blocks <= 4 + vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0 +%elif num_blocks <= 8 + vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1 +%elif num_blocks <= 12 + vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2 +%else + vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3 +%endif + + ;; - cipher blocks are in CTR0-CTR3 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK +%endif ; Encrypt + ;; **DAT0, DAT1, DAT2, DAT3 overwritten with shuffled cipher text + +;; %endmacro ; INITIAL_BLOCKS_PARTIAL_CIPHER + + ;; **CTR0, CTR1, CTR2, CTR3, %%ZT0 may contain sensitive data + + ;; calculate AAD hash + cmp %%A_LEN, 12 + jne %%_aad_is_not_12_bytes_ %+ num_blocks + + ;; load 12 bytes of AAD +%if %0 == 12 ;; IV is different than 12 bytes + mov DWORD(%%IA1), 0x0000_0fff + kmovd k2, DWORD(%%IA1) +%endif + mov %%IA1, %%A_IN + vmovdqu8 %%AAD_HASHx{k2}{z}, [%%IA1] + vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx + + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + vinserti64x2 %%AAD_HASHy, XWORD(%%ZTMP15), 1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; GHASH 12 byte AAD with last block using respective GHASH key powers + +%assign num_blocks2 (num_blocks + 1) +%define StartHashKey HashKey_ %+ num_blocks2 + + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + StartHashKey + HKeyGap] + vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1 + HKeyGap], 1 + vpclmulqdq YWORD(%%ZTMP14), %%AAD_HASHy, YWORD(%%ZTMP13), 0x00 ; TLL = GH_L * KK_L + vpclmulqdq YWORD(%%ZTMP15), %%AAD_HASHy, YWORD(%%ZTMP13), 0x10 ; TLH = GH_L * KK_H + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + StartHashKey] + vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1], 1 + vpclmulqdq YWORD(%%ZTMP16), %%AAD_HASHy, YWORD(%%ZTMP13), 0x01 ; THL = GH_H * HK_L + vpclmulqdq YWORD(%%ZTMP17), %%AAD_HASHy, YWORD(%%ZTMP13), 0x11 ; THH = GH_H * HK_H + +%undef StartHashKey + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; add products + + vpxorq YWORD(%%ZTMP14), YWORD(%%ZTMP14), YWORD(%%ZTMP16) ;; TLL += THL + vpxorq YWORD(%%ZTMP15), YWORD(%%ZTMP15), YWORD(%%ZTMP17) ;; TLH += THH + + ;; continue GHASH compute followed by reduction + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP20, \ + %%ZTMP21, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, 1, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks, %%ZTMP15, %%ZTMP14 + + jmp %%_small_initial_blocks_encrypted + +%%_aad_is_not_12_bytes_ %+ num_blocks: + vpxor %%AAD_HASHx, %%AAD_HASHx, %%AAD_HASHx + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASHx, %%GDATA_KEY, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, 
%%ZTMP15, %%ZTMP16, %%ZTMP17, \ + %%ZTMP18, %%ZTMP19, %%ZTMP20, %%ZTMP21, %%ZTMP22, %%ZTMP23, %%ZTMP24, %%ZTMP25, \ + %%ZTMP26, %%ZTMP27, %%IA1, %%IA2, %%IA3, %%MASKREG + +%if num_blocks == 16 + + ;; start GHASH compute + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks + + ;; GHASH block with encoded lengths + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1] + vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap] + + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP15) + GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19) + +%else + + ;; blend block with lengths into message GHASH operation + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + +%if num_blocks == 12 + vmovdqa64 XWORD(%%DAT3), XWORD(%%ZTMP15) +%elif num_blocks > 12 + vinserti64x2 %%DAT3, XWORD(%%ZTMP15), num_blocks - 12 +%elif num_blocks == 8 + vmovdqa64 XWORD(%%DAT2), XWORD(%%ZTMP15) +%elif num_blocks > 8 + vinserti64x2 %%DAT2, XWORD(%%ZTMP15), num_blocks - 8 +%elif num_blocks == 4 + vmovdqa64 XWORD(%%DAT1), XWORD(%%ZTMP15) +%elif num_blocks > 4 + vinserti64x2 %%DAT1, XWORD(%%ZTMP15), num_blocks - 4 +%else + vinserti64x2 %%DAT0, XWORD(%%ZTMP15), num_blocks +%endif + +%assign num_blocks2 (num_blocks + 1) + + ;; start GHASH compute + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks2 + +%endif + + jmp %%_small_initial_blocks_encrypted + +;; %endmacro ; INITIAL_BLOCKS_PARTIAL + + +%assign num_blocks (num_blocks + 1) + +%endrep + +;; %endmacro ; GCM_ENC_DEC_SMALL + + +%%_small_initial_num_blocks_is_0: + vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK] + + ;; calculate AAD hash for 0 message length case + vpxor %%AAD_HASHx, %%AAD_HASHx, %%AAD_HASHx + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASHx, %%GDATA_KEY, \ + %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ + %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \ + %%ZTMP16, %%ZTMP17, %%IA1, %%IA2, %%IA3, %%MASKREG + + ;; encrypt original IV + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, %%ORIG_IVx ; E(K, Y0) + + ;; GHASH block with encoded lengths + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1] + vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap] + + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP15) + GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19) + +%%_small_initial_blocks_encrypted: + vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx ; perform a 16Byte swap + vpxorq %%ORIG_IVx, %%ORIG_IVx, %%AAD_HASHx ; add hash value with encrypted original IV + +;; GCM_COMPLETE START + mov %%IA0, %%AUTH_TAG ; r10 = authTag + mov %%IA1, %%AUTH_TAG_LEN ; r11 = auth_tag_len + + lea %%IA2, [rel 
byte64_len_to_mask_table] + kmovq %%MASKREG, [%%IA2 + %%IA1*8] + vmovdqu8 [%%IA0]{%%MASKREG}, %%ORIG_IVx + +;; GCM_COMPLETE END + +%endmacro ; GCM_ENC_DEC_VSMALL + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct ; has been initialized by GCM_INIT -- GitLab From 1c1c5c69dd1b93e9a53f3fd9849994ca2321b0a2 Mon Sep 17 00:00:00 2001 From: Tomasz Kantecki Date: Thu, 18 Jan 2024 13:28:50 +0000 Subject: [PATCH 28/30] vaes-avx512: [gcm] use internal GHASH function instead expanding GHASH calculation macro for small packets - use GCM_ENC_DEC_0_TO_256 macro name for small packet code - tidy up comments and register usage notes - reduce CALC_J0 macro arguments and update register usage notes --- lib/include/gcm_api_vaes_avx512.inc | 2 +- lib/include/gcm_vaes_avx512.inc | 299 +++++++++++++++------------- 2 files changed, 167 insertions(+), 134 deletions(-) diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index 7f8456b0..6979e962 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -200,7 +200,7 @@ FN_NAME(enc,_): jmp .exit_enc .small_packet_path: - GCM_ENC_DEC_VSMALL arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC + GCM_ENC_DEC_0_TO_256 arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC %ifdef SAFE_DATA clear_zmms_avx512 xmm6 %endif diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index eb5bc9cd..479e95b9 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -2876,33 +2876,25 @@ align 32 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Calculate J0 for cases when IV length is different than 12 bytes -%macro CALC_J0 26 +;;; - uses ghash_internal_vaes_avx512() function +;;; - clobbers: zmm0-zmm1, zmm3-zmm13, zmm15-zmm20, r12, r13, rax, k1 +%macro CALC_J0 4-5 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV %define %%IV_LEN %3 ;; [in] IV length %define %%J0 %4 ;; [out] XMM reg to contain J0 -%define %%ZT0 %5 ;; [clobbered] ZMM register -%define %%ZT1 %6 ;; [clobbered] ZMM register -%define %%ZT2 %7 ;; [clobbered] ZMM register -%define %%ZT3 %8 ;; [clobbered] ZMM register -%define %%ZT4 %9 ;; [clobbered] ZMM register -%define %%ZT5 %10 ;; [clobbered] ZMM register -%define %%ZT6 %11 ;; [clobbered] ZMM register -%define %%ZT7 %12 ;; [clobbered] ZMM register -%define %%ZT8 %13 ;; [clobbered] ZMM register -%define %%ZT9 %14 ;; [clobbered] ZMM register -%define %%ZT10 %15 ;; [clobbered] ZMM register -%define %%ZT11 %16 ;; [clobbered] ZMM register -%define %%ZT12 %17 ;; [clobbered] ZMM register -%define %%ZT13 %18 ;; [clobbered] ZMM register -%define %%ZT14 %19 ;; [clobbered] ZMM register -%define %%ZT15 %20 ;; [clobbered] ZMM register -%define %%ZT16 %21 ;; [clobbered] ZMM register -%define %%ZT17 %22 ;; [clobbered] ZMM register -%define %%T1 %23 ;; [clobbered] GP register -%define %%T2 %24 ;; [clobbered] GP register -%define %%T3 %25 ;; [clobbered] GP register -%define %%MASKREG %26 ;; [clobbered] mask register +%define %%SHUFMASK %5 ;; [in] register with shuffle mask + +%define %%ZT0 zmm3 +%define %%ZT1 zmm4 +%define %%ZT2 zmm5 +%define %%ZT3 zmm6 +%define %%ZT4 zmm7 +%define %%ZT5 zmm8 +%define %%ZT6 zmm9 + +%define %%T1 r12 +%define %%T2 r13 ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ 
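[Editor's reference note on CALC_J0] The derivation the comment above spells out is the SP 800-38D one: for a 12-byte IV, J0 = IV || 0^31 || 1; otherwise J0 = GHASH_H(IV || 0^s || 0^64 || [len(IV)]_64). A minimal C sketch follows; the ghash() helper is an assumed stand-in for ghash_internal_vaes_avx512() and is taken to fold one 16-byte block into the running hash (Y = (Y ^ X) * H in GF(2^128)).

    #include <stdint.h>
    #include <string.h>

    /* assumed stand-in: folds one 16-byte block into the running hash */
    void ghash(uint8_t hash[16], const uint8_t block[16]);

    static void calc_j0(uint8_t j0[16], const uint8_t *iv, uint64_t iv_len)
    {
            const uint64_t iv_bits = iv_len * 8;
            uint8_t blk[16];

            if (iv_len == 12) {
                    /* fast path: J0 = IV || 0^31 || 1 */
                    memcpy(j0, iv, 12);
                    memset(j0 + 12, 0, 3);
                    j0[15] = 1;
                    return;
            }
            /* hash the IV in 16-byte blocks, zero-padding the tail */
            memset(j0, 0, 16);
            while (iv_len >= 16) {
                    ghash(j0, iv);
                    iv += 16;
                    iv_len -= 16;
            }
            if (iv_len > 0) {
                    memset(blk, 0, 16);
                    memcpy(blk, iv, iv_len);
                    ghash(j0, blk);
            }
            /* last block: 0^64 || 64-bit big-endian IV length in bits */
            memset(blk, 0, 16);
            for (int i = 0; i < 8; i++)
                    blk[15 - i] = (uint8_t)(iv_bits >> (8 * i));
            ghash(j0, blk);
    }

The 12-byte path corresponds to the masked load in the assembly: 12 IV bytes read under the 0x0fff byte mask on top of the ONEf pattern, i.e. "ctr = IV | 0x1".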
@@ -2916,7 +2908,9 @@ align 32 mov r12, %%IV mov r13, %%IV_LEN call ghash_internal_vaes_avx512 +%ifnidn %%J0, xmm0 vmovdqa64 %%J0, xmm0 +%endif ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) mov %%T1, %%IV_LEN @@ -2928,9 +2922,12 @@ align 32 vmovdqu64 XWORD(%%ZT0), [%%KEY + HashKey_1] vmovdqu64 XWORD(%%ZT5), [%%KEY + HashKey_1 + HKeyGap] GHASH_MUL2 %%J0, XWORD(%%ZT0), XWORD(%%ZT5), XWORD(%%ZT1), XWORD(%%ZT2), XWORD(%%ZT3), XWORD(%%ZT4) - ;; **ZT1, ZT2, ZT3 overwritten with ghash products - vpshufb %%J0, %%J0, [rel SHUF_MASK] ; perform a 16Byte swap +%if %0 == 4 + vpshufb %%J0, %%J0, [rel SHUF_MASK] +%elif %0 == 5 + vpshufb %%J0, %%J0, XWORD(%%SHUFMASK) +%endif %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2977,10 +2974,7 @@ align 32 cmp %%IV_LEN, 12 je %%_iv_length_is_12_bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT, \ - %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ - %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, \ - %%ZT14, %%ZT15, %%ZT16, %%ZT17, %%GPR1, %%GPR2, %%GPR3, %%MASKREG + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT jmp %%_iv_prep_is_done %endif @@ -3157,15 +3151,19 @@ align 32 %endmacro ; GCM_ENC_DEC_SMALL ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; GCM_ENC_DEC_VSMALL Encodes/Decodes given data. Assumes that the passed gcm_context_data struct -; has been initialized by GCM_INIT -; Requires the input data be at least 1 byte long because of READ_SMALL_INPUT_DATA. -; Input: gcm_key_data struct* (GDATA_KEY), gcm_context_data *(GDATA_CTX), input text (PLAIN_CIPH_IN), -; input text length (PLAIN_CIPH_LEN) and whether encoding or decoding (ENC_DEC). -; Output: A cipher of the given plain text (CIPH_PLAIN_OUT), and updated GDATA_CTX -; Clobbers rax, r10-r15, and zmm0-zmm31, k1 +;; GCM_ENC_DEC_0_TO_256 +;; - combines and optimizes functionality of three macros: +;; - GCM_INIT +;; - GCM_ENC_DEC +;; - GCM_COMPLETE +;; - works for packet sizes between 0 and 256 bytes +;; - it is limited to single_call case only +;; - works with AAD size +;; - works with IV size provided IV length is provided +;; Output: C and T +;; Clobbers rax, r12, r13, zmm0-zmm23, zmm30, zmm31, k1, k2, r11 (windows) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro GCM_ENC_DEC_VSMALL 11-12 +%macro GCM_ENC_DEC_0_TO_256 11-12 %define %%GDATA_KEY %1 ; [in] key pointer %define %%GDATA_CTX %2 ; [in] context pointer %define %%CIPH_PLAIN_OUT %3 ; [in] output buffer pointer @@ -3179,17 +3177,10 @@ align 32 %define %%ENC_DEC %11 ; [in] cipher direction %define %%IV_LEN %12 ; [in] IV length -%define %%IA0 r10 +%define %%IA0 rax %define %%IA1 r12 %define %%IA2 r13 -%define %%IA3 r15 -%define %%IA4 r11 -%define %%IA5 rax -%define %%IA6 rbx - -%ifidn __OUTPUT_FORMAT__, win64 -%define %%LENGTH %%IA6 -%endif +%define %%IA3 r11 %define %%CTR_BLOCKz zmm0 %define %%CTR_BLOCKx xmm0 ; hardcoded in GCM_INIT @@ -3206,9 +3197,9 @@ align 32 %define %%ORIG_IVx xmm31 %define %%ZTMP0 zmm2 -%define %%ZTMP1 zmm3 ; **sensitive -%define %%ZTMP2 zmm4 ; **sensitive (small data) -%define %%ZTMP3 zmm5 ; **sensitive (small data) +%define %%ZTMP1 zmm3 +%define %%ZTMP2 zmm4 +%define %%ZTMP3 zmm5 %define %%ZTMP4 zmm6 %define %%ZTMP5 zmm7 %define %%ZTMP6 zmm8 @@ -3227,37 +3218,40 @@ align 32 %define %%ZTMP19 zmm21 %define %%ZTMP20 zmm22 %define %%ZTMP21 zmm23 -%define %%ZTMP22 zmm24 -%define %%ZTMP23 zmm25 -%define %%ZTMP24 zmm26 -%define %%ZTMP25 zmm27 -%define 
%%ZTMP26 zmm28 -%define %%ZTMP27 zmm29 +%define %%ZTMP22 zmm24 ; not used +%define %%ZTMP23 zmm25 ; not used +%define %%ZTMP24 zmm26 ; not used +%define %%ZTMP25 zmm27 ; not used +%define %%ZTMP26 zmm28 ; not used +%define %%ZTMP27 zmm29 ; not used %define %%MASKREG k1 - -;; GCM_INIT START + ;; =================================================================== ;; prepare IV -%if %0 == 12 ;; IV is different than 12 bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%ORIG_IVx, \ - %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \ - %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, \ - %%ZTMP14, %%ZTMP15, %%ZTMP16, %%ZTMP17, %%IA1, %%IA2, %%IA3, %%MASKREG -%else ;; IV is 12 bytes +%if %0 == 12 + ;; IV may be different than 12 bytes + cmp %%IV_LEN, 12 + je %%_iv_length_is_12_bytes + + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%ORIG_IVx + jmp %%_iv_prep_is_done +%endif ;; IV_LEN provided + +%%_iv_length_is_12_bytes: ;; read 12 IV bytes and pad with 0x00000001 vmovdqa64 %%ORIG_IVx, [rel ONEf] mov %%IA2, %%IV mov DWORD(%%IA1), 0x0000_0fff kmovd k2, DWORD(%%IA1) vmovdqu8 %%ORIG_IVx{k2}, [%%IA2] ; ctr = IV | 0x1 -%endif + +%%_iv_prep_is_done: ;; set up context fields vpshufb %%CTR_BLOCKx, %%ORIG_IVx, [rel SHUF_MASK] -;; GCM_INIT END - -;; GCM_ECN_DEC START + ;; =================================================================== + ;; check for zero message length %ifidn __OUTPUT_FORMAT__, win64 cmp %%PLAIN_CIPH_LEN, 0 @@ -3266,25 +3260,25 @@ align 32 %endif je %%_small_initial_num_blocks_is_0 - ;; Save the amount of data left to process in %%LENGTH + ;; =================================================================== + ;; Prepare %%LENGTH register %ifidn __OUTPUT_FORMAT__, win64 +%define %%LENGTH %%IA3 mov %%LENGTH, %%PLAIN_CIPH_LEN %else %define %%LENGTH %%PLAIN_CIPH_LEN ;; PLAIN_CIPH_LEN is a register on linux %endif - -%%_message_below_equal_16_blocks: + ;; =================================================================== ;; Determine how many blocks to process - ;; - process one additional block if there is a partial block -%define %%NUM_BLOCKS %%IA5 + ;; - process one additional block if there is a partial block (round up) + +%define %%NUM_BLOCKS %%IA1 mov DWORD(%%NUM_BLOCKS), DWORD(%%LENGTH) add DWORD(%%NUM_BLOCKS), 15 shr DWORD(%%NUM_BLOCKS), 4 ;; %%NUM_BLOCKS can be in the range from 0 to 16 -;; %macro GCM_ENC_DEC_SMALL 39 - cmp DWORD(%%NUM_BLOCKS), 8 je %%_small_initial_num_blocks_is_8 jb %%_small_initial_num_blocks_is_7_1 @@ -3326,15 +3320,23 @@ align 32 ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed - ;; Use rep to generate different block size variants - ;; - one block size has to be the first one - ;; - ZTMP15 - ZTMP22 are free + ;; =================================================================== + ;; Use rep to generate different optimized code for block size variants + ;; - one block size variant has to be the first one + %assign num_blocks 1 %rep 16 + ;; =================================================================== + ;; =================================================================== + ;; Optimized small packet AES-GCM generation + ;; - at this stage, IV is ready + ;; - prepare counter blocks + ;; - do AES-CTR & encryption of original IV + ;; - do AAD, GHASH of message and block with sizes + %%_small_initial_num_blocks_is_ %+ num_blocks : -;; %macro INITIAL_BLOCKS_PARTIAL 31 %define %%CTR0 %%ZTMP0 %define %%CTR1 %%ZTMP1 %define %%CTR2 %%ZTMP2 @@ -3346,8 +3348,9 @@ align 32 %define %%LAST_CIPHER_BLK %%ZTMP8 %define 
%%LAST_GHASH_BLK %%ZTMP9 -;; %macro INITIAL_BLOCKS_PARTIAL_CIPHER 25 - + ;; =================================================================== + ;; - load shuffle mask + ;; - retrieve 32-bit counter in BE format %if num_blocks == 1 vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK] %elif num_blocks == 2 @@ -3357,7 +3360,8 @@ align 32 %endif vmovd DWORD(%%IA2), %%CTR_BLOCKx - ;; get load/store mask + ;; =================================================================== + ;; get load/store mask for plain/cipher text lea %%IA0, [rel byte64_len_to_mask_table] mov %%IA1, %%LENGTH %if num_blocks > 12 @@ -3369,10 +3373,14 @@ align 32 %endif kmovq %%MASKREG, [%%IA0 + %%IA1*8] + ;; =================================================================== + ;; Check if counter blocks can be prepared in BE format or + ;; LE format is required cmp BYTE(%%IA2), 256 - num_blocks jae %%_ctr_overflow_ %+ num_blocks - ;; prepare AES counter blocks (BE format - no overflow) + ;; =================================================================== + ;; Prepare AES counter blocks (BE format, no byte overflow) %if num_blocks == 1 vpaddd XWORD(%%CTR0), %%ORIG_IVx, [rel ONEf] %elif num_blocks == 2 @@ -3394,7 +3402,8 @@ align 32 jmp %%_ctr_ready_ %+ num_blocks %%_ctr_overflow_ %+ num_blocks : - ;; prepare AES counter blocks (LE format - overflow) + ;; =================================================================== + ;; Prepare AES counter blocks (LE format, byte overflow) %if num_blocks == 1 vpaddd XWORD(%%CTR0), %%CTR_BLOCKx, [rel ONE] %elif num_blocks == 2 @@ -3414,7 +3423,8 @@ align 32 %endif %endif - ;; shuffle the counters for AES rounds + ;; =================================================================== + ;; shuffle the counter blcoks for AES rounds ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ @@ -3422,7 +3432,8 @@ align 32 %%_ctr_ready_ %+ num_blocks : - ;; blend original IV into message blocks for AES encryption + ;; =================================================================== + ;; append original IV to message blocks for AES encryption, if possible %if (num_blocks >= 14) && (num_blocks <= 15) %assign num_blocks_aes (num_blocks + 1) %assign blend_orig_iv_aes 1 @@ -3456,15 +3467,18 @@ align 32 %assign blend_orig_iv_aes 1 vinserti64x2 YWORD(%%CTR0), %%ORIG_IVx, num_blocks %else + ;; 16 or 0 block cases %assign num_blocks_aes num_blocks %assign blend_orig_iv_aes 0 %endif + ;; =================================================================== ;; load plain/cipher text ZMM_LOAD_MASKED_BLOCKS_0_16 num_blocks, %%PLAIN_CIPH_IN, 0, \ %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG + ;; =================================================================== ;; AES rounds and XOR with plain/cipher text %assign j 0 @@ -3499,6 +3513,8 @@ align 32 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 + ;; =================================================================== + ;; Extract encrypted original IV %if blend_orig_iv_aes != 0 %if num_blocks >= 12 vextracti32x4 %%ORIG_IVx, %%CTR3, num_blocks - 12 @@ -3516,18 +3532,16 @@ align 32 %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ %%DAT0, %%DAT1, %%DAT2, %%DAT3 - ;; **DAT0, DAT1, DAT2, DAT3 may contain clear text - + ;; =================================================================== ;; write cipher/plain text back to output and ZMM_STORE_MASKED_BLOCKS_0_16 num_blocks, %%CIPH_PLAIN_OUT, 0, \ %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG - ;; **CTR0, CTR1, CTR2, CTR3 may contain clear 
@@ -3516,18 +3532,16 @@ align 32
                %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3
 
-        ;; **DAT0, DAT1, DAT2, DAT3 may contain clear text
-
+        ;; ===================================================================
        ;; write cipher/plain text back to output and
        ZMM_STORE_MASKED_BLOCKS_0_16 num_blocks, %%CIPH_PLAIN_OUT, 0, \
                %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
 
-        ;; **CTR0, CTR1, CTR2, CTR3 may contain clear text
+        ;; ===================================================================
        ;; Shuffle the cipher text blocks for hashing part
-        ;; ZT5 and ZT6 are expected outputs with blocks for hashing
+        ;; - GHASH always works on cipher text
 %ifidn %%ENC_DEC, DEC
        ;; Decrypt case
-        ;; - cipher blocks are in ZT5 & ZT6
        ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, \
@@ -3535,7 +3549,7 @@ align 32
 %else
        ;; Encrypt case
-        ;; zero bytes outside the mask before hashing
+        ;; - zero bytes outside the mask before hashing
 %if num_blocks <= 4
        vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
 %elif num_blocks <= 8
@@ -3552,18 +3566,18 @@ align 32
                %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
                %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK
 %endif ; Encrypt
-        ;; **DAT0, DAT1, DAT2, DAT3 overwritten with shuffled cipher text
-
-;; %endmacro ; INITIAL_BLOCKS_PARTIAL_CIPHER
-        ;; **CTR0, CTR1, CTR2, CTR3, %%ZT0 may contain sensitive data
-
-        ;; calculate AAD hash
+        ;; ===================================================================
+        ;; Calculate AAD hash
        cmp     %%A_LEN, 12
        jne     %%_aad_is_not_12_bytes_ %+ num_blocks
 
-        ;; load 12 bytes of AAD
-%if %0 == 12 ;; IV is different than 12 bytes
+        ;; ===================================================================
+        ;; load 12 bytes of AAD (most common case)
+        ;; - AAD and block with sizes get hashed together
+        ;; - one reduction for everything (AAD + message + length block)
+
+%if %0 == 12 ;; IV may be different than 12 bytes and k2 not set
        mov     DWORD(%%IA1), 0x0000_0fff
        kmovd   k2, DWORD(%%IA1)
 %endif
@@ -3577,7 +3591,7 @@ align 32
        vinserti64x2 %%AAD_HASHy, XWORD(%%ZTMP15), 1
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-        ;; GHASH 12 byte AAD with last block using respective GHASH key powers
+        ;; GHASH 12 byte AAD with length block using respective GHASH key powers
 
 %assign num_blocks2 (num_blocks + 1)
 %define StartHashKey HashKey_ %+ num_blocks2
@@ -3599,7 +3613,8 @@ align 32
        vpxorq  YWORD(%%ZTMP14), YWORD(%%ZTMP14), YWORD(%%ZTMP16) ;; TLL += THL
        vpxorq  YWORD(%%ZTMP15), YWORD(%%ZTMP15), YWORD(%%ZTMP17) ;; TLH += THH
 
-        ;; continue GHASH compute followed by reduction
+        ;; ===================================================================
+        ;; continue with message GHASH followed by reduction
        GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \
                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP20, \
                %%ZTMP21, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, 1, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks, %%ZTMP15, %%ZTMP14
 
        jmp     %%_small_initial_blocks_encrypted
 
 %%_aad_is_not_12_bytes_ %+ num_blocks:
-        vpxor   %%AAD_HASHx, %%AAD_HASHx, %%AAD_HASHx
-        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASHx, %%GDATA_KEY, \
-                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, %%ZTMP16, %%ZTMP17, \
-                %%ZTMP18, %%ZTMP19, %%ZTMP20, %%ZTMP21, %%ZTMP22, %%ZTMP23, %%ZTMP24, %%ZTMP25, \
-                %%ZTMP26, %%ZTMP27, %%IA1, %%IA2, %%IA3, %%MASKREG
+        ;; ===================================================================
+        ;; Calculate AAD hash (different than 12 bytes)
 
-%if num_blocks == 16
+        vpxor   xmm0, xmm0, xmm0
+        ;; arg1 - GDATA_KEY
+        ;; r12  - message pointer
+        ;; r13  - message length
+        ;; xmm0 - hash in/out
+        mov     r12, %%A_IN
+        mov     r13, %%A_LEN
+        call    ghash_internal_vaes_avx512
+        vmovdqa64 %%AAD_HASHx, xmm0
 
-        ;; start GHASH compute
+%if num_blocks == 16
+        ;; ===================================================================
+        ;; message GHASH compute
        GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \
                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
                %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks
-        ;; GHASH block with encoded lengths
+        ;; ===================================================================
+        ;; GHASH length block
        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1]
        vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap]
@@ -3634,8 +3657,8 @@ align 32
        GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19)
 %else
-
-        ;; blend block with lengths into message GHASH operation
+        ;; ===================================================================
+        ;; create & append length block into message for GHASH
        vmovq   XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN
        vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1     ; ZTMP15 = len(A)||len(C)
        vpsllq  XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits
@@ -3656,42 +3679,48 @@ align 32
        vinserti64x2 %%DAT0, XWORD(%%ZTMP15), num_blocks
 %endif
 
+        ;; ===================================================================
+        ;; message + length block GHASH compute
+
 %assign num_blocks2 (num_blocks + 1)
 
-        ;; start GHASH compute
        GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \
                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \
                %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks2
 %endif
-
        jmp     %%_small_initial_blocks_encrypted
 
-;; %endmacro ; INITIAL_BLOCKS_PARTIAL
-
-
+        ;; ===================================================================
+        ;; increment number of blocks and repeat code generation
 %assign num_blocks (num_blocks + 1)
 %endrep
 
-;; %endmacro ; GCM_ENC_DEC_SMALL
-
-
+        ;; ===================================================================
+        ;; Zero message size case (not optimized, not used very often)
 %%_small_initial_num_blocks_is_0:
        vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK]
 
+        ;; ===================================================================
        ;; calculate AAD hash for 0 message length case
-        vpxor   %%AAD_HASHx, %%AAD_HASHx, %%AAD_HASHx
-        CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASHx, %%GDATA_KEY, \
-                %%ZTMP0, %%ZTMP1, %%ZTMP2, %%ZTMP3, %%ZTMP4, %%ZTMP5, %%ZTMP6, %%ZTMP7, \
-                %%ZTMP8, %%ZTMP9, %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, %%ZTMP15, \
-                %%ZTMP16, %%ZTMP17, %%IA1, %%IA2, %%IA3, %%MASKREG
+        vpxor   xmm0, xmm0, xmm0
+        ;; arg1 - GDATA_KEY
+        ;; r12  - message pointer
+        ;; r13  - message length
+        ;; xmm0 - hash in/out
+        mov     r12, %%A_IN
+        mov     r13, %%A_LEN
+        call    ghash_internal_vaes_avx512
+        vmovdqa64 %%AAD_HASHx, xmm0
 
+        ;; ===================================================================
        ;; encrypt original IV
        ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, %%ORIG_IVx ; E(K, Y0)
 
-        ;; GHASH block with encoded lengths
+        ;; ===================================================================
+        ;; GHASH length block
        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1]
        vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap]
@@ -3703,20 +3732,24 @@ align 32
        GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19)
 
 %%_small_initial_blocks_encrypted:
-        vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx ; perform a 16Byte swap
-        vpxorq  %%ORIG_IVx, %%ORIG_IVx, %%AAD_HASHx ; add hash value with encrypted original IV
+        ;; ===================================================================
+        ;; Complete GMAC computation
+        ;;     S => %%AAD_HASHx
+        ;;     CIPH(J0) => %%ORIG_IVx
+        ;;     T = MSB(GCTR(J0,S))
+        vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx
+        vpxorq  %%ORIG_IVx, %%ORIG_IVx, %%AAD_HASHx
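
In NIST SP 800-38D terms, the vpshufb/vpxorq pair above finishes GMAC: the tag is
the encrypted pre-counter block XOR-ed with the GHASH value, then truncated. A
hedged C sketch of the same step, with aes_ecb_encrypt_block() as an assumed
helper (not a library function):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* assumed helper: single-block AES encryption with expanded key ks */
    void aes_ecb_encrypt_block(const void *ks, const uint8_t in[16], uint8_t out[16]);

    /* Sketch of the final step above: T = MSB_t(E(K, J0) XOR S). */
    static void gcm_make_tag(const void *ks, const uint8_t j0[16],
                             const uint8_t s[16], uint8_t *tag, size_t tag_len)
    {
            uint8_t ek_j0[16];

            aes_ecb_encrypt_block(ks, j0, ek_j0);  /* CIPH(J0) */
            for (size_t i = 0; i < 16; i++)
                    ek_j0[i] ^= s[i];              /* GCTR(J0, S), one block */
            memcpy(tag, ek_j0, tag_len);           /* MSB truncation, tag_len <= 16 */
    }
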
-;; GCM_COMPLETE START
-        mov     %%IA0, %%AUTH_TAG ; r10 = authTag
-        mov     %%IA1, %%AUTH_TAG_LEN ; r11 = auth_tag_len
+        ;; ===================================================================
+        ;; Store the tag T
+        mov     %%IA0, %%AUTH_TAG
+        mov     %%IA1, %%AUTH_TAG_LEN
        lea     %%IA2, [rel byte64_len_to_mask_table]
        kmovq   %%MASKREG, [%%IA2 + %%IA1*8]
        vmovdqu8 [%%IA0]{%%MASKREG}, %%ORIG_IVx
 
-;; GCM_COMPLETE END
-
-%endmacro ; GCM_ENC_DEC_VSMALL
+%endmacro ; GCM_ENC_DEC_0_TO_256
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; GCM_ENC_DEC Encodes/Decodes given data. Assumes that the passed gcm_context_data struct
-- 
GitLab

From c8a7cf665534193c5e0a6942d6942a81593315fa Mon Sep 17 00:00:00 2001
From: Tomasz Kantecki
Date: Thu, 18 Jan 2024 15:37:21 +0000
Subject: [PATCH 29/30] vaes-avx512: [gcm] add option to exclude small packet
 code generation in GCM_ENC_DEC macro

This is to reduce code size in cases where GCM_ENC_DEC_0_TO_256 is used.

---
 lib/include/gcm_vaes_avx512.inc | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc
index 479e95b9..f039e6d3 100644
--- a/lib/include/gcm_vaes_avx512.inc
+++ b/lib/include/gcm_vaes_avx512.inc
@@ -3760,7 +3760,7 @@ align 32
 ; Output: A cipher of the given plain text (CIPH_PLAIN_OUT), and updated GDATA_CTX
 ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-%macro GCM_ENC_DEC 7
+%macro GCM_ENC_DEC 7-8
 %define %%GDATA_KEY %1 ; [in] key pointer
 %define %%GDATA_CTX %2 ; [in] context pointer
 %define %%CIPH_PLAIN_OUT %3 ; [in] output buffer pointer
 %define %%PLAIN_CIPH_IN %4 ; [in] input buffer pointer
 %define %%PLAIN_CIPH_LEN %5 ; [in] buffer length
 %define %%ENC_DEC %6 ; [in] cipher direction
 %define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection
+%define %%MSG_SIZE_SCOPE %8 ; [in] '>256' to remove small packets code path
+
+%assign include_small_packets 1
+
+%if %0 > 7
+%ifidn %%MSG_SIZE_SCOPE, '>256'
+%assign include_small_packets 0
+%endif
+%endif
 
 %define %%IA0 r10
 %define %%IA1 r12
@@ -3849,12 +3858,14 @@ align 32
 ;;; - hash 16 blocks
 ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
 
+%if include_small_packets != 0
 %ifidn __OUTPUT_FORMAT__, win64
        cmp     %%PLAIN_CIPH_LEN, 0
 %else
        or      %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN
 %endif
        je      %%_enc_dec_done
+%endif ; include_small_packets != 0
 
        ;; Update length of data processed
 %ifidn __OUTPUT_FORMAT__, win64
@@ -3900,8 +3911,10 @@ align 32
        je      %%_enc_dec_done
 %endif ; %%INSTANCE_TYPE, multi_call
 
+%if include_small_packets != 0
        cmp     %%LENGTH, (16 * 16)
        jbe     %%_message_below_equal_16_blocks
+%endif ; include_small_packets != 0
 
        vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK]
        vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444]
@@ -4155,8 +4168,12 @@ align 32
 %ifidn %%INSTANCE_TYPE, multi_call
        vpshufb %%CTR_BLOCKx, %%CTR_BLOCKx, XWORD(%%SHUF_MASK)
 %endif
+
+%if include_small_packets != 0
        jmp     %%_ghash_done
+%endif ; include_small_packets != 0
 
+%if include_small_packets != 0
 %%_message_below_equal_16_blocks:
        ;; Determine how many blocks to process
        ;; - process one additional block if there is a partial block
@@ -4183,6 +4200,7 @@ align 32
        vpxorq  %%ZTMP10, %%ZTMP10, %%ZTMP10
 %endif
        ;; fall through to exit
+%endif ; include_small_packets != 0
 
 %%_ghash_done:
 %ifdef SAFE_DATA
-- 
GitLab
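
The optional eighth parameter turns GCM_ENC_DEC into a large-message-only code
path, so a caller that passes '>256' must route short buffers to
GCM_ENC_DEC_0_TO_256 itself (the next patch does exactly that). The resulting
control flow is equivalent to this C-style sketch; all names in it are
illustrative, not library symbols:

    #include <stdint.h>
    #include <stdio.h>

    #define GCM_SMALL_MAX (16 * 16)  /* 16 AES blocks == 256 bytes */

    /* stand-ins for the two generated code paths */
    static void gcm_small(uint64_t len) { printf("0..256 path, len=%llu\n",
                                                 (unsigned long long)len); }
    static void gcm_large(uint64_t len) { printf(">256 path, len=%llu\n",
                                                 (unsigned long long)len); }

    static void gcm_encrypt(uint64_t len)
    {
            if (len <= GCM_SMALL_MAX)
                    gcm_small(len);   /* GCM_ENC_DEC_0_TO_256 */
            else
                    gcm_large(len);   /* GCM_ENC_DEC ... , '>256' */
    }

    int main(void) { gcm_encrypt(96); gcm_encrypt(4096); return 0; }
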
From abe21643aaed09e91149961edfa660e22104a05a Mon Sep 17 00:00:00 2001
From: Tomasz Kantecki
Date: Thu, 18 Jan 2024 15:41:07 +0000
Subject: [PATCH 30/30] vaes-avx512: [gcm] enable small packet optimization
 for all available single call API variants

The optimization is enabled for:
- single call cases
- any IV size (variable IV interface) and 12 byte IV size
- both cipher directions

To save space, small packet code generation is disabled in GCM_ENC_DEC for
the above cases; it is no longer needed as the new small packet code covers
them.

The GCM context was removed from the GCM_ENC_DEC_0_TO_256 macro argument
list (it is not required).

---
 lib/include/gcm_api_vaes_avx512.inc | 122 ++++++++++++++++-------
 lib/include/gcm_vaes_avx512.inc     | 148 ++++++++++++++--------------
 2 files changed, 162 insertions(+), 108 deletions(-)

diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc
index 6979e962..e3fc9cdd 100644
--- a/lib/include/gcm_api_vaes_avx512.inc
+++ b/lib/include/gcm_api_vaes_avx512.inc
@@ -50,6 +50,7 @@ default rel
 ; aes_gcm_precomp_256_vaes_avx512
 ; (struct gcm_key_data *key_data)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 32
 MKGLOBAL(FN_NAME(precomp,_),function,)
 FN_NAME(precomp,_):
        endbranch64
@@ -123,6 +124,7 @@ error_precomp:
 ;        u8 *auth_tag,
 ;        u64 auth_tag_len);
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 32
 MKGLOBAL(FN_NAME(enc,_),function,)
 FN_NAME(enc,_):
        endbranch64
@@ -185,25 +187,19 @@ FN_NAME(enc,_):
 .skip_aad_check_enc:
 %endif
 
-        ;; Check if msg_len < 256
+        ;; Check if msg_len <= 256
        cmp     arg5, 16 * 16
        jbe     .small_packet_path
 
        GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
                zmm1, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, \
                zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call
-        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call, '>256'
        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call, k1, r10, r11, r12
-%ifdef SAFE_DATA
-        clear_zmms_avx512 xmm6
-%endif
        jmp     .exit_enc
 
 .small_packet_path:
-        GCM_ENC_DEC_0_TO_256 arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC
-%ifdef SAFE_DATA
-        clear_zmms_avx512 xmm6
-%endif
+        GCM_ENC_DEC_0_TO_256 arg1, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC
 
 .exit_enc:
        FUNC_RESTORE
@@ -271,6 +267,7 @@ FN_NAME(enc,_):
 ;        u8 *auth_tag,
 ;        u64 auth_tag_len);
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 32
 MKGLOBAL(FN_NAME(dec,_),function,)
 FN_NAME(dec,_):
        endbranch64
@@ -285,68 +282,77 @@ FN_NAME(dec,_):
 
        ;; Check key_data != NULL
        cmp     arg1, 0
-        jz      error_dec
+        jz      .error_dec
 
        ;; Check context_data != NULL
        cmp     arg2, 0
-        jz      error_dec
+        jz      .error_dec
 
        ;; Check IV != NULL
        cmp     arg6, 0
-        jz      error_dec
+        jz      .error_dec
 
        ;; Check auth_tag != NULL
        cmp     arg9, 0
-        jz      error_dec
+        jz      .error_dec
 
        ;; Check auth_tag_len == 0 or > 16
        cmp     arg10, 0
-        jz      error_dec
+        jz      .error_dec
 
        cmp     arg10, 16
-        ja      error_dec
+        ja      .error_dec
 
        ;; Check if msg_len == 0
        cmp     arg5, 0
-        jz      skip_in_out_check_dec
+        jz      .skip_in_out_check_dec
 
        ;; Check if msg_len > max_len
        cmp     arg5, GCM_MAX_LENGTH
-        ja      error_dec
+        ja      .error_dec
 
        ;; Check out != NULL (msg_len != 0)
        cmp     arg3, 0
-        jz      error_dec
+        jz      .error_dec
 
        ;; Check in != NULL (msg_len != 0)
        cmp     arg4, 0
-        jz      error_dec
+        jz      .error_dec
 
-skip_in_out_check_dec:
+.skip_in_out_check_dec:
        ;; Check if aad_len == 0
        cmp     arg8, 0
-        jz      skip_aad_check_dec
+        jz      .skip_aad_check_dec
 
        ;; Check aad != NULL (aad_len != 0)
        cmp     arg7, 0
-        jz      error_dec
+        jz      .error_dec
 
-skip_aad_check_dec:
+.skip_aad_check_dec:
 %endif
 
+        ;; Check if msg_len <= 256
+        cmp     arg5, 16 * 16
+        jbe     .small_packet_path
+
        GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \
                zmm1, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, \
                zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call
-        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call
+        GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call, '>256'
        GCM_COMPLETE arg1, arg2, arg9, arg10, single_call, k1, r10, r11, r12
 %ifdef SAFE_DATA
        clear_zmms_avx512 xmm6
 %endif
-exit_dec:
+        jmp     .exit_dec
+
+.small_packet_path:
+        GCM_ENC_DEC_0_TO_256 arg1, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, DEC
+
+.exit_dec:
        FUNC_RESTORE
        ret
 
 %ifdef SAFE_PARAM
-error_dec:
+.error_dec:
        ;; Clear reg and imb_errno
        IMB_ERR_CHECK_START rax
@@ -369,7 +375,7 @@ error_dec:
 
        ;; Check if msg_len == 0
        cmp     arg5, 0
-        jz      skip_in_out_check_error_dec
+        jz      .skip_in_out_check_error_dec
 
        ;; Check if msg_len > max_len
        IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN
 
        ;; Check out != NULL (msg_len != 0)
        IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST
 
        ;; Check in != NULL (msg_len != 0)
        IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC
 
-skip_in_out_check_error_dec:
+.skip_in_out_check_error_dec:
        ;; Check if aad_len == 0
        cmp     arg8, 0
-        jz      skip_aad_check_error_dec
+        jz      .skip_aad_check_error_dec
 
        ;; Check aad != NULL (aad_len != 0)
        IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD
 
-skip_aad_check_error_dec:
+.skip_aad_check_error_dec:
        ;; Set imb_errno
        IMB_ERR_CHECK_END rax
-        jmp     exit_dec
+        jmp     .exit_dec
 %endif
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
@@ -402,11 +408,16 @@ skip_aad_check_error_dec:
 ;IMB_JOB *aes_gcm_enc_var_iv_128_vaes_avx512 / aes_gcm_enc_var_iv_192_vaes_avx512 /
 ;        aes_gcm_enc_var_iv_256_vaes_avx512(IMB_MGR *state, IMB_JOB *job)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 32
 MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal)
 FN_NAME(enc_var_iv,_):
        endbranch64
        FUNC_SAVE alloc_context
 
+        ;; Check if msg_len <= 256
+        cmp     qword [arg2 + _msg_len_to_cipher], 16 * 16
+        jbe     .small_packet_path
+
        mov     arg1, [arg2 + _enc_keys]
 
        GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \
@@ -420,7 +431,7 @@ FN_NAME(enc_var_iv,_):
        add     arg3, [arg2 + _cipher_start_src_offset]
        mov     arg4, [arg2 + _dst]
        mov     rbp, [arg2 + _msg_len_to_cipher]
-        GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, ENC, single_call
+        GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, ENC, single_call, '>256'
 
        GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \
                {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \
@@ -430,6 +441,27 @@ FN_NAME(enc_var_iv,_):
        clear_zmms_avx512 xmm1, xmm4, xmm6, xmm7, xmm8, xmm12, xmm13, xmm14, \
                xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26, xmm30, xmm31
 %endif
+        jmp     .exit_enc
+
+align 32
+.small_packet_path:
+        mov     arg1, [arg2 + _enc_keys]
+        mov     arg3, [arg2 + _src]
+        add     arg3, [arg2 + _cipher_start_src_offset]
+        mov     arg4, [arg2 + _dst]
+        mov     rbp, [arg2 + _msg_len_to_cipher]
+        GCM_ENC_DEC_0_TO_256 arg1, arg4, arg3, rbp, \
+                {[arg2 + _iv]}, \
+                {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \
+                {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \
+                ENC, {qword [arg2 + _iv_len_in_bytes]}
+
+%ifdef SAFE_DATA
+        clear_zmms_avx512 xmm0, xmm1, xmm2, xmm7, xmm8, xmm9, xmm11, xmm10, xmm14, \
+                xmm15, xmm16, xmm17, xmm18, xmm20, xmm21
+%endif
+
+.exit_enc:
        ;; mark job complete
        mov     dword [arg2 + _status], IMB_STATUS_COMPLETED
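
These single-call entry points are reachable through the public intel-ipsec-mb
API. The following usage sketch is based on the library's usual one-shot GCM
macros and manager helpers as I understand them (IMB_AES128_GCM_PRE,
IMB_AES128_GCM_ENC, alloc_mb_mgr, init_mb_mgr_auto); treat the exact names and
signatures as an assumption, not verified sample code. A 96-byte buffer keeps
the call on the new small packet path:

    #include <intel-ipsec-mb.h>
    #include <stdint.h>

    /* Hedged usage sketch: one-shot 128-bit AES-GCM via the MB manager. */
    int gcm_demo(void)
    {
            IMB_MGR *mb = alloc_mb_mgr(0);
            struct gcm_key_data key_data;
            struct gcm_context_data ctx;
            uint8_t key[16] = {0}, iv[12] = {0}, aad[12] = {0};
            uint8_t in[96] = {0}, out[96], tag[16];

            if (mb == NULL)
                    return -1;
            init_mb_mgr_auto(mb, NULL);

            IMB_AES128_GCM_PRE(mb, key, &key_data);       /* expand key, HK powers */
            IMB_AES128_GCM_ENC(mb, &key_data, &ctx, out, in, sizeof(in),
                               iv, aad, sizeof(aad), tag, sizeof(tag));
            free_mb_mgr(mb);
            return 0;
    }
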
@@ -445,11 +477,16 @@ FN_NAME(enc_var_iv,_):
 ;IMB_JOB *aes_gcm_dec_var_iv_128_vaes_avx512 / aes_gcm_dec_var_iv_192_vaes_avx512 /
 ;        aes_gcm_dec_var_iv_256_vaes_avx512(IMB_MGR *state, IMB_JOB *job)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+align 32
 MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal)
 FN_NAME(dec_var_iv,_):
        endbranch64
        FUNC_SAVE alloc_context
 
+        ;; Check if msg_len <= 256
+        cmp     qword [arg2 + _msg_len_to_cipher], 16 * 16
+        jbe     .small_packet_path
+
        mov     arg1, [arg2 + _dec_keys]
 
        GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \
@@ -463,7 +500,7 @@ FN_NAME(dec_var_iv,_):
        add     arg3, [arg2 + _cipher_start_src_offset]
        mov     arg4, [arg2 + _dst]
        mov     rbp, [arg2 + _msg_len_to_cipher]
-        GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, DEC, single_call
+        GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, DEC, single_call, '>256'
 
        GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \
                {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \
@@ -473,6 +510,25 @@ FN_NAME(dec_var_iv,_):
        clear_zmms_avx512 xmm1, xmm4, xmm6, xmm7, xmm8, xmm12, xmm13, xmm14, \
                xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26, xmm30, xmm31
 %endif
+        jmp     .exit_dec
+
+align 32
+.small_packet_path:
+        mov     arg1, [arg2 + _enc_keys]
+        mov     arg3, [arg2 + _src]
+        add     arg3, [arg2 + _cipher_start_src_offset]
+        mov     arg4, [arg2 + _dst]
+        mov     rbp, [arg2 + _msg_len_to_cipher]
+        GCM_ENC_DEC_0_TO_256 arg1, arg4, arg3, rbp, \
+                {[arg2 + _iv]}, \
+                {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \
+                {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \
+                DEC, {qword [arg2 + _iv_len_in_bytes]}
+
+%ifdef SAFE_DATA
+        clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm9, xmm15, xmm16, xmm17, xmm18, xmm19, xmm20, xmm21
+%endif
+.exit_dec:
        ;; mark job complete
        mov     dword [arg2 + _status], IMB_STATUS_COMPLETED
 
diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc
index f039e6d3..de394b4d 100644
--- a/lib/include/gcm_vaes_avx512.inc
+++ b/lib/include/gcm_vaes_avx512.inc
@@ -561,7 +561,6 @@
 ;; It may also be that they are the only blocks to process.
 
 ;; Set hash key and register index position for the remaining 1 to 3 blocks
-;; %assign hashk HashKey_ %+ blocks_left
 %assign reg_idx (%%NUM_BLOCKS / 4)
 %xdefine %%REG_IN %%CIPHER_IN %+ reg_idx
@@ -3161,21 +3160,20 @@ align 32
 ;; - works with AAD size
 ;; - works with IV size provided IV length is provided
 ;; Output: C and T
-;; Clobbers rax, r12, r13, zmm0-zmm23, zmm30, zmm31, k1, k2, r11 (windows)
+;; Clobbers rax, r12, r13, zmm0-zmm23, zmm26-zmm29, zmm30, zmm31, k1, k2, r11 (windows)
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-%macro GCM_ENC_DEC_0_TO_256 11-12
+%macro GCM_ENC_DEC_0_TO_256 10-11
 %define %%GDATA_KEY %1 ; [in] key pointer
-%define %%GDATA_CTX %2 ; [in] context pointer
-%define %%CIPH_PLAIN_OUT %3 ; [in] output buffer pointer
-%define %%PLAIN_CIPH_IN %4 ; [in] input buffer pointer
-%define %%PLAIN_CIPH_LEN %5 ; [in] buffer length
-%define %%IV %6 ; [in] IV pointer
-%define %%A_IN %7 ; [in] AAD pointer
-%define %%A_LEN %8 ; [in] AAD length in bytes
-%define %%AUTH_TAG %9 ; [in] pointer to store auth tag into (GP or mem)
-%define %%AUTH_TAG_LEN %10 ; [in] length in bytes of auth tag (GP or mem)
-%define %%ENC_DEC %11 ; [in] cipher direction
-%define %%IV_LEN %12 ; [in] IV length
+%define %%CIPH_PLAIN_OUT %2 ; [in] output buffer pointer
+%define %%PLAIN_CIPH_IN %3 ; [in] input buffer pointer
+%define %%PLAIN_CIPH_LEN %4 ; [in] buffer length
+%define %%IV %5 ; [in] IV pointer
+%define %%A_IN %6 ; [in] AAD pointer
+%define %%A_LEN %7 ; [in] AAD length in bytes
+%define %%AUTH_TAG %8 ; [in] pointer to store auth tag into (GP or mem)
+%define %%AUTH_TAG_LEN %9 ; [in] length in bytes of auth tag (GP or mem)
+%define %%ENC_DEC %10 ; [in] cipher direction
+%define %%IV_LEN %11 ; [in] IV length
 
 %define %%IA0 rax
 %define %%IA1 r12
@@ -3220,16 +3218,23 @@ align 32
 %define %%ZTMP21 zmm23
 %define %%ZTMP22 zmm24 ; not used
 %define %%ZTMP23 zmm25 ; not used
-%define %%ZTMP24 zmm26 ; not used
-%define %%ZTMP25 zmm27 ; not used
-%define %%ZTMP26 zmm28 ; not used
-%define %%ZTMP27 zmm29 ; not used
+%define %%ZTMP24 zmm26
+%define %%ZTMP25 zmm27
+%define %%ZTMP26 zmm28
+%define %%ZTMP27 zmm29
 
-%define %%MASKREG k1
+%define %%DAT0 %%ZTMP24
+%define %%DAT1 %%ZTMP25
+%define %%DAT2 %%ZTMP26
+%define %%DAT3 %%ZTMP27
+
+%define %%MASK_TEXT k1
+%define %%MASK_TAG k1
+%define %%MASK_IVAAD k2
 
        ;; ===================================================================
        ;; prepare IV
-%if %0 == 12
+%if %0 == 11
        ;; IV may be different than 12 bytes
        cmp     %%IV_LEN, 12
        je      %%_iv_length_is_12_bytes
@@ -3243,28 +3248,28 @@ align 32
        vmovdqa64 %%ORIG_IVx, [rel ONEf]
        mov     %%IA2, %%IV
        mov     DWORD(%%IA1), 0x0000_0fff
-        kmovd   k2, DWORD(%%IA1)
-        vmovdqu8 %%ORIG_IVx{k2}, [%%IA2] ; ctr = IV | 0x1
+        kmovd   %%MASK_IVAAD, DWORD(%%IA1)
+        vmovdqu8 %%ORIG_IVx{%%MASK_IVAAD}, [%%IA2] ; ctr = IV | 0x1
 
 %%_iv_prep_is_done:
        ;; set up context fields
-        vpshufb %%CTR_BLOCKx, %%ORIG_IVx, [rel SHUF_MASK]
+        vpshufb %%CTR_BLOCKx, %%ORIG_IVx, [rel SHUF_MASK]
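
For the common 12-byte IV handled above, the ONEf constant plus the byte-masked
vmovdqu8 build the pre-counter block J0 = IV || 0x00000001 directly (SP 800-38D,
section 7.1). A minimal C sketch of the same step; the helper name is made up:

    #include <stdint.h>
    #include <string.h>

    /* J0 for the 12-byte IV fast path: IV || 0^31 || 1. */
    static void gcm_j0_12byte_iv(const uint8_t iv[12], uint8_t j0[16])
    {
            memcpy(j0, iv, 12);      /* bytes 0..11: the IV           */
            j0[12] = 0;              /* bytes 12..14: zero            */
            j0[13] = 0;
            j0[14] = 0;
            j0[15] = 1;              /* byte 15: big-endian counter 1 */
    }
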
        ;; ===================================================================
        ;; check for zero message length
 %ifidn __OUTPUT_FORMAT__, win64
-        cmp %%PLAIN_CIPH_LEN, 0
+        cmp     %%PLAIN_CIPH_LEN, 0
 %else
-        or %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN
+        or      %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN
 %endif
-        je %%_small_initial_num_blocks_is_0
+        je      %%_small_initial_num_blocks_is_0
 
        ;; ===================================================================
-        ;; Prepare %%LENGTH register
+        ;; Prepare %%LENGTH register
 %ifidn __OUTPUT_FORMAT__, win64
 %define %%LENGTH %%IA3
-        mov %%LENGTH, %%PLAIN_CIPH_LEN
+        mov     %%LENGTH, %%PLAIN_CIPH_LEN
 %else
 %define %%LENGTH %%PLAIN_CIPH_LEN ;; PLAIN_CIPH_LEN is a register on linux
 %endif
@@ -3274,9 +3279,9 @@ align 32
 
 %define %%NUM_BLOCKS %%IA1
 
-        mov DWORD(%%NUM_BLOCKS), DWORD(%%LENGTH)
-        add DWORD(%%NUM_BLOCKS), 15
-        shr DWORD(%%NUM_BLOCKS), 4
+        mov     DWORD(%%NUM_BLOCKS), DWORD(%%LENGTH)
+        add     DWORD(%%NUM_BLOCKS), 15
+        shr     DWORD(%%NUM_BLOCKS), 4
        ;; %%NUM_BLOCKS can be in the range from 0 to 16
 
        cmp     DWORD(%%NUM_BLOCKS), 8
@@ -3341,12 +3346,6 @@ align 32
 %define %%CTR1 %%ZTMP1
 %define %%CTR2 %%ZTMP2
 %define %%CTR3 %%ZTMP3
-%define %%DAT0 %%ZTMP4
-%define %%DAT1 %%ZTMP5
-%define %%DAT2 %%ZTMP6
-%define %%DAT3 %%ZTMP7
-%define %%LAST_CIPHER_BLK %%ZTMP8
-%define %%LAST_GHASH_BLK %%ZTMP9
 
        ;; ===================================================================
        ;; - load shuffle mask
@@ -3371,7 +3370,7 @@ align 32
 %elif num_blocks > 4
        sub     %%IA1, 64
 %endif
-        kmovq   %%MASKREG, [%%IA0 + %%IA1*8]
+        kmovq   %%MASK_TEXT, [%%IA0 + %%IA1*8]
 
        ;; ===================================================================
        ;; Check if counter blocks can be prepared in BE format or
@@ -3424,7 +3423,7 @@ align 32
 %endif
 
        ;; ===================================================================
-        ;; shuffle the counter blcoks for AES rounds
+        ;; shuffle the counter blocks for AES rounds
        ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \
                %%CTR0, %%CTR1, %%CTR2, %%CTR3, \
@@ -3434,40 +3433,31 @@ align 32
 
        ;; ===================================================================
        ;; append original IV to message blocks for AES encryption, if possible
-
-%if (num_blocks >= 14) && (num_blocks <= 15)
+
+%if (num_blocks % 4) != 0
 %assign num_blocks_aes (num_blocks + 1)
 %assign blend_orig_iv_aes 1
+
+%if (num_blocks >= 14) && (num_blocks <= 15)
        vinserti64x2 %%CTR3, %%ORIG_IVx, num_blocks - 12
 %elif (num_blocks == 13)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 YWORD(%%CTR3), %%ORIG_IVx, num_blocks - 12
 %elif (num_blocks >= 10) && (num_blocks <= 11)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 %%CTR2, %%ORIG_IVx, num_blocks - 8
 %elif (num_blocks == 9)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 YWORD(%%CTR2), %%ORIG_IVx, num_blocks - 8
 %elif (num_blocks >= 6) && (num_blocks <= 7)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 %%CTR1, %%ORIG_IVx, num_blocks - 4
 %elif (num_blocks == 5)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 YWORD(%%CTR1), %%ORIG_IVx, num_blocks - 4
 %elif (num_blocks >= 2) && (num_blocks <= 3)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
        vinserti64x2 %%CTR0, %%ORIG_IVx, num_blocks
-%elif (num_blocks == 1)
-%assign num_blocks_aes (num_blocks + 1)
-%assign blend_orig_iv_aes 1
+%else ; (num_blocks == 1)
        vinserti64x2 YWORD(%%CTR0), %%ORIG_IVx, num_blocks
+%endif
+
 %else
-        ;; 16 or 0 block cases
+        ;; 16, 12, 8, 4 or 0 block cases
 %assign num_blocks_aes num_blocks
 %assign blend_orig_iv_aes 0
 %endif
@@ -3475,7 +3465,7 @@ align 32
        ;; ===================================================================
        ;; load plain/cipher text
        ZMM_LOAD_MASKED_BLOCKS_0_16 num_blocks, %%PLAIN_CIPH_IN, 0, \
-                %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASKREG
+                %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASK_TEXT
 
        ;; ===================================================================
@@ -3535,7 +3525,7 @@ align 32
        ;; ===================================================================
        ;; write cipher/plain text back to output and
        ZMM_STORE_MASKED_BLOCKS_0_16 num_blocks, %%CIPH_PLAIN_OUT, 0, \
-                %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASKREG
+                %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASK_TEXT
 
        ;; ===================================================================
        ;; Shuffle the cipher text blocks for hashing part
@@ -3551,13 +3541,13 @@ align 32
        ;; - zero bytes outside the mask before hashing
 %if num_blocks <= 4
-        vmovdqu8 %%CTR0{%%MASKREG}{z}, %%CTR0
+        vmovdqu8 %%CTR0{%%MASK_TEXT}{z}, %%CTR0
 %elif num_blocks <= 8
-        vmovdqu8 %%CTR1{%%MASKREG}{z}, %%CTR1
+        vmovdqu8 %%CTR1{%%MASK_TEXT}{z}, %%CTR1
 %elif num_blocks <= 12
-        vmovdqu8 %%CTR2{%%MASKREG}{z}, %%CTR2
+        vmovdqu8 %%CTR2{%%MASK_TEXT}{z}, %%CTR2
 %else
-        vmovdqu8 %%CTR3{%%MASKREG}{z}, %%CTR3
+        vmovdqu8 %%CTR3{%%MASK_TEXT}{z}, %%CTR3
 %endif
 
        ;; - cipher blocks are in CTR0-CTR3
@@ -3577,12 +3567,12 @@ align 32
        ;; - AAD and block with sizes get hashed together
        ;; - one reduction for everything (AAD + message + length block)
 
-%if %0 == 12 ;; IV may be different than 12 bytes and k2 not set
+%if %0 == 11 ;; IV may be different than 12 bytes and %%MASK_IVAAD not set
        mov     DWORD(%%IA1), 0x0000_0fff
-        kmovd   k2, DWORD(%%IA1)
+        kmovd   %%MASK_IVAAD, DWORD(%%IA1)
 %endif
        mov     %%IA1, %%A_IN
-        vmovdqu8 %%AAD_HASHx{k2}{z}, [%%IA1]
+        vmovdqu8 %%AAD_HASHx{%%MASK_IVAAD}{z}, [%%IA1]
        vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx
 
        vmovq   XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN
@@ -3592,20 +3582,22 @@ align 32
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; GHASH 12 byte AAD with length block using respective GHASH key powers
+        ;;     AAD_HASHy = [ AAD: 0-127 | LENGTH: 128-255 ]
+        ;;     HASH_KEY  = [ HK ^ (N + 2) | HK ^ 1 ]
 
-%assign num_blocks2 (num_blocks + 1)
-%define StartHashKey HashKey_ %+ num_blocks2
+%assign num_blocks2 (num_blocks + 2)
+%define %%HKeyN2 HashKey_ %+ num_blocks2
 
-        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + StartHashKey + HKeyGap]
+        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + %%HKeyN2 + HKeyGap]
        vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1 + HKeyGap], 1
        vpclmulqdq YWORD(%%ZTMP14), %%AAD_HASHy, YWORD(%%ZTMP13), 0x00 ; TLL = GH_L * KK_L
        vpclmulqdq YWORD(%%ZTMP15), %%AAD_HASHy, YWORD(%%ZTMP13), 0x10 ; TLH = GH_L * KK_H
 
-        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + StartHashKey]
+        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + %%HKeyN2]
        vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1], 1
        vpclmulqdq YWORD(%%ZTMP16), %%AAD_HASHy, YWORD(%%ZTMP13), 0x01 ; THL = GH_H * HK_L
        vpclmulqdq YWORD(%%ZTMP17), %%AAD_HASHy, YWORD(%%ZTMP13), 0x11 ; THH = GH_H * HK_H
-%undef StartHashKey
+%undef %%HKeyN2
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        ;; add products
        vpxorq  YWORD(%%ZTMP14), YWORD(%%ZTMP14), YWORD(%%ZTMP16) ;; TLL += THL
        vpxorq  YWORD(%%ZTMP15), YWORD(%%ZTMP15), YWORD(%%ZTMP17) ;; TLH += THH
 
        ;; ===================================================================
        ;; continue with message GHASH followed by reduction
+        ;;
+        ;; Hash key powers and corresponding message blocks:
+        ;;     HASH_KEY = [ HK ^ (N + 1), HK ^ N, ... HK ^ 2 ]
+        ;;     MSG      = [ MSG1, MSG2, ... MSGN ]
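
The key-power layout in the comment above corresponds to one fused GHASH over
the AAD, the cipher text and the length block. With N cipher-text blocks
C_1..C_N, padded AAD block A and length block L = len(A)||len(C), the computed
hash is:

    S \;=\; A \cdot H^{N+2} \;\oplus\; \Big(\bigoplus_{i=1}^{N} C_i \cdot H^{\,N+2-i}\Big) \;\oplus\; L \cdot H

where every product is a carry-less multiplication in GF(2^128) reduced modulo
x^128 + x^7 + x^2 + x + 1; precomputing the powers of H is what allows all
vpclmulqdq products to be accumulated before a single reduction.
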
+
        GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \
                %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP20, \
-                %%ZTMP21, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, 1, \
+                %%ZTMP21, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \
+                1, \
                %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks, %%ZTMP15, %%ZTMP14
 
        jmp     %%_small_initial_blocks_encrypted
@@ -3724,7 +3722,7 @@ align 32
        vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1]
        vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap]
 
-        vmovq   XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN
+        vpxorq  XWORD(%%ZTMP15), XWORD(%%ZTMP15), XWORD(%%ZTMP15) ; len(C) = 0
        vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1     ; ZTMP15 = len(A)||len(C)
        vpsllq  XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits
@@ -3735,7 +3733,7 @@ align 32
        ;; ===================================================================
        ;; Complete GMAC computation
        ;;     S => %%AAD_HASHx
-        ;;     CIPH(J0) => %%ORIG_IVx
+        ;;     CIPHER(J0) => %%ORIG_IVx
        ;;     T = MSB(GCTR(J0,S))
        vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx
        vpxorq  %%ORIG_IVx, %%ORIG_IVx, %%AAD_HASHx
@@ -3746,8 +3744,8 @@ align 32
        mov     %%IA1, %%AUTH_TAG_LEN
 
        lea     %%IA2, [rel byte64_len_to_mask_table]
-        kmovq   %%MASKREG, [%%IA2 + %%IA1*8]
-        vmovdqu8 [%%IA0]{%%MASKREG}, %%ORIG_IVx
+        kmovq   %%MASK_TAG, [%%IA2 + %%IA1*8]
+        vmovdqu8 [%%IA0]{%%MASK_TAG}, %%ORIG_IVx
 
 %endmacro ; GCM_ENC_DEC_0_TO_256
-- 
GitLab
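
One detail worth keeping in mind when reading both versions of the counter
preparation in this macro: only the 32-bit counter at the end of J0 counts up,
and the fast big-endian path adds into the last counter byte only, which is
why it is guarded by cmp BYTE(...), 256 - num_blocks before falling back to
the byte-swapped little-endian path. A scalar C model of the increment,
illustrative only:

    #include <stdint.h>

    /* Big-endian increment of the 32-bit counter in bytes 12..15 of the
     * block. The SIMD fast path above is equivalent to adding 1..16 to
     * block[15] alone, valid while that byte does not wrap. */
    static void ctr_next(uint8_t block[16])
    {
            for (int i = 15; i >= 12; i--)
                    if (++block[i] != 0)
                            break;  /* stop once there is no carry */
    }
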