diff --git a/.github/workflows/freebsd.yml b/.github/workflows/freebsd.yml index 561196070a354ba4613dd6c9850160e05bb17451..aa106f344bf5fdd96c97c5a53ead76706e229cd9 100644 --- a/.github/workflows/freebsd.yml +++ b/.github/workflows/freebsd.yml @@ -46,3 +46,53 @@ jobs: run: | freebsd-version gmake CC=gcc -j 4 + + # CMake release build with Clang + release-cmake-clang: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + repository: 'intel/intel-ipsec-mb' + + - name: Clang Release Build + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + mem: 8192 + prepare: pkg install -y curl nasm gmake cmake + run: | + echo ">>> CMAKE CONFIGURE" + cmake -B ./build -DCMAKE_BUILD_TYPE=Release + echo ">>> CMAKE BUILD" + cd ./build + cmake --build . --config Release -j4 -v + ctest -j 5 -C Release + echo ">>> CMAKE INSTALL" + cmake --install . + + # CMake release build with GCC + release-cmake-gcc: + runs-on: ubuntu-22.04 + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + repository: 'intel/intel-ipsec-mb' + + - name: Release build with GCC + uses: vmactions/freebsd-vm@v1 + with: + usesh: true + mem: 8192 + prepare: pkg install -y curl nasm gmake cmake gcc + run: | + echo ">>> CMAKE CONFIGURE" + cmake -B ./build -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc + echo ">>> CMAKE BUILD" + cd ./build + cmake --build . --config Release -j4 -v + ctest -j 5 -C Release diff --git a/CMakeLists.txt b/CMakeLists.txt index f9ab07766a883537ec5117ce07f81972d358523c..521a1f10171a9031583b3f51a68ae4d09570313a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+cmake_minimum_required(VERSION 3.18) +cmake_policy(VERSION 3.18) + include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/utils.cmake") # get version from public header file @@ -31,13 +34,16 @@ set(IMB_HDR "${CMAKE_CURRENT_SOURCE_DIR}/lib/ipsec-mb.h") imb_get_version(${IMB_HDR}) message(STATUS "Project Version: ${IPSEC_MB_VERSION_FULL}") -cmake_minimum_required(VERSION 3.16) - # set default project values imb_set_proj_defaults() -project(ipsec-mb VERSION ${IPSEC_MB_VERSION} - DESCRIPTION "IPsec Multi-Buffer library") +project( + ipsec-mb + VERSION ${IPSEC_MB_VERSION} + DESCRIPTION "IPsec Multi-Buffer library" + LANGUAGES C ASM + HOMEPAGE_URL https://git.gitlab.arm.com/arm-reference-solutions/ipsec-mb/ +) # add testing support include(CTest) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 15a324555590e710088f9ac9ba9e80d0a4a5784e..28a54c0df7280586416e7cd572022fa449e77f65 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -24,6 +24,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. include(CheckCCompilerFlag) +include(CheckLinkerFlag) # extract library version from header file macro(imb_get_version IMB_HDR_FILE) @@ -133,7 +134,16 @@ macro(imb_compiler_check) (CMAKE_C_COMPILER_VERSION VERSION_LESS 5.0)) message(FATAL_ERROR "GNU C Compiler version must be 5.0 or higher") endif() - check_c_compiler_flag("-fcf-protection" CC_HAS_CET) + + # enable CET if supported by both compiler and linker + check_c_compiler_flag("-fcf-protection=full" CC_CET_CHECK) + check_linker_flag("C" "LINKER:-z,ibt" LD_IBT_CHECK) + if(CC_CET_CHECK AND LD_IBT_CHECK) + set(CET_SUPPORT YES) + else() + set(CET_SUPPORT NO) + endif() + message(STATUS "CET SUPPORT... 
${CET_SUPPORT}") endmacro() # add uninstall target diff --git a/lib/Makefile b/lib/Makefile index eecb22fb1e3ab350dd4a749072aceca0ae0d8e66..98a0986a3404eb688305d060538b6bf68bcb5cb7 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -753,6 +753,7 @@ asm_avx512_lib_objs := \ sha256_x16_avx512.o \ sha512_x8_avx512.o \ des_x16_avx512.o \ + des_common_avx512.o \ aes_ecb_vaes_avx512.o \ aes_ecb_quic_vaes_avx512.o \ aes_cntr_api_by16_vaes_avx512.o \ @@ -831,14 +832,14 @@ asm_sse_gcm_objs := \ asm_avx_gcm_objs := asm_avx2_gcm_objs := \ - aes128_gcm_by8_avx2.o aes192_gcm_by8_avx2.o aes256_gcm_by8_avx2.o \ + ghash_by8_avx2.o aes128_gcm_by8_avx2.o aes192_gcm_by8_avx2.o aes256_gcm_by8_avx2.o \ aes128_gcm_vaes_avx2.o aes192_gcm_vaes_avx2.o aes256_gcm_vaes_avx2.o asm_avx512_gcm_objs := \ aes128_gcm_api_vaes_avx512.o aes192_gcm_api_vaes_avx512.o aes256_gcm_api_vaes_avx512.o \ aes128_gcm_sgl_api_vaes_avx512.o aes192_gcm_sgl_api_vaes_avx512.o aes256_gcm_sgl_api_vaes_avx512.o \ - aes128_gmac_api_vaes_avx512.o aes192_gmac_api_vaes_avx512.o aes256_gmac_api_vaes_avx512.o \ - aes128_gcm_by8_avx512.o aes192_gcm_by8_avx512.o aes256_gcm_by8_avx512.o + ghash_api_vaes_avx512.o \ + gmac_api_vaes_avx512.o endif # aarch64 diff --git a/lib/avx2_t1/aes128_gcm_by8_avx2.asm b/lib/avx2_t1/aes128_gcm_by8_avx2.asm index be3a4d15722c92885d1bdba60ee7e48e52d3a318..1b7efabf4b78bfe3e4d019c20148b9963beca71f 100644 --- a/lib/avx2_t1/aes128_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes128_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM128_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/aes192_gcm_by8_avx2.asm b/lib/avx2_t1/aes192_gcm_by8_avx2.asm index 4d28c0d6795b1e1932f2ff52079ece680da0c8b8..58737ae64a9fc09327a63ef259f2cf1451e68baf 100644 --- a/lib/avx2_t1/aes192_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes192_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM192_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/aes256_gcm_by8_avx2.asm b/lib/avx2_t1/aes256_gcm_by8_avx2.asm index 63c87273160091e2b148d6a328b9d2a193bd6cb5..eb4ea60c04df9dbe8435e68a62bd25b1a96ad580 100644 --- a/lib/avx2_t1/aes256_gcm_by8_avx2.asm +++ b/lib/avx2_t1/aes256_gcm_by8_avx2.asm @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2017-2023, Intel Corporation All rights reserved. +; Copyright(c) 2017-2024, Intel Corporation All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define GCM256_MODE 1 -%include "include/gcm_avx_gen4.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/avx2_t1/ghash_by8_avx2.asm b/lib/avx2_t1/ghash_by8_avx2.asm new file mode 100644 index 0000000000000000000000000000000000000000..72f3f3d7e589ac5955bbff68323dff07c4dda8dd --- /dev/null +++ b/lib/avx2_t1/ghash_by8_avx2.asm @@ -0,0 +1,233 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE + +%use smartalign + +%include "include/gcm_common_avx2_avx512.inc" + +mksection .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_pre_avx_gen4 / ghash_pre_avx512 +; (const void *key, struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_pre_avx_gen4,function,) +MKGLOBAL(ghash_pre_avx512,function,) +ghash_pre_avx_gen4: +ghash_pre_avx512: + endbranch64 +;; Parameter is passed through register +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key != NULL + cmp arg1, 0 + jz error_ghash_pre + + ;; Check key_data != NULL + cmp arg2, 0 + jz error_ghash_pre +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + sub rsp, 1*16 + + ; only xmm6 needs to be maintained + vmovdqu [rsp + 0*16], xmm6 +%endif + vmovdqu xmm6, [arg1] + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 +%endif +exit_ghash_pre: + ret + +%ifdef SAFE_PARAM +error_ghash_pre: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_ghash_pre +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; ghash_internal_avx_gen4 +;; [in] r12 = A_IN +;; [in] r13 = A_LEN +;; [in] arg1 = GDATA_KEY +;; [in/out] xmm0 = hash in/out +;; [clobbered] xmm1-xmm6 +;; [clobbered] r10, r11, rax +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_internal_avx_gen4,function,internal) +ghash_internal_avx_gen4: + CALC_AAD_HASH r12, r13, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ + r10, r11, rax + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; partial_block_gmac_avx_gen4 +;; [in] arg2 = GDATA_CTX +;; [in] arg3 = PLAIN_IN +;; [in] arg4 = PLAIN_LEN +;; [out] r11 = DATA_OFFSET +;; [in/out] xmm0 = hash in/out +;; [in] xmm13 = hash key +;; [in] xmm14 = hash-K key +;; [clobbered] xmm1-xmm6, xmm8, xmm9, xmm10 +;; [clobbered] r10, r12, r13, r15, rax +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(partial_block_gmac_avx_gen4,function,internal) +partial_block_gmac_avx_gen4: + PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm0, xmm13, xmm14, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm8, xmm9, xmm10 
+ ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void ghash_avx_gen4 / ghash_avx512 ( +; const struct gcm_key_data *key_data, +; const void *in, +; const u64 in_len, +; void *io_tag, +; const u64 tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_avx_gen4,function,) +MKGLOBAL(ghash_avx512,function,) +ghash_avx_gen4: +ghash_avx512: + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_ghash + + ;; Check in != NULL + or arg2, arg2 + jz error_ghash + + ;; Check in_len != 0 + or arg3, arg3 + jz error_ghash + + ;; Check tag != NULL + or arg4, arg4 + jz error_ghash + + ;; Check tag_len != 0 + cmp arg5, 0 + jz error_ghash +%endif + + ;; copy tag to xmm0 + vmovdqu xmm0, [arg4] + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + mov r12, arg2 + mov r13, arg3 + call ghash_internal_avx_gen4 + vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap + + simd_store_avx arg4, xmm0, arg5, r12, rax + +exit_ghash: + FUNC_RESTORE + ret + +%ifdef SAFE_PARAM +error_ghash: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check in != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC + + ;; Check in_len != 0 + IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN + + ;; Check tag != NULL + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH + + ;; Check tag_len != 0 + IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_ghash +%endif + +mksection stack-noexec + diff --git a/lib/avx512_t1/aes128_gcm_by8_avx512.asm b/lib/avx512_t1/aes128_gcm_by8_avx512.asm deleted file mode 100644 index 5487a4feed403aa0000e2ef33942fb31ab0a0347..0000000000000000000000000000000000000000 --- 
a/lib/avx512_t1/aes128_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM128_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/aes192_gcm_by8_avx512.asm b/lib/avx512_t1/aes192_gcm_by8_avx512.asm deleted file mode 100644 index 9a1e645ffa52fed3f2c3cbc65ec03779ce88e168..0000000000000000000000000000000000000000 --- a/lib/avx512_t1/aes192_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM192_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/aes256_gcm_by8_avx512.asm b/lib/avx512_t1/aes256_gcm_by8_avx512.asm deleted file mode 100644 index ea7728b707a906fe81da4f3918ed64976f61865d..0000000000000000000000000000000000000000 --- a/lib/avx512_t1/aes256_gcm_by8_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. 
-; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM256_MODE 1 -%include "include/gcm_avx512.inc" diff --git a/lib/avx512_t1/des_common_avx512.asm b/lib/avx512_t1/des_common_avx512.asm new file mode 100644 index 0000000000000000000000000000000000000000..d103615bf29f7cafb939e38e0712f8803553b8fa --- /dev/null +++ b/lib/avx512_t1/des_common_avx512.asm @@ -0,0 +1,221 @@ +; +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. +;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. 
+;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;; + +;; Collection of functions generated by DES_ENC_DEC macro with preset input/output arguments. +;; This method allows to reduce code footprint while maintaining identical performance. +;; +;; If register usage changes then generated functions below may need to be corrected. +;; See DES_ENC_DEC macro for more details. 
+ +%include "include/des_avx512.inc" + +;;; ======================================================== +;;; DATA + +extern des_mask_values_avx512 +extern des_init_perm_consts_avx512 +extern des_S_box_flipped_avx512 +extern des_vec_ones_32b_avx512 +extern des_and_eu_avx512 +extern des_and_ed_avx512 +extern des_idx_e_avx512 +extern des_reg_values16bit_7_avx512 +extern des_shuffle_reg_avx512 + +;;; ======================================================== +;;; CODE +mksection .text + +;;; >>>>>>>>>>>>>> ENCRYPT FUNCTIONS + +;;; r15 : key schedule pointer +;;; zmm0 : [in/out] R +;;; zmm1 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm0_zmm1_avx512,function,internal) +des_enc_zmm0_zmm1_avx512: + DES_ENC_DEC_EXP ENC,zmm0,zmm1,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm2 : [in/out] R +;;; zmm3 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm2_zmm3_avx512,function,internal) +des_enc_zmm2_zmm3_avx512: + DES_ENC_DEC_EXP ENC,zmm2,zmm3,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm4 : [in/out] R +;;; zmm5 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm4_zmm5_avx512,function,internal) +des_enc_zmm4_zmm5_avx512: + DES_ENC_DEC_EXP ENC,zmm4,zmm5,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm6 : [in/out] R +;;; zmm7 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm6_zmm7_avx512,function,internal) +des_enc_zmm6_zmm7_avx512: + DES_ENC_DEC_EXP ENC,zmm6,zmm7,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm8 : [in/out] R +;;; zmm9 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm8_zmm9_avx512,function,internal) +des_enc_zmm8_zmm9_avx512: + DES_ENC_DEC_EXP ENC,zmm8,zmm9,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm10 : 
[in/out] R +;;; zmm11 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm10_zmm11_avx512,function,internal) +des_enc_zmm10_zmm11_avx512: + DES_ENC_DEC_EXP ENC,zmm10,zmm11,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm12 : [in/out] R +;;; zmm13 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm12_zmm13_avx512,function,internal) +des_enc_zmm12_zmm13_avx512: + DES_ENC_DEC_EXP ENC,zmm12,zmm13,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm14 : [in/out] R +;;; zmm15 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm14_zmm15_avx512,function,internal) +des_enc_zmm14_zmm15_avx512: + DES_ENC_DEC_EXP ENC,zmm14,zmm15,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; CFB ONE use case +;;; r15 : key schedule pointer +;;; zmm18 : [in/out] R +;;; zmm19 : [in/out] L +align 64 +MKGLOBAL(des_enc_zmm18_zmm19_avx512,function,internal) +des_enc_zmm18_zmm19_avx512: + DES_ENC_DEC_EXP ENC,zmm18,zmm19,r15,zmm2,zmm3,zmm4,zmm5,zmm6,zmm7,zmm8,zmm9,zmm10,zmm11,zmm12,zmm13 + ret + +;;; >>>>>>>>>>>>>> DECRYPT FUNCTIONS + +;;; r15 : key schedule pointer +;;; zmm0 : [in/out] R +;;; zmm1 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm0_zmm1_avx512,function,internal) +des_dec_zmm0_zmm1_avx512: + DES_ENC_DEC_EXP DEC,zmm0,zmm1,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm2 : [in/out] R +;;; zmm3 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm2_zmm3_avx512,function,internal) +des_dec_zmm2_zmm3_avx512: + DES_ENC_DEC_EXP DEC,zmm2,zmm3,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm4 : [in/out] R +;;; zmm5 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm4_zmm5_avx512,function,internal) +des_dec_zmm4_zmm5_avx512: + DES_ENC_DEC_EXP 
DEC,zmm4,zmm5,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm6 : [in/out] R +;;; zmm7 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm6_zmm7_avx512,function,internal) +des_dec_zmm6_zmm7_avx512: + DES_ENC_DEC_EXP DEC,zmm6,zmm7,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm8 : [in/out] R +;;; zmm9 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm8_zmm9_avx512,function,internal) +des_dec_zmm8_zmm9_avx512: + DES_ENC_DEC_EXP DEC,zmm8,zmm9,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm10 : [in/out] R +;;; zmm11 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm10_zmm11_avx512,function,internal) +des_dec_zmm10_zmm11_avx512: + DES_ENC_DEC_EXP DEC,zmm10,zmm11,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm12 : [in/out] R +;;; zmm13 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm12_zmm13_avx512,function,internal) +des_dec_zmm12_zmm13_avx512: + DES_ENC_DEC_EXP DEC,zmm12,zmm13,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; r15 : key schedule pointer +;;; zmm14 : [in/out] R +;;; zmm15 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm14_zmm15_avx512,function,internal) +des_dec_zmm14_zmm15_avx512: + DES_ENC_DEC_EXP DEC,zmm14,zmm15,r15,zmm18,zmm19,zmm20,zmm21,zmm22,zmm23,zmm24,zmm25,zmm26,zmm27,zmm28,zmm29 + ret + +;;; CFB ONE use case +;;; r15 : key schedule pointer +;;; zmm18 : [in/out] R +;;; zmm19 : [in/out] L +align 64 +MKGLOBAL(des_dec_zmm18_zmm19_avx512,function,internal) +des_dec_zmm18_zmm19_avx512: + DES_ENC_DEC_EXP DEC,zmm18,zmm19,r15,zmm2,zmm3,zmm4,zmm5,zmm6,zmm7,zmm8,zmm9,zmm10,zmm11,zmm12,zmm13 + ret + +mksection stack-noexec diff --git a/lib/avx512_t1/des_x16_avx512.asm b/lib/avx512_t1/des_x16_avx512.asm index 
c940dd8d1ba5a4f6b497882c2eb00efb233a4be5..f1f6323f9cd33598bc7ab97ea70f97fe0a21c307 100644 --- a/lib/avx512_t1/des_x16_avx512.asm +++ b/lib/avx512_t1/des_x16_avx512.asm @@ -25,2112 +25,32 @@ ;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;; -;; Authors: -;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2) -;; (1) University of Haifa, Israel -;; (2) Intel Corporation +;; DES, TDES/3DES and DES-DOCSIS API generation -;; In System V AMD64 ABI -;; callee saves: RBX, RBP, R12-R15 -;; Windows x64 ABI -;; callee saves: RBX, RBP, RDI, RSI, RSP, R12-R15 +%include "include/des_avx512.inc" -;; -;; Registers: RAX RBX RCX RDX RBP RSI RDI R8 R9 R10 R11 R12 R13 R14 R15 -;; ----------------------------------------------------------- -;; Windows clobbers: RAX R8 R9 R10 R11 -;; Windows preserves: RBX RCX RDX RBP RSI RDI R12 R13 R14 R15 -;; ----------------------------------------------------------- -;; Linux clobbers: RAX RCX RDX R10 R11 -;; Linux preserves: RBX RBP RSI RDI R8 R9 R12 R13 R14 R15 -;; ----------------------------------------------------------- -;; Clobbers ZMM0-31 and K1 to K7 - -%include "include/os.inc" -%include "include/reg_sizes.inc" -%include "include/mb_mgr_datastruct.inc" -%include "include/constants.inc" -;%define DO_DBGPRINT -%include "include/dbgprint.inc" -%include "include/clear_regs.inc" - -%ifdef LINUX -%define arg1 rdi -%define arg2 rsi -%define arg3 rdx -%define arg4 rcx -%else -%define arg1 rcx -%define arg2 rdx -%define arg3 r8 -%define arg4 r9 -%endif - -%define STATE arg1 -%define SIZE arg2 - -%define OFFSET rax - -%define IA0 arg3 -%define IA1 arg4 -%define IA2 r10 - -%define INP0 r11 -%define INP1 r12 -%define INP2 r13 -%define INP3 r14 -%define INP4 r15 - -%define KSOFFSET r11 - -%define ZW0 zmm0 -%define ZW1 zmm1 -%define ZW2 zmm2 -%define ZW3 zmm3 -%define ZW4 zmm4 -%define ZW5 zmm5 -%define ZW6 zmm6 -%define ZW7 zmm7 -%define ZW8 zmm8 -%define ZW9 zmm9 -%define ZW10 zmm10 -%define ZW11 zmm11 -%define ZW12 
zmm12 -%define ZW13 zmm13 -%define ZW14 zmm14 -%define ZW15 zmm15 - -%define ZIV0 zmm16 -%define ZIV1 zmm17 - -%define ZTMP0 zmm18 -%define ZTMP1 zmm19 -%define ZTMP2 zmm20 -%define ZTMP3 zmm21 -%define ZTMP4 zmm22 -%define ZTMP5 zmm23 -%define ZTMP6 zmm24 -%define ZTMP7 zmm25 -%define ZTMP8 zmm26 -%define ZTMP9 zmm27 -%define ZTMP10 zmm28 -%define ZTMP11 zmm29 -%define ZTMP12 zmm30 -%define ZTMP13 zmm31 - -struc STACKFRAME -_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 -_tmp_iv: resq 16 ; 2 x 64 bytes -_tmp_in: resq 16 ; 2 x 64 bytes -_tmp_out: resq 16 ; 2 x 64 bytes -_tmp_mask: resd 16 ; 1 x 64 bytes -_gpr_save: resq 4 ; r12 to r15 -_rsp_save: resq 1 -_mask_save: resq 1 -_size_save: resq 1 -endstruc - -;;; =========================================================================== -;;; =========================================================================== -;;; MACROS -;;; =========================================================================== -;;; =========================================================================== - -;;; =========================================================================== -;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected) -;;; =========================================================================== -%macro CLEAR_KEY_SCHEDULE 2 -%define %%ALG %1 ; [in] DES or 3DES -%define %%ZT %2 ; [clobbered] temporary ZMM register - -%ifdef SAFE_DATA - vpxorq %%ZT, %%ZT -%assign rep_num (2048 / 64) -%ifidn %%ALG, 3DES -%assign rep_num (rep_num * 3) -%endif - -%assign offset 0 -%rep rep_num - vmovdqa64 [rsp + _key_sched + offset], %%ZT -%assign offset (offset + 64) -%endrep - -%endif ; SAFE_DATA - -%endmacro - -;;; =========================================================================== -;;; PERMUTE -;;; 
=========================================================================== -;;; A [in/out] - zmm register -;;; B [in/out] - zmm register -;;; NSHIFT [in] - constant to shift words by -;;; MASK [in] - zmm or m512 with mask -;;; T0 [clobbered] - temporary zmm register -%macro PERMUTE 5 -%define %%A %1 -%define %%B %2 -%define %%NSHIFT %3 -%define %%MASK %4 -%define %%T0 %5 - - vpsrld %%T0, %%A, %%NSHIFT - vpxord %%T0, %%T0, %%B - vpandd %%T0, %%T0, %%MASK - vpxord %%B, %%B, %%T0 - vpslld %%T0, %%T0, %%NSHIFT - vpxord %%A, %%A, %%T0 -%endmacro - -;;; =========================================================================== -;;; INITIAL PERMUTATION -;;; =========================================================================== -;;; L [in/out] - zmm register -;;; R [in/out] - zmm register -;;; T0 [clobbered] - temporary zmm register -%macro IP_Z 3 -%define %%L %1 -%define %%R %2 -%define %%T0 %3 - PERMUTE %%R, %%L, 4, [rel init_perm_consts + 0*64], %%T0 - PERMUTE %%L, %%R, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%R, %%L, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%L, %%R, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%R, %%L, 1, [rel init_perm_consts + 4*64], %%T0 -%endmacro - -;;; =========================================================================== -;;; FINAL PERMUTATION -;;; =========================================================================== -;;; L [in/out] - zmm register -;;; R [in/out] - zmm register -;;; T0 [clobbered] - temporary zmm register -%macro FP_Z 3 -%define %%L %1 -%define %%R %2 -%define %%T0 %3 - PERMUTE %%L, %%R, 1, [rel init_perm_consts + 4*64], %%T0 - PERMUTE %%R, %%L, 8, [rel init_perm_consts + 3*64], %%T0 - PERMUTE %%L, %%R, 2, [rel init_perm_consts + 2*64], %%T0 - PERMUTE %%R, %%L, 16, [rel init_perm_consts + 1*64], %%T0 - PERMUTE %%L, %%R, 4, [rel init_perm_consts + 0*64], %%T0 -%endmacro - -;;; =========================================================================== -;;; P PHASE -;;; 
=========================================================================== -;;; W0 [in/out] - zmm register -;;; in: vector of 16 x 32bits from S phase -;;; out: permuted in vector -;;; T0-T3 [clobbered] - temporary zmm register -%macro P_PHASE 5 -%define %%W0 %1 -%define %%T0 %2 -%define %%T1 %3 -%define %%T2 %4 -%define %%T3 %5 - - vprord %%T0, %%W0, 3 - vpandd %%T0, %%T0, [rel mask_values + 0*64] - vprord %%T1, %%W0, 5 - vpandd %%T1, %%T1, [rel mask_values + 1*64] - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 24 - vpandd %%T1, %%T1, [rel mask_values + 2*64] - vprord %%T2, %%W0, 26 - vpandd %%T2, %%T2, [rel mask_values + 3*64] - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 15 - vpandd %%T1, %%T1, [rel mask_values + 4*64] - vprord %%T2, %%W0, 17 - vpandd %%T2, %%T2, [rel mask_values + 5*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 6 - vpandd %%T2, %%T2, [rel mask_values + 6*64] - vprord %%T3, %%W0, 21 - vpandd %%T3, %%T3, [rel mask_values + 7*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 12 - vpandd %%T1, %%T1, [rel mask_values + 8*64] - vprord %%T2, %%W0, 14 - vpandd %%T2, %%T2, [rel mask_values + 9*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 4 - vpandd %%T2, %%T2, [rel mask_values + 10*64] - vprord %%T3, %%W0, 11 - vpandd %%T3, %%T3, [rel mask_values + 11*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 16 - vpandd %%T1, %%T1, [rel mask_values + 12*64] - vprord %%T2, %%W0, 22 - vpandd %%T2, %%T2, [rel mask_values + 13*64] - vpord %%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 19 - vpandd %%T2, %%T2, [rel mask_values + 14*64] - vprord %%T3, %%W0, 10 - vpandd %%T3, %%T3, [rel mask_values + 15*64] - vpord %%T2, %%T2, %%T3 - vpord %%T1, %%T1, %%T2 - vpord %%T0, %%T0, %%T1 - - vprord %%T1, %%W0, 9 - vpandd %%T1, %%T1, [rel mask_values + 16*64] - vprord %%T2, %%W0, 13 - vpandd %%T2, %%T2, [rel mask_values + 17*64] - vpord 
%%T1, %%T1, %%T2 - - vprord %%T2, %%W0, 25 - vpandd %%T2, %%T2, [rel mask_values + 18*64] - vpord %%T1, %%T1, %%T2 - vpord %%W0, %%T0, %%T1 -%endmacro - -;;; =========================================================================== -;;; E PHASE -;;; =========================================================================== -;;; -;;; Expands 16x32-bit words into 16x48-bit words -;;; plus XOR's result with the key schedule. -;;; The output is adjusted to be friendly as S phase input. -;;; -;;; in [in] - zmm register -;;; out0a [out] - zmm register -;;; out0b [out] - zmm register -;;; out1a [out] - zmm register -;;; out1b [out] - zmm register -;;; k0 [in] - key schedule; zmm or m512 -;;; k1 [in] - key schedule; zmm or m512 -;;; t0-t1 [clobbered] - temporary zmm register -%macro E_PHASE 9 -%define %%IN %1 -%define %%OUT0A %2 -%define %%OUT0B %3 -%define %%OUT1A %4 -%define %%OUT1B %5 -%define %%K0 %6 -%define %%K1 %7 -%define %%T0 %8 -%define %%T1 %9 - - vprord %%T0, %%IN, 31 - vprord %%T1, %%IN, 3 - vpshufb %%T0, %%T0, [rel idx_e] - vpshufb %%T1, %%T1, [rel idx_e] - vpunpcklbw %%OUT0A, %%T0, %%T1 - vpunpckhbw %%OUT1A, %%T0, %%T1 - vpxord %%OUT0A, %%OUT0A, %%K0 - vpxord %%OUT1A, %%OUT1A, %%K1 - vpandd %%OUT0B, %%OUT0A, [rel and_eu] - vpsrlw %%OUT0B, %%OUT0B, 8 - vpandd %%OUT0A, %%OUT0A, [rel and_ed] - vpandd %%OUT1B, %%OUT1A, [rel and_eu] - vpsrlw %%OUT1B, %%OUT1B, 8 - vpandd %%OUT1A, %%OUT1A, [rel and_ed] -%endmacro - -;;; =========================================================================== -;;; S-BOX -;;; =========================================================================== -;;; -;;; NOTE: clobbers k1-k6 OpMask registers -;;; -;;; IN0A [in] - zmm register; output from E-phase -;;; IN0B [in] - zmm register; output from E-phase -;;; IN1A [in] - zmm register; output from E-phase -;;; IN1B [in] - zmm register; output from E-phase -;;; OUT [out] - zmm register; output from E-phase -;;; T0-T5 [clobbered] - temporary zmm register -%macro S_PHASE 11 -%define 
%%IN0A %1 -%define %%IN0B %2 -%define %%IN1A %3 -%define %%IN1B %4 -%define %%OUT %5 -%define %%T0 %6 -%define %%T1 %7 -%define %%T2 %8 -%define %%T3 %9 -%define %%T4 %10 -%define %%T5 %11 - - vmovdqa64 %%T0, [rel reg_values16bit_7] - vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE - vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE - vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE - vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE - - mov DWORD(IA0), 0x55555555 - kmovd k1, DWORD(IA0) - mov DWORD(IA0), 0xaaaaaaaa - kmovd k2, DWORD(IA0) - - vmovdqa64 %%T0, [rel S_box_flipped + 0*64] - vmovdqa64 %%T1, [rel S_box_flipped + 1*64] - vmovdqa64 %%T2, [rel S_box_flipped + 4*64] - vmovdqa64 %%T3, [rel S_box_flipped + 5*64] - vpermw %%T0{k1}{z}, %%IN0A, %%T0 - vpermw %%T1{k1}{z}, %%IN0A, %%T1 - vpermw %%T2{k2}{z}, %%IN0A, %%T2 - vpermw %%T3{k2}{z}, %%IN0A, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%OUT, %%T1, %%T3 - vmovdqu16 %%OUT{k3}, %%T0 - - vmovdqa64 %%T0, [rel S_box_flipped + 2*64] - vmovdqa64 %%T1, [rel S_box_flipped + 3*64] - vmovdqa64 %%T2, [rel S_box_flipped + 6*64] - vmovdqa64 %%T3, [rel S_box_flipped + 7*64] - vpermw %%T0{k1}{z}, %%IN0B, %%T0 - vpermw %%T1{k1}{z}, %%IN0B, %%T1 - vpermw %%T2{k2}{z}, %%IN0B, %%T2 - vpermw %%T3{k2}{z}, %%IN0B, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T3, %%T1, %%T3 - vmovdqu16 %%T3{k4}, %%T0 - vpsllw %%T3, %%T3, 4 - vpxord %%OUT, %%OUT, %%T3 - - vmovdqa64 %%T0, [rel S_box_flipped + 8*64] - vmovdqa64 %%T1, [rel S_box_flipped + 9*64] - vmovdqa64 %%T2, [rel S_box_flipped + 12*64] - vmovdqa64 %%T3, [rel S_box_flipped + 13*64] - vpermw %%T0{k1}{z}, %%IN1A, %%T0 - vpermw %%T1{k1}{z}, %%IN1A, %%T1 - vpermw %%T2{k2}{z}, %%IN1A, %%T2 - vpermw %%T3{k2}{z}, %%IN1A, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T4, %%T1, %%T3 - vmovdqu16 %%T4{k5}, %%T0 - - vmovdqa64 %%T0, [rel S_box_flipped + 10*64] - vmovdqa64 %%T1, [rel S_box_flipped + 11*64] - vmovdqa64 %%T2, [rel S_box_flipped + 14*64] - vmovdqa64 %%T3, [rel S_box_flipped + 15*64] - vpermw %%T0{k1}{z}, %%IN1B, %%T0 - vpermw %%T1{k1}{z}, 
%%IN1B, %%T1 - vpermw %%T2{k2}{z}, %%IN1B, %%T2 - vpermw %%T3{k2}{z}, %%IN1B, %%T3 - vpxord %%T0, %%T0, %%T2 - vpxord %%T5, %%T1, %%T3 - vmovdqu16 %%T5{k6}, %%T0 - vpsllw %%T5, %%T5, 4 - - vpxord %%T4, %%T4, %%T5 - vpsllw %%T4, %%T4, 8 - vpxord %%OUT, %%OUT, %%T4 - vpshufb %%OUT, %%OUT, [rel shuffle_reg] -%endmacro - -;;; =========================================================================== -;;; DES encryption/decryption round -;;; =========================================================================== -;;; -;;; Clobbers k1-k6 OpMask registers -;;; -;;; ENC_DEC [in] - ENC for encryption, DEC for decryption -;;; R [in/out] - zmm register; plain text in & cipher text out -;;; L [in/out] - zmm register; plain text in & cipher text out -;;; KS [in] - pointer to the key schedule -;;; T0-T11 [clobbered] - temporary zmm register -%macro DES_ENC_DEC 16 -%define %%ENC_DEC %1 -%define %%R %2 -%define %%L %3 -%define %%KS %4 -%define %%T0 %5 -%define %%T1 %6 -%define %%T2 %7 -%define %%T3 %8 -%define %%T4 %9 -%define %%T5 %10 -%define %%T6 %11 -%define %%T7 %12 -%define %%T8 %13 -%define %%T9 %14 -%define %%T10 %15 -%define %%T11 %16 - - IP_Z %%R, %%L, %%T0 - -%ifidn %%ENC_DEC, ENC - ;; ENCRYPTION - xor KSOFFSET, KSOFFSET -%%_des_enc_loop: - E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%L, %%L, %%T0 - - E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%R, %%R, %%T0 - - add KSOFFSET, (4*64) - cmp KSOFFSET, (8*(4*64)) - jb %%_des_enc_loop - -%else - ;; DECRYPTION - mov KSOFFSET, (8*(4*64)) -%%_des_dec_loop: - E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, 
%%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%L, %%L, %%T0 - - E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7 - S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 - P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 - vpxord %%R, %%R, %%T0 - sub KSOFFSET, (4*64) - jnz %%_des_dec_loop -%endif ; DECRYPTION - - FP_Z %%R, %%L, %%T0 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA INPUT -;;; =========================================================================== -;;; -;;; IN00 - IN15 [in/out]: -;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data -;;; out: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15 -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_IN 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN00, %%IN01 - vpunpckhdq %%K1, %%IN00, %%IN01 - vpunpckldq %%T0, %%IN02, %%IN03 - vpunpckhdq %%T1, %%IN02, %%IN03 - - vpunpckldq %%IN00, %%IN04, %%IN05 - vpunpckhdq %%IN01, %%IN04, %%IN05 - vpunpckldq %%IN02, %%IN06, %%IN07 - vpunpckhdq %%IN03, %%IN06, %%IN07 - - vpunpcklqdq %%K2, %%K0, %%T0 - 
vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN08, %%IN09 - vpunpckhdq %%K5, %%IN08, %%IN09 - vpunpckldq %%IN04, %%IN10, %%IN11 - vpunpckhdq %%IN05, %%IN10, %%IN11 - vpunpckldq %%IN06, %%IN12, %%IN13 - vpunpckhdq %%IN07, %%IN12, %%IN13 - vpunpckldq %%IN10, %%IN14, %%IN15 - vpunpckhdq %%IN11, %%IN14, %%IN15 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 - vshufi64x2 %%H3, %%IN14, %%IN02, 0xee - vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 - vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T1, 0x44 - vshufi64x2 %%H1, %%T3, %%T1, 0xee - vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 - vshufi64x2 %%H3, %%IN15, %%IN03, 0xee - vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 - vshufi64x2 
%%IN07, %%H0, %%H2, 0xdd ; L3 - vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION AT DATA OUTPUT -;;; =========================================================================== -;;; -;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: -;;; in: R0 - 16 x word0, L0 - 16 x word1, ... L7 - 16 x word15 -;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K5 [clobbered] - temporary zmm registers -;;; H0-H3 [clobbered] - temporary zmm registers -%macro TRANSPOSE_OUT 30 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T1 %18 -%define %%T2 %19 -%define %%T3 %20 -%define %%K0 %21 -%define %%K1 %22 -%define %%K2 %23 -%define %%K3 %24 -%define %%K4 %25 -%define %%K5 %26 -%define %%H0 %27 -%define %%H1 %28 -%define %%H2 %29 -%define %%H3 %30 - - vpunpckldq %%K0, %%IN01, %%IN00 - vpunpckhdq %%K1, %%IN01, %%IN00 - vpunpckldq %%T0, %%IN03, %%IN02 - vpunpckhdq %%T1, %%IN03, %%IN02 - - vpunpckldq %%IN00, %%IN05, %%IN04 - vpunpckhdq %%IN01, %%IN05, %%IN04 - vpunpckldq %%IN02, %%IN07, %%IN06 - vpunpckhdq %%IN03, %%IN07, %%IN06 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T1 - vpunpckhqdq %%T3, %%K1, %%T1 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - vpunpcklqdq %%T0, %%IN01, %%IN03 - vpunpckhqdq %%T1, %%IN01, %%IN03 - - vpunpckldq %%K4, %%IN09, %%IN08 - vpunpckhdq %%K5, %%IN09, %%IN08 - vpunpckldq %%IN04, %%IN11, %%IN10 - vpunpckhdq 
%%IN05, %%IN11, %%IN10 - vpunpckldq %%IN06, %%IN13, %%IN12 - vpunpckhdq %%IN07, %%IN13, %%IN12 - vpunpckldq %%IN10, %%IN15, %%IN14 - vpunpckhdq %%IN11, %%IN15, %%IN14 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN14, %%K5, %%IN05 - vpunpckhqdq %%IN15, %%K5, %%IN05 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - vpunpcklqdq %%IN02, %%IN07, %%IN11 - vpunpckhqdq %%IN03, %%IN07, %%IN11 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H1, %%K2, %%K0, 0xee - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%H3, %%IN12, %%IN00, 0xee - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%H2, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%H3, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%H3, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H1, %%T2, %%K1, 0xee - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%H3, %%IN13, %%IN01, 0xee - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%H2, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%H3, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%H3, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%H2, %%IN14, %%IN02, 0x44 - vshufi64x2 %%H3, %%IN14, %%IN02, 0xee - vshufi64x2 %%IN02, %%H0, %%H2, 0x88 ; R1 - vshufi64x2 %%IN06, %%H0, %%H2, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%H3, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%H3, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T1, 0x44 - vshufi64x2 %%H1, %%T3, %%T1, 0xee - vshufi64x2 %%H2, %%IN15, %%IN03, 0x44 - vshufi64x2 %%H3, %%IN15, %%IN03, 0xee - vshufi64x2 %%IN03, %%H0, %%H2, 0x88 ; L1 - vshufi64x2 %%IN07, %%H0, %%H2, 0xdd ; L3 - vshufi64x2 %%IN11, %%H1, %%H3, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%H3, 0xdd ; L7 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT -;;; =========================================================================== -;;; 
-;;; IN00-IN15 / R0/L0-R7/L7 [in/out]: -;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data -;;; out: R0 - 16 x word0, L0 - 16 x word1 -;;; T0,T2 [clobbered] - temporary zmm registers -;;; K0-K4 [clobbered] - temporary zmm registers -;;; H0,H2 [clobbered] - temporary zmm registers -%macro TRANSPOSE_IN_ONE 24 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T2 %18 -%define %%K0 %19 -%define %%K1 %20 -%define %%K2 %21 -%define %%K4 %22 -%define %%H0 %23 -%define %%H2 %24 - - vpunpckldq %%K0, %%IN00, %%IN01 - vpunpckhdq %%K1, %%IN00, %%IN01 - vpunpckldq %%T0, %%IN02, %%IN03 - - vpunpckldq %%IN00, %%IN04, %%IN05 - vpunpckhdq %%IN01, %%IN04, %%IN05 - vpunpckldq %%IN02, %%IN06, %%IN07 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - - vpunpcklqdq %%K0, %%IN00, %%IN02 - vpunpckhqdq %%K1, %%IN00, %%IN02 - - vpunpckldq %%K4, %%IN08, %%IN09 - vpunpckldq %%IN04, %%IN10, %%IN11 - vpunpckldq %%IN06, %%IN12, %%IN13 - vpunpckldq %%IN10, %%IN14, %%IN15 - - vpunpcklqdq %%IN12, %%K4, %%IN04 - vpunpckhqdq %%IN13, %%K4, %%IN04 - vpunpcklqdq %%IN00, %%IN06, %%IN10 - vpunpckhqdq %%IN01, %%IN06, %%IN10 - - vshufi64x2 %%H0, %%K2, %%K0, 0x44 - vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 - vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 - - vshufi64x2 %%H0, %%T2, %%K1, 0x44 - vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 - vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 -%endmacro - -;;; =========================================================================== -;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT -;;; =========================================================================== -;;; -;;; IN00-IN15 
aka R0/L0 - R7/L7 [in/out]: -;;; in: R0 - 16 x word0, L0 - 16 x word1 -;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data -;;; T0-T3 [clobbered] - temporary zmm registers -;;; K0-K3 [clobbered] - temporary zmm registers -;;; H0,H1 [clobbered] - temporary zmm registers -%macro TRANSPOSE_OUT_ONE 25 -%define %%IN00 %1 ; R0 -%define %%IN01 %2 ; L0 -%define %%IN02 %3 ; R1 -%define %%IN03 %4 ; L1 -%define %%IN04 %5 ; R2 -%define %%IN05 %6 ; L2 -%define %%IN06 %7 ; R3 -%define %%IN07 %8 ; L3 -%define %%IN08 %9 ; R4 -%define %%IN09 %10 ; L4 -%define %%IN10 %11 ; R5 -%define %%IN11 %12 ; L5 -%define %%IN12 %13 ; R6 -%define %%IN13 %14 ; L6 -%define %%IN14 %15 ; R7 -%define %%IN15 %16 ; L7 -%define %%T0 %17 -%define %%T2 %18 -%define %%T3 %19 -%define %%K0 %20 -%define %%K1 %21 -%define %%K2 %22 -%define %%K3 %23 -%define %%H0 %24 -%define %%H1 %25 - - vpxord %%T0, %%T0, %%T0 - - vpunpckldq %%K0, %%IN01, %%IN00 - vpunpckhdq %%K1, %%IN01, %%IN00 - - vpunpcklqdq %%K2, %%K0, %%T0 - vpunpckhqdq %%T2, %%K0, %%T0 - vpunpcklqdq %%K3, %%K1, %%T0 - vpunpckhqdq %%T3, %%K1, %%T0 - - vshufi64x2 %%H0, %%K2, %%T0, 0x44 - vshufi64x2 %%H1, %%K2, %%T0, 0xee - vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0 - vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2 - vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4 - vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6 - - vshufi64x2 %%H0, %%T2, %%T0, 0x44 - vshufi64x2 %%H1, %%T2, %%T0, 0xee - vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0 - vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2 - vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4 - vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6 - - vshufi64x2 %%H0, %%K3, %%T0, 0x44 - vshufi64x2 %%H1, %%K3, %%T0, 0xee - vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1 - vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3 - vshufi64x2 %%IN10, %%H1, %%T0, 0x88 ; R5 - vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7 - - vshufi64x2 %%H0, %%T3, %%T0, 0x44 - vshufi64x2 %%H1, %%T3, %%T0, 0xee - vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1 - vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3 - 
vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5 - vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7 -%endmacro - -;;; =========================================================================== -;;; DES INITIALIZATION -;;; key schedule transposition and IV set up -;;; =========================================================================== -;;; -;;; STATE_KEYS [in] - KEYS in DES OOO STATE -;;; STATE_IV [ in] - IV in DES OOO STATE -;;; KS [out] - place to store transposed key schedule or NULL -;;; IV0 [out] - r512; initialization vector -;;; IV1 [out] - r512; initialization vector -;;; T0-T27 [clobbered] - temporary r512 -%macro DES_INIT 33 -%define %%STATE_KEYS %1 -%define %%STATE_IV %2 -%define %%KS %3 -%define %%IV0 %4 -%define %%IV1 %5 -%define %%T0 %6 -%define %%T1 %7 -%define %%T2 %8 -%define %%T3 %9 -%define %%T4 %10 -%define %%T5 %11 -%define %%T6 %12 -%define %%T7 %13 -%define %%T8 %14 -%define %%T9 %15 -%define %%T10 %16 -%define %%T11 %17 -%define %%T12 %18 -%define %%T13 %19 -%define %%T14 %20 -%define %%T15 %21 -%define %%T16 %22 -%define %%T17 %23 -%define %%T18 %24 -%define %%T19 %25 -%define %%T20 %26 -%define %%T21 %27 -%define %%T22 %28 -%define %%T23 %29 -%define %%T24 %30 -%define %%T25 %31 -%define %%T26 %32 -%define %%T27 %33 - - ;; set up the key schedule - ;; - load first half of the keys & transpose - ;; - transpose and store - ;; note: we can use IV registers as temporary ones here -%assign IDX 0 -%rep 16 - mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] - vmovdqu64 %%T %+ IDX, [IA0] -%assign IDX (IDX + 1) -%endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 -%assign IDX 0 -%rep 16 - vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX -%assign IDX (IDX + 1) -%endrep - ;; - load second half of the keys & transpose - ;; - transpose and store - ;; note: we can use IV registers as temporary ones here -%assign 
IDX 0 -%rep 16 - mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] - vmovdqu64 %%T %+ IDX, [IA0 + 64] -%assign IDX (IDX + 1) -%endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 -%assign IDX 0 -%rep 16 - vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX -%assign IDX (IDX + 1) -%endrep - - ;; set up IV - ;; - they are already kept transposed so this is enough to load them - vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] - vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] -%endmacro - -;;; =========================================================================== -;;; 3DES INITIALIZATION -;;; key schedule transposition and IV set up -;;; =========================================================================== -;;; -;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE -;;; STATE_IV [ in] - IV in 3DES OOO STATE -;;; KS1 [out] - place to store transposed key schedule or NULL -;;; KS2 [out] - place to store transposed key schedule or NULL -;;; KS3 [out] - place to store transposed key schedule or NULL -;;; IV0 [out] - r512; initialization vector -;;; IV1 [out] - r512; initialization vector -;;; T0-T27 [clobbered] - temporary r512 -;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec) -%macro DES3_INIT 36 -%define %%STATE_KEYS %1 -%define %%STATE_IV %2 -%define %%KS1 %3 -%define %%KS2 %4 -%define %%KS3 %5 -%define %%IV0 %6 -%define %%IV1 %7 -%define %%T0 %8 -%define %%T1 %9 -%define %%T2 %10 -%define %%T3 %11 -%define %%T4 %12 -%define %%T5 %13 -%define %%T6 %14 -%define %%T7 %15 -%define %%T8 %16 -%define %%T9 %17 -%define %%T10 %18 -%define %%T11 %19 -%define %%T12 %20 -%define %%T13 %21 -%define %%T14 %22 -%define %%T15 %23 -%define %%T16 %24 -%define %%T17 %25 -%define %%T18 %26 -%define %%T19 %27 -%define %%T20 %28 -%define %%T21 %29 -%define %%T22 %30 -%define %%T23 %31 -%define %%T24 %32 -%define %%T25 %33 
-%define %%T26 %34 -%define %%T27 %35 -%define %%DIR %36 - -%ifidn %%DIR, ENC -%assign KEY_IDX 0 -%else -%assign KEY_IDX 2 -%endif -%assign KS_IDX 1 - -%rep 3 - ;; set up the key schedule - ;; - load first half of the keys & transpose - ;; - transpose and store - ;; note: we can use IV registers as temporary ones here - -%assign IDX 0 -%rep 16 - mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] - mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] - vmovdqu64 %%T %+ IDX, [IA0] -%assign IDX (IDX + 1) -%endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 -%assign IDX 0 -%rep 16 - vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX -%assign IDX (IDX + 1) -%endrep - ;; - load second half of the keys & transpose - ;; - transpose and store - ;; note: we can use IV registers as temporary ones here -%assign IDX 0 -%rep 16 - mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] - mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] - vmovdqu64 %%T %+ IDX, [IA0 + 64] -%assign IDX (IDX + 1) -%endrep - TRANSPOSE_IN %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 -%assign IDX 0 -%rep 16 - vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX -%assign IDX (IDX + 1) -%endrep - -%ifidn %%DIR, ENC -%assign KEY_IDX (KEY_IDX + 1) -%else -%assign KEY_IDX (KEY_IDX - 1) -%endif -%assign KS_IDX (KS_IDX + 1) -%endrep ; KEY_IDX / KS_IDX - - ;; set up IV - ;; - they are already kept transposed so this is enough to load them - vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] - vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] - -%endmacro - -;;; =========================================================================== -;;; DES FINISH -;;; Update in/out pointers and store IV -;;; 
=========================================================================== -;;; -;;; Needs: STATE & SIZE -;;; IV0 [in] - r512; initialization vector -;;; IV1 [in] - r512; initialization vector -;;; T0-T4 [clobbered] - temporary r512 registers -%macro DES_FINISH 7 -%define %%IV0 %1 -%define %%IV1 %2 -%define %%T0 %3 -%define %%T1 %4 -%define %%T2 %5 -%define %%T3 %6 -%define %%T4 %7 - - vpbroadcastq %%T4, SIZE - vmovdqu64 %%T0, [STATE + _des_args_in + (0 * PTR_SZ)] - vmovdqu64 %%T1, [STATE + _des_args_in + (8 * PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_out + (0 * PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_out + (8 * PTR_SZ)] - vpaddq %%T0, %%T0, %%T4 - vpaddq %%T1, %%T1, %%T4 - vpaddq %%T2, %%T2, %%T4 - vpaddq %%T3, %%T3, %%T4 - vmovdqu64 [STATE + _des_args_in + (0 * PTR_SZ)], %%T0 - vmovdqu64 [STATE + _des_args_in + (8 * PTR_SZ)], %%T1 - vmovdqu64 [STATE + _des_args_out + (0 * PTR_SZ)], %%T2 - vmovdqu64 [STATE + _des_args_out + (8 * PTR_SZ)], %%T3 - - vmovdqu64 [STATE + _des_args_IV + (0 * 64)], %%IV0 - vmovdqu64 [STATE + _des_args_IV + (1 * 64)], %%IV1 -%endmacro - -;;; =========================================================================== -;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY -;;; =========================================================================== -;;; -;;; Needs: STATE, IA0-IA2 -;;; ENC_DEC [in] - encyrpt (ENC) or decrypt (DEC) selection -;;; KS [in] - key schedule -;;; T0-T24 [clobbered] - temporary r512 -;;; T_IN [in] - 16 * 8 byte storage -;;; T_OUT [in] - 16 * 8 byte storage -;;; T_MASK [in] - 16 * 4 byte storage -;;; T_IV [in] - 16 * 8 byte storage -;;; -;;; NOTE: clobbers OpMask registers -%macro DES_CFB_ONE 31 -%define %%ENC_DEC %1 -%define %%KS %2 -%define %%T0 %3 -%define %%T1 %4 -%define %%T2 %5 -%define %%T3 %6 -%define %%T4 %7 -%define %%T5 %8 -%define %%T6 %9 -%define %%T7 %10 -%define %%T8 %11 -%define %%T9 %12 -%define %%T10 %13 -%define %%T11 %14 -%define %%T12 %15 -%define %%T13 %16 -%define %%T14 %17 -%define %%T15 
%18 -%define %%T16 %19 -%define %%T17 %20 -%define %%T18 %21 -%define %%T19 %22 -%define %%T20 %23 -%define %%T21 %24 -%define %%T22 %25 -%define %%T23 %26 -%define %%T24 %27 -%define %%T_IN %28 -%define %%T_OUT %29 -%define %%T_IV %30 -%define %%T_MASK %31 - - ;; - find mask for non-zero partial lengths - vpxord %%T10, %%T10, %%T10 - vmovdqu64 %%T0, [STATE + _des_args_PLen] - vpcmpd k3, %%T0, %%T10, 4 ; NEQ - kmovw DWORD(IA0), k3 - movzx DWORD(IA0), WORD(IA0) - or DWORD(IA0), DWORD(IA0) - jz %%_des_cfb_one_end ; no non-zero partial lengths - -%ifidn %%ENC_DEC, ENC - ;; For encyrption case we need to make sure that - ;; all full blocks are complete before proceeding - ;; with CFB partial block. - ;; To do that current out position is compared against - ;; calculated last full block position. - vmovdqu64 %%T1, [STATE + _des_args_out + (0*8)] - vmovdqu64 %%T2, [STATE + _des_args_LOut + (0*8)] - vmovdqu64 %%T3, [STATE + _des_args_out + (8*8)] - vmovdqu64 %%T4, [STATE + _des_args_LOut + (8*8)] - vpcmpq k4, %%T1, %%T2, 0 ; EQ - vpcmpq k5, %%T3, %%T4, 0 ; EQ - kmovw DWORD(IA1), k4 - movzx DWORD(IA1), BYTE(IA1) - kmovw DWORD(IA2), k5 - movzx DWORD(IA2), BYTE(IA2) - shl DWORD(IA2), 8 - or DWORD(IA2), DWORD(IA1) - and DWORD(IA0), DWORD(IA2) - jz %%_des_cfb_one_end ; no non-zero lengths left - kmovw k3, DWORD(IA0) -%endif - ;; Calculate ((1 << partial_bytes) - 1) - ;; in order to get the mask for loads and stores - ;; k3 & IA0 - hold valid mask - vmovdqa64 %%T1, [rel vec_ones_32b] - vpsllvd %%T2{k3}{z}, %%T1, %%T0 - vpsubd %%T2{k3}{z}, %%T2, %%T1 - vmovdqu64 [%%T_MASK], %%T2 - - ;; clear selected partial lens not to do them twice - vmovdqu32 [STATE + _des_args_PLen]{k3}, %%T10 - - ;; copy IV, in and out pointers - vmovdqu64 %%T1, [STATE + _des_args_in + (0*PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_in + (8*PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_out + (0*PTR_SZ)] - vmovdqu64 %%T4, [STATE + _des_args_out + (8*PTR_SZ)] - vmovdqu64 %%T5, [STATE + _des_args_IV + (0*64)] 
- vmovdqu64 %%T6, [STATE + _des_args_IV + (1*64)] - vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1 - vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2 - vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3 - vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4 - vmovdqu64 [%%T_IV + (0*64)], %%T5 - vmovdqu64 [%%T_IV + (1*64)], %%T6 - - ;; calculate last block case mask - ;; - first block case requires no modifications to in/out/IV - vmovdqu64 %%T1, [STATE + _des_args_BLen] - vpcmpd k2, %%T1, %%T10, 4 ; NEQ - kmovw DWORD(IA1), k2 - and DWORD(IA1), DWORD(IA0) - jz %%_des_cfb_one_no_last_blocks - - ;; set up IV, in and out for the last block case - ;; - Last block needs in and out to be set differently (decryption only) - ;; - IA1 holds the last block mask -%ifidn %%ENC_DEC, DEC - mov DWORD(IA0), DWORD(IA1) - mov DWORD(IA2), DWORD(IA1) - shr DWORD(IA1), 8 - and DWORD(IA2), 0xff - kmovw k4, DWORD(IA2) - kmovw k5, DWORD(IA1) - vmovdqu64 %%T1, [STATE + _des_args_LOut + (0*PTR_SZ)] - vmovdqu64 %%T2, [STATE + _des_args_LOut + (8*PTR_SZ)] - vmovdqu64 %%T3, [STATE + _des_args_LIn + (0*PTR_SZ)] - vmovdqu64 %%T4, [STATE + _des_args_LIn + (8*PTR_SZ)] - vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1 - vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2 - vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3 - vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4 -%endif ; decryption - ;; - IV has to be set differently for CFB as well - ;; - IA0 holds the last block mask -%assign IDX 0 -%rep 16 - test DWORD(IA0), (1 << IDX) - jz %%_des_cfb_one_copy_iv_next %+ IDX -%ifidn %%ENC_DEC, ENC - mov IA2, [STATE + _des_args_LOut + (IDX*PTR_SZ)] -%else - mov IA2, [STATE + _des_args_LIn + (IDX*PTR_SZ)] -%endif - mov IA2, [IA2 - 8] - mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2) - shr IA2, 32 - mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2) -%%_des_cfb_one_copy_iv_next %+ IDX: -%assign IDX (IDX + 1) -%endrep - -%%_des_cfb_one_no_last_blocks: - ;; Uffff ... 
finally let's do some DES CFB - ;; - let's use T_IN, T_OUT, T_IV and T_MASK - - ;; - load data with the corresponding masks & transpose - ;; - T0 to T15 will hold the data - xor IA0, IA0 -%assign IDX 0 -%assign K_IDX 1 -%rep 16 - mov IA1, [%%T_IN + (IDX*PTR_SZ)] - mov DWORD(IA0), [%%T_MASK + (IDX*4)] - kmovq k %+ K_IDX, IA0 - vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1] -%assign IDX (IDX + 1) -%assign K_IDX (K_IDX + 1) -%if K_IDX > 7 -%assign K_IDX 1 ; iterate through K1 to K7 -%endif -%endrep - ;; - transpose the data in T0 to T15, T16 to T23 are clobbered - TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23 - - ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1 - vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0 - vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1 - ;; DES encrypt - ;; - R0 - %%T0 - ;; - L0 - %%T1 - DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13 - ;; CFB style xor with R0/L0 with IV - ;; - IV0 - %%T16 - ;; - IV1 - %%T17 - vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1 - vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0 - vmovdqa64 %%T1, %%T2 - ;; - new R0 = L0 ^ IV0 (%%T0) - ;; - new L0 = R0 ^ IV1 (%%T1) - - ;; Transpose the data out - ;; - %%T2 to %%T24 clobbered - TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24 - - ;; Store the transposed data - ;; - T0 to T15 will hold the data - xor IA0, IA0 -%assign IDX 0 -%assign K_IDX 1 -%rep 16 - mov IA1, [%%T_OUT + (IDX*PTR_SZ)] - mov DWORD(IA0), [%%T_MASK + (IDX*4)] - kmovq k %+ K_IDX, IA0 - vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX -%assign IDX (IDX + 1) -%assign K_IDX (K_IDX + 1) -%if K_IDX > 7 -%assign K_IDX 1 ; iterate through K1 to K7 -%endif -%endrep - -%ifdef SAFE_DATA - ;; Clear copied IV's - vpxorq %%T5, %%T5 - vmovdqu64 
[%%T_IV + (0*64)], %%T5 - vmovdqu64 [%%T_IV + (1*64)], %%T5 -%endif - -%%_des_cfb_one_end: - -%endmacro - -;;; =========================================================================== -;;; Converts length into mask of DES blocks -;;; =========================================================================== -;;; -;;; MASK [out] - mask8 for value; for masked 64b loads and stores (r64) -;;; USES: IA0, IA1 IA2 -;;; ASSUMES: SIZE - OFFSET < 64 -%macro GET_MASK8 1 -%define %%MASK %1 - -%ifidn IA1, rcx -%define myrcx IA1 -%else -%define myrcx rcx - mov IA1, rcx -%endif - mov myrcx, SIZE - sub myrcx, OFFSET - ;; - myrcx - remaining length - ;; - divide by 8 (DES block size) - ;; - create bit mask of the result - mov DWORD(%%MASK), 1 - shr DWORD(myrcx), 3 - shl DWORD(%%MASK), BYTE(myrcx) - sub DWORD(%%MASK), 1 -%ifnidn IA1, rcx - mov rcx, IA1 -%endif -%endmacro - -;;; =========================================================================== -;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) -;;; =========================================================================== -;;; -;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only -;;; DES_KS [in] - pointer to transposed key schedule -;;; -;;; NOTE: clobbers OpMask registers -;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 -%macro GEN_DES_ENC_CIPHER 2 -%define %%NUM_DES_BLOCKS %1 -%define %%DES_KS %2 - -%assign RN 0 -%assign LN 1 -%assign RNN 2 -%assign LNN 3 -%rep %%NUM_DES_BLOCKS - 1 - DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 - vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 -%assign RN (RN + 2) -%assign LN (LN + 2) -%assign RNN (RNN + 2) -%assign LNN (LNN + 2) -%endrep - DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - vmovdqa64 ZIV0, ZW %+ LN ; IV0 = 
L7 - vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 -%endmacro - -;;; =========================================================================== -;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) -;;; =========================================================================== -;;; -;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only -;;; DES_KS [in] - pointer to transposed key schedule -;;; -;;; NOTE: clobbers OpMask registers -;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 -%macro GEN_DES_DEC_CIPHER 2 -%define %%NUM_DES_BLOCKS %1 -%define %%DES_KS %2 - -%assign RN 0 -%assign LN 1 -%rep %%NUM_DES_BLOCKS - vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round - vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round - DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 - vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 - vmovdqa64 ZIV0, ZTMP12 - vmovdqa64 ZIV1, ZTMP13 -%assign RN (RN + 2) -%assign LN (LN + 2) -%endrep -%endmacro - -;;; =========================================================================== -;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) -;;; =========================================================================== -;;; -;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only -;;; DES_KS1 [in] - pointer to transposed key schedule 1 -;;; DES_KS2 [in] - pointer to transposed key schedule 2 -;;; DES_KS3 [in] - pointer to transposed key schedule 3 -;;; -;;; NOTE: clobbers OpMask registers -;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 -%macro GEN_3DES_ENC_CIPHER 4 -%define %%NUM_DES_BLOCKS %1 -%define %%DES_KS1 %2 -%define %%DES_KS2 %3 -%define %%DES_KS3 %4 - -%assign RN 0 -%assign LN 1 -%assign RNN 2 -%assign LNN 3 -%rep %%NUM_DES_BLOCKS - ;; ENC - DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, 
ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - ;; DEC - DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - ;; ENC - DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 -%if (RNN < (%%NUM_DES_BLOCKS * 2)) - vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 - vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 -%else - vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7 - vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 -%endif - -%assign RN (RN + 2) -%assign LN (LN + 2) -%assign RNN (RNN + 2) -%assign LNN (LNN + 2) -%endrep - -%endmacro - -;;; =========================================================================== -;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) -;;; =========================================================================== -;;; -;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only -;;; DES_KS1 [in] - pointer to transposed key schedule 1 -;;; DES_KS2 [in] - pointer to transposed key schedule 2 -;;; DES_KS3 [in] - pointer to transposed key schedule 3 -;;; -;;; NOTE: clobbers OpMask registers -;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 -%macro GEN_3DES_DEC_CIPHER 4 -%define %%NUM_DES_BLOCKS %1 -%define %%DES_KS1 %2 -%define %%DES_KS2 %3 -%define %%DES_KS3 %4 - -%assign RN 0 -%assign LN 1 -%rep %%NUM_DES_BLOCKS - vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round - vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round - ;; DEC - DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - ;; ENC - DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - ;; DEC - DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 - vpxord ZW %+ 
RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 - vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 - vmovdqa64 ZIV0, ZTMP12 - vmovdqa64 ZIV1, ZTMP13 - -%assign RN (RN + 2) -%assign LN (LN + 2) -%endrep - -%endmacro - -;;; =========================================================================== -;;; DES CBC / DOCSIS DES ENCRYPT -;;; =========================================================================== -;;; -;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and -;;; 3DES (3DES CBC) -;;; -;;; NOTE: clobbers OpMask registers -%macro GENERIC_DES_ENC 1 -%define %%DES_DOCSIS %1 - - ;; push the registers and allocate the stack frame - mov rax, rsp - sub rsp, STACKFRAME_size - and rsp, -64 - mov [rsp + _rsp_save], rax ; original SP - mov [rsp + _gpr_save + 0*8], r12 - mov [rsp + _gpr_save + 1*8], r13 - mov [rsp + _gpr_save + 2*8], r14 - mov [rsp + _gpr_save + 3*8], r15 - -%ifnidn %%DES_DOCSIS, 3DES - ;; DES and DOCSIS DES - DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 -%else - ;; 3DES - DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC -%endif - mov [rsp + _size_save], SIZE - and SIZE, -64 - xor OFFSET, OFFSET - ;; This loop processes message in blocks of 64 bytes. - ;; Anything smaller than 64 bytes is handled separately after the loop. 
-%%_gen_des_enc_loop: - cmp OFFSET, SIZE - jz %%_gen_des_enc_loop_end - ;; run loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] - vmovdqu64 ZW0, [IA0 + OFFSET] - vmovdqu64 ZW1, [IA1 + OFFSET] - vmovdqu64 ZW2, [IA2 + OFFSET] - vmovdqu64 ZW3, [INP0 + OFFSET] - vmovdqu64 ZW4, [INP1 + OFFSET] - vmovdqu64 ZW5, [INP2 + OFFSET] - vmovdqu64 ZW6, [INP3 + OFFSET] - vmovdqu64 ZW7, [INP4 + OFFSET] - - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] - vmovdqu64 ZW8, [IA0 + OFFSET] - vmovdqu64 ZW9, [IA1 + OFFSET] - vmovdqu64 ZW10, [IA2 + OFFSET] - vmovdqu64 ZW11, [INP0 + OFFSET] - vmovdqu64 ZW12, [INP1 + OFFSET] - vmovdqu64 ZW13, [INP2 + OFFSET] - vmovdqu64 ZW14, [INP3 + OFFSET] - vmovdqu64 ZW15, [INP4 + OFFSET] - - ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; DES CBC ENC comes here - vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 - vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 - -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 8, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - - ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, 
ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - ;; run stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET], ZW0 - vmovdqu64 [IA1 + OFFSET], ZW1 - vmovdqu64 [IA2 + OFFSET], ZW2 - vmovdqu64 [INP0 + OFFSET], ZW3 - vmovdqu64 [INP1 + OFFSET], ZW4 - vmovdqu64 [INP2 + OFFSET], ZW5 - vmovdqu64 [INP3 + OFFSET], ZW6 - vmovdqu64 [INP4 + OFFSET], ZW7 - - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET], ZW8 - vmovdqu64 [IA1 + OFFSET], ZW9 - vmovdqu64 [IA2 + OFFSET], ZW10 - vmovdqu64 [INP0 + OFFSET], ZW11 - vmovdqu64 [INP1 + OFFSET], ZW12 - vmovdqu64 [INP2 + OFFSET], ZW13 - vmovdqu64 [INP3 + OFFSET], ZW14 - vmovdqu64 [INP4 + OFFSET], ZW15 - - add OFFSET, 64 - jmp %%_gen_des_enc_loop -%%_gen_des_enc_loop_end: - ;; This is where we check if there is anything less than 64 bytes - ;; of message left for processing. 
- mov SIZE, [rsp + _size_save] - cmp OFFSET, SIZE - jz %%_gen_des_enc_part_end - ;; calculate min of bytes_left and 64, convert to qword mask - GET_MASK8 IA0 ; IA0 = mask - - kmovw k7, DWORD(IA0) - mov [rsp + _mask_save], IA0 - ;; run masked loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] - vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] - vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] - vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] - vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] - vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] - vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] - vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] - vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] - - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] - vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] - vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] - vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] - vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] - vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] - vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] - vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] - vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] - - ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; DES CBC ENC comes here - vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 - vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 - - 
mov IA0, [rsp + _mask_save] - cmp BYTE(IA0), 0x0f - ja %%_gt_4 - jz %%_blocks_4 - - cmp BYTE(IA0), 0x03 - ja %%_blocks_3 - jz %%_blocks_2 - - ;; process one block and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 1, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_2: - ;; process two blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 2, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_3: - ;; process three blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 3, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_4: - ;; process four blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 4, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_gt_4: - cmp BYTE(IA0), 0x3f - ja %%_blocks_7 - jz %%_blocks_6 -%%_blocks_5: - ;; process five blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 5, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_6: - ;; process six blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 6, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_7: - ;; process seven blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_ENC_CIPHER 7, rsp + _key_sched -%else - GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - -%%_transpose_out: - ;; transpose data on output - 
TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; run masked stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 - vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 - vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 - vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 - vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 - vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 - vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 - vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 - - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 - vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 - vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 - vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 - vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 - vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 - vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 - vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 -%%_gen_des_enc_part_end: - - ;; store IV and update pointers - DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 - - ;; CFB part for DOCSIS -%ifidn %%DES_DOCSIS, DOCSIS - DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, 
rsp + _tmp_iv, rsp + _tmp_mask -%endif - - CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 - - ;; restore stack pointer and registers - mov r12, [rsp + _gpr_save + 0*8] - mov r13, [rsp + _gpr_save + 1*8] - mov r14, [rsp + _gpr_save + 2*8] - mov r15, [rsp + _gpr_save + 3*8] - mov rsp, [rsp + _rsp_save] ; original SP - -%ifdef SAFE_DATA - clear_all_zmms_asm -%else - vzeroupper -%endif ;; SAFE_DATA - -%endmacro - -;;; =========================================================================== -;;; DES CBC / DOCSIS DES DECRYPT -;;; =========================================================================== -;;; -;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and -;;; 3DES (3DES CBC) -;;; -;;; NOTE: clobbers OpMask registers -%macro GENERIC_DES_DEC 1 -%define %%DES_DOCSIS %1 - - ;; push the registers and allocate the stack frame - mov rax, rsp - sub rsp, STACKFRAME_size - and rsp, -64 - mov [rsp + _rsp_save], rax ; original SP - mov [rsp + _gpr_save + 0*8], r12 - mov [rsp + _gpr_save + 1*8], r13 - mov [rsp + _gpr_save + 2*8], r14 - mov [rsp + _gpr_save + 3*8], r15 - -%ifnidn %%DES_DOCSIS, 3DES - ;; DES and DOCSIS - DES_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 -%else - ;; 3DES - DES3_INIT STATE + _des_args_keys, STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC -%endif - - ;; CFB part for DOCSIS -%ifidn %%DES_DOCSIS, DOCSIS - DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, 
rsp + _tmp_iv, rsp + _tmp_mask -%endif - - mov [rsp + _size_save], SIZE - and SIZE, -64 - xor OFFSET, OFFSET - ;; This loop processes message in blocks of 64 bytes. - ;; Anything smaller than 64 bytes is handled separately after the loop. -%%_gen_des_dec_loop: - cmp OFFSET, SIZE - jz %%_gen_des_dec_loop_end - ;; run loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] - vmovdqu64 ZW0, [IA0 + OFFSET] - vmovdqu64 ZW1, [IA1 + OFFSET] - vmovdqu64 ZW2, [IA2 + OFFSET] - vmovdqu64 ZW3, [INP0 + OFFSET] - vmovdqu64 ZW4, [INP1 + OFFSET] - vmovdqu64 ZW5, [INP2 + OFFSET] - vmovdqu64 ZW6, [INP3 + OFFSET] - vmovdqu64 ZW7, [INP4 + OFFSET] - - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] - vmovdqu64 ZW8, [IA0 + OFFSET] - vmovdqu64 ZW9, [IA1 + OFFSET] - vmovdqu64 ZW10, [IA2 + OFFSET] - vmovdqu64 ZW11, [INP0 + OFFSET] - vmovdqu64 ZW12, [INP1 + OFFSET] - vmovdqu64 ZW13, [INP2 + OFFSET] - vmovdqu64 ZW14, [INP3 + OFFSET] - vmovdqu64 ZW15, [INP4 + OFFSET] - - ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - -%ifnidn %%DES_DOCSIS, 3DES - ;; DES CBC DEC comes here - GEN_DES_DEC_CIPHER 8, rsp + _key_sched -%else - ;; 3DES CBC DEC comes here - 
GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - - ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; run stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET], ZW0 - vmovdqu64 [IA1 + OFFSET], ZW1 - vmovdqu64 [IA2 + OFFSET], ZW2 - vmovdqu64 [INP0 + OFFSET], ZW3 - vmovdqu64 [INP1 + OFFSET], ZW4 - vmovdqu64 [INP2 + OFFSET], ZW5 - vmovdqu64 [INP3 + OFFSET], ZW6 - vmovdqu64 [INP4 + OFFSET], ZW7 - - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET], ZW8 - vmovdqu64 [IA1 + OFFSET], ZW9 - vmovdqu64 [IA2 + OFFSET], ZW10 - vmovdqu64 [INP0 + OFFSET], ZW11 - vmovdqu64 [INP1 + OFFSET], ZW12 - vmovdqu64 [INP2 + OFFSET], ZW13 - vmovdqu64 [INP3 + OFFSET], ZW14 - vmovdqu64 [INP4 + OFFSET], ZW15 - - add OFFSET, 64 - jmp %%_gen_des_dec_loop -%%_gen_des_dec_loop_end: - ;; This is where we check if there is anything less than 64 bytes - ;; of message left for processing. 
- mov SIZE, [rsp + _size_save] - cmp OFFSET, SIZE - jz %%_gen_des_dec_part_end - ;; calculate min of bytes_left and 64, convert to qword mask - GET_MASK8 IA0 ; IA0 = mask - - kmovw k7, DWORD(IA0) - mov [rsp + _mask_save], IA0 - ;; run masked loads - mov IA0, [STATE + _des_args_in + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (7*PTR_SZ)] - vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] - vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] - vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] - vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] - vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] - vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] - vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] - vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] - - mov IA0, [STATE + _des_args_in + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_in + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_in + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_in + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_in + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_in + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_in + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_in + (15*PTR_SZ)] - vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] - vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] - vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] - vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] - vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] - vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] - vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] - vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] - - ;; Transpose input - TRANSPOSE_IN ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; DES CBC DEC comes here - mov IA0, [rsp + _mask_save] - cmp BYTE(IA0), 0x0f - ja %%_gt_4 - jz %%_blocks_4 - - 
cmp BYTE(IA0), 0x03 - ja %%_blocks_3 - jz %%_blocks_2 - ;; process one block and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 1, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_2: - ;; process two blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 2, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_3: - ;; process three blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 3, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_4: - ;; process four blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 4, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_gt_4: - cmp BYTE(IA0), 0x3f - ja %%_blocks_7 - jz %%_blocks_6 -%%_blocks_5: - ;; process five blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 5, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_6: - ;; process six blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 6, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - jmp %%_transpose_out - -%%_blocks_7: - ;; process seven blocks and move to transpose out -%ifnidn %%DES_DOCSIS, 3DES - GEN_DES_DEC_CIPHER 7, rsp + _key_sched -%else - GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 -%endif - -%%_transpose_out: - ;; transpose data on output - TRANSPOSE_OUT ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, 
ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 - - ;; run masked stores - mov IA0, [STATE + _des_args_out + (0*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (1*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (2*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (3*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (4*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (5*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (6*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (7*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 - vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 - vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 - vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 - vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 - vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 - vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 - vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 - - mov IA0, [STATE + _des_args_out + (8*PTR_SZ)] - mov IA1, [STATE + _des_args_out + (9*PTR_SZ)] - mov IA2, [STATE + _des_args_out + (10*PTR_SZ)] - mov INP0, [STATE + _des_args_out + (11*PTR_SZ)] - mov INP1, [STATE + _des_args_out + (12*PTR_SZ)] - mov INP2, [STATE + _des_args_out + (13*PTR_SZ)] - mov INP3, [STATE + _des_args_out + (14*PTR_SZ)] - mov INP4, [STATE + _des_args_out + (15*PTR_SZ)] - vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 - vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 - vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 - vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 - vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 - vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 - vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 - vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 -%%_gen_des_dec_part_end: - - ;; store IV and update pointers - DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4 - - CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 - - ;; restore stack pointer and registers - mov r12, [rsp + _gpr_save + 0*8] - mov r13, [rsp + _gpr_save + 1*8] - mov r14, [rsp + _gpr_save + 2*8] - mov r15, [rsp + _gpr_save + 3*8] - mov rsp, [rsp + _rsp_save] ; original SP - -%ifdef SAFE_DATA - clear_all_zmms_asm -%else - vzeroupper -%endif ;; SAFE_DATA - -%endmacro 
+;;; ======================================================== +;;; External module functions needed here + +extern des_enc_zmm0_zmm1_avx512 +extern des_enc_zmm2_zmm3_avx512 +extern des_enc_zmm4_zmm5_avx512 +extern des_enc_zmm6_zmm7_avx512 +extern des_enc_zmm8_zmm9_avx512 +extern des_enc_zmm10_zmm11_avx512 +extern des_enc_zmm12_zmm13_avx512 +extern des_enc_zmm14_zmm15_avx512 +extern des_enc_zmm18_zmm19_avx512 + +extern des_dec_zmm0_zmm1_avx512 +extern des_dec_zmm2_zmm3_avx512 +extern des_dec_zmm4_zmm5_avx512 +extern des_dec_zmm6_zmm7_avx512 +extern des_dec_zmm8_zmm9_avx512 +extern des_dec_zmm10_zmm11_avx512 +extern des_dec_zmm12_zmm13_avx512 +extern des_dec_zmm14_zmm15_avx512 +extern des_dec_zmm18_zmm19_avx512 ;;; ======================================================== ;;; DATA @@ -2138,7 +58,8 @@ endstruc mksection .rodata default rel align 64 -mask_values: +MKGLOBAL(des_mask_values_avx512,data,internal) +des_mask_values_avx512: dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 dd 0x04000000, 0x04000000, 0x04000000, 0x04000000 @@ -2217,7 +138,8 @@ mask_values: dd 0x90000000, 0x90000000, 0x90000000, 0x90000000 align 64 -init_perm_consts: +MKGLOBAL(des_init_perm_consts_avx512,data,internal) +des_init_perm_consts_avx512: dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f dd 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f, 0x0f0f0f0f @@ -2241,7 +163,8 @@ init_perm_consts: ;;; S-Box table align 64 -S_box_flipped: +MKGLOBAL(des_S_box_flipped_avx512,data,internal) +des_S_box_flipped_avx512: ;; SBOX0 dw 0x07, 0x02, 0x0c, 0x0f, 0x04, 0x0b, 0x0a, 0x0c dw 0x0b, 0x07, 0x06, 0x09, 0x0d, 0x04, 0x00, 0x0a @@ -2317,39 +240,45 @@ S_box_flipped: ;;; Used in DOCSIS DES partial block scheduling 16 x 32bit of value 1 align 64 -vec_ones_32b: +MKGLOBAL(des_vec_ones_32b_avx512,data,internal) +des_vec_ones_32b_avx512: dd 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 align 64 -and_eu: 
+MKGLOBAL(des_and_eu_avx512,data,internal) +des_and_eu_avx512: dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 dd 0x3f003f00, 0x3f003f00, 0x3f003f00, 0x3f003f00 align 64 -and_ed: +MKGLOBAL(des_and_ed_avx512,data,internal) +des_and_ed_avx512: dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f dd 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f align 64 -idx_e: +MKGLOBAL(des_idx_e_avx512,data,internal) +des_idx_e_avx512: dq 0x0d0c090805040100, 0x0f0e0b0a07060302 dq 0x1d1c191815141110, 0x1f1e1b1a17161312 dq 0x2d2c292825242120, 0x2f2e2b2a27262322 dq 0x3d3c393835343130, 0x3f3e3b3a37363332 align 64 -reg_values16bit_7: +MKGLOBAL(des_reg_values16bit_7_avx512,data,internal) +des_reg_values16bit_7_avx512: dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f dq 0x001f001f001f001f, 0x001f001f001f001f align 64 -shuffle_reg: +MKGLOBAL(des_shuffle_reg_avx512,data,internal) +des_shuffle_reg_avx512: dq 0x0705060403010200, 0x0f0d0e0c0b090a08 dq 0x1715161413111210, 0x1f1d1e1c1b191a18 dq 0x2725262423212220, 0x2f2d2e2c2b292a28 @@ -2364,7 +293,7 @@ mksection .text align 64 MKGLOBAL(des_x16_cbc_enc_avx512,function,internal) des_x16_cbc_enc_avx512: - GENERIC_DES_ENC DES + GENERIC_DES_ENC DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -2372,7 +301,7 @@ des_x16_cbc_enc_avx512: align 64 MKGLOBAL(des_x16_cbc_dec_avx512,function,internal) des_x16_cbc_dec_avx512: - GENERIC_DES_DEC DES + GENERIC_DES_DEC DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -2380,7 +309,7 @@ des_x16_cbc_dec_avx512: align 64 MKGLOBAL(des3_x16_cbc_enc_avx512,function,internal) des3_x16_cbc_enc_avx512: - GENERIC_DES_ENC 3DES + GENERIC_DES_ENC 3DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -2388,7 +317,7 
@@ des3_x16_cbc_enc_avx512: align 64 MKGLOBAL(des3_x16_cbc_dec_avx512,function,internal) des3_x16_cbc_dec_avx512: - GENERIC_DES_DEC 3DES + GENERIC_DES_DEC 3DES, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -2396,7 +325,7 @@ des3_x16_cbc_dec_avx512: align 64 MKGLOBAL(docsis_des_x16_enc_avx512,function,internal) docsis_des_x16_enc_avx512: - GENERIC_DES_ENC DOCSIS + GENERIC_DES_ENC DOCSIS, arg1, arg2 ret ;;; arg 1 : pointer to DES OOO structure @@ -2404,7 +333,7 @@ docsis_des_x16_enc_avx512: align 64 MKGLOBAL(docsis_des_x16_dec_avx512,function,internal) docsis_des_x16_dec_avx512: - GENERIC_DES_DEC DOCSIS + GENERIC_DES_DEC DOCSIS, arg1, arg2 ret mksection stack-noexec diff --git a/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm deleted file mode 100644 index 099484ca5aca6c95bd862dfbfbc6497d7ed8801f..0000000000000000000000000000000000000000 --- a/lib/avx512_t2/aes128_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. 
-; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM128_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm deleted file mode 100644 index d82a5e20dc24f506030d22c3c5c0957bfec74ea0..0000000000000000000000000000000000000000 --- a/lib/avx512_t2/aes192_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. -; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. 
-; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM192_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm b/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm deleted file mode 100644 index 422e2ba44f363406ccb2b01b66224f23bb1a057e..0000000000000000000000000000000000000000 --- a/lib/avx512_t2/aes256_gmac_api_vaes_avx512.asm +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. 
-; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define GCM256_MODE 1 -%include "include/gcm_gmac_api_vaes_avx512.inc" diff --git a/lib/include/gcm_gmac_api_vaes_avx512.inc b/lib/avx512_t2/ghash_api_vaes_avx512.asm similarity index 60% rename from lib/include/gcm_gmac_api_vaes_avx512.inc rename to lib/avx512_t2/ghash_api_vaes_avx512.asm index b82e86da8d070015a9045d934a1f68edad387510..a0c5c0fb22761d748c80abbcf9703c43c0e6a9ec 100644 --- a/lib/include/gcm_gmac_api_vaes_avx512.inc +++ b/lib/avx512_t2/ghash_api_vaes_avx512.asm @@ -1,5 +1,5 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2021-2023, Intel Corporation All rights reserved. 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -27,21 +27,20 @@ ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%define GCM128_MODE %include "include/gcm_vaes_avx512.inc" + %include "include/error.inc" %include "include/clear_regs.inc" -%ifndef GCM_GMAC_API_VAES_AVX512_INC -%define GCM_GMAC_API_VAES_AVX512_INC - mksection .text default rel -%ifdef GCM128_MODE ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void ghash_pre_vaes_avx512 ; (const void *key, struct gcm_key_data *key_data) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(ghash_pre_vaes_avx512,function,) ghash_pre_vaes_avx512: endbranch64 @@ -104,6 +103,24 @@ error_ghash_pre: jmp exit_ghash_pre %endif +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; ghash_internal_vaes_avx512() +; r12 [in/clobbered] message pointer +; r13 [in/clobbered] message length +; xmm0 [in/out] ghash value +; arg1 [in] pointer to key structure +; clobbers: zmm1, zmm3-zmm13, zmm15-zmm20, rax, k1 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(ghash_internal_vaes_avx512,function,internal) +ghash_internal_vaes_avx512: + CALC_GHASH r12, r13, xmm0, arg1, zmm1, zmm3, zmm4, zmm5, \ + zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ + zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, rax, k1 + ;; **zmm3, zmm4, zmm5 and zmm6 may contain clear text + ;; **zmm15, zmm16, zmm19 and zmm9 may contain hash key + ret + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;void ghash_vaes_avx512 ; const struct 
gcm_key_data *key_data, @@ -112,6 +129,7 @@ error_ghash_pre: ; void *io_tag, ; const u64 tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(ghash_vaes_avx512,function,) ghash_vaes_avx512: endbranch64 @@ -146,16 +164,18 @@ ghash_vaes_avx512: vmovdqu xmm0, [arg4] vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - CALC_AAD_HASH arg2, arg3, xmm0, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ - zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, zmm12, zmm13, \ - zmm15, zmm16, zmm17, zmm18, zmm19, r10, r11, r12, k1 - ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text - ;; **zmm13, zmm15, zmm18 and zmm8 may contain authentication key + ;; arg1 [in] pointer to key structure => arg1 + ;; r12 [in] message pointer => arg2 + ;; r13 [in] message length => arg3 + ;; xmm0 [in/out] ghash value + mov r12, arg2 + mov r13, arg3 + call ghash_internal_vaes_avx512 vpshufb xmm0, xmm0, [rel SHUF_MASK] ; perform a 16Byte swap simd_store_avx arg4, xmm0, arg5, r12, rax %ifdef SAFE_DATA - clear_zmms_avx512 xmm0, xmm2, xmm3, xmm4, xmm5, xmm13, xmm15, xmm8, xmm18 + clear_zmms_avx512 xmm0, xmm3, xmm4, xmm5, xmm6, xmm15, xmm16, xmm9, xmm19 %endif exit_ghash: FUNC_RESTORE @@ -187,118 +207,4 @@ error_ghash: jmp exit_ghash %endif -%endif ;; GCM128_MODE - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void imb_aes_gmac_update_128_vaes_avx512 / -; imb_aes_gmac_update_192_vaes_avx512 / -; imb_aes_gmac_update_256_vaes_avx512 -; const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; const u8 *in, -; const u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(GMAC_FN_NAME(update),function,) -GMAC_FN_NAME(update): - endbranch64 - FUNC_SAVE small_frame - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET -%endif - ;; Check if msg_len == 0 - cmp arg4, 0 - je exit_gmac_update - 
-%ifdef SAFE_PARAM - ;; Check key_data != NULL - cmp arg1, 0 - jz error_gmac_update - - ;; Check context_data != NULL - cmp arg2, 0 - jz error_gmac_update - - ;; Check in != NULL (msg_len != 0) - cmp arg3, 0 - jz error_gmac_update -%endif - - ; Increment size of "AAD length" for GMAC - add [arg2 + AadLen], arg4 - - ;; Deal with previous partial block - xor r11, r11 - vmovdqu64 xmm8, [arg2 + AadHash] - - PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm8, r10, r12, rax, \ - zmm0, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, k1 -%ifdef SAFE_DATA - clear_zmms_avx512 xmm0 -%endif - ; CALC_AAD_HASH needs to deal with multiple of 16 bytes - sub arg4, r11 - add arg3, r11 - - vmovq xmm14, arg4 ; Save remaining length - and arg4, -16 ; Get multiple of 16 bytes - - or arg4, arg4 - jz no_full_blocks - - ;; Calculate GHASH of this segment - CALC_AAD_HASH arg3, arg4, xmm8, arg1, zmm1, zmm2, zmm3, zmm4, zmm5, \ - zmm6, zmm7, zmm9, zmm10, zmm11, zmm12, zmm13, zmm15, \ - zmm16, zmm17, zmm18, zmm19, zmm20, r10, r11, r12, k1 - vmovdqu64 [arg2 + AadHash], xmm8 ; ctx_data.aad hash = aad_hash - - ;; **zmm2, zmm3, zmm4 and zmm5 may contain clear text - ;; **zmm19 may contain authentication key -%ifdef SAFE_DATA - clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm18, xmm19 -%endif - -no_full_blocks: - add arg3, arg4 ; Point at partial block - - vmovq arg4, xmm14 ; Restore original remaining length - and arg4, 15 - jz exit_gmac_update - - ; Save next partial block - mov [arg2 + PBlockLen], arg4 - READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 - vpshufb xmm1, xmm1, [rel SHUF_MASK] - vpxorq xmm8, xmm8, xmm1 - vmovdqu64 [arg2 + AadHash], xmm8 -%ifdef SAFE_DATA - ;; **xmm1 and xmm8 may contain some clear text - clear_zmms_avx512 xmm1, xmm8 -%endif -exit_gmac_update: - FUNC_RESTORE - ret - -%ifdef SAFE_PARAM -error_gmac_update: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != 
NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_gmac_update -%endif - mksection stack-noexec - -%endif ; GCM_GMAC_API_VAES_AVX512_INC diff --git a/lib/avx512_t2/gmac_api_vaes_avx512.asm b/lib/avx512_t2/gmac_api_vaes_avx512.asm new file mode 100644 index 0000000000000000000000000000000000000000..0a57335ce44d9e68b6bf3d1eb9f9d3e18a525167 --- /dev/null +++ b/lib/avx512_t2/gmac_api_vaes_avx512.asm @@ -0,0 +1,159 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%define GCM128_MODE 1 +%include "include/gcm_vaes_avx512.inc" +%include "include/error.inc" +%include "include/clear_regs.inc" + +extern ghash_internal_vaes_avx512 + +mksection .text +default rel + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void imb_aes_gmac_update_128_vaes_avx512 / +; imb_aes_gmac_update_192_vaes_avx512 / +; imb_aes_gmac_update_256_vaes_avx512 +; const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; const u8 *in, +; const u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(imb_aes_gmac_update_128_vaes_avx512,function,) +MKGLOBAL(imb_aes_gmac_update_192_vaes_avx512,function,) +MKGLOBAL(imb_aes_gmac_update_256_vaes_avx512,function,) +imb_aes_gmac_update_128_vaes_avx512: +imb_aes_gmac_update_192_vaes_avx512: +imb_aes_gmac_update_256_vaes_avx512: + endbranch64 + FUNC_SAVE small_frame + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET +%endif + ;; Check if msg_len == 0 + cmp arg4, 0 + je .exit_gmac_update + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + cmp arg1, 0 + jz .error_gmac_update + + ;; Check context_data != NULL + cmp arg2, 0 + jz .error_gmac_update + + ;; Check in != NULL (msg_len != 0) + cmp arg3, 0 + jz .error_gmac_update +%endif + + ; Increment size of 
"AAD length" for GMAC + add [arg2 + AadLen], arg4 + + ;; Deal with previous partial block + xor r11, r11 + vmovdqu64 xmm0, [arg2 + AadHash] + + PARTIAL_BLOCK_GMAC arg1, arg2, arg3, arg4, r11, xmm0, r10, r12, rax, \ + zmm8, zmm1, zmm2, zmm3, zmm4, zmm5, zmm6, zmm7, k1 +%ifdef SAFE_DATA + clear_zmms_avx512 xmm8 +%endif + ; CALC_AAD_HASH needs to deal with multiple of 16 bytes + sub arg4, r11 + add arg3, r11 + + mov r10, arg4 ; Save remaining length + and arg4, -16 ; Get multiple of 16 bytes + jz .no_full_blocks + + ;; Calculate GHASH of this segment + + ;; arg1 [in] pointer to key structure - arg1 here + ;; r12 [in] message pointer - arg3 here + ;; r13 [in] message length - arg4 here + mov r12, arg3 + mov r13, arg4 + + ;; xmm0 [in/out] ghash value + call ghash_internal_vaes_avx512 + + vmovdqu64 [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash + +%ifdef SAFE_DATA + clear_zmms_avx512 xmm3, xmm4, xmm5, xmm6, xmm19, xmm9 +%endif + +.no_full_blocks: + add arg3, arg4 ; Point at partial block + + mov arg4, r10 ; Restore original remaining length + and arg4, 15 + jz .exit_gmac_update + + ; Save next partial block + mov [arg2 + PBlockLen], arg4 + READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 + vpshufb xmm1, xmm1, [rel SHUF_MASK] + vpxorq xmm0, xmm0, xmm1 + vmovdqu64 [arg2 + AadHash], xmm0 +%ifdef SAFE_DATA + ;; **xmm1 and xmm0 may contain some clear text + clear_zmms_avx512 xmm1, xmm0 +%endif +.exit_gmac_update: + FUNC_RESTORE + ret + +%ifdef SAFE_PARAM +.error_gmac_update: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp .exit_gmac_update +%endif + +mksection stack-noexec + diff --git a/lib/cmake/unix.cmake b/lib/cmake/unix.cmake index 
3cdf3dbe19ebfa041dba80fbcaaeb3179cf02f2b..27ce99b186dbd074d708f6a2441dcfc59be620ef 100644 --- a/lib/cmake/unix.cmake +++ b/lib/cmake/unix.cmake @@ -26,6 +26,7 @@ # ############################################################################## # IPSec_MB library CMake Unix config # ############################################################################## +include(GNUInstallDirs) set(LIB IPSec_MB) # 'lib' prefix assumed on Linux @@ -64,7 +65,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_SHARED_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif() @@ -122,13 +123,13 @@ if(NOT CMAKE_INSTALL_PREFIX) CACHE STRING "Set default installation directory" FORCE) endif() if(NOT LIB_INSTALL_DIR) - set(LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib") + set(LIB_INSTALL_DIR "${CMAKE_INSTALL_FULL_LIBDIR}") endif() if(NOT INCLUDE_INSTALL_DIR) - set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include") + set(INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_FULL_INCLUDEDIR}") endif() if(NOT MAN_INSTALL_DIR) - set(MAN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/man/man7") + set(MAN_INSTALL_DIR "${CMAKE_INSTALL_FULL_MANDIR}/man7") endif() message(STATUS "LIB_INSTALL_DIR... ${LIB_INSTALL_DIR}") diff --git a/lib/include/des_avx512.inc b/lib/include/des_avx512.inc new file mode 100644 index 0000000000000000000000000000000000000000..0105bbbcfdf7e5e760d21bc235daba9f99209b99 --- /dev/null +++ b/lib/include/des_avx512.inc @@ -0,0 +1,1978 @@ +;; +;; Copyright (c) 2024, Intel Corporation +;; +;; Redistribution and use in source and binary forms, with or without +;; modification, are permitted provided that the following conditions are met: +;; +;; * Redistributions of source code must retain the above copyright notice, +;; this list of conditions and the following disclaimer. 
+;; * Redistributions in binary form must reproduce the above copyright +;; notice, this list of conditions and the following disclaimer in the +;; documentation and/or other materials provided with the distribution. +;; * Neither the name of Intel Corporation nor the names of its contributors +;; may be used to endorse or promote products derived from this software +;; without specific prior written permission. +;; +;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+;; + +;; Authors: +;; Shay Gueron (1, 2), Regev Shemy (2), Tomasz kantecki (2) +;; (1) University of Haifa, Israel +;; (2) Intel Corporation + +%use smartalign + +%include "include/os.inc" +%include "include/reg_sizes.inc" +%include "include/mb_mgr_datastruct.inc" +%include "include/constants.inc" +;%define DO_DBGPRINT +;%include "include/dbgprint.inc" +%include "include/clear_regs.inc" +%include "include/transpose_avx512.inc" + +%ifdef LINUX +%define arg1 rdi +%define arg2 rsi +%define arg3 rdx +%define arg4 rcx +%else +%define arg1 rcx +%define arg2 rdx +%define arg3 r8 +%define arg4 r9 +%endif + +%define OFFSET rax + +%define IA0 arg3 +%define IA1 arg4 +%define IA2 r10 + +%define INP0 r11 +%define INP1 r12 +%define INP2 r13 +%define INP3 r14 +%define INP4 r15 + +%define KSOFFSET r11 + +%define ZW0 zmm0 +%define ZW1 zmm1 +%define ZW2 zmm2 +%define ZW3 zmm3 +%define ZW4 zmm4 +%define ZW5 zmm5 +%define ZW6 zmm6 +%define ZW7 zmm7 +%define ZW8 zmm8 +%define ZW9 zmm9 +%define ZW10 zmm10 +%define ZW11 zmm11 +%define ZW12 zmm12 +%define ZW13 zmm13 +%define ZW14 zmm14 +%define ZW15 zmm15 + +%define ZIV0 zmm16 +%define ZIV1 zmm17 + +%define ZTMP0 zmm18 +%define ZTMP1 zmm19 +%define ZTMP2 zmm20 +%define ZTMP3 zmm21 +%define ZTMP4 zmm22 +%define ZTMP5 zmm23 +%define ZTMP6 zmm24 +%define ZTMP7 zmm25 +%define ZTMP8 zmm26 +%define ZTMP9 zmm27 +%define ZTMP10 zmm28 +%define ZTMP11 zmm29 +%define ZTMP12 zmm30 +%define ZTMP13 zmm31 + +struc STACKFRAME +_gpr_save: resq 4 ; r12 to r15 +_rsp_save: resq 1 +_mask_save: resq 1 +_size_save: resq 1 +_padding: resq 1 +_tmp_iv: resq 16 ; 2 x 64 bytes +_tmp_in: resq 16 ; 2 x 64 bytes +_tmp_out: resq 16 ; 2 x 64 bytes +_tmp_mask: resd 16 ; 1 x 64 bytes +_key_sched: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 +_key_sched2: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 +_key_sched3: resq 16*16 ; 16 lanes x 16 qwords; 16 x 128 bytes = 2048 +endstruc + +;;; 
=========================================================================== +;;; =========================================================================== +;;; MACROS +;;; =========================================================================== +;;; =========================================================================== + +;;; =========================================================================== +;;; CLEAR TRANSPOSED KEY SCHEDULE (if SAFE_DATA is selected) +;;; =========================================================================== +%macro CLEAR_KEY_SCHEDULE 2 +%define %%ALG %1 ; [in] DES or 3DES +%define %%ZT %2 ; [clobbered] temporary ZMM register + +%ifdef SAFE_DATA + vpxorq %%ZT, %%ZT, %%ZT + +%ifidn %%ALG, 3DES +%assign rep_num ((3 * (16 * 16 * 8)) / 64) +%else +%assign rep_num ((16 * 16 * 8) / 64) +%endif + +%assign offset 0 +%rep rep_num + vmovdqa64 [rsp + _key_sched + offset], %%ZT +%assign offset (offset + 64) +%endrep + +%endif ; SAFE_DATA + +%endmacro + +;;; =========================================================================== +;;; PERMUTE +;;; =========================================================================== +;;; A [in/out] - zmm register +;;; B [in/out] - zmm register +;;; NSHIFT [in] - constant to shift words by +;;; MASK [in] - zmm or m512 with mask +;;; T0 [clobbered] - temporary zmm register +%macro PERMUTE 5 +%define %%A %1 +%define %%B %2 +%define %%NSHIFT %3 +%define %%MASK %4 +%define %%T0 %5 + + vpsrld %%T0, %%A, %%NSHIFT + vpxord %%T0, %%T0, %%B + vpandd %%T0, %%T0, %%MASK + vpxord %%B, %%B, %%T0 + vpslld %%T0, %%T0, %%NSHIFT + vpxord %%A, %%A, %%T0 +%endmacro + +;;; =========================================================================== +;;; INITIAL PERMUTATION +;;; =========================================================================== +;;; L [in/out] - zmm register +;;; R [in/out] - zmm register +;;; T0 [clobbered] - temporary zmm register +%macro IP_Z 3 +%define %%L %1 +%define %%R %2 +%define 
%%T0 %3 + PERMUTE %%R, %%L, 4, [rel des_init_perm_consts_avx512 + 0*64], %%T0 + PERMUTE %%L, %%R, 16, [rel des_init_perm_consts_avx512 + 1*64], %%T0 + PERMUTE %%R, %%L, 2, [rel des_init_perm_consts_avx512 + 2*64], %%T0 + PERMUTE %%L, %%R, 8, [rel des_init_perm_consts_avx512 + 3*64], %%T0 + PERMUTE %%R, %%L, 1, [rel des_init_perm_consts_avx512 + 4*64], %%T0 +%endmacro + +;;; =========================================================================== +;;; FINAL PERMUTATION +;;; =========================================================================== +;;; L [in/out] - zmm register +;;; R [in/out] - zmm register +;;; T0 [clobbered] - temporary zmm register +%macro FP_Z 3 +%define %%L %1 +%define %%R %2 +%define %%T0 %3 + PERMUTE %%L, %%R, 1, [rel des_init_perm_consts_avx512 + 4*64], %%T0 + PERMUTE %%R, %%L, 8, [rel des_init_perm_consts_avx512 + 3*64], %%T0 + PERMUTE %%L, %%R, 2, [rel des_init_perm_consts_avx512 + 2*64], %%T0 + PERMUTE %%R, %%L, 16, [rel des_init_perm_consts_avx512 + 1*64], %%T0 + PERMUTE %%L, %%R, 4, [rel des_init_perm_consts_avx512 + 0*64], %%T0 +%endmacro + +;;; =========================================================================== +;;; P PHASE +;;; =========================================================================== +;;; W0 [in/out] - zmm register +;;; in: vector of 16 x 32bits from S phase +;;; out: permuted in vector +;;; T0-T3 [clobbered] - temporary zmm register +%macro P_PHASE 5 +%define %%W0 %1 +%define %%T0 %2 +%define %%T1 %3 +%define %%T2 %4 +%define %%T3 %5 + + vprord %%T0, %%W0, 3 + vpandd %%T0, %%T0, [rel des_mask_values_avx512 + 0*64] + vprord %%T1, %%W0, 5 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 1*64] + vpord %%T0, %%T0, %%T1 + + vprord %%T1, %%W0, 24 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 2*64] + vprord %%T2, %%W0, 26 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 3*64] + vpord %%T1, %%T1, %%T2 + vpord %%T0, %%T0, %%T1 + + vprord %%T1, %%W0, 15 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 
+ 4*64] + vprord %%T2, %%W0, 17 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 5*64] + vpord %%T1, %%T1, %%T2 + + vprord %%T2, %%W0, 6 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 6*64] + vprord %%T3, %%W0, 21 + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 7*64] + vpord %%T2, %%T2, %%T3 + vpord %%T1, %%T1, %%T2 + vpord %%T0, %%T0, %%T1 + + vprord %%T1, %%W0, 12 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 8*64] + vprord %%T2, %%W0, 14 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 9*64] + vpord %%T1, %%T1, %%T2 + + vprord %%T2, %%W0, 4 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 10*64] + vprord %%T3, %%W0, 11 + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 11*64] + vpord %%T2, %%T2, %%T3 + vpord %%T1, %%T1, %%T2 + vpord %%T0, %%T0, %%T1 + + vprord %%T1, %%W0, 16 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 12*64] + vprord %%T2, %%W0, 22 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 13*64] + vpord %%T1, %%T1, %%T2 + + vprord %%T2, %%W0, 19 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 14*64] + vprord %%T3, %%W0, 10 + vpandd %%T3, %%T3, [rel des_mask_values_avx512 + 15*64] + vpord %%T2, %%T2, %%T3 + vpord %%T1, %%T1, %%T2 + vpord %%T0, %%T0, %%T1 + + vprord %%T1, %%W0, 9 + vpandd %%T1, %%T1, [rel des_mask_values_avx512 + 16*64] + vprord %%T2, %%W0, 13 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 17*64] + vpord %%T1, %%T1, %%T2 + + vprord %%T2, %%W0, 25 + vpandd %%T2, %%T2, [rel des_mask_values_avx512 + 18*64] + vpord %%T1, %%T1, %%T2 + vpord %%W0, %%T0, %%T1 +%endmacro + +;;; =========================================================================== +;;; E PHASE +;;; =========================================================================== +;;; +;;; Expands 16x32-bit words into 16x48-bit words +;;; plus XOR's result with the key schedule. +;;; The output is adjusted to be friendly as S phase input. 
+;;; +;;; in [in] - zmm register +;;; out0a [out] - zmm register +;;; out0b [out] - zmm register +;;; out1a [out] - zmm register +;;; out1b [out] - zmm register +;;; k0 [in] - key schedule; zmm or m512 +;;; k1 [in] - key schedule; zmm or m512 +;;; t0-t1 [clobbered] - temporary zmm register +%macro E_PHASE 9 +%define %%IN %1 +%define %%OUT0A %2 +%define %%OUT0B %3 +%define %%OUT1A %4 +%define %%OUT1B %5 +%define %%K0 %6 +%define %%K1 %7 +%define %%T0 %8 +%define %%T1 %9 + + vprord %%T0, %%IN, 31 + vprord %%T1, %%IN, 3 + vpshufb %%T0, %%T0, [rel des_idx_e_avx512] + vpshufb %%T1, %%T1, [rel des_idx_e_avx512] + vpunpcklbw %%OUT0A, %%T0, %%T1 + vpunpckhbw %%OUT1A, %%T0, %%T1 + vpxord %%OUT0A, %%OUT0A, %%K0 + vpxord %%OUT1A, %%OUT1A, %%K1 + vpandd %%OUT0B, %%OUT0A, [rel des_and_eu_avx512] + vpsrlw %%OUT0B, %%OUT0B, 8 + vpandd %%OUT0A, %%OUT0A, [rel des_and_ed_avx512] + vpandd %%OUT1B, %%OUT1A, [rel des_and_eu_avx512] + vpsrlw %%OUT1B, %%OUT1B, 8 + vpandd %%OUT1A, %%OUT1A, [rel des_and_ed_avx512] +%endmacro + +;;; =========================================================================== +;;; S-BOX +;;; =========================================================================== +;;; +;;; NOTE: clobbers k1-k6 OpMask registers +;;; +;;; IN0A [in] - zmm register; output from E-phase +;;; IN0B [in] - zmm register; output from E-phase +;;; IN1A [in] - zmm register; output from E-phase +;;; IN1B [in] - zmm register; output from E-phase +;;; OUT [out] - zmm register; output from E-phase +;;; T0-T5 [clobbered] - temporary zmm register +%macro S_PHASE 11 +%define %%IN0A %1 +%define %%IN0B %2 +%define %%IN1A %3 +%define %%IN1B %4 +%define %%OUT %5 +%define %%T0 %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 + + vmovdqa64 %%T0, [rel des_reg_values16bit_7_avx512] + vpcmpuw k3, %%IN0A, %%T0, 2 ; 2 -> LE + vpcmpuw k4, %%IN0B, %%T0, 2 ; 2 -> LE + vpcmpuw k5, %%IN1A, %%T0, 2 ; 2 -> LE + vpcmpuw k6, %%IN1B, %%T0, 2 ; 2 -> LE + + mov DWORD(IA0), 
0x55555555 + kmovd k1, DWORD(IA0) + mov DWORD(IA0), 0xaaaaaaaa + kmovd k2, DWORD(IA0) + + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 0*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 1*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 4*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 5*64] + vpermw %%T0{k1}{z}, %%IN0A, %%T0 + vpermw %%T1{k1}{z}, %%IN0A, %%T1 + vpermw %%T2{k2}{z}, %%IN0A, %%T2 + vpermw %%T3{k2}{z}, %%IN0A, %%T3 + vpxord %%T0, %%T0, %%T2 + vpxord %%OUT, %%T1, %%T3 + vmovdqu16 %%OUT{k3}, %%T0 + + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 2*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 3*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 6*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 7*64] + vpermw %%T0{k1}{z}, %%IN0B, %%T0 + vpermw %%T1{k1}{z}, %%IN0B, %%T1 + vpermw %%T2{k2}{z}, %%IN0B, %%T2 + vpermw %%T3{k2}{z}, %%IN0B, %%T3 + vpxord %%T0, %%T0, %%T2 + vpxord %%T3, %%T1, %%T3 + vmovdqu16 %%T3{k4}, %%T0 + vpsllw %%T3, %%T3, 4 + vpxord %%OUT, %%OUT, %%T3 + + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 8*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 9*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 12*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 13*64] + vpermw %%T0{k1}{z}, %%IN1A, %%T0 + vpermw %%T1{k1}{z}, %%IN1A, %%T1 + vpermw %%T2{k2}{z}, %%IN1A, %%T2 + vpermw %%T3{k2}{z}, %%IN1A, %%T3 + vpxord %%T0, %%T0, %%T2 + vpxord %%T4, %%T1, %%T3 + vmovdqu16 %%T4{k5}, %%T0 + + vmovdqa64 %%T0, [rel des_S_box_flipped_avx512 + 10*64] + vmovdqa64 %%T1, [rel des_S_box_flipped_avx512 + 11*64] + vmovdqa64 %%T2, [rel des_S_box_flipped_avx512 + 14*64] + vmovdqa64 %%T3, [rel des_S_box_flipped_avx512 + 15*64] + vpermw %%T0{k1}{z}, %%IN1B, %%T0 + vpermw %%T1{k1}{z}, %%IN1B, %%T1 + vpermw %%T2{k2}{z}, %%IN1B, %%T2 + vpermw %%T3{k2}{z}, %%IN1B, %%T3 + vpxord %%T0, %%T0, %%T2 + vpxord %%T5, %%T1, %%T3 + vmovdqu16 %%T5{k6}, %%T0 + vpsllw %%T5, %%T5, 4 + + vpxord %%T4, %%T4, %%T5 + vpsllw %%T4, %%T4, 8 
+ vpxord %%OUT, %%OUT, %%T4 + vpshufb %%OUT, %%OUT, [rel des_shuffle_reg_avx512] +%endmacro + +;;; =========================================================================== +;;; DES encryption/decryption round +;;; =========================================================================== +;;; +;;; Clobbers k1-k6 OpMask registers +;;; +;;; ENC_DEC [in] - ENC for encryption, DEC for decryption +;;; R [in/out] - zmm register; plain text in & cipher text out +;;; L [in/out] - zmm register; plain text in & cipher text out +;;; KS [in] - pointer to the key schedule +;;; T0-T11 [clobbered] - temporary zmm register +%macro DES_ENC_DEC_EXP 16 +%define %%ENC_DEC %1 +%define %%R %2 +%define %%L %3 +%define %%KS %4 +%define %%T0 %5 +%define %%T1 %6 +%define %%T2 %7 +%define %%T3 %8 +%define %%T4 %9 +%define %%T5 %10 +%define %%T6 %11 +%define %%T7 %12 +%define %%T8 %13 +%define %%T9 %14 +%define %%T10 %15 +%define %%T11 %16 + + + ;; Comment out section below & compile to see macro invocations in the code (registers & parameters). 
+ ;; + ;; %define arglist "arglist:" + ;; %rep %0 + ;; %xdefine arglist arglist %+ %1 %+ , + ;; %rotate 1 + ;; %endrep + ;; %warning arglist + ;; %undef arglist + + IP_Z %%R, %%L, %%T0 + +%ifidn %%ENC_DEC, ENC + ;; ENCRYPTION + xor KSOFFSET, KSOFFSET +align 32 +%%_des_enc_loop: + E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (0*64)], [%%KS + KSOFFSET + (1*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%L, %%L, %%T0 + + E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET + (2*64)], [%%KS + KSOFFSET + (3*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%R, %%R, %%T0 + + add KSOFFSET, (4*64) + cmp KSOFFSET, (8*(4*64)) + jb %%_des_enc_loop + +%else + ;; DECRYPTION + mov KSOFFSET, (8*(4*64)) +align 32 +%%_des_dec_loop: + E_PHASE %%R, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (2*64)], [%%KS + KSOFFSET - (1*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%L, %%L, %%T0 + + E_PHASE %%L, %%T1, %%T2, %%T3, %%T4, [%%KS + KSOFFSET - (4*64)], [%%KS + KSOFFSET - (3*64)], %%T6, %%T7 + S_PHASE %%T1, %%T2, %%T3, %%T4, %%T0, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11 + P_PHASE %%T0, %%T1, %%T2, %%T3, %%T4 + vpxord %%R, %%R, %%T0 + sub KSOFFSET, (4*64) + jnz %%_des_dec_loop +%endif ; DECRYPTION + + FP_Z %%R, %%L, %%T0 + +%endmacro + +;;; =========================================================================== +;;; Wrapper macro for DES_ENC_DEC_EXP +;;; =========================================================================== +;;; +%macro DES_ENC_DEC 16 +%define %%ENC_DEC %1 +%define %%R %2 +%define %%L %3 +%define %%KS %4 + + ;; Make call to a function or expand macro with algorithmic code + ;; 0 - expand - use 0 to verify macro invocations & parameters vs functions in des_common.asm + ;; 1 - 
make call - use it in production (smaller code foot print) +%assign make_call 1 + +%if make_call != 0 + + ;; Retrieve R and L ZMM register numbers + ;; - this fragment could look better (with newer NASM) but it is compatible with NASM 2.14.02 + ;; - map R and L to string + ;; - get sub-strings with ZMM number + ;; - make new definition using the sub-strings - it results in a number that can be compared +%defstr %%RSTR %%R +%defstr %%LSTR %%L +%substr %%r_idx %%RSTR 4, -1 +%substr %%l_idx %%LSTR 4, -1 +%define %%RNUM %%r_idx +%define %%LNUM %%l_idx + + ;; swap input/output zmm's only if R zmm has higher number than L one +%assign %%swap_zmms 0 +%if %%RNUM > %%LNUM +%assign %%swap_zmms 1 +%endif + +%if %%swap_zmms != 0 + ;; register names are swapped + ;; - meaning there is function generated for the same pair of zmm's but they are swapped (R with L) + ;; - the idea is to re-use existing function and swap register values before the call (3DES/TDES use case) +%define %%NEW_R %%L +%define %%NEW_L %%R + vmovdqa64 %5, %%R + vmovdqa64 %%R, %%L + vmovdqa64 %%L, %5 +%else + ;; no swap needed +%define %%NEW_R %%R +%define %%NEW_L %%L +%endif + + ;; construct name of the function to be called + ;; des____avx512 +%ifidn %%ENC_DEC, ENC +%define %%fn_name des_enc_ %+ %%NEW_R %+ _ %+ %%NEW_L %+ _avx512 +%else +%define %%fn_name des_dec_ %+ %%NEW_R %+ _ %+ %%NEW_L %+ _avx512 +%endif + + lea r15, [%%KS] ;; r15 is safe to be used as an input argument + call %%fn_name + +%if %%swap_zmms != 0 + ;; register names were swapped, unswap them + vmovdqa64 %5, %%R + vmovdqa64 %%R, %%L + vmovdqa64 %%L, %5 +%endif + + ;; clean-up temporary macro definitions +%undef %%fn_name +%undef %%NEW_R +%undef %%NEW_L +%undef %%r_idx +%undef %%l_idx +%undef %%RSTR +%undef %%LSTR +%undef %%RNUM +%undef %%LNUM +%undef %%swap_zmms + +%else ; make_call != 0 + ;; Expand the macro in-place + DES_ENC_DEC_EXP %1, %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12, %13, %14, %15, %16 +%endif + +%endmacro + +;;; 
=========================================================================== +;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA INPUT +;;; =========================================================================== +;;; +;;; IN00-IN15 / R0/L0-R7/L7 [in/out]: +;;; in: IN00 - lane 0 data, IN01 - lane 1 data, ... IN15 - lane 15 data +;;; out: R0 - 16 x word0, L0 - 16 x word1 +;;; T0,T2 [clobbered] - temporary zmm registers +;;; K0-K4 [clobbered] - temporary zmm registers +;;; H0,H2 [clobbered] - temporary zmm registers +%macro TRANSPOSE_IN_ONE 24 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T2 %18 +%define %%K0 %19 +%define %%K1 %20 +%define %%K2 %21 +%define %%K4 %22 +%define %%H0 %23 +%define %%H2 %24 + + vpunpckldq %%K0, %%IN00, %%IN01 + vpunpckhdq %%K1, %%IN00, %%IN01 + vpunpckldq %%T0, %%IN02, %%IN03 + + vpunpckldq %%IN00, %%IN04, %%IN05 + vpunpckhdq %%IN01, %%IN04, %%IN05 + vpunpckldq %%IN02, %%IN06, %%IN07 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + + vpunpcklqdq %%K0, %%IN00, %%IN02 + vpunpckhqdq %%K1, %%IN00, %%IN02 + + vpunpckldq %%K4, %%IN08, %%IN09 + vpunpckldq %%IN04, %%IN10, %%IN11 + vpunpckldq %%IN06, %%IN12, %%IN13 + vpunpckldq %%IN10, %%IN14, %%IN15 + + vpunpcklqdq %%IN12, %%K4, %%IN04 + vpunpckhqdq %%IN13, %%K4, %%IN04 + vpunpcklqdq %%IN00, %%IN06, %%IN10 + vpunpckhqdq %%IN01, %%IN06, %%IN10 + + vshufi64x2 %%H0, %%K2, %%K0, 0x44 + vshufi64x2 %%H2, %%IN12, %%IN00, 0x44 + vshufi64x2 %%IN00, %%H0, %%H2, 0x88 ; R0 + + vshufi64x2 %%H0, %%T2, %%K1, 0x44 + vshufi64x2 %%H2, %%IN13, %%IN01, 0x44 + vshufi64x2 %%IN01, %%H0, %%H2, 0x88 ; L0 +%endmacro + +;;; 
=========================================================================== +;;; DATA TRANSPOSITION OF ONE DES BLOCK AT DATA OUTPUT +;;; =========================================================================== +;;; +;;; IN00-IN15 aka R0/L0 - R7/L7 [in/out]: +;;; in: R0 - 16 x word0, L0 - 16 x word1 +;;; out: R0 - lane 0 data, L0 - lane 1 data, ... L7 - lane 15 data +;;; T0-T3 [clobbered] - temporary zmm registers +;;; K0-K3 [clobbered] - temporary zmm registers +;;; H0,H1 [clobbered] - temporary zmm registers +%macro TRANSPOSE_OUT_ONE 25 +%define %%IN00 %1 ; R0 +%define %%IN01 %2 ; L0 +%define %%IN02 %3 ; R1 +%define %%IN03 %4 ; L1 +%define %%IN04 %5 ; R2 +%define %%IN05 %6 ; L2 +%define %%IN06 %7 ; R3 +%define %%IN07 %8 ; L3 +%define %%IN08 %9 ; R4 +%define %%IN09 %10 ; L4 +%define %%IN10 %11 ; R5 +%define %%IN11 %12 ; L5 +%define %%IN12 %13 ; R6 +%define %%IN13 %14 ; L6 +%define %%IN14 %15 ; R7 +%define %%IN15 %16 ; L7 +%define %%T0 %17 +%define %%T2 %18 +%define %%T3 %19 +%define %%K0 %20 +%define %%K1 %21 +%define %%K2 %22 +%define %%K3 %23 +%define %%H0 %24 +%define %%H1 %25 + + vpxord %%T0, %%T0, %%T0 + + vpunpckldq %%K0, %%IN01, %%IN00 + vpunpckhdq %%K1, %%IN01, %%IN00 + + vpunpcklqdq %%K2, %%K0, %%T0 + vpunpckhqdq %%T2, %%K0, %%T0 + vpunpcklqdq %%K3, %%K1, %%T0 + vpunpckhqdq %%T3, %%K1, %%T0 + + vshufi64x2 %%H0, %%K2, %%T0, 0x44 + vshufi64x2 %%H1, %%K2, %%T0, 0xee + vshufi64x2 %%IN00, %%H0, %%T0, 0x88 ; R0 + vshufi64x2 %%IN04, %%H0, %%T0, 0xdd ; R2 + vshufi64x2 %%IN08, %%H1, %%T0, 0x88 ; R4 + vshufi64x2 %%IN12, %%H1, %%T0, 0xdd ; R6 + + vshufi64x2 %%H0, %%T2, %%T0, 0x44 + vshufi64x2 %%H1, %%T2, %%T0, 0xee + vshufi64x2 %%IN01, %%H0, %%T0, 0x88 ; L0 + vshufi64x2 %%IN05, %%H0, %%T0, 0xdd ; L2 + vshufi64x2 %%IN09, %%H1, %%T0, 0x88 ; L4 + vshufi64x2 %%IN13, %%H1, %%T0, 0xdd ; L6 + + vshufi64x2 %%H0, %%K3, %%T0, 0x44 + vshufi64x2 %%H1, %%K3, %%T0, 0xee + vshufi64x2 %%IN02, %%H0, %%T0, 0x88 ; R1 + vshufi64x2 %%IN06, %%H0, %%T0, 0xdd ; R3 + vshufi64x2 %%IN10, 
%%H1, %%T0, 0x88 ; R5 + vshufi64x2 %%IN14, %%H1, %%T0, 0xdd ; R7 + + vshufi64x2 %%H0, %%T3, %%T0, 0x44 + vshufi64x2 %%H1, %%T3, %%T0, 0xee + vshufi64x2 %%IN03, %%H0, %%T0, 0x88 ; L1 + vshufi64x2 %%IN07, %%H0, %%T0, 0xdd ; L3 + vshufi64x2 %%IN11, %%H1, %%T0, 0x88 ; L5 + vshufi64x2 %%IN15, %%H1, %%T0, 0xdd ; L7 +%endmacro + +;;; =========================================================================== +;;; DES INITIALIZATION +;;; key schedule transposition and IV set up +;;; =========================================================================== +;;; +;;; STATE_KEYS [in] - KEYS in DES OOO STATE +;;; STATE_IV [ in] - IV in DES OOO STATE +;;; KS [out] - place to store transposed key schedule or NULL +;;; IV0 [out] - r512; initialization vector +;;; IV1 [out] - r512; initialization vector +;;; T0-T27 [clobbered] - temporary r512 +%macro DES_INIT 33 +%define %%STATE_KEYS %1 +%define %%STATE_IV %2 +%define %%KS %3 +%define %%IV0 %4 +%define %%IV1 %5 +%define %%T0 %6 +%define %%T1 %7 +%define %%T2 %8 +%define %%T3 %9 +%define %%T4 %10 +%define %%T5 %11 +%define %%T6 %12 +%define %%T7 %13 +%define %%T8 %14 +%define %%T9 %15 +%define %%T10 %16 +%define %%T11 %17 +%define %%T12 %18 +%define %%T13 %19 +%define %%T14 %20 +%define %%T15 %21 +%define %%T16 %22 +%define %%T17 %23 +%define %%T18 %24 +%define %%T19 %25 +%define %%T20 %26 +%define %%T21 %27 +%define %%T22 %28 +%define %%T23 %29 +%define %%T24 %30 +%define %%T25 %31 +%define %%T26 %32 +%define %%T27 %33 + + ;; set up the key schedule + ;; - load first half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign 
IDX 0 +%rep 16 + vmovdqu64 [%%KS + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + ;; - load second half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0 + 64] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS + (16 * 64) + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + + ;; set up IV + ;; - they are already kept transposed so this is enough to load them + vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] + vmovdqu64 %%IV1, [%%STATE_IV + (1 * 64)] +%endmacro + +;;; =========================================================================== +;;; 3DES INITIALIZATION +;;; key schedule transposition and IV set up +;;; =========================================================================== +;;; +;;; STATE_KEYS [in] - KEYS in 3DES OOO STATE +;;; STATE_IV [ in] - IV in 3DES OOO STATE +;;; KS1 [out] - place to store transposed key schedule or NULL +;;; KS2 [out] - place to store transposed key schedule or NULL +;;; KS3 [out] - place to store transposed key schedule or NULL +;;; IV0 [out] - r512; initialization vector +;;; IV1 [out] - r512; initialization vector +;;; T0-T27 [clobbered] - temporary r512 +;;; DIR [in] - ENC/DEC (keys arranged in different order for enc/dec) +%macro DES3_INIT 36 +%define %%STATE_KEYS %1 +%define %%STATE_IV %2 +%define %%KS1 %3 +%define %%KS2 %4 +%define %%KS3 %5 +%define %%IV0 %6 +%define %%IV1 %7 +%define %%T0 %8 +%define %%T1 %9 +%define %%T2 %10 +%define %%T3 %11 +%define %%T4 %12 +%define %%T5 %13 +%define %%T6 %14 +%define %%T7 %15 +%define %%T8 %16 +%define %%T9 %17 +%define %%T10 %18 +%define %%T11 %19 +%define %%T12 %20 +%define 
%%T13 %21 +%define %%T14 %22 +%define %%T15 %23 +%define %%T16 %24 +%define %%T17 %25 +%define %%T18 %26 +%define %%T19 %27 +%define %%T20 %28 +%define %%T21 %29 +%define %%T22 %30 +%define %%T23 %31 +%define %%T24 %32 +%define %%T25 %33 +%define %%T26 %34 +%define %%T27 %35 +%define %%DIR %36 + +%ifidn %%DIR, ENC +%assign KEY_IDX 0 +%else +%assign KEY_IDX 2 +%endif +%assign KS_IDX 1 + +%rep 3 + ;; set up the key schedule + ;; - load first half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here + +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS %+ KS_IDX + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + ;; - load second half of the keys & transpose + ;; - transpose and store + ;; note: we can use IV registers as temporary ones here +%assign IDX 0 +%rep 16 + mov IA0, [%%STATE_KEYS + (IDX*PTR_SZ)] + mov IA0, [IA0 + (KEY_IDX * PTR_SZ)] + vmovdqu64 %%T %+ IDX, [IA0 + 64] +%assign IDX (IDX + 1) +%endrep + TRANSPOSE16_U32 %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24, %%T25, %%T26, %%T27, %%IV0, %%IV1 +%assign IDX 0 +%rep 16 + vmovdqu64 [%%KS %+ KS_IDX + (16 * 64) + (IDX * 64)], %%T %+ IDX +%assign IDX (IDX + 1) +%endrep + +%ifidn %%DIR, ENC +%assign KEY_IDX (KEY_IDX + 1) +%else +%assign KEY_IDX (KEY_IDX - 1) +%endif +%assign KS_IDX (KS_IDX + 1) +%endrep ; KEY_IDX / KS_IDX + + ;; set up IV + ;; - they are already kept transposed so this is enough to load them + vmovdqu64 %%IV0, [%%STATE_IV + (0 * 64)] + vmovdqu64 %%IV1, 
[%%STATE_IV + (1 * 64)]
+
+%endmacro
+
+;;; ===========================================================================
+;;; DES FINISH
+;;; Update in/out pointers and store IV
+;;; ===========================================================================
+%macro DES_FINISH 9
+%define %%IV0 %1 ;; [in] zmm initialization vector (IV)
+%define %%IV1 %2 ;; [in] zmm initialization vector (IV)
+%define %%T0 %3 ;; [clobbered] temporary zmm
+%define %%T1 %4 ;; [clobbered] temporary zmm
+%define %%T2 %5 ;; [clobbered] temporary zmm
+%define %%T3 %6 ;; [clobbered] temporary zmm
+%define %%T4 %7 ;; [clobbered] temporary zmm
+%define %%STATE %8 ;; [in] pointer to OOO manager
+%define %%SIZE %9 ;; [in] processed message size in bytes
+
+        vpbroadcastq %%T4, %%SIZE
+        vmovdqu64 %%T0, [%%STATE + _des_args_in + (0 * PTR_SZ)]
+        vmovdqu64 %%T1, [%%STATE + _des_args_in + (8 * PTR_SZ)]
+        vmovdqu64 %%T2, [%%STATE + _des_args_out + (0 * PTR_SZ)]
+        vmovdqu64 %%T3, [%%STATE + _des_args_out + (8 * PTR_SZ)]
+        vpaddq %%T0, %%T0, %%T4
+        vpaddq %%T1, %%T1, %%T4
+        vpaddq %%T2, %%T2, %%T4
+        vpaddq %%T3, %%T3, %%T4
+        vmovdqu64 [%%STATE + _des_args_in + (0 * PTR_SZ)], %%T0
+        vmovdqu64 [%%STATE + _des_args_in + (8 * PTR_SZ)], %%T1
+        vmovdqu64 [%%STATE + _des_args_out + (0 * PTR_SZ)], %%T2
+        vmovdqu64 [%%STATE + _des_args_out + (8 * PTR_SZ)], %%T3
+
+        vmovdqu64 [%%STATE + _des_args_IV + (0 * 64)], %%IV0
+        vmovdqu64 [%%STATE + _des_args_IV + (1 * 64)], %%IV1
+%endmacro
+
+;;; ===========================================================================
+;;; DES CFB ENCRYPT/DECRYPT - ONE BLOCK ONLY
+;;; ===========================================================================
+;;;
+;;; Needs: IA0-IA2
+;;; ENC_DEC [in] - encrypt (ENC) or decrypt (DEC) selection
+;;; KS [in] - key schedule
+;;; T0-T24 [clobbered] - temporary r512
+;;; T_IN [in] - 16 * 8 byte storage
+;;; T_OUT [in] - 16 * 8 byte storage
+;;; T_MASK [in] - 16 * 4 byte storage
+;;; T_IV [in] - 16 * 8 byte storage
+;;;
+;;; NOTE: clobbers 
OpMask registers
+%macro DES_CFB_ONE 32
+%define %%ENC_DEC %1
+%define %%KS %2
+%define %%T0 %3
+%define %%T1 %4
+%define %%T2 %5
+%define %%T3 %6
+%define %%T4 %7
+%define %%T5 %8
+%define %%T6 %9
+%define %%T7 %10
+%define %%T8 %11
+%define %%T9 %12
+%define %%T10 %13
+%define %%T11 %14
+%define %%T12 %15
+%define %%T13 %16
+%define %%T14 %17
+%define %%T15 %18
+%define %%T16 %19
+%define %%T17 %20
+%define %%T18 %21
+%define %%T19 %22
+%define %%T20 %23
+%define %%T21 %24
+%define %%T22 %25
+%define %%T23 %26
+%define %%T24 %27
+%define %%T_IN %28
+%define %%T_OUT %29
+%define %%T_IV %30
+%define %%T_MASK %31
+%define %%STATE %32 ;; [in] GP with pointer to OOO manager
+
+        ;; - find mask for non-zero partial lengths
+        vpxord %%T10, %%T10, %%T10
+        vmovdqu64 %%T0, [%%STATE + _des_args_PLen]
+        vpcmpd k3, %%T0, %%T10, 4 ; NEQ
+        kmovw DWORD(IA0), k3
+        movzx DWORD(IA0), WORD(IA0)
+        or DWORD(IA0), DWORD(IA0)
+        jz %%_des_cfb_one_end ; no non-zero partial lengths
+
+%ifidn %%ENC_DEC, ENC
+        ;; For the encryption case we need to make sure that
+        ;; all full blocks are complete before proceeding
+        ;; with CFB partial block.
+        ;; To do that current out position is compared against
+        ;; calculated last full block position.
+ vmovdqu64 %%T1, [%%STATE + _des_args_out + (0*8)] + vmovdqu64 %%T2, [%%STATE + _des_args_LOut + (0*8)] + vmovdqu64 %%T3, [%%STATE + _des_args_out + (8*8)] + vmovdqu64 %%T4, [%%STATE + _des_args_LOut + (8*8)] + vpcmpq k4, %%T1, %%T2, 0 ; EQ + vpcmpq k5, %%T3, %%T4, 0 ; EQ + kmovw DWORD(IA1), k4 + movzx DWORD(IA1), BYTE(IA1) + kmovw DWORD(IA2), k5 + movzx DWORD(IA2), BYTE(IA2) + shl DWORD(IA2), 8 + or DWORD(IA2), DWORD(IA1) + and DWORD(IA0), DWORD(IA2) + jz %%_des_cfb_one_end ; no non-zero lengths left + kmovw k3, DWORD(IA0) +%endif + ;; Calculate ((1 << partial_bytes) - 1) + ;; in order to get the mask for loads and stores + ;; k3 & IA0 - hold valid mask + vmovdqa64 %%T1, [rel des_vec_ones_32b_avx512] + vpsllvd %%T2{k3}{z}, %%T1, %%T0 + vpsubd %%T2{k3}{z}, %%T2, %%T1 + vmovdqu64 [%%T_MASK], %%T2 + + ;; clear selected partial lens not to do them twice + vmovdqu32 [%%STATE + _des_args_PLen]{k3}, %%T10 + + ;; copy IV, in and out pointers + vmovdqu64 %%T1, [%%STATE + _des_args_in + (0*PTR_SZ)] + vmovdqu64 %%T2, [%%STATE + _des_args_in + (8*PTR_SZ)] + vmovdqu64 %%T3, [%%STATE + _des_args_out + (0*PTR_SZ)] + vmovdqu64 %%T4, [%%STATE + _des_args_out + (8*PTR_SZ)] + vmovdqu64 %%T5, [%%STATE + _des_args_IV + (0*64)] + vmovdqu64 %%T6, [%%STATE + _des_args_IV + (1*64)] + vmovdqu64 [%%T_IN + (0*PTR_SZ)], %%T1 + vmovdqu64 [%%T_IN + (8*PTR_SZ)], %%T2 + vmovdqu64 [%%T_OUT + (0*PTR_SZ)], %%T3 + vmovdqu64 [%%T_OUT + (8*PTR_SZ)], %%T4 + vmovdqu64 [%%T_IV + (0*64)], %%T5 + vmovdqu64 [%%T_IV + (1*64)], %%T6 + + ;; calculate last block case mask + ;; - first block case requires no modifications to in/out/IV + vmovdqu64 %%T1, [%%STATE + _des_args_BLen] + vpcmpd k2, %%T1, %%T10, 4 ; NEQ + kmovw DWORD(IA1), k2 + and DWORD(IA1), DWORD(IA0) + jz %%_des_cfb_one_no_last_blocks + + ;; set up IV, in and out for the last block case + ;; - Last block needs in and out to be set differently (decryption only) + ;; - IA1 holds the last block mask +%ifidn %%ENC_DEC, DEC + mov DWORD(IA0), DWORD(IA1) + 
mov DWORD(IA2), DWORD(IA1) + shr DWORD(IA1), 8 + and DWORD(IA2), 0xff + kmovw k4, DWORD(IA2) + kmovw k5, DWORD(IA1) + vmovdqu64 %%T1, [%%STATE + _des_args_LOut + (0*PTR_SZ)] + vmovdqu64 %%T2, [%%STATE + _des_args_LOut + (8*PTR_SZ)] + vmovdqu64 %%T3, [%%STATE + _des_args_LIn + (0*PTR_SZ)] + vmovdqu64 %%T4, [%%STATE + _des_args_LIn + (8*PTR_SZ)] + vmovdqu64 [%%T_OUT + (0*PTR_SZ)]{k4}, %%T1 + vmovdqu64 [%%T_OUT + (8*PTR_SZ)]{k5}, %%T2 + vmovdqu64 [%%T_IN + (0*PTR_SZ)]{k4}, %%T3 + vmovdqu64 [%%T_IN + (8*PTR_SZ)]{k5}, %%T4 +%endif ; decryption + ;; - IV has to be set differently for CFB as well + ;; - IA0 holds the last block mask +%assign IDX 0 +%rep 16 + test DWORD(IA0), (1 << IDX) + jz %%_des_cfb_one_copy_iv_next %+ IDX +%ifidn %%ENC_DEC, ENC + mov IA2, [%%STATE + _des_args_LOut + (IDX*PTR_SZ)] +%else + mov IA2, [%%STATE + _des_args_LIn + (IDX*PTR_SZ)] +%endif + mov IA2, [IA2 - 8] + mov [%%T_IV + (0*4) + (IDX*4)], DWORD(IA2) + shr IA2, 32 + mov [%%T_IV + (16*4) + (IDX*4)], DWORD(IA2) +%%_des_cfb_one_copy_iv_next %+ IDX: +%assign IDX (IDX + 1) +%endrep + +%%_des_cfb_one_no_last_blocks: + ;; Uffff ... 
finally let's do some DES CFB + ;; - let's use T_IN, T_OUT, T_IV and T_MASK + + ;; - load data with the corresponding masks & transpose + ;; - T0 to T15 will hold the data + xor IA0, IA0 +%assign IDX 0 +%assign K_IDX 1 +%rep 16 + mov IA1, [%%T_IN + (IDX*PTR_SZ)] + mov DWORD(IA0), [%%T_MASK + (IDX*4)] + kmovq k %+ K_IDX, IA0 + vmovdqu8 %%T %+ IDX{k %+ K_IDX}{z}, [IA1] +%assign IDX (IDX + 1) +%assign K_IDX (K_IDX + 1) +%if K_IDX > 7 +%assign K_IDX 1 ; iterate through K1 to K7 +%endif +%endrep + ;; - transpose the data in T0 to T15, T16 to T23 are clobbered + TRANSPOSE_IN_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23 + + ;; - set up IV and %%T16 & %%T17 used as IV0 and IV1 + vmovdqu64 %%T16, [%%T_IV + (0 * 64)] ;IV0 + vmovdqu64 %%T17, [%%T_IV + (1 * 64)] ;IV1 + ;; DES encrypt + ;; - R0 - %%T0 + ;; - L0 - %%T1 + DES_ENC_DEC ENC, %%T16, %%T17, %%KS, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13 + ;; CFB style xor with R0/L0 with IV + ;; - IV0 - %%T16 + ;; - IV1 - %%T17 + vpxord %%T2, %%T17, %%T0 ; R0 ^ IV1 + vpxord %%T0, %%T16, %%T1 ; L0 ^ IV0 + vmovdqa64 %%T1, %%T2 + ;; - new R0 = L0 ^ IV0 (%%T0) + ;; - new L0 = R0 ^ IV1 (%%T1) + + ;; Transpose the data out + ;; - %%T2 to %%T24 clobbered + TRANSPOSE_OUT_ONE %%T0, %%T1, %%T2, %%T3, %%T4, %%T5, %%T6, %%T7, %%T8, %%T9, %%T10, %%T11, %%T12, %%T13, %%T14, %%T15, %%T16, %%T17, %%T18, %%T19, %%T20, %%T21, %%T22, %%T23, %%T24 + + ;; Store the transposed data + ;; - T0 to T15 will hold the data + xor IA0, IA0 +%assign IDX 0 +%assign K_IDX 1 +%rep 16 + mov IA1, [%%T_OUT + (IDX*PTR_SZ)] + mov DWORD(IA0), [%%T_MASK + (IDX*4)] + kmovq k %+ K_IDX, IA0 + vmovdqu8 [IA1]{k %+ K_IDX}, %%T %+ IDX +%assign IDX (IDX + 1) +%assign K_IDX (K_IDX + 1) +%if K_IDX > 7 +%assign K_IDX 1 ; iterate through K1 to K7 +%endif +%endrep + +%ifdef SAFE_DATA + ;; Clear copied IV's + vpxorq %%T5, %%T5 + vmovdqu64 
[%%T_IV + (0*64)], %%T5 + vmovdqu64 [%%T_IV + (1*64)], %%T5 +%endif + +%%_des_cfb_one_end: + +%endmacro + +;;; =========================================================================== +;;; Converts length into mask of DES blocks +;;; =========================================================================== +;;; +;;; ASSUMES: SIZE - OFFSET < 64 +%macro GET_MASK8 3 +%define %%MASK %1 ;; [out] GP for mask value (load/store) +%define %%SIZE %2 ;; [in] GP with message size in bytes +%define %%TMP %3 ;; [clobbered] temporary GP + + xor %%MASK, %%MASK + mov %%TMP, %%SIZE + sub %%TMP, OFFSET + shr %%TMP, 3 + bts %%MASK, %%TMP + sub %%MASK, 1 +%endmacro + +;;; =========================================================================== +;;; DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS [in] - pointer to transposed key schedule +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_DES_ENC_CIPHER 2 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS %2 + +%assign RN 0 +%assign LN 1 +%assign RNN 2 +%assign LNN 3 +%rep %%NUM_DES_BLOCKS - 1 + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 + vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 +%assign RN (RN + 2) +%assign LN (LN + 2) +%assign RNN (RNN + 2) +%assign LNN (LNN + 2) +%endrep + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7 + vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 +%endmacro + +;;; =========================================================================== +;;; DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; 
=========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS [in] - pointer to transposed key schedule +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_DES_DEC_CIPHER 2 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS %2 + +%assign RN 0 +%assign LN 1 +%rep %%NUM_DES_BLOCKS + vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round + vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 + vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 + vmovdqa64 ZIV0, ZTMP12 + vmovdqa64 ZIV1, ZTMP13 +%assign RN (RN + 2) +%assign LN (LN + 2) +%endrep +%endmacro + +;;; =========================================================================== +;;; 3DES CBC ENCRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS1 [in] - pointer to transposed key schedule 1 +;;; DES_KS2 [in] - pointer to transposed key schedule 2 +;;; DES_KS3 [in] - pointer to transposed key schedule 3 +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_3DES_ENC_CIPHER 4 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS1 %2 +%define %%DES_KS2 %3 +%define %%DES_KS3 %4 + +%assign RN 0 +%assign LN 1 +%assign RNN 2 +%assign LNN 3 +%rep %%NUM_DES_BLOCKS + ;; ENC + DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; DEC + DES_ENC_DEC DEC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; ENC + 
DES_ENC_DEC ENC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%if (RNN < (%%NUM_DES_BLOCKS * 2)) + vpxord ZW %+ RNN, ZW %+ RNN, ZW %+ LN ; R1 = R1 ^ L0 + vpxord ZW %+ LNN, ZW %+ LNN, ZW %+ RN ; L1 = L1 ^ R0 +%else + vmovdqa64 ZIV0, ZW %+ LN ; IV0 = L7 + vmovdqa64 ZIV1, ZW %+ RN ; IV1 = R7 +%endif + +%assign RN (RN + 2) +%assign LN (LN + 2) +%assign RNN (RNN + 2) +%assign LNN (LNN + 2) +%endrep + +%endmacro + +;;; =========================================================================== +;;; 3DES CBC DECRYPT CIPHER ONLY (1 to 8 DES blocks only) +;;; =========================================================================== +;;; +;;; NUM_DES_BLOCKS [in] - 1 to 8 DES blocks only +;;; DES_KS1 [in] - pointer to transposed key schedule 1 +;;; DES_KS2 [in] - pointer to transposed key schedule 2 +;;; DES_KS3 [in] - pointer to transposed key schedule 3 +;;; +;;; NOTE: clobbers OpMask registers +;;; REQUIRES: ZTMP0 - ZTMP13, ZW0-ZW15 (depends on NUM_DES_BLOCKS), ZIV0, ZIV1 +%macro GEN_3DES_DEC_CIPHER 4 +%define %%NUM_DES_BLOCKS %1 +%define %%DES_KS1 %2 +%define %%DES_KS2 %3 +%define %%DES_KS3 %4 + +%assign RN 0 +%assign LN 1 +%rep %%NUM_DES_BLOCKS + vmovdqa64 ZTMP12, ZW %+ RN ; keep R0 as IV for the next round + vmovdqa64 ZTMP13, ZW %+ LN ; keep L0 as IV for the next round + ;; DEC + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; ENC + DES_ENC_DEC ENC, ZW %+ LN, ZW %+ RN, %%DES_KS2, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + ;; DEC + DES_ENC_DEC DEC, ZW %+ RN, ZW %+ LN, %%DES_KS3, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 + vpxord ZW %+ RN, ZW %+ RN, ZIV1 ; R0 = R0 ^ IV1 + vpxord ZW %+ LN, ZW %+ LN, ZIV0 ; L0 = L0 ^ IV0 + vmovdqa64 ZIV0, ZTMP12 + vmovdqa64 ZIV1, ZTMP13 + +%assign RN (RN + 2) +%assign LN (LN + 2) +%endrep + 
+%endmacro + +;;; =========================================================================== +;;; DES CBC / DOCSIS DES ENCRYPT +;;; =========================================================================== +;;; +;;; DES_DOCSIS [in] - select between DES (DES CBC), DOCSIS (DOCSIS DES) and +;;; 3DES (3DES CBC) +;;; +;;; NOTE: clobbers OpMask registers +%macro GENERIC_DES_ENC 3 +%define %%DES_DOCSIS %1 ;; [in] select between DES (DES CBC), DOCSIS (DOCSIS DES) and 3DES (3DES CBC) +%define %%STATE %2 ;; [in] GP with pointer to OOO manager +%define %%SIZE %3 ;; [in] GP with message size in bytes + + ;; push the registers and allocate the stack frame + mov rax, rsp +%ifnidn %%DES_DOCSIS, 3DES + sub rsp, _key_sched2 ;; no need for schedule 2 and 3 for non-TDES/3DES algos +%else + sub rsp, STACKFRAME_size +%endif + and rsp, -64 + mov [rsp + _rsp_save], rax ; original SP + mov [rsp + _gpr_save + 0*8], r12 + mov [rsp + _gpr_save + 1*8], r13 + mov [rsp + _gpr_save + 2*8], r14 + mov [rsp + _gpr_save + 3*8], r15 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES and DOCSIS DES + DES_INIT {%%STATE + _des_args_keys}, {%%STATE + _des_args_IV}, {rsp + _key_sched}, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%else + ;; 3DES + DES3_INIT {%%STATE + _des_args_keys}, {%%STATE + _des_args_IV}, {rsp + _key_sched}, {rsp + _key_sched2}, {rsp + _key_sched3}, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ENC +%endif + mov [rsp + _size_save], %%SIZE + and %%SIZE, -64 + xor OFFSET, OFFSET + ;; This loop processes message in blocks of 64 bytes. + ;; Anything smaller than 64 bytes is handled separately after the loop. 
+%%_gen_des_enc_loop: + cmp OFFSET, %%SIZE + jz %%_gen_des_enc_loop_end + ;; run loads + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0, [IA0 + OFFSET] + vmovdqu64 ZW1, [IA1 + OFFSET] + vmovdqu64 ZW2, [IA2 + OFFSET] + vmovdqu64 ZW3, [INP0 + OFFSET] + vmovdqu64 ZW4, [INP1 + OFFSET] + vmovdqu64 ZW5, [INP2 + OFFSET] + vmovdqu64 ZW6, [INP3 + OFFSET] + vmovdqu64 ZW7, [INP4 + OFFSET] + + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8, [IA0 + OFFSET] + vmovdqu64 ZW9, [IA1 + OFFSET] + vmovdqu64 ZW10, [IA2 + OFFSET] + vmovdqu64 ZW11, [INP0 + OFFSET] + vmovdqu64 ZW12, [INP1 + OFFSET] + vmovdqu64 ZW13, [INP2 + OFFSET] + vmovdqu64 ZW14, [INP3 + OFFSET] + vmovdqu64 ZW15, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC ENC comes here + vpxord ZW0, ZW0, ZIV0 ; R0 = R0 ^ IV0 + vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 + +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 8, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + + ;; transpose data on output + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, 
ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + ;; run stores + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW0 + vmovdqu64 [IA1 + OFFSET], ZW1 + vmovdqu64 [IA2 + OFFSET], ZW2 + vmovdqu64 [INP0 + OFFSET], ZW3 + vmovdqu64 [INP1 + OFFSET], ZW4 + vmovdqu64 [INP2 + OFFSET], ZW5 + vmovdqu64 [INP3 + OFFSET], ZW6 + vmovdqu64 [INP4 + OFFSET], ZW7 + + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW8 + vmovdqu64 [IA1 + OFFSET], ZW9 + vmovdqu64 [IA2 + OFFSET], ZW10 + vmovdqu64 [INP0 + OFFSET], ZW11 + vmovdqu64 [INP1 + OFFSET], ZW12 + vmovdqu64 [INP2 + OFFSET], ZW13 + vmovdqu64 [INP3 + OFFSET], ZW14 + vmovdqu64 [INP4 + OFFSET], ZW15 + + add OFFSET, 64 + jmp %%_gen_des_enc_loop +%%_gen_des_enc_loop_end: + ;; This is where we check if there is anything less than 64 bytes + ;; of message left for processing. 
+ mov %%SIZE, [rsp + _size_save] + cmp OFFSET, %%SIZE + jz %%_gen_des_enc_part_end + ;; calculate min of bytes_left and 64, convert to qword mask + GET_MASK8 IA0, %%SIZE, IA1 ; IA0 = mask + + kmovw k7, DWORD(IA0) + mov [rsp + _mask_save], IA0 + ;; run masked loads + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] + + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC ENC comes here + vpxord ZW0, ZW0, ZIV0 ; R0 = R0 
^ IV0 + vpxord ZW1, ZW1, ZIV1 ; L0 = L0 ^ IV1 + + mov IA0, [rsp + _mask_save] + cmp BYTE(IA0), 0x0f + ja %%_gt_4 + jz %%_blocks_4 + + cmp BYTE(IA0), 0x03 + ja %%_blocks_3 + jz %%_blocks_2 + + ;; process one block and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 1, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_2: + ;; process two blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 2, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_3: + ;; process three blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 3, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_4: + ;; process four blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 4, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_gt_4: + cmp BYTE(IA0), 0x3f + ja %%_blocks_7 + jz %%_blocks_6 +%%_blocks_5: + ;; process five blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 5, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_6: + ;; process six blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 6, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_7: + ;; process seven blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_ENC_CIPHER 7, rsp + _key_sched +%else + GEN_3DES_ENC_CIPHER 7, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + 
+%%_transpose_out: + ;; transpose data on output + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run masked stores + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 + + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 +%%_gen_des_enc_part_end: + + ;; store IV and update pointers + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, %%STATE, %%SIZE + + ;; CFB part for DOCSIS +%ifidn %%DES_DOCSIS, DOCSIS + DES_CFB_ONE ENC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, 
ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask, %%STATE +%endif + + CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 + + ;; restore stack pointer and registers + mov r12, [rsp + _gpr_save + 0*8] + mov r13, [rsp + _gpr_save + 1*8] + mov r14, [rsp + _gpr_save + 2*8] + mov r15, [rsp + _gpr_save + 3*8] + mov rsp, [rsp + _rsp_save] ; original SP + +%ifdef SAFE_DATA + clear_all_zmms_asm +%else + vzeroupper +%endif ;; SAFE_DATA + +%endmacro + +;;; =========================================================================== +;;; DES CBC / DOCSIS DES DECRYPT +;;; =========================================================================== +;;; +;;; NOTE: clobbers OpMask registers +%macro GENERIC_DES_DEC 3 +%define %%DES_DOCSIS %1 ;; [in] select between DES (DES CBC), DOCSIS (DOCSIS DES) and 3DES (3DES CBC) +%define %%STATE %2 ;; [in] GP with pointer to OOO manager +%define %%SIZE %3 ;; [in] GP with message size in bytes + + ;; push the registers and allocate the stack frame + mov rax, rsp + sub rsp, STACKFRAME_size + and rsp, -64 + mov [rsp + _rsp_save], rax ; original SP + mov [rsp + _gpr_save + 0*8], r12 + mov [rsp + _gpr_save + 1*8], r13 + mov [rsp + _gpr_save + 2*8], r14 + mov [rsp + _gpr_save + 3*8], r15 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES and DOCSIS + DES_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11 +%else + ;; 3DES + DES3_INIT %%STATE + _des_args_keys, %%STATE + _des_args_IV, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3, ZIV0, ZIV1, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, DEC +%endif + + ;; CFB part for DOCSIS +%ifidn %%DES_DOCSIS, DOCSIS + 
DES_CFB_ONE DEC, rsp + _key_sched, ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, rsp + _tmp_in, rsp + _tmp_out, rsp + _tmp_iv, rsp + _tmp_mask, %%STATE +%endif + + mov [rsp + _size_save], %%SIZE + and %%SIZE, -64 + xor OFFSET, OFFSET + ;; This loop processes message in blocks of 64 bytes. + ;; Anything smaller than 64 bytes is handled separately after the loop. +%%_gen_des_dec_loop: + cmp OFFSET, %%SIZE + jz %%_gen_des_dec_loop_end + ;; run loads + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0, [IA0 + OFFSET] + vmovdqu64 ZW1, [IA1 + OFFSET] + vmovdqu64 ZW2, [IA2 + OFFSET] + vmovdqu64 ZW3, [INP0 + OFFSET] + vmovdqu64 ZW4, [INP1 + OFFSET] + vmovdqu64 ZW5, [INP2 + OFFSET] + vmovdqu64 ZW6, [INP3 + OFFSET] + vmovdqu64 ZW7, [INP4 + OFFSET] + + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8, [IA0 + OFFSET] + vmovdqu64 ZW9, [IA1 + OFFSET] + vmovdqu64 ZW10, [IA2 + OFFSET] + vmovdqu64 ZW11, [INP0 + OFFSET] + vmovdqu64 ZW12, [INP1 + OFFSET] + vmovdqu64 ZW13, [INP2 + OFFSET] + vmovdqu64 ZW14, [INP3 + OFFSET] + vmovdqu64 ZW15, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, 
ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + +%ifnidn %%DES_DOCSIS, 3DES + ;; DES CBC DEC comes here + GEN_DES_DEC_CIPHER 8, rsp + _key_sched +%else + ;; 3DES CBC DEC comes here + GEN_3DES_DEC_CIPHER 8, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + + ;; transpose data on output + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run stores + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW0 + vmovdqu64 [IA1 + OFFSET], ZW1 + vmovdqu64 [IA2 + OFFSET], ZW2 + vmovdqu64 [INP0 + OFFSET], ZW3 + vmovdqu64 [INP1 + OFFSET], ZW4 + vmovdqu64 [INP2 + OFFSET], ZW5 + vmovdqu64 [INP3 + OFFSET], ZW6 + vmovdqu64 [INP4 + OFFSET], ZW7 + + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET], ZW8 + vmovdqu64 [IA1 + OFFSET], ZW9 + vmovdqu64 [IA2 + OFFSET], ZW10 + vmovdqu64 [INP0 + OFFSET], ZW11 + vmovdqu64 [INP1 + OFFSET], ZW12 + vmovdqu64 [INP2 + OFFSET], ZW13 + vmovdqu64 [INP3 + OFFSET], ZW14 + vmovdqu64 [INP4 + OFFSET], ZW15 + + add OFFSET, 64 + jmp %%_gen_des_dec_loop 
+%%_gen_des_dec_loop_end: + ;; This is where we check if there is anything less than 64 bytes + ;; of message left for processing. + mov %%SIZE, [rsp + _size_save] + cmp OFFSET, %%SIZE + jz %%_gen_des_dec_part_end + ;; calculate min of bytes_left and 64, convert to qword mask + GET_MASK8 IA0, %%SIZE, IA1 ; IA0 = mask + + kmovw k7, DWORD(IA0) + mov [rsp + _mask_save], IA0 + ;; run masked loads + mov IA0, [%%STATE + _des_args_in + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (7*PTR_SZ)] + vmovdqu64 ZW0{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW1{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW2{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW3{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW4{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW5{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW6{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW7{k7}{z}, [INP4 + OFFSET] + + mov IA0, [%%STATE + _des_args_in + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_in + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_in + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_in + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_in + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_in + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_in + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_in + (15*PTR_SZ)] + vmovdqu64 ZW8{k7}{z}, [IA0 + OFFSET] + vmovdqu64 ZW9{k7}{z}, [IA1 + OFFSET] + vmovdqu64 ZW10{k7}{z}, [IA2 + OFFSET] + vmovdqu64 ZW11{k7}{z}, [INP0 + OFFSET] + vmovdqu64 ZW12{k7}{z}, [INP1 + OFFSET] + vmovdqu64 ZW13{k7}{z}, [INP2 + OFFSET] + vmovdqu64 ZW14{k7}{z}, [INP3 + OFFSET] + vmovdqu64 ZW15{k7}{z}, [INP4 + OFFSET] + + ;; Transpose input + TRANSPOSE16_U32 ZW0, ZW1, ZW2, ZW3, ZW4, ZW5, ZW6, ZW7, ZW8, ZW9, ZW10, ZW11, ZW12, ZW13, ZW14, ZW15, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, 
ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; DES CBC DEC comes here + mov IA0, [rsp + _mask_save] + cmp BYTE(IA0), 0x0f + ja %%_gt_4 + jz %%_blocks_4 + + cmp BYTE(IA0), 0x03 + ja %%_blocks_3 + jz %%_blocks_2 + ;; process one block and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 1, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 1, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_2: + ;; process two blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 2, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 2, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_3: + ;; process three blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 3, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 3, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_4: + ;; process four blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 4, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 4, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_gt_4: + cmp BYTE(IA0), 0x3f + ja %%_blocks_7 + jz %%_blocks_6 +%%_blocks_5: + ;; process five blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 5, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 5, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_6: + ;; process six blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 6, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 6, rsp + _key_sched, rsp + _key_sched2, rsp + _key_sched3 +%endif + jmp %%_transpose_out + +%%_blocks_7: + ;; process seven blocks and move to transpose out +%ifnidn %%DES_DOCSIS, 3DES + GEN_DES_DEC_CIPHER 7, rsp + _key_sched +%else + GEN_3DES_DEC_CIPHER 7, rsp + _key_sched, rsp + 
_key_sched2, rsp + _key_sched3 +%endif + +%%_transpose_out: + ;; transpose data on output + TRANSPOSE16_U32 ZW1, ZW0, ZW3, ZW2, ZW5, ZW4, ZW7, ZW6, ZW9, ZW8, ZW11, ZW10, ZW13, ZW12, ZW15, ZW14, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, ZTMP5, ZTMP6, ZTMP7, ZTMP8, ZTMP9, ZTMP10, ZTMP11, ZTMP12, ZTMP13 + + ;; run masked stores + mov IA0, [%%STATE + _des_args_out + (0*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (1*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (2*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (3*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (4*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (5*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (6*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (7*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW0 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW1 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW2 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW3 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW4 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW5 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW6 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW7 + + mov IA0, [%%STATE + _des_args_out + (8*PTR_SZ)] + mov IA1, [%%STATE + _des_args_out + (9*PTR_SZ)] + mov IA2, [%%STATE + _des_args_out + (10*PTR_SZ)] + mov INP0, [%%STATE + _des_args_out + (11*PTR_SZ)] + mov INP1, [%%STATE + _des_args_out + (12*PTR_SZ)] + mov INP2, [%%STATE + _des_args_out + (13*PTR_SZ)] + mov INP3, [%%STATE + _des_args_out + (14*PTR_SZ)] + mov INP4, [%%STATE + _des_args_out + (15*PTR_SZ)] + vmovdqu64 [IA0 + OFFSET]{k7}, ZW8 + vmovdqu64 [IA1 + OFFSET]{k7}, ZW9 + vmovdqu64 [IA2 + OFFSET]{k7}, ZW10 + vmovdqu64 [INP0 + OFFSET]{k7}, ZW11 + vmovdqu64 [INP1 + OFFSET]{k7}, ZW12 + vmovdqu64 [INP2 + OFFSET]{k7}, ZW13 + vmovdqu64 [INP3 + OFFSET]{k7}, ZW14 + vmovdqu64 [INP4 + OFFSET]{k7}, ZW15 +%%_gen_des_dec_part_end: + + ;; store IV and update pointers + DES_FINISH ZIV0, ZIV1, ZTMP0, ZTMP1, ZTMP2, ZTMP3, ZTMP4, %%STATE, %%SIZE + + CLEAR_KEY_SCHEDULE %%DES_DOCSIS, ZW0 + + ;; restore stack pointer and registers + mov r12, [rsp + _gpr_save + 0*8] + mov r13, 
[rsp + _gpr_save + 1*8] + mov r14, [rsp + _gpr_save + 2*8] + mov r15, [rsp + _gpr_save + 3*8] + mov rsp, [rsp + _rsp_save] ; original SP + +%ifdef SAFE_DATA + clear_all_zmms_asm +%else + vzeroupper +%endif ;; SAFE_DATA + +%endmacro diff --git a/lib/include/gcm_api_avx2_avx512.inc b/lib/include/gcm_api_avx2_avx512.inc new file mode 100644 index 0000000000000000000000000000000000000000..d69ec1f6d91aa6e9b408667281d3aa110b7a3c10 --- /dev/null +++ b/lib/include/gcm_api_avx2_avx512.inc @@ -0,0 +1,1183 @@ +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; Copyright(c) 2024, Intel Corporation All rights reserved. +; +; Redistribution and use in source and binary forms, with or without +; modification, are permitted provided that the following conditions +; are met: +; * Redistributions of source code must retain the above copyright +; notice, this list of conditions and the following disclaimer. +; * Redistributions in binary form must reproduce the above copyright +; notice, this list of conditions and the following disclaimer in +; the documentation and/or other materials provided with the +; distribution. +; * Neither the name of Intel Corporation nor the names of its +; contributors may be used to endorse or promote products derived +; from this software without specific prior written permission. +; +; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +; A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%use smartalign + +%include "include/gcm_common_avx2_avx512.inc" + +mksection .text +default rel + +extern ghash_internal_avx_gen4 +extern partial_block_gmac_avx_gen4 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_precomp_128_avx_gen4 / +; aes_gcm_precomp_192_avx_gen4 / +; aes_gcm_precomp_256_avx_gen4 / +; aes_gcm_precomp_128_avx512 / +; aes_gcm_precomp_192_avx512 / +; aes_gcm_precomp_256_avx512 +; (struct gcm_key_data *key_data) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(precomp,_),function,) +MKGLOBAL(FN_NAME_AVX512(precomp,_),function,) +FN_NAME(precomp,_): +FN_NAME_AVX512(precomp,_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_precomp +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + sub rsp, 1*16 + ; only xmm6 needs to be maintained + vmovdqu [rsp + 0*16],xmm6 +%endif + + vpxor xmm6, xmm6 + ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey + + vpshufb xmm6, [rel SHUF_MASK] + ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; + vmovdqa xmm2, xmm6 + vpsllq xmm6, xmm6, 1 + vpsrlq xmm2, xmm2, 63 + vmovdqa xmm1, xmm2 + vpslldq xmm2, xmm2, 8 + vpsrldq xmm1, xmm1, 8 + vpor xmm6, xmm6, xmm2 + ;reduction + vpshufd xmm2, xmm1, 00100100b + vpcmpeqd xmm2, [rel TWOONE] + vpand 
xmm2, xmm2, [rel POLY] + vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly + + PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + add rsp, 1*16 +%endif + +exit_precomp: + + ret + +%ifdef SAFE_PARAM +error_precomp: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + + jmp exit_precomp +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 / +; aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u8 *aad, +; u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(init,_),function,) +MKGLOBAL(FN_NAME_AVX512(init,_),function,) +FN_NAME(init,_): +FN_NAME_AVX512(init,_): + endbranch64 + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + lea r14, [rsp + 4*8] + ; xmm6 needs to be maintained for Windows + sub rsp, 1*16 + vmovdqu [rsp + 0*16], xmm6 +%endif + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_init + + ;; Check context_data != NULL + or arg2, arg2 + jz error_init + + ;; Check IV != NULL + or arg3, arg3 + jz error_init + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_init + + ;; Check aad != NULL (aad_len != 0) + or arg4, arg4 + jz error_init + +skip_aad_check_init: +%endif + GCM_INIT arg1, arg2, 
arg3, arg4, arg5 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +exit_init: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6 , [rsp + 0*16] + add rsp, 1*16 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + +%ifdef SAFE_PARAM +error_init: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV + + ;; Check if aad_len == 0 + cmp arg5, 0 + jz skip_aad_check_error_init + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_init: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_init +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 / +; aes_gcm_init_var_iv_256_avx_gen4 +; aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 / +; aes_gcm_init_var_iv_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *iv, +; const u64 iv_len, +; const u8 *aad, +; const u64 aad_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(init_var_iv,_),function,) +MKGLOBAL(FN_NAME_AVX512(init_var_iv,_),function,) +FN_NAME(init_var_iv,_): +FN_NAME_AVX512(init_var_iv,_): + endbranch64 + push r12 + push r13 +%ifidn __OUTPUT_FORMAT__, win64 + push r14 + push r15 + lea r14, [rsp + 4*8] + ; xmm6 & xmm14 need to be maintained for Windows + sub rsp, 2*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm14 +%endif + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_init_IV + + ;; Check context_data != NULL + or arg2, arg2 + jz error_init_IV + + 
;; Check IV != NULL + or arg3, arg3 + jz error_init_IV + + ;; Check iv_len != 0 + or arg4, arg4 + jz error_init_IV + + ;; Check if aad_len == 0 + cmp arg6, 0 + jz skip_aad_check_init_IV + + ;; Check aad != NULL (aad_len != 0) + cmp arg5, 0 + jz error_init_IV + +skip_aad_check_init_IV: +%endif + + GCM_INIT arg1, arg2, arg3, arg5, arg6, arg4 + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +exit_init_IV: + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm6, [rsp + 0*16] + vmovdqu xmm14, [rsp + 1*16] + add rsp, 2*16 + pop r15 + pop r14 +%endif + pop r13 + pop r12 + ret + +%ifdef SAFE_PARAM +error_init_IV: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV + + ;; Check iv_len != 0 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_IV_LEN + + ;; Check if aad_len == 0 + cmp arg6, 0 + jz skip_aad_check_error_init_IV + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg5, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_init_IV: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_init_IV +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / +; aes_gcm_enc_128_update_avx_gen4 / +; aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 / +; aes_gcm_enc_256_update_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_update_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_update_),function,) +FN_NAME(enc,_update_): +FN_NAME_AVX512(enc,_update_): + endbranch64 + FUNC_SAVE + +%ifdef 
SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_update_enc + + ;; Check context_data != NULL + or arg2, arg2 + jz error_update_enc + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz error_update_enc + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_update_enc + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_update_enc + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_update_enc +%endif + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call + +exit_update_enc: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_update_enc: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_update_enc + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_update_enc: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_update_enc +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / +; aes_gcm_dec_256_update_avx_gen4 / +; aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 / +; aes_gcm_dec_256_update_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_update_),function,) 
+MKGLOBAL(FN_NAME_AVX512(dec,_update_),function,) +FN_NAME(dec,_update_): +FN_NAME_AVX512(dec,_update_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_update_dec + + ;; Check context_data != NULL + or arg2, arg2 + jz error_update_dec + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz error_update_dec + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_update_dec + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_update_dec + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_update_dec +%endif + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call + +exit_update_dec: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_update_dec: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check if plaintext_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_update_dec + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (plaintext_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_update_dec: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_update_dec +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / +; aes_gcm_enc_256_finalize_avx_gen4 / +; aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 / +; aes_gcm_enc_256_finalize_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_finalize_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_finalize_),function,) +FN_NAME(enc,_finalize_): +FN_NAME_AVX512(enc,_finalize_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_enc_fin + + ;; Check context_data != NULL + or arg2, arg2 + jz error_enc_fin + + ;; Check auth_tag != NULL + or arg3, arg3 + jz error_enc_fin + + ;; Check auth_tag_len == 0 or > 16 + or arg4, arg4 + jz error_enc_fin + + cmp arg4, 16 + ja error_enc_fin +%endif + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 7*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm10 + vmovdqu [rsp + 3*16], xmm11 + vmovdqu [rsp + 4*16], xmm13 + vmovdqu [rsp + 5*16], xmm14 + vmovdqu [rsp + 6*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif + +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 6*16] + vmovdqu xmm14, [rsp + 5*16] + vmovdqu xmm13, [rsp + 4*16] + vmovdqu xmm11, [rsp + 3*16] + vmovdqu xmm10, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + add rsp, 7*16 +%endif + pop r12 +exit_enc_fin: + ret + +%ifdef SAFE_PARAM +error_enc_fin: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_enc_fin +%endif + 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 +; aes_gcm_dec_256_finalize_avx_gen4 / +; aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 +; aes_gcm_dec_256_finalize_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_finalize_),function,) +MKGLOBAL(FN_NAME_AVX512(dec,_finalize_),function,) +FN_NAME(dec,_finalize_): +FN_NAME_AVX512(dec,_finalize_): + endbranch64 +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Check key_data != NULL + or arg1, arg1 + jz error_dec_fin + + ;; Check context_data != NULL + or arg2, arg2 + jz error_dec_fin + + ;; Check auth_tag != NULL + or arg3, arg3 + jz error_dec_fin + + ;; Check auth_tag_len == 0 or > 16 + or arg4, arg4 + jz error_dec_fin + + cmp arg4, 16 + ja error_dec_fin +%endif + + push r12 + +%ifidn __OUTPUT_FORMAT__, win64 + ; xmm6:xmm15 need to be maintained for Windows + sub rsp, 7*16 + vmovdqu [rsp + 0*16], xmm6 + vmovdqu [rsp + 1*16], xmm9 + vmovdqu [rsp + 2*16], xmm10 + vmovdqu [rsp + 3*16], xmm11 + vmovdqu [rsp + 4*16], xmm13 + vmovdqu [rsp + 5*16], xmm14 + vmovdqu [rsp + 6*16], xmm15 +%endif + GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call + +%ifdef SAFE_DATA + clear_scratch_xmms_avx_asm +%endif +%ifidn __OUTPUT_FORMAT__, win64 + vmovdqu xmm15, [rsp + 6*16] + vmovdqu xmm14, [rsp + 5*16] + vmovdqu xmm13, [rsp + 4*16] + vmovdqu xmm11, [rsp + 3*16] + vmovdqu xmm10, [rsp + 2*16] + vmovdqu xmm9, [rsp + 1*16] + vmovdqu xmm6, [rsp + 0*16] + add rsp, 7*16 +%endif + + pop r12 + +exit_dec_fin: + ret + +%ifdef SAFE_PARAM +error_dec_fin: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, 
IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_dec_fin +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 / +; aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc,_),function,) +MKGLOBAL(FN_NAME_AVX512(enc,_),function,) +FN_NAME(enc,_): +FN_NAME_AVX512(enc,_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_enc + + ;; Check context_data != NULL + or arg2, arg2 + jz error_enc + + ;; Check IV != NULL + cmp arg6, 0 + jz error_enc + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz error_enc + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz error_enc + + cmp arg10, 16 + ja error_enc + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_enc + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_enc + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_enc + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_enc + +skip_in_out_check_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_enc + + ;; Check 
aad != NULL (aad_len != 0) + cmp arg7, 0 + jz error_enc + +skip_aad_check_enc: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, single_call + +exit_enc: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_enc: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_error_enc + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_enc: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_error_enc + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_enc: + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_enc +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 / +; aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512 +; (const struct gcm_key_data *key_data, +; struct gcm_context_data *context_data, +; u8 *out, +; const u8 *in, +; u64 msg_len, +; u8 *iv, +; const u8 *aad, +; u64 aad_len, +; u8 *auth_tag, +; u64 auth_tag_len); 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec,_),function,) +MKGLOBAL(FN_NAME_AVX512(dec,_),function,) +FN_NAME(dec,_): +FN_NAME_AVX512(dec,_): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET + + ;; Load max len to reg on windows + INIT_GCM_MAX_LENGTH + + ;; Check key_data != NULL + or arg1, arg1 + jz error_dec + + ;; Check context_data != NULL + or arg2, arg2 + jz error_dec + + ;; Check IV != NULL + cmp arg6, 0 + jz error_dec + + ;; Check auth_tag != NULL + cmp arg9, 0 + jz error_dec + + ;; Check auth_tag_len == 0 or > 16 + cmp arg10, 0 + jz error_dec + + cmp arg10, 16 + ja error_dec + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz skip_in_out_check_dec + + ;; Check if msg_len > max_len + cmp arg5, GCM_MAX_LENGTH + ja error_dec + + ;; Check out != NULL (msg_len != 0) + or arg3, arg3 + jz error_dec + + ;; Check in != NULL (msg_len != 0) + or arg4, arg4 + jz error_dec + +skip_in_out_check_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_dec + + ;; Check aad != NULL (aad_len != 0) + cmp arg7, 0 + jz error_dec + +skip_aad_check_dec: +%endif + GCM_INIT arg1, arg2, arg6, arg7, arg8 + + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + + GCM_COMPLETE arg1, arg2, arg9, arg10, single_call + +exit_dec: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +error_dec: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check IV != NULL + IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV + + ;; Check auth_tag != NULL + IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH + + ;; Check auth_tag_len == 0 or > 16 + IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN + + IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN + + ;; Check if msg_len == 0 + cmp arg5, 0 + jz 
skip_in_out_check_error_dec + + ;; Check if msg_len > max_len + IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN + + ;; Check out != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC + +skip_in_out_check_error_dec: + ;; Check if aad_len == 0 + cmp arg8, 0 + jz skip_aad_check_error_dec + + ;; Check aad != NULL (aad_len != 0) + IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD + +skip_aad_check_error_dec: + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp exit_dec +%endif + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK +; +;IMB_JOB * aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 / +; aes_gcm_enc_var_iv_256_avx_gen4 / +; aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 / +; aes_gcm_enc_var_iv_256_avx512 +; (IMB_MGR *state, IMB_JOB *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) +MKGLOBAL(FN_NAME_AVX512(enc_var_iv,_),function,internal) +FN_NAME(enc_var_iv,_): +FN_NAME_AVX512(enc_var_iv,_): + endbranch64 + FUNC_SAVE alloc_context + + mov arg1, [arg2 + _enc_keys] + + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ + {qword [arg2 + _iv_len_in_bytes]} + + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, [arg2 + _dst] + mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer + mov arg2, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, ENC, single_call + + mov arg2, [rsp + GP_OFFSET + 5*8] + GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + single_call + + ;; mark job 
complete + mov dword [arg2 + _status], IMB_STATUS_COMPLETED + + mov rax, arg2 ;; return the job + + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; +; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK +; +;IMB_JOB *aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 / +; aes_gcm_dec_var_iv_256_avx_gen4 / +; aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 / +; aes_gcm_dec_var_iv_256_avx512 +; (IMB_MGR *state, IMB_JOB *job) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) +MKGLOBAL(FN_NAME_AVX512(dec_var_iv,_),function,internal) +FN_NAME(dec_var_iv,_): +FN_NAME_AVX512(dec_var_iv,_): + endbranch64 + FUNC_SAVE alloc_context + + mov arg1, [arg2 + _dec_keys] + + GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ + {qword [arg2 + _iv_len_in_bytes]} + + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, [arg2 + _dst] + mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer + mov arg2, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, DEC, single_call + + mov arg2, [rsp + GP_OFFSET + 5*8] + GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + single_call + + ;; mark job complete + mov dword [arg2 + _status], IMB_STATUS_COMPLETED + + mov rax, arg2 ;; return the job + + FUNC_RESTORE + ret + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;void imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 / +; imb_aes_gmac_update_256_avx_gen4 +; imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 / +; imb_aes_gmac_update_256_avx512 ( +; const struct gcm_key_data 
*key_data, +; struct gcm_context_data *context_data, +; const u8 *in, +; const u64 msg_len); +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 +MKGLOBAL(GMAC_FN_NAME(update),function,) +MKGLOBAL(GMAC_FN_NAME_AVX512(update),function,) +GMAC_FN_NAME(update): +GMAC_FN_NAME_AVX512(update): + endbranch64 + FUNC_SAVE + +%ifdef SAFE_PARAM + ;; Reset imb_errno + IMB_ERR_CHECK_RESET +%endif + ;; Check if msg_len == 0 + or arg4, arg4 + je .exit_gmac_update + +%ifdef SAFE_PARAM + ;; Check key_data != NULL + or arg1, arg1 + jz .error_gmac_update + + ;; Check context_data != NULL + or arg2, arg2 + jz .error_gmac_update + + ;; Check in != NULL (msg_len != 0) + or arg3, arg3 + jz .error_gmac_update +%endif + + ; Increment size of "AAD length" for GMAC + add [arg2 + AadLen], arg4 + + vmovdqu xmm0, [arg2 + AadHash] + + cmp qword [arg2 + PBlockLen], 0 + je .partial_block_is_zero_len + + ;; Deal with previous partial block + vmovdqu xmm13, [arg1 + HashKey_1] + vmovdqu xmm14, [arg1 + HashKeyK_1] + ;; arg2 = [in] context + ;; arg3 = [in] message pointer + ;; arg4 = [in] message length + ;; xmm0 = [in/out] hash + ;; xmm13/xmm14 = [in] hash keys + call partial_block_gmac_avx_gen4 + ;; r11 = bytes processed + + ; CALC_AAD_HASH needs to deal with multiple of 16 bytes + sub arg4, r11 + add arg3, r11 + +.partial_block_is_zero_len: + vmovq xmm7, arg4 ; Save remaining length + and arg4, -16 ; Get multiple of 16 bytes + + or arg4, arg4 + jz .no_full_blocks + + ;; Calculate GHASH of this segment + mov r12, arg3 + mov r13, arg4 + ;; arg1 = key + ;; xmm0 = hash in/out + call ghash_internal_avx_gen4 + + vmovdqu [arg2 + AadHash], xmm0 ; ctx_data.aad hash = aad_hash + +.no_full_blocks: + add arg3, arg4 ; Point at partial block + + vmovq arg4, xmm7 ; Restore original remaining length + and arg4, 15 + jz .exit_gmac_update + + ; Save next partial block + mov [arg2 + PBlockLen], arg4 +%ifdef IS_AVX2_GCM + READ_SMALL_DATA_INPUT_AVX xmm1, arg3, 
arg4, r11 +%else + READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 +%endif + vpshufb xmm1, xmm1, [rel SHUF_MASK] + vpxor xmm0, xmm0, xmm1 + vmovdqu [arg2 + AadHash], xmm0 + +.exit_gmac_update: + FUNC_RESTORE + + ret + +%ifdef SAFE_PARAM +.error_gmac_update: + ;; Clear reg and imb_errno + IMB_ERR_CHECK_START rax + + ;; Check key_data != NULL + IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY + + ;; Check context_data != NULL + IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX + + ;; Check in != NULL (msg_len != 0) + IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC + + ;; Set imb_errno + IMB_ERR_CHECK_END rax + jmp .exit_gmac_update +%endif + +mksection stack-noexec diff --git a/lib/include/gcm_api_vaes_avx512.inc b/lib/include/gcm_api_vaes_avx512.inc index a897c13c5f40fdff95de327eae4bc4d658c35953..e3fc9cdde5239bf186c10f95fa38ddd999c7f063 100644 --- a/lib/include/gcm_api_vaes_avx512.inc +++ b/lib/include/gcm_api_vaes_avx512.inc @@ -36,6 +36,11 @@ %ifndef GCM_API_VAES_AVX512_INC %define GCM_API_VAES_AVX512_INC +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; External symbols +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +extern ghash_internal_vaes_avx512 + mksection .text default rel @@ -45,6 +50,7 @@ default rel ; aes_gcm_precomp_256_vaes_avx512 ; (struct gcm_key_data *key_data) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(FN_NAME(precomp,_),function,) FN_NAME(precomp,_): endbranch64 @@ -118,6 +124,7 @@ error_precomp: ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(FN_NAME(enc,_),function,) FN_NAME(enc,_): endbranch64 @@ -132,69 +139,74 @@ FN_NAME(enc,_): ;; Check key_data != NULL cmp arg1, 0 - jz error_enc + jz .error_enc ;; Check context_data != NULL cmp arg2, 0 - jz error_enc + jz .error_enc ;; Check IV != NULL cmp arg6, 0 - jz error_enc + jz .error_enc ;; Check auth_tag != NULL cmp arg9, 0 - jz error_enc + jz .error_enc ;; Check 
auth_tag_len == 0 or > 16 cmp arg10, 0 - jz error_enc + jz .error_enc cmp arg10, 16 - ja error_enc + ja .error_enc ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_enc + jz .skip_in_out_check_enc ;; Check if msg_len > max_len cmp arg5, GCM_MAX_LENGTH - ja error_enc + ja .error_enc ;; Check out != NULL (msg_len != 0) cmp arg3, 0 - jz error_enc + jz .error_enc ;; Check in != NULL (msg_len != 0) cmp arg4, 0 - jz error_enc + jz .error_enc -skip_in_out_check_enc: +.skip_in_out_check_enc: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_enc + jz .skip_aad_check_enc ;; Check aad != NULL (aad_len != 0) cmp arg7, 0 - jz error_enc + jz .error_enc -skip_aad_check_enc: +.skip_aad_check_enc: %endif + ;; Check if msg_len <= 256 + cmp arg5, 16 * 16 + jbe .small_packet_path + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call + GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call, '>256' GCM_COMPLETE arg1, arg2, arg9, arg10, single_call, k1, r10, r11, r12 -%ifdef SAFE_DATA - clear_zmms_avx512 xmm6 -%endif + jmp .exit_enc + +.small_packet_path: + GCM_ENC_DEC_0_TO_256 arg1, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, ENC -exit_enc: +.exit_enc: FUNC_RESTORE ret %ifdef SAFE_PARAM -error_enc: +.error_enc: ;; Clear reg and imb_errno IMB_ERR_CHECK_START rax @@ -217,7 +229,7 @@ error_enc: ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_error_enc + jz .skip_in_out_check_error_enc ;; Check if msg_len > max_len IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN @@ -228,18 +240,18 @@ error_enc: ;; Check in != NULL (msg_len != 0) IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC -skip_in_out_check_error_enc: +.skip_in_out_check_error_enc: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_error_enc + jz 
.skip_aad_check_error_enc ;; Check aad != NULL (aad_len != 0) IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD -skip_aad_check_error_enc: +.skip_aad_check_error_enc: ;; Set imb_errno IMB_ERR_CHECK_END rax - jmp exit_enc + jmp .exit_enc %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -255,6 +267,7 @@ skip_aad_check_error_enc: ; u8 *auth_tag, ; u64 auth_tag_len); ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(FN_NAME(dec,_),function,) FN_NAME(dec,_): endbranch64 @@ -269,68 +282,77 @@ FN_NAME(dec,_): ;; Check key_data != NULL cmp arg1, 0 - jz error_dec + jz .error_dec ;; Check context_data != NULL cmp arg2, 0 - jz error_dec + jz .error_dec ;; Check IV != NULL cmp arg6, 0 - jz error_dec + jz .error_dec ;; Check auth_tag != NULL cmp arg9, 0 - jz error_dec + jz .error_dec ;; Check auth_tag_len == 0 or > 16 cmp arg10, 0 - jz error_dec + jz .error_dec cmp arg10, 16 - ja error_dec + ja .error_dec ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_dec + jz .skip_in_out_check_dec ;; Check if msg_len > max_len cmp arg5, GCM_MAX_LENGTH - ja error_dec + ja .error_dec ;; Check out != NULL (msg_len != 0) cmp arg3, 0 - jz error_dec + jz .error_dec ;; Check in != NULL (msg_len != 0) cmp arg4, 0 - jz error_dec + jz .error_dec -skip_in_out_check_dec: +.skip_in_out_check_dec: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_dec + jz .skip_aad_check_dec ;; Check aad != NULL (aad_len != 0) cmp arg7, 0 - jz error_dec + jz .error_dec -skip_aad_check_dec: +.skip_aad_check_dec: %endif + ;; Check if msg_len <= 256 + cmp arg5, 16 * 16 + jbe .small_packet_path + GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, zmm11, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call + GCM_ENC_DEC arg1, arg2, 
arg3, arg4, arg5, DEC, single_call, '>256' GCM_COMPLETE arg1, arg2, arg9, arg10, single_call, k1, r10, r11, r12 %ifdef SAFE_DATA clear_zmms_avx512 xmm6 %endif -exit_dec: + jmp .exit_dec + +.small_packet_path: + GCM_ENC_DEC_0_TO_256 arg1, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, DEC + +.exit_dec: FUNC_RESTORE ret %ifdef SAFE_PARAM -error_dec: +.error_dec: ;; Clear reg and imb_errno IMB_ERR_CHECK_START rax @@ -353,7 +375,7 @@ error_dec: ;; Check if msg_len == 0 cmp arg5, 0 - jz skip_in_out_check_error_dec + jz .skip_in_out_check_error_dec ;; Check if msg_len > max_len IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN @@ -364,19 +386,19 @@ error_dec: ;; Check in != NULL (msg_len != 0) IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC -skip_in_out_check_error_dec: +.skip_in_out_check_error_dec: ;; Check if aad_len == 0 cmp arg8, 0 - jz skip_aad_check_error_dec + jz .skip_aad_check_error_dec ;; Check aad != NULL (aad_len != 0) IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD -skip_aad_check_error_dec: +.skip_aad_check_error_dec: ;; Set imb_errno IMB_ERR_CHECK_END rax - jmp exit_dec + jmp .exit_dec %endif ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -386,37 +408,30 @@ skip_aad_check_error_dec: ;IMB_JOB *aes_gcm_enc_var_iv_128_vaes_avx512 / aes_gcm_enc_var_iv_192_vaes_avx512 / ; aes_gcm_enc_var_iv_256_vaes_avx512(IMB_MGR *state, IMB_JOB *job) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) FN_NAME(enc_var_iv,_): endbranch64 FUNC_SAVE alloc_context - mov arg1, [arg2 + _enc_keys] + ;; Check if msg_len <= 256 + cmp qword [arg2 + _msg_len_to_cipher], 16 * 16 + jbe .small_packet_path - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_enc_IV + mov arg1, [arg2 + _enc_keys] GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ r10, 
r11, r12, k1, xmm14, xmm2, \ zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call, \ - {[arg2 + _iv_len_in_bytes]} - jmp skip_iv_len_12_enc_IV - -iv_len_12_enc_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ - r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call + {qword [arg2 + _iv_len_in_bytes]} -skip_iv_len_12_enc_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] mov rbp, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, ENC, single_call + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, ENC, single_call, '>256' GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ @@ -426,6 +441,27 @@ skip_iv_len_12_enc_IV: clear_zmms_avx512 xmm1, xmm4, xmm6, xmm7, xmm8, xmm12, xmm13, xmm14, \ xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26, xmm30, xmm31 %endif + jmp .exit_enc + +align 32 +.small_packet_path: + mov arg1, [arg2 + _enc_keys] + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, [arg2 + _dst] + mov rbp, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC_0_TO_256 arg1, arg4, arg3, rbp, \ + {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + ENC, {qword [arg2 + _iv_len_in_bytes]} + +%ifdef SAFE_DATA + clear_zmms_avx512 xmm0, xmm1, xmm2, xmm7, xmm8, xmm9, xmm11, xmm10, xmm14, \ + xmm15, xmm16, xmm17, xmm18, xmm20, xmm21 +%endif + +.exit_enc: ;; mark job complete mov dword [arg2 + _status], IMB_STATUS_COMPLETED @@ -441,15 +477,17 @@ skip_iv_len_12_enc_IV: ;IMB_JOB *aes_gcm_dec_var_iv_128_vaes_avx512 / 
aes_gcm_dec_var_iv_192_vaes_avx512 / ; aes_gcm_dec_var_iv_256_vaes_avx512(IMB_MGR *state, IMB_JOB *job) ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +align 32 MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) FN_NAME(dec_var_iv,_): endbranch64 FUNC_SAVE alloc_context - mov arg1, [arg2 + _dec_keys] + ;; Check if msg_len <= 256 + cmp qword [arg2 + _msg_len_to_cipher], 16 * 16 + jbe .small_packet_path - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_dec_IV + mov arg1, [arg2 + _dec_keys] GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ @@ -457,21 +495,12 @@ FN_NAME(dec_var_iv,_): zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call, \ {qword [arg2 + _iv_len_in_bytes]} - jmp skip_iv_len_12_dec_IV -iv_len_12_dec_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ - r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, single_call - -skip_iv_len_12_dec_IV: mov arg3, [arg2 + _src] add arg3, [arg2 + _cipher_start_src_offset] mov arg4, [arg2 + _dst] mov rbp, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, DEC, single_call + GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, rbp, DEC, single_call, '>256' GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ @@ -481,6 +510,25 @@ skip_iv_len_12_dec_IV: clear_zmms_avx512 xmm1, xmm4, xmm6, xmm7, xmm8, xmm12, xmm13, xmm14, \ xmm16, xmm17, xmm18, xmm19, xmm20, xmm21, xmm26, xmm30, xmm31 %endif + jmp .exit_dec + +align 32 +.small_packet_path: + mov arg1, [arg2 + _dec_keys] + mov arg3, [arg2 + _src] + add arg3, [arg2 + _cipher_start_src_offset] + mov arg4, 
[arg2 + _dst] + mov rbp, [arg2 + _msg_len_to_cipher] + GCM_ENC_DEC_0_TO_256 arg1, arg4, arg3, rbp, \ + {[arg2 + _iv]}, \ + {[arg2 + _gcm_aad]}, {qword [arg2 + _gcm_aad_len]}, \ + {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ + DEC, {qword [arg2 + _iv_len_in_bytes]} + +%ifdef SAFE_DATA + clear_zmms_avx512 xmm2, xmm3, xmm4, xmm5, xmm9, xmm15, xmm16, xmm17, xmm18, xmm19, xmm20, xmm21 +%endif +.exit_dec: ;; mark job complete mov dword [arg2 + _status], IMB_STATUS_COMPLETED diff --git a/lib/include/gcm_avx512.inc b/lib/include/gcm_avx512.inc index 2c97259683a25a07880bd006af5b005183d823b5..79bd5295aab59f2ad254ed3718a7c20ed5d74c91 100644 --- a/lib/include/gcm_avx512.inc +++ b/lib/include/gcm_avx512.inc @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2018-2023, Intel Corporation All rights reserved. +; Copyright(c) 2018-2024, Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -28,4 +28,4 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; %define IS_AVX512_GCM -%include "include/gcm_common_avx2_avx512.inc" +%include "include/gcm_api_avx2_avx512.inc" diff --git a/lib/include/gcm_avx_gen4.inc b/lib/include/gcm_avx_gen4.inc deleted file mode 100644 index 12f596c47c161cd90b9aaefbcee5f8bb6f4a03f6..0000000000000000000000000000000000000000 --- a/lib/include/gcm_avx_gen4.inc +++ /dev/null @@ -1,31 +0,0 @@ -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2023, Intel Corporation All rights reserved. -; -; Redistribution and use in source and binary forms, with or without -; modification, are permitted provided that the following conditions -; are met: -; * Redistributions of source code must retain the above copyright -; notice, this list of conditions and the following disclaimer. 
-; * Redistributions in binary form must reproduce the above copyright -; notice, this list of conditions and the following disclaimer in -; the documentation and/or other materials provided with the -; distribution. -; * Neither the name of Intel Corporation nor the names of its -; contributors may be used to endorse or promote products derived -; from this software without specific prior written permission. -; -; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - -%define IS_AVX2_GCM -%include "include/gcm_common_avx2_avx512.inc" diff --git a/lib/include/gcm_common_avx2_avx512.inc b/lib/include/gcm_common_avx2_avx512.inc index 0c39e1b361b6a27573b5db4918d3a7dc21c1c761..31d5ce95efa0c1608c432a0e13652c4dcb47ae3d 100644 --- a/lib/include/gcm_common_avx2_avx512.inc +++ b/lib/include/gcm_common_avx2_avx512.inc @@ -1,5 +1,5 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; Copyright(c) 2011-2023, Intel Corporation All rights reserved. +; Copyright(c) 2011-2024, Intel Corporation All rights reserved. 
; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions @@ -127,24 +127,23 @@ %ifndef GCM128_MODE %ifndef GCM192_MODE %ifndef GCM256_MODE -%error "No GCM key size selected for gcm_avx2_avx512.inc!" +%error "No GCM key size selected for gcm_common_avx2_avx512.inc!" %endif %endif %endif -%ifndef IS_AVX2_GCM -%ifndef IS_AVX512_GCM -%error "No GCM AVX2 or AVX512 selection made for gcm_avx2_avx512.inc!" -%endif +%ifdef IS_AVX512_GCM +%error "IS_AVX512_GCM: AVX512 variant removed!" %endif %ifdef IS_AVX2_GCM -%xdefine GCM_API_POSTFIX avx_gen4 +%error "IS_AVX2_GCM: Definition not required!" %endif -%ifdef IS_AVX512_GCM -%xdefine GCM_API_POSTFIX avx512 -%endif +%define IS_AVX2_GCM + +%xdefine GCM_API_POSTFIX avx_gen4 +%xdefine GCM_API_POSTFIX_AVX512 avx512 ;; Decide on AES-GCM key size to compile for %ifdef GCM128_MODE @@ -164,13 +163,13 @@ ;; Decide on AES-GCM key size to compile for %define FN_NAME(x,y) aes_gcm_ %+ x %+ GCM_API_KEYSZ %+ y %+ GCM_API_POSTFIX -%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX -%define GHASH_FN_NAME(x) x %+ _ %+ GCM_API_POSTFIX +%define FN_NAME_AVX512(x,y) aes_gcm_ %+ x %+ GCM_API_KEYSZ %+ y %+ GCM_API_POSTFIX_AVX512 -mksection .text -default rel +%define GMAC_FN_NAME(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX +%define GMAC_FN_NAME_AVX512(x) imb_aes_gmac_ %+ x %+ GCM_API_KEYSZ %+ _ %+ GCM_API_POSTFIX_AVX512 ; need to store 5 GP registers on stack (align to 16 bytes) +; @note: the last 8-byte slot is used in JOB API to save/restore a register %define GP_STORAGE 8*6 %define TMP2 16*0 ; Temporary storage for AES State 2 (State 1 is stored in an XMM register) @@ -358,13 +357,13 @@ default rel %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. 
-; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; CALC_GHASH: Calculates the hash of selected data +; Input: The input data (A_IN), that data's length (A_LEN), input hash value (AAD_HASH) ; Output: The hash of the data (AAD_HASH). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_AAD_HASH 13-15 -%define %%A_IN %1 ;; [in] message pointer -%define %%A_LEN %2 ;; [in] message length +%macro CALC_GHASH 11-12 +%define %%A_IN %1 ;; [in/clobbered] message pointer +%define %%A_LEN %2 ;; [in/clobbered] message length %define %%AAD_HASH %3 ;; [in] input hash value (XMM) %define %%GDATA_KEY %4 ;; [in] pointer to GCM key data %define %%XTMP0 %5 ;; [clobbered] temporary XMM @@ -373,35 +372,33 @@ default rel %define %%XTMP3 %8 ;; [clobbered] temporary XMM %define %%XTMP4 %9 ;; [clobbered] temporary XMM %define %%XTMP5 %10 ;; [clobbered] temporary XMM -%define %%T1 %11 ;; [clobbered] temporary GP register -%define %%T2 %12 ;; [clobbered] temporary GP register -%define %%T3 %13 ;; [clobbered] temporary GP register -%define %%T4 %14 ;; [clobbered] temporary GP register (obsolete with avx512) -%define %%T5 %15 ;; [clobbered] temporary GP register (obsolete with avx512) +%define %%T3 %11 ;; [clobbered] temporary GP register +%define %%MASKREG %12 ;; [clobbered] mask register %ifdef IS_AVX2_GCM -%if %0 != 15 -%error "AVX2 CALC_AAD_HASH needs 15 arguments!" +%if %0 != 11 +%error "AVX2 CALC_GHASH needs 11 arguments!" %endif %endif %ifdef IS_AVX512_GCM -%if %0 != 13 -%error "AVX512 CALC_AAD_HASH needs 13 arguments!" +%if %0 != 12 +%error "AVX512 CALC_GHASH needs 12 arguments!" 
%endif %endif - mov %%T1, %%A_IN ; T1 = AAD - mov %%T2, %%A_LEN ; T2 = aadLen + cmp %%A_LEN, 16 + jb %%_get_small_AAD_block + align 32 %%_get_AAD_loop128: - cmp %%T2, 128 - jl %%_exit_AAD_loop128 + cmp %%A_LEN, 128 + jb %%_exit_AAD_loop128 - vmovdqu %%XTMP0, [%%T1 + 16*0] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN + 16*0] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] - vpxor %%XTMP0, %%AAD_HASH + vpxor %%XTMP0, %%XTMP0, %%AAD_HASH vmovdqa %%XTMP5, [%%GDATA_KEY + HashKeyK_8] vpclmulqdq %%XTMP1, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -415,8 +412,8 @@ align 32 %assign i 1 %assign j 7 %rep 7 - vmovdqu %%XTMP0, [%%T1 + 16*i] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN + 16*i] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] vmovdqa %%XTMP5, [%%GDATA_KEY + HashKeyK_ %+ j] vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -440,25 +437,25 @@ align 32 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP2 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP3 ; the result is in %%AAD_HASH - sub %%T2, 128 + sub %%A_LEN, 128 je %%_CALC_AAD_done - add %%T1, 128 + add %%A_IN, 128 jmp %%_get_AAD_loop128 %%_exit_AAD_loop128: - cmp %%T2, 16 + cmp %%A_LEN, 16 jb %%_get_small_AAD_block ;; calculate hash_key position to start with - mov %%T3, %%T2 + mov %%T3, %%A_LEN and %%T3, -16 ; 1 to 7 blocks possible here neg %%T3 add %%T3, HashKey_1 + 16 lea %%T3, [%%GDATA_KEY + %%T3] - vmovdqu %%XTMP0, [%%T1] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN] + vpshufb %%XTMP0, %%XTMP0, [rel SHUF_MASK] vpxor %%XTMP0, %%XTMP0, %%AAD_HASH @@ -472,15 +469,15 @@ align 32 vpxor %%XTMP2, %%XTMP2, %%XTMP4 ; XTMP2 = XTMP2 + XTMP4 add %%T3, 16 ; move to next hashkey - add %%T1, 16 ; move to next data block - sub %%T2, 16 - cmp %%T2, 16 + add %%A_IN, 16 ; move to next data block + sub %%A_LEN, 16 + cmp %%A_LEN, 16 jb %%_AAD_reduce align 32 %%_AAD_blocks: - vmovdqu %%XTMP0, [%%T1] - vpshufb %%XTMP0, [rel SHUF_MASK] + vmovdqu %%XTMP0, [%%A_IN] + vpshufb %%XTMP0, 
%%XTMP0, [rel SHUF_MASK] vmovdqa %%XTMP5, [%%T3 + HKeyGap] vpclmulqdq %%XTMP3, %%XTMP0, %%XTMP5, 0x00 ; XTMP1 = XTMP_L * KK_L @@ -494,9 +491,9 @@ align 32 vpxor %%XTMP2, %%XTMP2, %%XTMP4 add %%T3, 16 ; move to next hashkey - add %%T1, 16 - sub %%T2, 16 - cmp %%T2, 16 + add %%A_IN, 16 + sub %%A_LEN, 16 + cmp %%A_LEN, 16 jae %%_AAD_blocks %%_AAD_reduce: @@ -508,15 +505,15 @@ align 32 vpxor %%AAD_HASH, %%AAD_HASH, %%XTMP3 ; the result is in %%AAD_HASH %%_get_small_AAD_block: - or %%T2, %%T2 + or %%A_LEN, %%A_LEN je %%_CALC_AAD_done vmovdqa %%XTMP0, [%%GDATA_KEY + HashKey_1] vmovdqa %%XTMP1, [%%GDATA_KEY + HashKeyK_1] %ifdef IS_AVX2_GCM - READ_SMALL_DATA_INPUT_AVX %%XTMP2, %%T1, %%T2, %%T3 + READ_SMALL_DATA_INPUT_AVX %%XTMP2, %%A_IN, %%A_LEN, %%T3 %else - READ_SMALL_DATA_INPUT_AVX512 %%XTMP2, %%T1, %%T2, %%T3, k1 + READ_SMALL_DATA_INPUT_AVX512 %%XTMP2, %%A_IN, %%A_LEN, %%T3, %%MASKREG %endif ;byte-reflect the AAD data vpshufb %%XTMP2, %%XTMP2, [rel SHUF_MASK] @@ -525,6 +522,43 @@ align 32 %%_CALC_AAD_done: +%endmacro ; CALC_GHASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of selected data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and input hash (AAD_HASH) +; Output: The hash of the data (AAD_HASH). 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 13 +%define %%A_IN %1 ;; [in] message pointer +%define %%A_LEN %2 ;; [in] message length +%define %%AAD_HASH %3 ;; [in] input hash value (XMM) +%define %%GDATA_KEY %4 ;; [in] pointer to GCM key data +%define %%XTMP0 %5 ;; [clobbered] temporary XMM +%define %%XTMP1 %6 ;; [clobbered] temporary XMM +%define %%XTMP2 %7 ;; [clobbered] temporary XMM +%define %%XTMP3 %8 ;; [clobbered] temporary XMM +%define %%XTMP4 %9 ;; [clobbered] temporary XMM +%define %%XTMP5 %10 ;; [clobbered] temporary XMM +%define %%T1 %11 ;; [clobbered] temporary GP register +%define %%T2 %12 ;; [clobbered] temporary GP register +%define %%T3 %13 ;; [clobbered] temporary GP register + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + +%ifdef IS_AVX2_GCM + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5, \ + %%T3 +%endif + +%ifdef IS_AVX512_GCM + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5, \ + %%T3, k1 +%endif + %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2248,6 +2282,7 @@ align 32 mov [rsp + GP_OFFSET + 2*8], r13 mov [rsp + GP_OFFSET + 3*8], r14 mov [rsp + GP_OFFSET + 4*8], r15 + mov r14, rax %ifidn __OUTPUT_FORMAT__, win64 @@ -2291,42 +2326,37 @@ align 32 mov rsp, [rsp + GP_OFFSET + 0*8] %endmacro -%macro CALC_J0 15 +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +%macro CALC_J0 3 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV %define %%IV_LEN %3 ;; [in] IV length -%define %%J0 %4 ;; [out] XMM reg to contain J0 -%define %%TMP0 %5 ;; [clobbered] Temporary GP reg -%define %%TMP1 %6 ;; [clobbered] Temporary GP reg -%define %%TMP2 %7 ;; [clobbered] Temporary GP reg -%define %%TMP3 %8 ;; 
[clobbered] Temporary GP reg (unused with AVX512) -%define %%TMP4 %9 ;; [clobbered] Temporary GP reg (unused with AVX512) -%define %%XTMP0 %10 ;; [clobbered] Temporary XMM reg -%define %%XTMP1 %11 ;; [clobbered] Temporary XMM reg -%define %%XTMP2 %12 ;; [clobbered] Temporary XMM reg -%define %%XTMP3 %13 ;; [clobbered] Temporary XMM reg -%define %%XTMP4 %14 ;; [clobbered] Temporary XMM reg -%define %%XTMP5 %15 ;; [clobbered] Temporary XMM reg + +%define %%J0 xmm0 ;; [out] XMM reg to contain J0 + +%define %%XTMP0 xmm1 ;; [clobbered] Temporary XMM reg +%define %%XTMP1 xmm2 ;; [clobbered] Temporary XMM reg +%define %%XTMP2 xmm3 ;; [clobbered] Temporary XMM reg +%define %%XTMP3 xmm4 ;; [clobbered] Temporary XMM reg +%define %%XTMP4 xmm5 ;; [clobbered] Temporary XMM reg +%define %%XTMP5 xmm6 ;; [clobbered] Temporary XMM reg ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ;; Calculate GHASH of (IV || 0s) vpxor %%J0, %%J0, %%J0 -%ifdef IS_AVX2_GCM - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ - %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2, %%TMP3, %%TMP4 -%else - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%XTMP0, %%XTMP1, %%XTMP2, \ - %%XTMP3, %%XTMP4, %%XTMP5, %%TMP0, %%TMP1, %%TMP2 -%endif + ;; arg1 = key pointer + mov r12, %%IV + mov r13, %%IV_LEN + call ghash_internal_avx_gen4 ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) + vmovq %%XTMP2, %%IV_LEN + vpsllq %%XTMP2, %%XTMP2, 3 ;; IV length in bits vmovdqu %%XTMP0, [%%KEY + HashKey_1] vmovdqu %%XTMP1, [%%KEY + HashKeyK_1] - mov %%TMP2, %%IV_LEN - shl %%TMP2, 3 ;; IV length in bits - vmovq %%XTMP2, %%TMP2 vpxor %%J0, %%J0, %%XTMP2 GHASH_MUL2 %%J0, %%XTMP0, %%XTMP1, %%XTMP2, %%XTMP3, %%XTMP4, %%XTMP5 @@ -2337,28 +2367,46 @@ align 32 ; GCM_INIT initializes a gcm_context_data struct to prepare for encoding/decoding. 
; Input: gcm_key_data * (GDATA_KEY), gcm_context_data *(GDATA_CTX), IV, IV_LEN, ; Additional Authentication data (A_IN), Additional Data length (A_LEN). -; Output: Updated GDATA_CTX with the hash of A_IN (AadHash) and initialized other parts of GDATA. -; Clobbers rax, r10-r13 and xmm0-xmm6 +; Output: Updated GDATA_CTX with the hash of A_IN (AadHash=xmm14) and +; initialized other parts of GDATA. +; xmm2 - holds counter block (LE format) +; Clobbers: rax, r10-r13 and xmm0-xmm6 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro GCM_INIT 8-9 +%macro GCM_INIT 5-6 %define %%GDATA_KEY %1 ; [in] GCM expanded keys pointer %define %%GDATA_CTX %2 ; [in] GCM context pointer %define %%IV %3 ; [in] IV pointer %define %%A_IN %4 ; [in] AAD pointer %define %%A_LEN %5 ; [in] AAD length in bytes -%define %%GPR1 %6 ; temp GPR -%define %%GPR2 %7 ; temp GPR -%define %%GPR3 %8 ; temp GPR -%define %%IV_LEN %9 ; [in] IV length +%define %%IV_LEN %6 ; [in] IV length + +%define %%GPR1 r10 ; temp GPR +%define %%GPR2 r11 ; temp GPR +%define %%GPR3 rax ; temp GPR %define %%AAD_HASH xmm14 - vpxor %%AAD_HASH, %%AAD_HASH -%ifdef IS_AVX2_GCM - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3, r13, rax -%else - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 -%endif + ;; IV may be different than 12 bytes + cmp %%A_LEN, 12 + je %%_aad_len_is_12 + + vpxor %%AAD_HASH, %%AAD_HASH, %%AAD_HASH + CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ + xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, %%GPR1, %%GPR2, %%GPR3 + jmp %%_aad_is_done + +%%_aad_len_is_12: + ;; GHASH 12 bytes of AAD + mov %%GPR1, %%A_IN + vmovq %%AAD_HASH, [%%GPR1] + vpinsrd %%AAD_HASH, [%%GPR1 + 8], 2 + vmovdqa xmm1, [%%GDATA_KEY + HashKey_1] + vmovdqa xmm2, [%%GDATA_KEY + HashKey_1 + HKeyGap] + vpshufb %%AAD_HASH, %%AAD_HASH, [rel SHUF_MASK] + + GHASH_MUL2 
%%AAD_HASH, xmm1, xmm2, xmm6, xmm5, xmm4, xmm3 + +%%_aad_is_done: mov %%GPR1, %%A_LEN vmovdqu [%%GDATA_CTX + AadHash], %%AAD_HASH ; ctx_data.aad hash = aad_hash mov [%%GDATA_CTX + AadLen], %%GPR1 ; ctx_data.aad_length = aad_length @@ -2367,21 +2415,32 @@ align 32 mov [%%GDATA_CTX + InLen], %%GPR1 ; ctx_data.in_length = 0 mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx_data.partial_block_length = 0 -%if %0 == 9 ;; IV is different than 12 bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, xmm2, r10, r11, r12, r13, rax, xmm0, xmm1, \ - xmm3, xmm4, xmm5, xmm6 -%else ;; IV is 12 bytes +%if %0 == 6 + ;; IV may be different than 12 bytes + cmp %%IV_LEN, 12 + je %%_iv_len_is_12 + + ;; uses xmm0-xmm6, r10-r13, rax + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN + jmp %%_iv_is_done + +%%_iv_len_is_12: +%endif + + ;; IV is 12 bytes ;; read 12 IV bytes and pad with 0x00000001 mov %%GPR2, %%IV - vmovdqa xmm2, [rel ONEf] ; read 12 IV bytes and pad with 0x00000001 - vpinsrq xmm2, [%%GPR2], 0 - vpinsrd xmm2, [%%GPR2 + 8], 2 -%endif - vmovdqu [%%GDATA_CTX + OrigIV], xmm2 ; ctx_data.orig_IV = iv + vmovq xmm0, [%%GPR2] + vpinsrd xmm0, [%%GPR2 + 8], 2 + vpinsrd xmm0, [rel ONEf + 12], 3 ; read 12 IV bytes and pad with 0x00000001 + +%%_iv_is_done: + vmovdqu [%%GDATA_CTX + OrigIV], xmm0 ; ctx_data.orig_IV = iv ;; store IV as counter in LE format - vpshufb xmm2, [rel SHUF_MASK] + vpshufb xmm2, xmm0, [rel SHUF_MASK] vmovdqu [%%GDATA_CTX + CurCount], xmm2 ; ctx_data.current_counter = iv + ;; @note: xmm2 - needs to return counter block %endmacro %macro GCM_ENC_DEC_SMALL 12 @@ -2793,7 +2852,6 @@ align 32 je %%_partial_done GHASH_MUL2 xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block - vmovdqu [%%GDATA_CTX + AadHash], xmm14 %%_partial_done: @@ -2855,1230 +2913,49 @@ align 32 %endif %endmacro ; GCM_COMPLETE -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_precomp_128_avx_gen4 / -; aes_gcm_precomp_192_avx_gen4 / -; aes_gcm_precomp_256_avx_gen4 / 
-; aes_gcm_precomp_128_avx512 / -; aes_gcm_precomp_192_avx512 / -; aes_gcm_precomp_256_avx512 -; (struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(FN_NAME(precomp,_),function,) -FN_NAME(precomp,_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_precomp -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - sub rsp, 1*16 - ; only xmm6 needs to be maintained - vmovdqu [rsp + 0*16],xmm6 -%endif - - vpxor xmm6, xmm6 - ENCRYPT_SINGLE_BLOCK arg1, xmm6 ; xmm6 = HashKey - - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg1 + HashKey], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg1, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - add rsp, 1*16 -%endif - -exit_precomp: - - ret - -%ifdef SAFE_PARAM -error_precomp: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_precomp -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_init_128_avx_gen4 / aes_gcm_init_192_avx_gen4 / aes_gcm_init_256_avx_gen4 / -; aes_gcm_init_128_avx512 / aes_gcm_init_192_avx512 / aes_gcm_init_256_avx512 -; (const struct gcm_key_data *key_data, -; struct 
gcm_context_data *context_data, -; u8 *iv, -; const u8 *aad, -; u64 aad_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -MKGLOBAL(FN_NAME(init,_),function,) -FN_NAME(init,_): - endbranch64 - push r12 - push r13 -%ifidn __OUTPUT_FORMAT__, win64 - push r14 - push r15 - lea r14, [rsp + 4*8] - ; xmm6 needs to be maintained for Windows - sub rsp, 1*16 - vmovdqu [rsp + 0*16], xmm6 -%endif - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_init - - ;; Check context_data != NULL - or arg2, arg2 - jz error_init - - ;; Check IV != NULL - or arg3, arg3 - jz error_init - - ;; Check if aad_len == 0 - cmp arg5, 0 - jz skip_aad_check_init - - ;; Check aad != NULL (aad_len != 0) - or arg4, arg4 - jz error_init - -skip_aad_check_init: -%endif - GCM_INIT arg1, arg2, arg3, arg4, arg5, r10, r11, r12 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -exit_init: - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6 , [rsp + 0*16] - add rsp, 1*16 - pop r15 - pop r14 -%endif - pop r13 - pop r12 - ret - -%ifdef SAFE_PARAM -error_init: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV - - ;; Check if aad_len == 0 - cmp arg5, 0 - jz skip_aad_check_error_init - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_init: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_init -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_init_var_iv_128_avx_gen4 / aes_gcm_init_var_iv_192_avx_gen4 / -; aes_gcm_init_var_iv_256_avx_gen4 -; aes_gcm_init_var_iv_128_avx512 / aes_gcm_init_var_iv_192_avx512 / 
-; aes_gcm_init_var_iv_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *iv, -; const u64 iv_len, -; const u8 *aad, -; const u64 aad_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(init_var_iv,_),function,) -FN_NAME(init_var_iv,_): - endbranch64 - push r12 - push r13 -%ifidn __OUTPUT_FORMAT__, win64 - push r14 - push r15 - lea r14, [rsp + 4*8] - ; xmm6 & xmm14 need to be maintained for Windows - sub rsp, 2*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm14 -%endif - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_init_IV - - ;; Check context_data != NULL - or arg2, arg2 - jz error_init_IV - - ;; Check IV != NULL - or arg3, arg3 - jz error_init_IV - - ;; Check iv_len != 0 - or arg4, arg4 - jz error_init_IV - - ;; Check if aad_len == 0 - cmp arg6, 0 - jz skip_aad_check_init_IV - - ;; Check aad != NULL (aad_len != 0) - cmp arg5, 0 - jz error_init_IV - -skip_aad_check_init_IV: -%endif - cmp arg4, 12 - je iv_len_12_init_IV - - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, arg4 - jmp skip_iv_len_12_init_IV - -iv_len_12_init_IV: - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12 - -skip_iv_len_12_init_IV: -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -exit_init_IV: - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - vmovdqu xmm14, [rsp + 1*16] - add rsp, 2*16 - pop r15 - pop r14 -%endif - pop r13 - pop r12 - ret - -%ifdef SAFE_PARAM -error_init_IV: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_IV - - ;; Check iv_len != 0 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_IV_LEN - - ;; Check if 
aad_len == 0 - cmp arg6, 0 - jz skip_aad_check_error_init_IV - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg5, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_init_IV: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_init_IV -%endif - - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_update_avx_gen4 / aes_gcm_enc_192_update_avx_gen4 / -; aes_gcm_enc_128_update_avx_gen4 / -; aes_gcm_enc_128_update_avx512 / aes_gcm_enc_192_update_avx512 / -; aes_gcm_enc_256_update_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_update_),function,) -FN_NAME(enc,_update_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_update_enc - - ;; Check context_data != NULL - or arg2, arg2 - jz error_update_enc - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz error_update_enc - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_update_enc - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_update_enc - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_update_enc -%endif - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, multi_call - -exit_update_enc: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_update_enc: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check if plaintext_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_update_enc - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, 
GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_update_enc: - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_update_enc -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_update_avx_gen4 / aes_gcm_dec_192_update_avx_gen4 / -; aes_gcm_dec_256_update_avx_gen4 / -; aes_gcm_dec_128_update_avx512 / aes_gcm_dec_192_update_avx512 / -; aes_gcm_dec_256_update_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_update_),function,) -FN_NAME(dec,_update_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_update_dec - - ;; Check context_data != NULL - or arg2, arg2 - jz error_update_dec - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz error_update_dec - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_update_dec - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_update_dec - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_update_dec -%endif - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, multi_call - -exit_update_dec: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_update_dec: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check if plaintext_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_update_dec - - ;; 
Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (plaintext_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_update_dec: - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_update_dec -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_finalize_avx_gen4 / aes_gcm_enc_192_finalize_avx_gen4 / -; aes_gcm_enc_256_finalize_avx_gen4 / -; aes_gcm_enc_128_finalize_avx512 / aes_gcm_enc_192_finalize_avx512 / -; aes_gcm_enc_256_finalize_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_finalize_),function,) -FN_NAME(enc,_finalize_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_enc_fin - - ;; Check context_data != NULL - or arg2, arg2 - jz error_enc_fin - - ;; Check auth_tag != NULL - or arg3, arg3 - jz error_enc_fin - - ;; Check auth_tag_len == 0 or > 16 - or arg4, arg4 - jz error_enc_fin - - cmp arg4, 16 - ja error_enc_fin -%endif - push r12 - -%ifidn __OUTPUT_FORMAT__, win64 - ; xmm6:xmm15 need to be maintained for Windows - sub rsp, 7*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm9 - vmovdqu [rsp + 2*16], xmm10 - vmovdqu [rsp + 3*16], xmm11 - vmovdqu [rsp + 4*16], xmm13 - vmovdqu [rsp + 5*16], xmm14 - vmovdqu [rsp + 6*16], xmm15 -%endif - GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm15, [rsp + 6*16] - vmovdqu xmm14, [rsp + 5*16] - vmovdqu xmm13, [rsp + 4*16] - vmovdqu xmm11, [rsp + 3*16] - vmovdqu 
xmm10, [rsp + 2*16] - vmovdqu xmm9, [rsp + 1*16] - vmovdqu xmm6, [rsp + 0*16] - add rsp, 7*16 -%endif - pop r12 -exit_enc_fin: - ret - -%ifdef SAFE_PARAM -error_enc_fin: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_enc_fin -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_finalize_avx_gen4 / aes_gcm_dec_192_finalize_avx_gen4 -; aes_gcm_dec_256_finalize_avx_gen4 / -; aes_gcm_dec_128_finalize_avx512 / aes_gcm_dec_192_finalize_avx512 -; aes_gcm_dec_256_finalize_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_finalize_),function,) -FN_NAME(dec,_finalize_): - endbranch64 -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_dec_fin - - ;; Check context_data != NULL - or arg2, arg2 - jz error_dec_fin - - ;; Check auth_tag != NULL - or arg3, arg3 - jz error_dec_fin - - ;; Check auth_tag_len == 0 or > 16 - or arg4, arg4 - jz error_dec_fin - - cmp arg4, 16 - ja error_dec_fin -%endif - - push r12 - -%ifidn __OUTPUT_FORMAT__, win64 - ; xmm6:xmm15 need to be maintained for Windows - sub rsp, 7*16 - vmovdqu [rsp + 0*16], xmm6 - vmovdqu [rsp + 1*16], xmm9 - vmovdqu [rsp + 2*16], xmm10 - vmovdqu [rsp + 3*16], xmm11 - vmovdqu [rsp + 4*16], xmm13 - vmovdqu [rsp + 5*16], 
xmm14 - vmovdqu [rsp + 6*16], xmm15 -%endif - GCM_COMPLETE arg1, arg2, arg3, arg4, multi_call - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm15, [rsp + 6*16] - vmovdqu xmm14, [rsp + 5*16] - vmovdqu xmm13, [rsp + 4*16] - vmovdqu xmm11, [rsp + 3*16] - vmovdqu xmm10, [rsp + 2*16] - vmovdqu xmm9, [rsp + 1*16] - vmovdqu xmm6, [rsp + 0*16] - add rsp, 7*16 -%endif - - pop r12 - -exit_dec_fin: - ret - -%ifdef SAFE_PARAM -error_dec_fin: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg4, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg4, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_dec_fin -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_enc_128_avx_gen4 / aes_gcm_enc_192_avx_gen4 / aes_gcm_enc_256_avx_gen4 / -; aes_gcm_enc_128_avx512 / aes_gcm_enc_192_avx512 / aes_gcm_enc_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len, -; u8 *iv, -; const u8 *aad, -; u64 aad_len, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc,_),function,) -FN_NAME(enc,_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_enc - - ;; Check context_data != NULL - or arg2, arg2 - jz error_enc - - ;; Check IV != NULL - cmp arg6, 0 - jz error_enc - - ;; Check 
auth_tag != NULL - cmp arg9, 0 - jz error_enc - - ;; Check auth_tag_len == 0 or > 16 - cmp arg10, 0 - jz error_enc - - cmp arg10, 16 - ja error_enc - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_enc - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_enc - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_enc - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_enc - -skip_in_out_check_enc: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_enc - - ;; Check aad != NULL (aad_len != 0) - cmp arg7, 0 - jz error_enc - -skip_aad_check_enc: -%endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, ENC, single_call - - GCM_COMPLETE arg1, arg2, arg9, arg10, single_call - -exit_enc: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_enc: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_enc - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_enc: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_error_enc - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_enc: - ;; Set imb_errno - 
IMB_ERR_CHECK_END rax - jmp exit_enc -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void aes_gcm_dec_128_avx_gen4 / aes_gcm_dec_192_avx_gen4 / aes_gcm_dec_256_avx_gen4 / -; aes_gcm_dec_128_avx512 / aes_gcm_dec_192_avx512 / aes_gcm_dec_256_avx512 -; (const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; u8 *out, -; const u8 *in, -; u64 msg_len, -; u8 *iv, -; const u8 *aad, -; u64 aad_len, -; u8 *auth_tag, -; u64 auth_tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec,_),function,) -FN_NAME(dec,_): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Load max len to reg on windows - INIT_GCM_MAX_LENGTH - - ;; Check key_data != NULL - or arg1, arg1 - jz error_dec - - ;; Check context_data != NULL - or arg2, arg2 - jz error_dec - - ;; Check IV != NULL - cmp arg6, 0 - jz error_dec - - ;; Check auth_tag != NULL - cmp arg9, 0 - jz error_dec - - ;; Check auth_tag_len == 0 or > 16 - cmp arg10, 0 - jz error_dec - - cmp arg10, 16 - ja error_dec - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_dec - - ;; Check if msg_len > max_len - cmp arg5, GCM_MAX_LENGTH - ja error_dec - - ;; Check out != NULL (msg_len != 0) - or arg3, arg3 - jz error_dec - - ;; Check in != NULL (msg_len != 0) - or arg4, arg4 - jz error_dec - -skip_in_out_check_dec: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_dec - - ;; Check aad != NULL (aad_len != 0) - cmp arg7, 0 - jz error_dec - -skip_aad_check_dec: -%endif - GCM_INIT arg1, arg2, arg6, arg7, arg8, r10, r11, r12 - - GCM_ENC_DEC arg1, arg2, arg3, arg4, arg5, DEC, single_call - - GCM_COMPLETE arg1, arg2, arg9, arg10, single_call - -exit_dec: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_dec: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - 
IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check IV != NULL - IMB_ERR_CHECK_NULL arg6, rax, IMB_ERR_NULL_IV - - ;; Check auth_tag != NULL - IMB_ERR_CHECK_NULL arg9, rax, IMB_ERR_NULL_AUTH - - ;; Check auth_tag_len == 0 or > 16 - IMB_ERR_CHECK_ZERO arg10, rax, IMB_ERR_AUTH_TAG_LEN - - IMB_ERR_CHECK_ABOVE arg10, 16, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Check if msg_len == 0 - cmp arg5, 0 - jz skip_in_out_check_error_dec - - ;; Check if msg_len > max_len - IMB_ERR_CHECK_ABOVE arg5, GCM_MAX_LENGTH, rax, IMB_ERR_CIPH_LEN - - ;; Check out != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_DST - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_SRC - -skip_in_out_check_error_dec: - ;; Check if aad_len == 0 - cmp arg8, 0 - jz skip_aad_check_error_dec - - ;; Check aad != NULL (aad_len != 0) - IMB_ERR_CHECK_NULL arg7, rax, IMB_ERR_NULL_AAD - -skip_aad_check_error_dec: - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_dec -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK -; -;IMB_JOB * aes_gcm_enc_var_iv_128_avx_gen4 / aes_gcm_enc_var_iv_192_avx_gen4 / -; aes_gcm_enc_var_iv_256_avx_gen4 / -; aes_gcm_enc_var_iv_128_avx512 / aes_gcm_enc_var_iv_192_avx512 / -; aes_gcm_enc_var_iv_256_avx512 -; (IMB_MGR *state, IMB_JOB *job) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(enc_var_iv,_),function,internal) -FN_NAME(enc_var_iv,_): - endbranch64 - FUNC_SAVE alloc_context - - mov arg1, [arg2 + _enc_keys] - - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_enc_IV - - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} 
- - jmp skip_iv_len_12_enc_IV - -iv_len_12_enc_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 - -skip_iv_len_12_enc_IV: - mov arg3, [arg2 + _src] - add arg3, [arg2 + _cipher_start_src_offset] - mov arg4, [arg2 + _dst] - mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer - mov arg2, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, ENC, single_call - - mov arg2, [rsp + GP_OFFSET + 5*8] - GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ - {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ - single_call - - ;; mark job complete - mov dword [arg2 + _status], IMB_STATUS_COMPLETED - - mov rax, arg2 ;; return the job - - FUNC_RESTORE - ret - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; -; NOTE: THIS API IS USED BY JOB-API ONLY, NO NEED FOR 2ND SAFE PARAM CHECK -; -;IMB_JOB *aes_gcm_dec_var_iv_128_avx_gen4 / aes_gcm_dec_var_iv_192_avx_gen4 / -; aes_gcm_dec_var_iv_256_avx_gen4 / -; aes_gcm_dec_var_iv_128_avx512 / aes_gcm_dec_var_iv_192_avx512 / -; aes_gcm_dec_var_iv_256_avx512 -; (IMB_MGR *state, IMB_JOB *job) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(FN_NAME(dec_var_iv,_),function,internal) -FN_NAME(dec_var_iv,_): - endbranch64 - FUNC_SAVE alloc_context - - mov arg1, [arg2 + _dec_keys] - - cmp qword [arg2 + _iv_len_in_bytes], 12 - je iv_len_12_dec_IV - - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12, {[arg2 + _iv_len_in_bytes]} - - jmp skip_iv_len_12_dec_IV - -iv_len_12_dec_IV: - GCM_INIT arg1, {rsp + CONTEXT_OFFSET}, {[arg2 + _iv]}, \ - {[arg2 + _gcm_aad]}, {[arg2 + _gcm_aad_len]}, \ - r10, r11, r12 - -skip_iv_len_12_dec_IV: - mov arg3, [arg2 + _src] - add arg3, [arg2 + _cipher_start_src_offset] - mov arg4, [arg2 + 
_dst] - mov [rsp + GP_OFFSET + 5*8], arg2 ; preserve job pointer - mov arg2, [arg2 + _msg_len_to_cipher] - GCM_ENC_DEC arg1, {rsp + CONTEXT_OFFSET}, arg4, arg3, arg2, DEC, single_call - - mov arg2, [rsp + GP_OFFSET + 5*8] - GCM_COMPLETE arg1, {rsp + CONTEXT_OFFSET}, \ - {[arg2 + _auth_tag_output]}, {[arg2 + _auth_tag_output_len_in_bytes]}, \ - single_call - - ;; mark job complete - mov dword [arg2 + _status], IMB_STATUS_COMPLETED - - mov rax, arg2 ;; return the job - - FUNC_RESTORE - ret - -%ifdef GCM128_MODE -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_pre_avx_gen4 / ghash_pre_avx512 -; (const void *key, struct gcm_key_data *key_data) -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash_pre),function,) -GHASH_FN_NAME(ghash_pre): - endbranch64 -;; Parameter is passed through register -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key != NULL - cmp arg1, 0 - jz error_ghash_pre - - ;; Check key_data != NULL - cmp arg2, 0 - jz error_ghash_pre -%endif - -%ifidn __OUTPUT_FORMAT__, win64 - sub rsp, 1*16 - - ; only xmm6 needs to be maintained - vmovdqu [rsp + 0*16], xmm6 -%endif - vmovdqu xmm6, [arg1] - vpshufb xmm6, [rel SHUF_MASK] - ;;;;;;;;;;;;;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey;;;;;;;;;;;;;;; - vmovdqa xmm2, xmm6 - vpsllq xmm6, xmm6, 1 - vpsrlq xmm2, xmm2, 63 - vmovdqa xmm1, xmm2 - vpslldq xmm2, xmm2, 8 - vpsrldq xmm1, xmm1, 8 - vpor xmm6, xmm6, xmm2 - ;reduction - vpshufd xmm2, xmm1, 00100100b - vpcmpeqd xmm2, [rel TWOONE] - vpand xmm2, xmm2, [rel POLY] - vpxor xmm6, xmm6, xmm2 ; xmm6 holds the HashKey<<1 mod poly - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; - vmovdqu [arg2 + HashKey], xmm6 ; store HashKey<<1 mod poly - - PRECOMPUTE arg2, xmm6, xmm0, xmm1, xmm2, xmm3, xmm4, xmm5 - -%ifdef SAFE_DATA - clear_scratch_xmms_avx_asm -%endif -%ifidn __OUTPUT_FORMAT__, win64 - vmovdqu xmm6, [rsp + 0*16] - add 
rsp, 1*16 -%endif -exit_ghash_pre: - ret - -%ifdef SAFE_PARAM -error_ghash_pre: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_KEY - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_EXP_KEY - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_ghash_pre -%endif - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void ghash_avx_gen4 / ghash_avx512 ( -; const struct gcm_key_data *key_data, -; const void *in, -; const u64 in_len, -; void *io_tag, -; const u64 tag_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GHASH_FN_NAME(ghash),function,) -GHASH_FN_NAME(ghash): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET - - ;; Check key_data != NULL - or arg1, arg1 - jz error_ghash - - ;; Check in != NULL - or arg2, arg2 - jz error_ghash - - ;; Check in_len != 0 - or arg3, arg3 - jz error_ghash - - ;; Check tag != NULL - or arg4, arg4 - jz error_ghash - - ;; Check tag_len != 0 - cmp arg5, 0 - jz error_ghash -%endif - - ;; copy tag to xmm0 - vmovdqu xmm0, [arg4] - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - -%ifdef IS_AVX2_GCM - CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12, r13, rax -%else - CALC_AAD_HASH arg2, arg3, xmm0, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12 -%endif - vpshufb xmm0, [rel SHUF_MASK] ; perform a 16Byte swap - - simd_store_avx arg4, xmm0, arg5, r12, rax - -exit_ghash: - FUNC_RESTORE - ret - -%ifdef SAFE_PARAM -error_ghash: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check in != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_SRC - - ;; Check in_len != 0 - IMB_ERR_CHECK_ZERO arg3, rax, IMB_ERR_AUTH_LEN - - ;; 
Check tag != NULL - IMB_ERR_CHECK_NULL arg4, rax, IMB_ERR_NULL_AUTH - - ;; Check tag_len != 0 - IMB_ERR_CHECK_ZERO arg5, rax, IMB_ERR_AUTH_TAG_LEN - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - - jmp exit_ghash -%endif - -%endif ;; GCM128_MODE - ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; PARTIAL_BLOCK_GMAC: Handles the tag partial blocks between update calls. ; Requires the input data be at least 1 byte long. ; Input: gcm_context_data (GDATA_CTX), input text (PLAIN_IN), hash subkey (HASH_SUBKEY) ; input text length (PLAIN_LEN). ; Output: Updated GDATA_CTX -; Clobbers rax, r10, r12, r13, r15, xmm0, xmm1, xmm2, xmm3, xmm5, xmm6, xmm9, xmm10, xmm11 +; Clobbers rax, r10, r12, r13, r15 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro PARTIAL_BLOCK_GMAC 6 -%define %%GDATA_CTX %1 -%define %%PLAIN_IN %2 -%define %%PLAIN_LEN %3 -%define %%DATA_OFFSET %4 -%define %%AAD_HASH %5 -%define %%HASH_SUBKEY %6 - - mov r13, [%%GDATA_CTX + PBlockLen] - or r13, r13 - ; Leave Macro if no partial blocks - je %%_partial_block_done +%macro PARTIAL_BLOCK_GMAC 16 +%define %%GDATA_CTX %1 ;; [in/out] GPR pointer to GCM context +%define %%PLAIN_IN %2 ;; [in] GPR pointer to plain/cipher text +%define %%PLAIN_LEN %3 ;; [in] text length in bytes, GPR or memory location (win64) +%define %%DATA_OFFSET %4 ;; [out] GPR data offset +%define %%AAD_HASH %5 ;; [in/out] xmm with hash value +%define %%HASH_SUBKEY %6 ;; [in] hash key +%define %%HASHK_SUBKEY %7 ;; [in] hash-K key +%define %%XMM0 %8 ;; [clobbered] xmm register +%define %%XMM1 %9 ;; [clobbered] xmm register +%define %%XMM2 %10 ;; [clobbered] xmm register +%define %%XMM3 %11 ;; [clobbered] xmm register +%define %%XMM5 %12 ;; [clobbered] xmm register +%define %%XMM6 %13 ;; [clobbered] xmm register +%define %%XMM9 %14 ;; [clobbered] xmm register +%define %%XMM10 %15 ;; [clobbered] xmm register +%define %%XMM11 %16 ;; [clobbered] xmm register + + ;; 
@note PBlockLen must not be zero + mov r13, [%%GDATA_CTX + PBlockLen] ; Read in input data without over reading %ifdef IS_AVX2_GCM cmp %%PLAIN_LEN, 16 jl %%_fewer_than_16_bytes ; If more than 16 bytes of data, just fill the xmm register - VXLDR xmm1, [%%PLAIN_IN] + VXLDR %%XMM1, [%%PLAIN_IN] jmp %%_data_read %%_fewer_than_16_bytes: lea r10, [%%PLAIN_IN] - READ_SMALL_DATA_INPUT_AVX xmm1, r10, %%PLAIN_LEN, rax + READ_SMALL_DATA_INPUT_AVX %%XMM1, r10, %%PLAIN_LEN, rax %else ; Read in input data without over reading - READ_SMALL_DATA_INPUT_LEN_BT16_AVX512 xmm1, %%PLAIN_IN, %%PLAIN_LEN, r12, rax, k1 + READ_SMALL_DATA_INPUT_LEN_BT16_AVX512 %%XMM1, %%PLAIN_IN, %%PLAIN_LEN, r12, rax, k1 %endif ; Finished reading in data %%_data_read: @@ -4088,8 +2965,8 @@ error_ghash: ; (16-r13 is the number of bytes in plaintext mod 16) add r12, r13 ; Get the appropriate shuffle mask - vmovdqu xmm2, [r12] - vmovdqa xmm3, xmm1 + vmovdqu %%XMM2, [r12] + vmovdqa %%XMM3, %%XMM1 mov r15, %%PLAIN_LEN add r15, r13 @@ -4100,19 +2977,19 @@ error_ghash: sub r12, r15 %%_no_extra_mask_1: - ; Get the appropriate mask to mask out bottom r13 bytes of xmm3 - vmovdqu xmm1, [r12 + ALL_F-SHIFT_MASK] + ; Get the appropriate mask to mask out bottom r13 bytes of %%XMM3 + vmovdqu %%XMM1, [r12 + ALL_F-SHIFT_MASK] - vpand xmm3, xmm1 - vpshufb xmm3, [rel SHUF_MASK] - vpshufb xmm3, xmm2 - vpxor %%AAD_HASH, xmm3 + vpand %%XMM3, %%XMM3, %%XMM1 + vpshufb %%XMM3, %%XMM3, [rel SHUF_MASK] + vpshufb %%XMM3, %%XMM3, %%XMM2 + vpxor %%AAD_HASH, %%AAD_HASH, %%XMM3 - cmp r15,0 + cmp r15, 0 jl %%_partial_incomplete_1 ; GHASH computation for the last <16 Byte block - GHASH_MUL %%AAD_HASH, %%HASH_SUBKEY, xmm0, xmm10, xmm11, xmm5, xmm6 + GHASH_MUL2 %%AAD_HASH, %%HASH_SUBKEY, %%HASHK_SUBKEY, %%XMM0, %%XMM10, %%XMM11, %%XMM5 xor rax, rax mov [%%GDATA_CTX + PBlockLen], rax jmp %%_ghash_done @@ -4138,117 +3015,5 @@ error_ghash: mov r12, %%PLAIN_LEN %%offset_set: mov %%DATA_OFFSET, r12 -%%_partial_block_done: -%endmacro ; 
PARTIAL_BLOCK_GMAC - -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -;void imb_aes_gmac_update_128_avx_gen4 / imb_aes_gmac_update_192_avx_gen4 / -; imb_aes_gmac_update_256_avx_gen4 -; imb_aes_gmac_update_128_avx512 / imb_aes_gmac_update_192_avx512 / -; imb_aes_gmac_update_256_avx512 ( -; const struct gcm_key_data *key_data, -; struct gcm_context_data *context_data, -; const u8 *in, -; const u64 msg_len); -;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -align 32 -MKGLOBAL(GMAC_FN_NAME(update),function,) -GMAC_FN_NAME(update): - endbranch64 - FUNC_SAVE - -%ifdef SAFE_PARAM - ;; Reset imb_errno - IMB_ERR_CHECK_RESET -%endif - ;; Check if msg_len == 0 - or arg4, arg4 - je exit_gmac_update - -%ifdef SAFE_PARAM - ;; Check key_data != NULL - or arg1, arg1 - jz error_gmac_update - - ;; Check context_data != NULL - or arg2, arg2 - jz error_gmac_update - - ;; Check in != NULL (msg_len != 0) - or arg3, arg3 - jz error_gmac_update -%endif - ; Increment size of "AAD length" for GMAC - add [arg2 + AadLen], arg4 - - ;; Deal with previous partial block - xor r11, r11 - vmovdqu xmm13, [arg1 + HashKey] - vmovdqu xmm8, [arg2 + AadHash] - - PARTIAL_BLOCK_GMAC arg2, arg3, arg4, r11, xmm8, xmm13 - - ; CALC_AAD_HASH needs to deal with multiple of 16 bytes - sub arg4, r11 - add arg3, r11 - - vmovq xmm7, arg4 ; Save remaining length - and arg4, -16 ; Get multiple of 16 bytes - - or arg4, arg4 - jz no_full_blocks - - ;; Calculate GHASH of this segment -%ifdef IS_AVX2_GCM - CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12, r13, rax -%else - CALC_AAD_HASH arg3, arg4, xmm8, arg1, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, \ - r10, r11, r12 -%endif - vmovdqu [arg2 + AadHash], xmm8 ; ctx_data.aad hash = aad_hash - -no_full_blocks: - add arg3, arg4 ; Point at partial block - - vmovq arg4, xmm7 ; Restore original remaining length - and arg4, 15 - jz 
exit_gmac_update - - ; Save next partial block - mov [arg2 + PBlockLen], arg4 -%ifdef IS_AVX2_GCM - READ_SMALL_DATA_INPUT_AVX xmm1, arg3, arg4, r11 -%else - READ_SMALL_DATA_INPUT_AVX512 xmm1, arg3, arg4, r11, k1 -%endif - vpshufb xmm1, [rel SHUF_MASK] - vpxor xmm8, xmm1 - vmovdqu [arg2 + AadHash], xmm8 - -exit_gmac_update: - FUNC_RESTORE - - ret - -%ifdef SAFE_PARAM -error_gmac_update: - ;; Clear reg and imb_errno - IMB_ERR_CHECK_START rax - - ;; Check key_data != NULL - IMB_ERR_CHECK_NULL arg1, rax, IMB_ERR_NULL_EXP_KEY - - ;; Check context_data != NULL - IMB_ERR_CHECK_NULL arg2, rax, IMB_ERR_NULL_CTX - - ;; Check in != NULL (msg_len != 0) - IMB_ERR_CHECK_NULL arg3, rax, IMB_ERR_NULL_SRC - - ;; Set imb_errno - IMB_ERR_CHECK_END rax - jmp exit_gmac_update -%endif - -mksection stack-noexec +%endmacro ; PARTIAL_BLOCK_GMAC diff --git a/lib/include/gcm_sgl_api_vaes_avx512.inc b/lib/include/gcm_sgl_api_vaes_avx512.inc index ecdcf4cc9bd71a32b84c545bb6d5d38c7f2636d5..f10d1d3825531de73c33f25583fb3354dd667260 100644 --- a/lib/include/gcm_sgl_api_vaes_avx512.inc +++ b/lib/include/gcm_sgl_api_vaes_avx512.inc @@ -35,6 +35,11 @@ %ifndef GCM_SGL_API_VAES_AVX512_INC %define GCM_SGL_API_VAES_AVX512_INC +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;; External symbols +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +extern ghash_internal_vaes_avx512 + mksection .text default rel @@ -159,20 +164,10 @@ FN_NAME(init_var_iv,_): skip_aad_check_init_IV: %endif - cmp arg4, 12 - je iv_len_12_init_IV - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, k1, xmm14, xmm2, \ zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, multi_call, arg4 - jmp skip_iv_len_12_init_IV - -iv_len_12_init_IV: - GCM_INIT arg1, arg2, arg3, arg5, arg6, r10, r11, r12, k1, xmm14, xmm2, \ - zmm1, zmm11, zmm3, zmm4, zmm5, zmm6, zmm7, zmm8, zmm9, zmm10, \ - zmm12, zmm13, zmm15, zmm16, zmm17, zmm18, zmm19, zmm20, multi_call -skip_iv_len_12_init_IV: ;; SAFE_DATA covered in 
FUNC_RESTORE() exit_init_IV: diff --git a/lib/include/gcm_sse.inc b/lib/include/gcm_sse.inc index f1b58ac48fc6069fa14360cdd0c79e1ef6190f03..7e60c9e33fb0c9afa9790a1d64efb8de91b39fa0 100644 --- a/lib/include/gcm_sse.inc +++ b/lib/include/gcm_sse.inc @@ -2015,7 +2015,6 @@ je %%_partial_done GHASH_MUL xmm14, xmm13, xmm0, xmm10, xmm11, xmm5, xmm6 ;GHASH computation for the last <16 Byte block - movdqu [%%GDATA_CTX + AadHash], xmm14 %%_partial_done: diff --git a/lib/include/gcm_vaes_avx512.inc b/lib/include/gcm_vaes_avx512.inc index ae944dfe09558c65066241731f645ccc34538c3a..de394b4d94a93b0ae96f5e079458ef0388b50ee5 100644 --- a/lib/include/gcm_vaes_avx512.inc +++ b/lib/include/gcm_vaes_avx512.inc @@ -421,9 +421,19 @@ %assign hashk HashKey_ %+ %%NUM_BLOCKS %if %0 == 18 + ;; no GH/GL sums passed so add current HASH value to block 0 vpxorq %%CIPHER_IN0, %%CIPHER_IN0, %%AAD_HASH_IN %endif +%if %0 == 20 +%ifnum %%AAD_HASH_IN + ;; %%AAD_HASH_IN defines number of extra blocks to add to %%NUM_BLOCKS +%assign NB (%%NUM_BLOCKS + %%AAD_HASH_IN) +%assign hashk HashKey_ %+ NB + +%endif +%endif + %if %%NUM_BLOCKS == 16 vmovdqu64 %%HK1, [%%KP + hashk] vmovdqu64 %%HK2, [%%KP + hashk + HKeyGap] @@ -466,6 +476,8 @@ vpternlogq %%TLL1, %%TLL2, %%THL2, 0x96 vpternlogq %%THH1, %%THH2, %%TLH2, 0x96 +%assign hashk (hashk + (4 * 64)) + %elif %%NUM_BLOCKS >= 12 vmovdqu64 %%HK1, [%%KP + hashk] @@ -497,6 +509,8 @@ vpxorq %%TLL1, %%TLL1, %%THL1 vpxorq %%THH1, %%THH1, %%TLH1 +%assign hashk (hashk + (3 * 64)) + %elif %%NUM_BLOCKS >= 8 vmovdqu64 %%HK1, [%%KP + hashk] @@ -519,6 +533,8 @@ vpternlogq %%TLL1, %%TLL2, %%THL2, 0x96 vpternlogq %%THH1, %%THH2, %%TLH2, 0x96 +%assign hashk (hashk + (2 * 64)) + %elif %%NUM_BLOCKS >= 4 vmovdqu64 %%HK1, [%%KP + hashk] @@ -531,6 +547,9 @@ ;; add sums into THH1:TLL1 vpxorq %%TLL1, %%TLL1, %%THL1 vpxorq %%THH1, %%THH1, %%TLH1 + +%assign hashk (hashk + (1 * 64)) + %endif ;; T1H/L/M1/M2 - hold current product sums (provided %%NUM_BLOCKS >= 4) @@ -542,7 +561,6 @@ ;; It 
may also be that they are the only blocks to process. ;; Set hash key and register index position for the remaining 1 to 3 blocks -%assign hashk HashKey_ %+ blocks_left %assign reg_idx (%%NUM_BLOCKS / 4) %xdefine %%REG_IN %%CIPHER_IN %+ reg_idx @@ -824,13 +842,13 @@ %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; CALC_GHASH: Calculates the hash of the data which will not be encrypted. ; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). ; Output: The hash of the data (AAD_HASH). ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro CALC_AAD_HASH 26 -%define %%A_IN %1 ; [in] AAD text pointer -%define %%A_LEN %2 ; [in] AAD length +%macro CALC_GHASH 24 +%define %%A_IN %1 ; [in/clobbered] AAD text pointer +%define %%A_LEN %2 ; [in/clobbered] AAD length %define %%AAD_HASH %3 ; [in/out] xmm ghash value %define %%GDATA_KEY %4 ; [in] pointer to keys %define %%ZT0 %5 ; [clobbered] ZMM register @@ -851,89 +869,88 @@ %define %%ZT15 %20 ; [clobbered] ZMM register %define %%ZT16 %21 ; [clobbered] ZMM register %define %%ZT17 %22 ; [clobbered] ZMM register -%define %%T1 %23 ; [clobbered] GP register -%define %%T2 %24 ; [clobbered] GP register -%define %%T3 %25 ; [clobbered] GP register -%define %%MASKREG %26 ; [clobbered] mask register +%define %%T3 %23 ; [clobbered] GP register +%define %%MASKREG %24 ; [clobbered] mask register %define %%SHFMSK %%ZT13 - mov %%T1, %%A_IN ; T1 = AAD - mov %%T2, %%A_LEN ; T2 = aadLen - - cmp %%T2, (16*16) + cmp %%A_LEN, (16*16) jb %%_less_than_16x16 vmovdqa64 %%SHFMSK, [rel SHUF_MASK] +align 32 %%_get_AAD_loop2x32x16: - cmp %%T2, (2*32*16) + cmp %%A_LEN, (2*32*16) jb %%_get_AAD_loop32x16 GHASH_16 start, hk_bcast, %%ZT5, %%ZT6, \ - %%T1, (0*16*16), 0, \ + %%A_IN, (0*16*16), 0, \ %%GDATA_KEY, HashKey_32, 0, ZWORD(%%AAD_HASH), \ %%ZT0, 
%%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK GHASH_16 end_reduce_no_hxor, hk_bcast, %%ZT5, %%ZT6, \ - %%T1, (1*16*16), 0, \ + %%A_IN, (1*16*16), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - add %%T1, (32*16) - sub %%T2, (32*16) + add %%A_IN, (32*16) + sub %%A_LEN, (32*16) jmp %%_get_AAD_loop2x32x16 +align 32 %%_get_AAD_loop32x16: - cmp %%T2, (32*16) + cmp %%A_LEN, (32*16) jb %%_exit_AAD_loop32x16 GHASH_16 start, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*16*16), 0, \ + %%A_IN, (0*16*16), 0, \ %%GDATA_KEY, HashKey_32, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK GHASH_16 end_reduce, hk_load, %%ZT5, %%ZT6, \ - %%T1, (1*16*16), 0, \ + %%A_IN, (1*16*16), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - sub %%T2, (32*16) + sub %%A_LEN, (32*16) je %%_CALC_AAD_done - add %%T1, (32*16) + add %%A_IN, (32*16) jmp %%_get_AAD_loop32x16 +align 32 %%_exit_AAD_loop32x16: ; Less than 32x16 bytes remaining - cmp %%T2, (16*16) + cmp %%A_LEN, (16*16) jb %%_less_than_16x16 je %%_equal_16x16 %%_less_than_32x16: ;; calculate offset to hash key to start with - lea %%T3, [%%T2 + 15] + lea %%T3, [%%A_LEN + 15] and %%T3, ~15 neg %%T3 add %%T3, HashKey_1 + 16 GHASH_16 start, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*64), 0, \ + %%A_IN, (0*64), 0, \ %%GDATA_KEY, %%T3, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK ;; **ZT1 can potentially include clear text, ZT16 & ZT17 hash key - sub %%T2, (16*16) - add %%T1, (16*16) + sub %%A_LEN, (16*16) + add %%A_IN, (16*16) jmp %%_less_than_16x16_remain 
+align 32 %%_equal_16x16: GHASH_16 start_reduce, hk_load, %%ZT5, %%ZT6, \ - %%T1, (0*64), 0, \ + %%A_IN, (0*64), 0, \ %%GDATA_KEY, HashKey_16, 0, ZWORD(%%AAD_HASH), \ %%ZT0, %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, \ %%ZT14, %%ZT15, %%ZT16, %%ZT1, %%ZT7, %%SHFMSK @@ -941,38 +958,39 @@ jmp %%_CALC_AAD_done ; Less than 16x16 bytes remaining +align 32 %%_less_than_16x16_remain: ;; ZT5 (H), ZT6 (L) contain ghash sums ;; prep mask source address lea %%T3, [rel byte64_len_to_mask_table] - lea %%T3, [%%T3 + %%T2*8] + lea %%T3, [%%T3 + %%A_LEN*8] ;; calculate number of blocks to ghash (including partial bytes) - add DWORD(%%T2), 15 - shr DWORD(%%T2), 4 + add DWORD(%%A_LEN), 15 + shr DWORD(%%A_LEN), 4 jz %%_CALC_AAD_done ;; catch zero length - cmp DWORD(%%T2), 2 + cmp DWORD(%%A_LEN), 2 jb %%_AAD_blocks_cont_1 je %%_AAD_blocks_cont_2 - cmp DWORD(%%T2), 4 + cmp DWORD(%%A_LEN), 4 jb %%_AAD_blocks_cont_3 je %%_AAD_blocks_cont_4 - cmp DWORD(%%T2), 6 + cmp DWORD(%%A_LEN), 6 jb %%_AAD_blocks_cont_5 je %%_AAD_blocks_cont_6 - cmp DWORD(%%T2), 8 + cmp DWORD(%%A_LEN), 8 jb %%_AAD_blocks_cont_7 je %%_AAD_blocks_cont_8 - cmp DWORD(%%T2), 10 + cmp DWORD(%%A_LEN), 10 jb %%_AAD_blocks_cont_9 je %%_AAD_blocks_cont_10 - cmp DWORD(%%T2), 12 + cmp DWORD(%%A_LEN), 12 jb %%_AAD_blocks_cont_11 je %%_AAD_blocks_cont_12 - cmp DWORD(%%T2), 14 + cmp DWORD(%%A_LEN), 14 jb %%_AAD_blocks_cont_13 je %%_AAD_blocks_cont_14 - cmp DWORD(%%T2), 15 + cmp DWORD(%%A_LEN), 15 je %%_AAD_blocks_cont_15 ;; fall through for 16 blocks @@ -987,6 +1005,7 @@ ;; generate all 16 cases using preprocessor %rep 16 +align 32 %%_AAD_blocks_cont_ %+ I: %if I > 12 sub %%T3, 12 * 16 * 8 @@ -998,7 +1017,7 @@ kmovq %%MASKREG, [%%T3] ZMM_LOAD_MASKED_BLOCKS_0_16 \ - I, %%T1, 0, \ + I, %%A_IN, 0, \ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 \ @@ -1020,37 +1039,38 @@ %endrep ; Less than 16x16 bytes +align 32 %%_less_than_16x16: ;; prep mask source address lea %%T3, [rel byte64_len_to_mask_table] - lea 
%%T3, [%%T3 + %%T2*8] + lea %%T3, [%%T3 + %%A_LEN*8] ;; calculate number of blocks to ghash (including partial bytes) - add DWORD(%%T2), 15 - shr DWORD(%%T2), 4 + add DWORD(%%A_LEN), 15 + shr DWORD(%%A_LEN), 4 jz %%_CALC_AAD_done ;; catch zero length - cmp DWORD(%%T2), 2 + cmp DWORD(%%A_LEN), 2 jb %%_AAD_blocks_1 je %%_AAD_blocks_2 - cmp DWORD(%%T2), 4 + cmp DWORD(%%A_LEN), 4 jb %%_AAD_blocks_3 je %%_AAD_blocks_4 - cmp DWORD(%%T2), 6 + cmp DWORD(%%A_LEN), 6 jb %%_AAD_blocks_5 je %%_AAD_blocks_6 - cmp DWORD(%%T2), 8 + cmp DWORD(%%A_LEN), 8 jb %%_AAD_blocks_7 je %%_AAD_blocks_8 - cmp DWORD(%%T2), 10 + cmp DWORD(%%A_LEN), 10 jb %%_AAD_blocks_9 je %%_AAD_blocks_10 - cmp DWORD(%%T2), 12 + cmp DWORD(%%A_LEN), 12 jb %%_AAD_blocks_11 je %%_AAD_blocks_12 - cmp DWORD(%%T2), 14 + cmp DWORD(%%A_LEN), 14 jb %%_AAD_blocks_13 je %%_AAD_blocks_14 - cmp DWORD(%%T2), 15 + cmp DWORD(%%A_LEN), 15 je %%_AAD_blocks_15 ;; fall through for 16 blocks @@ -1065,6 +1085,7 @@ ;; generate all 16 cases using preprocessor %rep 16 +align 32 %%_AAD_blocks_ %+ I: %if I >= 3 vmovdqa64 %%SHFMSK, [rel SHUF_MASK] @@ -1084,7 +1105,7 @@ kmovq %%MASKREG, [%%T3] ZMM_LOAD_MASKED_BLOCKS_0_16 \ - I, %%T1, 0, \ + I, %%A_IN, 0, \ %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%MASKREG ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 \ @@ -1109,6 +1130,49 @@ %%_CALC_AAD_done: ;; result in AAD_HASH +%endmacro ; CALC_GHASH + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +; CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted. +; Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY). +; Output: The hash of the data (AAD_HASH). 
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro CALC_AAD_HASH 26 +%define %%A_IN %1 ; [in] AAD text pointer +%define %%A_LEN %2 ; [in] AAD length +%define %%AAD_HASH %3 ; [in/out] xmm ghash value +%define %%GDATA_KEY %4 ; [in] pointer to keys +%define %%ZT0 %5 ; [clobbered] ZMM register +%define %%ZT1 %6 ; [clobbered**] ZMM register +%define %%ZT2 %7 ; [clobbered**] ZMM register +%define %%ZT3 %8 ; [clobbered**] ZMM register +%define %%ZT4 %9 ; [clobbered**] ZMM register +%define %%ZT5 %10 ; [clobbered] ZMM register +%define %%ZT6 %11 ; [clobbered] ZMM register +%define %%ZT7 %12 ; [clobbered] ZMM register +%define %%ZT8 %13 ; [clobbered] ZMM register +%define %%ZT9 %14 ; [clobbered] ZMM register +%define %%ZT10 %15 ; [clobbered] ZMM register +%define %%ZT11 %16 ; [clobbered] ZMM register +%define %%ZT12 %17 ; [clobbered] ZMM register +%define %%ZT13 %18 ; [clobbered] ZMM register +%define %%ZT14 %19 ; [clobbered] ZMM register +%define %%ZT15 %20 ; [clobbered] ZMM register +%define %%ZT16 %21 ; [clobbered] ZMM register +%define %%ZT17 %22 ; [clobbered] ZMM register +%define %%T1 %23 ; [clobbered] GP register +%define %%T2 %24 ; [clobbered] GP register +%define %%T3 %25 ; [clobbered] GP register +%define %%MASKREG %26 ; [clobbered] mask register + + mov %%T1, %%A_IN ; T1 = AAD + mov %%T2, %%A_LEN ; T2 = aadLen + + CALC_GHASH %%T1, %%T2, %%AAD_HASH, %%GDATA_KEY, \ + %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ + %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, %%ZT14, \ + %%ZT15, %%ZT16, %%ZT17, %%T3, %%MASKREG + %endmacro ; CALC_AAD_HASH ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2811,59 +2875,58 @@ ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;; Calculate J0 for cases when IV length is different than 12 bytes -%macro CALC_J0 26 +;;; - uses ghash_internal_vaes_avx512() function +;;; - clobbers: zmm0-zmm1, zmm3-zmm13, zmm15-zmm20, 
r12, r13, rax, k1 +%macro CALC_J0 4-5 %define %%KEY %1 ;; [in] Pointer to GCM KEY structure %define %%IV %2 ;; [in] Pointer to IV %define %%IV_LEN %3 ;; [in] IV length %define %%J0 %4 ;; [out] XMM reg to contain J0 -%define %%ZT0 %5 ;; [clobbered] ZMM register -%define %%ZT1 %6 ;; [clobbered] ZMM register -%define %%ZT2 %7 ;; [clobbered] ZMM register -%define %%ZT3 %8 ;; [clobbered] ZMM register -%define %%ZT4 %9 ;; [clobbered] ZMM register -%define %%ZT5 %10 ;; [clobbered] ZMM register -%define %%ZT6 %11 ;; [clobbered] ZMM register -%define %%ZT7 %12 ;; [clobbered] ZMM register -%define %%ZT8 %13 ;; [clobbered] ZMM register -%define %%ZT9 %14 ;; [clobbered] ZMM register -%define %%ZT10 %15 ;; [clobbered] ZMM register -%define %%ZT11 %16 ;; [clobbered] ZMM register -%define %%ZT12 %17 ;; [clobbered] ZMM register -%define %%ZT13 %18 ;; [clobbered] ZMM register -%define %%ZT14 %19 ;; [clobbered] ZMM register -%define %%ZT15 %20 ;; [clobbered] ZMM register -%define %%ZT16 %21 ;; [clobbered] ZMM register -%define %%ZT17 %22 ;; [clobbered] ZMM register -%define %%T1 %23 ;; [clobbered] GP register -%define %%T2 %24 ;; [clobbered] GP register -%define %%T3 %25 ;; [clobbered] GP register -%define %%MASKREG %26 ;; [clobbered] mask register +%define %%SHUFMASK %5 ;; [in] register with shuffle mask + +%define %%ZT0 zmm3 +%define %%ZT1 zmm4 +%define %%ZT2 zmm5 +%define %%ZT3 zmm6 +%define %%ZT4 zmm7 +%define %%ZT5 zmm8 +%define %%ZT6 zmm9 + +%define %%T1 r12 +%define %%T2 r13 ;; J0 = GHASH(IV || 0s+64 || len(IV)64) ;; s = 16 * RoundUp(len(IV)/16) - len(IV) */ ;; Calculate GHASH of (IV || 0s) - vpxor %%J0, %%J0 - CALC_AAD_HASH %%IV, %%IV_LEN, %%J0, %%KEY, %%ZT0, %%ZT1, %%ZT2, %%ZT3, \ - %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, %%ZT10, %%ZT11, \ - %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, %%ZT17, \ - %%T1, %%T2, %%T3, %%MASKREG - ;; **ZT1, ZT2, ZT3 may contain sensitive data + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; 
xmm0 - hash in/out + mov r12, %%IV + mov r13, %%IV_LEN + call ghash_internal_vaes_avx512 +%ifnidn %%J0, xmm0 + vmovdqa64 %%J0, xmm0 +%endif ;; Calculate GHASH of last 16-byte block (0 || len(IV)64) mov %%T1, %%IV_LEN shl %%T1, 3 ;; IV length in bits vmovq XWORD(%%ZT2), %%T1 - ;; Might need shuffle of ZT2 vpxorq %%J0, XWORD(%%ZT2), %%J0 vmovdqu64 XWORD(%%ZT0), [%%KEY + HashKey_1] vmovdqu64 XWORD(%%ZT5), [%%KEY + HashKey_1 + HKeyGap] GHASH_MUL2 %%J0, XWORD(%%ZT0), XWORD(%%ZT5), XWORD(%%ZT1), XWORD(%%ZT2), XWORD(%%ZT3), XWORD(%%ZT4) - ;; **ZT1, ZT2, ZT3 overwritten with ghash products - vpshufb %%J0, [rel SHUF_MASK] ; perform a 16Byte swap +%if %0 == 4 + vpshufb %%J0, %%J0, [rel SHUF_MASK] +%elif %0 == 5 + vpshufb %%J0, %%J0, XWORD(%%SHUFMASK) +%endif %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -2906,26 +2969,34 @@ %define %%IV_LEN %31 ; [in] IV length ;; prepare IV -%if %0 == 31 ;; IV is different than 12 bytes - CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT, \ - %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, \ - %%ZT8, %%ZT9, %%ZT10, %%ZT11, %%ZT12, %%ZT13, \ - %%ZT14, %%ZT15, %%ZT16, %%ZT17, %%GPR1, %%GPR2, %%GPR3, %%MASKREG -%else ;; IV is 12 bytes +%if %0 == 31 ;; IV may be different than 12 bytes + cmp %%IV_LEN, 12 + je %%_iv_length_is_12_bytes + + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%CUR_COUNT + jmp %%_iv_prep_is_done + +%endif + +%%_iv_length_is_12_bytes: ;; read 12 IV bytes and pad with 0x00000001 vmovdqa64 %%CUR_COUNT, [rel ONEf] mov %%GPR2, %%IV mov DWORD(%%GPR1), 0x0000_0fff kmovd %%MASKREG, DWORD(%%GPR1) vmovdqu8 %%CUR_COUNT{%%MASKREG}, [%%GPR2] ; ctr = IV | 0x1 -%endif + +%%_iv_prep_is_done: + vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv + vpshufb %%CUR_COUNT, %%CUR_COUNT, [rel SHUF_MASK] + vmovdqu64 [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv (LE format) ;; calculate AAD hash cmp %%A_LEN, 12 jne %%_aad_is_not_12_bytes ;; load 12 bytes of AAD 
-%if %0 == 31 ;; IV is different than 12 bytes +%if %0 == 31 ;; IV may be different than 12 bytes mov DWORD(%%GPR1), 0x0000_0fff kmovd %%MASKREG, DWORD(%%GPR1) %endif @@ -2933,7 +3004,7 @@ vmovdqu8 XWORD(%%AAD_HASH){%%MASKREG}{z}, [%%GPR1] vmovdqu8 XWORD(%%ZT0), [%%GDATA_KEY + HashKey_1] vmovdqu8 XWORD(%%ZT5), [%%GDATA_KEY + HashKey_1 + HKeyGap] - vpshufb XWORD(%%AAD_HASH), [rel SHUF_MASK] + vpshufb XWORD(%%AAD_HASH), XWORD(%%AAD_HASH), [rel SHUF_MASK] ;; GHASH 12 bytes of AAD GHASH_MUL2 XWORD(%%AAD_HASH), XWORD(%%ZT0), XWORD(%%ZT5), \ @@ -2942,12 +3013,16 @@ jmp %%_aad_compute_done %%_aad_is_not_12_bytes: - vpxor %%AAD_HASH, %%AAD_HASH - CALC_AAD_HASH %%A_IN, %%A_LEN, %%AAD_HASH, %%GDATA_KEY, \ - %%ZT0, %%ZT1, %%ZT2, %%ZT3, %%ZT4, %%ZT5, %%ZT6, %%ZT7, %%ZT8, %%ZT9, \ - %%ZT10, %%ZT11, %%ZT12, %%ZT13, %%ZT14, %%ZT15, %%ZT16, %%ZT17, \ - %%GPR1, %%GPR2, %%GPR3, %%MASKREG - ;; **ZT1, ZT2, ZT3 may contain AAD but AAD is not considered sensitive + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; xmm0 - hash in/out + mov r12, %%A_IN + mov r13, %%A_LEN + call ghash_internal_vaes_avx512 + vmovdqa %%AAD_HASH, xmm0 + %%_aad_compute_done: ;; set up context fields @@ -2961,9 +3036,6 @@ mov [%%GDATA_CTX + PBlockLen], %%GPR1 ; ctx.partial_block_length = 0 %endif - vmovdqu64 [%%GDATA_CTX + OrigIV], %%CUR_COUNT ; ctx.orig_IV = iv - vpshufb %%CUR_COUNT, [rel SHUF_MASK] - vmovdqu64 [%%GDATA_CTX + CurCount], %%CUR_COUNT ; ctx.current_counter = iv (LE format) %endmacro ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; @@ -3077,6 +3149,606 @@ %endmacro ; GCM_ENC_DEC_SMALL +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; GCM_ENC_DEC_0_TO_256 +;; - combines and optimizes functionality of three macros: +;; - GCM_INIT +;; - GCM_ENC_DEC +;; - GCM_COMPLETE +;; - works for packet sizes between 0 and 256 bytes +;; - it is limited to single_call case only +;; - works with 
AAD size +;; - works with IV size provided IV length is provided +;; Output: C and T +;; Clobbers rax, r12, r13, zmm0-zmm23, zmm26-zmm29, zmm30, zmm31, k1, k2, r11 (windows) +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +%macro GCM_ENC_DEC_0_TO_256 10-11 +%define %%GDATA_KEY %1 ; [in] key pointer +%define %%CIPH_PLAIN_OUT %2 ; [in] output buffer pointer +%define %%PLAIN_CIPH_IN %3 ; [in] input buffer pointer +%define %%PLAIN_CIPH_LEN %4 ; [in] buffer length +%define %%IV %5 ; [in] IV pointer +%define %%A_IN %6 ; [in] AAD pointer +%define %%A_LEN %7 ; [in] AAD length in bytes +%define %%AUTH_TAG %8 ; [in] pointer to store auth tag into (GP or mem) +%define %%AUTH_TAG_LEN %9 ; [in] length in bytes of auth tag (GP or mem) +%define %%ENC_DEC %10 ; [in] cipher direction +%define %%IV_LEN %11 ; [in] IV length + +%define %%IA0 rax +%define %%IA1 r12 +%define %%IA2 r13 +%define %%IA3 r11 + +%define %%CTR_BLOCKz zmm0 +%define %%CTR_BLOCKx xmm0 ; hardcoded in GCM_INIT + +%define %%AAD_HASHz zmm1 +%define %%AAD_HASHy ymm1 +%define %%AAD_HASHx xmm1 ; hardcoded in GCM_COMPLETE + +%define %%SHUF_MASK zmm30 +%define %%SHUF_MASKy ymm30 +%define %%SHUF_MASKx xmm30 + +%define %%ORIG_IV zmm31 +%define %%ORIG_IVx xmm31 + +%define %%ZTMP0 zmm2 +%define %%ZTMP1 zmm3 +%define %%ZTMP2 zmm4 +%define %%ZTMP3 zmm5 +%define %%ZTMP4 zmm6 +%define %%ZTMP5 zmm7 +%define %%ZTMP6 zmm8 +%define %%ZTMP7 zmm9 +%define %%ZTMP8 zmm10 +%define %%ZTMP9 zmm11 +%define %%ZTMP10 zmm12 +%define %%ZTMP11 zmm13 +%define %%ZTMP12 zmm14 +%define %%ZTMP13 zmm15 +%define %%ZTMP14 zmm16 +%define %%ZTMP15 zmm17 +%define %%ZTMP16 zmm18 +%define %%ZTMP17 zmm19 +%define %%ZTMP18 zmm20 +%define %%ZTMP19 zmm21 +%define %%ZTMP20 zmm22 +%define %%ZTMP21 zmm23 +%define %%ZTMP22 zmm24 ; not used +%define %%ZTMP23 zmm25 ; not used +%define %%ZTMP24 zmm26 +%define %%ZTMP25 zmm27 +%define %%ZTMP26 zmm28 +%define %%ZTMP27 zmm29 + +%define %%DAT0 %%ZTMP24 +%define %%DAT1 %%ZTMP25 
+%define %%DAT2 %%ZTMP26 +%define %%DAT3 %%ZTMP27 + +%define %%MASK_TEXT k1 +%define %%MASK_TAG k1 +%define %%MASK_IVAAD k2 + + ;; =================================================================== + ;; prepare IV +%if %0 == 11 + ;; IV may be different than 12 bytes + cmp %%IV_LEN, 12 + je %%_iv_length_is_12_bytes + + CALC_J0 %%GDATA_KEY, %%IV, %%IV_LEN, %%ORIG_IVx + jmp %%_iv_prep_is_done +%endif ;; IV_LEN provided + +%%_iv_length_is_12_bytes: + ;; read 12 IV bytes and pad with 0x00000001 + vmovdqa64 %%ORIG_IVx, [rel ONEf] + mov %%IA2, %%IV + mov DWORD(%%IA1), 0x0000_0fff + kmovd %%MASK_IVAAD, DWORD(%%IA1) + vmovdqu8 %%ORIG_IVx{%%MASK_IVAAD}, [%%IA2] ; ctr = IV | 0x1 + +%%_iv_prep_is_done: + ;; set up context fields + vpshufb %%CTR_BLOCKx, %%ORIG_IVx, [rel SHUF_MASK] + + ;; =================================================================== + ;; check for zero message length + +%ifidn __OUTPUT_FORMAT__, win64 + cmp %%PLAIN_CIPH_LEN, 0 +%else + or %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN +%endif + je %%_small_initial_num_blocks_is_0 + + ;; =================================================================== + ;; Prepare %%LENGTH register +%ifidn __OUTPUT_FORMAT__, win64 +%define %%LENGTH %%IA3 + mov %%LENGTH, %%PLAIN_CIPH_LEN +%else +%define %%LENGTH %%PLAIN_CIPH_LEN ;; PLAIN_CIPH_LEN is a register on linux +%endif + ;; =================================================================== + ;; Determine how many blocks to process + ;; - process one additional block if there is a partial block (round up) + +%define %%NUM_BLOCKS %%IA1 + + mov DWORD(%%NUM_BLOCKS), DWORD(%%LENGTH) + add DWORD(%%NUM_BLOCKS), 15 + shr DWORD(%%NUM_BLOCKS), 4 + ;; %%NUM_BLOCKS can be in the range from 0 to 16 + + cmp DWORD(%%NUM_BLOCKS), 8 + je %%_small_initial_num_blocks_is_8 + jb %%_small_initial_num_blocks_is_7_1 + + cmp DWORD(%%NUM_BLOCKS), 12 + je %%_small_initial_num_blocks_is_12 + jb %%_small_initial_num_blocks_is_11_9 + + ;; 16, 15, 14 or 13 + cmp DWORD(%%NUM_BLOCKS), 15 + ja 
%%_small_initial_num_blocks_is_16 + je %%_small_initial_num_blocks_is_15 + cmp DWORD(%%NUM_BLOCKS), 14 + je %%_small_initial_num_blocks_is_14 + jmp %%_small_initial_num_blocks_is_13 + +%%_small_initial_num_blocks_is_11_9: + ;; 11, 10 or 9 + cmp DWORD(%%NUM_BLOCKS), 10 + ja %%_small_initial_num_blocks_is_11 + je %%_small_initial_num_blocks_is_10 + jmp %%_small_initial_num_blocks_is_9 + +%%_small_initial_num_blocks_is_7_1: + cmp DWORD(%%NUM_BLOCKS), 4 + je %%_small_initial_num_blocks_is_4 + jb %%_small_initial_num_blocks_is_3_1 + ;; 7, 6 or 5 + cmp DWORD(%%NUM_BLOCKS), 6 + ja %%_small_initial_num_blocks_is_7 + je %%_small_initial_num_blocks_is_6 + jmp %%_small_initial_num_blocks_is_5 + +%%_small_initial_num_blocks_is_3_1: + ;; 3, 2 or 1 + cmp DWORD(%%NUM_BLOCKS), 2 + ja %%_small_initial_num_blocks_is_3 + je %%_small_initial_num_blocks_is_2 + + ;; for %%NUM_BLOCKS == 1, just fall through and no 'jmp' needed + + ;; =================================================================== + ;; Use rep to generate different optimized code for block size variants + ;; - one block size variant has to be the first one + +%assign num_blocks 1 +%rep 16 + + ;; =================================================================== + ;; =================================================================== + ;; Optimized small packet AES-GCM generation + ;; - at this stage, IV is ready + ;; - prepare counter blocks + ;; - do AES-CTR & encryption of original IV + ;; - do AAD, GHASH of message and block with sizes + +%%_small_initial_num_blocks_is_ %+ num_blocks : + +%define %%CTR0 %%ZTMP0 +%define %%CTR1 %%ZTMP1 +%define %%CTR2 %%ZTMP2 +%define %%CTR3 %%ZTMP3 + + ;; =================================================================== + ;; - load shuffle mask + ;; - retrieve 32-bit counter in BE format +%if num_blocks == 1 + vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK] +%elif num_blocks == 2 + vmovdqa64 %%SHUF_MASKy, [rel SHUF_MASK] +%else + vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] +%endif + vmovd 
DWORD(%%IA2), %%CTR_BLOCKx + + ;; =================================================================== + ;; get load/store mask for plain/cipher text + lea %%IA0, [rel byte64_len_to_mask_table] + mov %%IA1, %%LENGTH +%if num_blocks > 12 + sub %%IA1, 3 * 64 +%elif num_blocks > 8 + sub %%IA1, 2 * 64 +%elif num_blocks > 4 + sub %%IA1, 64 +%endif + kmovq %%MASK_TEXT, [%%IA0 + %%IA1*8] + + ;; =================================================================== + ;; Check if counter blocks can be prepared in BE format or + ;; LE format is required + cmp BYTE(%%IA2), 256 - num_blocks + jae %%_ctr_overflow_ %+ num_blocks + + ;; =================================================================== + ;; Prepare AES counter blocks (BE format, no byte overflow) +%if num_blocks == 1 + vpaddd XWORD(%%CTR0), %%ORIG_IVx, [rel ONEf] +%elif num_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%ORIG_IV), YWORD(%%ORIG_IV), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_addbe_1234] +%else + vshufi64x2 %%CTR_BLOCKz, %%ORIG_IV, %%ORIG_IV, 0 + vpaddd %%CTR0, %%CTR_BLOCKz, [rel ddq_addbe_1234] +%if num_blocks > 4 + vpaddd %%CTR1, %%CTR_BLOCKz, [rel ddq_addbe_5678] +%endif +%if num_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_addbe_8888] +%endif +%if num_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_addbe_8888] +%endif +%endif + jmp %%_ctr_ready_ %+ num_blocks + +%%_ctr_overflow_ %+ num_blocks : + ;; =================================================================== + ;; Prepare AES counter blocks (LE format, byte overflow) +%if num_blocks == 1 + vpaddd XWORD(%%CTR0), %%CTR_BLOCKx, [rel ONE] +%elif num_blocks == 2 + vshufi64x2 YWORD(%%CTR0), YWORD(%%CTR_BLOCKz), YWORD(%%CTR_BLOCKz), 0 + vpaddd YWORD(%%CTR0), YWORD(%%CTR0), [rel ddq_add_1234] +%else + vshufi64x2 %%CTR_BLOCKz, %%CTR_BLOCKz, %%CTR_BLOCKz, 0 + vpaddd %%CTR0, %%CTR_BLOCKz, [rel ddq_add_1234] +%if num_blocks > 4 + vpaddd %%CTR1, %%CTR_BLOCKz, [rel ddq_add_5678] +%endif +%if num_blocks > 8 + vpaddd %%CTR2, %%CTR0, [rel ddq_add_8888] 
+%endif +%if num_blocks > 12 + vpaddd %%CTR3, %%CTR1, [rel ddq_add_8888] +%endif +%endif + + ;; =================================================================== + ;; shuffle the counter blocks for AES rounds + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK + +%%_ctr_ready_ %+ num_blocks : + + ;; =================================================================== + ;; append original IV to message blocks for AES encryption, if possible + +%if (num_blocks % 4) != 0 +%assign num_blocks_aes (num_blocks + 1) +%assign blend_orig_iv_aes 1 + +%if (num_blocks >= 14) && (num_blocks <= 15) + vinserti64x2 %%CTR3, %%ORIG_IVx, num_blocks - 12 +%elif (num_blocks == 13) + vinserti64x2 YWORD(%%CTR3), %%ORIG_IVx, num_blocks - 12 +%elif (num_blocks >= 10) && (num_blocks <= 11) + vinserti64x2 %%CTR2, %%ORIG_IVx, num_blocks - 8 +%elif (num_blocks == 9) + vinserti64x2 YWORD(%%CTR2), %%ORIG_IVx, num_blocks - 8 +%elif (num_blocks >= 6) && (num_blocks <= 7) + vinserti64x2 %%CTR1, %%ORIG_IVx, num_blocks - 4 +%elif (num_blocks == 5) + vinserti64x2 YWORD(%%CTR1), %%ORIG_IVx, num_blocks - 4 +%elif (num_blocks >= 2) && (num_blocks <= 3) + vinserti64x2 %%CTR0, %%ORIG_IVx, num_blocks +%else ; (num_blocks == 1) + vinserti64x2 YWORD(%%CTR0), %%ORIG_IVx, num_blocks +%endif + +%else + ;; 16, 12, 8, 4 or 0 block cases +%assign num_blocks_aes num_blocks +%assign blend_orig_iv_aes 0 +%endif + + ;; =================================================================== + ;; load plain/cipher text + ZMM_LOAD_MASKED_BLOCKS_0_16 num_blocks, %%PLAIN_CIPH_IN, 0, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, %%MASK_TEXT + + + ;; =================================================================== + ;; AES rounds and XOR with plain/cipher text +%assign j 0 + + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vpxorq %%ORIG_IVx, %%ORIG_IVx, 
XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vpxorq, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 +%assign j (j + 1) + +%rep NROUNDS + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vaesenc %%ORIG_IVx, %%ORIG_IVx, XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vaesenc, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 +%assign j (j + 1) +%endrep + + vbroadcastf64x2 %%ZTMP10, [%%GDATA_KEY + (j * 16)] +%if blend_orig_iv_aes == 0 + vaesenclast %%ORIG_IVx, %%ORIG_IVx, XWORD(%%ZTMP10) +%endif + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks_aes, vaesenclast, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%ZTMP10, %%ZTMP10, %%ZTMP10, %%ZTMP10 + + ;; =================================================================== + ;; Extract encrypted original IV +%if blend_orig_iv_aes != 0 +%if num_blocks >= 12 + vextracti32x4 %%ORIG_IVx, %%CTR3, num_blocks - 12 +%elif num_blocks >= 8 + vextracti32x4 %%ORIG_IVx, %%CTR2, num_blocks - 8 +%elif num_blocks >= 4 + vextracti32x4 %%ORIG_IVx, %%CTR1, num_blocks - 4 +%else + vextracti32x4 %%ORIG_IVx, %%CTR0, num_blocks +%endif +%endif + + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpxorq, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3 + + ;; =================================================================== + ;; write cipher/plain text back to output and + ZMM_STORE_MASKED_BLOCKS_0_16 num_blocks, %%CIPH_PLAIN_OUT, 0, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, %%MASK_TEXT + + ;; =================================================================== + ;; Shuffle the cipher text blocks for hashing part + ;; - GHASH always works on cipher text +%ifidn %%ENC_DEC, DEC + ;; Decrypt case + 
ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK +%else + ;; Encrypt case + + ;; - zero bytes outside the mask before hashing +%if num_blocks <= 4 + vmovdqu8 %%CTR0{%%MASK_TEXT}{z}, %%CTR0 +%elif num_blocks <= 8 + vmovdqu8 %%CTR1{%%MASK_TEXT}{z}, %%CTR1 +%elif num_blocks <= 12 + vmovdqu8 %%CTR2{%%MASK_TEXT}{z}, %%CTR2 +%else + vmovdqu8 %%CTR3{%%MASK_TEXT}{z}, %%CTR3 +%endif + + ;; - cipher blocks are in CTR0-CTR3 + ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 num_blocks, vpshufb, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, \ + %%CTR0, %%CTR1, %%CTR2, %%CTR3, \ + %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK, %%SHUF_MASK +%endif ; Encrypt + + ;; =================================================================== + ;; Calculate AAD hash + cmp %%A_LEN, 12 + jne %%_aad_is_not_12_bytes_ %+ num_blocks + + ;; =================================================================== + ;; load 12 bytes of AAD (most common case) + ;; - AAD and block with sizes get hashed together + ;; - one reduction for everything (AAD + message + length block) + +%if %0 == 11 ;; IV may be different than 12 bytes and %%MASK_IVAAD not set + mov DWORD(%%IA1), 0x0000_0fff + kmovd %%MASK_IVAAD, DWORD(%%IA1) +%endif + mov %%IA1, %%A_IN + vmovdqu8 %%AAD_HASHx{%%MASK_IVAAD}{z}, [%%IA1] + vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx + + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + vinserti64x2 %%AAD_HASHy, XWORD(%%ZTMP15), 1 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; GHASH 12 byte AAD with length block using respective GHASH key powers + ;; AAD_HASHy = [ AAD: 0-127 | LENGTH: 128-255 ] + ;; HASH_KEY = [ HK ^ (N + 2) | HK ^ 1 ] + +%assign num_blocks2 (num_blocks + 2) +%define %%HKeyN2 HashKey_ %+ num_blocks2 + + 
vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + %%HKeyN2 + HKeyGap] + vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1 + HKeyGap], 1 + vpclmulqdq YWORD(%%ZTMP14), %%AAD_HASHy, YWORD(%%ZTMP13), 0x00 ; TLL = GH_L * KK_L + vpclmulqdq YWORD(%%ZTMP15), %%AAD_HASHy, YWORD(%%ZTMP13), 0x10 ; TLH = GH_L * KK_H + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + %%HKeyN2] + vinserti64x2 YWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1], 1 + vpclmulqdq YWORD(%%ZTMP16), %%AAD_HASHy, YWORD(%%ZTMP13), 0x01 ; THL = GH_H * HK_L + vpclmulqdq YWORD(%%ZTMP17), %%AAD_HASHy, YWORD(%%ZTMP13), 0x11 ; THH = GH_H * HK_H + +%undef %%HKeyN2 + + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + ;; add products + + vpxorq YWORD(%%ZTMP14), YWORD(%%ZTMP14), YWORD(%%ZTMP16) ;; TLL += THL + vpxorq YWORD(%%ZTMP15), YWORD(%%ZTMP15), YWORD(%%ZTMP17) ;; TLH += THH + + ;; =================================================================== + ;; continue with message GHASH followed by reduction + ;; + ;; Hash key powers and corresponding message blocks: + ;; HASH_KEY = [ HK ^ (N + 1), HK ^ N, ... HK ^ 2 ] + ;; MSG = [ MSG1, MSG2, ... 
MSGN ] + + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP20, \ + %%ZTMP21, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, \ + 1, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks, %%ZTMP15, %%ZTMP14 + + jmp %%_small_initial_blocks_encrypted + +%%_aad_is_not_12_bytes_ %+ num_blocks: + ;; =================================================================== + ;; Calculate AAD hash (different than 12 bytes) + + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; xmm0 - hash in/out + mov r12, %%A_IN + mov r13, %%A_LEN + call ghash_internal_vaes_avx512 + vmovdqa64 %%AAD_HASHx, xmm0 + +%if num_blocks == 16 + ;; =================================================================== + ;; message GHASH compute + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks + + ;; =================================================================== + ;; GHASH length block + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1] + vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap] + + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP15) + GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19) + +%else + ;; =================================================================== + ;; create & append length block into message for GHASH + vmovq XWORD(%%ZTMP15), %%PLAIN_CIPH_LEN + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + +%if num_blocks == 12 + vmovdqa64 XWORD(%%DAT3), XWORD(%%ZTMP15) +%elif num_blocks > 12 + vinserti64x2 
%%DAT3, XWORD(%%ZTMP15), num_blocks - 12 +%elif num_blocks == 8 + vmovdqa64 XWORD(%%DAT2), XWORD(%%ZTMP15) +%elif num_blocks > 8 + vinserti64x2 %%DAT2, XWORD(%%ZTMP15), num_blocks - 8 +%elif num_blocks == 4 + vmovdqa64 XWORD(%%DAT1), XWORD(%%ZTMP15) +%elif num_blocks > 4 + vinserti64x2 %%DAT1, XWORD(%%ZTMP15), num_blocks - 4 +%else + vinserti64x2 %%DAT0, XWORD(%%ZTMP15), num_blocks +%endif + + ;; =================================================================== + ;; message + length block GHASH compute + +%assign num_blocks2 (num_blocks + 1) + + GHASH_1_TO_16 %%GDATA_KEY, %%AAD_HASHx, \ + %%ZTMP10, %%ZTMP11, %%ZTMP12, %%ZTMP13, %%ZTMP14, \ + %%ZTMP15, %%ZTMP16, %%ZTMP17, %%ZTMP18, %%ZTMP19, %%AAD_HASHz, \ + %%DAT0, %%DAT1, %%DAT2, %%DAT3, num_blocks2 + +%endif + jmp %%_small_initial_blocks_encrypted + + ;; =================================================================== + ;; increment number of blocks and repeat code generation +%assign num_blocks (num_blocks + 1) + +%endrep + + ;; =================================================================== + ;; Zero message size case (not optimized, not used very often) +%%_small_initial_num_blocks_is_0: + vmovdqa64 %%SHUF_MASKx, [rel SHUF_MASK] + + ;; =================================================================== + ;; calculate AAD hash for 0 message length case + vpxor xmm0, xmm0, xmm0 + ;; arg1 - GDATA_KEY + ;; r12 - message pointer + ;; r13 - message length + ;; xmm0 - hash in/out + mov r12, %%A_IN + mov r13, %%A_LEN + call ghash_internal_vaes_avx512 + vmovdqa64 %%AAD_HASHx, xmm0 + + ;; =================================================================== + ;; encrypt original IV + ENCRYPT_SINGLE_BLOCK %%GDATA_KEY, %%ORIG_IVx ; E(K, Y0) + + ;; =================================================================== + ;; GHASH length block + vmovdqu8 XWORD(%%ZTMP13), [%%GDATA_KEY + HashKey_1] + vmovdqu8 XWORD(%%ZTMP14), [%%GDATA_KEY + HashKey_1 + HKeyGap] + + vpxorq XWORD(%%ZTMP15), XWORD(%%ZTMP15), XWORD(%%ZTMP15) ; 
len(C) = 0 + vpinsrq XWORD(%%ZTMP15), %%A_LEN, 1 ; ZTMP15 = len(A)||len(C) + vpsllq XWORD(%%ZTMP15), XWORD(%%ZTMP15), 3 ; convert bytes into bits + + vpxorq %%AAD_HASHx, %%AAD_HASHx, XWORD(%%ZTMP15) + GHASH_MUL2 %%AAD_HASHx, XWORD(%%ZTMP13), XWORD(%%ZTMP14), XWORD(%%ZTMP16), XWORD(%%ZTMP17), XWORD(%%ZTMP18), XWORD(%%ZTMP19) + +%%_small_initial_blocks_encrypted: + ;; =================================================================== + ;; Complete GMAC computation + ;; S => %%AAD_HASHx + ;; CIPHER(J0) => %%ORIG_IVx + ;; T = MSB(GCTR(J0,S)) + vpshufb %%AAD_HASHx, %%AAD_HASHx, %%SHUF_MASKx + vpxorq %%ORIG_IVx, %%ORIG_IVx, %%AAD_HASHx + + ;; =================================================================== + ;; Store the tag T + mov %%IA0, %%AUTH_TAG + mov %%IA1, %%AUTH_TAG_LEN + + lea %%IA2, [rel byte64_len_to_mask_table] + kmovq %%MASK_TAG, [%%IA2 + %%IA1*8] + vmovdqu8 [%%IA0]{%%MASK_TAG}, %%ORIG_IVx + +%endmacro ; GCM_ENC_DEC_0_TO_256 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; GCM_ENC_DEC Encodes/Decodes given data. 
Assumes that the passed gcm_context_data struct ; has been initialized by GCM_INIT @@ -3086,7 +3758,7 @@ ; Output: A cipher of the given plain text (CIPH_PLAIN_OUT), and updated GDATA_CTX ; Clobbers rax, r10-r15, and zmm0-zmm31, k1 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -%macro GCM_ENC_DEC 7 +%macro GCM_ENC_DEC 7-8 %define %%GDATA_KEY %1 ; [in] key pointer %define %%GDATA_CTX %2 ; [in] context pointer %define %%CIPH_PLAIN_OUT %3 ; [in] output buffer pointer @@ -3094,6 +3766,15 @@ %define %%PLAIN_CIPH_LEN %5 ; [in] buffer length %define %%ENC_DEC %6 ; [in] cipher direction %define %%INSTANCE_TYPE %7 ; [in] 'single_call' or 'multi_call' selection +%define %%MSG_SIZE_SCOPE %8 ; [in] '>256' to remove small packets code path + +%assign include_small_packets 1 + +%if %0 > 7 +%ifidn %%MSG_SIZE_SCOPE, '>256' +%assign include_small_packets 0 +%endif +%endif %define %%IA0 r10 %define %%IA1 r12 @@ -3175,12 +3856,14 @@ ;;; - hash 16 blocks ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction) +%if include_small_packets != 0 %ifidn __OUTPUT_FORMAT__, win64 cmp %%PLAIN_CIPH_LEN, 0 %else or %%PLAIN_CIPH_LEN, %%PLAIN_CIPH_LEN %endif je %%_enc_dec_done +%endif ; include_small_packets != 0 ;; Update length of data processed %ifidn __OUTPUT_FORMAT__, win64 @@ -3226,8 +3909,10 @@ je %%_enc_dec_done %endif ; %%INSTANCE_TYPE, multi_call +%if include_small_packets != 0 cmp %%LENGTH, (16 * 16) jbe %%_message_below_equal_16_blocks +%endif ; include_small_packets != 0 vmovdqa64 %%SHUF_MASK, [rel SHUF_MASK] vmovdqa64 %%ADDBE_4x4, [rel ddq_addbe_4444] @@ -3481,8 +4166,12 @@ %ifidn %%INSTANCE_TYPE, multi_call vpshufb %%CTR_BLOCKx, %%CTR_BLOCKx, XWORD(%%SHUF_MASK) %endif + +%if include_small_packets != 0 jmp %%_ghash_done +%endif ; include_small_packets != 0 +%if include_small_packets != 0 %%_message_below_equal_16_blocks: ;; Determine how many blocks to process ;; - process one additional block if there is a partial block @@ 
-3509,6 +4198,7 @@ vpxorq %%ZTMP10, %%ZTMP10, %%ZTMP10 %endif ;; fall through to exit +%endif ; include_small_packets != 0 %%_ghash_done: %ifdef SAFE_DATA diff --git a/lib/include/kasumi_internal.h b/lib/include/kasumi_internal.h index 7b84526bfd2b2e3f153ff8da221ab0d42d79977d..4e76bc114a2559874088d8c090b0cb8304e8a115 100644 --- a/lib/include/kasumi_internal.h +++ b/lib/include/kasumi_internal.h @@ -1334,7 +1334,6 @@ kasumi_f8_n_buffer(const kasumi_key_sched_t *pKeySchedule, const uint64_t IV[], if (bufCount > 16) { pOut[0] = NULL; - printf("dataCount too high (%u)\n", (unsigned) bufCount); return; } diff --git a/lib/include/snow3g_common.h b/lib/include/snow3g_common.h index a58a5ef94649119b36235d19011eb066f2f3afbf..8dbfd17e0aa712ad7c4ed70dd607edd724893aa0 100644 --- a/lib/include/snow3g_common.h +++ b/lib/include/snow3g_common.h @@ -35,7 +35,6 @@ #ifndef SNOW3G_COMMON_H #define SNOW3G_COMMON_H -#include /* printf() */ #include /* memset(), memcpy() */ #include @@ -3103,7 +3102,6 @@ SNOW3G_F8_N_BUFFER(const snow3g_key_schedule_t *pCtx, const void *const IV[], if (packetCount > NUM_PACKETS_16) { pBufferOut[0] = NULL; - printf("packetCount too high (%u)\n", (unsigned) packetCount); return; } @@ -3267,7 +3265,6 @@ SNOW3G_F8_N_BUFFER_MULTIKEY(const snow3g_key_schedule_t *const pCtx[], const voi if (packetCount > NUM_PACKETS_16) { pBufferOut[0] = NULL; - printf("packetCount too high (%u)\n", (unsigned) packetCount); return; } diff --git a/lib/include/zuc_internal.h b/lib/include/zuc_internal.h index e93062a55482ead9413ce5a09ddf640926267ab9..9c9624538fb2c8d06898f5c7da35de04c43cf1aa 100644 --- a/lib/include/zuc_internal.h +++ b/lib/include/zuc_internal.h @@ -61,17 +61,6 @@ #define ZUC_MIN_BYTELEN 1 #define ZUC_MAX_BYTELEN (ZUC_MAX_BITLEN / 8) -#ifdef DEBUG -#ifdef _WIN32 -#define DEBUG_PRINT(_fmt, ...) \ - fprintf(stderr, "%s()::%d " _fmt, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define DEBUG_PRINT(_fmt, ...) 
fprintf(stderr, "%s()::%d " _fmt, __func__, __LINE__, __VA_ARGS__) -#endif -#else -#define DEBUG_PRINT(_fmt, ...) -#endif - /** ****************************************************************************** * @description diff --git a/lib/win_x64.mak b/lib/win_x64.mak index e226dbdf8106cfa8a497687b309def1af91a59e8..2828abb14b015272e87381c8433d8e7662a52d77 100644 --- a/lib/win_x64.mak +++ b/lib/win_x64.mak @@ -304,6 +304,7 @@ lib_objs1 = \ $(OBJ_DIR)\poly_avx512.obj \ $(OBJ_DIR)\poly_fma_avx512.obj \ $(OBJ_DIR)\des_x16_avx512.obj \ + $(OBJ_DIR)\des_common_avx512.obj \ $(OBJ_DIR)\aes_cntr_api_by16_vaes_avx512.obj \ $(OBJ_DIR)\aes_cntr_bit_api_by16_vaes_avx512.obj \ $(OBJ_DIR)\aes_cntr_ccm_api_by16_vaes_avx512.obj \ @@ -559,24 +560,21 @@ no_aesni_objs = \ gcm_objs = \ $(OBJ_DIR)\gcm.obj \ + $(OBJ_DIR)\ghash_by8_avx2.obj \ $(OBJ_DIR)\aes128_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes128_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes128_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes128_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes128_gcm_sgl_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes128_gmac_api_vaes_avx512.obj \ + $(OBJ_DIR)\ghash_api_vaes_avx512.obj \ + $(OBJ_DIR)\gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes192_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes192_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes192_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes192_gcm_sgl_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes192_gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_by8_avx2.obj \ $(OBJ_DIR)\aes256_gcm_vaes_avx2.obj \ - $(OBJ_DIR)\aes256_gcm_by8_avx512.obj \ $(OBJ_DIR)\aes256_gcm_api_vaes_avx512.obj \ $(OBJ_DIR)\aes256_gcm_sgl_api_vaes_avx512.obj \ - $(OBJ_DIR)\aes256_gmac_api_vaes_avx512.obj \ $(OBJ_DIR)\gcm128_api_by8_sse.obj \ $(OBJ_DIR)\gcm128_sgl_api_by8_sse.obj \ $(OBJ_DIR)\gcm128_gmac_api_by8_sse.obj \ diff --git a/perf/cmake/unix.cmake b/perf/cmake/unix.cmake index 9c9d897ebea6d193190fbc3a3c716cd7388261fa..4ce2e632615775a46ecd8f90e6fcfd0ca7ead41e 100644 --- a/perf/cmake/unix.cmake +++ 
b/perf/cmake/unix.cmake @@ -61,7 +61,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif() diff --git a/test/cmake/unix.cmake b/test/cmake/unix.cmake index c8949fc3c060a5b453a3660bbbcade5180b1f047..02f17675a35ee0a128d7e30341cf9c921ace148e 100644 --- a/test/cmake/unix.cmake +++ b/test/cmake/unix.cmake @@ -62,7 +62,7 @@ if(CMAKE_COMPILER_IS_GNUCC) string(APPEND CMAKE_C_FLAGS " -fno-strict-overflow") endif() -if(CC_HAS_CET) +if(CET_SUPPORT) string(APPEND CMAKE_C_FLAGS " -fcf-protection=full") string(APPEND CMAKE_EXE_LINKER_FLAGS " -Wl,-z,ibt -Wl,-z,shstk -Wl,-z,cet-report=error") endif()